diff --git a/.travis.yml b/.travis.yml index 7d8202d..aea9647 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,12 +20,12 @@ install: # TODO: Not in repos for 14.04 trustz but comes 16.04 xenial #- sudo apt-get install -y libpnetcdf-dev pnetcdf-bin # Install HDFS - # TODO: Not sure with which c libray hdfs should be used and if it is in + # TODO: Not sure with which c library hdfs should be used and if it is in # the ubuntu repos # Probably hadoop needs to be installed an provides native API. # Install Amazon S3 # TODO: The needed library needs to be installed. Follow the instructions in - # aiori-S3.c to achive this. + # aiori-S3.c to achieve this. # GPFS # NOTE: Think GPFS need a license and is therefore not testable with travis. script: diff --git a/Makefile.am b/Makefile.am index d874a90..d6465a8 100755 --- a/Makefile.am +++ b/Makefile.am @@ -10,4 +10,5 @@ ACLOCAL_AMFLAGS = -I config # `make dist` and `make test` for simple test binaries that do not require any # special environment. #TESTS = testing/basic-tests.sh -#DISTCLEANFILES = -r test test_out + +DISTCLEANFILES = ./src/build.conf diff --git a/NEWS b/NEWS index 9367112..00b98ff 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,4 @@ -Version 3.3.0+dev +Version 3.4.0+dev -------------------------------------------------------------------------------- New major features: @@ -7,6 +7,54 @@ New minor features: Bugfixes: + +Version 3.3.0 +-------------------------------------------------------------------------------- + +New major features: + +- Add CephFS AIORI (Mark Nelson) +- Add Gfarm AIORI (Osamu Tatebe) +- Add DAOS AIORI (Mohamad Chaarawi) +- Add DAOS DFS AIORI (Mohamad Chaarawi) +- -B option has been replaced with --posix.odirect + +New minor features: + +- Display outlier host names (Jean-Yves Vet) +- Enable global default dir layout for subdirs in Lustre (Petros Koutoupis) +- Removed pound signs (#) from mdtest output file names (Julian Kunkel) +- Print I/O hints from NCMPI (Wei-keng Liao) +- Add mknod support to mdtest (Gu Zheng) +- Refactor AIORI-specific options (Julian Kunkel) +- Enable IME native backend for mdtest (Jean-Yves Vet) +- Enable mkdir/rmdir to IME AIORI (Jean-Yves Vet) +- Add HDF5 collective metadata option (Rob Latham) +- Add support for sync to AIORIs (Julian Kunkel) + +General user improvements and bug fixes: + +- Allocate aligned buffers to support DirectIO for BeeGFS (Sven Breuner) +- Added IOPS and latency results to json output (Robert LeBlanc) +- Fixed case where numTasks is not evenly divisible by tasksPerNode (J. Schwartz) +- Fix several memory leaks and buffer alignment problems (J. Schwartz, Axel Huebl, Sylvain Didelot) +- Add mdtest data verification (Julian Kunkel) +- Clean up functionality of stonewall (Julian Kunkel) +- Fix checks for lustre_user.h (Andreas Dilger) +- Make write verification work without read test (Jean-Yves Vet) +- Documentation updates (Vaclav Hapla, Glenn Lockwood) +- Add more debugging support (J. 
Schwartz) + +General developer improvements: + +- Fix type casting errors (Vaclav Hapla) +- Add basic test infrastructure (Julian Kunkel, Glenn Lockwood) +- Conform to strict C99 (Glenn Lockwood) + +Known issues: + +- S3 and HDFS backends may not compile with new versions of respective libraries + Version 3.2.1 -------------------------------------------------------------------------------- @@ -63,7 +111,7 @@ Known issues: because `-u`/`-c`/`-p` cannot be specified (issue #98) - `writeCheck` cannot be enabled for write-only tests using some AIORIs such as MPI-IO (pull request #89) - + Version 3.0.2 -------------------------------------------------------------------------------- @@ -91,7 +139,7 @@ Version 2.10.3 Contributed by demyn@users.sourceforge.net - Ported to Windows. Required changes related to 'long' types, which on Windows are always 32-bits, even on 64-bit systems. Missing system headers and - functions acount for most of the remaining changes. + functions account for most of the remaining changes. New files for Windows: - IOR/ior.vcproj - Visual C project file - IOR/src/C/win/getopt.{h,c} - GNU getopt() support @@ -151,7 +199,7 @@ Version 2.9.5 - Added notification for "Using reorderTasks '-C' (expecting block, not cyclic, task assignment)" - Corrected bug with read performance with stonewalling (was using full size, - stat'ed file instead of bytes transfered). + stat'ed file instead of bytes transferred). Version 2.9.4 -------------------------------------------------------------------------------- diff --git a/README.md b/README.md index c1c73a8..081752b 100755 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ -# HPC IO Benchmark Repository [![Build Status](https://travis-ci.org/hpc/ior.svg?branch=master)](https://travis-ci.org/hpc/ior) +# HPC IO Benchmark Repository [![Build Status](https://travis-ci.org/hpc/ior.svg?branch=main)](https://travis-ci.org/hpc/ior) This repository contains the IOR and mdtest parallel I/O benchmarks. The -[official IOR/mdtest documention][] can be found in the `docs/` subdirectory or -on Read the Docs. +[official IOR/mdtest documentation][] can be found in the `docs/` subdirectory +or on Read the Docs. ## Building @@ -28,4 +28,4 @@ on Read the Docs. distributions at once. [official IOR release]: https://github.com/hpc/ior/releases -[official IOR/mdtest documention]: http://ior.readthedocs.org/ +[official IOR/mdtest documentation]: http://ior.readthedocs.org/ diff --git a/README_DAOS b/README_DAOS index ed98bd6..0314277 100644 --- a/README_DAOS +++ b/README_DAOS @@ -4,55 +4,13 @@ Building The DAOS library must be installed on the system. ./bootstrap -./configure --prefix=iorInstallDir --with-daos=DIR --with-cart=DIR - -One must specify "--with-daos=/path/to/daos/install and --with-cart". When that -is specified the DAOS and DFS driver will be built. - -The DAOS driver uses the DAOS API to open a container (or create it if it -doesn't exist first) then create an array object in that container (file) and -read/write to the array object using the daos Array API. The DAOS driver works -with IOR only (no mdtest support yet). The file name used by IOR (passed by -o -option) is hashed to an object ID that is used as the array oid. +./configure --prefix=iorInstallDir --with-daos=DIR The DFS (DAOS File System) driver creates an encapsulated namespace and emulates the POSIX driver using the DFS API directly on top of DAOS. The DFS driver works with both IOR and mdtest. 
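(Editorial aside, not part of the patch: for illustration only, a complete DFS-enabled build following the steps described above might look like the lines below; the install prefix and DAOS path are placeholders for your own locations, and DAOS itself is assumed to be installed already.)

  ./bootstrap
  ./configure --prefix=/path/to/ior/install --with-daos=/path/to/daos/install
  make && make install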
-Running with DAOS API ---------------------- - -ior -a DAOS [ior_options] [daos_options] - -In the IOR options, the file name should be specified as a container uuid using -"-o ". If the "-E" option is given, then this UUID shall denote -an existing container created by a "matching" IOR run. Otherwise, IOR will -create a new container with this UUID. In the latter case, one may use -uuidgen(1) to generate the UUID of the new container. - -The DAOS options include: - -Required Options: ---daos.pool : pool uuid to connect to (has to be created beforehand) ---daos.svcl : pool svcl list (: separated) ---daos.cont : container for the IOR files/objects (can use `uuidgen`) - -Optional Options: ---daos.group : group name of servers with the pool ---daos.chunk_size : Chunk size of the array object controlling striping over DKEYs ---daos.destroy flag to destory the container on finalize ---daos.oclass : specific object class for array object - -Examples that should work include: - - - "ior -a DAOS -w -W -o file_name --daos.pool --daos.svcl \ - --daos.cont " - - - "ior -a DAOS -w -W -r -R -o file_name -b 1g -t 4m \ - --daos.pool --daos.svcl --daos.cont \ - --daos.chunk_size 1024 --daos.oclass R2" - -Running with DFS API +Running --------------------- ior -a DFS [ior_options] [dfs_options] @@ -64,15 +22,17 @@ Required Options: --dfs.cont : container uuid that will hold the encapsulated namespace Optional Options: ---dfs.group : group name of servers with the pool ---dfs.chunk_size : Chunk size of the files ---dfs.destroy flag to destory the container on finalize ---dfs.oclass : specific object class for files +--dfs.group : group name of servers with the pool (default: daos_server) +--dfs.chunk_size : Chunk size of the files (default: 1MiB) +--dfs.destroy: flag to destroy the container on finalize (default: no) +--dfs.oclass : specific object class for files (default: SX) +--dfs.dir_oclass : specific object class for directories (default: SX) +--dfs.prefix : absolute path to account for DFS files/dirs before the cont root -In the IOR options, the file name should be specified on the root dir directly -since ior does not create directories and the DFS container representing the -encapsulated namespace is not the same as the system namespace the user is -executing from. +If prefix is not set, in the IOR options, the file name should be specified on +the root dir directly since ior does not create directories and the DFS +container representing the encapsulated namespace is not the same as the system +namespace the user is executing from. Examples that should work include: - "ior -a DFS -w -W -o /test1 --dfs.pool --dfs.svcl --dfs.cont " @@ -80,7 +40,8 @@ Examples that should work include: - "ior -a DFS -w -r -o /test3 -b 8g -t 1m -C --dfs.pool --dfs.svcl --dfs.cont " Running mdtest, the user needs to specify a directory with -d where the test -tree will be created. Some examples: - - "mdtest -a DFS -n 100 -F -D -d /bla --dfs.pool --dfs.svcl --dfs.cont " - - "mdtest -a DFS -n 1000 -F -C -d /bla --dfs.pool --dfs.svcl --dfs.cont " - - "mdtest -a DFS -I 10 -z 5 -b 2 -L -d /bla --dfs.pool --dfs.svcl --dfs.cont " +tree will be created (set '/' if writing to the root of the DFS container). 
Some +examples: + - "mdtest -a DFS -n 100 -F -D -d / --dfs.pool --dfs.svcl --dfs.cont " + - "mdtest -a DFS -n 1000 -F -C -d / --dfs.pool --dfs.svcl --dfs.cont " + - "mdtest -a DFS -I 10 -z 5 -b 2 -L -d / --dfs.pool --dfs.svcl --dfs.cont " diff --git a/configure.ac b/configure.ac index d9f302d..7f64855 100755 --- a/configure.ac +++ b/configure.ac @@ -73,6 +73,53 @@ AS_IF([test "$ac_cv_header_gpfs_h" = "yes" -o "$ac_cv_header_gpfs_fcntl_h" = "ye ]) ]) +# Check for CUDA +AC_ARG_WITH([cuda], + [AS_HELP_STRING([--with-cuda], + [support configurable CUDA @<:@default=check@:>@])], + [], [with_cuda=check]) + +AS_IF([test "x$with_cuda" != xno], [ + LDFLAGS="$LDFLAGS -L$with_cuda/lib64 -Wl,--enable-new-dtags -Wl,-rpath=$with_cuda/lib64" + CPPFLAGS="$CPPFLAGS -I$with_cuda/include" + + AC_CHECK_HEADERS([cuda_runtime.h], [AC_DEFINE([HAVE_CUDA], [], [CUDA GPU API found])], [ + if test "x$with_cuda" != xcheck; then + AC_MSG_FAILURE([--with-cuda was given, not found]) + fi + ]) +AS_IF([test "$ac_cv_header_cuda_runtime_h" = "yes"], [ + AC_SEARCH_LIBS([cudaMalloc], [cudart cudart_static], [], + [AC_MSG_ERROR([Library containing cudaMalloc symbol not found])]) + ]) +]) +AM_CONDITIONAL([HAVE_CUDA], [test x$with_cuda = xyes]) +AM_COND_IF([HAVE_CUDA],[AC_DEFINE([HAVE_CUDA], [], [CUDA GPU API found])]) + +# Check for GPUDirect +AC_ARG_WITH([gpuDirect], + [AS_HELP_STRING([--with-gpuDirect], + [support configurable GPUDirect @<:@default=check@:>@])], + [], [with_gpuDirect=check]) + +AS_IF([test "x$with_gpuDirect" != xno], [ + LDFLAGS="$LDFLAGS -L$with_gpuDirect/lib64 -Wl,--enable-new-dtags -Wl,-rpath=$with_gpuDirect/lib64" + CPPFLAGS="$CPPFLAGS -I$with_gpuDirect/include" + + AC_CHECK_HEADERS([cufile.h], [AC_DEFINE([HAVE_GPU_DIRECT], [], [GPUDirect API found])], [ + if test "x$with_gpuDirect" != xcheck; then + AC_MSG_FAILURE([--with-gpuDirect was given, not found]) + fi + ]) +AS_IF([test "$ac_cv_header_cufile_h" = "yes"], [ + AC_SEARCH_LIBS([cuFileDriverOpen], [cufile], [], + [AC_MSG_ERROR([Library containing cuFileDriverOpen symbol not found])]) + ]) +]) +AM_CONDITIONAL([HAVE_GPU_DIRECT], [test x$with_gpuDirect = xyes]) +AM_COND_IF([HAVE_GPU_DIRECT],[AC_DEFINE([HAVE_GPU_DIRECT], [], [GPUDirect API found])]) + + # Check for system capabilities AC_SYS_LARGEFILE @@ -84,7 +131,7 @@ AC_ARG_WITH([lustre], [support configurable Lustre striping values @<:@default=check@:>@])], [], [with_lustre=check]) AS_IF([test "x$with_lustre" = xyes ], [ - AC_CHECK_HEADERS([linux/lustre/lustre_user.h lustre/lustre_user.h], break, [ + AC_CHECK_HEADERS([linux/lustre/lustre_user.h lustre/lustre_user.h], [AC_DEFINE([HAVE_LUSTRE_USER], [], [Lustre user API available in some shape or form])], [ if test "x$with_lustre" != xcheck -a \ "x$ac_cv_header_linux_lustre_lustre_user_h" = "xno" -a \ "x$ac_cv_header_lustre_lustre_user_h" = "xno" ; then @@ -160,8 +207,10 @@ AC_ARG_WITH([ncmpi], [], [with_ncmpi=no]) AM_CONDITIONAL([USE_NCMPI_AIORI], [test x$with_ncmpi = xyes]) -AM_COND_IF([USE_NCMPI_AIORI],[ - AC_DEFINE([USE_NCMPI_AIORI], [], [Build NCMPI backend AIORI]) +AS_IF([test "x$with_ncmpi" = xyes ], [ + AC_CHECK_HEADERS([pnetcdf.h], [AC_DEFINE([USE_NCMPI_AIORI], [], [PNetCDF available])], [ + AC_MSG_FAILURE([--with-ncmpi was given but pnetcdf.h not found]) + ]) ]) # MMAP IO support @@ -200,6 +249,19 @@ AS_IF([test "x$with_pmdk" != xno], [ [AC_MSG_ERROR([Library containing pmdk symbols not found])]) ]) +# LINUX AIO support +AC_ARG_WITH([aio], + [AS_HELP_STRING([--with-aio], + [support Linux AIO @<:@default=no@:>@])], + [], + [with_aio=no]) 
+AM_CONDITIONAL([USE_AIO_AIORI], [test x$with_aio = xyes]) +AS_IF([test "x$with_aio" != xno], [ + AC_DEFINE([USE_AIO_AIORI], [], [Build AIO backend]) + AC_CHECK_HEADERS(libaio.h,, [unset AIO]) + AC_SEARCH_LIBS([aio], [io_setup], [AC_MSG_ERROR([Library containing AIO symbol io_setup not found])]) +]) + # RADOS support AC_ARG_WITH([rados], @@ -226,40 +288,25 @@ AM_COND_IF([USE_CEPHFS_AIORI],[ AC_DEFINE([USE_CEPHFS_AIORI], [], [Build CEPHFS backend AIORI]) ]) -# DAOS Backends (DAOS and DFS) IO support require DAOS and CART/GURT -AC_ARG_WITH([cart], - [AS_HELP_STRING([--with-cart], - [support IO with DAOS backends @<:@default=no@:>@])], - [], [with_daos=no]) - -AS_IF([test "x$with_cart" != xno], [ - CART="yes" - LDFLAGS="$LDFLAGS -L$with_cart/lib64 -Wl,--enable-new-dtags -Wl,-rpath=$with_cart/lib64" - LDFLAGS="$LDFLAGS -L$with_cart/lib -Wl,--enable-new-dtags -Wl,-rpath=$with_cart/lib" - CPPFLAGS="$CPPFLAGS -I$with_cart/include/" - AC_CHECK_HEADERS(gurt/common.h,, [unset CART]) - AC_CHECK_LIB([gurt], [d_hash_murmur64],, [unset CART]) -]) - +# DAOS-FS Backend (DFS) AC_ARG_WITH([daos], [AS_HELP_STRING([--with-daos], - [support IO with DAOS backends @<:@default=no@:>@])], + [support IO with DAOS backend @<:@default=no@:>@])], [], [with_daos=no]) - AS_IF([test "x$with_daos" != xno], [ DAOS="yes" LDFLAGS="$LDFLAGS -L$with_daos/lib64 -Wl,--enable-new-dtags -Wl,-rpath=$with_daos/lib64" CPPFLAGS="$CPPFLAGS -I$with_daos/include" - AC_CHECK_HEADERS(daos_types.h,, [unset DAOS]) + AC_CHECK_HEADERS(gurt/common.h,, [unset DAOS]) + AC_CHECK_HEADERS(daos.h,, [unset DAOS]) + AC_CHECK_LIB([gurt], [d_hash_murmur64],, [unset DAOS]) AC_CHECK_LIB([uuid], [uuid_generate],, [unset DAOS]) - AC_CHECK_LIB([daos_common], [daos_sgl_init],, [unset DAOS]) AC_CHECK_LIB([daos], [daos_init],, [unset DAOS]) AC_CHECK_LIB([dfs], [dfs_mkdir],, [unset DAOS]) ]) - AM_CONDITIONAL([USE_DAOS_AIORI], [test x$DAOS = xyes]) AM_COND_IF([USE_DAOS_AIORI],[ - AC_DEFINE([USE_DAOS_AIORI], [], [Build DAOS backends AIORI]) + AC_DEFINE([USE_DAOS_AIORI], [], [Build DAOS-FS backend AIORI]) ]) # Gfarm support @@ -308,19 +355,54 @@ AM_COND_IF([AWS4C_DIR],[ ]) -# Amazon S3 support [see also: --with-aws4c] -AC_ARG_WITH([S3], - [AS_HELP_STRING([--with-S3], - [support IO with Amazon S3 backend @<:@default=no@:>@])], + +# Amazon S3 support using the libs3 API +AC_ARG_WITH([S3-libs3], + [AS_HELP_STRING([--with-S3-libs3], + [support IO with Amazon libS3 @<:@default=no@:>@])], [], - [with_S3=no]) -AM_CONDITIONAL([USE_S3_AIORI], [test x$with_S3 = xyes]) -AM_COND_IF([USE_S3_AIORI],[ - AC_DEFINE([USE_S3_AIORI], [], [Build Amazon-S3 backend AIORI]) + [with_S3_libs3=no]) +AM_CONDITIONAL([USE_S3_LIBS3_AIORI], [test x$with_S3_libs3 = xyes]) +AM_COND_IF([USE_S3_LIBS3_AIORI],[ + AC_DEFINE([USE_S3_LIBS3_AIORI], [], [Build Amazon-S3 backend AIORI using libs3]) ]) err=0 -AS_IF([test "x$with_S3" != xno], [ +AS_IF([test "x$with_S3_libs3" != xno], [ + AC_MSG_NOTICE([beginning of S3-related checks]) + ORIG_CPPFLAGS=$CPPFLAGS + ORIG_LDFLAGS=$LDFLAGS + + AC_CHECK_HEADERS([libs3.h], [], [err=1]) + + # Autotools thinks searching for a library means I want it added to LIBS + ORIG_LIBS=$LIBS + AC_CHECK_LIB([s3], [S3_initialize], [], [err=1]) + LIBS=$ORIG_LIBS + + AC_MSG_NOTICE([end of S3-related checks]) + if test "$err" == 1; then + AC_MSG_FAILURE([S3 support is missing. dnl Make sure you have access to libs3. 
dnl]) + fi + + # restore user's values + CPPFLAGS=$ORIG_CPPFLAGS + LDFLAGS=$ORIG_LDFLAGS +]) + +# Amazon S3 support [see also: --with-aws4c] +AC_ARG_WITH([S3-4c], + [AS_HELP_STRING([--with-S3-4c], + [support IO with Amazon S3 backend @<:@default=no@:>@])], + [], + [with_S3_4c=no]) +AM_CONDITIONAL([USE_S3_4C_AIORI], [test x$with_S3_4c = xyes]) +AM_COND_IF([USE_S3_4C_AIORI],[ + AC_DEFINE([USE_S3_4C_AIORI], [], [Build Amazon-S3 backend AIORI using lib4c]) +]) + +err=0 +AS_IF([test "x$with_S3_4c" != xno], [ AC_MSG_NOTICE([beginning of S3-related checks]) # save user's values, while we use AC_CHECK_HEADERS with $AWS4C_DIR @@ -352,6 +434,30 @@ Consider --with-aws4c=, CPPFLAGS, LDFLAGS, etc]) LDFLAGS=$ORIG_LDFLAGS ]) +# Check for existence of the function to detect the CPU socket ID (for multi-socket systems) +AC_COMPILE_IFELSE( + [AC_LANG_SOURCE([[ + int main(){ + unsigned long a,d,c; + __asm__ volatile("rdtscp" : "=a" (a), "=d" (d), "=c" (c)); + return 0; + } + ]])], + AC_DEFINE([HAVE_RDTSCP_ASM], [], [Has ASM to detect CPU socket ID])) + +AC_COMPILE_IFELSE( + [AC_LANG_SOURCE([[ + #define _GNU_SOURCE + #include + #include + unsigned long GetProcessorAndCore(int *chip, int *core){ + return syscall(SYS_getcpu, core, chip, NULL); + } + int main(){ + } + ]])], + AC_DEFINE([HAVE_GETCPU_SYSCALL], [], [Has syscall to detect CPU socket ID])) + # Enable building "IOR", in all capitals AC_ARG_ENABLE([caps], diff --git a/doc/USER_GUIDE b/doc/USER_GUIDE index 3d6b4e4..c68aeca 100755 --- a/doc/USER_GUIDE +++ b/doc/USER_GUIDE @@ -47,7 +47,7 @@ Two ways to run IOR: E.g., to execute: IOR -W -f script This defaults all tests in 'script' to use write data checking. - * The Command line supports to specify additional parameters for the choosen API. + * The Command line supports to specify additional parameters for the chosen API. For example, username and password for the storage. Available options are listed in the help text after selecting the API when running with -h. For example, 'IOR -a DUMMY -h' shows the supported options for the DUMMY backend. @@ -164,7 +164,7 @@ GENERAL: * numTasks - number of tasks that should participate in the test [0] - NOTE: 0 denotes all tasks + NOTE: -1 denotes all tasks * interTestDelay - this is the time in seconds to delay before beginning a write or read in a series of tests [0] @@ -361,7 +361,7 @@ GPFS-SPECIFIC: * gpfsReleaseToken - immediately after opening or creating file, release all locks. Might help mitigate lock-revocation - traffic when many proceses write/read to same file. + traffic when many processes write/read to same file. BeeGFS-SPECIFIC (POSIX only): ================ @@ -499,7 +499,7 @@ zip, gzip, and bzip. 3) bzip2: For bziped files a transfer size of 1k is insufficient (~50% compressed). To avoid compression a transfer size of greater than the bzip block size is required - (default = 900KB). I suggest a transfer size of greather than 1MB to avoid bzip2 compression. + (default = 900KB). I suggest a transfer size of greater than 1MB to avoid bzip2 compression. Be aware of the block size your compression algorithm will look at, and adjust the transfer size accordingly. @@ -660,7 +660,7 @@ HOW DO I USE HINTS? 'setenv IOR_HINT__MPI__ ' -HOW DO I EXPLICITY SET THE FILE DATA SIGNATURE? +HOW DO I EXPLICITLY SET THE FILE DATA SIGNATURE? The data signature for a transfer contains the MPI task number, transfer- buffer offset, and also timestamp for the start of iteration. 
As IOR works diff --git a/doc/mdtest.1 b/doc/mdtest.1 index 3cfc082..27d4d7b 100644 --- a/doc/mdtest.1 +++ b/doc/mdtest.1 @@ -28,7 +28,7 @@ Use ``collective creates'', meaning task 0 does all the creates. Only perform the create phase of the tests. .TP .I "-d" testdir[@testdir2] -The directory in which the tests will run. For multiple pathes, must use fully-qualified pathnames. +The directory in which the tests will run. For multiple paths, must use fully-qualified pathnames. [default: working directory of mdtest]. .TP .I "-D" @@ -78,6 +78,9 @@ Stride # between neighbor tasks for file/dir stat, 0 = local .I "-p" seconds Pre-iteration delay (in seconds). .TP +.I "-P" +Print both the file creation rate and the elapsed time. +.TP .I "-r" Only perform the remove phase of the tests. .TP @@ -121,6 +124,19 @@ Set verbosity value Set the number of Bytes to write to each file after it is created [default: 0]. .TP +.I "-W" seconds +Specify the stonewall time in seconds. When the stonewall timer has elapsed, +the rank with the highest number of creates sets +.I number_of_items +for the other ranks, so that all ranks create the same number of files. +.TP +.I "-x" filename +Filename to use for stonewall synchronization between processes. +.TP +.I "Y" +Call the sync command after each phase, which is included in the +timing. Note that it causes all IO to be flushed from the nodes. +.TP .I "-z" tree_depth The depth of the hierarchical directory tree [default: 0]. .SH EXAMPLES diff --git a/doc/sphinx/devDoc/release.rst b/doc/sphinx/devDoc/release.rst index 1e39a35..7a49687 100644 --- a/doc/sphinx/devDoc/release.rst +++ b/doc/sphinx/devDoc/release.rst @@ -1,38 +1,156 @@ Release Process =============== -To build a new version of IOR:: +General release process +----------------------- + +The versioning for IOR is encoded in the ``META`` file in the root of the +repository. The nomenclature is + +* 3.2.0 designates a proper release +* 3.2.0rc1 designates the first release candidate in preparation for the 3.2.0 + release +* 3.2.0+dev indicates development towards 3.2.0 prior to a feature freeze +* 3.2.0rc1+dev indicates development towards 3.2.0's first release candidate + after a feature freeze + +Building a release of IOR +------------------------- + +To build a new version of IOR, e.g., from the 3.2 release branch:: $ docker run -it ubuntu bash $ apt-get update $ apt-get install -y git automake autoconf make gcc mpich - $ git clone -b rc https://github.com/hpc/ior + $ git clone -b 3.2 https://github.com/hpc/ior $ cd ior $ ./travis-build.sh -To create a new release candidate from RC, +Alternatively you can build an an arbitrary branch in Docker using a bind mount. +This will be wrapped into a build-release Dockerfile in the future:: -1. Disable the ``check-news`` option in ``AM_INIT_AUTOMAKE`` inside configure.ac -2. Append "rcX" to the ``Version:`` field in META where X is the release - candidate number -3. Build a release package as described above + $ docker run -it --mount type=bind,source=$PWD,target=/ior ubuntu + $ apt-get update + $ apt-get install -y git automake autoconf make gcc mpich + $ ./travis-build.sh -To create a new minor release of IOR, +Feature freezing for a new release +---------------------------------- -1. Build the rc branch as described above -2. Create a release on GitHub which creates the appropriate tag -3. Upload the source distributions generated by travis-build.sh +1. Branch `major.minor` from the commit at which the feature freeze should take + effect. +2. 
Append the "rc1+dev" designator to the Version field in the META file, and + update the NEWS file to have this new version as the topmost heading +3. Commit and push this new branch +2. Update the ``Version:`` field in META `of the main branch` to be the `next` + release version, not the one whose features have just been frozen, and update + the NEWS file as you did in step 2. -To create a micro branch of IOR (e.g., if a release needs a hotfix), +For example, to feature-freeze for version 3.2:: -1. Check out the relevant release tagged in the rc branch (e.g., ``3.2.0``) -2. Create a branch with the major.minor name (e.g., ``3.2``) from that tag -3. Update the ``Version:`` in META -4. Apply hotfix(es) to that major.minor branch -5. Create the major.minor.micro release on GitHub + $ git checkout 11469ac + $ git checkout -B 3.2 + $ vim META # update the ``Version:`` field to 3.2.0rc1+dev + $ vim NEWS # update the topmost version number to 3.2.0rc1+dev + $ git add NEWS META + $ git commit -m "Update version for feature freeze" + $ git push upstream 3.2 + $ git checkout main + $ vim META # update the ``Version:`` field to 3.3.0+dev + $ vim NEWS # update the topmost version number to 3.3.0+dev + $ git add NEWS META + $ git commit -m "Update version number" + $ git push upstream main -To initiate a feature freeze, +Creating a new release candidate +-------------------------------- -1. Merge the master branch into the rc branch -2. Update the ``Version:`` field in META `of the master branch` to be the `next` - release version, not the one whose features have just been frozen +1. Check out the appropriate commit from the `major.minor` branch +2. Disable the ``check-news`` option in ``AM_INIT_AUTOMAKE`` inside configure.ac +3. Remove the "+dev" designator from the Version field in META +4. Build a release package as described above +5. Revert the change from #2 (it was just required to build a non-release tarball) +5. Tag and commit the updated META so one can easily recompile this rc from git +6. Update the "rcX" number and add "+dev" back to the ``Version:`` field in + META. This will allow anyone playing with the tip of this branch to see that + this the state is in preparation of the next rc, but is unreleased because of + +dev. +7. Commit + +For example to release 3.2.0rc1:: + + $ git checkout 3.2 + $ # edit configure.ac and remove the check-news option + $ # remove +dev from the Version field in META (Version: 3.2.0rc1) + $ # build + $ git checkout configure.ac + $ git add META + $ git commit -m "Release candidate for 3.2.0rc1" + $ git tag 3.2.0rc1 + $ # uptick rc number and re-add +dev to META (Version: 3.2.0rc2+dev) + $ git add META # should contain Version: 3.2.0rc2+dev + $ git commit -m "Uptick version after release" + $ git push && git push --tags + +Applying patches to a new microrelease +-------------------------------------- + +If a released version 3.2.0 has bugs, cherry-pick the fixes from main into the +3.2 branch:: + + $ git checkout 3.2 + $ git cherry-pick cb40c99 + $ git cherry-pick aafdf89 + $ git push upstream 3.2 + +Once you've accumulated enough bugs, move on to issuing a new release below. + +Creating a new release +---------------------- + +This is a two-phase process because we need to ensure that NEWS in main +contains a full history of releases, and we achieve this by always merging +changes from main into a release branch. + +1. Check out main +2. Ensure that the latest release notes for this release are reflected in NEWS +3. 
Commit that to main + +Then work on the release branch: + +1. Check out the relevant `major.minor` branch +2. Remove any "rcX" and "+dev" from the Version field in META +3. Cherry-pick your NEWS update commit from main into this release branch. + Resolve conflicts and get rid of news that reflect future releases. +4. Build a release package as described above +5. Tag and commit the updated NEWS and META so one can easily recompile this + release from git +6. Update the Version field to the next rc version and re-add "+dev" +7. Commit +8. Create the major.minor.micro release on GitHub from the associated tag + +For example to release 3.2.0:: + + $ git checkout main + $ vim NEWS # add release notes from ``git log --oneline 3.2.0rc1..`` + $ git commit + +Let's say the above generated commit abc345e on main. Then:: + + $ git checkout 3.2 + $ vim META # 3.2.0rc2+dev -> 3.2.0 + $ git cherry-pick abc345e + $ vim NEWS # resolve conflicts, delete stuff for e.g., 3.4 + $ # build + $ git add NEWS META + $ git commit -m "Release v3.2.0" + $ git tag 3.2.0 + $ vim META # 3.2.0 -> 3.2.1rc1+dev + # vim NEWS # add a placeholder for 3.2.1rc2+dev so automake is happy + $ git add NEWS META + $ git commit -m "Uptick version after release" + +Then push your main and your release branch and also push tags:: + + $ git checkout main && git push && git push --tags + $ git checkout 3.2 && git push && git push --tags diff --git a/doc/sphinx/userDoc/faq.rst b/doc/sphinx/userDoc/faq.rst index 0e9a8a9..df07cbb 100644 --- a/doc/sphinx/userDoc/faq.rst +++ b/doc/sphinx/userDoc/faq.rst @@ -146,7 +146,7 @@ HOW DO I USE HINTS? 'setenv IOR_HINT__MPI__ ' -HOW DO I EXPLICITY SET THE FILE DATA SIGNATURE? +HOW DO I EXPLICITLY SET THE FILE DATA SIGNATURE? The data signature for a transfer contains the MPI task number, transfer- buffer offset, and also timestamp for the start of iteration. As IOR works diff --git a/doc/sphinx/userDoc/install.rst b/doc/sphinx/userDoc/install.rst index 4bfa684..9b000c8 100644 --- a/doc/sphinx/userDoc/install.rst +++ b/doc/sphinx/userDoc/install.rst @@ -6,19 +6,19 @@ Install Building -------- -0. If "configure" is missing from the top level directory, you +0. If ``configure`` is missing from the top level directory, you probably retrieved this code directly from the repository. - Run "./bootstrap". + Run ``./bootstrap``. If your versions of the autotools are not new enough to run this script, download and official tarball in which the configure script is already provided. -1. Run "./configure" +1. Run ``./configure`` - See "./configure --help" for configuration options. + See ``./configure --help`` for configuration options. -2. Run "make" +2. Run ``make`` -3. Optionally, run "make install". The installation prefix - can be changed as an option to the "configure" script. +3. Optionally, run ``make install``. The installation prefix + can be changed as an option to the ``configure`` script. diff --git a/doc/sphinx/userDoc/options.rst b/doc/sphinx/userDoc/options.rst index 31240f0..6751749 100644 --- a/doc/sphinx/userDoc/options.rst +++ b/doc/sphinx/userDoc/options.rst @@ -302,7 +302,7 @@ GPFS-SPECIFIC * ``gpfsReleaseToken`` - release all locks immediately after opening or creating file. Might help mitigate lock-revocation traffic when many - proceses write/read to same file. (default: 0) + processes write/read to same file. (default: 0) Verbosity levels ---------------- @@ -338,7 +338,7 @@ bzip. 3) bzip2: For bziped files a transfer size of 1k is insufficient (~50% compressed). 
To avoid compression a transfer size of greater than the bzip block size is required - (default = 900KB). I suggest a transfer size of greather than 1MB to avoid bzip2 compression. + (default = 900KB). I suggest a transfer size of greater than 1MB to avoid bzip2 compression. Be aware of the block size your compression algorithm will look at, and adjust the transfer size accordingly. diff --git a/doc/sphinx/userDoc/tutorial.rst b/doc/sphinx/userDoc/tutorial.rst index 5fa6814..70d4aa3 100644 --- a/doc/sphinx/userDoc/tutorial.rst +++ b/doc/sphinx/userDoc/tutorial.rst @@ -4,30 +4,31 @@ First Steps with IOR ==================== This is a short tutorial for the basic usage of IOR and some tips on how to use -IOR to handel caching effects as these are very likely to affect your +IOR to handle caching effects as these are very likely to affect your measurements. Running IOR ----------- There are two ways of running IOR: - 1) Command line with arguments -- executable followed by command line - options. + 1) Command line with arguments -- executable followed by command line options. - :: - $ ./IOR -w -r -o filename + .. code-block:: shell - This performs a write and a read to the file 'filename'. + $ ./IOR -w -r -o filename + + This performs a write and a read to the file 'filename'. 2) Command line with scripts -- any arguments on the command line will - establish the default for the test run, but a script may be used in - conjunction with this for varying specific tests during an execution of - the code. Only arguments before the script will be used! + establish the default for the test run, but a script may be used in + conjunction with this for varying specific tests during an execution of + the code. Only arguments before the script will be used! - :: - $ ./IOR -W -f script + .. code-block:: shell - This defaults all tests in 'script' to use write data checking. + $ ./IOR -W -f script + + This defaults all tests in 'script' to use write data checking. In this tutorial the first one is used as it is much easier to toy around with @@ -40,10 +41,10 @@ Getting Started with IOR IOR writes data sequentially with the following parameters: - * blockSize (-b) - * transferSize (-t) - * segmentCount (-s) - * numTasks (-n) + * ``blockSize`` (``-b``) + * ``transferSize`` (``-t``) + * ``segmentCount`` (``-s``) + * ``numTasks`` (``-n``) which are best illustrated with a diagram: @@ -52,30 +53,34 @@ which are best illustrated with a diagram: These four parameters are all you need to get started with IOR. However, naively running IOR usually gives disappointing results. For example, if we run -a four-node IOR test that writes a total of 16 GiB:: +a four-node IOR test that writes a total of 16 GiB: - $ mpirun -n 64 ./ior -t 1m -b 16m -s 16 - ... - access bw(MiB/s) block(KiB) xfer(KiB) open(s) wr/rd(s) close(s) total(s) iter - ------ --------- ---------- --------- -------- -------- -------- -------- ---- - write 427.36 16384 1024.00 0.107961 38.34 32.48 38.34 2 - read 239.08 16384 1024.00 0.005789 68.53 65.53 68.53 2 - remove - - - - - - 0.534400 2 +.. code-block:: shell + + $ mpirun -n 64 ./ior -t 1m -b 16m -s 16 + ... 
+ access bw(MiB/s) block(KiB) xfer(KiB) open(s) wr/rd(s) close(s) total(s) iter + ------ --------- ---------- --------- -------- -------- -------- -------- ---- + write 427.36 16384 1024.00 0.107961 38.34 32.48 38.34 2 + read 239.08 16384 1024.00 0.005789 68.53 65.53 68.53 2 + remove - - - - - - 0.534400 2 we can only get a couple hundred megabytes per second out of a Lustre file system that should be capable of a lot more. Switching from writing to a single-shared file to one file per process using the --F (filePerProcess=1) option changes the performance dramatically:: +``-F`` (``filePerProcess=1``) option changes the performance dramatically: - $ mpirun -n 64 ./ior -t 1m -b 16m -s 16 -F - ... - access bw(MiB/s) block(KiB) xfer(KiB) open(s) wr/rd(s) close(s) total(s) iter - ------ --------- ---------- --------- -------- -------- -------- -------- ---- - write 33645 16384 1024.00 0.007693 0.486249 0.195494 0.486972 1 - read 149473 16384 1024.00 0.004936 0.108627 0.016479 0.109612 1 - remove - - - - - - 6.08 1 +.. code-block:: shell + + $ mpirun -n 64 ./ior -t 1m -b 16m -s 16 -F + ... + access bw(MiB/s) block(KiB) xfer(KiB) open(s) wr/rd(s) close(s) total(s) iter + ------ --------- ---------- --------- -------- -------- -------- -------- ---- + write 33645 16384 1024.00 0.007693 0.486249 0.195494 0.486972 1 + read 149473 16384 1024.00 0.004936 0.108627 0.016479 0.109612 1 + remove - - - - - - 6.08 1 This is in large part because letting each MPI process work on its own file cuts @@ -123,7 +128,7 @@ There are a couple of ways to measure the read performance of the underlying Lustre file system. The most crude way is to simply write more data than will fit into the total page cache so that by the time the write phase has completed, the beginning of the file has already been evicted from cache. For example, -increasing the number of segments (-s) to write more data reveals the point at +increasing the number of segments (``-s``) to write more data reveals the point at which the nodes' page cache on my test system runs over very clearly: .. image:: tutorial-ior-overflowing-cache.png @@ -142,17 +147,19 @@ written by node N-1. Since page cache is not shared between compute nodes, shifting tasks this way ensures that each MPI process is reading data it did not write. -IOR provides the -C option (reorderTasks) to do this, and it forces each MPI +IOR provides the ``-C`` option (``reorderTasks``) to do this, and it forces each MPI process to read the data written by its neighboring node. Running IOR with -this option gives much more credible read performance:: +this option gives much more credible read performance: - $ mpirun -n 64 ./ior -t 1m -b 16m -s 16 -F -C - ... - access bw(MiB/s) block(KiB) xfer(KiB) open(s) wr/rd(s) close(s) total(s) iter - ------ --------- ---------- --------- -------- -------- -------- -------- ---- - write 41326 16384 1024.00 0.005756 0.395859 0.095360 0.396453 0 - read 3310.00 16384 1024.00 0.011786 4.95 4.20 4.95 1 - remove - - - - - - 0.237291 1 +.. code-block:: shell + + $ mpirun -n 64 ./ior -t 1m -b 16m -s 16 -F -C + ... 
+ access bw(MiB/s) block(KiB) xfer(KiB) open(s) wr/rd(s) close(s) total(s) iter + ------ --------- ---------- --------- -------- -------- -------- -------- ---- + write 41326 16384 1024.00 0.005756 0.395859 0.095360 0.396453 0 + read 3310.00 16384 1024.00 0.011786 4.95 4.20 4.95 1 + remove - - - - - - 0.237291 1 But now it should seem obvious that the write performance is also ridiculously @@ -166,16 +173,18 @@ pages we just wrote to flush out to Lustre. Including the time it takes for fsync() to finish gives us a measure of how long it takes for our data to write to the page cache and for the page cache to write back to Lustre. -IOR provides another convenient option, -e (fsync), to do just this. And, once -again, using this option changes our performance measurement quite a bit:: +IOR provides another convenient option, ``-e`` (fsync), to do just this. And, once +again, using this option changes our performance measurement quite a bit: - $ mpirun -n 64 ./ior -t 1m -b 16m -s 16 -F -C -e - ... - access bw(MiB/s) block(KiB) xfer(KiB) open(s) wr/rd(s) close(s) total(s) iter - ------ --------- ---------- --------- -------- -------- -------- -------- ---- - write 2937.89 16384 1024.00 0.011841 5.56 4.93 5.58 0 - read 2712.55 16384 1024.00 0.005214 6.04 5.08 6.04 3 - remove - - - - - - 0.037706 0 +.. code-block:: shell + + $ mpirun -n 64 ./ior -t 1m -b 16m -s 16 -F -C -e + ... + access bw(MiB/s) block(KiB) xfer(KiB) open(s) wr/rd(s) close(s) total(s) iter + ------ --------- ---------- --------- -------- -------- -------- -------- ---- + write 2937.89 16384 1024.00 0.011841 5.56 4.93 5.58 0 + read 2712.55 16384 1024.00 0.005214 6.04 5.08 6.04 3 + remove - - - - - - 0.037706 0 and we finally have a believable bandwidth measurement for our file system. @@ -192,16 +201,17 @@ the best choice. There are several ways in which we can get clever and defeat page cache in a more general sense to get meaningful performance numbers. When measuring write performance, bypassing page cache is actually quite simple; -opening a file with the O_DIRECT flag going directly to disk. In addition, -the fsync() call can be inserted into applications, as is done with IOR's -e +opening a file with the ``O_DIRECT`` flag going directly to disk. In addition, +the ``fsync()`` call can be inserted into applications, as is done with IOR's ``-e`` option. Measuring read performance is a lot trickier. If you are fortunate enough to have root access on a test system, you can force the Linux kernel to empty out its page cache by doing -:: - # echo 1 > /proc/sys/vm/drop_caches +.. code-block:: shell + + # echo 1 > /proc/sys/vm/drop_caches and in fact, this is often good practice before running any benchmark (e.g., Linpack) because it ensures that you aren't losing performance to the @@ -210,23 +220,25 @@ memory for its own use. Unfortunately, many of us do not have root on our systems, so we have to get even more clever. As it turns out, there is a way to pass a hint to the kernel -that a file is no longer needed in page cache:: +that a file is no longer needed in page cache: - #define _XOPEN_SOURCE 600 - #include - #include - int main(int argc, char *argv[]) { - int fd; - fd = open(argv[1], O_RDONLY); - fdatasync(fd); - posix_fadvise(fd, 0,0,POSIX_FADV_DONTNEED); - close(fd); - return 0; - } +.. 
code-block:: c -The effect of passing POSIX_FADV_DONTNEED using posix_fadvise() is usually that + #define _XOPEN_SOURCE 600 + #include + #include + int main(int argc, char *argv[]) { + int fd; + fd = open(argv[1], O_RDONLY); + fdatasync(fd); + posix_fadvise(fd, 0,0,POSIX_FADV_DONTNEED); + close(fd); + return 0; + } + +The effect of passing POSIX_FADV_DONTNEED using ``posix_fadvise()`` is usually that all pages belonging to that file are evicted from page cache in Linux. However, -this is just a hint--not a guarantee--and the kernel evicts these pages +this is just a hint --not a guarantee-- and the kernel evicts these pages asynchronously, so it may take a second or two for pages to actually leave page cache. Fortunately, Linux also provides a way to probe pages in a file to see if they are resident in memory. diff --git a/src/Makefile.am b/src/Makefile.am index 567d9ce..037433c 100755 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,20 +1,25 @@ SUBDIRS = . test -bin_PROGRAMS = ior mdtest +bin_PROGRAMS = ior mdtest md-workbench if USE_CAPS -bin_PROGRAMS += IOR MDTEST +bin_PROGRAMS += IOR MDTEST MD-WORKBENCH endif -noinst_HEADERS = ior.h utilities.h parse_options.h aiori.h iordef.h ior-internal.h option.h mdtest.h +noinst_HEADERS = ior.h utilities.h parse_options.h aiori.h iordef.h ior-internal.h option.h mdtest.h aiori-debug.h aiori-POSIX.h md-workbench.h lib_LIBRARIES = libaiori.a -libaiori_a_SOURCES = ior.c mdtest.c utilities.c parse_options.c ior-output.c option.c +libaiori_a_SOURCES = ior.c mdtest.c utilities.c parse_options.c ior-output.c option.c md-workbench.c extraSOURCES = aiori.c aiori-DUMMY.c extraLDADD = extraLDFLAGS = extraCPPFLAGS = +md_workbench_SOURCES = md-workbench-main.c +md_workbench_LDFLAGS = +md_workbench_LDADD = libaiori.a +md_workbench_CPPFLAGS = + ior_SOURCES = ior-main.c ior_LDFLAGS = ior_LDADD = libaiori.a @@ -36,6 +41,14 @@ extraLDFLAGS += -L/opt/hadoop-2.2.0/lib/native extraLDADD += -lhdfs endif +if HAVE_CUDA +extraLDADD += -lcudart +endif + +if HAVE_GPU_DIRECT +extraLDADD += -lcufile +endif + if USE_HDF5_AIORI extraSOURCES += aiori-HDF5.c extraLDADD += -lhdf5 -lz @@ -65,6 +78,11 @@ if USE_POSIX_AIORI extraSOURCES += aiori-POSIX.c endif +if USE_AIO_AIORI +extraSOURCES += aiori-aio.c +extraLDADD += -laio +endif + if USE_PMDK_AIORI extraSOURCES += aiori-PMDK.c extraLDADD += -lpmem @@ -82,7 +100,8 @@ endif if USE_DAOS_AIORI -extraSOURCES += aiori-DAOS.c aiori-DFS.c +extraSOURCES += aiori-DFS.c +extraLDADD += -lgurt -ldaos_common -ldaos -ldfs -luuid endif if USE_GFARM_AIORI @@ -90,8 +109,8 @@ extraSOURCES += aiori-Gfarm.c extraLDADD += -lgfarm endif -if USE_S3_AIORI -extraSOURCES += aiori-S3.c +if USE_S3_4C_AIORI +extraSOURCES += aiori-S3-4c.c if AWS4C_DIR extraCPPFLAGS += $(AWS4C_CPPFLAGS) extraLDFLAGS += $(AWS4C_LDFLAGS) @@ -100,6 +119,12 @@ extraLDADD += -lcurl extraLDADD += -lxml2 extraLDADD += -laws4c extraLDADD += -laws4c_extra +extraLDADD += -lcrypto +endif + +if USE_S3_LIBS3_AIORI +extraSOURCES += aiori-S3-libs3.c +extraLDADD += -ls3 endif if WITH_LUSTRE @@ -116,6 +141,16 @@ mdtest_LDFLAGS += $(extraLDFLAGS) mdtest_LDADD += $(extraLDADD) mdtest_CPPFLAGS += $(extraCPPFLAGS) +md_workbench_SOURCES += $(extraSOURCES) +md_workbench_LDFLAGS += $(extraLDFLAGS) +md_workbench_LDADD += $(extraLDADD) +md_workbench_CPPFLAGS += $(extraCPPFLAGS) + +MD_WORKBENCH_SOURCES = $(md_workbench_SOURCES) +MD_WORKBENCH_LDFLAGS = $(md_workbench_LDFLAGS) +MD_WORKBENCH_LDADD = $(md_workbench_LDADD) +MD_WORKBENCH_CPPFLAGS = $(md_workbench_CPPFLAGS) + IOR_SOURCES = $(ior_SOURCES) 
IOR_LDFLAGS = $(ior_LDFLAGS) IOR_LDADD = $(ior_LDADD) @@ -128,3 +163,10 @@ MDTEST_CPPFLAGS = $(mdtest_CPPFLAGS) libaiori_a_SOURCES += $(extraSOURCES) libaiori_a_CPPFLAGS = $(extraCPPFLAGS) + +# Generate a config file with the build flags to allow the reuse of library +.PHONY: build.conf +all-local: build.conf +build.conf: + @echo LDFLAGS=$(LDFLAGS) $(extraLDFLAGS) $(extraLDADD) $(LIBS) > build.conf + @echo CFLAGS=$(CFLAGS) $(extraCPPFLAGS) >> build.conf diff --git a/src/aiori-DAOS.c b/src/aiori-DAOS.c deleted file mode 100644 index 969507c..0000000 --- a/src/aiori-DAOS.c +++ /dev/null @@ -1,570 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - */ -/* - * Copyright (C) 2018-2020 Intel Corporation - * See the file COPYRIGHT for a complete copyright notice and license. - */ - -/* - * This file implements the abstract I/O interface for DAOS Array API. - */ - -#define _BSD_SOURCE - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "aiori.h" -#include "utilities.h" -#include "iordef.h" - -/************************** O P T I O N S *****************************/ -typedef struct { - char *pool; - char *svcl; - char *group; - char *cont; - int chunk_size; - int destroy; - char *oclass; -} DAOS_options_t; - -static option_help * DAOS_options(aiori_mod_opt_t ** init_backend_options, - aiori_mod_opt_t * init_values){ - DAOS_options_t * o = malloc(sizeof(DAOS_options_t)); - - if (init_values != NULL) { - memcpy(o, init_values, sizeof(DAOS_options_t)); - } else { - memset(o, 0, sizeof(DAOS_options_t)); - /* initialize the options properly */ - o->chunk_size = 1048576; - } - - *init_backend_options = (aiori_mod_opt_t *) o; - - option_help h [] = { - {0, "daos.pool", "pool uuid", OPTION_OPTIONAL_ARGUMENT, 's', &o->pool}, - {0, "daos.svcl", "pool SVCL", OPTION_OPTIONAL_ARGUMENT, 's', &o->svcl}, - {0, "daos.group", "server group", OPTION_OPTIONAL_ARGUMENT, 's', &o->group}, - {0, "daos.cont", "container uuid", OPTION_OPTIONAL_ARGUMENT, 's', &o->cont}, - {0, "daos.chunk_size", "chunk size", OPTION_OPTIONAL_ARGUMENT, 'd', &o->chunk_size}, - {0, "daos.destroy", "Destroy Container", OPTION_FLAG, 'd', &o->destroy}, - {0, "daos.oclass", "object class", OPTION_OPTIONAL_ARGUMENT, 's', &o->oclass}, - LAST_OPTION - }; - - option_help * help = malloc(sizeof(h)); - memcpy(help, h, sizeof(h)); - return help; -} - -/**************************** P R O T O T Y P E S *****************************/ - -static void DAOS_Init(aiori_mod_opt_t *); -static void DAOS_Fini(aiori_mod_opt_t *); -static aiori_fd_t *DAOS_Create(char *, int, aiori_mod_opt_t *); -static aiori_fd_t *DAOS_Open(char *, int, aiori_mod_opt_t *); -static int DAOS_Access(const char *, int, aiori_mod_opt_t *); -static IOR_offset_t DAOS_Xfer(int, aiori_fd_t *, IOR_size_t *, IOR_offset_t, - IOR_offset_t, aiori_mod_opt_t *); -static void DAOS_Close(aiori_fd_t *, aiori_mod_opt_t *); -static void DAOS_Delete(char *, aiori_mod_opt_t *); -static char* DAOS_GetVersion(); -static void DAOS_Fsync(aiori_fd_t *, aiori_mod_opt_t *); -static IOR_offset_t DAOS_GetFileSize(aiori_mod_opt_t *, MPI_Comm, char *); -static option_help * DAOS_options(); -static void DAOS_init_xfer_options(aiori_xfer_hint_t *); -static int DAOS_check_params(aiori_mod_opt_t *); - -/************************** D E C L A R A T I O N S ***************************/ - -ior_aiori_t daos_aiori = { - .name = "DAOS", - .initialize = 
DAOS_Init, - .finalize = DAOS_Fini, - .create = DAOS_Create, - .open = DAOS_Open, - .access = DAOS_Access, - .xfer = DAOS_Xfer, - .close = DAOS_Close, - .delete = DAOS_Delete, - .get_version = DAOS_GetVersion, - .xfer_hints = DAOS_init_xfer_options, - .fsync = DAOS_Fsync, - .get_file_size = DAOS_GetFileSize, - .statfs = aiori_posix_statfs, - .mkdir = aiori_posix_mkdir, - .rmdir = aiori_posix_rmdir, - .stat = aiori_posix_stat, - .get_options = DAOS_options, - .xfer_hints = DAOS_init_xfer_options, - .check_params = DAOS_check_params, - .enable_mdtest = false, -}; - -#define IOR_DAOS_MUR_SEED 0xDEAD10CC - -enum handleType { - POOL_HANDLE, - CONT_HANDLE, - ARRAY_HANDLE -}; - -static daos_handle_t poh; -static daos_handle_t coh; -static daos_handle_t aoh; -static daos_oclass_id_t objectClass = OC_SX; -static bool daos_initialized = false; - -/***************************** F U N C T I O N S ******************************/ - -/* For DAOS methods. */ -#define DCHECK(rc, format, ...) \ -do { \ - int _rc = (rc); \ - \ - if (_rc < 0) { \ - fprintf(stderr, "ior ERROR (%s:%d): %d: %d: " \ - format"\n", __FILE__, __LINE__, rank, _rc, \ - ##__VA_ARGS__); \ - fflush(stdout); \ - MPI_Abort(MPI_COMM_WORLD, -1); \ - } \ -} while (0) - -#define INFO(level, format, ...) \ -do { \ - if (verbose >= level) \ - printf("[%d] "format"\n", rank, ##__VA_ARGS__); \ -} while (0) - -/* For generic errors like invalid command line options. */ -#define GERR(format, ...) \ -do { \ - fprintf(stderr, format"\n", ##__VA_ARGS__); \ - MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, -1), "MPI_Abort() error"); \ -} while (0) - -static aiori_xfer_hint_t * hints = NULL; - -void DAOS_init_xfer_options(aiori_xfer_hint_t * params) -{ - hints = params; -} - -static int DAOS_check_params(aiori_mod_opt_t * options){ - DAOS_options_t *o = (DAOS_options_t *) options; - - if (o->pool == NULL || o->svcl == NULL || o->cont == NULL) - ERR("Invalid pool or container options\n"); - - return 0; -} - -/* Distribute process 0's pool or container handle to others. */ -static void -HandleDistribute(daos_handle_t *handle, enum handleType type) -{ - d_iov_t global; - int rc; - - global.iov_buf = NULL; - global.iov_buf_len = 0; - global.iov_len = 0; - - if (rank == 0) { - /* Get the global handle size. 
*/ - if (type == POOL_HANDLE) - rc = daos_pool_local2global(*handle, &global); - else if (type == CONT_HANDLE) - rc = daos_cont_local2global(*handle, &global); - else - rc = daos_array_local2global(*handle, &global); - DCHECK(rc, "Failed to get global handle size"); - } - - MPI_CHECK(MPI_Bcast(&global.iov_buf_len, 1, MPI_UINT64_T, 0, - MPI_COMM_WORLD), - "Failed to bcast global handle buffer size"); - - global.iov_len = global.iov_buf_len; - global.iov_buf = malloc(global.iov_buf_len); - if (global.iov_buf == NULL) - ERR("Failed to allocate global handle buffer"); - - if (rank == 0) { - if (type == POOL_HANDLE) - rc = daos_pool_local2global(*handle, &global); - else if (type == CONT_HANDLE) - rc = daos_cont_local2global(*handle, &global); - else - rc = daos_array_local2global(*handle, &global); - DCHECK(rc, "Failed to create global handle"); - } - - MPI_CHECK(MPI_Bcast(global.iov_buf, global.iov_buf_len, MPI_BYTE, 0, - MPI_COMM_WORLD), - "Failed to bcast global pool handle"); - - if (rank != 0) { - if (type == POOL_HANDLE) - rc = daos_pool_global2local(global, handle); - else if (type == CONT_HANDLE) - rc = daos_cont_global2local(poh, global, handle); - else - rc = daos_array_global2local(coh, global, 0, handle); - DCHECK(rc, "Failed to get local handle"); - } - - free(global.iov_buf); -} - -static void -DAOS_Init(aiori_mod_opt_t * options) -{ - DAOS_options_t *o = (DAOS_options_t *)options; - int rc; - - if (daos_initialized) - return; - - if (o->pool == NULL || o->svcl == NULL || o->cont == NULL) - return; - - if (o->oclass) { - objectClass = daos_oclass_name2id(o->oclass); - if (objectClass == OC_UNKNOWN) - GERR("Invalid DAOS Object class %s\n", o->oclass); - } - - rc = daos_init(); - if (rc) - DCHECK(rc, "Failed to initialize daos"); - - if (rank == 0) { - uuid_t uuid; - d_rank_list_t *svcl = NULL; - static daos_pool_info_t po_info; - static daos_cont_info_t co_info; - - INFO(VERBOSE_1, "Connecting to pool %s", o->pool); - - rc = uuid_parse(o->pool, uuid); - DCHECK(rc, "Failed to parse 'pool': %s", o->pool); - - svcl = daos_rank_list_parse(o->svcl, ":"); - if (svcl == NULL) - ERR("Failed to allocate svcl"); - - rc = daos_pool_connect(uuid, o->group, svcl, DAOS_PC_RW, - &poh, &po_info, NULL); - d_rank_list_free(svcl); - DCHECK(rc, "Failed to connect to pool %s", o->pool); - - INFO(VERBOSE_1, "Create/Open Container %s", o->cont); - - uuid_clear(uuid); - rc = uuid_parse(o->cont, uuid); - DCHECK(rc, "Failed to parse 'cont': %s", o->cont); - - rc = daos_cont_open(poh, uuid, DAOS_COO_RW, &coh, &co_info, - NULL); - /* If NOEXIST we create it */ - if (rc == -DER_NONEXIST) { - INFO(VERBOSE_2, "Creating DAOS Container...\n"); - rc = daos_cont_create(poh, uuid, NULL, NULL); - if (rc == 0) - rc = daos_cont_open(poh, uuid, DAOS_COO_RW, - &coh, &co_info, NULL); - } - DCHECK(rc, "Failed to create container"); - } - - HandleDistribute(&poh, POOL_HANDLE); - HandleDistribute(&coh, CONT_HANDLE); - aoh.cookie = 0; - - daos_initialized = true; -} - -static void -DAOS_Fini(aiori_mod_opt_t *options) -{ - DAOS_options_t *o = (DAOS_options_t *)options; - int rc; - - if (!daos_initialized) - return; - - MPI_Barrier(MPI_COMM_WORLD); - rc = daos_cont_close(coh, NULL); - if (rc) { - DCHECK(rc, "Failed to close container %s (%d)", o->cont, rc); - MPI_Abort(MPI_COMM_WORLD, -1); - } - MPI_Barrier(MPI_COMM_WORLD); - - if (o->destroy) { - if (rank == 0) { - uuid_t uuid; - double t1, t2; - - INFO(VERBOSE_1, "Destroying DAOS Container %s", o->cont); - uuid_parse(o->cont, uuid); - t1 = MPI_Wtime(); - rc = 
daos_cont_destroy(poh, uuid, 1, NULL); - t2 = MPI_Wtime(); - if (rc == 0) - INFO(VERBOSE_1, "Container Destroy time = %f secs", t2-t1); - } - - MPI_Bcast(&rc, 1, MPI_INT, 0, MPI_COMM_WORLD); - if (rc) { - if (rank == 0) - DCHECK(rc, "Failed to destroy container %s (%d)", o->cont, rc); - MPI_Abort(MPI_COMM_WORLD, -1); - } - } - - if (rank == 0) - INFO(VERBOSE_1, "Disconnecting from DAOS POOL.."); - - rc = daos_pool_disconnect(poh, NULL); - DCHECK(rc, "Failed to disconnect from pool %s", o->pool); - - MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD), "barrier error"); - if (rank == 0) - INFO(VERBOSE_1, "Finalizing DAOS.."); - - rc = daos_fini(); - DCHECK(rc, "Failed to finalize daos"); - - daos_initialized = false; -} - -static void -gen_oid(const char *name, daos_obj_id_t *oid) -{ - oid->lo = d_hash_murmur64(name, strlen(name), IOR_DAOS_MUR_SEED); - oid->hi = 0; - - daos_array_generate_id(oid, objectClass, true, 0); -} - -static aiori_fd_t * -DAOS_Create(char *testFileName, int flags, aiori_mod_opt_t *param) -{ - DAOS_options_t *o = (DAOS_options_t*) param; - daos_obj_id_t oid; - int rc; - - /** Convert file name into object ID */ - gen_oid(testFileName, &oid); - - /** Create the array */ - if (hints->filePerProc || rank == 0) { - rc = daos_array_create(coh, oid, DAOS_TX_NONE, 1, o->chunk_size, - &aoh, NULL); - DCHECK(rc, "Failed to create array object\n"); - } - - /** Distribute the array handle if not FPP */ - if (!hints->filePerProc) - HandleDistribute(&aoh, ARRAY_HANDLE); - - return (aiori_fd_t*)(&aoh); -} - -static int -DAOS_Access(const char *testFileName, int mode, aiori_mod_opt_t * param) -{ - daos_obj_id_t oid; - daos_size_t cell_size, chunk_size; - int rc; - - /** Convert file name into object ID */ - gen_oid(testFileName, &oid); - - rc = daos_array_open(coh, oid, DAOS_TX_NONE, DAOS_OO_RO, - &cell_size, &chunk_size, &aoh, NULL); - if (rc) - return rc; - - if (cell_size != 1) - GERR("Invalid DAOS Array object.\n"); - - rc = daos_array_close(aoh, NULL); - aoh.cookie = 0; - return rc; -} - -static aiori_fd_t * -DAOS_Open(char *testFileName, int flags, aiori_mod_opt_t *param) -{ - daos_obj_id_t oid; - - /** Convert file name into object ID */ - gen_oid(testFileName, &oid); - - /** Open the array */ - if (hints->filePerProc || rank == 0) { - daos_size_t cell_size, chunk_size; - int rc; - - rc = daos_array_open(coh, oid, DAOS_TX_NONE, DAOS_OO_RW, - &cell_size, &chunk_size, &aoh, NULL); - DCHECK(rc, "Failed to create array object\n"); - - if (cell_size != 1) - GERR("Invalid DAOS Array object.\n"); - } - - /** Distribute the array handle if not FPP */ - if (!hints->filePerProc) - HandleDistribute(&aoh, ARRAY_HANDLE); - - return (aiori_fd_t*)(&aoh); -} - -static IOR_offset_t -DAOS_Xfer(int access, aiori_fd_t *file, IOR_size_t *buffer, IOR_offset_t length, - IOR_offset_t off, aiori_mod_opt_t *param) -{ - daos_array_iod_t iod; - daos_range_t rg; - d_sg_list_t sgl; - d_iov_t iov; - int rc; - - /** set array location */ - iod.arr_nr = 1; - rg.rg_len = length; - rg.rg_idx = off; - iod.arr_rgs = &rg; - - /** set memory location */ - sgl.sg_nr = 1; - d_iov_set(&iov, buffer, length); - sgl.sg_iovs = &iov; - - if (access == WRITE) { - rc = daos_array_write(aoh, DAOS_TX_NONE, &iod, &sgl, NULL); - DCHECK(rc, "daos_array_write() failed (%d).", rc); - } else { - rc = daos_array_read(aoh, DAOS_TX_NONE, &iod, &sgl, NULL); - DCHECK(rc, "daos_array_read() failed (%d).", rc); - } - - return length; -} - -static void -DAOS_Close(aiori_fd_t *file, aiori_mod_opt_t *param) -{ - int rc; - - if (!daos_initialized) - 
GERR("DAOS is not initialized!"); - - rc = daos_array_close(aoh, NULL); - DCHECK(rc, "daos_array_close() failed (%d).", rc); - - aoh.cookie = 0; -} - -static void -DAOS_Delete(char *testFileName, aiori_mod_opt_t *param) -{ - daos_obj_id_t oid; - daos_size_t cell_size, chunk_size; - int rc; - - if (!daos_initialized) - GERR("DAOS is not initialized!"); - - /** Convert file name into object ID */ - gen_oid(testFileName, &oid); - - /** open the array to verify it exists */ - rc = daos_array_open(coh, oid, DAOS_TX_NONE, DAOS_OO_RW, - &cell_size, &chunk_size, &aoh, NULL); - DCHECK(rc, "daos_array_open() failed (%d).", rc); - - if (cell_size != 1) - GERR("Invalid DAOS Array object.\n"); - - rc = daos_array_destroy(aoh, DAOS_TX_NONE, NULL); - DCHECK(rc, "daos_array_destroy() failed (%d).", rc); - - rc = daos_array_close(aoh, NULL); - DCHECK(rc, "daos_array_close() failed (%d).", rc); - aoh.cookie = 0; -} - -static char * -DAOS_GetVersion() -{ - static char ver[1024] = {}; - - sprintf(ver, "%s", "DAOS"); - return ver; -} - -static void -DAOS_Fsync(aiori_fd_t *file, aiori_mod_opt_t *param) -{ - return; -} - -static IOR_offset_t -DAOS_GetFileSize(aiori_mod_opt_t *param, MPI_Comm comm, char *testFileName) -{ - daos_obj_id_t oid; - daos_size_t size; - int rc; - - if (!daos_initialized) - GERR("DAOS is not initialized!"); - - /** Convert file name into object ID */ - gen_oid(testFileName, &oid); - - /** open the array to verify it exists */ - if (hints->filePerProc || rank == 0) { - daos_size_t cell_size, chunk_size; - - rc = daos_array_open(coh, oid, DAOS_TX_NONE, DAOS_OO_RO, - &cell_size, &chunk_size, &aoh, NULL); - DCHECK(rc, "daos_array_open() failed (%d).", rc); - - if (cell_size != 1) - GERR("Invalid DAOS Array object.\n"); - - rc = daos_array_get_size(aoh, DAOS_TX_NONE, &size, NULL); - DCHECK(rc, "daos_array_get_size() failed (%d).", rc); - - rc = daos_array_close(aoh, NULL); - DCHECK(rc, "daos_array_close() failed (%d).", rc); - aoh.cookie = 0; - } - - if (!hints->filePerProc) - MPI_Bcast(&size, 1, MPI_LONG, 0, MPI_COMM_WORLD); - - return size; -} diff --git a/src/aiori-DFS.c b/src/aiori-DFS.c index 86f014c..8e6b2a7 100755 --- a/src/aiori-DFS.c +++ b/src/aiori-DFS.c @@ -39,8 +39,8 @@ dfs_t *dfs; static daos_handle_t poh, coh; -static daos_oclass_id_t objectClass = OC_SX; -static daos_oclass_id_t dir_oclass = OC_SX; +static daos_oclass_id_t objectClass; +static daos_oclass_id_t dir_oclass; static struct d_hash_table *dir_hash; static bool dfs_init; @@ -59,7 +59,9 @@ enum handleType { /************************** O P T I O N S *****************************/ typedef struct { char *pool; +#if !defined(DAOS_API_VERSION_MAJOR) || DAOS_API_VERSION_MAJOR < 1 char *svcl; +#endif char *group; char *cont; int chunk_size; @@ -85,7 +87,9 @@ static option_help * DFS_options(aiori_mod_opt_t ** init_backend_options, option_help h [] = { {0, "dfs.pool", "pool uuid", OPTION_OPTIONAL_ARGUMENT, 's', &o->pool}, +#if !defined(DAOS_API_VERSION_MAJOR) || DAOS_API_VERSION_MAJOR < 1 {0, "dfs.svcl", "pool SVCL", OPTION_OPTIONAL_ARGUMENT, 's', &o->svcl}, +#endif {0, "dfs.group", "server group", OPTION_OPTIONAL_ARGUMENT, 's', &o->group}, {0, "dfs.cont", "DFS container uuid", OPTION_OPTIONAL_ARGUMENT, 's', &o->cont}, {0, "dfs.chunk_size", "chunk size", OPTION_OPTIONAL_ARGUMENT, 'd', &o->chunk_size}, @@ -114,7 +118,7 @@ static void DFS_Delete(char *, aiori_mod_opt_t *); static char* DFS_GetVersion(); static void DFS_Fsync(aiori_fd_t *, aiori_mod_opt_t *); static void DFS_Sync(aiori_mod_opt_t *); -static IOR_offset_t 
DFS_GetFileSize(aiori_mod_opt_t *, MPI_Comm, char *); +static IOR_offset_t DFS_GetFileSize(aiori_mod_opt_t *, char *); static int DFS_Statfs (const char *, ior_aiori_statfs_t *, aiori_mod_opt_t *); static int DFS_Stat (const char *, struct stat *, aiori_mod_opt_t *); static int DFS_Mkdir (const char *, mode_t, aiori_mod_opt_t *); @@ -188,9 +192,13 @@ void DFS_init_xfer_options(aiori_xfer_hint_t * params) static int DFS_check_params(aiori_mod_opt_t * options){ DFS_options_t *o = (DFS_options_t *) options; - if (o->pool == NULL || o->svcl == NULL || o->cont == NULL) + if (o->pool == NULL || o->cont == NULL) ERR("Invalid pool or container options\n"); +#if !defined(DAOS_API_VERSION_MAJOR) || DAOS_API_VERSION_MAJOR < 1 + if (o->svcl == NULL) + ERR("Invalid SVCL\n"); +#endif return 0; } @@ -247,8 +255,7 @@ HandleDistribute(enum handleType type) DCHECK(rc, "Failed to get global handle size"); } - MPI_CHECK(MPI_Bcast(&global.iov_buf_len, 1, MPI_UINT64_T, 0, - MPI_COMM_WORLD), + MPI_CHECK(MPI_Bcast(&global.iov_buf_len, 1, MPI_UINT64_T, 0, testComm), "Failed to bcast global handle buffer size"); global.iov_len = global.iov_buf_len; @@ -266,8 +273,7 @@ HandleDistribute(enum handleType type) DCHECK(rc, "Failed to create global handle"); } - MPI_CHECK(MPI_Bcast(global.iov_buf, global.iov_buf_len, MPI_BYTE, 0, - MPI_COMM_WORLD), + MPI_CHECK(MPI_Bcast(global.iov_buf, global.iov_buf_len, MPI_BYTE, 0, testComm), "Failed to bcast global pool handle"); if (rank != 0) { @@ -374,6 +380,45 @@ out: return rc; } +static void +share_file_handle(dfs_obj_t **file, MPI_Comm comm) +{ + d_iov_t global; + int rc; + + global.iov_buf = NULL; + global.iov_buf_len = 0; + global.iov_len = 0; + + if (rank == 0) { + rc = dfs_obj_local2global(dfs, *file, &global); + DCHECK(rc, "Failed to get global handle size"); + } + + MPI_CHECK(MPI_Bcast(&global.iov_buf_len, 1, MPI_UINT64_T, 0, testComm), + "Failed to bcast global handle buffer size"); + + global.iov_len = global.iov_buf_len; + global.iov_buf = malloc(global.iov_buf_len); + if (global.iov_buf == NULL) + ERR("Failed to allocate global handle buffer"); + + if (rank == 0) { + rc = dfs_obj_local2global(dfs, *file, &global); + DCHECK(rc, "Failed to create global handle"); + } + + MPI_CHECK(MPI_Bcast(global.iov_buf, global.iov_buf_len, MPI_BYTE, 0, testComm), + "Failed to bcast global pool handle"); + + if (rank != 0) { + rc = dfs_obj_global2local(dfs, 0, global, file); + DCHECK(rc, "Failed to get local handle"); + } + + free(global.iov_buf); +} + static dfs_obj_t * lookup_insert_dir(const char *name, mode_t *mode) { @@ -418,9 +463,14 @@ DFS_Init(aiori_mod_opt_t * options) return; /** shouldn't be fatal since it can be called with POSIX backend selection */ - if (o->pool == NULL || o->svcl == NULL || o->cont == NULL) + if (o->pool == NULL || o->cont == NULL) return; +#if !defined(DAOS_API_VERSION_MAJOR) || DAOS_API_VERSION_MAJOR < 1 + if (o->svcl == NULL) + return; +#endif + rc = daos_init(); DCHECK(rc, "Failed to initialize daos"); @@ -441,7 +491,6 @@ DFS_Init(aiori_mod_opt_t * options) if (rank == 0) { uuid_t pool_uuid, co_uuid; - d_rank_list_t *svcl = NULL; daos_pool_info_t pool_info; daos_cont_info_t co_info; @@ -451,17 +500,25 @@ DFS_Init(aiori_mod_opt_t * options) rc = uuid_parse(o->cont, co_uuid); DCHECK(rc, "Failed to parse 'Cont uuid': %s", o->cont); + INFO(VERBOSE_1, "Pool uuid = %s", o->pool); + INFO(VERBOSE_1, "DFS Container namespace uuid = %s", o->cont); + +#if !defined(DAOS_API_VERSION_MAJOR) || DAOS_API_VERSION_MAJOR < 1 + d_rank_list_t *svcl = NULL; + svcl = 
daos_rank_list_parse(o->svcl, ":"); if (svcl == NULL) ERR("Failed to allocate svcl"); - - INFO(VERBOSE_1, "Pool uuid = %s, SVCL = %s\n", o->pool, o->svcl); - INFO(VERBOSE_1, "DFS Container namespace uuid = %s\n", o->cont); + INFO(VERBOSE_1, "Pool svcl = %s", o->svcl); /** Connect to DAOS pool */ rc = daos_pool_connect(pool_uuid, o->group, svcl, DAOS_PC_RW, &poh, &pool_info, NULL); d_rank_list_free(svcl); +#else + rc = daos_pool_connect(pool_uuid, o->group, DAOS_PC_RW, + &poh, &pool_info, NULL); +#endif DCHECK(rc, "Failed to connect to pool"); rc = daos_cont_open(poh, co_uuid, DAOS_COO_RW, &coh, &co_info, @@ -498,23 +555,23 @@ DFS_Finalize(aiori_mod_opt_t *options) DFS_options_t *o = (DFS_options_t *)options; int rc; - MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(testComm); d_hash_table_destroy(dir_hash, true /* force */); rc = dfs_umount(dfs); DCHECK(rc, "Failed to umount DFS namespace"); - MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(testComm); rc = daos_cont_close(coh, NULL); DCHECK(rc, "Failed to close container %s (%d)", o->cont, rc); - MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(testComm); if (o->destroy) { if (rank == 0) { uuid_t uuid; double t1, t2; - INFO(VERBOSE_1, "Destorying DFS Container: %s\n", o->cont); + INFO(VERBOSE_1, "Destroying DFS Container: %s\n", o->cont); uuid_parse(o->cont, uuid); t1 = MPI_Wtime(); rc = daos_cont_destroy(poh, uuid, 1, NULL); @@ -523,7 +580,7 @@ DFS_Finalize(aiori_mod_opt_t *options) INFO(VERBOSE_1, "Container Destroy time = %f secs", t2-t1); } - MPI_Bcast(&rc, 1, MPI_INT, 0, MPI_COMM_WORLD); + MPI_Bcast(&rc, 1, MPI_INT, 0, testComm); if (rc) { if (rank == 0) DCHECK(rc, "Failed to destroy container %s (%d)", o->cont, rc); @@ -537,7 +594,7 @@ DFS_Finalize(aiori_mod_opt_t *options) rc = daos_pool_disconnect(poh, NULL); DCHECK(rc, "Failed to disconnect from pool"); - MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD), "barrier error"); + MPI_CHECK(MPI_Barrier(testComm), "barrier error"); if (rank == 0) INFO(VERBOSE_1, "Finalizing DAOS..\n"); @@ -547,21 +604,23 @@ DFS_Finalize(aiori_mod_opt_t *options) /** reset tunables */ o->pool = NULL; +#if !defined(DAOS_API_VERSION_MAJOR) || DAOS_API_VERSION_MAJOR < 1 o->svcl = NULL; - o->group = NULL; +#endif + o->group = NULL; o->cont = NULL; o->chunk_size = 1048576; o->oclass = NULL; o->dir_oclass = NULL; o->prefix = NULL; o->destroy = 0; - objectClass = OC_SX; - dir_oclass = OC_SX; + objectClass = 0; + dir_oclass = 0; dfs_init = false; } /* - * Creat and open a file through the DFS interface. + * Create and open a file through the DFS interface. 
*/ static aiori_fd_t * DFS_Create(char *testFileName, int flags, aiori_mod_opt_t *param) @@ -578,26 +637,21 @@ DFS_Create(char *testFileName, int flags, aiori_mod_opt_t *param) assert(dir_name); assert(name); - parent = lookup_insert_dir(dir_name, NULL); - if (parent == NULL) - GERR("Failed to lookup parent dir"); - mode = S_IFREG | mode; if (hints->filePerProc || rank == 0) { fd_oflag |= O_CREAT | O_RDWR | O_EXCL; + parent = lookup_insert_dir(dir_name, NULL); + if (parent == NULL) + GERR("Failed to lookup parent dir"); + rc = dfs_open(dfs, parent, name, mode, fd_oflag, objectClass, o->chunk_size, NULL, &obj); DCHECK(rc, "dfs_open() of %s Failed", name); } + if (!hints->filePerProc) { - MPI_Barrier(MPI_COMM_WORLD); - if (rank != 0) { - fd_oflag |= O_RDWR; - rc = dfs_open(dfs, parent, name, mode, fd_oflag, - objectClass, o->chunk_size, NULL, &obj); - DCHECK(rc, "dfs_open() of %s Failed", name); - } + share_file_handle(&obj, testComm); } if (name) @@ -629,13 +683,19 @@ DFS_Open(char *testFileName, int flags, aiori_mod_opt_t *param) assert(dir_name); assert(name); - parent = lookup_insert_dir(dir_name, NULL); - if (parent == NULL) - GERR("Failed to lookup parent dir"); + if (hints->filePerProc || rank == 0) { + parent = lookup_insert_dir(dir_name, NULL); + if (parent == NULL) + GERR("Failed to lookup parent dir"); - rc = dfs_open(dfs, parent, name, mode, fd_oflag, objectClass, - o->chunk_size, NULL, &obj); - DCHECK(rc, "dfs_open() of %s Failed", name); + rc = dfs_open(dfs, parent, name, mode, fd_oflag, objectClass, + o->chunk_size, NULL, &obj); + DCHECK(rc, "dfs_open() of %s Failed", name); + } + + if (!hints->filePerProc) { + share_file_handle(&obj, testComm); + } if (name) free(name); @@ -675,14 +735,14 @@ DFS_Xfer(int access, aiori_fd_t *file, IOR_size_t *buffer, IOR_offset_t length, if (access == WRITE) { rc = dfs_write(dfs, obj, &sgl, off, NULL); if (rc) { - fprintf(stderr, "dfs_write() failed (%d)", rc); + fprintf(stderr, "dfs_write() failed (%d)\n", rc); return -1; } ret = remaining; } else { rc = dfs_read(dfs, obj, &sgl, off, &ret, NULL); if (rc || ret == 0) - fprintf(stderr, "dfs_read() failed(%d)", rc); + fprintf(stderr, "dfs_read() failed(%d)\n", rc); } if (ret < remaining) { @@ -774,43 +834,36 @@ static char* DFS_GetVersion() * Use DFS stat() to return aggregate file size. 
*/ static IOR_offset_t -DFS_GetFileSize(aiori_mod_opt_t * test, MPI_Comm comm, char *testFileName) +DFS_GetFileSize(aiori_mod_opt_t * test, char *testFileName) { dfs_obj_t *obj; - daos_size_t fsize, tmpMin, tmpMax, tmpSum; + MPI_Comm comm; + daos_size_t fsize; int rc; - rc = dfs_lookup(dfs, testFileName, O_RDONLY, &obj, NULL, NULL); - if (rc) { - fprintf(stderr, "dfs_lookup() of %s Failed (%d)", testFileName, rc); - return -1; + if (hints->filePerProc == TRUE) { + comm = MPI_COMM_SELF; + } else { + comm = testComm; } - rc = dfs_get_size(dfs, obj, &fsize); - if (rc) - return -1; - - dfs_release(obj); - - if (hints->filePerProc == TRUE) { - MPI_CHECK(MPI_Allreduce(&fsize, &tmpSum, 1, - MPI_LONG_LONG_INT, MPI_SUM, comm), - "cannot total data moved"); - fsize = tmpSum; - } else { - MPI_CHECK(MPI_Allreduce(&fsize, &tmpMin, 1, - MPI_LONG_LONG_INT, MPI_MIN, comm), - "cannot total data moved"); - MPI_CHECK(MPI_Allreduce(&fsize, &tmpMax, 1, - MPI_LONG_LONG_INT, MPI_MAX, comm), - "cannot total data moved"); - if (tmpMin != tmpMax) { - if (rank == 0) { - WARN("inconsistent file size by different tasks"); - } - /* incorrect, but now consistent across tasks */ - fsize = tmpMin; + if (hints->filePerProc || rank == 0) { + rc = dfs_lookup(dfs, testFileName, O_RDONLY, &obj, NULL, NULL); + if (rc) { + fprintf(stderr, "dfs_lookup() of %s Failed (%d)", testFileName, rc); + return -1; } + + rc = dfs_get_size(dfs, obj, &fsize); + dfs_release(obj); + if (rc) + return -1; + } + + if (!hints->filePerProc) { + rc = MPI_Bcast(&fsize, 1, MPI_UINT64_T, 0, comm); + if (rc) + return rc; } return (fsize); @@ -914,7 +967,6 @@ DFS_Stat(const char *path, struct stat *buf, aiori_mod_opt_t * param) GERR("Failed to lookup parent dir"); rc = dfs_stat(dfs, parent, name, buf); - DCHECK(rc, "dfs_stat() of Failed (%d)", rc); if (name) free(name); diff --git a/src/aiori-DUMMY.c b/src/aiori-DUMMY.c index 034fc98..4769de0 100755 --- a/src/aiori-DUMMY.c +++ b/src/aiori-DUMMY.c @@ -108,7 +108,7 @@ static char * DUMMY_getVersion() return "0.5"; } -static IOR_offset_t DUMMY_GetFileSize(aiori_mod_opt_t * options, MPI_Comm testComm, char *testFileName) +static IOR_offset_t DUMMY_GetFileSize(aiori_mod_opt_t * options, char *testFileName) { if(verbose > 4){ fprintf(out_logfile, "DUMMY getFileSize: %s\n", testFileName); @@ -156,6 +156,11 @@ static int DUMMY_stat (const char *path, struct stat *buf, aiori_mod_opt_t * opt return 0; } +static int DUMMY_rename (const char *path, const char *path2, aiori_mod_opt_t * options){ + return 0; +} + + static int DUMMY_check_params(aiori_mod_opt_t * options){ return 0; } @@ -188,6 +193,7 @@ ior_aiori_t dummy_aiori = { .statfs = DUMMY_statfs, .mkdir = DUMMY_mkdir, .rmdir = DUMMY_rmdir, + .rename = DUMMY_rename, .access = DUMMY_access, .stat = DUMMY_stat, .initialize = DUMMY_init, diff --git a/src/aiori-Gfarm.c b/src/aiori-Gfarm.c index a7af0ea..e94022f 100644 --- a/src/aiori-Gfarm.c +++ b/src/aiori-Gfarm.c @@ -14,6 +14,14 @@ struct gfarm_file { GFS_File gf; }; +static aiori_xfer_hint_t *hints = NULL; + +void +Gfarm_xfer_hints(aiori_xfer_hint_t *params) +{ + hints = params; +} + void Gfarm_initialize() { @@ -26,14 +34,14 @@ Gfarm_finalize() gfarm_terminate(); } -void * -Gfarm_create(char *fn, IOR_param_t *param) +aiori_fd_t * +Gfarm_create(char *fn, int flag, aiori_mod_opt_t *param) { GFS_File gf; struct gfarm_file *fp; gfarm_error_t e; - if (param->dryRun) + if (hints->dryRun) return (NULL); e = gfs_pio_create(fn, GFARM_FILE_RDWR, 0664, &gf); @@ -43,17 +51,17 @@ Gfarm_create(char *fn, IOR_param_t *param) if 
(fp == NULL) ERR("no memory"); fp->gf = gf; - return (fp); + return ((aiori_fd_t *)fp); } -void * -Gfarm_open(char *fn, IOR_param_t *param) +aiori_fd_t * +Gfarm_open(char *fn, int flag, aiori_mod_opt_t *param) { GFS_File gf; struct gfarm_file *fp; gfarm_error_t e; - if (param->dryRun) + if (hints->dryRun) return (NULL); e = gfs_pio_open(fn, GFARM_FILE_RDWR, &gf); @@ -63,14 +71,14 @@ Gfarm_open(char *fn, IOR_param_t *param) if (fp == NULL) ERR("no memory"); fp->gf = gf; - return (fp); + return ((aiori_fd_t *)fp); } IOR_offset_t -Gfarm_xfer(int access, void *fd, IOR_size_t *buffer, IOR_offset_t len, - IOR_param_t *param) +Gfarm_xfer(int access, aiori_fd_t *fd, IOR_size_t *buffer, + IOR_offset_t len, IOR_offset_t offset, aiori_mod_opt_t *param) { - struct gfarm_file *fp = fd; + struct gfarm_file *fp = (struct gfarm_file *)fd; IOR_offset_t rem = len; gfarm_off_t off; gfarm_error_t e; @@ -78,7 +86,7 @@ Gfarm_xfer(int access, void *fd, IOR_size_t *buffer, IOR_offset_t len, int sz, n; char *buf = (char *)buffer; - if (param->dryRun) + if (hints->dryRun) return (len); if (len > MAX_SZ) @@ -86,7 +94,7 @@ Gfarm_xfer(int access, void *fd, IOR_size_t *buffer, IOR_offset_t len, else sz = len; - e = gfs_pio_seek(fp->gf, param->offset, GFARM_SEEK_SET, &off); + e = gfs_pio_seek(fp->gf, offset, GFARM_SEEK_SET, &off); if (e != GFARM_ERR_NO_ERROR) ERR("gfs_pio_seek failed"); while (rem > 0) { @@ -105,11 +113,11 @@ Gfarm_xfer(int access, void *fd, IOR_size_t *buffer, IOR_offset_t len, } void -Gfarm_close(void *fd, IOR_param_t *param) +Gfarm_close(aiori_fd_t *fd, aiori_mod_opt_t *param) { - struct gfarm_file *fp = fd; + struct gfarm_file *fp = (struct gfarm_file *)fd; - if (param->dryRun) + if (hints->dryRun) return; if (gfs_pio_close(fp->gf) != GFARM_ERR_NO_ERROR) @@ -118,11 +126,11 @@ Gfarm_close(void *fd, IOR_param_t *param) } void -Gfarm_delete(char *fn, IOR_param_t *param) +Gfarm_delete(char *fn, aiori_mod_opt_t *param) { gfarm_error_t e; - if (param->dryRun) + if (hints->dryRun) return; e = gfs_unlink(fn); @@ -137,11 +145,11 @@ Gfarm_version() } void -Gfarm_fsync(void *fd, IOR_param_t *param) +Gfarm_fsync(aiori_fd_t *fd, aiori_mod_opt_t *param) { - struct gfarm_file *fp = fd; + struct gfarm_file *fp = (struct gfarm_file *)fd; - if (param->dryRun) + if (hints->dryRun) return; if (gfs_pio_sync(fp->gf) != GFARM_ERR_NO_ERROR) @@ -149,12 +157,12 @@ Gfarm_fsync(void *fd, IOR_param_t *param) } IOR_offset_t -Gfarm_get_file_size(IOR_param_t *param, MPI_Comm comm, char *fn) +Gfarm_get_file_size(aiori_mod_opt_t *param, char *fn) { struct gfs_stat st; IOR_offset_t size, sum, min, max; - if (param->dryRun) + if (hints->dryRun) return (0); if (gfs_stat(fn, &st) != GFARM_ERR_NO_ERROR) @@ -162,34 +170,17 @@ Gfarm_get_file_size(IOR_param_t *param, MPI_Comm comm, char *fn) size = st.st_size; gfs_stat_free(&st); - if (param->filePerProc == TRUE) { - MPI_CHECK(MPI_Allreduce(&size, &sum, 1, MPI_LONG_LONG_INT, - MPI_SUM, comm), "cannot total data moved"); - size = sum; - } else { - MPI_CHECK(MPI_Allreduce(&size, &min, 1, MPI_LONG_LONG_INT, - MPI_MIN, comm), "cannot total data moved"); - MPI_CHECK(MPI_Allreduce(&size, &max, 1, MPI_LONG_LONG_INT, - MPI_MAX, comm), "cannot total data moved"); - if (min != max) { - if (rank == 0) - WARN("inconsistent file size by different " - "tasks"); - /* incorrect, but now consistent across tasks */ - size = min; - } - } return (size); } int -Gfarm_statfs(const char *fn, ior_aiori_statfs_t *st, IOR_param_t *param) +Gfarm_statfs(const char *fn, ior_aiori_statfs_t *st, aiori_mod_opt_t *param) { 
gfarm_off_t used, avail, files; gfarm_error_t e; int bsize = 4096; - if (param->dryRun) + if (hints->dryRun) return (0); e = gfs_statfs_by_path(fn, &used, &avail, &files); @@ -206,11 +197,11 @@ Gfarm_statfs(const char *fn, ior_aiori_statfs_t *st, IOR_param_t *param) } int -Gfarm_mkdir(const char *fn, mode_t mode, IOR_param_t *param) +Gfarm_mkdir(const char *fn, mode_t mode, aiori_mod_opt_t *param) { gfarm_error_t e; - if (param->dryRun) + if (hints->dryRun) return (0); e = gfs_mkdir(fn, mode); @@ -221,11 +212,11 @@ Gfarm_mkdir(const char *fn, mode_t mode, IOR_param_t *param) } int -Gfarm_rmdir(const char *fn, IOR_param_t *param) +Gfarm_rmdir(const char *fn, aiori_mod_opt_t *param) { gfarm_error_t e; - if (param->dryRun) + if (hints->dryRun) return (0); e = gfs_rmdir(fn); @@ -236,12 +227,12 @@ Gfarm_rmdir(const char *fn, IOR_param_t *param) } int -Gfarm_access(const char *fn, int mode, IOR_param_t *param) +Gfarm_access(const char *fn, int mode, aiori_mod_opt_t *param) { struct gfs_stat st; gfarm_error_t e; - if (param->dryRun) + if (hints->dryRun) return (0); e = gfs_stat(fn, &st); @@ -259,12 +250,12 @@ Gfarm_access(const char *fn, int mode, IOR_param_t *param) #define STAT_BLKSIZ 512 /* for st_blocks */ int -Gfarm_stat(const char *fn, struct stat *buf, IOR_param_t *param) +Gfarm_stat(const char *fn, struct stat *buf, aiori_mod_opt_t *param) { struct gfs_stat st; gfarm_error_t e; - if (param->dryRun) + if (hints->dryRun) return (0); e = gfs_stat(fn, &st); @@ -293,11 +284,22 @@ Gfarm_stat(const char *fn, struct stat *buf, IOR_param_t *param) return (0); } +void +Gfarm_sync(aiori_mod_opt_t *param) +{ + if (hints->dryRun) + return; + + /* no cache in libgfarm */ + return; +} + ior_aiori_t gfarm_aiori = { .name = "Gfarm", .name_legacy = NULL, .create = Gfarm_create, .open = Gfarm_open, + .xfer_hints = Gfarm_xfer_hints, .xfer = Gfarm_xfer, .close = Gfarm_close, .delete = Gfarm_delete, @@ -312,5 +314,6 @@ ior_aiori_t gfarm_aiori = { .initialize = Gfarm_initialize, .finalize = Gfarm_finalize, .get_options = NULL, + .sync = Gfarm_sync, .enable_mdtest = true, }; diff --git a/src/aiori-HDF5.c b/src/aiori-HDF5.c index 1e7f2bf..bd4f29f 100755 --- a/src/aiori-HDF5.c +++ b/src/aiori-HDF5.c @@ -91,7 +91,7 @@ static void HDF5_Close(aiori_fd_t *, aiori_mod_opt_t *); static void HDF5_Delete(char *, aiori_mod_opt_t *); static char* HDF5_GetVersion(); static void HDF5_Fsync(aiori_fd_t *, aiori_mod_opt_t *); -static IOR_offset_t HDF5_GetFileSize(aiori_mod_opt_t *, MPI_Comm, char *); +static IOR_offset_t HDF5_GetFileSize(aiori_mod_opt_t *, char *); static int HDF5_Access(const char *, int, aiori_mod_opt_t *); static void HDF5_init_xfer_options(aiori_xfer_hint_t * params); static int HDF5_check_params(aiori_mod_opt_t * options); @@ -171,6 +171,8 @@ static aiori_xfer_hint_t * hints = NULL; static void HDF5_init_xfer_options(aiori_xfer_hint_t * params){ hints = params; + /** HDF5 utilizes the MPIIO backend too, so init hints there */ + MPIIO_xfer_hints(params); } static int HDF5_check_params(aiori_mod_opt_t * options){ @@ -660,11 +662,11 @@ static void SetupDataSet(void *fd, int flags, aiori_mod_opt_t * param) * Use MPIIO call to get file size. 
*/ static IOR_offset_t -HDF5_GetFileSize(aiori_mod_opt_t * test, MPI_Comm testComm, char *testFileName) +HDF5_GetFileSize(aiori_mod_opt_t * test, char *testFileName) { if(hints->dryRun) return 0; - return(MPIIO_GetFileSize(test, testComm, testFileName)); + return(MPIIO_GetFileSize(test, testFileName)); } /* diff --git a/src/aiori-HDFS.c b/src/aiori-HDFS.c index 2d4dcb1..8c528ab 100755 --- a/src/aiori-HDFS.c +++ b/src/aiori-HDFS.c @@ -77,14 +77,13 @@ #include #include /* -#ifdef HAVE_LUSTRE_LUSTRE_USER_H +#ifdef HAVE_LUSTRE_USER #include #endif */ - #include "ior.h" #include "aiori.h" -#include "iordef.h" +#include "utilities.h" #ifndef open64 /* necessary for TRU64 -- */ # define open64 open /* unlikely, but may pose */ @@ -101,15 +100,23 @@ #include "hdfs.h" /**************************** P R O T O T Y P E S *****************************/ -static void *HDFS_Create(char *, IOR_param_t *); -static void *HDFS_Open(char *, IOR_param_t *); -static IOR_offset_t HDFS_Xfer(int, void *, IOR_size_t *, - IOR_offset_t, IOR_param_t *); -static void HDFS_Close(void *, IOR_param_t *); -static void HDFS_Delete(char *, IOR_param_t *); -static void HDFS_SetVersion(IOR_param_t *); -static void HDFS_Fsync(void *, IOR_param_t *); -static IOR_offset_t HDFS_GetFileSize(IOR_param_t *, MPI_Comm, char *); +static aiori_fd_t *HDFS_Create(char *testFileName, int flags, aiori_mod_opt_t * param); +static aiori_fd_t *HDFS_Open(char *testFileName, int flags, aiori_mod_opt_t * param); +static IOR_offset_t HDFS_Xfer(int access, aiori_fd_t *file, IOR_size_t * buffer, + IOR_offset_t length, IOR_offset_t offset, aiori_mod_opt_t * param); +static void HDFS_Close(aiori_fd_t *, aiori_mod_opt_t *); +static void HDFS_Delete(char *testFileName, aiori_mod_opt_t * param); +static void HDFS_Fsync(aiori_fd_t *, aiori_mod_opt_t *); +static IOR_offset_t HDFS_GetFileSize(aiori_mod_opt_t *,char *); +static void hdfs_xfer_hints(aiori_xfer_hint_t * params); +static option_help * HDFS_options(aiori_mod_opt_t ** init_backend_options, aiori_mod_opt_t * init_values); +static int HDFS_mkdir (const char *path, mode_t mode, aiori_mod_opt_t * options); +static int HDFS_rmdir (const char *path, aiori_mod_opt_t * options); +static int HDFS_access (const char *path, int mode, aiori_mod_opt_t * options); +static int HDFS_stat (const char *path, struct stat *buf, aiori_mod_opt_t * options); +static int HDFS_statfs (const char * path, ior_aiori_statfs_t * stat, aiori_mod_opt_t * options); + +static aiori_xfer_hint_t * hints = NULL; /************************** D E C L A R A T I O N S ***************************/ @@ -121,13 +128,120 @@ ior_aiori_t hdfs_aiori = { .xfer = HDFS_Xfer, .close = HDFS_Close, .delete = HDFS_Delete, - .set_version = HDFS_SetVersion, + .get_options = HDFS_options, + .get_version = aiori_get_version, + .xfer_hints = hdfs_xfer_hints, .fsync = HDFS_Fsync, .get_file_size = HDFS_GetFileSize, + .statfs = HDFS_statfs, + .mkdir = HDFS_mkdir, + .rmdir = HDFS_rmdir, + .access = HDFS_access, + .stat = HDFS_stat, + .enable_mdtest = true }; /***************************** F U N C T I O N S ******************************/ +void hdfs_xfer_hints(aiori_xfer_hint_t * params){ + hints = params; +} + +/************************** O P T I O N S *****************************/ +typedef struct { + char * user; + char * name_node; + int replicas; /* n block replicas. (0 gets default) */ + int direct_io; + IOR_offset_t block_size; /* internal blk-size. 
(0 gets default) */ + // runtime options + hdfsFS fs; /* file-system handle */ + tPort name_node_port; /* (uint16_t) */ +} hdfs_options_t; + +static void hdfs_connect( hdfs_options_t* o ); + +option_help * HDFS_options(aiori_mod_opt_t ** init_backend_options, aiori_mod_opt_t * init_values){ + hdfs_options_t * o = malloc(sizeof(hdfs_options_t)); + + if (init_values != NULL){ + memcpy(o, init_values, sizeof(hdfs_options_t)); + }else{ + memset(o, 0, sizeof(hdfs_options_t)); + char *hdfs_user; + hdfs_user = getenv("USER"); + if (!hdfs_user){ + hdfs_user = ""; + } + o->user = strdup(hdfs_user); + o->name_node = "default"; + } + + *init_backend_options = (aiori_mod_opt_t*) o; + + option_help h [] = { + {0, "hdfs.odirect", "Direct I/O Mode", OPTION_FLAG, 'd', & o->direct_io}, + {0, "hdfs.user", "Username", OPTION_OPTIONAL_ARGUMENT, 's', & o->user}, + {0, "hdfs.name_node", "Namenode", OPTION_OPTIONAL_ARGUMENT, 's', & o->name_node}, + {0, "hdfs.replicas", "Number of replicas", OPTION_OPTIONAL_ARGUMENT, 'd', & o->replicas}, + {0, "hdfs.block_size", "Blocksize", OPTION_OPTIONAL_ARGUMENT, 'l', & o->block_size}, + LAST_OPTION + }; + option_help * help = malloc(sizeof(h)); + memcpy(help, h, sizeof(h)); + return help; +} + + +int HDFS_mkdir (const char *path, mode_t mode, aiori_mod_opt_t * options){ + hdfs_options_t * o = (hdfs_options_t*) options; + hdfs_connect(o); + return hdfsCreateDirectory(o->fs, path); +} + +int HDFS_rmdir (const char *path, aiori_mod_opt_t * options){ + hdfs_options_t * o = (hdfs_options_t*) options; + hdfs_connect(o); + return hdfsDelete(o->fs, path, 1); +} + +int HDFS_access (const char *path, int mode, aiori_mod_opt_t * options){ + hdfs_options_t * o = (hdfs_options_t*) options; + hdfs_connect(o); + return hdfsExists(o->fs, path); +} + +int HDFS_stat (const char *path, struct stat *buf, aiori_mod_opt_t * options){ + hdfsFileInfo * stat; + hdfs_options_t * o = (hdfs_options_t*) options; + hdfs_connect(o); + stat = hdfsGetPathInfo(o->fs, path); + if(stat == NULL){ + return 1; + } + memset(buf, 0, sizeof(struct stat)); + buf->st_atime = stat->mLastAccess; + buf->st_size = stat->mSize; + buf->st_mtime = stat->mLastMod; + buf->st_mode = stat->mPermissions; + + hdfsFreeFileInfo(stat, 1); + return 0; +} + +int HDFS_statfs (const char * path, ior_aiori_statfs_t * stat, aiori_mod_opt_t * options){ + hdfs_options_t * o = (hdfs_options_t*) options; + hdfs_connect(o); + + stat->f_bsize = hdfsGetDefaultBlockSize(o->fs); + stat->f_blocks = hdfsGetCapacity(o->fs) / hdfsGetDefaultBlockSize(o->fs); + stat->f_bfree = stat->f_blocks - hdfsGetUsed(o->fs) / hdfsGetDefaultBlockSize(o->fs); + stat->f_bavail = 1; + stat->f_files = 1; + stat->f_ffree = 1; + return 0; +} + /* This is identical to the one in aiori-POSIX.c Doesn't seem like * it would be appropriate in utilities.c. */ @@ -159,16 +273,16 @@ void hdfs_set_o_direct_flag(int *fd) * NOTE: It's okay to call this thing whenever you need to be sure the HDFS * filesystem is connected. 
*/ -static void hdfs_connect( IOR_param_t* param ) { - if (param->verbose >= VERBOSE_4) { +void hdfs_connect( hdfs_options_t* o ) { + if (verbose >= VERBOSE_4) { printf("-> hdfs_connect [nn:\"%s\", port:%d, user:%s]\n", - param->hdfs_name_node, - param->hdfs_name_node_port, - param->hdfs_user ); + o->name_node, + o->name_node_port, + o->user ); } - if ( param->hdfs_fs ) { - if (param->verbose >= VERBOSE_4) { + if ( o->fs ) { + if (verbose >= VERBOSE_4) { printf("<- hdfs_connect [nothing to do]\n"); /* DEBUGGING */ } return; @@ -176,34 +290,35 @@ static void hdfs_connect( IOR_param_t* param ) { /* initialize a builder, holding parameters for hdfsBuilderConnect() */ struct hdfsBuilder* builder = hdfsNewBuilder(); - if ( ! builder ) - ERR_SIMPLE("couldn't create an hdfsBuilder"); + if ( ! builder ){ + ERR("couldn't create an hdfsBuilder"); + } hdfsBuilderSetForceNewInstance ( builder ); /* don't use cached instance */ - hdfsBuilderSetNameNode ( builder, param->hdfs_name_node ); - hdfsBuilderSetNameNodePort( builder, param->hdfs_name_node_port ); - hdfsBuilderSetUserName ( builder, param->hdfs_user ); + hdfsBuilderSetNameNode ( builder, o->name_node ); + hdfsBuilderSetNameNodePort( builder, o->name_node_port ); + hdfsBuilderSetUserName ( builder, o->user ); /* NOTE: hdfsBuilderConnect() frees the builder */ - param->hdfs_fs = hdfsBuilderConnect( builder ); - if ( ! param->hdfs_fs ) - ERR_SIMPLE("hdsfsBuilderConnect failed"); + o->fs = hdfsBuilderConnect( builder ); + if ( ! o->fs ) + ERR("hdfsBuilderConnect failed"); - if (param->verbose >= VERBOSE_4) { + if (verbose >= VERBOSE_4) { printf("<- hdfs_connect [success]\n"); } } -static void hdfs_disconnect( IOR_param_t* param ) { - if (param->verbose >= VERBOSE_4) { +static void hdfs_disconnect( hdfs_options_t* o ) { + if (verbose >= VERBOSE_4) { printf("-> hdfs_disconnect\n"); } - if ( param->hdfs_fs ) { - hdfsDisconnect( param->hdfs_fs ); - param->hdfs_fs = NULL; + if ( o->fs ) { + hdfsDisconnect( o->fs ); + o->fs = NULL; } - if (param->verbose >= VERBOSE_4) { + if (verbose >= VERBOSE_4) { printf("<- hdfs_disconnect\n"); } } @@ -214,16 +329,17 @@ static void hdfs_disconnect( IOR_param_t* param ) { * Return an hdfsFile. */ -static void *HDFS_Create_Or_Open( char *testFileName, IOR_param_t *param, unsigned char createFile ) { - if (param->verbose >= VERBOSE_4) { +static void *HDFS_Create_Or_Open( char *testFileName, int flags, aiori_mod_opt_t *param, unsigned char createFile ) { + if (verbose >= VERBOSE_4) { printf("-> HDFS_Create_Or_Open\n"); } + hdfs_options_t * o = (hdfs_options_t*) param; hdfsFile hdfs_file = NULL; int fd_oflags = 0, hdfs_return; /* initialize file-system handle, if needed */ - hdfs_connect( param ); + hdfs_connect( o ); /* * Check for unsupported flags. @@ -234,15 +350,15 @@ static void *HDFS_Create_Or_Open( char *testFileName, IOR_param_t *param, unsign * The other two, we just note that they are not supported and don't do them. 
*/ - if ( param->openFlags & IOR_RDWR ) { + if ( flags & IOR_RDWR ) { ERR( "Opening or creating a file in RDWR is not implemented in HDFS" ); } - if ( param->openFlags & IOR_EXCL ) { + if ( flags & IOR_EXCL ) { fprintf( stdout, "Opening or creating a file in Exclusive mode is not implemented in HDFS\n" ); } - if ( param->openFlags & IOR_APPEND ) { + if ( flags & IOR_APPEND ) { fprintf( stdout, "Opening or creating a file for appending is not implemented in HDFS\n" ); } @@ -254,8 +370,8 @@ static void *HDFS_Create_Or_Open( char *testFileName, IOR_param_t *param, unsign fd_oflags = O_CREAT; } - if ( param->openFlags & IOR_WRONLY ) { - if ( !param->filePerProc ) { + if ( flags & IOR_WRONLY ) { + if ( ! hints->filePerProc ) { // in N-1 mode, only rank 0 truncates the file if ( rank != 0 ) { @@ -279,7 +395,7 @@ static void *HDFS_Create_Or_Open( char *testFileName, IOR_param_t *param, unsign * Now see if O_DIRECT is needed. */ - if ( param->useO_DIRECT == TRUE ) { + if ( o->direct_io == TRUE ) { hdfs_set_o_direct_flag( &fd_oflags ); } @@ -290,10 +406,7 @@ static void *HDFS_Create_Or_Open( char *testFileName, IOR_param_t *param, unsign * truncate each other's writes */ - if (( param->openFlags & IOR_WRONLY ) && - ( !param->filePerProc ) && - ( rank != 0 )) { - + if (( flags & IOR_WRONLY ) && ( ! hints->filePerProc ) && ( rank != 0 )) { MPI_CHECK(MPI_Barrier(testComm), "barrier error"); } @@ -301,21 +414,16 @@ static void *HDFS_Create_Or_Open( char *testFileName, IOR_param_t *param, unsign * Now rank zero can open and truncate, if necessary. */ - if (param->verbose >= VERBOSE_4) { - printf("\thdfsOpenFile(0x%llx, %s, 0%o, %d, %d, %d)\n", - param->hdfs_fs, + if (verbose >= VERBOSE_4) { + printf("\thdfsOpenFile(%p, %s, 0%o, %lld, %d, %lld)\n", + o->fs, testFileName, fd_oflags, /* shown in octal to compare w/ */ - param->transferSize, - param->hdfs_replicas, - param->hdfs_block_size); + hints->transferSize, + o->replicas, + o->block_size); } - hdfs_file = hdfsOpenFile( param->hdfs_fs, - testFileName, - fd_oflags, - param->transferSize, - param->hdfs_replicas, - param->hdfs_block_size); + hdfs_file = hdfsOpenFile( o->fs, testFileName, fd_oflags, hints->transferSize, o->replicas, o->block_size); if ( ! hdfs_file ) { ERR( "Failed to open the file" ); } @@ -324,14 +432,14 @@ static void *HDFS_Create_Or_Open( char *testFileName, IOR_param_t *param, unsign * For N-1 write, Rank 0 waits for the other ranks to open the file after it has. */ - if (( param->openFlags & IOR_WRONLY ) && - ( !param->filePerProc ) && + if (( flags & IOR_WRONLY ) && + ( !hints->filePerProc ) && ( rank == 0 )) { MPI_CHECK(MPI_Barrier(testComm), "barrier error"); } - if (param->verbose >= VERBOSE_4) { + if (verbose >= VERBOSE_4) { printf("<- HDFS_Create_Or_Open\n"); } return ((void *) hdfs_file ); @@ -341,36 +449,36 @@ static void *HDFS_Create_Or_Open( char *testFileName, IOR_param_t *param, unsign * Create and open a file through the HDFS interface. */ -static void *HDFS_Create( char *testFileName, IOR_param_t * param ) { - if (param->verbose >= VERBOSE_4) { +static aiori_fd_t *HDFS_Create(char *testFileName, int flags, aiori_mod_opt_t * param) { + if (verbose >= VERBOSE_4) { printf("-> HDFS_Create\n"); } - if (param->verbose >= VERBOSE_4) { + if (verbose >= VERBOSE_4) { printf("<- HDFS_Create\n"); } - return HDFS_Create_Or_Open( testFileName, param, TRUE ); + return HDFS_Create_Or_Open( testFileName, flags, param, TRUE ); } /* * Open a file through the HDFS interface. 
*/ -static void *HDFS_Open( char *testFileName, IOR_param_t * param ) { - if (param->verbose >= VERBOSE_4) { +static aiori_fd_t *HDFS_Open(char *testFileName, int flags, aiori_mod_opt_t * param) { + if (verbose >= VERBOSE_4) { printf("-> HDFS_Open\n"); } - if ( param->openFlags & IOR_CREAT ) { - if (param->verbose >= VERBOSE_4) { + if ( flags & IOR_CREAT ) { + if (verbose >= VERBOSE_4) { printf("<- HDFS_Open( ... TRUE)\n"); } - return HDFS_Create_Or_Open( testFileName, param, TRUE ); + return HDFS_Create_Or_Open( testFileName, flags, param, TRUE ); } else { - if (param->verbose >= VERBOSE_4) { + if (verbose >= VERBOSE_4) { printf("<- HDFS_Open( ... FALSE)\n"); } - return HDFS_Create_Or_Open( testFileName, param, FALSE ); + return HDFS_Create_Or_Open( testFileName, flags, param, FALSE ); } } @@ -378,19 +486,18 @@ static void *HDFS_Open( char *testFileName, IOR_param_t * param ) { * Write or read to file using the HDFS interface. */ -static IOR_offset_t HDFS_Xfer(int access, void *file, IOR_size_t * buffer, - IOR_offset_t length, IOR_param_t * param) { - if (param->verbose >= VERBOSE_4) { - printf("-> HDFS_Xfer(acc:%d, file:0x%llx, buf:0x%llx, len:%llu, 0x%llx)\n", +static IOR_offset_t HDFS_Xfer(int access, aiori_fd_t *file, IOR_size_t * buffer, + IOR_offset_t length, IOR_offset_t offset, aiori_mod_opt_t * param) { + if (verbose >= VERBOSE_4) { + printf("-> HDFS_Xfer(acc:%d, file:%p, buf:%p, len:%llu, %p)\n", access, file, buffer, length, param); } - + hdfs_options_t * o = (hdfs_options_t*) param; int xferRetries = 0; long long remaining = (long long)length; char* ptr = (char *)buffer; long long rc; - off_t offset = param->offset; - hdfsFS hdfs_fs = param->hdfs_fs; /* (void*) */ + hdfsFS hdfs_fs = o->fs; /* (void*) */ hdfsFile hdfs_file = (hdfsFile)file; /* (void*) */ @@ -401,37 +508,34 @@ static IOR_offset_t HDFS_Xfer(int access, void *file, IOR_size_t * buffer, if (verbose >= VERBOSE_4) { fprintf( stdout, "task %d writing to offset %lld\n", rank, - param->offset + length - remaining); + offset + length - remaining); } - if (param->verbose >= VERBOSE_4) { - printf("\thdfsWrite( 0x%llx, 0x%llx, 0x%llx, %lld)\n", + if (verbose >= VERBOSE_4) { + printf("\thdfsWrite( %p, %p, %p, %lld)\n", hdfs_fs, hdfs_file, ptr, remaining ); /* DEBUGGING */ } rc = hdfsWrite( hdfs_fs, hdfs_file, ptr, remaining ); if ( rc < 0 ) { ERR( "hdfsWrite() failed" ); } - offset += rc; - if ( param->fsyncPerWrite == TRUE ) { - HDFS_Fsync( hdfs_file, param ); + if ( hints->fsyncPerWrite == TRUE ) { + HDFS_Fsync( file, param ); } } else { /* READ or CHECK */ if (verbose >= VERBOSE_4) { fprintf( stdout, "task %d reading from offset %lld\n", - rank, - param->offset + length - remaining ); + rank, offset + length - remaining ); } - if (param->verbose >= VERBOSE_4) { - printf("\thdfsRead( 0x%llx, 0x%llx, 0x%llx, %lld)\n", + if (verbose >= VERBOSE_4) { + printf("\thdfsRead( %p, %p, %p, %lld)\n", hdfs_fs, hdfs_file, ptr, remaining ); /* DEBUGGING */ } - rc = hdfsRead( hdfs_fs, hdfs_file, ptr, remaining ); - + rc = hdfsPread(hdfs_fs, hdfs_file, offset, ptr, remaining); if ( rc == 0 ) { ERR( "hdfs_read() returned EOF prematurely" ); } @@ -449,9 +553,9 @@ static IOR_offset_t HDFS_Xfer(int access, void *file, IOR_size_t * buffer, rank, access == WRITE ? 
"hdfsWrite()" : "hdfs_read()", rc, remaining, - param->offset + length - remaining ); + offset + length - remaining ); - if ( param->singleXferAttempt == TRUE ) { + if ( hints->singleXferAttempt == TRUE ) { MPI_CHECK( MPI_Abort( MPI_COMM_WORLD, -1 ), "barrier error" ); } @@ -467,7 +571,16 @@ static IOR_offset_t HDFS_Xfer(int access, void *file, IOR_size_t * buffer, xferRetries++; } - if (param->verbose >= VERBOSE_4) { + if(access == WRITE){ + // flush user buffer, this makes the write visible to readers + // it is the expected semantics of read/writes + rc = hdfsHFlush(hdfs_fs, hdfs_file); + if(rc != 0){ + WARN("Error during flush"); + } + } + + if (verbose >= VERBOSE_4) { printf("<- HDFS_Xfer\n"); } return ( length ); @@ -476,67 +589,38 @@ static IOR_offset_t HDFS_Xfer(int access, void *file, IOR_size_t * buffer, /* * Perform hdfs_sync(). */ - -static void HDFS_Fsync( void *fd, IOR_param_t * param ) { - if (param->verbose >= VERBOSE_4) { - printf("-> HDFS_Fsync\n"); - } - hdfsFS hdfs_fs = param->hdfs_fs; /* (void *) */ +static void HDFS_Fsync(aiori_fd_t * fd, aiori_mod_opt_t * param) { + hdfs_options_t * o = (hdfs_options_t*) param; + hdfsFS hdfs_fs = o->fs; /* (void *) */ hdfsFile hdfs_file = (hdfsFile)fd; /* (void *) */ -#if 0 - if (param->verbose >= VERBOSE_4) { - printf("\thdfsHSync(0x%llx, 0x%llx)\n", hdfs_fs, hdfs_file); + if (verbose >= VERBOSE_4) { + printf("\thdfsFlush(%p, %p)\n", hdfs_fs, hdfs_file); } if ( hdfsHSync( hdfs_fs, hdfs_file ) != 0 ) { - EWARN( "hdfsHSync() failed" ); - } -#elif 0 - if (param->verbose >= VERBOSE_4) { - printf("\thdfsHFlush(0x%llx, 0x%llx)\n", hdfs_fs, hdfs_file); - } - if ( hdfsHFlush( hdfs_fs, hdfs_file ) != 0 ) { - EWARN( "hdfsHFlush() failed" ); - } -#else - if (param->verbose >= VERBOSE_4) { - printf("\thdfsFlush(0x%llx, 0x%llx)\n", hdfs_fs, hdfs_file); - } - if ( hdfsFlush( hdfs_fs, hdfs_file ) != 0 ) { + // Hsync is implemented to flush out data with newer Hadoop versions EWARN( "hdfsFlush() failed" ); } -#endif - - if (param->verbose >= VERBOSE_4) { - printf("<- HDFS_Fsync\n"); - } } /* * Close a file through the HDFS interface. */ -static void HDFS_Close( void *fd, IOR_param_t * param ) { - if (param->verbose >= VERBOSE_4) { +static void HDFS_Close(aiori_fd_t * fd, aiori_mod_opt_t * param) { + if (verbose >= VERBOSE_4) { printf("-> HDFS_Close\n"); } + hdfs_options_t * o = (hdfs_options_t*) param; - hdfsFS hdfs_fs = param->hdfs_fs; /* (void *) */ + hdfsFS hdfs_fs = o->fs; /* (void *) */ hdfsFile hdfs_file = (hdfsFile)fd; /* (void *) */ - int open_flags; - - if ( param->openFlags & IOR_WRONLY ) { - open_flags = O_CREAT | O_WRONLY; - } else { - open_flags = O_RDONLY; - } - if ( hdfsCloseFile( hdfs_fs, hdfs_file ) != 0 ) { ERR( "hdfsCloseFile() failed" ); } - if (param->verbose >= VERBOSE_4) { + if (verbose >= VERBOSE_4) { printf("<- HDFS_Close\n"); } } @@ -547,119 +631,66 @@ static void HDFS_Close( void *fd, IOR_param_t * param ) { * NOTE: The signature for ior_aiori.delete doesn't include a parameter to * select recursive deletes. We'll assume that that is never needed. */ -static void HDFS_Delete( char *testFileName, IOR_param_t * param ) { - if (param->verbose >= VERBOSE_4) { +static void HDFS_Delete( char *testFileName, aiori_mod_opt_t * param ) { + if (verbose >= VERBOSE_4) { printf("-> HDFS_Delete\n"); } + hdfs_options_t * o = (hdfs_options_t*) param; char errmsg[256]; /* initialize file-system handle, if needed */ - hdfs_connect( param ); + hdfs_connect(o); - if ( ! 
param->hdfs_fs ) - ERR_SIMPLE( "Can't delete a file without an HDFS connection" ); + if ( ! o->fs ) + ERR( "Can't delete a file without an HDFS connection" ); - if ( hdfsDelete( param->hdfs_fs, testFileName, 0 ) != 0 ) { - sprintf(errmsg, - "[RANK %03d]: hdfsDelete() of file \"%s\" failed\n", + if ( hdfsDelete( o->fs, testFileName, 0 ) != 0 ) { + sprintf(errmsg, "[RANK %03d]: hdfsDelete() of file \"%s\" failed\n", rank, testFileName); EWARN( errmsg ); } - if (param->verbose >= VERBOSE_4) { + if (verbose >= VERBOSE_4) { printf("<- HDFS_Delete\n"); } } -/* - * Determine api version. - */ - -static void HDFS_SetVersion( IOR_param_t * param ) { - if (param->verbose >= VERBOSE_4) { - printf("-> HDFS_SetVersion\n"); - } - - strcpy( param->apiVersion, param->api ); - if (param->verbose >= VERBOSE_4) { - printf("<- HDFS_SetVersion\n"); - } -} - /* * Use hdfsGetPathInfo() to get info about file? * Is there an fstat we can use on hdfs? * Should we just use POSIX fstat? */ -static IOR_offset_t -HDFS_GetFileSize(IOR_param_t * param, - MPI_Comm testComm, +static IOR_offset_t HDFS_GetFileSize(aiori_mod_opt_t * param, char * testFileName) { - if (param->verbose >= VERBOSE_4) { + if (verbose >= VERBOSE_4) { printf("-> HDFS_GetFileSize(%s)\n", testFileName); } + hdfs_options_t * o = (hdfs_options_t*) param; IOR_offset_t aggFileSizeFromStat; IOR_offset_t tmpMin, tmpMax, tmpSum; /* make sure file-system is connected */ - hdfs_connect( param ); + hdfs_connect( o ); /* file-info struct includes size in bytes */ - if (param->verbose >= VERBOSE_4) { - printf("\thdfsGetPathInfo(%s) ...", testFileName);fflush(stdout); + if (verbose >= VERBOSE_4) { + printf("\thdfsGetPathInfo(%s) ...", testFileName); + fflush(stdout); } - hdfsFileInfo* info = hdfsGetPathInfo( param->hdfs_fs, testFileName ); + hdfsFileInfo* info = hdfsGetPathInfo( o->fs, testFileName ); if ( ! 
info ) - ERR_SIMPLE( "hdfsGetPathInfo() failed" ); - if (param->verbose >= VERBOSE_4) { + ERR( "hdfsGetPathInfo() failed" ); + if (verbose >= VERBOSE_4) { printf("done.\n");fflush(stdout); } aggFileSizeFromStat = info->mSize; - if ( param->filePerProc == TRUE ) { - if (param->verbose >= VERBOSE_4) { - printf("\tall-reduce (1)\n"); - } - MPI_CHECK( - MPI_Allreduce( - &aggFileSizeFromStat, &tmpSum, 1, MPI_LONG_LONG_INT, MPI_SUM, testComm ), - "cannot total data moved" ); - - aggFileSizeFromStat = tmpSum; - } - else { - if (param->verbose >= VERBOSE_4) { - printf("\tall-reduce (2a)\n"); - } - MPI_CHECK( - MPI_Allreduce( - &aggFileSizeFromStat, &tmpMin, 1, MPI_LONG_LONG_INT, MPI_MIN, testComm ), - "cannot total data moved" ); - - if (param->verbose >= VERBOSE_4) { - printf("\tall-reduce (2b)\n"); - } - MPI_CHECK( - MPI_Allreduce( - &aggFileSizeFromStat, &tmpMax, 1, MPI_LONG_LONG_INT, MPI_MAX, testComm ), - "cannot total data moved" ); - - if ( tmpMin != tmpMax ) { - if ( rank == 0 ) { - WARN( "inconsistent file size by different tasks" ); - } - - /* incorrect, but now consistent across tasks */ - aggFileSizeFromStat = tmpMin; - } - } - - if (param->verbose >= VERBOSE_4) { + if (verbose >= VERBOSE_4) { printf("<- HDFS_GetFileSize [%llu]\n", aggFileSizeFromStat); } return ( aggFileSizeFromStat ); diff --git a/src/aiori-IME.c b/src/aiori-IME.c index 500f380..2371fdc 100755 --- a/src/aiori-IME.c +++ b/src/aiori-IME.c @@ -21,8 +21,8 @@ #include #include #include -#include /* sys_errlist */ -#include /* IO operations */ +#include /* sys_errlist */ +#include /* IO operations */ #include "ior.h" #include "iordef.h" @@ -30,63 +30,68 @@ #include "utilities.h" #include "ime_native.h" -#ifndef O_BINARY /* Required on Windows */ +#define IME_UNUSED(x) (void)(x) /* Silence compiler warnings */ + +#ifndef O_BINARY /* Required on Windows */ # define O_BINARY 0 #endif /**************************** P R O T O T Y P E S *****************************/ -static void *IME_Create(char *, IOR_param_t *); -static void *IME_Open(char *, IOR_param_t *); -static void IME_Close(void *, IOR_param_t *); -static void IME_Delete(char *, IOR_param_t *); -static char *IME_GetVersion(); -static void IME_Fsync(void *, IOR_param_t *); -static int IME_Access(const char *, int, IOR_param_t *); -static IOR_offset_t IME_GetFileSize(IOR_param_t *, MPI_Comm, char *); -static IOR_offset_t IME_Xfer(int, void *, IOR_size_t *, - IOR_offset_t, IOR_param_t *); -static int IME_StatFS(const char *, ior_aiori_statfs_t *, - IOR_param_t *); -static int IME_RmDir(const char *, IOR_param_t *); -static int IME_MkDir(const char *, mode_t, IOR_param_t *); -static int IME_Stat(const char *, struct stat *, IOR_param_t *); +aiori_fd_t *IME_Create(char *, int, aiori_mod_opt_t *); +aiori_fd_t *IME_Open(char *, int, aiori_mod_opt_t *); +void IME_Close(aiori_fd_t *, aiori_mod_opt_t *); +void IME_Delete(char *, aiori_mod_opt_t *); +char *IME_GetVersion(); +void IME_Fsync(aiori_fd_t *, aiori_mod_opt_t *); +int IME_Access(const char *, int, aiori_mod_opt_t *); +IOR_offset_t IME_GetFileSize(aiori_mod_opt_t *, char *); +IOR_offset_t IME_Xfer(int, aiori_fd_t *, IOR_size_t *, IOR_offset_t, + IOR_offset_t, aiori_mod_opt_t *); +int IME_Statfs(const char *, ior_aiori_statfs_t *, + aiori_mod_opt_t *); +int IME_Rmdir(const char *, aiori_mod_opt_t *); +int IME_Mkdir(const char *, mode_t, aiori_mod_opt_t *); +int IME_Stat(const char *, struct stat *, aiori_mod_opt_t *); +void IME_Xferhints(aiori_xfer_hint_t *params); #if (IME_NATIVE_API_VERSION >= 132) -static int 
IME_Mknod(char *); -static void IME_Sync(IOR_param_t *); +int IME_Mknod(char *); +void IME_Sync(aiori_mod_opt_t *param); #endif -static void IME_Initialize(); -static void IME_Finalize(); +void IME_Initialize(); +void IME_Finalize(); +/****************************** O P T I O N S *********************************/ -/************************** O P T I O N S *****************************/ typedef struct{ - int direct_io; + int direct_io; } ime_options_t; +option_help *IME_Options(aiori_mod_opt_t **init_backend_options, + aiori_mod_opt_t *init_values) +{ + ime_options_t *o = malloc(sizeof(ime_options_t)); -option_help * IME_options(void ** init_backend_options, void * init_values){ - ime_options_t * o = malloc(sizeof(ime_options_t)); + if (init_values != NULL) + memcpy(o, init_values, sizeof(ime_options_t)); + else + o->direct_io = 0; - if (init_values != NULL){ - memcpy(o, init_values, sizeof(ime_options_t)); - }else{ - o->direct_io = 0; - } + *init_backend_options = (aiori_mod_opt_t*)o; - *init_backend_options = o; + option_help h[] = { + {0, "ime.odirect", "Direct I/O Mode", OPTION_FLAG, 'd', & o->direct_io}, + LAST_OPTION + }; + option_help *help = malloc(sizeof(h)); + memcpy(help, h, sizeof(h)); - option_help h [] = { - {0, "ime.odirect", "Direct I/O Mode", OPTION_FLAG, 'd', & o->direct_io}, - LAST_OPTION - }; - option_help * help = malloc(sizeof(h)); - memcpy(help, h, sizeof(h)); - return help; + return help; } + /************************** D E C L A R A T I O N S ***************************/ extern int rank; @@ -100,19 +105,20 @@ ior_aiori_t ime_aiori = { .create = IME_Create, .open = IME_Open, .xfer = IME_Xfer, + .xfer_hints = IME_Xferhints, .close = IME_Close, .delete = IME_Delete, .get_version = IME_GetVersion, .fsync = IME_Fsync, .get_file_size = IME_GetFileSize, .access = IME_Access, - .statfs = IME_StatFS, - .rmdir = IME_RmDir, - .mkdir = IME_MkDir, + .statfs = IME_Statfs, + .rmdir = IME_Rmdir, + .mkdir = IME_Mkdir, .stat = IME_Stat, .initialize = IME_Initialize, .finalize = IME_Finalize, - .get_options = IME_options, + .get_options = IME_Options, #if (IME_NATIVE_API_VERSION >= 132) .sync = IME_Sync, .mknod = IME_Mknod, @@ -120,72 +126,92 @@ ior_aiori_t ime_aiori = { .enable_mdtest = true, }; +static aiori_xfer_hint_t *hints = NULL; +static bool ime_initialized = false; + + /***************************** F U N C T I O N S ******************************/ +void IME_Xferhints(aiori_xfer_hint_t *params) +{ + hints = params; +} + /* * Initialize IME (before MPI is started). */ -static void IME_Initialize() +void IME_Initialize() { + if (ime_initialized) + return; + ime_native_init(); + ime_initialized = true; } /* * Finlize IME (after MPI is shutdown). */ -static void IME_Finalize() +void IME_Finalize() { + if (!ime_initialized) + return; + (void)ime_native_finalize(); + ime_initialized = false; } /* * Try to access a file through the IME interface. */ -static int IME_Access(const char *path, int mode, IOR_param_t *param) + +int IME_Access(const char *path, int mode, aiori_mod_opt_t *module_options) { - (void)param; + IME_UNUSED(module_options); return ime_native_access(path, mode); } /* - * Creat and open a file through the IME interface. + * Create and open a file through the IME interface. */ -static void *IME_Create(char *testFileName, IOR_param_t *param) +aiori_fd_t *IME_Create(char *testFileName, int flags, aiori_mod_opt_t *param) { - return IME_Open(testFileName, param); + return IME_Open(testFileName, flags, param); } /* * Open a file through the IME interface. 
*/ -static void *IME_Open(char *testFileName, IOR_param_t *param) +aiori_fd_t *IME_Open(char *testFileName, int flags, aiori_mod_opt_t *param) { int fd_oflag = O_BINARY; int *fd; + if (hints->dryRun) + return NULL; + fd = (int *)malloc(sizeof(int)); if (fd == NULL) ERR("Unable to malloc file descriptor"); - ime_options_t * o = (ime_options_t*) param->backend_options; - if (o->direct_io == TRUE){ - set_o_direct_flag(&fd_oflag); - } + ime_options_t *o = (ime_options_t*) param; + if (o->direct_io == TRUE) + set_o_direct_flag(&fd_oflag); - if (param->openFlags & IOR_RDONLY) + if (flags & IOR_RDONLY) fd_oflag |= O_RDONLY; - if (param->openFlags & IOR_WRONLY) + if (flags & IOR_WRONLY) fd_oflag |= O_WRONLY; - if (param->openFlags & IOR_RDWR) + if (flags & IOR_RDWR) fd_oflag |= O_RDWR; - if (param->openFlags & IOR_APPEND) + if (flags & IOR_APPEND) fd_oflag |= O_APPEND; - if (param->openFlags & IOR_CREAT) + if (flags & IOR_CREAT) fd_oflag |= O_CREAT; - if (param->openFlags & IOR_EXCL) + if (flags & IOR_EXCL) fd_oflag |= O_EXCL; - if (param->openFlags & IOR_TRUNC) + if (flags & IOR_TRUNC) fd_oflag |= O_TRUNC; *fd = ime_native_open(testFileName, fd_oflag, 0664); @@ -194,14 +220,14 @@ static void *IME_Open(char *testFileName, IOR_param_t *param) ERR("cannot open file"); } - return((void *)fd); + return (aiori_fd_t*) fd; } /* * Write or read access to file using the IM interface. */ -static IOR_offset_t IME_Xfer(int access, void *file, IOR_size_t *buffer, - IOR_offset_t length, IOR_param_t *param) +IOR_offset_t IME_Xfer(int access, aiori_fd_t *file, IOR_size_t *buffer, + IOR_offset_t length, IOR_offset_t offset, aiori_mod_opt_t *param) { int xferRetries = 0; long long remaining = (long long)length; @@ -209,25 +235,28 @@ static IOR_offset_t IME_Xfer(int access, void *file, IOR_size_t *buffer, int fd = *(int *)file; long long rc; + if (hints->dryRun) + return length; + while (remaining > 0) { /* write/read file */ if (access == WRITE) { /* WRITE */ if (verbose >= VERBOSE_4) { fprintf(stdout, "task %d writing to offset %lld\n", - rank, param->offset + length - remaining); + rank, offset + length - remaining); } - rc = ime_native_pwrite(fd, ptr, remaining, param->offset); + rc = ime_native_pwrite(fd, ptr, remaining, offset); - if (param->fsyncPerWrite) - IME_Fsync(&fd, param); + if (hints->fsyncPerWrite) + IME_Fsync(file, param); } else { /* READ or CHECK */ if (verbose >= VERBOSE_4) { fprintf(stdout, "task %d reading from offset %lld\n", - rank, param->offset + length - remaining); + rank, offset + length - remaining); } - rc = ime_native_pread(fd, ptr, remaining, param->offset); + rc = ime_native_pread(fd, ptr, remaining, offset); if (rc == 0) ERR("hit EOF prematurely"); else if (rc < 0) @@ -238,9 +267,9 @@ static IOR_offset_t IME_Xfer(int access, void *file, IOR_size_t *buffer, fprintf(stdout, "WARNING: Task %d, partial %s, %lld of " "%lld bytes at offset %lld\n", rank, access == WRITE ? "write" : "read", rc, - remaining, param->offset + length - remaining ); + remaining, offset + length - remaining ); - if (param->singleXferAttempt) { + if (hints->singleXferAttempt) { MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, -1), "barrier error"); } @@ -264,7 +293,7 @@ static IOR_offset_t IME_Xfer(int access, void *file, IOR_size_t *buffer, /* * Perform fsync(). 
*/ -static void IME_Fsync(void *fd, IOR_param_t *param) +void IME_Fsync(aiori_fd_t *fd, aiori_mod_opt_t *param) { if (ime_native_fsync(*(int *)fd) != 0) WARN("cannot perform fsync on file"); @@ -273,33 +302,34 @@ static void IME_Fsync(void *fd, IOR_param_t *param) /* * Close a file through the IME interface. */ -static void IME_Close(void *fd, IOR_param_t *param) +void IME_Close(aiori_fd_t *file, aiori_mod_opt_t *param) { - if (ime_native_close(*(int *)fd) != 0) - { - free(fd); - ERR("cannot close file"); - } - else - free(fd); + if (hints->dryRun) + return; + + if (ime_native_close(*(int*)file) != 0) + ERRF("Cannot close file descriptor: %d", *(int*)file); + + free(file); } /* * Delete a file through the IME interface. */ -static void IME_Delete(char *testFileName, IOR_param_t *param) +void IME_Delete(char *testFileName, aiori_mod_opt_t *param) { - char errmsg[256]; - sprintf(errmsg, "[RANK %03d]:cannot delete file %s\n", - rank, testFileName); + if (hints->dryRun) + return; + if (ime_native_unlink(testFileName) != 0) - WARN(errmsg); + EWARNF("[RANK %03d]: cannot delete file \"%s\"\n", + rank, testFileName); } /* * Determine API version. */ -static char *IME_GetVersion() +char *IME_GetVersion() { static char ver[1024] = {}; #if (IME_NATIVE_API_VERSION >= 120) @@ -310,18 +340,17 @@ static char *IME_GetVersion() return ver; } -static int IME_StatFS(const char *path, ior_aiori_statfs_t *stat_buf, - IOR_param_t *param) +int IME_Statfs(const char *path, ior_aiori_statfs_t *stat_buf, + aiori_mod_opt_t *module_options) { - (void)param; + IME_UNUSED(module_options); #if (IME_NATIVE_API_VERSION >= 130) struct statvfs statfs_buf; int ret = ime_native_statvfs(path, &statfs_buf); if (ret) - return ret; - + return ret; stat_buf->f_bsize = statfs_buf.f_bsize; stat_buf->f_blocks = statfs_buf.f_blocks; stat_buf->f_bfree = statfs_buf.f_bfree; @@ -330,38 +359,37 @@ static int IME_StatFS(const char *path, ior_aiori_statfs_t *stat_buf, return 0; #else - (void)path; - (void)stat_buf; + IME_UNUSED(path); + IME_UNUSED(stat_buf); WARN("statfs is currently not supported in IME backend!"); return -1; #endif } - -static int IME_MkDir(const char *path, mode_t mode, IOR_param_t *param) +int IME_Mkdir(const char *path, mode_t mode, aiori_mod_opt_t * module_options) { - (void)param; + IME_UNUSED(module_options); #if (IME_NATIVE_API_VERSION >= 130) return ime_native_mkdir(path, mode); #else - (void)path; - (void)mode; + IME_UNUSED(path); + IME_UNUSED(mode); WARN("mkdir not supported in IME backend!"); return -1; #endif } -static int IME_RmDir(const char *path, IOR_param_t *param) +int IME_Rmdir(const char *path, aiori_mod_opt_t *module_options) { - (void)param; + IME_UNUSED(module_options); #if (IME_NATIVE_API_VERSION >= 130) return ime_native_rmdir(path); #else - (void)path; + IME_UNUSED(path); WARN("rmdir not supported in IME backend!"); return -1; @@ -371,9 +399,10 @@ static int IME_RmDir(const char *path, IOR_param_t *param) /* * Perform stat() through the IME interface. */ -static int IME_Stat(const char *path, struct stat *buf, IOR_param_t *param) +int IME_Stat(const char *path, struct stat *buf, + aiori_mod_opt_t *module_options) { - (void)param; + IME_UNUSED(module_options); return ime_native_stat(path, buf); } @@ -381,62 +410,39 @@ static int IME_Stat(const char *path, struct stat *buf, IOR_param_t *param) /* * Use IME stat() to return aggregate file size. 
*/ -static IOR_offset_t IME_GetFileSize(IOR_param_t *test, MPI_Comm testComm, - char *testFileName) +IOR_offset_t IME_GetFileSize(aiori_mod_opt_t *test, char *testFileName) { struct stat stat_buf; - IOR_offset_t aggFileSizeFromStat, tmpMin, tmpMax, tmpSum; - if (ime_native_stat(testFileName, &stat_buf) != 0) { - ERR("cannot get status of written file"); - } - aggFileSizeFromStat = stat_buf.st_size; + if (hints->dryRun) + return 0; - if (test->filePerProc) { - MPI_CHECK(MPI_Allreduce(&aggFileSizeFromStat, &tmpSum, 1, - MPI_LONG_LONG_INT, MPI_SUM, testComm), - "cannot total data moved"); - aggFileSizeFromStat = tmpSum; - } else { - MPI_CHECK(MPI_Allreduce(&aggFileSizeFromStat, &tmpMin, 1, - MPI_LONG_LONG_INT, MPI_MIN, testComm), - "cannot total data moved"); - MPI_CHECK(MPI_Allreduce(&aggFileSizeFromStat, &tmpMax, 1, - MPI_LONG_LONG_INT, MPI_MAX, testComm), - "cannot total data moved"); - - if (tmpMin != tmpMax) { - if (rank == 0) { - WARN("inconsistent file size by different tasks"); - } - /* incorrect, but now consistent across tasks */ - aggFileSizeFromStat = tmpMin; - } - } - - return(aggFileSizeFromStat); + if (ime_native_stat(testFileName, &stat_buf) != 0) + ERRF("cannot get status of written file %s", + testFileName); + return stat_buf.st_size; } #if (IME_NATIVE_API_VERSION >= 132) /* * Create a file through mknod interface. */ -static int IME_Mknod(char *testFileName) +int IME_Mknod(char *testFileName) { - int ret = ime_native_mknod(testFileName, S_IFREG | S_IRUSR, 0); - if (ret < 0) - ERR("mknod failed"); + int ret = ime_native_mknod(testFileName, S_IFREG | S_IRUSR, 0); + if (ret < 0) + ERR("mknod failed"); - return ret; + return ret; } /* * Use IME sync to flush page cache of all opened files. */ -static void IME_Sync(IOR_param_t * param) +void IME_Sync(aiori_mod_opt_t *param) { - int ret = ime_native_sync(0); - if (ret != 0) - FAIL("Error executing the sync command."); + int ret = ime_native_sync(0); + if (ret != 0) + FAIL("Error executing the sync command."); } #endif diff --git a/src/aiori-MMAP.c b/src/aiori-MMAP.c index 7ed3b90..5fa13f8 100644 --- a/src/aiori-MMAP.c +++ b/src/aiori-MMAP.c @@ -22,6 +22,7 @@ #include "ior.h" #include "aiori.h" +#include "aiori-POSIX.h" #include "iordef.h" #include "utilities.h" @@ -86,7 +87,7 @@ static aiori_xfer_hint_t * hints = NULL; static void MMAP_xfer_hints(aiori_xfer_hint_t * params){ hints = params; - aiori_posix_xfer_hints(params); + POSIX_xfer_hints(params); } static int MMAP_check_params(aiori_mod_opt_t * options){ @@ -128,7 +129,7 @@ static void ior_mmap_file(int *file, int mflags, void *param) } /* - * Creat and open a file through the POSIX interface, then setup mmap. + * Create and open a file through the POSIX interface, then setup mmap. 
*/ static aiori_fd_t *MMAP_Create(char *testFileName, int flags, aiori_mod_opt_t * param) { diff --git a/src/aiori-MPIIO.c b/src/aiori-MPIIO.c index 53eaad0..070cff0 100755 --- a/src/aiori-MPIIO.c +++ b/src/aiori-MPIIO.c @@ -40,7 +40,6 @@ static IOR_offset_t MPIIO_Xfer(int, aiori_fd_t *, IOR_size_t *, static void MPIIO_Close(aiori_fd_t *, aiori_mod_opt_t *); static char* MPIIO_GetVersion(); static void MPIIO_Fsync(aiori_fd_t *, aiori_mod_opt_t *); -static void MPIIO_xfer_hints(aiori_xfer_hint_t * params); static int MPIIO_check_params(aiori_mod_opt_t * options); /************************** D E C L A R A T I O N S ***************************/ @@ -48,6 +47,7 @@ static int MPIIO_check_params(aiori_mod_opt_t * options); typedef struct{ MPI_File fd; MPI_Datatype transferType; /* datatype for transfer */ + MPI_Datatype contigType; /* elem datatype */ MPI_Datatype fileType; /* filetype for file view */ } mpiio_fd_t; @@ -73,7 +73,7 @@ static option_help * MPIIO_options(aiori_mod_opt_t ** init_backend_options, aior {0, "mpiio.hintsFileName","Full name for hints file", OPTION_OPTIONAL_ARGUMENT, 's', & o->hintsFileName}, {0, "mpiio.showHints", "Show MPI hints", OPTION_FLAG, 'd', & o->showHints}, {0, "mpiio.preallocate", "Preallocate file size", OPTION_FLAG, 'd', & o->preallocate}, - {0, "mpiio.useStridedDatatype", "put strided access into datatype [not working]", OPTION_FLAG, 'd', & o->useStridedDatatype}, + {0, "mpiio.useStridedDatatype", "put strided access into datatype", OPTION_FLAG, 'd', & o->useStridedDatatype}, //{'P', NULL, "useSharedFilePointer -- use shared file pointer [not working]", OPTION_FLAG, 'd', & params->useSharedFilePointer}, {0, "mpiio.useFileView", "Use MPI_File_set_view", OPTION_FLAG, 'd', & o->useFileView}, LAST_OPTION @@ -108,7 +108,7 @@ ior_aiori_t mpiio_aiori = { /***************************** F U N C T I O N S ******************************/ static aiori_xfer_hint_t * hints = NULL; -static void MPIIO_xfer_hints(aiori_xfer_hint_t * params){ +void MPIIO_xfer_hints(aiori_xfer_hint_t * params){ hints = params; } @@ -121,8 +121,6 @@ static int MPIIO_check_params(aiori_mod_opt_t * module_options){ ERR("segment size must be < 2GiB"); if (param->useSharedFilePointer) ERR("shared file pointer not implemented"); - if (param->useStridedDatatype) - ERR("strided datatype not implemented"); if (param->useStridedDatatype && (hints->blockSize < sizeof(IOR_size_t) || hints->transferSize < sizeof(IOR_size_t))) @@ -140,10 +138,10 @@ static int MPIIO_check_params(aiori_mod_opt_t * module_options){ */ int MPIIO_Access(const char *path, int mode, aiori_mod_opt_t *module_options) { - mpiio_options_t * param = (mpiio_options_t*) module_options; if(hints->dryRun){ return MPI_SUCCESS; } + mpiio_options_t * param = (mpiio_options_t*) module_options; MPI_File fd; int mpi_mode = MPI_MODE_UNIQUE_OPEN; MPI_Info mpiHints = MPI_INFO_NULL; @@ -185,9 +183,7 @@ static aiori_fd_t *MPIIO_Open(char *testFileName, int flags, aiori_mod_opt_t * m offsetFactor, tasksPerFile, transfersPerBlock = hints->blockSize / hints->transferSize; - struct fileTypeStruct { - int globalSizes[2], localSizes[2], startIndices[2]; - } fileTypeStruct; + mpiio_fd_t * mfd = malloc(sizeof(mpiio_fd_t)); memset(mfd, 0, sizeof(mpiio_fd_t)); @@ -272,15 +268,18 @@ static aiori_fd_t *MPIIO_Open(char *testFileName, int flags, aiori_mod_opt_t * m hints->numTasks)), "cannot preallocate file"); } + + /* create file view */ if (param->useFileView) { + /* Create in-memory datatype */ + MPI_CHECK(MPI_Type_contiguous (hints->transferSize / 
sizeof(IOR_size_t), MPI_LONG_LONG_INT, & mfd->contigType), "cannot create contiguous datatype"); + MPI_CHECK(MPI_Type_create_resized( mfd->contigType, 0, 0, & mfd->transferType), "cannot create resized type"); + MPI_CHECK(MPI_Type_commit(& mfd->contigType), "cannot commit datatype"); + MPI_CHECK(MPI_Type_commit(& mfd->transferType), "cannot commit datatype"); + /* create contiguous transfer datatype */ - MPI_CHECK(MPI_Type_contiguous - (hints->transferSize / sizeof(IOR_size_t), - MPI_LONG_LONG_INT, & mfd->transferType), - "cannot create contiguous datatype"); - MPI_CHECK(MPI_Type_commit(& mfd->transferType), - "cannot commit datatype"); + if (hints->filePerProc) { offsetFactor = 0; tasksPerFile = 1; @@ -289,33 +288,39 @@ static aiori_fd_t *MPIIO_Open(char *testFileName, int flags, aiori_mod_opt_t * m tasksPerFile = hints->numTasks; } - /* - * create file type using subarray - */ - fileTypeStruct.globalSizes[0] = 1; - fileTypeStruct.globalSizes[1] = - transfersPerBlock * tasksPerFile; - fileTypeStruct.localSizes[0] = 1; - fileTypeStruct.localSizes[1] = transfersPerBlock; - fileTypeStruct.startIndices[0] = 0; - fileTypeStruct.startIndices[1] = - transfersPerBlock * offsetFactor; + if(! hints->dryRun) { + if(! param->useStridedDatatype){ + struct fileTypeStruct { + int globalSizes[2], localSizes[2], startIndices[2]; + } fileTypeStruct; - MPI_CHECK(MPI_Type_create_subarray - (2, fileTypeStruct.globalSizes, - fileTypeStruct.localSizes, - fileTypeStruct.startIndices, MPI_ORDER_C, - mfd->transferType, & mfd->fileType), - "cannot create subarray"); - MPI_CHECK(MPI_Type_commit(& mfd->fileType), - "cannot commit datatype"); - - if(! hints->dryRun){ - MPI_CHECK(MPI_File_set_view(mfd->fd, (MPI_Offset) 0, - mfd->transferType, - mfd->fileType, "native", + /* + * create file type using subarray + */ + fileTypeStruct.globalSizes[0] = 1; + fileTypeStruct.globalSizes[1] = transfersPerBlock * tasksPerFile; + fileTypeStruct.localSizes[0] = 1; + fileTypeStruct.localSizes[1] = transfersPerBlock; + fileTypeStruct.startIndices[0] = 0; + fileTypeStruct.startIndices[1] = transfersPerBlock * offsetFactor; + + MPI_CHECK(MPI_Type_create_subarray + (2, fileTypeStruct.globalSizes, + fileTypeStruct.localSizes, + fileTypeStruct.startIndices, MPI_ORDER_C, + mfd->contigType, & mfd->fileType), + "cannot create subarray"); + MPI_CHECK(MPI_Type_commit(& mfd->fileType), "cannot commit datatype"); + MPI_CHECK(MPI_File_set_view(mfd->fd, 0, + mfd->contigType, + mfd->fileType, + "native", (MPI_Info) MPI_INFO_NULL), "cannot set file view"); + }else{ + MPI_CHECK(MPI_Type_create_resized(mfd->contigType, 0, tasksPerFile * hints->blockSize, & mfd->fileType), "cannot create MPI_Type_create_hvector"); + MPI_CHECK(MPI_Type_commit(& mfd->fileType), "cannot commit datatype"); + } } } if (mpiHints != MPI_INFO_NULL) @@ -380,7 +385,7 @@ static IOR_offset_t MPIIO_Xfer(int access, aiori_fd_t * fdp, IOR_size_t * buffer * Access_ordered = MPI_File_read_ordered; */ } - + /* * 'useFileView' uses derived datatypes and individual file pointers */ @@ -391,16 +396,28 @@ static IOR_offset_t MPIIO_Xfer(int access, aiori_fd_t * fdp, IOR_size_t * buffer /* if unsuccessful */ length = -1; } else { + /* - * 'useStridedDatatype' fits multi-strided pattern into a datatype; - * must use 'length' to determine repetitions (fix this for - * multi-segments someday, WEL): - * e.g., 'IOR -s 2 -b 32K -t 32K -a MPIIO -S' - */ + * 'useStridedDatatype' fits multi-strided pattern into a datatype; + * must use 'length' to determine repetitions (fix this for + * 
multi-segments someday, WEL): + * e.g., 'IOR -s 2 -b 32K -t 32K -a MPIIO --mpiio.useStridedDatatype --mpiio.useFileView' + */ if (param->useStridedDatatype) { - length = hints->segmentCount; - } else { - length = 1; + if(offset >= (rank+1) * hints->blockSize){ + /* we shall write only once per transferSize */ + /* printf("FAKE access %d %lld\n", rank, offset); */ + return hints->transferSize; + } + length = hints->segmentCount; + MPI_CHECK(MPI_File_set_view(mfd->fd, offset, + mfd->contigType, + mfd->fileType, + "native", + (MPI_Info) MPI_INFO_NULL), "cannot set file view"); + /* printf("ACCESS %d %lld -> %lld\n", rank, offset, length); */ + }else{ + length = 1; } if (hints->collective) { /* individual, collective call */ @@ -415,7 +432,12 @@ static IOR_offset_t MPIIO_Xfer(int access, aiori_fd_t * fdp, IOR_size_t * buffer mfd->transferType, &status), "cannot access noncollective"); } - length *= hints->transferSize; /* for return value in bytes */ + /* MPI-IO driver does "nontcontiguous" by transfering + * 'segment' regions of 'transfersize' bytes, but + * our caller WriteOrReadSingle does not know how to + * deal with us reporting that we wrote N times more + * data than requested. */ + length = hints->transferSize; } } else { /* @@ -456,7 +478,7 @@ static IOR_offset_t MPIIO_Xfer(int access, aiori_fd_t * fdp, IOR_size_t * buffer } } } - return (length); + return hints->transferSize; } /* @@ -483,11 +505,12 @@ static void MPIIO_Close(aiori_fd_t *fdp, aiori_mod_opt_t * module_options) MPI_CHECK(MPI_File_close(& mfd->fd), "cannot close file"); } if (param->useFileView == TRUE) { - /* - * need to free the datatype, so done in the close process - */ - MPI_CHECK(MPI_Type_free(& mfd->fileType), "cannot free MPI file datatype"); - MPI_CHECK(MPI_Type_free(& mfd->transferType), "cannot free MPI transfer datatype"); + /* + * need to free the datatype, so done in the close process + */ + MPI_CHECK(MPI_Type_free(& mfd->fileType), "cannot free MPI file datatype"); + MPI_CHECK(MPI_Type_free(& mfd->transferType), "cannot free MPI transfer datatype"); + MPI_CHECK(MPI_Type_free(& mfd->contigType), "cannot free type"); } free(fdp); } @@ -562,8 +585,7 @@ static IOR_offset_t SeekOffset(MPI_File fd, IOR_offset_t offset, * Use MPI_File_get_size() to return aggregate file size. * NOTE: This function is used by the HDF5 and NCMPI backends. 
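 *
 * Roughly, the sequence is (sketch; error handling omitted):
 *
 *   MPI_File   fh;
 *   MPI_Offset size;
 *   MPI_File_open(testComm, testFileName, MPI_MODE_RDONLY,
 *                 MPI_INFO_NULL, &fh);
 *   MPI_File_get_size(fh, &size);
 *   MPI_File_close(&fh);
 *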
*/ -IOR_offset_t MPIIO_GetFileSize(aiori_mod_opt_t * module_options, MPI_Comm testComm, - char *testFileName) +IOR_offset_t MPIIO_GetFileSize(aiori_mod_opt_t * module_options, char *testFileName) { mpiio_options_t * test = (mpiio_options_t*) module_options; if(hints->dryRun) @@ -589,26 +611,5 @@ IOR_offset_t MPIIO_GetFileSize(aiori_mod_opt_t * module_options, MPI_Comm testCo if (mpiHints != MPI_INFO_NULL) MPI_CHECK(MPI_Info_free(&mpiHints), "MPI_Info_free failed"); - if (hints->filePerProc == TRUE) { - MPI_CHECK(MPI_Allreduce(&aggFileSizeFromStat, &tmpSum, 1, - MPI_LONG_LONG_INT, MPI_SUM, testComm), - "cannot total data moved"); - aggFileSizeFromStat = tmpSum; - } else { - MPI_CHECK(MPI_Allreduce(&aggFileSizeFromStat, &tmpMin, 1, - MPI_LONG_LONG_INT, MPI_MIN, testComm), - "cannot total data moved"); - MPI_CHECK(MPI_Allreduce(&aggFileSizeFromStat, &tmpMax, 1, - MPI_LONG_LONG_INT, MPI_MAX, testComm), - "cannot total data moved"); - if (tmpMin != tmpMax) { - if (rank == 0) { - WARN("inconsistent file size by different tasks"); - } - /* incorrect, but now consistent across tasks */ - aggFileSizeFromStat = tmpMin; - } - } - return (aggFileSizeFromStat); } diff --git a/src/aiori-NCMPI.c b/src/aiori-NCMPI.c index 5fc1375..b6ab84b 100755 --- a/src/aiori-NCMPI.c +++ b/src/aiori-NCMPI.c @@ -45,20 +45,57 @@ /**************************** P R O T O T Y P E S *****************************/ -static int GetFileMode(IOR_param_t *); +static int GetFileMode(int flags); -static void *NCMPI_Create(char *, IOR_param_t *); -static void *NCMPI_Open(char *, IOR_param_t *); -static IOR_offset_t NCMPI_Xfer(int, void *, IOR_size_t *, - IOR_offset_t, IOR_param_t *); -static void NCMPI_Close(void *, IOR_param_t *); -static void NCMPI_Delete(char *, IOR_param_t *); +static aiori_fd_t *NCMPI_Create(char *, int iorflags, aiori_mod_opt_t *); +static aiori_fd_t *NCMPI_Open(char *, int iorflags, aiori_mod_opt_t *); +static IOR_offset_t NCMPI_Xfer(int, aiori_fd_t *, IOR_size_t *, + IOR_offset_t, IOR_offset_t, aiori_mod_opt_t *); +static void NCMPI_Close(aiori_fd_t *, aiori_mod_opt_t *); +static void NCMPI_Delete(char *, aiori_mod_opt_t *); static char *NCMPI_GetVersion(); -static void NCMPI_Fsync(void *, IOR_param_t *); -static IOR_offset_t NCMPI_GetFileSize(IOR_param_t *, MPI_Comm, char *); -static int NCMPI_Access(const char *, int, IOR_param_t *); +static void NCMPI_Fsync(aiori_fd_t *, aiori_mod_opt_t *); +static IOR_offset_t NCMPI_GetFileSize(aiori_mod_opt_t *, char *); +static int NCMPI_Access(const char *, int, aiori_mod_opt_t *); /************************** D E C L A R A T I O N S ***************************/ +static aiori_xfer_hint_t * hints = NULL; + +static void NCMPI_xfer_hints(aiori_xfer_hint_t * params){ + hints = params; + + MPIIO_xfer_hints(params); +} + +typedef struct { + int showHints; /* show hints */ + char * hintsFileName; /* full name for hints file */ + + /* runtime variables */ + int var_id; /* variable id handle for data set */ + int firstReadCheck; + int startDataSet; +} ncmpi_options_t; + + +static option_help * NCMPI_options(aiori_mod_opt_t ** init_backend_options, aiori_mod_opt_t * init_values){ + ncmpi_options_t * o = malloc(sizeof(ncmpi_options_t)); + if (init_values != NULL){ + memcpy(o, init_values, sizeof(ncmpi_options_t)); + }else{ + memset(o, 0, sizeof(ncmpi_options_t)); + } + *init_backend_options = (aiori_mod_opt_t*) o; + + option_help h [] = { + {0, "mpiio.hintsFileName","Full name for hints file", OPTION_OPTIONAL_ARGUMENT, 's', & o->hintsFileName}, + {0, "mpiio.showHints", "Show MPI 
hints", OPTION_FLAG, 'd', & o->showHints}, + LAST_OPTION + }; + option_help * help = malloc(sizeof(h)); + memcpy(help, h, sizeof(h)); + return help; +} ior_aiori_t ncmpi_aiori = { .name = "NCMPI", @@ -76,6 +113,8 @@ ior_aiori_t ncmpi_aiori = { .rmdir = aiori_posix_rmdir, .access = NCMPI_Access, .stat = aiori_posix_stat, + .get_options = NCMPI_options, + .xfer_hints = NCMPI_xfer_hints, }; /***************************** F U N C T I O N S ******************************/ @@ -83,15 +122,16 @@ ior_aiori_t ncmpi_aiori = { /* * Create and open a file through the NCMPI interface. */ -static void *NCMPI_Create(char *testFileName, IOR_param_t * param) +static aiori_fd_t *NCMPI_Create(char *testFileName, int iorflags, aiori_mod_opt_t * param) { int *fd; int fd_mode; MPI_Info mpiHints = MPI_INFO_NULL; + ncmpi_options_t * o = (ncmpi_options_t*) param; /* read and set MPI file hints from hintsFile */ - SetHints(&mpiHints, param->hintsFileName); - if (rank == 0 && param->showHints) { + SetHints(&mpiHints, o->hintsFileName); + if (rank == 0 && o->showHints) { fprintf(stdout, "\nhints passed to MPI_File_open() {\n"); ShowHints(&mpiHints); fprintf(stdout, "}\n"); @@ -101,7 +141,7 @@ static void *NCMPI_Create(char *testFileName, IOR_param_t * param) if (fd == NULL) ERR("malloc() failed"); - fd_mode = GetFileMode(param); + fd_mode = GetFileMode(iorflags); NCMPI_CHECK(ncmpi_create(testComm, testFileName, fd_mode, mpiHints, fd), "cannot create file"); @@ -111,7 +151,7 @@ static void *NCMPI_Create(char *testFileName, IOR_param_t * param) #if defined(PNETCDF_VERSION_MAJOR) && (PNETCDF_VERSION_MAJOR > 1 || PNETCDF_VERSION_MINOR >= 2) /* ncmpi_get_file_info is first available in 1.2.0 */ - if (rank == 0 && param->showHints) { + if (rank == 0 && o->showHints) { MPI_Info info_used; MPI_CHECK(ncmpi_get_file_info(*fd, &info_used), "cannot inquire file info"); @@ -123,21 +163,22 @@ static void *NCMPI_Create(char *testFileName, IOR_param_t * param) } #endif - return (fd); + return (aiori_fd_t*)(fd); } /* * Open a file through the NCMPI interface. */ -static void *NCMPI_Open(char *testFileName, IOR_param_t * param) +static aiori_fd_t *NCMPI_Open(char *testFileName, int iorflags, aiori_mod_opt_t * param) { int *fd; int fd_mode; MPI_Info mpiHints = MPI_INFO_NULL; + ncmpi_options_t * o = (ncmpi_options_t*) param; /* read and set MPI file hints from hintsFile */ - SetHints(&mpiHints, param->hintsFileName); - if (rank == 0 && param->showHints) { + SetHints(&mpiHints, o->hintsFileName); + if (rank == 0 && o->showHints) { fprintf(stdout, "\nhints passed to MPI_File_open() {\n"); ShowHints(&mpiHints); fprintf(stdout, "}\n"); @@ -147,7 +188,7 @@ static void *NCMPI_Open(char *testFileName, IOR_param_t * param) if (fd == NULL) ERR("malloc() failed"); - fd_mode = GetFileMode(param); + fd_mode = GetFileMode(iorflags); NCMPI_CHECK(ncmpi_open(testComm, testFileName, fd_mode, mpiHints, fd), "cannot open file"); @@ -157,7 +198,7 @@ static void *NCMPI_Open(char *testFileName, IOR_param_t * param) #if defined(PNETCDF_VERSION_MAJOR) && (PNETCDF_VERSION_MAJOR > 1 || PNETCDF_VERSION_MINOR >= 2) /* ncmpi_get_file_info is first available in 1.2.0 */ - if (rank == 0 && param->showHints) { + if (rank == 0 && o->showHints) { MPI_Info info_used; MPI_CHECK(ncmpi_get_file_info(*fd, &info_used), "cannot inquire file info"); @@ -169,51 +210,43 @@ static void *NCMPI_Open(char *testFileName, IOR_param_t * param) } #endif - return (fd); + return (aiori_fd_t*)(fd); } /* * Write or read access to file using the NCMPI interface. 
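 *
 * The flat IOR offset is mapped onto a 3D NetCDF variable shaped
 * [segmentCount * numTasks][blockSize / transferSize][transferSize];
 * the indices used below are derived as
 *
 *   segmentNum  = offset / (numTasks * blockSize);
 *   transferNum = (offset % blockSize) / transferSize;
 *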
*/ -static IOR_offset_t NCMPI_Xfer(int access, void *fd, IOR_size_t * buffer, - IOR_offset_t length, IOR_param_t * param) +static IOR_offset_t NCMPI_Xfer(int access, aiori_fd_t *fd, IOR_size_t * buffer, IOR_offset_t transferSize, IOR_offset_t offset, aiori_mod_opt_t * param) { signed char *bufferPtr = (signed char *)buffer; - static int firstReadCheck = FALSE, startDataSet; + ncmpi_options_t * o = (ncmpi_options_t*) param; int var_id, dim_id[NUM_DIMS]; - MPI_Offset bufSize[NUM_DIMS], offset[NUM_DIMS]; + MPI_Offset bufSize[NUM_DIMS], offsets[NUM_DIMS]; IOR_offset_t segmentPosition; int segmentNum, transferNum; /* determine by offset if need to start data set */ - if (param->filePerProc == TRUE) { + if (hints->filePerProc == TRUE) { segmentPosition = (IOR_offset_t) 0; } else { - segmentPosition = - (IOR_offset_t) ((rank + rankOffset) % param->numTasks) - * param->blockSize; + segmentPosition = (IOR_offset_t) ((rank + rankOffset) % hints->numTasks) * hints->blockSize; } - if ((int)(param->offset - segmentPosition) == 0) { - startDataSet = TRUE; + if ((int)(offset - segmentPosition) == 0) { + o->startDataSet = TRUE; /* * this toggle is for the read check operation, which passes through * this function twice; note that this function will open a data set * only on the first read check and close only on the second */ if (access == READCHECK) { - if (firstReadCheck == TRUE) { - firstReadCheck = FALSE; - } else { - firstReadCheck = TRUE; - } + o->firstReadCheck = ! o->firstReadCheck; } } - if (startDataSet == TRUE && - (access != READCHECK || firstReadCheck == TRUE)) { + if (o->startDataSet == TRUE && + (access != READCHECK || o->firstReadCheck == TRUE)) { if (access == WRITE) { - int numTransfers = - param->blockSize / param->transferSize; + int numTransfers = hints->blockSize / hints->transferSize; /* reshape 1D array to 3D array: [segmentCount*numTasks][numTransfers][transferSize] @@ -229,7 +262,7 @@ static IOR_offset_t NCMPI_Xfer(int access, void *fd, IOR_size_t * buffer, "cannot define data set dimensions"); NCMPI_CHECK(ncmpi_def_dim (*(int *)fd, "transfer_size", - param->transferSize, &dim_id[2]), + hints->transferSize, &dim_id[2]), "cannot define data set dimensions"); NCMPI_CHECK(ncmpi_def_var (*(int *)fd, "data_var", NC_BYTE, NUM_DIMS, @@ -244,77 +277,72 @@ static IOR_offset_t NCMPI_Xfer(int access, void *fd, IOR_size_t * buffer, "cannot retrieve data set variable"); } - if (param->collective == FALSE) { + if (hints->collective == FALSE) { NCMPI_CHECK(ncmpi_begin_indep_data(*(int *)fd), "cannot enable independent data mode"); } - param->var_id = var_id; - startDataSet = FALSE; + o->var_id = var_id; + o->startDataSet = FALSE; } - var_id = param->var_id; + var_id = o->var_id; /* calculate the segment number */ - segmentNum = param->offset / (param->numTasks * param->blockSize); + segmentNum = offset / (hints->numTasks * hints->blockSize); /* calculate the transfer number in each block */ - transferNum = param->offset % param->blockSize / param->transferSize; + transferNum = offset % hints->blockSize / hints->transferSize; /* read/write the 3rd dim of the dataset, each is of amount param->transferSize */ bufSize[0] = 1; bufSize[1] = 1; - bufSize[2] = param->transferSize; + bufSize[2] = transferSize; - offset[0] = segmentNum * param->numTasks + rank; - offset[1] = transferNum; - offset[2] = 0; + offsets[0] = segmentNum * hints->numTasks + rank; + offsets[1] = transferNum; + offsets[2] = 0; /* access the file */ if (access == WRITE) { /* WRITE */ - if (param->collective) { + if 
(hints->collective) { NCMPI_CHECK(ncmpi_put_vara_schar_all - (*(int *)fd, var_id, offset, bufSize, - bufferPtr), + (*(int *)fd, var_id, offsets, bufSize, bufferPtr), "cannot write to data set"); } else { NCMPI_CHECK(ncmpi_put_vara_schar - (*(int *)fd, var_id, offset, bufSize, - bufferPtr), + (*(int *)fd, var_id, offsets, bufSize, bufferPtr), "cannot write to data set"); } } else { /* READ or CHECK */ - if (param->collective == TRUE) { + if (hints->collective == TRUE) { NCMPI_CHECK(ncmpi_get_vara_schar_all - (*(int *)fd, var_id, offset, bufSize, - bufferPtr), + (*(int *)fd, var_id, offsets, bufSize, bufferPtr), "cannot read from data set"); } else { NCMPI_CHECK(ncmpi_get_vara_schar - (*(int *)fd, var_id, offset, bufSize, - bufferPtr), + (*(int *)fd, var_id, offsets, bufSize, bufferPtr), "cannot read from data set"); } } - return (length); + return (transferSize); } /* * Perform fsync(). */ -static void NCMPI_Fsync(void *fd, IOR_param_t * param) +static void NCMPI_Fsync(aiori_fd_t *fd, aiori_mod_opt_t * param) { - ; } /* * Close a file through the NCMPI interface. */ -static void NCMPI_Close(void *fd, IOR_param_t * param) +static void NCMPI_Close(aiori_fd_t *fd, aiori_mod_opt_t * param) { - if (param->collective == FALSE) { + if (hints->collective == FALSE) { NCMPI_CHECK(ncmpi_end_indep_data(*(int *)fd), "cannot disable independent data mode"); } @@ -325,7 +353,7 @@ static void NCMPI_Close(void *fd, IOR_param_t * param) /* * Delete a file through the NCMPI interface. */ -static void NCMPI_Delete(char *testFileName, IOR_param_t * param) +static void NCMPI_Delete(char *testFileName, aiori_mod_opt_t * param) { return(MPIIO_Delete(testFileName, param)); } @@ -341,35 +369,35 @@ static char* NCMPI_GetVersion() /* * Return the correct file mode for NCMPI. */ -static int GetFileMode(IOR_param_t * param) +static int GetFileMode(int flags) { int fd_mode = 0; /* set IOR file flags to NCMPI flags */ /* -- file open flags -- */ - if (param->openFlags & IOR_RDONLY) { + if (flags & IOR_RDONLY) { fd_mode |= NC_NOWRITE; } - if (param->openFlags & IOR_WRONLY) { - fprintf(stdout, "File write only not implemented in NCMPI\n"); + if (flags & IOR_WRONLY) { + WARN("File write only not implemented in NCMPI"); } - if (param->openFlags & IOR_RDWR) { + if (flags & IOR_RDWR) { fd_mode |= NC_WRITE; } - if (param->openFlags & IOR_APPEND) { - fprintf(stdout, "File append not implemented in NCMPI\n"); + if (flags & IOR_APPEND) { + WARN("File append not implemented in NCMPI"); } - if (param->openFlags & IOR_CREAT) { + if (flags & IOR_CREAT) { fd_mode |= NC_CLOBBER; } - if (param->openFlags & IOR_EXCL) { - fprintf(stdout, "Exclusive access not implemented in NCMPI\n"); + if (flags & IOR_EXCL) { + WARN("Exclusive access not implemented in NCMPI"); } - if (param->openFlags & IOR_TRUNC) { - fprintf(stdout, "File truncation not implemented in NCMPI\n"); + if (flags & IOR_TRUNC) { + WARN("File truncation not implemented in NCMPI"); } - if (param->openFlags & IOR_DIRECT) { - fprintf(stdout, "O_DIRECT not implemented in NCMPI\n"); + if (flags & IOR_DIRECT) { + WARN("O_DIRECT not implemented in NCMPI"); } /* to enable > 4GB file size */ @@ -381,16 +409,16 @@ static int GetFileMode(IOR_param_t * param) /* * Use MPIIO call to get file size. 
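 *
 * A PnetCDF file is an ordinary file at the MPI-IO level, so delegating
 * to MPIIO_GetFileSize() is sufficient here.
 *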
*/ -static IOR_offset_t NCMPI_GetFileSize(IOR_param_t * test, MPI_Comm testComm, +static IOR_offset_t NCMPI_GetFileSize(aiori_mod_opt_t * opt, char *testFileName) { - return(MPIIO_GetFileSize(test, testComm, testFileName)); + return(MPIIO_GetFileSize(opt, testFileName)); } /* * Use MPIIO call to check for access. */ -static int NCMPI_Access(const char *path, int mode, IOR_param_t *param) +static int NCMPI_Access(const char *path, int mode, aiori_mod_opt_t *param) { return(MPIIO_Access(path, mode, param)); } diff --git a/src/aiori-PMDK.c b/src/aiori-PMDK.c index 4a3953b..79b41b4 100644 --- a/src/aiori-PMDK.c +++ b/src/aiori-PMDK.c @@ -28,14 +28,19 @@ static option_help options [] = { /**************************** P R O T O T Y P E S *****************************/ static option_help * PMDK_options(); -static void *PMDK_Create(char *, IOR_param_t *); -static void *PMDK_Open(char *, IOR_param_t *); -static IOR_offset_t PMDK_Xfer(int, void *, IOR_size_t *, IOR_offset_t, IOR_param_t *); -static void PMDK_Fsync(void *, IOR_param_t *); -static void PMDK_Close(void *, IOR_param_t *); -static void PMDK_Delete(char *, IOR_param_t *); -static IOR_offset_t PMDK_GetFileSize(IOR_param_t *, MPI_Comm, char *); +static aiori_fd_t *PMDK_Create(char *,int iorflags, aiori_mod_opt_t *); +static aiori_fd_t *PMDK_Open(char *, int iorflags, aiori_mod_opt_t *); +static IOR_offset_t PMDK_Xfer(int, aiori_fd_t *, IOR_size_t *, IOR_offset_t, IOR_offset_t, aiori_mod_opt_t *); +static void PMDK_Fsync(aiori_fd_t *, aiori_mod_opt_t *); +static void PMDK_Close(aiori_fd_t *, aiori_mod_opt_t *); +static void PMDK_Delete(char *, aiori_mod_opt_t *); +static IOR_offset_t PMDK_GetFileSize(aiori_mod_opt_t *, char *); +static aiori_xfer_hint_t * hints = NULL; + +static void PMDK_xfer_hints(aiori_xfer_hint_t * params){ + hints = params; +} /************************** D E C L A R A T I O N S ***************************/ @@ -55,6 +60,7 @@ ior_aiori_t pmdk_aiori = { .delete = PMDK_Delete, .get_version = aiori_get_version, .fsync = PMDK_Fsync, + .xfer_hints = PMDK_xfer_hints, .get_file_size = PMDK_GetFileSize, .statfs = aiori_posix_statfs, .mkdir = aiori_posix_mkdir, @@ -78,18 +84,18 @@ static option_help * PMDK_options(){ /* * Create and open a memory space through the PMDK interface. */ -static void *PMDK_Create(char * testFileName, IOR_param_t * param){ +static aiori_fd_t *PMDK_Create(char * testFileName, int iorflags, aiori_mod_opt_t * param){ char *pmemaddr = NULL; int is_pmem; size_t mapped_len; size_t open_length; - if(!param->filePerProc){ + if(! hints->filePerProc){ fprintf(stdout, "\nPMDK functionality can only be used with filePerProc functionality\n"); MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, -1), "MPI_Abort() error"); } - open_length = param->blockSize * param->segmentCount; + open_length = hints->blockSize * hints->segmentCount; if((pmemaddr = pmem_map_file(testFileName, open_length, PMEM_FILE_CREATE|PMEM_FILE_EXCL, @@ -98,7 +104,7 @@ static void *PMDK_Create(char * testFileName, IOR_param_t * param){ perror("pmem_map_file"); MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, -1), "MPI_Abort() error"); } - + if(!is_pmem){ fprintf(stdout, "\n is_pmem is %d\n",is_pmem); fprintf(stdout, "\npmem_map_file thinks the hardware being used is not pmem\n"); @@ -106,7 +112,7 @@ static void *PMDK_Create(char * testFileName, IOR_param_t * param){ } - + return((void *)pmemaddr); } /* PMDK_Create() */ @@ -115,20 +121,19 @@ static void *PMDK_Create(char * testFileName, IOR_param_t * param){ /* * Open a memory space through the PMDK interface. 
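 *
 * Note that pmem_map_file() called with len == 0 and without
 * PMEM_FILE_CREATE maps the entire existing file and reports its size
 * through mapped_len, so the precomputed open_length is only used for
 * error reporting in this path.
 *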
*/ - -static void *PMDK_Open(char * testFileName, IOR_param_t * param){ +static aiori_fd_t *PMDK_Open(char * testFileName,int iorflags, aiori_mod_opt_t * param){ char *pmemaddr = NULL; int is_pmem; size_t mapped_len; size_t open_length; - if(!param->filePerProc){ + if(!hints->filePerProc){ fprintf(stdout, "\nPMDK functionality can only be used with filePerProc functionality\n"); MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, -1), "MPI_Abort() error"); } - open_length = param->blockSize * param->segmentCount; + open_length = hints->blockSize * hints->segmentCount; if((pmemaddr = pmem_map_file(testFileName, 0, PMEM_FILE_EXCL, @@ -138,12 +143,12 @@ static void *PMDK_Open(char * testFileName, IOR_param_t * param){ fprintf(stdout, "\n %ld %ld\n",open_length, mapped_len); MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, -1), "MPI_Abort() error"); } - + if(!is_pmem){ fprintf(stdout, "pmem_map_file thinks the hardware being used is not pmem\n"); MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, -1), "MPI_Abort() error"); } - + return((void *)pmemaddr); } /* PMDK_Open() */ @@ -153,8 +158,8 @@ static void *PMDK_Open(char * testFileName, IOR_param_t * param){ * Write or read access to a memory space created with PMDK. Include drain/flush functionality. */ -static IOR_offset_t PMDK_Xfer(int access, void *file, IOR_size_t * buffer, - IOR_offset_t length, IOR_param_t * param){ +static IOR_offset_t PMDK_Xfer(int access, aiori_fd_t *file, IOR_size_t * buffer, + IOR_offset_t length, IOR_offset_t offset, aiori_mod_opt_t * param){ int xferRetries = 0; long long remaining = (long long)length; char * ptr = (char *)buffer; @@ -162,11 +167,11 @@ static IOR_offset_t PMDK_Xfer(int access, void *file, IOR_size_t * buffer, long long i; long long offset_size; - offset_size = param->offset; + offset_size = offset; if(access == WRITE){ - if(param->fsync){ - pmem_memcpy_nodrain(&file[offset_size], ptr, length); + if(hints->fsyncPerWrite){ + pmem_memcpy_nodrain(&file[offset_size], ptr, length); }else{ pmem_memcpy_persist(&file[offset_size], ptr, length); } @@ -183,7 +188,7 @@ static IOR_offset_t PMDK_Xfer(int access, void *file, IOR_size_t * buffer, * Perform fsync(). */ -static void PMDK_Fsync(void *fd, IOR_param_t * param) +static void PMDK_Fsync(aiori_fd_t *fd, aiori_mod_opt_t * param) { pmem_drain(); } /* PMDK_Fsync() */ @@ -194,11 +199,10 @@ static void PMDK_Fsync(void *fd, IOR_param_t * param) * Stub for close functionality that is not required for PMDK */ -static void PMDK_Close(void *fd, IOR_param_t * param){ +static void PMDK_Close(aiori_fd_t *fd, aiori_mod_opt_t * param){ size_t open_length; - open_length = param->transferSize; + open_length = hints->transferSize; pmem_unmap(fd, open_length); - } /* PMDK_Close() */ @@ -207,38 +211,25 @@ static void PMDK_Close(void *fd, IOR_param_t * param){ * Delete the file backing a memory space through PMDK */ -static void PMDK_Delete(char *testFileName, IOR_param_t * param) +static void PMDK_Delete(char *testFileName, aiori_mod_opt_t * param) { char errmsg[256]; sprintf(errmsg,"[RANK %03d]:cannot delete file %s\n",rank,testFileName); if (unlink(testFileName) != 0) WARN(errmsg); } /* PMDK_Delete() */ - -/******************************************************************************/ -/* - * Determine api version. - */ - -static void PMDK_SetVersion(IOR_param_t *test) -{ - strcpy(test->apiVersion, test->api); -} /* PMDK_SetVersion() */ - - /******************************************************************************/ /* * Use POSIX stat() to return aggregate file size. 
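 *
 * PMDK is restricted to filePerProc mode, so each rank stat()s only its
 * own backing file; no cross-rank aggregation is performed here.
 *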
*/ -static IOR_offset_t PMDK_GetFileSize(IOR_param_t * test, - MPI_Comm testComm, +static IOR_offset_t PMDK_GetFileSize(aiori_mod_opt_t * test, char * testFileName) { struct stat stat_buf; IOR_offset_t aggFileSizeFromStat, tmpMin, tmpMax, tmpSum; - if (test->filePerProc == FALSE) { + if (hints->filePerProc == FALSE) { fprintf(stdout, "\nPMDK functionality can only be used with filePerProc functionality\n"); MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, -1), "MPI_Abort() error"); } @@ -248,10 +239,5 @@ static IOR_offset_t PMDK_GetFileSize(IOR_param_t * test, } aggFileSizeFromStat = stat_buf.st_size; - MPI_CHECK(MPI_Allreduce(&aggFileSizeFromStat, &tmpSum, 1, - MPI_LONG_LONG_INT, MPI_SUM, testComm), - "cannot total data moved"); - aggFileSizeFromStat = tmpSum; - return(aggFileSizeFromStat); } /* PMDK_GetFileSize() */ diff --git a/src/aiori-POSIX.c b/src/aiori-POSIX.c index 283a6ec..72f7f53 100755 --- a/src/aiori-POSIX.c +++ b/src/aiori-POSIX.c @@ -34,7 +34,7 @@ #ifdef HAVE_LINUX_LUSTRE_LUSTRE_USER_H # include -#elif defined(HAVE_LUSTRE_LUSTRE_USER_H) +#elif defined(HAVE_LUSTRE_USER) # include #endif #ifdef HAVE_GPFS_H @@ -55,6 +55,22 @@ #include "iordef.h" #include "utilities.h" +#include "aiori-POSIX.h" + +#ifdef HAVE_GPU_DIRECT +typedef long long loff_t; +#include +#include +#endif + +typedef struct { + int fd; +#ifdef HAVE_GPU_DIRECT + CUfileHandle_t cf_handle; +#endif +} posix_fd; + + #ifndef open64 /* necessary for TRU64 -- */ # define open64 open /* unlikely, but may pose */ #endif /* not open64 */ /* conflicting prototypes */ @@ -67,35 +83,32 @@ # define O_BINARY 0 #endif +#ifdef HAVE_GPU_DIRECT +static const char* cuFileGetErrorString(CUfileError_t status){ + if(IS_CUDA_ERR(status)){ + return cudaGetErrorString(status.err); + } + return strerror(status.err); +} + +static void init_cufile(posix_fd * pfd){ + CUfileDescr_t cf_descr = (CUfileDescr_t){ + .handle.fd = pfd->fd, + .type = CU_FILE_HANDLE_TYPE_OPAQUE_FD + }; + CUfileError_t status = cuFileHandleRegister(& pfd->cf_handle, & cf_descr); + if(status.err != CU_FILE_SUCCESS){ + EWARNF("Could not register handle %s", cuFileGetErrorString(status)); + } +} +#endif + /**************************** P R O T O T Y P E S *****************************/ +static void POSIX_Initialize(aiori_mod_opt_t * options); +static void POSIX_Finalize(aiori_mod_opt_t * options); + static IOR_offset_t POSIX_Xfer(int, aiori_fd_t *, IOR_size_t *, IOR_offset_t, IOR_offset_t, aiori_mod_opt_t *); -static void POSIX_Fsync(aiori_fd_t *, aiori_mod_opt_t *); -static void POSIX_Sync(aiori_mod_opt_t * ); -static int POSIX_check_params(aiori_mod_opt_t * options); - -/************************** O P T I O N S *****************************/ -typedef struct{ - /* in case of a change, please update depending MMAP module too */ - int direct_io; - - /* Lustre variables */ - int lustre_set_striping; /* flag that we need to set lustre striping */ - int lustre_stripe_count; - int lustre_stripe_size; - int lustre_start_ost; - int lustre_ignore_locks; - - /* gpfs variables */ - int gpfs_hint_access; /* use gpfs "access range" hint */ - int gpfs_release_token; /* immediately release GPFS tokens after - creating or opening a file */ - /* beegfs variables */ - int beegfs_numTargets; /* number storage targets to use */ - int beegfs_chunkSize; /* srtipe pattern for new files */ - -} posix_options_t; - option_help * POSIX_options(aiori_mod_opt_t ** init_backend_options, aiori_mod_opt_t * init_values){ posix_options_t * o = malloc(sizeof(posix_options_t)); @@ -105,6 +118,7 @@ option_help * 
POSIX_options(aiori_mod_opt_t ** init_backend_options, aiori_mod_o }else{ memset(o, 0, sizeof(posix_options_t)); o->direct_io = 0; + o->lustre_stripe_count = -1; o->lustre_start_ost = -1; o->beegfs_numTargets = -1; o->beegfs_chunkSize = -1; @@ -123,11 +137,14 @@ option_help * POSIX_options(aiori_mod_opt_t ** init_backend_options, aiori_mod_o {0, "posix.gpfs.releasetoken", "", OPTION_OPTIONAL_ARGUMENT, 'd', & o->gpfs_release_token}, #endif -#ifdef HAVE_LUSTRE_LUSTRE_USER_H +#ifdef HAVE_LUSTRE_USER {0, "posix.lustre.stripecount", "", OPTION_OPTIONAL_ARGUMENT, 'd', & o->lustre_stripe_count}, {0, "posix.lustre.stripesize", "", OPTION_OPTIONAL_ARGUMENT, 'd', & o->lustre_stripe_size}, {0, "posix.lustre.startost", "", OPTION_OPTIONAL_ARGUMENT, 'd', & o->lustre_start_ost}, {0, "posix.lustre.ignorelocks", "", OPTION_FLAG, 'd', & o->lustre_ignore_locks}, +#endif +#ifdef HAVE_GPU_DIRECT + {0, "gpuDirect", "allocate I/O buffers on the GPU", OPTION_FLAG, 'd', & o->gpuDirect}, #endif LAST_OPTION }; @@ -143,19 +160,22 @@ option_help * POSIX_options(aiori_mod_opt_t ** init_backend_options, aiori_mod_o ior_aiori_t posix_aiori = { .name = "POSIX", .name_legacy = NULL, + .initialize = POSIX_Initialize, + .finalize = POSIX_Finalize, .create = POSIX_Create, .mknod = POSIX_Mknod, .open = POSIX_Open, .xfer = POSIX_Xfer, .close = POSIX_Close, .delete = POSIX_Delete, - .xfer_hints = aiori_posix_xfer_hints, + .xfer_hints = POSIX_xfer_hints, .get_version = aiori_get_version, .fsync = POSIX_Fsync, .get_file_size = POSIX_GetFileSize, .statfs = aiori_posix_statfs, .mkdir = aiori_posix_mkdir, .rmdir = aiori_posix_rmdir, + .rename = POSIX_Rename, .access = aiori_posix_access, .stat = aiori_posix_stat, .get_options = POSIX_options, @@ -168,16 +188,24 @@ ior_aiori_t posix_aiori = { static aiori_xfer_hint_t * hints = NULL; -void aiori_posix_xfer_hints(aiori_xfer_hint_t * params){ +void POSIX_xfer_hints(aiori_xfer_hint_t * params){ hints = params; } -static int POSIX_check_params(aiori_mod_opt_t * param){ +int POSIX_check_params(aiori_mod_opt_t * param){ posix_options_t * o = (posix_options_t*) param; if (o->beegfs_chunkSize != -1 && (!ISPOWEROFTWO(o->beegfs_chunkSize) || o->beegfs_chunkSize < (1<<16))) ERR("beegfsChunkSize must be a power of two and >64k"); if(o->lustre_stripe_count != -1 || o->lustre_stripe_size != 0) o->lustre_set_striping = 1; + if(o->gpuDirect && ! o->direct_io){ + ERR("GPUDirect required direct I/O to be used!"); + } +#ifndef HAVE_GPU_DIRECT + if(o->gpuDirect){ + ERR("GPUDirect support is not compiled"); + } +#endif return 0; } @@ -203,7 +231,7 @@ void gpfs_free_all_locks(int fd) EWARNF("gpfs_fcntl(%d, ...) 
release all locks hint failed.", fd); } } -void gpfs_access_start(int fd, IOR_offset_t length, int access) +void gpfs_access_start(int fd, IOR_offset_t length, IOR_offset_t offset, int access) { int rc; struct { @@ -217,7 +245,7 @@ void gpfs_access_start(int fd, IOR_offset_t length, int access) take_locks.access.structLen = sizeof(take_locks.access); take_locks.access.structType = GPFS_ACCESS_RANGE; - take_locks.access.start = hints->offset; + take_locks.access.start = offset; take_locks.access.length = length; take_locks.access.isWrite = (access == WRITE); @@ -227,7 +255,7 @@ void gpfs_access_start(int fd, IOR_offset_t length, int access) } } -void gpfs_access_end(int fd, IOR_offset_t length, int access) +void gpfs_access_end(int fd, IOR_offset_t length, IOR_offset_t offset, int access) { int rc; struct { @@ -242,7 +270,7 @@ void gpfs_access_end(int fd, IOR_offset_t length, int access) free_locks.free.structLen = sizeof(free_locks.free); free_locks.free.structType = GPFS_FREE_RANGE; - free_locks.free.start = hints->offset; + free_locks.free.start = offset; free_locks.free.length = length; rc = gpfs_fcntl(fd, &free_locks); @@ -368,42 +396,39 @@ bool beegfs_createFilePath(char* filepath, mode_t mode, int numTargets, int chun /* - * Creat and open a file through the POSIX interface. + * Create and open a file through the POSIX interface. */ aiori_fd_t *POSIX_Create(char *testFileName, int flags, aiori_mod_opt_t * param) { int fd_oflag = O_BINARY; int mode = 0664; - int *fd; - - fd = (int *)malloc(sizeof(int)); - if (fd == NULL) - ERR("Unable to malloc file descriptor"); + posix_fd * pfd = safeMalloc(sizeof(posix_fd)); posix_options_t * o = (posix_options_t*) param; if (o->direct_io == TRUE){ - set_o_direct_flag(&fd_oflag); + set_o_direct_flag(& fd_oflag); } if(hints->dryRun) return (aiori_fd_t*) 0; -#ifdef HAVE_LUSTRE_LUSTRE_USER_H +#ifdef HAVE_LUSTRE_USER /* Add a #define for FASYNC if not available, as it forms part of * the Lustre O_LOV_DELAY_CREATE definition. */ #ifndef FASYNC #define FASYNC 00020000 /* fcntl, for BSD compatibility */ #endif if (o->lustre_set_striping) { - /* In the single-shared-file case, task 0 has to creat the - file with the Lustre striping options before any other processes - open the file */ + /* In the single-shared-file case, task 0 has to create the + file with the Lustre striping options before any other + processes open the file */ if (!hints->filePerProc && rank != 0) { MPI_CHECK(MPI_Barrier(testComm), "barrier error"); fd_oflag |= O_RDWR; - *fd = open64(testFileName, fd_oflag, mode); - if (*fd < 0) - ERRF("open64(\"%s\", %d, %#o) failed", - testFileName, fd_oflag, mode); + pfd->fd = open64(testFileName, fd_oflag, mode); + if (pfd->fd < 0){ + ERRF("open64(\"%s\", %d, %#o) failed. 
Error: %s", + testFileName, fd_oflag, mode, strerror(errno)); + } } else { struct lov_user_md opts = { 0 }; @@ -416,30 +441,24 @@ aiori_fd_t *POSIX_Create(char *testFileName, int flags, aiori_mod_opt_t * param) /* File needs to be opened O_EXCL because we cannot set * Lustre striping information on a pre-existing file.*/ - fd_oflag |= - O_CREAT | O_EXCL | O_RDWR | O_LOV_DELAY_CREATE; - *fd = open64(testFileName, fd_oflag, mode); - if (*fd < 0) { - fprintf(stdout, "\nUnable to open '%s': %s\n", + fd_oflag |= O_CREAT | O_EXCL | O_RDWR | O_LOV_DELAY_CREATE; + pfd->fd = open64(testFileName, fd_oflag, mode); + if (pfd->fd < 0) { + ERRF("Unable to open '%s': %s\n", testFileName, strerror(errno)); - MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, -1), - "MPI_Abort() error"); - } else if (ioctl(*fd, LL_IOC_LOV_SETSTRIPE, &opts)) { + } else if (ioctl(pfd->fd, LL_IOC_LOV_SETSTRIPE, &opts)) { char *errmsg = "stripe already set"; if (errno != EEXIST && errno != EALREADY) errmsg = strerror(errno); - fprintf(stdout, - "\nError on ioctl for '%s' (%d): %s\n", - testFileName, *fd, errmsg); - MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, -1), - "MPI_Abort() error"); + ERRF("Error on ioctl for '%s' (%d): %s\n", + testFileName, pfd->fd, errmsg); } if (!hints->filePerProc) MPI_CHECK(MPI_Barrier(testComm), "barrier error"); } } else { -#endif /* HAVE_LUSTRE_LUSTRE_USER_H */ +#endif /* HAVE_LUSTRE_USER */ fd_oflag |= O_CREAT | O_RDWR; @@ -458,34 +477,40 @@ aiori_fd_t *POSIX_Create(char *testFileName, int flags, aiori_mod_opt_t * param) } #endif /* HAVE_BEEGFS_BEEGFS_H */ - *fd = open64(testFileName, fd_oflag, mode); - if (*fd < 0) - ERRF("open64(\"%s\", %d, %#o) failed", - testFileName, fd_oflag, mode); + pfd->fd = open64(testFileName, fd_oflag, mode); + if (pfd->fd < 0){ + ERRF("open64(\"%s\", %d, %#o) failed. Error: %s", + testFileName, fd_oflag, mode, strerror(errno)); + } -#ifdef HAVE_LUSTRE_LUSTRE_USER_H +#ifdef HAVE_LUSTRE_USER } if (o->lustre_ignore_locks) { int lustre_ioctl_flags = LL_FILE_IGNORE_LOCK; - if (ioctl(*fd, LL_IOC_SETFLAGS, &lustre_ioctl_flags) == -1) - ERRF("ioctl(%d, LL_IOC_SETFLAGS, ...) failed", *fd); + if (ioctl(pfd->fd, LL_IOC_SETFLAGS, &lustre_ioctl_flags) == -1) + ERRF("ioctl(%d, LL_IOC_SETFLAGS, ...) failed", pfd->fd); } -#endif /* HAVE_LUSTRE_LUSTRE_USER_H */ +#endif /* HAVE_LUSTRE_USER */ #ifdef HAVE_GPFS_FCNTL_H /* in the single shared file case, immediately release all locks, with * the intent that we can avoid some byte range lock revocation: * everyone will be writing/reading from individual regions */ if (o->gpfs_release_token ) { - gpfs_free_all_locks(*fd); + gpfs_free_all_locks(pfd->fd); } #endif - return (aiori_fd_t*) fd; +#ifdef HAVE_GPU_DIRECT + if(o->gpuDirect){ + init_cufile(pfd); + } +#endif + return (aiori_fd_t*) pfd; } /* - * Creat a file through mknod interface. + * Create a file through mknod interface. 
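 *
 * Conceptually this amounts to the raw call (mode bits assumed; the IME
 * backend uses the same S_IFREG | S_IRUSR pattern):
 *
 *   mknod(testFileName, S_IFREG | S_IRUSR, 0);
 *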
*/ int POSIX_Mknod(char *testFileName) { @@ -504,43 +529,48 @@ int POSIX_Mknod(char *testFileName) aiori_fd_t *POSIX_Open(char *testFileName, int flags, aiori_mod_opt_t * param) { int fd_oflag = O_BINARY; - int *fd; - - fd = (int *)malloc(sizeof(int)); - if (fd == NULL) - ERR("Unable to malloc file descriptor"); - + if(flags & IOR_RDONLY){ + fd_oflag |= O_RDONLY; + }else if(flags & IOR_WRONLY){ + fd_oflag |= O_WRONLY; + }else{ + fd_oflag |= O_RDWR; + } + posix_fd * pfd = safeMalloc(sizeof(posix_fd)); posix_options_t * o = (posix_options_t*) param; - if (o->direct_io == TRUE) + if (o->direct_io == TRUE){ set_o_direct_flag(&fd_oflag); - - fd_oflag |= O_RDWR; + } if(hints->dryRun) return (aiori_fd_t*) 0; - *fd = open64(testFileName, fd_oflag); - if (*fd < 0) - ERRF("open64(\"%s\", %d) failed", testFileName, fd_oflag); + pfd->fd = open64(testFileName, fd_oflag); + if (pfd->fd < 0) + ERRF("open64(\"%s\", %d) failed: %s", testFileName, fd_oflag, strerror(errno)); -#ifdef HAVE_LUSTRE_LUSTRE_USER_H +#ifdef HAVE_LUSTRE_USER if (o->lustre_ignore_locks) { int lustre_ioctl_flags = LL_FILE_IGNORE_LOCK; if (verbose >= VERBOSE_1) { - fprintf(stdout, - "** Disabling lustre range locking **\n"); + EINFO("** Disabling lustre range locking **\n"); } - if (ioctl(*fd, LL_IOC_SETFLAGS, &lustre_ioctl_flags) == -1) - ERRF("ioctl(%d, LL_IOC_SETFLAGS, ...) failed", *fd); + if (ioctl(pfd->fd, LL_IOC_SETFLAGS, &lustre_ioctl_flags) == -1) + ERRF("ioctl(%d, LL_IOC_SETFLAGS, ...) failed", pfd->fd); } -#endif /* HAVE_LUSTRE_LUSTRE_USER_H */ +#endif /* HAVE_LUSTRE_USER */ #ifdef HAVE_GPFS_FCNTL_H if(o->gpfs_release_token) { - gpfs_free_all_locks(*fd); + gpfs_free_all_locks(pfd->fd); } #endif - return (aiori_fd_t*) fd; +#ifdef HAVE_GPU_DIRECT + if(o->gpuDirect){ + init_cufile(pfd); + } +#endif + return (aiori_fd_t*) pfd; } /* @@ -559,11 +589,12 @@ static IOR_offset_t POSIX_Xfer(int access, aiori_fd_t *file, IOR_size_t * buffer if(hints->dryRun) return length; - fd = *(int *)file; + posix_fd * pfd = (posix_fd *) file; + fd = pfd->fd; #ifdef HAVE_GPFS_FCNTL_H if (o->gpfs_hint_access) { - gpfs_access_start(fd, length, access); + gpfs_access_start(fd, length, offset, access); } #endif @@ -571,17 +602,24 @@ static IOR_offset_t POSIX_Xfer(int access, aiori_fd_t *file, IOR_size_t * buffer /* seek to offset */ if (lseek64(fd, offset, SEEK_SET) == -1) ERRF("lseek64(%d, %lld, SEEK_SET) failed", fd, offset); - + off_t mem_offset = 0; while (remaining > 0) { /* write/read file */ if (access == WRITE) { /* WRITE */ if (verbose >= VERBOSE_4) { - fprintf(stdout, - "task %d writing to offset %lld\n", + EINFO("task %d writing to offset %lld\n", rank, offset + length - remaining); } - rc = write(fd, ptr, remaining); +#ifdef HAVE_GPU_DIRECT + if(o->gpuDirect){ + rc = cuFileWrite(pfd->cf_handle, ptr, remaining, offset + mem_offset, mem_offset); + }else{ +#endif + rc = write(fd, ptr, remaining); +#ifdef HAVE_GPU_DIRECT + } +#endif if (rc == -1) ERRF("write(%d, %p, %lld) failed", fd, (void*)ptr, remaining); @@ -590,12 +628,19 @@ static IOR_offset_t POSIX_Xfer(int access, aiori_fd_t *file, IOR_size_t * buffer } } else { /* READ or CHECK */ if (verbose >= VERBOSE_4) { - fprintf(stdout, - "task %d reading from offset %lld\n", + EINFO("task %d reading from offset %lld\n", rank, offset + length - remaining); } - rc = read(fd, ptr, remaining); +#ifdef HAVE_GPU_DIRECT + if(o->gpuDirect){ + rc = cuFileRead(pfd->cf_handle, ptr, remaining, offset + mem_offset, mem_offset); + }else{ +#endif + rc = read(fd, ptr, remaining); +#ifdef HAVE_GPU_DIRECT + } 
+#endif if (rc == 0) ERRF("read(%d, %p, %lld) returned EOF prematurely", fd, (void*)ptr, remaining); @@ -604,43 +649,38 @@ static IOR_offset_t POSIX_Xfer(int access, aiori_fd_t *file, IOR_size_t * buffer fd, (void*)ptr, remaining); } if (rc < remaining) { - fprintf(stdout, - "WARNING: Task %d, partial %s, %lld of %lld bytes at offset %lld\n", + EWARNF("task %d, partial %s, %lld of %lld bytes at offset %lld\n", rank, access == WRITE ? "write()" : "read()", rc, remaining, offset + length - remaining); - if (hints->singleXferAttempt == TRUE) - MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, -1), - "barrier error"); - if (xferRetries > MAX_RETRY) + if (xferRetries > MAX_RETRY || hints->singleXferAttempt) ERR("too many retries -- aborting"); } assert(rc >= 0); assert(rc <= remaining); remaining -= rc; ptr += rc; + mem_offset += rc; xferRetries++; } #ifdef HAVE_GPFS_FCNTL_H if (o->gpfs_hint_access) { - gpfs_access_end(fd, length, param, access); + gpfs_access_end(fd, length, offset, access); } #endif return (length); } -/* - * Perform fsync(). - */ -static void POSIX_Fsync(aiori_fd_t *fd, aiori_mod_opt_t * param) +void POSIX_Fsync(aiori_fd_t *afd, aiori_mod_opt_t * param) { - if (fsync(*(int *)fd) != 0) - EWARNF("fsync(%d) failed", *(int *)fd); + int fd = ((posix_fd*) afd)->fd; + if (fsync(fd) != 0) + EWARNF("fsync(%d) failed", fd); } -static void POSIX_Sync(aiori_mod_opt_t * param) +void POSIX_Sync(aiori_mod_opt_t * param) { int ret = system("sync"); if (ret != 0){ @@ -652,13 +692,21 @@ static void POSIX_Sync(aiori_mod_opt_t * param) /* * Close a file through the POSIX interface. */ -void POSIX_Close(aiori_fd_t *fd, aiori_mod_opt_t * param) +void POSIX_Close(aiori_fd_t *afd, aiori_mod_opt_t * param) { if(hints->dryRun) return; - if (close(*(int *)fd) != 0) - ERRF("close(%d) failed", *(int *)fd); - free(fd); + posix_options_t * o = (posix_options_t*) param; + int fd = ((posix_fd*) afd)->fd; +#ifdef HAVE_GPU_DIRECT + if(o->gpuDirect){ + cuFileHandleDeregister(((posix_fd*) afd)->cf_handle); + } +#endif + if (close(fd) != 0){ + ERRF("close(%d) failed", fd); + } + free(afd); } /* @@ -669,16 +717,25 @@ void POSIX_Delete(char *testFileName, aiori_mod_opt_t * param) if(hints->dryRun) return; if (unlink(testFileName) != 0){ - EWARNF("[RANK %03d]: unlink() of file \"%s\" failed\n", - rank, testFileName); + EWARNF("[RANK %03d]: unlink() of file \"%s\" failed", rank, testFileName); } } +int POSIX_Rename(const char * oldfile, const char * newfile, aiori_mod_opt_t * module_options){ + if(hints->dryRun) + return 0; + + if(rename(oldfile, newfile) != 0){ + EWARNF("[RANK %03d]: rename() of file \"%s\" to \"%s\" failed", rank, oldfile, newfile); + return -1; + } + return 0; +} + /* * Use POSIX stat() to return aggregate file size. 
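 *
 * Each rank stat()s only the file it accesses; the MPI_Allreduce-based
 * aggregation (sum for filePerProc, min/max consistency check otherwise)
 * is no longer performed inside the backend.
 *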
*/ -IOR_offset_t POSIX_GetFileSize(aiori_mod_opt_t * test, MPI_Comm testComm, - char *testFileName) +IOR_offset_t POSIX_GetFileSize(aiori_mod_opt_t * test, char *testFileName) { if(hints->dryRun) return 0; @@ -690,26 +747,17 @@ IOR_offset_t POSIX_GetFileSize(aiori_mod_opt_t * test, MPI_Comm testComm, } aggFileSizeFromStat = stat_buf.st_size; - if (hints->filePerProc == TRUE) { - MPI_CHECK(MPI_Allreduce(&aggFileSizeFromStat, &tmpSum, 1, - MPI_LONG_LONG_INT, MPI_SUM, testComm), - "cannot total data moved"); - aggFileSizeFromStat = tmpSum; - } else { - MPI_CHECK(MPI_Allreduce(&aggFileSizeFromStat, &tmpMin, 1, - MPI_LONG_LONG_INT, MPI_MIN, testComm), - "cannot total data moved"); - MPI_CHECK(MPI_Allreduce(&aggFileSizeFromStat, &tmpMax, 1, - MPI_LONG_LONG_INT, MPI_MAX, testComm), - "cannot total data moved"); - if (tmpMin != tmpMax) { - if (rank == 0) { - WARN("inconsistent file size by different tasks"); - } - /* incorrect, but now consistent across tasks */ - aggFileSizeFromStat = tmpMin; - } - } - return (aggFileSizeFromStat); } + +void POSIX_Initialize(aiori_mod_opt_t * options){ +#ifdef HAVE_GPU_DIRECT + CUfileError_t err = cuFileDriverOpen(); +#endif +} + +void POSIX_Finalize(aiori_mod_opt_t * options){ +#ifdef HAVE_GPU_DIRECT + CUfileError_t err = cuFileDriverClose(); +#endif +} diff --git a/src/aiori-POSIX.h b/src/aiori-POSIX.h new file mode 100644 index 0000000..b2f556a --- /dev/null +++ b/src/aiori-POSIX.h @@ -0,0 +1,43 @@ +#ifndef AIORI_POSIX_H +#define AIORI_POSIX_H + +#include "aiori.h" + +/************************** O P T I O N S *****************************/ +typedef struct{ + /* in case of a change, please update depending MMAP module too */ + int direct_io; + + /* Lustre variables */ + int lustre_set_striping; /* flag that we need to set lustre striping */ + int lustre_stripe_count; + int lustre_stripe_size; + int lustre_start_ost; + int lustre_ignore_locks; + + /* gpfs variables */ + int gpfs_hint_access; /* use gpfs "access range" hint */ + int gpfs_release_token; /* immediately release GPFS tokens after + creating or opening a file */ + /* beegfs variables */ + int beegfs_numTargets; /* number storage targets to use */ + int beegfs_chunkSize; /* srtipe pattern for new files */ + int gpuDirect; +} posix_options_t; + +void POSIX_Sync(aiori_mod_opt_t * param); +int POSIX_check_params(aiori_mod_opt_t * param); +void POSIX_Fsync(aiori_fd_t *, aiori_mod_opt_t *); +int POSIX_check_params(aiori_mod_opt_t * options); +aiori_fd_t *POSIX_Create(char *testFileName, int flags, aiori_mod_opt_t * module_options); +int POSIX_Mknod(char *testFileName); +aiori_fd_t *POSIX_Open(char *testFileName, int flags, aiori_mod_opt_t * module_options); +IOR_offset_t POSIX_GetFileSize(aiori_mod_opt_t * test, char *testFileName); +void POSIX_Delete(char *testFileName, aiori_mod_opt_t * module_options); +int POSIX_Rename(const char *oldfile, const char *newfile, aiori_mod_opt_t * module_options); +void POSIX_Close(aiori_fd_t *fd, aiori_mod_opt_t * module_options); +option_help * POSIX_options(aiori_mod_opt_t ** init_backend_options, aiori_mod_opt_t * init_values); +void POSIX_xfer_hints(aiori_xfer_hint_t * params); + + +#endif diff --git a/src/aiori-S3.c b/src/aiori-S3-4c.c similarity index 74% rename from src/aiori-S3.c rename to src/aiori-S3-4c.c index a060646..6155ceb 100755 --- a/src/aiori-S3.c +++ b/src/aiori-S3-4c.c @@ -91,16 +91,6 @@ #include #include -/* -#ifdef HAVE_LUSTRE_LUSTRE_USER_H -#include -#endif -*/ - -#include "ior.h" -#include "aiori.h" -#include "iordef.h" - #include #include // from 
libxml2 @@ -109,28 +99,28 @@ #include "aws4c.h" // extended vers of "aws4c" lib for S3 via libcurl #include "aws4c_extra.h" // utilities, e.g. for parsing XML in responses +#include "ior.h" +#include "aiori.h" +#include "aiori-debug.h" +extern int rank; +extern MPI_Comm testComm; - -/* buffer is used to generate URLs, err_msgs, etc */ #define BUFF_SIZE 1024 -static char buff[BUFF_SIZE]; - const int ETAG_SIZE = 32; - CURLcode rc; -/* Any objects we create or delete will be under this bucket */ -const char* bucket_name = "ior"; - /* TODO: The following stuff goes into options! */ /* REST/S3 variables */ // CURL* curl; /* for libcurl "easy" fns (now managed by aws4c) */ -# define IOR_CURL_INIT 0x01 /* curl top-level inits were perfomed once? */ +# define IOR_CURL_INIT 0x01 /* curl top-level inits were performed once? */ # define IOR_CURL_NOCONTINUE 0x02 # define IOR_CURL_S3_EMC_EXT 0x04 /* allow EMC extensions to S3? */ -#ifdef USE_S3_AIORI +#define MAX_UPLOAD_ID_SIZE 256 /* TODO don't know the actual value */ + + +#ifdef USE_S3_4C_AIORI # include # include "aws4c.h" #else @@ -138,41 +128,60 @@ const char* bucket_name = "ior"; typedef void IOBuf; /* unused, but needs a type */ #endif - IOBuf* io_buf; /* aws4c places parsed header values here */ - IOBuf* etags; /* accumulate ETags for N:1 parts */ + +typedef struct { + /* Any objects we create or delete will be under this bucket */ + char* bucket_name; + char* user; + char* host; + /* Runtime data, this data isn't yet safe to allow concurrent access to multiple files, only open one file at a time */ + int curl_flags; + IOBuf* io_buf; /* aws4c places parsed header values here */ + IOBuf* etags; /* accumulate ETags for N:1 parts */ + size_t part_number; + char UploadId[MAX_UPLOAD_ID_SIZE]; /* key for multi-part-uploads */ + int written; /* did we write to the file */ +} s3_options_t; /////////////////////////////////////////////// +static aiori_xfer_hint_t * hints = NULL; + +static void S3_xfer_hints(aiori_xfer_hint_t * params){ + hints = params; +} + /**************************** P R O T O T Y P E S *****************************/ -static void* S3_Create(char*, IOR_param_t*); -static void* S3_Open(char*, IOR_param_t*); -static IOR_offset_t S3_Xfer(int, void*, IOR_size_t*, IOR_offset_t, IOR_param_t*); -static void S3_Close(void*, IOR_param_t*); +static aiori_fd_t* S3_Create(char *path, int iorflags, aiori_mod_opt_t * options); +static aiori_fd_t* S3_Open(char *path, int flags, aiori_mod_opt_t * options); +static IOR_offset_t S3_Xfer(int access, aiori_fd_t * afd, IOR_size_t * buffer, IOR_offset_t length, IOR_offset_t offset, aiori_mod_opt_t * options); +static void S3_Close(aiori_fd_t * afd, aiori_mod_opt_t * options); -static void* EMC_Create(char*, IOR_param_t*); -static void* EMC_Open(char*, IOR_param_t*); -static IOR_offset_t EMC_Xfer(int, void*, IOR_size_t*, IOR_offset_t, IOR_param_t*); -static void EMC_Close(void*, IOR_param_t*); - -static void S3_Delete(char*, IOR_param_t*); -static void S3_Fsync(void*, IOR_param_t*); -static IOR_offset_t S3_GetFileSize(IOR_param_t*, MPI_Comm, char*); -static void S3_init(void * options); -static void S3_finalize(void * options); -static int S3_check_params(IOR_param_t *); +static aiori_fd_t* EMC_Create(char *path, int iorflags, aiori_mod_opt_t * options); +static aiori_fd_t* EMC_Open(char *path, int flags, aiori_mod_opt_t * options); +static IOR_offset_t EMC_Xfer(int access, aiori_fd_t * afd, IOR_size_t * buffer, IOR_offset_t length, IOR_offset_t offset, aiori_mod_opt_t * options); +static void 
EMC_Close(aiori_fd_t * afd, aiori_mod_opt_t * options); +static void S3_Delete(char *path, aiori_mod_opt_t * options); +static void S3_Fsync(aiori_fd_t *fd, aiori_mod_opt_t * options); +static IOR_offset_t S3_GetFileSize(aiori_mod_opt_t * options, char *testFileName); +static void S3_init(aiori_mod_opt_t * options); +static void S3_finalize(aiori_mod_opt_t * options); +static int S3_check_params(aiori_mod_opt_t * options); +static option_help * S3_options(aiori_mod_opt_t ** init_backend_options, aiori_mod_opt_t * init_values); /************************** D E C L A R A T I O N S ***************************/ // "Pure S3" // N:1 writes use multi-part upload // N:N fails if "transfer-size" != "block-size" (because that requires "append") -ior_aiori_t s3_aiori = { - .name = "S3", +ior_aiori_t s3_4c_aiori = { + .name = "S3-4c", .name_legacy = NULL, .create = S3_Create, .open = S3_Open, .xfer = S3_Xfer, + .xfer_hints = S3_xfer_hints, .close = S3_Close, .delete = S3_Delete, .get_version = aiori_get_version, @@ -180,7 +189,9 @@ ior_aiori_t s3_aiori = { .get_file_size = S3_GetFileSize, .initialize = S3_init, .finalize = S3_finalize, - .check_params = S3_check_params + .check_params = S3_check_params, + .get_options = S3_options, + .enable_mdtest = true }; // "S3", plus EMC-extensions enabled @@ -193,7 +204,7 @@ ior_aiori_t s3_plus_aiori = { .xfer = S3_Xfer, .close = S3_Close, .delete = S3_Delete, - .set_version = S3_SetVersion, + .get_version = aiori_get_version, .fsync = S3_Fsync, .get_file_size = S3_GetFileSize, .initialize = S3_init, @@ -210,7 +221,7 @@ ior_aiori_t s3_emc_aiori = { .xfer = EMC_Xfer, .close = EMC_Close, .delete = S3_Delete, - .set_version = S3_SetVersion, + .get_version = aiori_get_version, .fsync = S3_Fsync, .get_file_size = S3_GetFileSize, .initialize = S3_init, @@ -218,26 +229,50 @@ ior_aiori_t s3_emc_aiori = { }; -static void S3_init(void * options){ +static option_help * S3_options(aiori_mod_opt_t ** init_backend_options, aiori_mod_opt_t * init_values){ + s3_options_t * o = malloc(sizeof(s3_options_t)); + if (init_values != NULL){ + memcpy(o, init_values, sizeof(s3_options_t)); + }else{ + memset(o, 0, sizeof(s3_options_t)); + } + + *init_backend_options = (aiori_mod_opt_t*) o; + o->bucket_name = "ior"; + + option_help h [] = { + {0, "S3-4c.user", "The username (in ~/.awsAuth).", OPTION_OPTIONAL_ARGUMENT, 's', & o->user}, + {0, "S3-4C.host", "The host optionally followed by:port.", OPTION_OPTIONAL_ARGUMENT, 's', & o->host}, + {0, "S3-4c.bucket-name", "The name of the bucket.", OPTION_OPTIONAL_ARGUMENT, 's', & o->bucket_name}, + LAST_OPTION + }; + option_help * help = malloc(sizeof(h)); + memcpy(help, h, sizeof(h)); + return help; +} + + +static void S3_init(aiori_mod_opt_t * options){ /* This is supposed to be done before *any* threads are created. * Could MPI_Init() create threads (or call multi-threaded * libraries)? We'll assume so. */ AWS4C_CHECK( aws_init() ); } -static void S3_finalize(void * options){ +static void S3_finalize(aiori_mod_opt_t * options){ /* done once per program, after exiting all threads. * NOTE: This fn doesn't return a value that can be checked for success. */ aws_cleanup(); } -static int S3_check_params(IOR_param_t * test){ +static int S3_check_params(aiori_mod_opt_t * test){ + if(! hints) return 0; /* N:1 and N:N */ - IOR_offset_t NtoN = test->filePerProc; + IOR_offset_t NtoN = hints->filePerProc; IOR_offset_t Nto1 = ! 
NtoN; - IOR_offset_t s = test->segmentCount; - IOR_offset_t t = test->transferSize; - IOR_offset_t b = test->blockSize; + IOR_offset_t s = hints->segmentCount; + IOR_offset_t t = hints->transferSize; + IOR_offset_t b = hints->blockSize; if (Nto1 && (s != 1) && (b != t)) { ERR("N:1 (strided) requires xfer-size == block-size"); @@ -286,21 +321,21 @@ static int S3_check_params(IOR_param_t * test){ * NOTE: Our custom version of aws4c can be configured so that connections * are reused, instead of opened and closed on every operation. We * do configure it that way, but you still need to call these - * connect/disconnet functions, in order to insure that aws4c has + * connect/disconnect functions, in order to insure that aws4c has * been configured. * --------------------------------------------------------------------------- */ -static void s3_connect( IOR_param_t* param ) { - if (param->verbose >= VERBOSE_2) { - printf("-> s3_connect\n"); /* DEBUGGING */ - } +static void s3_connect( s3_options_t* param ) { + //if (param->verbose >= VERBOSE_2) { + // printf("-> s3_connect\n"); /* DEBUGGING */ + //} if ( param->curl_flags & IOR_CURL_INIT ) { - if (param->verbose >= VERBOSE_2) { - printf("<- s3_connect [nothing to do]\n"); /* DEBUGGING */ - } + //if (param->verbose >= VERBOSE_2) { + // printf("<- s3_connect [nothing to do]\n"); /* DEBUGGING */ + //} return; } @@ -318,11 +353,11 @@ static void s3_connect( IOR_param_t* param ) { // NOTE: These inits could be done in init_IORParam_t(), in ior.c, but // would require conditional compilation, there. - aws_set_debug(param->verbose >= 4); - aws_read_config(getenv("USER")); // requires ~/.awsAuth + aws_set_debug(0); // param->verbose >= 4 + aws_read_config(param->user); // requires ~/.awsAuth aws_reuse_connections(1); - // initalize IOBufs. These are basically dynamically-extensible + // initialize IOBufs. These are basically dynamically-extensible // linked-lists. "growth size" controls the increment of new memory // allocated, whenever storage is used up. 
param->io_buf = aws_iobuf_new(); @@ -346,8 +381,8 @@ static void s3_connect( IOR_param_t* param ) { // snprintf(buff, BUFF_SIZE, "10.140.0.%d", 15 + (rank % 4)); // s3_set_host(buff); - snprintf(buff, BUFF_SIZE, "10.140.0.%d:9020", 15 + (rank % 4)); - s3_set_host(buff); + //snprintf(options->buff, BUFF_SIZE, "10.140.0.%d:9020", 15 + (rank % 4)); + //s3_set_host(options->buff); #else /* @@ -366,23 +401,25 @@ static void s3_connect( IOR_param_t* param ) { // s3_set_host( "10.143.0.1:80"); #endif + s3_set_host(param->host); + // make sure test-bucket exists - s3_set_bucket((char*)bucket_name); + s3_set_bucket((char*) param->bucket_name); if (rank == 0) { AWS4C_CHECK( s3_head(param->io_buf, "") ); if ( param->io_buf->code == 404 ) { // "404 Not Found" - printf(" bucket '%s' doesn't exist\n", bucket_name); + printf(" bucket '%s' doesn't exist\n", param->bucket_name); AWS4C_CHECK( s3_put(param->io_buf, "") ); /* creates URL as bucket + obj */ AWS4C_CHECK_OK( param->io_buf ); // assure "200 OK" - printf("created bucket '%s'\n", bucket_name); + printf("created bucket '%s'\n", param->bucket_name); } else { // assure "200 OK" AWS4C_CHECK_OK( param->io_buf ); } } - MPI_CHECK(MPI_Barrier(param->testComm), "barrier error"); + MPI_CHECK(MPI_Barrier(testComm), "barrier error"); // Maybe allow EMC extensions to S3 @@ -391,24 +428,22 @@ static void s3_connect( IOR_param_t* param ) { // don't perform these inits more than once param->curl_flags |= IOR_CURL_INIT; - - if (param->verbose >= VERBOSE_2) { - printf("<- s3_connect [success]\n"); - } + //if (param->verbose >= VERBOSE_2) { + // printf("<- s3_connect [success]\n"); + //} } static void -s3_disconnect( IOR_param_t* param ) { - if (param->verbose >= VERBOSE_2) { - printf("-> s3_disconnect\n"); - } - +s3_disconnect( s3_options_t* param ) { + //if (param->verbose >= VERBOSE_2) { + // printf("-> s3_disconnect\n"); + //} // nothing to do here, if using new aws4c ... - if (param->verbose >= VERBOSE_2) { - printf("<- s3_disconnect\n"); - } + //if (param->verbose >= VERBOSE_2) { + // printf("<- s3_disconnect\n"); + //} } @@ -416,8 +451,7 @@ s3_disconnect( IOR_param_t* param ) { // After finalizing an S3 multi-part-upload, you must reset some things // before you can use multi-part-upload again. This will also avoid (one // particular set of) memory-leaks. 
-void -s3_MPU_reset(IOR_param_t* param) { +void s3_MPU_reset(s3_options_t* param) { aws_iobuf_reset(param->io_buf); aws_iobuf_reset(param->etags); param->part_number = 0; @@ -453,46 +487,44 @@ s3_MPU_reset(IOR_param_t* param) { * */ -static -void * -S3_Create_Or_Open_internal(char* testFileName, - IOR_param_t* param, - unsigned char createFile, - int multi_part_upload_p ) { +static aiori_fd_t * S3_Create_Or_Open_internal(char* testFileName, int openFlags, s3_options_t* param, int multi_part_upload_p ) { + unsigned char createFile = openFlags & IOR_CREAT; - if (param->verbose >= VERBOSE_2) { - printf("-> S3_Create_Or_Open('%s', ,%d, %d)\n", - testFileName, createFile, multi_part_upload_p); - } + //if (param->verbose >= VERBOSE_2) { + // printf("-> S3_Create_Or_Open('%s', ,%d, %d)\n", + // testFileName, createFile, multi_part_upload_p); + //} /* initialize curl, if needed */ s3_connect( param ); /* Check for unsupported flags */ - if ( param->openFlags & IOR_EXCL ) { - fprintf( stdout, "Opening in Exclusive mode is not implemented in S3\n" ); - } - if ( param->useO_DIRECT == TRUE ) { - fprintf( stdout, "Direct I/O mode is not implemented in S3\n" ); - } + //if ( param->openFlags & IOR_EXCL ) { + // fprintf( stdout, "Opening in Exclusive mode is not implemented in S3\n" ); + //} + //if ( param->useO_DIRECT == TRUE ) { + // fprintf( stdout, "Direct I/O mode is not implemented in S3\n" ); + //} // easier to think - int n_to_n = param->filePerProc; + int n_to_n = hints->filePerProc; int n_to_1 = ! n_to_n; /* check whether object needs reset to zero-length */ int needs_reset = 0; if (! multi_part_upload_p) needs_reset = 1; /* so "append" can work */ - else if ( param->openFlags & IOR_TRUNC ) + else if ( openFlags & IOR_TRUNC ) needs_reset = 1; /* so "append" can work */ else if (createFile) { // AWS4C_CHECK( s3_head(param->io_buf, testFileName) ); // if ( ! AWS4C_OK(param->io_buf) ) needs_reset = 1; } - - if ( param->open == WRITE ) { + char buff[BUFF_SIZE]; /* buffer is used to generate URLs, err_msgs, etc */ + param->written = 0; + if ( openFlags & IOR_WRONLY || openFlags & IOR_RDWR ) { + param->written = 1; /* initializations for N:1 or N:N writes using multi-part upload */ if (multi_part_upload_p) { @@ -522,23 +554,21 @@ S3_Create_Or_Open_internal(char* testFileName, response->first->len, NULL, NULL, 0); if (doc == NULL) - ERR_SIMPLE("Rank0 Failed to find POST response\n"); + ERR("Rank0 Failed to find POST response\n"); // navigate parsed XML-tree to find UploadId xmlNode* root_element = xmlDocGetRootElement(doc); const char* upload_id = find_element_named(root_element, (char*)"UploadId"); if (! 
upload_id) - ERR_SIMPLE("couldn't find 'UploadId' in returned XML\n"); + ERR("couldn't find 'UploadId' in returned XML\n"); - if (param->verbose >= VERBOSE_3) - printf("got UploadId = '%s'\n", upload_id); + //if (param->verbose >= VERBOSE_3) + // printf("got UploadId = '%s'\n", upload_id); const size_t upload_id_len = strlen(upload_id); if (upload_id_len > MAX_UPLOAD_ID_SIZE) { - snprintf(buff, BUFF_SIZE, - "UploadId length %d exceeds expected max (%d)", - upload_id_len, MAX_UPLOAD_ID_SIZE); - ERR_SIMPLE(buff); + snprintf(buff, BUFF_SIZE, "UploadId length %zd exceeds expected max (%d)", upload_id_len, MAX_UPLOAD_ID_SIZE); + ERR(buff); } // save the UploadId we found @@ -551,16 +581,15 @@ S3_Create_Or_Open_internal(char* testFileName, // For N:1, share UploadId across all ranks if (n_to_1) - MPI_Bcast(param->UploadId, MAX_UPLOAD_ID_SIZE, MPI_BYTE, 0, param->testComm); + MPI_Bcast(param->UploadId, MAX_UPLOAD_ID_SIZE, MPI_BYTE, 0, testComm); } else // N:1, and we're not rank0. recv UploadID from Rank 0 - MPI_Bcast(param->UploadId, MAX_UPLOAD_ID_SIZE, MPI_BYTE, 0, param->testComm); + MPI_Bcast(param->UploadId, MAX_UPLOAD_ID_SIZE, MPI_BYTE, 0, testComm); } /* initializations for N:N or N:1 writes using EMC byte-range extensions */ else { - /* maybe reset to zero-length, so "append" can work */ if (needs_reset) { @@ -576,84 +605,48 @@ S3_Create_Or_Open_internal(char* testFileName, } } - - if (param->verbose >= VERBOSE_2) { - printf("<- S3_Create_Or_Open\n"); - } - return ((void *) testFileName ); + //if (param->verbose >= VERBOSE_2) { + // printf("<- S3_Create_Or_Open\n"); + //} + return ((aiori_fd_t *) testFileName ); } +static aiori_fd_t * S3_Create( char *testFileName, int iorflags, aiori_mod_opt_t * param ) { + //if (param->verbose >= VERBOSE_2) { + // printf("-> S3_Create\n"); + //} - -static -void * -S3_Create( char *testFileName, IOR_param_t * param ) { - if (param->verbose >= VERBOSE_2) { - printf("-> S3_Create\n"); - } - - if (param->verbose >= VERBOSE_2) { - printf("<- S3_Create\n"); - } - return S3_Create_Or_Open_internal( testFileName, param, TRUE, TRUE ); -} -static -void * -EMC_Create( char *testFileName, IOR_param_t * param ) { - if (param->verbose >= VERBOSE_2) { - printf("-> EMC_Create\n"); - } - - if (param->verbose >= VERBOSE_2) { - printf("<- EMC_Create\n"); - } - return S3_Create_Or_Open_internal( testFileName, param, TRUE, FALSE ); + //if (param->verbose >= VERBOSE_2) { + // printf("<- S3_Create\n"); + //} + return S3_Create_Or_Open_internal( testFileName, iorflags, (s3_options_t*) param, TRUE ); } +static aiori_fd_t * EMC_Create( char *testFileName, int iorflags, aiori_mod_opt_t * param ) { + //if (param->verbose >= VERBOSE_2) { + // printf("-> EMC_Create\n"); + //} - - - - -static -void * -S3_Open( char *testFileName, IOR_param_t * param ) { - if (param->verbose >= VERBOSE_2) { - printf("-> S3_Open\n"); - } - - if ( param->openFlags & IOR_CREAT ) { - if (param->verbose >= VERBOSE_2) { - printf("<- S3_Open( ... TRUE)\n"); - } - return S3_Create_Or_Open_internal( testFileName, param, TRUE, TRUE ); - } - else { - if (param->verbose >= VERBOSE_2) { - printf("<- S3_Open( ... 
FALSE)\n"); - } - return S3_Create_Or_Open_internal( testFileName, param, FALSE, TRUE ); - } + //if (param->verbose >= VERBOSE_2) { + // printf("<- EMC_Create\n"); + //} + return S3_Create_Or_Open_internal( testFileName, iorflags, (s3_options_t*) param, FALSE ); } -static -void * -EMC_Open( char *testFileName, IOR_param_t * param ) { - if (param->verbose >= VERBOSE_2) { - printf("-> S3_Open\n"); - } - if ( param->openFlags & IOR_CREAT ) { - if (param->verbose >= VERBOSE_2) { - printf("<- EMC_Open( ... TRUE)\n"); - } - return S3_Create_Or_Open_internal( testFileName, param, TRUE, FALSE ); - } - else { - if (param->verbose >= VERBOSE_2) { - printf("<- EMC_Open( ... FALSE)\n"); - } - return S3_Create_Or_Open_internal( testFileName, param, FALSE, FALSE ); - } +static aiori_fd_t * S3_Open( char *testFileName, int flags, aiori_mod_opt_t * param ) { + //if (param->verbose >= VERBOSE_2) { + // printf("-> S3_Open\n"); + //} + + return S3_Create_Or_Open_internal( testFileName, flags, (s3_options_t*) param, TRUE ); +} + +static aiori_fd_t * EMC_Open( char *testFileName, int flags, aiori_mod_opt_t * param ) { + //if (param->verbose >= VERBOSE_2) { + // printf("-> S3_Open\n"); + //} + + return S3_Create_Or_Open_internal( testFileName, flags, (s3_options_t*) param, FALSE ); } @@ -714,7 +707,7 @@ EMC_Open( char *testFileName, IOR_param_t * param ) { * impose two scaling problems: (1) requires all ETags to be shipped at * the BW available to a single process, (1) requires either that they * all fit into memory of a single process, or be written to disk - * (imposes additional BW contraints), or make a more-complex + * (imposes additional BW constraints), or make a more-complex * interaction with a threaded curl writefunction, to present the * appearance of a single thread to curl, whilst allowing streaming * reception of non-local ETags. @@ -730,39 +723,35 @@ EMC_Open( char *testFileName, IOR_param_t * param ) { */ -static -IOR_offset_t -S3_Xfer_internal(int access, - void* file, +static IOR_offset_t S3_Xfer_internal(int access, + aiori_fd_t* file, IOR_size_t* buffer, IOR_offset_t length, - IOR_param_t* param, + IOR_offset_t offset, + s3_options_t* param, int multi_part_upload_p ) { - - if (param->verbose >= VERBOSE_2) { - printf("-> S3_Xfer(acc:%d, target:%s, buf:0x%llx, len:%llu, 0x%llx)\n", - access, (char*)file, buffer, length, param); - } + //if (param->verbose >= VERBOSE_2) { + // printf("-> S3_Xfer(acc:%d, target:%s, buf:0x%llx, len:%llu, 0x%llx)\n", + // access, (char*)file, buffer, length, param); + //} char* fname = (char*)file; /* see NOTE above S3_Create_Or_Open() */ size_t remaining = (size_t)length; char* data_ptr = (char *)buffer; - off_t offset = param->offset; // easier to think - int n_to_n = param->filePerProc; + int n_to_n = hints->filePerProc; int n_to_1 = (! 
n_to_n); - int segmented = (param->segmentCount == 1); + int segmented = (hints->segmentCount == 1); if (access == WRITE) { /* WRITE */ - - if (verbose >= VERBOSE_3) { - fprintf( stdout, "rank %d writing length=%lld to offset %lld\n", - rank, - remaining, - param->offset + length - remaining); - } + //if (verbose >= VERBOSE_3) { + // fprintf( stdout, "rank %d writing length=%lld to offset %lld\n", + // rank, + // remaining, + // param->offset + length - remaining); + //} if (multi_part_upload_p) { @@ -777,7 +766,7 @@ S3_Xfer_internal(int access, // // In the N:1 case, the global order of part-numbers we're writing // depends on whether wer're writing strided or segmented, in - // other words, how and are acutally + // other words, how and are actually // positioning the parts being written. [See discussion at // S3_Close_internal().] // @@ -790,11 +779,11 @@ S3_Xfer_internal(int access, size_t part_number; if (n_to_1) { if (segmented) { // segmented - size_t parts_per_rank = param->blockSize / param->transferSize; + size_t parts_per_rank = hints->blockSize / hints->transferSize; part_number = (rank * parts_per_rank) + param->part_number; } else // strided - part_number = (param->part_number * param->numTasks) + rank; + part_number = (param->part_number * hints->numTasks) + rank; } else part_number = param->part_number; @@ -804,14 +793,15 @@ S3_Xfer_internal(int access, // if (verbose >= VERBOSE_3) { // fprintf( stdout, "rank %d of %d writing (%s,%s) part_number %lld\n", // rank, - // param->numTasks, + // hints->numTasks, // (n_to_1 ? "N:1" : "N:N"), // (segmented ? "segmented" : "strided"), // part_number); // } + char buff[BUFF_SIZE]; /* buffer is used to generate URLs, err_msgs, etc */ snprintf(buff, BUFF_SIZE, - "%s?partNumber=%d&uploadId=%s", + "%s?partNumber=%zd&uploadId=%s", fname, part_number, param->UploadId); // For performance, we append directly into the linked list @@ -838,16 +828,16 @@ S3_Xfer_internal(int access, // } // } - if (verbose >= VERBOSE_3) { - fprintf( stdout, "rank %d of %d (%s,%s) offset %lld, part# %lld --> ETag %s\n", - rank, - param->numTasks, - (n_to_1 ? "N:1" : "N:N"), - (segmented ? "segmented" : "strided"), - offset, - part_number, - param->io_buf->eTag); // incl quote-marks at [0] and [len-1] - } + //if (verbose >= VERBOSE_3) { + // fprintf( stdout, "rank %d of %d (%s,%s) offset %lld, part# %lld --> ETag %s\n", + // rank, + // hints->numTasks, + // (n_to_1 ? "N:1" : "N:N"), + // (segmented ? "segmented" : "strided"), + // offset, + // part_number, + // param->io_buf->eTag); // incl quote-marks at [0] and [len-1] + //} if (strlen(param->io_buf->eTag) != ETAG_SIZE+2) { /* quotes at both ends */ fprintf(stderr, "Rank %d: ERROR: expected ETag to be %d hex digits\n", rank, ETAG_SIZE); @@ -862,9 +852,9 @@ S3_Xfer_internal(int access, param->io_buf->eTag +1, strlen(param->io_buf->eTag) -2); // DEBUGGING - if (verbose >= VERBOSE_4) { - printf("rank %d: part %d = ETag %s\n", rank, part_number, param->io_buf->eTag); - } + //if (verbose >= VERBOSE_4) { + // printf("rank %d: part %d = ETag %s\n", rank, part_number, param->io_buf->eTag); + //} // drop ptrs to , in param->io_buf aws_iobuf_reset(param->io_buf); @@ -885,7 +875,7 @@ S3_Xfer_internal(int access, // than empty storage. 
aws_iobuf_reset(param->io_buf); aws_iobuf_append_static(param->io_buf, data_ptr, remaining); - AWS4C_CHECK ( s3_put(param->io_buf, file) ); + AWS4C_CHECK ( s3_put(param->io_buf, (char*) file) ); AWS4C_CHECK_OK( param->io_buf ); // drop ptrs to , in param->io_buf @@ -893,18 +883,18 @@ S3_Xfer_internal(int access, } - if ( param->fsyncPerWrite == TRUE ) { + if ( hints->fsyncPerWrite == TRUE ) { WARN("S3 doesn't support 'fsync'" ); /* does it? */ } } else { /* READ or CHECK */ - if (verbose >= VERBOSE_3) { - fprintf( stdout, "rank %d reading from offset %lld\n", - rank, - param->offset + length - remaining ); - } + //if (verbose >= VERBOSE_3) { + // fprintf( stdout, "rank %d reading from offset %lld\n", + // rank, + // hints->offset + length - remaining ); + //} // read specific byte-range from the object // [This is included in the "pure" S3 spec.] @@ -917,43 +907,45 @@ S3_Xfer_internal(int access, // libcurl writefunction, invoked via aws4c. aws_iobuf_reset(param->io_buf); aws_iobuf_extend_static(param->io_buf, data_ptr, remaining); - AWS4C_CHECK( s3_get(param->io_buf, file) ); + AWS4C_CHECK( s3_get(param->io_buf, (char*) file) ); if (param->io_buf->code != 206) { /* '206 Partial Content' */ + char buff[BUFF_SIZE]; /* buffer is used to generate URLs, err_msgs, etc */ snprintf(buff, BUFF_SIZE, "Unexpected result (%d, '%s')", param->io_buf->code, param->io_buf->result); - ERR_SIMPLE(buff); + ERR(buff); } // drop refs to , in param->io_buf aws_iobuf_reset(param->io_buf); } - - if (param->verbose >= VERBOSE_2) { - printf("<- S3_Xfer\n"); - } + //if (verbose >= VERBOSE_2) { + // printf("<- S3_Xfer\n"); + //} return ( length ); } -static -IOR_offset_t -S3_Xfer(int access, - void* file, +static IOR_offset_t S3_Xfer(int access, + aiori_fd_t* file, IOR_size_t* buffer, IOR_offset_t length, - IOR_param_t* param ) { - S3_Xfer_internal(access, file, buffer, length, param, TRUE); + IOR_offset_t offset, + aiori_mod_opt_t* param ) { + S3_Xfer_internal(access, file, buffer, length, offset, (s3_options_t*) param, TRUE); } + + static IOR_offset_t EMC_Xfer(int access, - void* file, + aiori_fd_t* file, IOR_size_t* buffer, IOR_offset_t length, - IOR_param_t* param ) { - S3_Xfer_internal(access, file, buffer, length, param, FALSE); + IOR_offset_t offset, + aiori_mod_opt_t* param ) { + S3_Xfer_internal(access, file, buffer, length, offset, (s3_options_t*) param, FALSE); } @@ -992,16 +984,10 @@ EMC_Xfer(int access, * MPI_COMM_WORLD. */ -static -void -S3_Fsync( void *fd, IOR_param_t * param ) { - if (param->verbose >= VERBOSE_2) { - printf("-> S3_Fsync [no-op]\n"); - } - - if (param->verbose >= VERBOSE_2) { - printf("<- S3_Fsync\n"); - } +static void S3_Fsync( aiori_fd_t *fd, aiori_mod_opt_t * param ) { + //if (param->verbose >= VERBOSE_2) { + // printf("-> S3_Fsync [no-op]\n"); + //} } @@ -1014,7 +1000,7 @@ S3_Fsync( void *fd, IOR_param_t * param ) { * * ISSUE: The S3 spec says that a multi-part upload can have at most 10,000 * parts. Does EMC allow more than this? (NOTE the spec also says - * parts must be at leaast 5MB, but EMC definitely allows smaller + * parts must be at least 5MB, but EMC definitely allows smaller * parts than that.) * * ISSUE: All Etags must be sent from a single rank, in a single @@ -1030,29 +1016,17 @@ S3_Fsync( void *fd, IOR_param_t * param ) { * See S3_Fsync() for some possible considerations. 
*/ -static -void -S3_Close_internal( void* fd, - IOR_param_t* param, - int multi_part_upload_p ) { +static void S3_Close_internal(aiori_fd_t* fd, s3_options_t* param, int multi_part_upload_p) { char* fname = (char*)fd; /* see NOTE above S3_Create_Or_Open() */ // easier to think - int n_to_n = param->filePerProc; + int n_to_n = hints->filePerProc; int n_to_1 = (! n_to_n); - int segmented = (param->segmentCount == 1); - - if (param->verbose >= VERBOSE_2) { - printf("-> S3_Close('%s', ,%d) %s\n", - fname, - multi_part_upload_p, - ((n_to_n) ? "N:N" : ((segmented) ? "N:1(seg)" : "N:1(str)"))); - } - - if (param->open == WRITE) { + int segmented = (hints->segmentCount == 1); + if (param->written) { // finalizing Multi-Part Upload (for N:1 or N:N) if (multi_part_upload_p) { @@ -1078,11 +1052,11 @@ S3_Close_internal( void* fd, // Everybody should have the same number of ETags (?) size_t etag_count_max = 0; /* highest number on any proc */ MPI_Allreduce(&etags_per_rank, &etag_count_max, - 1, mpi_size_t, MPI_MAX, param->testComm); + 1, mpi_size_t, MPI_MAX, testComm); if (etags_per_rank != etag_count_max) { - printf("Rank %d: etag count mismatch: max:%d, mine:%d\n", + printf("Rank %d: etag count mismatch: max:%zd, mine:%zd\n", rank, etag_count_max, etags_per_rank); - MPI_Abort(param->testComm, 1); + MPI_Abort(testComm, 1); } // collect ETag data at Rank0 @@ -1095,26 +1069,25 @@ S3_Close_internal( void* fd, int j; int rnk; - char* etag_vec = (char*)malloc((param->numTasks * etag_data_size) +1); + char* etag_vec = (char*)malloc((hints->numTasks * etag_data_size) +1); if (! etag_vec) { - fprintf(stderr, "rank 0 failed to malloc %d bytes\n", - param->numTasks * etag_data_size); - MPI_Abort(param->testComm, 1); + fprintf(stderr, "rank 0 failed to malloc %zd bytes\n", + hints->numTasks * etag_data_size); + MPI_Abort(testComm, 1); } MPI_Gather(etag_data, etag_data_size, MPI_BYTE, - etag_vec, etag_data_size, MPI_BYTE, 0, MPI_COMM_WORLD); + etag_vec, etag_data_size, MPI_BYTE, 0, testComm); // --- debugging: show the gathered etag data // (This shows the raw concatenated etag-data from each node.) - if (param->verbose >= VERBOSE_4) { - - printf("rank 0: gathered %d etags from all ranks:\n", etags_per_rank); + if (verbose >= VERBOSE_4) { + printf("rank 0: gathered %zd etags from all ranks:\n", etags_per_rank); etag_ptr=etag_vec; - for (rnk=0; rnknumTasks; ++rnk) { + for (rnk=0; rnk < hints->numTasks; ++rnk) { printf("\t[%d]: '", rnk); int ii; - for (ii=0; ii parts, // locally. At rank0, the etags for each rank are now - // stored as a continguous block of text, with the blocks + // stored as a contiguous block of text, with the blocks // stored in rank order in etag_vec. In other words, our // internal rep at rank 0 matches the "segmented" format. 
// From this, we must select etags in an order matching how @@ -1173,14 +1146,14 @@ S3_Close_internal( void* fd, size_t stride; // in etag_vec if (segmented) { // segmented - i_max = param->numTasks; + i_max = hints->numTasks; j_max = etags_per_rank; start_multiplier = etag_data_size; /* one rank's-worth of Etag data */ stride = ETAG_SIZE; /* one ETag */ } else { // strided i_max = etags_per_rank; - j_max = param->numTasks; + j_max = hints->numTasks; start_multiplier = ETAG_SIZE; /* one ETag */ stride = etag_data_size; /* one rank's-worth of Etag data */ } @@ -1203,7 +1176,7 @@ S3_Close_internal( void* fd, char etag[ETAG_SIZE +1]; memcpy(etag, etag_ptr, ETAG_SIZE); etag[ETAG_SIZE] = 0; - + char buff[BUFF_SIZE]; /* buffer is used to generate URLs, err_msgs, etc */ // write XML for next part, with Etag ... snprintf(buff, BUFF_SIZE, " \n" @@ -1221,15 +1194,11 @@ S3_Close_internal( void* fd, // write XML tail ... aws_iobuf_append_str(xml, "\n"); - } - - else { + } else { MPI_Gather(etag_data, etag_data_size, MPI_BYTE, - NULL, etag_data_size, MPI_BYTE, 0, MPI_COMM_WORLD); + NULL, etag_data_size, MPI_BYTE, 0, testComm); } - } - - else { /* N:N */ + } else { /* N:N */ xml = aws_iobuf_new(); aws_iobuf_growth_size(xml, 1024 * 8); @@ -1241,6 +1210,7 @@ S3_Close_internal( void* fd, char etag[ETAG_SIZE +1]; int part = 0; int i; + char buff[BUFF_SIZE]; /* buffer is used to generate URLs, err_msgs, etc */ for (i=0; i\n"); } - - // send request to finalize MPU if (n_to_n || (rank == 0)) { // DEBUGGING: show the XML we constructed - if (param->verbose >= VERBOSE_3) + if (verbose >= VERBOSE_3) debug_iobuf(xml, 1, 1); - + char buff[BUFF_SIZE]; /* buffer is used to generate URLs, err_msgs, etc */ // --- POST our XML to the server. snprintf(buff, BUFF_SIZE, "%s?uploadId=%s", @@ -1300,42 +1268,36 @@ S3_Close_internal( void* fd, // N:1 file until rank0 has finished the S3 multi-part finalize. // The object will not appear to exist, until then. if (n_to_1) - MPI_CHECK(MPI_Barrier(param->testComm), "barrier error"); - } - else { + MPI_CHECK(MPI_Barrier(testComm), "barrier error"); + } else { // No finalization is needed, when using EMC's byte-range writing // support. However, we do need to make sure everyone has // finished writing, before anyone starts reading. if (n_to_1) { - MPI_CHECK(MPI_Barrier(param->testComm), "barrier error"); - if (param->verbose >= VERBOSE_2) - printf("rank %d: passed barrier\n", rank); - } - } + MPI_CHECK(MPI_Barrier(testComm), "barrier error"); + //if (verbose >= VERBOSE_2) + // printf("rank %d: passed barrier\n", rank); + //} + } + } // After writing, reset the CURL connection, so that caches won't be // used for reads. aws_reset_connection(); } - - if (param->verbose >= VERBOSE_2) { - printf("<- S3_Close\n"); - } + //if (param->verbose >= VERBOSE_2) { + // printf("<- S3_Close\n"); + //} } -static -void -S3_Close( void* fd, - IOR_param_t* param ) { - S3_Close_internal(fd, param, TRUE); +static void S3_Close( aiori_fd_t* fd, aiori_mod_opt_t* param ) { + S3_Close_internal(fd, (s3_options_t*) param, TRUE); } -static -void -EMC_Close( void* fd, - IOR_param_t* param ) { - S3_Close_internal(fd, param, FALSE); + +static void EMC_Close( aiori_fd_t* fd, aiori_mod_opt_t* param ) { + S3_Close_internal(fd, (s3_options_t*) param, FALSE); } @@ -1349,13 +1311,36 @@ EMC_Close( void* fd, * successfully read. 
*/ -static -void -S3_Delete( char *testFileName, IOR_param_t * param ) { +static void S3_Delete( char *testFileName, aiori_mod_opt_t * options ) { + //if (param->verbose >= VERBOSE_2) { + // printf("-> S3_Delete(%s)\n", testFileName); + //} + /* maybe initialize curl */ + s3_options_t * param = (s3_options_t*) options; + s3_connect(param ); - if (param->verbose >= VERBOSE_2) { - printf("-> S3_Delete(%s)\n", testFileName); - } +#if 0 + // EMC BUG: If file was written with appends, and is deleted, + // Then any future recreation will result in an object that can't be read. + // this + AWS4C_CHECK( s3_delete(param->io_buf, testFileName) ); +#else + // just replace with a zero-length object for now + aws_iobuf_reset(param->io_buf); + AWS4C_CHECK ( s3_put(param->io_buf, testFileName) ); +#endif + + AWS4C_CHECK_OK( param->io_buf ); + //if (verbose >= VERBOSE_2) + // printf("<- S3_Delete\n"); +} + + +static void EMC_Delete( char *testFileName, aiori_mod_opt_t * options ) { + s3_options_t * param = (s3_options_t*) options; + //if (param->verbose >= VERBOSE_2) { + // printf("-> EMC_Delete(%s)\n", testFileName); + //} /* maybe initialize curl */ s3_connect( param ); @@ -1372,45 +1357,10 @@ S3_Delete( char *testFileName, IOR_param_t * param ) { #endif AWS4C_CHECK_OK( param->io_buf ); - - if (param->verbose >= VERBOSE_2) - printf("<- S3_Delete\n"); + //if (param->verbose >= VERBOSE_2) + // printf("<- EMC_Delete\n"); } - -static -void -EMC_Delete( char *testFileName, IOR_param_t * param ) { - - if (param->verbose >= VERBOSE_2) { - printf("-> EMC_Delete(%s)\n", testFileName); - } - - /* maybe initialize curl */ - s3_connect( param ); - -#if 0 - // EMC BUG: If file was written with appends, and is deleted, - // Then any future recreation will result in an object that can't be read. - // this - AWS4C_CHECK( s3_delete(param->io_buf, testFileName) ); -#else - // just replace with a zero-length object for now - aws_iobuf_reset(param->io_buf); - AWS4C_CHECK ( s3_put(param->io_buf, testFileName) ); -#endif - - AWS4C_CHECK_OK( param->io_buf ); - - if (param->verbose >= VERBOSE_2) - printf("<- EMC_Delete\n"); -} - - - - - - /* * HTTP HEAD returns meta-data for a "file". * @@ -1420,15 +1370,11 @@ EMC_Delete( char *testFileName, IOR_param_t * param ) { * request more data than the header actually takes? */ -static -IOR_offset_t -S3_GetFileSize(IOR_param_t * param, - MPI_Comm testComm, - char * testFileName) { - - if (param->verbose >= VERBOSE_2) { - printf("-> S3_GetFileSize(%s)\n", testFileName); - } +static IOR_offset_t S3_GetFileSize(aiori_mod_opt_t * options, char * testFileName) { + s3_options_t * param = (s3_options_t*) options; + //if (param->verbose >= VERBOSE_2) { + // printf("-> S3_GetFileSize(%s)\n", testFileName); + //} IOR_offset_t aggFileSizeFromStat; /* i.e. "long long int" */ IOR_offset_t tmpMin, tmpMax, tmpSum; @@ -1442,63 +1388,9 @@ S3_GetFileSize(IOR_param_t * param, if ( ! 
AWS4C_OK(param->io_buf) ) { fprintf(stderr, "rank %d: couldn't stat '%s': %s\n", rank, testFileName, param->io_buf->result); - MPI_Abort(param->testComm, 1); + MPI_Abort(testComm, 1); } aggFileSizeFromStat = param->io_buf->contentLen; - if (param->verbose >= VERBOSE_2) { - printf("\trank %d: file-size %llu\n", rank, aggFileSizeFromStat); - } - - if ( param->filePerProc == TRUE ) { - if (param->verbose >= VERBOSE_2) { - printf("\tall-reduce (1)\n"); - } - MPI_CHECK(MPI_Allreduce(&aggFileSizeFromStat, - &tmpSum, /* sum */ - 1, - MPI_LONG_LONG_INT, - MPI_SUM, - testComm ), - "cannot total data moved" ); - - aggFileSizeFromStat = tmpSum; - } - else { - if (param->verbose >= VERBOSE_2) { - printf("\tall-reduce (2a)\n"); - } - MPI_CHECK(MPI_Allreduce(&aggFileSizeFromStat, - &tmpMin, /* min */ - 1, - MPI_LONG_LONG_INT, - MPI_MIN, - testComm ), - "cannot total data moved" ); - - if (param->verbose >= VERBOSE_2) { - printf("\tall-reduce (2b)\n"); - } - MPI_CHECK(MPI_Allreduce(&aggFileSizeFromStat, - &tmpMax, /* max */ - 1, - MPI_LONG_LONG_INT, - MPI_MAX, - testComm ), - "cannot total data moved" ); - - if ( tmpMin != tmpMax ) { - if ( rank == 0 ) { - WARN( "inconsistent file size by different tasks" ); - } - - /* incorrect, but now consistent across tasks */ - aggFileSizeFromStat = tmpMin; - } - } - - if (param->verbose >= VERBOSE_2) { - printf("<- S3_GetFileSize [%llu]\n", aggFileSizeFromStat); - } return ( aggFileSizeFromStat ); } diff --git a/src/aiori-S3-libs3.c b/src/aiori-S3-libs3.c new file mode 100644 index 0000000..98d5df9 --- /dev/null +++ b/src/aiori-S3-libs3.c @@ -0,0 +1,586 @@ +/* +* S3 implementation using the newer libs3 +* https://github.com/bji/libs3 +* Use one object per file chunk +*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include +#include +#include + +#include + +#include "ior.h" +#include "aiori.h" +#include "aiori-debug.h" +#include "utilities.h" + + +static aiori_xfer_hint_t * hints = NULL; + +static void s3_xfer_hints(aiori_xfer_hint_t * params){ + hints = params; +} + +/************************** O P T I O N S *****************************/ +typedef struct { + int bucket_per_file; + char * access_key; + char * secret_key; + char * host; + char * bucket_prefix; + char * bucket_prefix_cur; + char * locationConstraint; + char * authRegion; + + int timeout; + int dont_suffix; + int s3_compatible; + int use_ssl; + S3BucketContext bucket_context; + S3Protocol s3_protocol; +} s3_options_t; + +static option_help * S3_options(aiori_mod_opt_t ** init_backend_options, aiori_mod_opt_t * init_values){ + s3_options_t * o = malloc(sizeof(s3_options_t)); + if (init_values != NULL){ + memcpy(o, init_values, sizeof(s3_options_t)); + }else{ + memset(o, 0, sizeof(s3_options_t)); + } + + *init_backend_options = (aiori_mod_opt_t*) o; + o->bucket_prefix = "ior"; + o->bucket_prefix_cur = "b"; + + option_help h [] = { + {0, "S3-libs3.bucket-per-file", "Use one bucket to map one file/directory, otherwise one bucket is used to store all dirs/files.", OPTION_FLAG, 'd', & o->bucket_per_file}, + {0, "S3-libs3.bucket-name-prefix", "The prefix of the bucket(s).", OPTION_OPTIONAL_ARGUMENT, 's', & o->bucket_prefix}, + {0, "S3-libs3.dont-suffix-bucket", "By default a hash will be added to the bucket name to increase uniqueness, this disables the option.", OPTION_FLAG, 'd', & o->dont_suffix }, + {0, "S3-libs3.s3-compatible", "to be selected when using S3 compatible storage", OPTION_FLAG, 'd', & o->s3_compatible }, + {0, "S3-libs3.use-ssl", "used to specify that SSL is needed for 
the connection", OPTION_FLAG, 'd', & o->use_ssl }, + {0, "S3-libs3.host", "The host optionally followed by:port.", OPTION_OPTIONAL_ARGUMENT, 's', & o->host}, + {0, "S3-libs3.secret-key", "The secret key.", OPTION_OPTIONAL_ARGUMENT, 's', & o->secret_key}, + {0, "S3-libs3.access-key", "The access key.", OPTION_OPTIONAL_ARGUMENT, 's', & o->access_key}, + {0, "S3-libs3.region", "The region used for the authorization signature.", OPTION_OPTIONAL_ARGUMENT, 's', & o->authRegion}, + {0, "S3-libs3.location", "The bucket geographic location.", OPTION_OPTIONAL_ARGUMENT, 's', & o->locationConstraint}, + LAST_OPTION + }; + option_help * help = malloc(sizeof(h)); + memcpy(help, h, sizeof(h)); + return help; +} + +static void def_file_name(s3_options_t * o, char * out_name, char const * path){ + if(o->bucket_per_file){ + out_name += sprintf(out_name, "%s-", o->bucket_prefix_cur); + } + // duplicate path except "/" + while(*path != 0){ + char c = *path; + if(((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') )){ + *out_name = *path; + out_name++; + }else if(c >= 'A' && c <= 'Z'){ + *out_name = *path + ('a' - 'A'); + out_name++; + }else if(c == '/'){ + *out_name = '_'; + out_name++; + }else{ + // encode special characters + *out_name = 'a' + (c / 26); + out_name++; + *out_name = 'a' + (c % 26); + out_name++; + } + path++; + } + *out_name = 'b'; + out_name++; + *out_name = '\0'; +} + +static void def_bucket_name(s3_options_t * o, char * out_name, char const * path){ + // S3_MAX_BUCKET_NAME_SIZE + if(o->bucket_per_file){ + out_name += sprintf(out_name, "%s-", o->bucket_prefix_cur); + } + // duplicate path except "/" + while(*path != 0){ + char c = *path; + if(((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') )){ + *out_name = *path; + out_name++; + }else if(c >= 'A' && c <= 'Z'){ + *out_name = *path + ('a' - 'A'); + out_name++; + } + path++; + } + *out_name = '\0'; + + // S3Status S3_validate_bucket_name(const char *bucketName, S3UriStyle uriStyle); +} + +struct data_handling{ + IOR_size_t * buf; + int64_t size; +}; + +static S3Status s3status = S3StatusInterrupted; +static S3ErrorDetails s3error = {NULL}; + +static S3Status responsePropertiesCallback(const S3ResponseProperties *properties, void *callbackData){ + s3status = S3StatusOK; + return s3status; +} + +static void responseCompleteCallback(S3Status status, const S3ErrorDetails *error, void *callbackData) { + s3status = status; + if (error == NULL){ + s3error.message = NULL; + }else{ + s3error = *error; + } + return; +} + +#define CHECK_ERROR(p) \ +if (s3status != S3StatusOK){ \ + EWARNF("S3 %s:%d (path:%s) \"%s\": %s %s", __FUNCTION__, __LINE__, p, S3_get_status_name(s3status), s3error.message, s3error.furtherDetails ? 
s3error.furtherDetails : ""); \ +} + + +static S3ResponseHandler responseHandler = { &responsePropertiesCallback, &responseCompleteCallback }; + +static char * S3_getVersion() +{ + return "0.5"; +} + +static void S3_Fsync(aiori_fd_t *fd, aiori_mod_opt_t * options) +{ + // Not needed +} + + +static void S3_Sync(aiori_mod_opt_t * options) +{ + // Not needed +} + +static S3Status S3ListResponseCallback(const char *ownerId, const char *ownerDisplayName, const char *bucketName, int64_t creationDateSeconds, void *callbackData){ + uint64_t * count = (uint64_t*) callbackData; + *count += 1; + return S3StatusOK; +} + +static S3ListServiceHandler listhandler = { { &responsePropertiesCallback, &responseCompleteCallback }, & S3ListResponseCallback}; + +static int S3_statfs (const char * path, ior_aiori_statfs_t * stat, aiori_mod_opt_t * options){ + stat->f_bsize = 1; + stat->f_blocks = 1; + stat->f_bfree = 1; + stat->f_bavail = 1; + stat->f_ffree = 1; + s3_options_t * o = (s3_options_t*) options; + + // use the number of bucket as files + uint64_t buckets = 0; + S3_list_service(o->s3_protocol, o->access_key, o->secret_key, NULL, o->host, + o->authRegion, NULL, o->timeout, & listhandler, & buckets); + stat->f_files = buckets; + CHECK_ERROR(o->authRegion); + + return 0; +} + +static S3Status S3multipart_handler(const char *upload_id, void *callbackData){ + *((char const**)(callbackData)) = upload_id; + return S3StatusOK; +} + +static S3MultipartInitialHandler multipart_handler = { {&responsePropertiesCallback, &responseCompleteCallback }, & S3multipart_handler}; + +typedef struct{ + char * object; +} S3_fd_t; + +static int putObjectDataCallback(int bufferSize, char *buffer, void *callbackData){ + struct data_handling * dh = (struct data_handling *) callbackData; + const int64_t size = dh->size > bufferSize ? 
bufferSize : dh->size; + if(size == 0) return 0; + memcpy(buffer, dh->buf, size); + dh->buf = (IOR_size_t*) ((char*)(dh->buf) + size); + dh->size -= size; + + return size; +} + +static S3PutObjectHandler putObjectHandler = { { &responsePropertiesCallback, &responseCompleteCallback }, & putObjectDataCallback }; + +static aiori_fd_t *S3_Create(char *path, int iorflags, aiori_mod_opt_t * options) +{ + char * upload_id; + s3_options_t * o = (s3_options_t*) options; + char p[FILENAME_MAX]; + def_file_name(o, p, path); + + + if(iorflags & IOR_CREAT){ + if(o->bucket_per_file){ + S3_create_bucket(o->s3_protocol, o->access_key, o->secret_key, NULL, o->host, p, o->authRegion, S3CannedAclPrivate, o->locationConstraint, NULL, o->timeout, & responseHandler, NULL); + }else{ + struct data_handling dh = { .buf = NULL, .size = 0 }; + S3_put_object(& o->bucket_context, p, 0, NULL, NULL, o->timeout, &putObjectHandler, & dh); + } + if (s3status != S3StatusOK){ + CHECK_ERROR(p); + return NULL; + } + } + + S3_fd_t * fd = malloc(sizeof(S3_fd_t)); + fd->object = strdup(p); + return (aiori_fd_t*) fd; +} + + +static S3Status statResponsePropertiesCallback(const S3ResponseProperties *properties, void *callbackData){ + // check the size + struct stat *buf = (struct stat*) callbackData; + if(buf != NULL){ + buf->st_size = properties->contentLength; + buf->st_mtime = properties->lastModified; + } + s3status = S3StatusOK; + return s3status; +} + +static S3ResponseHandler statResponseHandler = { &statResponsePropertiesCallback, &responseCompleteCallback }; + +static aiori_fd_t *S3_Open(char *path, int flags, aiori_mod_opt_t * options) +{ + if(flags & IOR_CREAT){ + return S3_Create(path, flags, options); + } + if(flags & IOR_WRONLY){ + WARN("S3 IOR_WRONLY is not supported"); + } + if(flags & IOR_RDWR){ + WARN("S3 IOR_RDWR is not supported"); + } + + s3_options_t * o = (s3_options_t*) options; + char p[FILENAME_MAX]; + def_file_name(o, p, path); + + if (o->bucket_per_file){ + S3_test_bucket(o->s3_protocol, S3UriStylePath, o->access_key, o->secret_key, + NULL, o->host, p, o->authRegion, 0, NULL, + NULL, o->timeout, & responseHandler, NULL); + }else{ + struct stat buf; + S3_head_object(& o->bucket_context, p, NULL, o->timeout, & statResponseHandler, & buf); + } + if (s3status != S3StatusOK){ + CHECK_ERROR(p); + return NULL; + } + + S3_fd_t * fd = malloc(sizeof(S3_fd_t)); + fd->object = strdup(p); + return (aiori_fd_t*) fd; +} + +static S3Status getObjectDataCallback(int bufferSize, const char *buffer, void *callbackData){ + struct data_handling * dh = (struct data_handling *) callbackData; + const int64_t size = dh->size > bufferSize ? 
bufferSize : dh->size; + memcpy(dh->buf, buffer, size); + dh->buf = (IOR_size_t*) ((char*)(dh->buf) + size); + dh->size -= size; + + return S3StatusOK; +} + +static S3GetObjectHandler getObjectHandler = { { &responsePropertiesCallback, &responseCompleteCallback }, & getObjectDataCallback }; + +static IOR_offset_t S3_Xfer(int access, aiori_fd_t * afd, IOR_size_t * buffer, IOR_offset_t length, IOR_offset_t offset, aiori_mod_opt_t * options){ + S3_fd_t * fd = (S3_fd_t *) afd; + struct data_handling dh = { .buf = buffer, .size = length }; + + s3_options_t * o = (s3_options_t*) options; + char p[FILENAME_MAX]; + + if(o->bucket_per_file){ + o->bucket_context.bucketName = fd->object; + if(offset != 0){ + sprintf(p, "%ld-%ld", (long) offset, (long) length); + }else{ + sprintf(p, "0"); + } + }else{ + if(offset != 0){ + sprintf(p, "%s-%ld-%ld", fd->object, (long) offset, (long) length); + }else{ + sprintf(p, "%s", fd->object); + } + } + if(access == WRITE){ + S3_put_object(& o->bucket_context, p, length, NULL, NULL, o->timeout, &putObjectHandler, & dh); + }else{ + S3_get_object(& o->bucket_context, p, NULL, 0, length, NULL, o->timeout, &getObjectHandler, & dh); + } + if (! o->s3_compatible){ + CHECK_ERROR(p); + } + return length; +} + + +static void S3_Close(aiori_fd_t * afd, aiori_mod_opt_t * options) +{ + S3_fd_t * fd = (S3_fd_t *) afd; + free(fd->object); + free(afd); +} + +typedef struct { + int status; // do not reorder! + s3_options_t * o; + int truncated; + char const *nextMarker; +} s3_delete_req; + +S3Status list_delete_cb(int isTruncated, const char *nextMarker, int contentsCount, const S3ListBucketContent *contents, int commonPrefixesCount, const char **commonPrefixes, void *callbackData){ + s3_delete_req * req = (s3_delete_req*) callbackData; + for(int i=0; i < contentsCount; i++){ + S3_delete_object(& req->o->bucket_context, contents[i].key, NULL, req->o->timeout, & responseHandler, NULL); + } + req->truncated = isTruncated; + if(isTruncated){ + req->nextMarker = nextMarker; + } + return S3StatusOK; +} + +static S3ListBucketHandler list_delete_handler = {{&responsePropertiesCallback, &responseCompleteCallback }, list_delete_cb}; + +static void S3_Delete(char *path, aiori_mod_opt_t * options) +{ + s3_options_t * o = (s3_options_t*) options; + char p[FILENAME_MAX]; + def_file_name(o, p, path); + + + if(o->bucket_per_file){ + o->bucket_context.bucketName = p; + s3_delete_req req = {0, o, 0, NULL}; + do{ + S3_list_bucket(& o->bucket_context, NULL, req.nextMarker, NULL, INT_MAX, NULL, o->timeout, & list_delete_handler, & req); + }while(req.truncated); + S3_delete_bucket(o->s3_protocol, S3UriStylePath, o->access_key, o->secret_key, NULL, o->host, p, o->authRegion, NULL, o->timeout, & responseHandler, NULL); + }else{ + char * del_heuristics = getenv("S3LIB_DELETE_HEURISTICS"); + if(del_heuristics){ + struct stat buf; + S3_head_object(& o->bucket_context, p, NULL, o->timeout, & statResponseHandler, & buf); + if(s3status != S3StatusOK){ + // As the file does not exist, can return safely + CHECK_ERROR(p); + return; + } + int threshold = atoi(del_heuristics); + if (buf.st_size > threshold){ + // there may exist fragments, so try to delete them + s3_delete_req req = {0, o, 0, NULL}; + do{ + S3_list_bucket(& o->bucket_context, p, req.nextMarker, NULL, INT_MAX, NULL, o->timeout, & list_delete_handler, & req); + }while(req.truncated); + } + S3_delete_object(& o->bucket_context, p, NULL, o->timeout, & responseHandler, NULL); + }else{ + // Regular deletion, must remove all created fragments + 
S3_delete_object(& o->bucket_context, p, NULL, o->timeout, & responseHandler, NULL); + if(s3status != S3StatusOK){ + // As the file does not exist, can return savely + CHECK_ERROR(p); + return; + } + s3_delete_req req = {0, o, 0, NULL}; + do{ + S3_list_bucket(& o->bucket_context, p, req.nextMarker, NULL, INT_MAX, NULL, o->timeout, & list_delete_handler, & req); + }while(req.truncated); + } + } + CHECK_ERROR(p); +} + +static int S3_mkdir (const char *path, mode_t mode, aiori_mod_opt_t * options){ + s3_options_t * o = (s3_options_t*) options; + char p[FILENAME_MAX]; + def_bucket_name(o, p, path); + + + if (o->bucket_per_file){ + S3_create_bucket(o->s3_protocol, o->access_key, o->secret_key, NULL, o->host, p, o->authRegion, S3CannedAclPrivate, o->locationConstraint, NULL, o->timeout, & responseHandler, NULL); + CHECK_ERROR(p); + return 0; + }else{ + struct data_handling dh = { .buf = NULL, .size = 0 }; + S3_put_object(& o->bucket_context, p, 0, NULL, NULL, o->timeout, & putObjectHandler, & dh); + if (! o->s3_compatible){ + CHECK_ERROR(p); + } + return 0; + } +} + +static int S3_rmdir (const char *path, aiori_mod_opt_t * options){ + s3_options_t * o = (s3_options_t*) options; + char p[FILENAME_MAX]; + + def_bucket_name(o, p, path); + if (o->bucket_per_file){ + S3_delete_bucket(o->s3_protocol, S3UriStylePath, o->access_key, o->secret_key, NULL, o->host, p, o->authRegion, NULL, o->timeout, & responseHandler, NULL); + CHECK_ERROR(p); + return 0; + }else{ + S3_delete_object(& o->bucket_context, p, NULL, o->timeout, & responseHandler, NULL); + CHECK_ERROR(p); + return 0; + } +} + +static int S3_stat(const char *path, struct stat *buf, aiori_mod_opt_t * options){ + s3_options_t * o = (s3_options_t*) options; + char p[FILENAME_MAX]; + def_file_name(o, p, path); + memset(buf, 0, sizeof(struct stat)); + // TODO count the individual file fragment sizes together + if (o->bucket_per_file){ + S3_test_bucket(o->s3_protocol, S3UriStylePath, o->access_key, o->secret_key, + NULL, o->host, p, o->authRegion, 0, NULL, + NULL, o->timeout, & responseHandler, NULL); + }else{ + S3_head_object(& o->bucket_context, p, NULL, o->timeout, & statResponseHandler, buf); + } + if (s3status != S3StatusOK){ + return -1; + } + return 0; +} + +static int S3_access (const char *path, int mode, aiori_mod_opt_t * options){ + struct stat buf; + return S3_stat(path, & buf, options); +} + +static IOR_offset_t S3_GetFileSize(aiori_mod_opt_t * options, char *testFileName) +{ + struct stat buf; + if(S3_stat(testFileName, & buf, options) != 0) return -1; + return buf.st_size; +} + + +static int S3_check_params(aiori_mod_opt_t * options){ + s3_options_t * o = (s3_options_t*) options; + if(o->access_key == NULL){ + o->access_key = ""; + } + if(o->secret_key == NULL){ + o->secret_key = ""; + } + if(o->host == NULL){ + WARN("The S3 hostname should be specified"); + } + return 0; +} + +static void S3_init(aiori_mod_opt_t * options){ + s3_options_t * o = (s3_options_t*) options; + int ret = S3_initialize(NULL, S3_INIT_ALL, o->host); + if(ret != S3StatusOK) + FAIL("Could not initialize S3 library"); + + // create a bucket id based on access-key using a trivial checksumming + if(! 
o->dont_suffix){ + uint64_t c = 0; + char * r = o->access_key; + for(uint64_t pos = 1; (*r) != '\0' ; r++, pos*=10) { + c += (*r) * pos; + } + int count = snprintf(NULL, 0, "%s%lu", o->bucket_prefix, c % 1000); + char * old_prefix = o->bucket_prefix; + o->bucket_prefix_cur = malloc(count + 1); + sprintf(o->bucket_prefix_cur, "%s%lu", old_prefix, c % 1000); + }else{ + o->bucket_prefix_cur = o->bucket_prefix; + } + + // init bucket context + memset(& o->bucket_context, 0, sizeof(o->bucket_context)); + o->bucket_context.hostName = o->host; + o->bucket_context.bucketName = o->bucket_prefix_cur; + if (o->use_ssl){ + o->s3_protocol = S3ProtocolHTTPS; + }else{ + o->s3_protocol = S3ProtocolHTTP; + } + o->bucket_context.protocol = o->s3_protocol; + o->bucket_context.uriStyle = S3UriStylePath; + o->bucket_context.accessKeyId = o->access_key; + o->bucket_context.secretAccessKey = o->secret_key; + + if (! o->bucket_per_file && rank == 0){ + S3_create_bucket(o->s3_protocol, o->access_key, o->secret_key, NULL, o->host, o->bucket_context.bucketName, o->authRegion, S3CannedAclPrivate, o->locationConstraint, NULL, o->timeout, & responseHandler, NULL); + CHECK_ERROR(o->bucket_context.bucketName); + } + + if ( ret != S3StatusOK ){ + FAIL("S3 error %s", S3_get_status_name(ret)); + } +} + +static void S3_final(aiori_mod_opt_t * options){ + s3_options_t * o = (s3_options_t*) options; + if (! o->bucket_per_file && rank == 0){ + S3_delete_bucket(o->s3_protocol, S3UriStylePath, o->access_key, o->secret_key, NULL, o->host, o->bucket_context.bucketName, o->authRegion, NULL, o->timeout, & responseHandler, NULL); + CHECK_ERROR(o->bucket_context.bucketName); + } + + S3_deinitialize(); +} + + +ior_aiori_t S3_libS3_aiori = { + .name = "S3-libs3", + .name_legacy = NULL, + .create = S3_Create, + .open = S3_Open, + .xfer = S3_Xfer, + .close = S3_Close, + .delete = S3_Delete, + .get_version = S3_getVersion, + .fsync = S3_Fsync, + .xfer_hints = s3_xfer_hints, + .get_file_size = S3_GetFileSize, + .statfs = S3_statfs, + .mkdir = S3_mkdir, + .rmdir = S3_rmdir, + .access = S3_access, + .stat = S3_stat, + .initialize = S3_init, + .finalize = S3_final, + .get_options = S3_options, + .check_params = S3_check_params, + .sync = S3_Sync, + .enable_mdtest = true +}; diff --git a/src/aiori-aio.c b/src/aiori-aio.c new file mode 100644 index 0000000..f9ee475 --- /dev/null +++ b/src/aiori-aio.c @@ -0,0 +1,258 @@ +/* + This backend uses linux-aio + Requires: libaio-dev + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "ior.h" +#include "aiori.h" +#include "iordef.h" +#include "utilities.h" + +#include "aiori-POSIX.h" + +/************************** O P T I O N S *****************************/ +typedef struct{ + aiori_mod_opt_t * p; // posix options + int max_pending; + int granularity; // how frequent to submit, submit ever granularity elements + + // runtime data + io_context_t ioctx; // one context per fs + struct iocb ** iocbs; + int iocbs_pos; // how many are pending in iocbs + + int in_flight; // total pending ops + IOR_offset_t pending_bytes; // track pending IO volume for error checking +} aio_options_t; + +option_help * aio_options(aiori_mod_opt_t ** init_backend_options, aiori_mod_opt_t * init_values){ + aio_options_t * o = malloc(sizeof(aio_options_t)); + + if (init_values != NULL){ + memcpy(o, init_values, sizeof(aio_options_t)); + }else{ + memset(o, 0, sizeof(aio_options_t)); + o->max_pending = 128; + o->granularity = 
16;
+  }
+  option_help * p_help = POSIX_options((aiori_mod_opt_t**)& o->p, init_values == NULL ? NULL : (aiori_mod_opt_t*) ((aio_options_t*)init_values)->p);
+  *init_backend_options = (aiori_mod_opt_t*) o;
+
+  option_help h [] = {
+    {0, "aio.max-pending", "Max number of pending ops", OPTION_OPTIONAL_ARGUMENT, 'd', & o->max_pending},
+    {0, "aio.granularity", "How frequently to submit pending IOs; submit every *granularity* elements", OPTION_OPTIONAL_ARGUMENT, 'd', & o->granularity},
+    LAST_OPTION
+  };
+  option_help * help = option_merge(h, p_help);
+  free(p_help);
+  return help;
+}
+
+
+/************************** D E C L A R A T I O N S ***************************/
+
+typedef struct{
+  aiori_fd_t * pfd; // the underlying POSIX fd
+} aio_fd_t;
+
+/***************************** F U N C T I O N S ******************************/
+
+static aiori_xfer_hint_t * hints = NULL;
+
+static void aio_xfer_hints(aiori_xfer_hint_t * params){
+  hints = params;
+  POSIX_xfer_hints(params);
+}
+
+static void aio_initialize(aiori_mod_opt_t * param){
+  aio_options_t * o = (aio_options_t*) param;
+  if(io_setup(o->max_pending, & o->ioctx) != 0){
+    ERRF("Couldn't initialize io context %s", strerror(errno));
+  }
+  printf("%d\n", (o->max_pending));
+
+  o->iocbs = malloc(sizeof(struct iocb *) * o->granularity);
+  o->iocbs_pos = 0;
+  o->in_flight = 0;
+}
+
+static void aio_finalize(aiori_mod_opt_t * param){
+  aio_options_t * o = (aio_options_t*) param;
+  io_destroy(o->ioctx);
+}
+
+static int aio_check_params(aiori_mod_opt_t * param){
+  aio_options_t * o = (aio_options_t*) param;
+  POSIX_check_params((aiori_mod_opt_t*) o->p);
+  if(o->max_pending < 8){
+    ERRF("AIO max-pending = %d < 8", o->max_pending);
+  }
+  if(o->granularity > o->max_pending){
+    ERRF("AIO granularity must be <= max-pending, is %d > %d", o->granularity, o->max_pending);
+  }
+  return 0;
+}
+
+static aiori_fd_t *aio_Open(char *testFileName, int flags, aiori_mod_opt_t * param){
+  aio_options_t * o = (aio_options_t*) param;
+  aio_fd_t * fd = malloc(sizeof(aio_fd_t));
+  fd->pfd = POSIX_Open(testFileName, flags, o->p);
+  return (aiori_fd_t*) fd;
+}
+
+static aiori_fd_t *aio_create(char *testFileName, int flags, aiori_mod_opt_t * param){
+  aio_options_t * o = (aio_options_t*) param;
+  aio_fd_t * fd = malloc(sizeof(aio_fd_t));
+  fd->pfd = POSIX_Create(testFileName, flags, o->p);
+  return (aiori_fd_t*) fd;
+}
+
+/* called whenever the granularity is met */
+static void submit_pending(aio_options_t * o){
+  if(o->iocbs_pos == 0){
+    return;
+  }
+  int res;
+  res = io_submit(o->ioctx, o->iocbs_pos, o->iocbs);
+  //printf("AIO submit %d jobs\n", o->iocbs_pos);
+  if(res != o->iocbs_pos){
+    if(errno == EAGAIN){
+      ERR("AIO: errno == EAGAIN; this shouldn't happen");
+    }
+    ERRF("AIO: submitted %d, error: \"%s\" ; this shouldn't happen", res, strerror(errno));
+  }
+  o->iocbs_pos = 0;
+}
+
+/* complete all pending ops */
+static void complete_all(aio_options_t * o){
+  submit_pending(o);
+
+  struct io_event events[o->in_flight];
+  int num_events;
+  num_events = io_getevents(o->ioctx, o->in_flight, o->in_flight, events, NULL);
+  for (int i = 0; i < num_events; i++) {
+    struct io_event event = events[i];
+    if(event.res == -1){
+      ERR("AIO, error in io_getevents(), IO incomplete!");
+    }else{
+      o->pending_bytes -= event.res;
+    }
+    free(event.obj);
+  }
+  if(o->pending_bytes != 0){
+    ERRF("AIO, error in flushing data, pending bytes: %lld", o->pending_bytes);
+  }
+  o->in_flight = 0;
+}
+
+/* called if we must make *some* progress */
+static void process_some(aio_options_t * o){
+  if(o->in_flight == 0){
+    return;
+  }
+  struct io_event events[o->in_flight];
+  int num_events;
+  int mn = o->in_flight < o->granularity ? o->in_flight : o->granularity;
+  num_events = io_getevents(o->ioctx, mn, o->in_flight, events, NULL);
+  //printf("Completed: %d\n", num_events);
+  for (int i = 0; i < num_events; i++) {
+    struct io_event event = events[i];
+    if(event.res == -1){
+      ERR("AIO, error in io_getevents(), IO incomplete!");
+    }else{
+      o->pending_bytes -= event.res;
+    }
+    free(event.obj);
+  }
+  o->in_flight -= num_events;
+}
+
+static IOR_offset_t aio_Xfer(int access, aiori_fd_t *fd, IOR_size_t * buffer,
+  IOR_offset_t length, IOR_offset_t offset, aiori_mod_opt_t * param){
+  aio_options_t * o = (aio_options_t*) param;
+  aio_fd_t * afd = (aio_fd_t*) fd;
+
+  if(o->in_flight >= o->max_pending){
+    process_some(o);
+  }
+  o->pending_bytes += length;
+
+  struct iocb * iocb = malloc(sizeof(struct iocb));
+  if(access == WRITE){
+    io_prep_pwrite(iocb, *(int*)afd->pfd, buffer, length, offset);
+  }else{
+    io_prep_pread(iocb, *(int*)afd->pfd, buffer, length, offset);
+  }
+  o->iocbs[o->iocbs_pos] = iocb;
+  o->iocbs_pos++;
+  o->in_flight++;
+
+  if(o->iocbs_pos == o->granularity){
+    submit_pending(o);
+  }
+  return length;
+}
+
+static void aio_Close(aiori_fd_t *fd, aiori_mod_opt_t * param){
+  aio_options_t * o = (aio_options_t*) param;
+  aio_fd_t * afd = (aio_fd_t*) fd;
+  complete_all(o);
+  POSIX_Close(afd->pfd, o->p);
+}
+
+static void aio_Fsync(aiori_fd_t *fd, aiori_mod_opt_t * param){
+  aio_options_t * o = (aio_options_t*) param;
+  complete_all(o);
+  aio_fd_t * afd = (aio_fd_t*) fd;
+  POSIX_Fsync(afd->pfd, o->p);
+}
+
+static void aio_Sync(aiori_mod_opt_t * param){
+  aio_options_t * o = (aio_options_t*) param;
+  complete_all(o);
+  POSIX_Sync((aiori_mod_opt_t*) o->p);
+}
+
+
+
+ior_aiori_t aio_aiori = {
+  .name = "AIO",
+  .name_legacy = NULL,
+  .create = aio_create,
+  .get_options = aio_options,
+  .initialize = aio_initialize,
+  .finalize = aio_finalize,
+  .xfer_hints = aio_xfer_hints,
+  .fsync = aio_Fsync,
+  .open = aio_Open,
+  .xfer = aio_Xfer,
+  .close = aio_Close,
+  .sync = aio_Sync,
+  .check_params = aio_check_params,
+  .delete = POSIX_Delete,
+  .get_version = aiori_get_version,
+  .get_file_size = POSIX_GetFileSize,
+  .statfs = aiori_posix_statfs,
+  .mkdir = aiori_posix_mkdir,
+  .rmdir = aiori_posix_rmdir,
+  .access = aiori_posix_access,
+  .stat = aiori_posix_stat,
+  .enable_mdtest = true
+};
diff --git a/src/aiori-debug.h b/src/aiori-debug.h
new file mode 100644
index 0000000..32db28f
--- /dev/null
+++ b/src/aiori-debug.h
@@ -0,0 +1,131 @@
+#ifndef _AIORI_UTIL_H
+#define _AIORI_UTIL_H
+
+/* This file contains only debug relevant helpers */
+
+#include
+#include
+
+extern FILE * out_logfile;
+extern int verbose;              /* verbose output */
+
+#define FAIL(...) FailMessage(rank, ERROR_LOCATION, __VA_ARGS__)
+void FailMessage(int rank, const char *location, char *format, ...);
+
+/******************************** M A C R O S *********************************/
+
+/******************************************************************************/
+/*
+ * WARN_RESET will display a custom error message and set value to default
+ */
+#define WARN_RESET(MSG, TO_STRUCT_PTR, FROM_STRUCT_PTR, MEMBER) do { \
+        (TO_STRUCT_PTR)->MEMBER = (FROM_STRUCT_PTR)->MEMBER; \
+        if (rank == 0) { \
+                fprintf(out_logfile, "WARNING: %s. Using value of %d.\n", \
+                        MSG, (TO_STRUCT_PTR)->MEMBER); \
+        } \
+        fflush(out_logfile); \
+} while (0)
+
+extern int aiori_warning_as_errors;
+
+#define WARN(MSG) do { \
+        if(aiori_warning_as_errors){ ERR(MSG); } \
+        if (verbose > VERBOSE_2) { \
+                fprintf(out_logfile, "WARNING: %s, (%s:%d).\n", \
+                        MSG, __FILE__, __LINE__); \
+        } else { \
+                fprintf(out_logfile, "WARNING: %s.\n", MSG); \
+        } \
+        fflush(out_logfile); \
+} while (0)
+
+
+/* warning with format string and errno printed */
+#define EWARNF(FORMAT, ...) do { \
+        if(aiori_warning_as_errors){ ERRF(FORMAT, __VA_ARGS__); } \
+        if (verbose > VERBOSE_2) { \
+                fprintf(out_logfile, "WARNING: " FORMAT ", (%s:%d).\n", \
+                        __VA_ARGS__, __FILE__, __LINE__); \
+        } else { \
+                fprintf(out_logfile, "WARNING: " FORMAT "\n", \
+                        __VA_ARGS__); \
+        } \
+        fflush(out_logfile); \
+} while (0)
+
+
+/* warning with errno printed */
+#define EWARN(MSG) do { \
+        EWARNF("%s", MSG); \
+} while (0)
+
+
+/* info message with format string */
+#define EINFO(FORMAT, ...) do { \
+        if (verbose > VERBOSE_2) { \
+                fprintf(out_logfile, "INFO: " FORMAT ", (%s:%d).\n", \
+                        __VA_ARGS__, __FILE__, __LINE__); \
+        } else { \
+                fprintf(out_logfile, "INFO: " FORMAT "\n", \
+                        __VA_ARGS__); \
+        } \
+        fflush(out_logfile); \
+} while (0)
+
+/* display error message with format string and terminate execution */
+#define ERRF(FORMAT, ...) do { \
+        fprintf(out_logfile, "ERROR: " FORMAT ", (%s:%d)\n", \
+                __VA_ARGS__, __FILE__, __LINE__); \
+        fflush(out_logfile); \
+        MPI_Abort(MPI_COMM_WORLD, -1); \
+} while (0)
+
+
+/* display error message and terminate execution */
+#define ERR_ERRNO(MSG) do { \
+        ERRF("%s", MSG); \
+} while (0)
+
+
+/* display a simple error message (i.e. errno is not set) and terminate execution */
+#define ERR(MSG) do { \
+        fprintf(out_logfile, "ERROR: %s, (%s:%d)\n", \
+                MSG, __FILE__, __LINE__); \
+        fflush(out_logfile); \
+        MPI_Abort(MPI_COMM_WORLD, -1); \
+} while (0)
+
+
+/******************************************************************************/
+/*
+ * MPI_CHECKF will display a custom format string as well as an error string
+ * from the MPI_STATUS and then exit the program
+ */
+
+#define MPI_CHECKF(MPI_STATUS, FORMAT, ...)
do { \ + char resultString[MPI_MAX_ERROR_STRING]; \ + int resultLength; \ + int checkf_mpi_status = MPI_STATUS; \ + \ + if (checkf_mpi_status != MPI_SUCCESS) { \ + MPI_Error_string(checkf_mpi_status, resultString, &resultLength);\ + fprintf(out_logfile, "ERROR: " FORMAT ", MPI %s, (%s:%d)\n", \ + __VA_ARGS__, resultString, __FILE__, __LINE__); \ + fflush(out_logfile); \ + MPI_Abort(MPI_COMM_WORLD, -1); \ + } \ +} while(0) + + +/******************************************************************************/ +/* + * MPI_CHECK will display a custom error message as well as an error string + * from the MPI_STATUS and then exit the program + */ + +#define MPI_CHECK(MPI_STATUS, MSG) do { \ + MPI_CHECKF(MPI_STATUS, "%s", MSG); \ +} while(0) + +#endif diff --git a/src/aiori.c b/src/aiori.c index 303f367..6c9a971 100644 --- a/src/aiori.c +++ b/src/aiori.c @@ -42,11 +42,13 @@ ior_aiori_t *available_aiori[] = { #ifdef USE_POSIX_AIORI &posix_aiori, #endif +#ifdef USE_AIO_AIORI + &aio_aiori, +#endif #ifdef USE_PMDK_AIORI &pmdk_aiori, #endif #ifdef USE_DAOS_AIORI - &daos_aiori, &dfs_aiori, #endif & dummy_aiori, @@ -68,8 +70,11 @@ ior_aiori_t *available_aiori[] = { #ifdef USE_MMAP_AIORI &mmap_aiori, #endif -#ifdef USE_S3_AIORI - &s3_aiori, +#ifdef USE_S3_LIBS3_AIORI + &S3_libS3_aiori, +#endif +#ifdef USE_S3_4C_AIORI + &s3_4c_aiori, &s3_plus_aiori, &s3_emc_aiori, #endif @@ -100,6 +105,7 @@ void * airoi_update_module_options(const ior_aiori_t * backend, options_all_t * } options_all_t * airoi_create_all_module_options(option_help * global_options){ + if(! out_logfile) out_logfile = stdout; int airoi_c = aiori_count(); options_all_t * opt = malloc(sizeof(options_all_t)); opt->module_count = airoi_c + 1; @@ -122,6 +128,8 @@ void aiori_supported_apis(char * APIs, char * APIs_legacy, enum bench_type type) { ior_aiori_t **tmp = available_aiori; char delimiter = ' '; + *APIs = 0; + *APIs_legacy = 0; while (*tmp != NULL) { @@ -130,7 +138,6 @@ void aiori_supported_apis(char * APIs, char * APIs_legacy, enum bench_type type) tmp++; continue; } - if (delimiter == ' ') { APIs += sprintf(APIs, "%s", (*tmp)->name); @@ -142,6 +149,7 @@ void aiori_supported_apis(char * APIs, char * APIs_legacy, enum bench_type type) if ((*tmp)->name_legacy != NULL) APIs_legacy += sprintf(APIs_legacy, "%c%s", delimiter, (*tmp)->name_legacy); + tmp++; } } diff --git a/src/aiori.h b/src/aiori.h index ad10e4d..6f78e5f 100755 --- a/src/aiori.h +++ b/src/aiori.h @@ -15,16 +15,11 @@ #ifndef _AIORI_H #define _AIORI_H -#include - -#ifndef MPI_FILE_NULL -# include -#endif /* not MPI_FILE_NULL */ - #include #include #include "iordef.h" /* IOR Definitions */ +#include "aiori-debug.h" #include "option.h" /*************************** D E F I N I T I O N S ****************************/ @@ -81,9 +76,9 @@ typedef struct aiori_xfer_hint_t{ } aiori_xfer_hint_t; /* this is a dummy structure to create some type safety */ -typedef struct aiori_mod_opt_t{ +struct aiori_mod_opt_t{ void * dummy; -} aiori_mod_opt_t; +}; typedef struct aiori_fd_t{ void * dummy; @@ -100,12 +95,12 @@ typedef struct ior_aiori { */ void (*xfer_hints)(aiori_xfer_hint_t * params); IOR_offset_t (*xfer)(int access, aiori_fd_t *, IOR_size_t *, - IOR_offset_t size, IOR_offset_t offset, aiori_mod_opt_t *); - void (*close)(aiori_fd_t *, aiori_mod_opt_t *); - void (*delete)(char *, aiori_mod_opt_t *); + IOR_offset_t size, IOR_offset_t offset, aiori_mod_opt_t * module_options); + void (*close)(aiori_fd_t *, aiori_mod_opt_t * module_options); + void (*delete)(char *, aiori_mod_opt_t * 
module_options); char* (*get_version)(void); - void (*fsync)(aiori_fd_t *, aiori_mod_opt_t *); - IOR_offset_t (*get_file_size)(aiori_mod_opt_t * module_options, MPI_Comm, char *); + void (*fsync)(aiori_fd_t *, aiori_mod_opt_t * module_options); + IOR_offset_t (*get_file_size)(aiori_mod_opt_t * module_options, char * filename); int (*statfs) (const char *, ior_aiori_statfs_t *, aiori_mod_opt_t * module_options); int (*mkdir) (const char *path, mode_t mode, aiori_mod_opt_t * module_options); int (*rmdir) (const char *path, aiori_mod_opt_t * module_options); @@ -113,6 +108,7 @@ typedef struct ior_aiori { int (*stat) (const char *path, struct stat *buf, aiori_mod_opt_t * module_options); void (*initialize)(aiori_mod_opt_t * options); /* called once per program before MPI is started */ void (*finalize)(aiori_mod_opt_t * options); /* called once per program after MPI is shutdown */ + int (*rename) (const char *oldpath, const char *newpath, aiori_mod_opt_t * module_options); option_help * (*get_options)(aiori_mod_opt_t ** init_backend_options, aiori_mod_opt_t* init_values); /* initializes the backend options as well and returns the pointer to the option help structure */ int (*check_params)(aiori_mod_opt_t *); /* check if the provided module_optionseters for the given test and the module options are correct, if they aren't print a message and exit(1) or return 1*/ void (*sync)(aiori_mod_opt_t * ); /* synchronize every pending operation for this storage */ @@ -125,6 +121,7 @@ enum bench_type { }; extern ior_aiori_t dummy_aiori; +extern ior_aiori_t aio_aiori; extern ior_aiori_t daos_aiori; extern ior_aiori_t dfs_aiori; extern ior_aiori_t hdf5_aiori; @@ -135,7 +132,8 @@ extern ior_aiori_t ncmpi_aiori; extern ior_aiori_t posix_aiori; extern ior_aiori_t pmdk_aiori; extern ior_aiori_t mmap_aiori; -extern ior_aiori_t s3_aiori; +extern ior_aiori_t S3_libS3_aiori; +extern ior_aiori_t s3_4c_aiori; extern ior_aiori_t s3_plus_aiori; extern ior_aiori_t s3_emc_aiori; extern ior_aiori_t rados_aiori; @@ -158,20 +156,12 @@ int aiori_posix_mkdir (const char *path, mode_t mode, aiori_mod_opt_t * module_o int aiori_posix_rmdir (const char *path, aiori_mod_opt_t * module_options); int aiori_posix_access (const char *path, int mode, aiori_mod_opt_t * module_options); int aiori_posix_stat (const char *path, struct stat *buf, aiori_mod_opt_t * module_options); -void aiori_posix_xfer_hints(aiori_xfer_hint_t * params); - -aiori_fd_t *POSIX_Create(char *testFileName, int flags, aiori_mod_opt_t * module_options); -int POSIX_Mknod(char *testFileName); -aiori_fd_t *POSIX_Open(char *testFileName, int flags, aiori_mod_opt_t * module_options); -IOR_offset_t POSIX_GetFileSize(aiori_mod_opt_t * test, MPI_Comm testComm, char *testFileName); -void POSIX_Delete(char *testFileName, aiori_mod_opt_t * module_options); -void POSIX_Close(aiori_fd_t *fd, aiori_mod_opt_t * module_options); -option_help * POSIX_options(aiori_mod_opt_t ** init_backend_options, aiori_mod_opt_t * init_values); -/* NOTE: these 3 MPI-IO functions are exported for reuse by HDF5/PNetCDF */ +/* NOTE: these 4 MPI-IO functions are exported for reuse by HDF5/PNetCDF */ void MPIIO_Delete(char *testFileName, aiori_mod_opt_t * module_options); -IOR_offset_t MPIIO_GetFileSize(aiori_mod_opt_t * options, MPI_Comm testComm, char *testFileName); -int MPIIO_Access(const char *, int, aiori_mod_opt_t *); +IOR_offset_t MPIIO_GetFileSize(aiori_mod_opt_t * options, char *testFileName); +int MPIIO_Access(const char *, int, aiori_mod_opt_t * module_options); +void 
MPIIO_xfer_hints(aiori_xfer_hint_t * params); #endif /* not _AIORI_H */ diff --git a/src/ior-internal.h b/src/ior-internal.h index fa7212e..c0af544 100644 --- a/src/ior-internal.h +++ b/src/ior-internal.h @@ -25,8 +25,7 @@ void PrintTestEnds(); void PrintTableHeader(); /* End of ior-output */ -IOR_offset_t *GetOffsetArraySequential(IOR_param_t * test, int pretendRank); -IOR_offset_t *GetOffsetArrayRandom(IOR_param_t * test, int pretendRank, int access); +IOR_offset_t *GetOffsetArrayRandom(IOR_param_t * test, int pretendRank, IOR_offset_t * out_count); struct results { double min; diff --git a/src/ior-output.c b/src/ior-output.c index b890cd9..1b21a00 100644 --- a/src/ior-output.c +++ b/src/ior-output.c @@ -20,6 +20,8 @@ void PrintTableHeader(){ fprintf(out_resultfile, "\n"); fprintf(out_resultfile, "access bw(MiB/s) IOPS Latency(s) block(KiB) xfer(KiB) open(s) wr/rd(s) close(s) total(s) iter\n"); fprintf(out_resultfile, "------ --------- ---- ---------- ---------- --------- -------- -------- -------- -------- ----\n"); + }else if(outputFormat == OUTPUT_CSV){ + fprintf(out_resultfile, "access,bw(MiB/s),IOPS,Latency,block(KiB),xfer(KiB),open(s),wr/rd(s),close(s),total(s),numTasks,iter\n"); } } @@ -45,8 +47,6 @@ static void PrintKeyValStart(char * key){ } if(outputFormat == OUTPUT_JSON){ fprintf(out_resultfile, "\"%s\": \"", key); - }else if(outputFormat == OUTPUT_CSV){ - } } @@ -84,7 +84,7 @@ static void PrintKeyVal(char * key, char * value){ if(outputFormat == OUTPUT_JSON){ fprintf(out_resultfile, "\"%s\": \"%s\"", key, value); }else if(outputFormat == OUTPUT_CSV){ - fprintf(out_resultfile, "%s", value); + fprintf(out_resultfile, "%s,", value); } } @@ -98,7 +98,7 @@ static void PrintKeyValDouble(char * key, double value){ if(outputFormat == OUTPUT_JSON){ fprintf(out_resultfile, "\"%s\": %.4f", key, value); }else if(outputFormat == OUTPUT_CSV){ - fprintf(out_resultfile, "%.4f", value); + fprintf(out_resultfile, "%.4f,", value); } } @@ -113,7 +113,7 @@ static void PrintKeyValInt(char * key, int64_t value){ if(outputFormat == OUTPUT_JSON){ fprintf(out_resultfile, "\"%s\": %lld", key, (long long) value); }else if(outputFormat == OUTPUT_CSV){ - fprintf(out_resultfile, "%lld", (long long) value); + fprintf(out_resultfile, "%lld,", (long long) value); } } @@ -203,13 +203,16 @@ void PrintRepeatEnd(){ void PrintRepeatStart(){ if (rank != 0) return; - if( outputFormat == OUTPUT_DEFAULT){ + if(outputFormat == OUTPUT_DEFAULT){ return; } PrintArrayStart(); } void PrintTestEnds(){ + if (outputFormat == OUTPUT_CSV){ + return; + } if (rank != 0 || verbose <= VERBOSE_0) { PrintEndSection(); return; @@ -246,7 +249,21 @@ void PrintReducedResult(IOR_test_t *test, int access, double bw, double iops, do PrintKeyValDouble("closeTime", diff_subset[2]); PrintKeyValDouble("totalTime", totalTime); PrintEndSection(); + }else if (outputFormat == OUTPUT_CSV){ + PrintKeyVal("access", access == WRITE ? 
"write" : "read"); + PrintKeyValDouble("bwMiB", bw / MEBIBYTE); + PrintKeyValDouble("iops", iops); + PrintKeyValDouble("latency", latency); + PrintKeyValDouble("blockKiB", (double)test->params.blockSize / KIBIBYTE); + PrintKeyValDouble("xferKiB", (double)test->params.transferSize / KIBIBYTE); + PrintKeyValDouble("openTime", diff_subset[0]); + PrintKeyValDouble("wrRdTime", diff_subset[1]); + PrintKeyValDouble("closeTime", diff_subset[2]); + PrintKeyValDouble("totalTime", totalTime); + PrintKeyValInt("Numtasks", test->params.numTasks); + fprintf(out_resultfile, "%d\n", rep); } + fflush(out_resultfile); } @@ -258,6 +275,10 @@ void PrintHeader(int argc, char **argv) if (rank != 0) return; + if (outputFormat == OUTPUT_CSV){ + return; + } + PrintStartSection(); if (outputFormat != OUTPUT_DEFAULT){ PrintKeyVal("Version", META_VERSION); @@ -284,23 +305,6 @@ void PrintHeader(int argc, char **argv) } PrintKeyValEnd(); } - -#ifdef _NO_MPI_TIMER - if (verbose >= VERBOSE_2) - fprintf(out_logfile, "Using unsynchronized POSIX timer\n"); -#else /* not _NO_MPI_TIMER */ - if (MPI_WTIME_IS_GLOBAL) { - if (verbose >= VERBOSE_2) - fprintf(out_logfile, "Using synchronized MPI timer\n"); - } else { - if (verbose >= VERBOSE_2) - fprintf(out_logfile, "Using unsynchronized MPI timer\n"); - } -#endif /* _NO_MPI_TIMER */ - if (verbose >= VERBOSE_1) { - fprintf(out_logfile, "Start time skew across all tasks: %.02f sec\n", - wall_clock_deviation); - } if (verbose >= VERBOSE_3) { /* show env */ fprintf(out_logfile, "STARTING ENVIRON LOOP\n"); for (i = 0; environ[i] != NULL; i++) { @@ -319,11 +323,16 @@ void PrintHeader(int argc, char **argv) */ void ShowTestStart(IOR_param_t *test) { + if (outputFormat == OUTPUT_CSV){ + return; + } PrintStartSection(); PrintKeyValInt("TestID", test->id); PrintKeyVal("StartTime", CurrentTimeString()); - ShowFileSystemSize(test); + char filename[MAX_PATHLEN]; + GetTestFileName(filename, test); + ShowFileSystemSize(filename, test->backend, test->backend_options); if (verbose >= VERBOSE_3 || outputFormat == OUTPUT_JSON) { char* data_packets[] = {"g","t","o","i"}; @@ -362,19 +371,19 @@ void ShowTestStart(IOR_param_t *test) PrintKeyValInt("randomOffset", test->randomOffset); PrintKeyValInt("checkWrite", test->checkWrite); PrintKeyValInt("checkRead", test->checkRead); - PrintKeyValInt("storeFileOffset", test->storeFileOffset); + PrintKeyValInt("dataPacketType", test->dataPacketType); PrintKeyValInt("keepFile", test->keepFile); PrintKeyValInt("keepFileWithError", test->keepFileWithError); - PrintKeyValInt("quitOnError", test->quitOnError); + PrintKeyValInt("warningAsErrors", test->warningAsErrors); PrintKeyValInt("verbose", verbose); PrintKeyVal("data packet type", data_packets[test->dataPacketType]); PrintKeyValInt("setTimeStampSignature/incompressibleSeed", test->setTimeStampSignature); /* Seed value was copied into setTimeStampSignature as well */ PrintKeyValInt("collective", test->collective); PrintKeyValInt("segmentCount", test->segmentCount); - #ifdef HAVE_GPFS_FCNTL_H - PrintKeyValInt("gpfsHintAccess", test->gpfs_hint_access); - PrintKeyValInt("gpfsReleaseToken", test->gpfs_release_token); - #endif + //#ifdef HAVE_GPFS_FCNTL_H + //PrintKeyValInt("gpfsHintAccess", test->gpfs_hint_access); + //PrintKeyValInt("gpfsReleaseToken", test->gpfs_release_token); + //#endif PrintKeyValInt("transferSize", test->transferSize); PrintKeyValInt("blockSize", test->blockSize); PrintEndSection(); @@ -401,6 +410,9 @@ void ShowTestEnd(IOR_test_t *tptr){ */ void ShowSetup(IOR_param_t *params) { + if 
(outputFormat == OUTPUT_CSV){ + return; + } if (params->debug) { fprintf(out_logfile, "\n*** DEBUG MODE ***\n"); fprintf(out_logfile, "*** %s ***\n\n", params->debug); @@ -594,9 +606,6 @@ static void PrintLongSummaryOneOperation(IOR_test_t *test, const int access) PrintKeyValInt("taskPerNodeOffset", params->taskPerNodeOffset); PrintKeyValInt("reorderTasksRandom", params->reorderTasksRandom); PrintKeyValInt("reorderTasksRandomSeed", params->reorderTasksRandomSeed); - PrintKeyValInt("segmentCount", params->segmentCount); - PrintKeyValInt("blockSize", params->blockSize); - PrintKeyValInt("transferSize", params->transferSize); PrintKeyValDouble("bwMaxMIB", bw->max / MEBIBYTE); PrintKeyValDouble("bwMinMIB", bw->min / MEBIBYTE); PrintKeyValDouble("bwMeanMIB", bw->mean / MEBIBYTE); @@ -612,8 +621,6 @@ static void PrintLongSummaryOneOperation(IOR_test_t *test, const int access) } PrintKeyValDouble("xsizeMiB", (double) point->aggFileSizeForBW / MEBIBYTE); PrintEndSection(); - }else if (outputFormat == OUTPUT_CSV){ - } fflush(out_resultfile); @@ -638,7 +645,7 @@ void PrintLongSummaryHeader() if (rank != 0 || verbose <= VERBOSE_0) return; if(outputFormat != OUTPUT_DEFAULT){ - return; + return; } fprintf(out_resultfile, "\n"); @@ -665,8 +672,6 @@ void PrintLongSummaryAllTests(IOR_test_t *tests_head) fprintf(out_resultfile, "Summary of all tests:"); }else if (outputFormat == OUTPUT_JSON){ PrintNamedArrayStart("summary"); - }else if (outputFormat == OUTPUT_CSV){ - } PrintLongSummaryHeader(); diff --git a/src/ior.c b/src/ior.c index 08f95ef..cf96cd8 100755 --- a/src/ior.c +++ b/src/ior.c @@ -33,6 +33,10 @@ # include /* uname() */ #endif +#ifdef HAVE_CUDA +#include +#endif + #include #include "ior.h" @@ -51,9 +55,9 @@ static const ior_aiori_t *backend; static void DestroyTests(IOR_test_t *tests_head); static char *PrependDir(IOR_param_t *, char *); static char **ParseFileName(char *, int *); -static void InitTests(IOR_test_t * , MPI_Comm); +static void InitTests(IOR_test_t *); static void TestIoSys(IOR_test_t *); -static void ValidateTests(IOR_param_t *); +static void ValidateTests(IOR_param_t * params, MPI_Comm com); static IOR_offset_t WriteOrRead(IOR_param_t *test, IOR_results_t *results, aiori_fd_t *fd, const int access, IOR_io_buffers *ioBuffers); @@ -78,17 +82,58 @@ static void ior_set_xfer_hints(IOR_param_t * p){ } } -static void test_initialize(IOR_test_t * test){ +int aiori_warning_as_errors = 0; + +/* + Returns 1 if the process participates in the test + */ +static int test_initialize(IOR_test_t * test){ + int range[3]; + IOR_param_t *params = &test->params; + MPI_Group orig_group, new_group; + + /* set up communicator for test */ + MPI_CHECK(MPI_Comm_group(params->mpi_comm_world, &orig_group), + "MPI_Comm_group() error"); + range[0] = 0; /* first rank */ + range[1] = params->numTasks - 1; /* last rank */ + range[2] = 1; /* stride */ + MPI_CHECK(MPI_Group_range_incl(orig_group, 1, &range, &new_group), + "MPI_Group_range_incl() error"); + MPI_CHECK(MPI_Comm_create(params->mpi_comm_world, new_group, & params->testComm), + "MPI_Comm_create() error"); + MPI_CHECK(MPI_Group_free(&orig_group), "MPI_Group_Free() error"); + MPI_CHECK(MPI_Group_free(&new_group), "MPI_Group_Free() error"); + + + if (params->testComm == MPI_COMM_NULL) { + /* tasks not in the group do not participate in this test, this matches the processes in test_finalize() that participate */ + MPI_CHECK(MPI_Barrier(params->mpi_comm_world), "barrier error"); + return 0; + } + + /* Setup global variables */ + testComm = params->testComm; 
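/*
 * A standalone sketch of the sub-communicator pattern used in test_initialize()
 * above, assuming only standard MPI. The function name make_participant_comm and
 * its arguments are placeholders for this illustration; ranks outside [0, n-1]
 * receive MPI_COMM_NULL and are expected to skip the test, exactly as above.
 */
#include <mpi.h>

static MPI_Comm make_participant_comm(MPI_Comm parent, int n)
{
        MPI_Group parent_group, sub_group;
        MPI_Comm sub_comm;
        int range[1][3] = { { 0, n - 1, 1 } };   /* first rank, last rank, stride */

        MPI_Comm_group(parent, &parent_group);
        MPI_Group_range_incl(parent_group, 1, range, &sub_group);
        /* collective over the parent communicator: every rank must call it,
         * non-members come back with MPI_COMM_NULL */
        MPI_Comm_create(parent, sub_group, &sub_comm);
        MPI_Group_free(&parent_group);
        MPI_Group_free(&sub_group);
        return sub_comm;
}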
verbose = test->params.verbose; backend = test->params.backend; + +#ifdef HAVE_CUDA + cudaError_t cret = cudaSetDevice(test->params.gpuID); + if(cret != cudaSuccess){ + EWARNF("cudaSetDevice(%d) error: %s", test->params.gpuID, cudaGetErrorString(cret)); + } +#endif + if(backend->initialize){ backend->initialize(test->params.backend_options); } ior_set_xfer_hints(& test->params); + aiori_warning_as_errors = test->params.warningAsErrors; if (rank == 0 && verbose >= VERBOSE_0) { ShowTestStart(& test->params); } + return 1; } static void test_finalize(IOR_test_t * test){ @@ -96,6 +141,8 @@ static void test_finalize(IOR_test_t * test){ if(backend->finalize){ backend->finalize(test->params.backend_options); } + MPI_CHECK(MPI_Barrier(test->params.mpi_comm_world), "barrier error"); + MPI_CHECK(MPI_Comm_free(& testComm), "MPI_Comm_free() error"); } @@ -104,20 +151,19 @@ IOR_test_t * ior_run(int argc, char **argv, MPI_Comm world_com, FILE * world_out IOR_test_t *tptr; out_logfile = world_out; out_resultfile = world_out; - mpi_comm_world = world_com; - MPI_CHECK(MPI_Comm_rank(mpi_comm_world, &rank), "cannot get rank"); + MPI_CHECK(MPI_Comm_rank(world_com, &rank), "cannot get rank"); /* setup tests, and validate parameters */ - tests_head = ParseCommandLine(argc, argv); - InitTests(tests_head, world_com); - verbose = tests_head->params.verbose; + tests_head = ParseCommandLine(argc, argv, world_com); + InitTests(tests_head); PrintHeader(argc, argv); /* perform each test */ for (tptr = tests_head; tptr != NULL; tptr = tptr->next) { - test_initialize(tptr); + int participate = test_initialize(tptr); + if( ! participate ) continue; totalErrorCount = 0; TestIoSys(tptr); tptr->results->errors = totalErrorCount; @@ -145,27 +191,26 @@ int ior_main(int argc, char **argv) /* * check -h option from commandline without starting MPI; */ - tests_head = ParseCommandLine(argc, argv); + tests_head = ParseCommandLine(argc, argv, MPI_COMM_WORLD); /* start the MPI code */ MPI_CHECK(MPI_Init(&argc, &argv), "cannot initialize MPI"); - mpi_comm_world = MPI_COMM_WORLD; - MPI_CHECK(MPI_Comm_rank(mpi_comm_world, &rank), "cannot get rank"); + MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &rank), "cannot get rank"); /* set error-handling */ /*MPI_CHECK(MPI_Errhandler_set(mpi_comm_world, MPI_ERRORS_RETURN), "cannot set errhandler"); */ /* setup tests, and validate parameters */ - InitTests(tests_head, mpi_comm_world); - verbose = tests_head->params.verbose; + InitTests(tests_head); PrintHeader(argc, argv); /* perform each test */ for (tptr = tests_head; tptr != NULL; tptr = tptr->next) { - test_initialize(tptr); + int participate = test_initialize(tptr); + if( ! participate ) continue; // This is useful for trapping a running MPI process. While // this is sleeping, run the script 'testing/hdfs/gdb.attach' @@ -200,11 +245,9 @@ int ior_main(int argc, char **argv) /* * Initialize an IOR_param_t structure to the defaults */ -void init_IOR_Param_t(IOR_param_t * p) +void init_IOR_Param_t(IOR_param_t * p, MPI_Comm com) { const char *default_aiori = aiori_default (); - char *hdfs_user; - assert (NULL != default_aiori); memset(p, 0, sizeof(IOR_param_t)); @@ -232,20 +275,10 @@ void init_IOR_Param_t(IOR_param_t * p) p->transferSize = 262144; p->randomSeed = -1; p->incompressibleSeed = 573; - p->testComm = mpi_comm_world; - - hdfs_user = getenv("USER"); - if (!hdfs_user) - hdfs_user = ""; - p->hdfs_user = strdup(hdfs_user); - p->hdfs_name_node = "default"; - p->hdfs_name_node_port = 0; /* ??? 
*/ - p->hdfs_fs = NULL; - p->hdfs_replicas = 0; /* invokes the default */ - p->hdfs_block_size = 0; + p->testComm = com; // this com might change for smaller tests + p->mpi_comm_world = com; p->URI = NULL; - p->part_number = 0; } static void @@ -257,7 +290,7 @@ DisplayOutliers(int numTasks, double sum, mean, sqrDiff, var, sd; /* for local timerVal, don't compensate for wall clock delta */ - timerVal += wall_clock_delta; + //timerVal += wall_clock_delta; MPI_CHECK(MPI_Allreduce (&timerVal, &sum, 1, MPI_DOUBLE, MPI_SUM, testComm), @@ -281,10 +314,8 @@ DisplayOutliers(int numTasks, if (ret != 0) strcpy(hostname, "unknown"); - fprintf(out_logfile, "WARNING: for %s, task %d, %s %s is %f\n", - hostname, rank, accessString, timeString, timerVal); - fprintf(out_logfile, " (mean=%f, stddev=%f)\n", mean, sd); - fflush(out_logfile); + EWARNF("for %s, task %d, %s %s is %f (mean=%f, stddev=%f)\n", + hostname, rank, accessString, timeString, timerVal, mean, sd); } } @@ -314,37 +345,54 @@ CheckForOutliers(IOR_param_t *test, const double *timer, const int access) * Check if actual file size equals expected size; if not use actual for * calculating performance rate. */ -static void CheckFileSize(IOR_test_t *test, IOR_offset_t dataMoved, int rep, - const int access) +static void CheckFileSize(IOR_test_t *test, char * testFilename, IOR_offset_t dataMoved, int rep, const int access) { IOR_param_t *params = &test->params; IOR_results_t *results = test->results; IOR_point_t *point = (access == WRITE) ? &results[rep].write : &results[rep].read; + /* get the size of the file */ + IOR_offset_t aggFileSizeFromStat, tmpMin, tmpMax, tmpSum; + aggFileSizeFromStat = backend->get_file_size(params->backend_options, testFilename); + + if (params->hints.filePerProc == TRUE) { + MPI_CHECK(MPI_Allreduce(&aggFileSizeFromStat, &tmpSum, 1, + MPI_LONG_LONG_INT, MPI_SUM, testComm), + "cannot reduce total data moved"); + aggFileSizeFromStat = tmpSum; + } else { + MPI_CHECK(MPI_Allreduce(&aggFileSizeFromStat, &tmpMin, 1, + MPI_LONG_LONG_INT, MPI_MIN, testComm), + "cannot reduce total data moved"); + MPI_CHECK(MPI_Allreduce(&aggFileSizeFromStat, &tmpMax, 1, + MPI_LONG_LONG_INT, MPI_MAX, testComm), + "cannot reduce total data moved"); + if (tmpMin != tmpMax) { + if (rank == 0) { + WARN("inconsistent file size by different tasks"); + } + /* incorrect, but now consistent across tasks */ + aggFileSizeFromStat = tmpMin; + } + } + point->aggFileSizeFromStat = aggFileSizeFromStat; + MPI_CHECK(MPI_Allreduce(&dataMoved, &point->aggFileSizeFromXfer, 1, MPI_LONG_LONG_INT, MPI_SUM, testComm), "cannot total data moved"); - if (strcasecmp(params->api, "HDF5") != 0 && strcasecmp(params->api, "NCMPI") != 0 && - strcasecmp(params->api, "DAOS") != 0) { + if (strcasecmp(params->api, "HDF5") != 0 && strcasecmp(params->api, "NCMPI") != 0) { if (verbose >= VERBOSE_0 && rank == 0) { if ((params->expectedAggFileSize != point->aggFileSizeFromXfer) || (point->aggFileSizeFromStat != point->aggFileSizeFromXfer)) { - fprintf(out_logfile, - "WARNING: Expected aggregate file size = %lld.\n", - (long long) params->expectedAggFileSize); - fprintf(out_logfile, - "WARNING: Stat() of aggregate file size = %lld.\n", - (long long) point->aggFileSizeFromStat); - fprintf(out_logfile, - "WARNING: Using actual aggregate bytes moved = %lld.\n", - (long long) point->aggFileSizeFromXfer); + EWARNF("Expected aggregate file size = %lld", (long long) params->expectedAggFileSize); + EWARNF("Stat() of aggregate file size = %lld", (long long) point->aggFileSizeFromStat); + 
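/*
 * A minimal sketch of the consistency check performed in CheckFileSize() above,
 * assuming every rank stat()s the same shared file. consistent_file_size is an
 * illustrative name, not part of the patch: when the per-rank sizes disagree,
 * the minimum is returned so that all ranks continue with the same value.
 */
#include <mpi.h>

static long long consistent_file_size(long long my_stat_size, MPI_Comm comm, int *inconsistent)
{
        long long lo, hi;

        MPI_Allreduce(&my_stat_size, &lo, 1, MPI_LONG_LONG_INT, MPI_MIN, comm);
        MPI_Allreduce(&my_stat_size, &hi, 1, MPI_LONG_LONG_INT, MPI_MAX, comm);
        *inconsistent = (lo != hi);
        return lo;      /* identical on every rank, even when the stat() results differ */
}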
EWARNF("Using actual aggregate bytes moved = %lld", (long long) point->aggFileSizeFromXfer); if(params->deadlineForStonewalling){ - fprintf(out_logfile, - "WARNING: maybe caused by deadlineForStonewalling\n"); + EWARN("Maybe caused by deadlineForStonewalling"); } } } @@ -358,101 +406,10 @@ static void CheckFileSize(IOR_test_t *test, IOR_offset_t dataMoved, int rep, * difference in buffers and returns total errors counted. */ static size_t -CompareBuffers(void *expectedBuffer, - void *unknownBuffer, - size_t size, - IOR_offset_t transferCount, IOR_param_t *test, int access) +CompareData(void *expectedBuffer, size_t size, IOR_offset_t transferCount, IOR_param_t *test, IOR_offset_t offset, int fillrank, int access) { - char testFileName[MAX_PATHLEN]; - char bufferLabel1[MAX_STR]; - char bufferLabel2[MAX_STR]; - size_t i, j, length, first, last; - size_t errorCount = 0; - int inError = 0; - unsigned long long *goodbuf = (unsigned long long *)expectedBuffer; - unsigned long long *testbuf = (unsigned long long *)unknownBuffer; - - if (access == WRITECHECK || access == READCHECK) { - strcpy(bufferLabel1, "Expected: "); - strcpy(bufferLabel2, "Actual: "); - } else { - ERR("incorrect argument for CompareBuffers()"); - } - - length = size / sizeof(IOR_size_t); - first = -1; - if (verbose >= VERBOSE_3) { - fprintf(out_logfile, - "[%d] At file byte offset %lld, comparing %llu-byte transfer\n", - rank, (long long) offset, (long long)size); - } - for (i = 0; i < length; i++) { - if (testbuf[i] != goodbuf[i]) { - errorCount++; - if (verbose >= VERBOSE_2) { - fprintf(out_logfile, - "[%d] At transfer buffer #%lld, index #%lld (file byte offset %lld):\n", - rank, transferCount - 1, (long long)i, - (long long) offset + - (IOR_size_t) (i * sizeof(IOR_size_t))); - fprintf(out_logfile, "[%d] %s0x", rank, bufferLabel1); - fprintf(out_logfile, "%016llx\n", goodbuf[i]); - fprintf(out_logfile, "[%d] %s0x", rank, bufferLabel2); - fprintf(out_logfile, "%016llx\n", testbuf[i]); - } - if (!inError) { - inError = 1; - first = i; - last = i; - } else { - last = i; - } - } else if (verbose >= VERBOSE_5 && i % 4 == 0) { - fprintf(out_logfile, - "[%d] PASSED offset = %lu bytes, transfer %lld\n", - rank, - ((i * sizeof(unsigned long long)) + - offset), transferCount); - fprintf(out_logfile, "[%d] GOOD %s0x", rank, bufferLabel1); - for (j = 0; j < 4; j++) - fprintf(out_logfile, "%016llx ", goodbuf[i + j]); - fprintf(out_logfile, "\n[%d] GOOD %s0x", rank, bufferLabel2); - for (j = 0; j < 4; j++) - fprintf(out_logfile, "%016llx ", testbuf[i + j]); - fprintf(out_logfile, "\n"); - } - } - if (inError) { - inError = 0; - GetTestFileName(testFileName, test); - fprintf(out_logfile, - "[%d] FAILED comparison of buffer containing %d-byte ints:\n", - rank, (int)sizeof(unsigned long long int)); - fprintf(out_logfile, "[%d] File name = %s\n", rank, testFileName); - fprintf(out_logfile, "[%d] In transfer %lld, ", rank, - transferCount); - fprintf(out_logfile, - "%lld errors between buffer indices %lld and %lld.\n", - (long long)errorCount, (long long)first, - (long long)last); - fprintf(out_logfile, "[%d] File byte offset = %lu:\n", rank, - ((first * sizeof(unsigned long long)) + offset)); - - fprintf(out_logfile, "[%d] %s0x", rank, bufferLabel1); - for (j = first; j < length && j < first + 4; j++) - fprintf(out_logfile, "%016llx ", goodbuf[j]); - if (j == length) - fprintf(out_logfile, "[end of buffer]"); - fprintf(out_logfile, "\n[%d] %s0x", rank, bufferLabel2); - for (j = first; j < length && j < first + 4; j++) - fprintf(out_logfile, 
"%016llx ", testbuf[j]); - if (j == length) - fprintf(out_logfile, "[end of buffer]"); - fprintf(out_logfile, "\n"); - if (test->quitOnError == TRUE) - ERR("data check error, aborting execution"); - } - return (errorCount); + assert(access == WRITECHECK || access == READCHECK); + return verify_memory_pattern(offset, expectedBuffer, transferCount, test->setTimeStampSignature, fillrank, test->dataPacketType); } /* @@ -476,7 +433,7 @@ static int CountErrors(IOR_param_t * test, int access, int errors) WARN("overflow in errors counted"); allErrors = -1; } - fprintf(out_logfile, "WARNING: incorrect data on %s (%d errors found).\n", + EWARNF("Incorrect data on %s (%d errors found).\n", access == WRITECHECK ? "write" : "read", allErrors); fprintf(out_logfile, "Used Time Stamp %u (0x%x) for Data Signature\n", @@ -487,44 +444,6 @@ static int CountErrors(IOR_param_t * test, int access, int errors) return (allErrors); } -/* - * Allocate a page-aligned (required by O_DIRECT) buffer. - */ -static void *aligned_buffer_alloc(size_t size) -{ - size_t pageMask; - char *buf, *tmp; - char *aligned; - -#ifdef HAVE_SYSCONF - long pageSize = sysconf(_SC_PAGESIZE); -#else - size_t pageSize = getpagesize(); -#endif - - pageMask = pageSize - 1; - buf = malloc(size + pageSize + sizeof(void *)); - if (buf == NULL) - ERR("out of memory"); - /* find the alinged buffer */ - tmp = buf + sizeof(char *); - aligned = tmp + pageSize - ((size_t) tmp & pageMask); - /* write a pointer to the original malloc()ed buffer into the bytes - preceding "aligned", so that the aligned buffer can later be free()ed */ - tmp = aligned - sizeof(void *); - *(void **)tmp = buf; - - return (void *)aligned; -} - -/* - * Free a buffer allocated by aligned_buffer_alloc(). - */ -static void aligned_buffer_free(void *buf) -{ - free(*(void **)((char *)buf - sizeof(char *))); -} - void AllocResults(IOR_test_t *test) { int reps; @@ -581,7 +500,7 @@ static void DestroyTests(IOR_test_t *tests_head) /* * Distribute IOR_HINTs to all tasks' environments. */ -void DistributeHints(void) +static void DistributeHints(MPI_Comm com) { char hint[MAX_HINTS][MAX_STR], fullHint[MAX_STR], hintVariable[MAX_STR]; int hintCount = 0, i; @@ -603,11 +522,9 @@ void DistributeHints(void) } } - MPI_CHECK(MPI_Bcast(&hintCount, sizeof(hintCount), MPI_BYTE, - 0, MPI_COMM_WORLD), "cannot broadcast hints"); + MPI_CHECK(MPI_Bcast(&hintCount, sizeof(hintCount), MPI_BYTE, 0, com), "cannot broadcast hints"); for (i = 0; i < hintCount; i++) { - MPI_CHECK(MPI_Bcast(&hint[i], MAX_STR, MPI_BYTE, - 0, MPI_COMM_WORLD), + MPI_CHECK(MPI_Bcast(&hint[i], MAX_STR, MPI_BYTE, 0, com), "cannot broadcast hints"); strcpy(fullHint, hint[i]); strcpy(hintVariable, strtok(fullHint, "=")); @@ -619,64 +536,6 @@ void DistributeHints(void) } } -/* - * Fill buffer, which is transfer size bytes long, with known 8-byte long long - * int values. In even-numbered 8-byte long long ints, store MPI task in high - * bits and timestamp signature in low bits. In odd-numbered 8-byte long long - * ints, store transfer offset. If storeFileOffset option is used, the file - * (not transfer) offset is stored instead. 
- */ - -static void -FillIncompressibleBuffer(void* buffer, IOR_param_t * test) - -{ - size_t i; - unsigned long long hi, lo; - unsigned long long *buf = (unsigned long long *)buffer; - - for (i = 0; i < test->transferSize / sizeof(unsigned long long); i++) { - hi = ((unsigned long long) rand_r(&test->incompressibleSeed) << 32); - lo = (unsigned long long) rand_r(&test->incompressibleSeed); - buf[i] = hi | lo; - } -} - -unsigned int reseed_incompressible_prng = TRUE; - -static void -FillBuffer(void *buffer, - IOR_param_t * test, unsigned long long offset, int fillrank) -{ - size_t i; - unsigned long long hi, lo; - unsigned long long *buf = (unsigned long long *)buffer; - - if(test->dataPacketType == incompressible ) { /* Make for some non compressable buffers with randomish data */ - - /* In order for write checks to work, we have to restart the psuedo random sequence */ - if(reseed_incompressible_prng == TRUE) { - test->incompressibleSeed = test->setTimeStampSignature + rank; /* We copied seed into timestampSignature at initialization, also add the rank to add randomness between processes */ - reseed_incompressible_prng = FALSE; - } - FillIncompressibleBuffer(buffer, test); - } - - else { - hi = ((unsigned long long)fillrank) << 32; - lo = (unsigned long long)test->timeStampSignatureValue; - for (i = 0; i < test->transferSize / sizeof(unsigned long long); i++) { - if ((i % 2) == 0) { - /* evens contain MPI rank and time in seconds */ - buf[i] = hi | lo; - } else { - /* odds contain offset */ - buf[i] = offset + (i * sizeof(unsigned long long)); - } - } - } -} - /* * Return string describing machine name and type. */ @@ -778,8 +637,7 @@ void GetTestFileName(char *testFileName, IOR_param_t * test) strcpy(initialTestFileName, test->testFileName); if(test->dualMount){ GetProcessorAndCore(&socket, &core); - sprintf(tmpString, "%s%d/%s",initialTestFileName, - socket, "data"); + sprintf(tmpString, "%s%d/%s",initialTestFileName, socket, "data"); strcpy(initialTestFileName, tmpString); } fileNames = ParseFileName(initialTestFileName, &count); @@ -971,12 +829,19 @@ static void RemoveFile(char *testFileName, int filePerProc, IOR_param_t * test) * Setup tests by parsing commandline and creating test script. * Perform a sanity-check on the configured parameters. */ -static void InitTests(IOR_test_t *tests, MPI_Comm com) +static void InitTests(IOR_test_t *tests) { + if(tests == NULL){ + return; + } + MPI_Comm com = tests->params.mpi_comm_world; int mpiNumNodes = 0; int mpiNumTasks = 0; int mpiNumTasksOnNode0 = 0; + verbose = tests->params.verbose; + aiori_warning_as_errors = tests->params.warningAsErrors; + /* * These default values are the same for every test and expensive to * retrieve so just do it once. 
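/*
 * A sketch of the buffer layout that the removed FillBuffer() above produced and
 * that the new generate_memory_pattern()/verify_memory_pattern() helpers are
 * presumably meant to reproduce (their implementation is not part of this patch):
 * even 64-bit words carry (rank << 32) | timestamp, odd words carry the byte
 * offset of that word in the file. fill_timestamp_pattern is an illustrative name.
 */
#include <stdint.h>
#include <stddef.h>

static void fill_timestamp_pattern(uint64_t *buf, size_t bytes, uint64_t file_offset,
                                   int fillrank, uint32_t timestamp)
{
        uint64_t hi = ((uint64_t) fillrank) << 32;
        uint64_t lo = (uint64_t) timestamp;

        for (size_t i = 0; i < bytes / sizeof(uint64_t); i++) {
                if ((i % 2) == 0)
                        buf[i] = hi | lo;                            /* rank and time stamp */
                else
                        buf[i] = file_offset + i * sizeof(uint64_t); /* offset of this word */
        }
}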
@@ -990,7 +855,7 @@ static void InitTests(IOR_test_t *tests, MPI_Comm com) * task 0 has the environment settings for the hints, pass * the hint=value pair to everyone else in mpi_comm_world */ - DistributeHints(); + DistributeHints(com); /* check validity of tests and create test queue */ while (tests != NULL) { @@ -1005,11 +870,9 @@ static void InitTests(IOR_test_t *tests, MPI_Comm com) params->numTasks = mpiNumTasks; } else if (params->numTasks > mpiNumTasks) { if (rank == 0) { - fprintf(out_logfile, - "WARNING: More tasks requested (%d) than available (%d),", + EWARNF("More tasks requested (%d) than available (%d),", params->numTasks, mpiNumTasks); - fprintf(out_logfile, " running with %d tasks.\n", - mpiNumTasks); + EWARNF(" running with %d tasks.\n", mpiNumTasks); } params->numTasks = mpiNumTasks; } @@ -1021,14 +884,11 @@ static void InitTests(IOR_test_t *tests, MPI_Comm com) params->expectedAggFileSize = params->blockSize * params->segmentCount * params->numTasks; - ValidateTests(&tests->params); + ValidateTests(&tests->params, com); tests = tests->next; } - init_clock(); - - /* seed random number generator */ - SeedRandGen(mpi_comm_world); + init_clock(com); } /* @@ -1037,16 +897,7 @@ static void InitTests(IOR_test_t *tests, MPI_Comm com) static void XferBuffersSetup(IOR_io_buffers* ioBuffers, IOR_param_t* test, int pretendRank) { - ioBuffers->buffer = aligned_buffer_alloc(test->transferSize); - - if (test->checkWrite || test->checkRead) { - ioBuffers->checkBuffer = aligned_buffer_alloc(test->transferSize); - } - if (test->checkRead || test->checkWrite) { - ioBuffers->readCheckBuffer = aligned_buffer_alloc(test->transferSize); - } - - return; + ioBuffers->buffer = aligned_buffer_alloc(test->transferSize, test->gpuMemoryFlags); } /* @@ -1055,16 +906,7 @@ static void XferBuffersSetup(IOR_io_buffers* ioBuffers, IOR_param_t* test, static void XferBuffersFree(IOR_io_buffers* ioBuffers, IOR_param_t* test) { - aligned_buffer_free(ioBuffers->buffer); - - if (test->checkWrite || test->checkRead) { - aligned_buffer_free(ioBuffers->checkBuffer); - } - if (test->checkRead) { - aligned_buffer_free(ioBuffers->readCheckBuffer); - } - - return; + aligned_buffer_free(ioBuffers->buffer, test->gpuMemoryFlags); } @@ -1109,7 +951,7 @@ static void file_hits_histogram(IOR_param_t *params) } MPI_CHECK(MPI_Gather(&rankOffset, 1, MPI_INT, rankoffs, - 1, MPI_INT, 0, mpi_comm_world), + 1, MPI_INT, 0, params->testComm), "MPI_Gather error"); if (rank != 0) @@ -1245,6 +1087,55 @@ WriteTimes(IOR_param_t *test, const double *timer, const int iteration, timerName); } } + +static void StoreRankInformation(IOR_test_t *test, double *timer, const int rep, const int access){ + IOR_param_t *params = &test->params; + double totalTime = timer[5] - timer[0]; + double accessTime = timer[3] - timer[2]; + double times[] = {totalTime, accessTime}; + + if(rank == 0){ + FILE* fd = fopen(params->saveRankDetailsCSV, "a"); + if (fd == NULL){ + FAIL("Cannot open saveRankPerformanceDetailsCSV file for writes!"); + } + int size; + MPI_Comm_size(params->testComm, & size); + double *all_times = malloc(2* size * sizeof(double)); + MPI_Gather(times, 2, MPI_DOUBLE, all_times, 2, MPI_DOUBLE, 0, params->testComm); + IOR_point_t *point = (access == WRITE) ? &test->results[rep].write : &test->results[rep].read; + double file_size = ((double) point->aggFileSizeForBW) / size; + + for(int i=0; i < size; i++){ + char buff[1024]; + sprintf(buff, "%s,%d,%.10e,%.10e,%.10e,%.10e\n", access==WRITE ? 
"write" : "read", i, all_times[i*2], all_times[i*2+1], file_size/all_times[i*2], file_size/all_times[i*2+1] ); + int ret = fwrite(buff, strlen(buff), 1, fd); + if(ret != 1){ + WARN("Couln't append to saveRankPerformanceDetailsCSV file\n"); + break; + } + } + fclose(fd); + }else{ + MPI_Gather(& times, 2, MPI_DOUBLE, NULL, 2, MPI_DOUBLE, 0, testComm); + } +} + +static void ProcessIterResults(IOR_test_t *test, double *timer, const int rep, const int access){ + IOR_param_t *params = &test->params; + + if (verbose >= VERBOSE_3) + WriteTimes(params, timer, rep, access); + ReduceIterResults(test, timer, rep, access); + if (params->outlierThreshold) { + CheckForOutliers(params, timer, access); + } + + if(params->saveRankDetailsCSV){ + StoreRankInformation(test, timer, rep, access); + } +} + /* * Using the test parameters, run iteration(s) of single test. */ @@ -1258,30 +1149,10 @@ static void TestIoSys(IOR_test_t *test) int pretendRank; int rep; aiori_fd_t *fd; - MPI_Group orig_group, new_group; - int range[3]; IOR_offset_t dataMoved; /* for data rate calculation */ void *hog_buf; IOR_io_buffers ioBuffers; - /* set up communicator for test */ - MPI_CHECK(MPI_Comm_group(mpi_comm_world, &orig_group), - "MPI_Comm_group() error"); - range[0] = 0; /* first rank */ - range[1] = params->numTasks - 1; /* last rank */ - range[2] = 1; /* stride */ - MPI_CHECK(MPI_Group_range_incl(orig_group, 1, &range, &new_group), - "MPI_Group_range_incl() error"); - MPI_CHECK(MPI_Comm_create(mpi_comm_world, new_group, &testComm), - "MPI_Comm_create() error"); - MPI_CHECK(MPI_Group_free(&orig_group), "MPI_Group_Free() error"); - MPI_CHECK(MPI_Group_free(&new_group), "MPI_Group_Free() error"); - params->testComm = testComm; - if (testComm == MPI_COMM_NULL) { - /* tasks not in the group do not participate in this test */ - MPI_CHECK(MPI_Barrier(mpi_comm_world), "barrier error"); - return; - } if (rank == 0 && verbose >= VERBOSE_1) { fprintf(out_logfile, "Participating tasks : %d\n", params->numTasks); fflush(out_logfile); @@ -1305,15 +1176,24 @@ static void TestIoSys(IOR_test_t *test) params->timeStampSignatureValue = (unsigned int) params->setTimeStampSignature; } XferBuffersSetup(&ioBuffers, params, pretendRank); - reseed_incompressible_prng = TRUE; // reset pseudo random generator, necessary to guarantee the next call to FillBuffer produces the same value as it is right now - + /* Initial time stamp */ startTime = GetTimeStamp(); /* loop over test iterations */ uint64_t params_saved_wearout = params->stoneWallingWearOutIterations; + + /* Check if the file exists and warn users */ + if((params->writeFile || params->checkWrite) && (params->hints.filePerProc || rank == 0)){ + struct stat sb; + GetTestFileName(testFileName, params); + int ret = backend->stat(testFileName, & sb, params->backend_options); + if(ret == 0) { + EWARNF("The file \"%s\" exists already and will be overwritten", testFileName); + } + } + for (rep = 0; rep < params->repetitions; rep++) { - PrintRepeatStart(); /* Get iteration start time in seconds in task 0 and broadcast to all tasks */ if (rank == 0) { @@ -1339,7 +1219,8 @@ static void TestIoSys(IOR_test_t *test) (¶ms->timeStampSignatureValue, 1, MPI_UNSIGNED, 0, testComm), "cannot broadcast start time value"); - FillBuffer(ioBuffers.buffer, params, 0, pretendRank); + generate_memory_pattern((char*) ioBuffers.buffer, params->transferSize, params->setTimeStampSignature, pretendRank, params->dataPacketType); + /* use repetition count for number of multiple files */ if (params->multiFile) 
params->repCounter = rep; @@ -1365,6 +1246,7 @@ static void TestIoSys(IOR_test_t *test) params->open = WRITE; timer[0] = GetTimeStamp(); fd = backend->create(testFileName, IOR_WRONLY | IOR_CREAT | IOR_TRUNC, params->backend_options); + if(fd == NULL) FAIL("Cannot create file"); timer[1] = GetTimeStamp(); if (params->intraTestBarriers) MPI_CHECK(MPI_Barrier(testComm), @@ -1390,20 +1272,11 @@ static void TestIoSys(IOR_test_t *test) timer[5] = GetTimeStamp(); MPI_CHECK(MPI_Barrier(testComm), "barrier error"); - /* get the size of the file just written */ - results[rep].write.aggFileSizeFromStat = - backend->get_file_size(params->backend_options, testComm, testFileName); - /* check if stat() of file doesn't equal expected file size, use actual amount of byte moved */ - CheckFileSize(test, dataMoved, rep, WRITE); + CheckFileSize(test, testFileName, dataMoved, rep, WRITE); - if (verbose >= VERBOSE_3) - WriteTimes(params, timer, rep, WRITE); - ReduceIterResults(test, timer, rep, WRITE); - if (params->outlierThreshold) { - CheckForOutliers(params, timer, WRITE); - } + ProcessIterResults(test, timer, rep, WRITE); /* check if in this round we run write with stonewalling */ if(params->deadlineForStonewalling > 0){ @@ -1430,15 +1303,11 @@ static void TestIoSys(IOR_test_t *test) } rankOffset = (2 * shift) % params->numTasks; } - - // update the check buffer - FillBuffer(ioBuffers.readCheckBuffer, params, 0, (rank + rankOffset) % params->numTasks); - - reseed_incompressible_prng = TRUE; /* Re-Seed the PRNG to get same sequence back, if random */ - + GetTestFileName(testFileName, params); params->open = WRITECHECK; fd = backend->open(testFileName, IOR_RDONLY, params->backend_options); + if(fd == NULL) FAIL("Cannot open file"); dataMoved = WriteOrRead(params, &results[rep], fd, WRITECHECK, &ioBuffers); backend->close(fd, params->backend_options); rankOffset = 0; @@ -1449,9 +1318,9 @@ static void TestIoSys(IOR_test_t *test) if ((params->readFile || params->checkRead ) && !test_time_elapsed(params, startTime)) { /* check for stonewall */ if(params->stoneWallingStatusFile){ - params->stoneWallingWearOutIterations = ReadStoneWallingIterations(params->stoneWallingStatusFile); + params->stoneWallingWearOutIterations = ReadStoneWallingIterations(params->stoneWallingStatusFile, params->testComm); if(params->stoneWallingWearOutIterations == -1 && rank == 0){ - fprintf(out_logfile, "WARNING: Could not read back the stonewalling status from the file!\n"); + WARN("Could not read back the stonewalling status from the file!"); params->stoneWallingWearOutIterations = 0; } } @@ -1495,10 +1364,6 @@ static void TestIoSys(IOR_test_t *test) file_hits_histogram(params); } } - if(operation_flag == READCHECK){ - FillBuffer(ioBuffers.readCheckBuffer, params, 0, (rank + rankOffset) % params->numTasks); - } - /* Using globally passed rankOffset, following function generates testFileName to read */ GetTestFileName(testFileName, params); @@ -1511,6 +1376,7 @@ static void TestIoSys(IOR_test_t *test) params->open = READ; timer[0] = GetTimeStamp(); fd = backend->open(testFileName, IOR_RDONLY, params->backend_options); + if(fd == NULL) FAIL("Cannot open file"); timer[1] = GetTimeStamp(); if (params->intraTestBarriers) MPI_CHECK(MPI_Barrier(testComm), @@ -1530,21 +1396,11 @@ static void TestIoSys(IOR_test_t *test) backend->close(fd, params->backend_options); timer[5] = GetTimeStamp(); - /* get the size of the file just read */ - results[rep].read.aggFileSizeFromStat = - backend->get_file_size(params->backend_options, testComm, - 
testFileName); - /* check if stat() of file doesn't equal expected file size, use actual amount of byte moved */ - CheckFileSize(test, dataMoved, rep, READ); + CheckFileSize(test, testFileName, dataMoved, rep, READ); - if (verbose >= VERBOSE_3) - WriteTimes(params, timer, rep, READ); - ReduceIterResults(test, timer, rep, READ); - if (params->outlierThreshold) { - CheckForOutliers(params, timer, READ); - } + ProcessIterResults(test, timer, rep, READ); } if (!params->keepFile @@ -1562,10 +1418,8 @@ static void TestIoSys(IOR_test_t *test) params->errorFound = FALSE; rankOffset = 0; - PrintRepeatEnd(); } - - MPI_CHECK(MPI_Comm_free(&testComm), "MPI_Comm_free() error"); + PrintRepeatEnd(); if (params->summary_every_test) { PrintLongSummaryHeader(); @@ -1578,20 +1432,24 @@ static void TestIoSys(IOR_test_t *test) if (hog_buf != NULL) free(hog_buf); - - /* Sync with the tasks that did not participate in this test */ - MPI_CHECK(MPI_Barrier(mpi_comm_world), "barrier error"); - } /* * Determine if valid tests from parameters. */ -static void ValidateTests(IOR_param_t * test) +static void ValidateTests(IOR_param_t * test, MPI_Comm com) { IOR_param_t defaults; - init_IOR_Param_t(&defaults); + init_IOR_Param_t(&defaults, com); + if (test->stoneWallingStatusFile && test->keepFile == 0) + ERR("a StoneWallingStatusFile is only sensible when splitting write/read into multiple executions of ior, please use -k"); + if (test->stoneWallingStatusFile && test->stoneWallingWearOut == 0 && test->writeFile) + ERR("the StoneWallingStatusFile is only sensible for a write test when using stoneWallingWearOut"); + if (test->deadlineForStonewalling == 0 && test->stoneWallingWearOut > 0) + ERR("the stoneWallingWearOut is only sensible when setting a stonewall deadline with -D"); + if (test->stoneWallingStatusFile && test->testscripts) + WARN("the StoneWallingStatusFile only preserves the last experiment, make sure that each run uses a separate status file!"); if (test->repetitions <= 0) WARN_RESET("too few test repetitions", test, &defaults, repetitions); @@ -1623,7 +1481,12 @@ static void ValidateTests(IOR_param_t * test) } if (test->blockSize < test->transferSize) ERR("block size must not be smaller than transfer size"); - + if (test->randomOffset && test->blockSize == test->transferSize) + ERR("IOR will randomize access within a block and repeats the same pattern for all segments, therefore choose blocksize > transferSize"); + if (! 
test->randomOffset && test->randomPrefillBlocksize) + ERR("Setting the randomPrefill option without using random is not useful"); + if (test->randomPrefillBlocksize && (test->blockSize % test->randomPrefillBlocksize != 0)) + ERR("The randomPrefill option must divide the blockSize"); /* specific APIs */ if ((strcasecmp(test->api, "MPIIO") == 0) && (test->blockSize < sizeof(IOR_size_t) @@ -1637,21 +1500,17 @@ static void ValidateTests(IOR_param_t * test) && (test->blockSize < sizeof(IOR_size_t) || test->transferSize < sizeof(IOR_size_t))) ERR("block/transfer size may not be smaller than IOR_size_t for NCMPI"); - if ((strcasecmp(test->api, "POSIX") != 0) && test->singleXferAttempt) - WARN_RESET("retry only available in POSIX", - test, &defaults, singleXferAttempt); if (((strcasecmp(test->api, "POSIX") != 0) && (strcasecmp(test->api, "MPIIO") != 0) && (strcasecmp(test->api, "MMAP") != 0) && (strcasecmp(test->api, "HDFS") != 0) && (strcasecmp(test->api, "DFS") != 0) - && (strcasecmp(test->api, "DAOS") != 0) && (strcasecmp(test->api, "Gfarm") != 0) && (strcasecmp(test->api, "RADOS") != 0) && (strcasecmp(test->api, "CEPHFS") != 0)) && test->fsync) WARN_RESET("fsync() not supported in selected backend", test, &defaults, fsync); - /* parameter consitency */ + /* parameter consistency */ if (test->reorderTasks == TRUE && test->reorderTasksRandom == TRUE) ERR("Both Constant and Random task re-ordering specified. Choose one and resubmit"); if (test->randomOffset && test->reorderTasksRandom @@ -1660,9 +1519,9 @@ static void ValidateTests(IOR_param_t * test) if (test->randomOffset && test->reorderTasks && test->filePerProc == FALSE) ERR("random offset and constant reorder tasks specified with single-shared-file. Choose one and resubmit"); - if (test->randomOffset && test->checkRead) - ERR("random offset not available with read check option (use write check)"); - if (test->randomOffset && test->storeFileOffset) + if (test->randomOffset && test->checkRead && test->randomSeed == -1) + ERR("random offset with read check option requires to set the random seed"); + if (test->randomOffset && test->dataPacketType == DATA_OFFSET) ERR("random offset not available with store file offset option)"); if ((strcasecmp(test->api, "HDF5") == 0) && test->randomOffset) ERR("random offset not available with HDF5"); @@ -1684,147 +1543,101 @@ static void ValidateTests(IOR_param_t * test) /** * Returns a precomputed array of IOR_offset_t for the inner benchmark loop. - * They are sequential and the last element is set to -1 as end marker. 
- * @param test IOR_param_t for getting transferSize, blocksize and SegmentCount - * @param pretendRank int pretended Rank for shifting the offsest corectly - * @return IOR_offset_t - */ -IOR_offset_t *GetOffsetArraySequential(IOR_param_t * test, int pretendRank) -{ - IOR_offset_t i, j, k = 0; - IOR_offset_t offsets; - IOR_offset_t *offsetArray; - - /* count needed offsets */ - offsets = (test->blockSize / test->transferSize) * test->segmentCount; - - /* setup empty array */ - offsetArray = - (IOR_offset_t *) malloc((offsets + 1) * sizeof(IOR_offset_t)); - if (offsetArray == NULL) - ERR("malloc() failed"); - offsetArray[offsets] = -1; /* set last offset with -1 */ - - /* fill with offsets */ - for (i = 0; i < test->segmentCount; i++) { - for (j = 0; j < (test->blockSize / test->transferSize); j++) { - offsetArray[k] = j * test->transferSize; - if (test->filePerProc) { - offsetArray[k] += i * test->blockSize; - } else { - offsetArray[k] += - (i * test->numTasks * test->blockSize) - + (pretendRank * test->blockSize); - } - k++; - } - } - - return (offsetArray); -} - -/** - * Returns a precomputed array of IOR_offset_t for the inner benchmark loop. - * They get created sequentially and mixed up in the end. The last array element - * is set to -1 as end marker. - * It should be noted that as the seeds get synchronised across all processes - * every process computes the same random order if used with filePerProc. + * They get created sequentially and mixed up in the end. + * It should be noted that as the seeds get synchronised across all processes if not FilePerProcess is set + * every process computes the same random order. * For a shared file all transfers get randomly assigned to ranks. The processes * can also have differen't numbers of transfers. This might lead to a bigger * diversion in accesse as it dose with filePerProc. This is expected but * should be mined. * @param test IOR_param_t for getting transferSize, blocksize and SegmentCount - * @param pretendRank int pretended Rank for shifting the offsest corectly + * @param pretendRank int pretended Rank for shifting the offsets correctly * @return IOR_offset_t - * @return */ -IOR_offset_t *GetOffsetArrayRandom(IOR_param_t * test, int pretendRank, int access) +IOR_offset_t *GetOffsetArrayRandom(IOR_param_t * test, int pretendRank, IOR_offset_t * out_count) { int seed; - IOR_offset_t i, value, tmp; - IOR_offset_t offsets = 0; + IOR_offset_t i; + IOR_offset_t offsets; IOR_offset_t offsetCnt = 0; - IOR_offset_t fileSize; IOR_offset_t *offsetArray; - /* set up seed for random() */ - if (access == WRITE || access == READ) { - test->randomSeed = seed = rand(); - } else { - seed = test->randomSeed; - } - srand(seed); - - fileSize = test->blockSize * test->segmentCount; - if (test->filePerProc == FALSE) { - fileSize *= test->numTasks; + if (test->filePerProc) { + /* set up seed, each process can determine which regions to access individually */ + if (test->randomSeed == -1) { + seed = time(NULL); + test->randomSeed = seed; + } else { + seed = test->randomSeed + pretendRank; + } + }else{ + /* Shared file requires that the seed is synchronized */ + if (test->randomSeed == -1) { + // all processes need to have the same seed. 
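/*
 * The shared-file branch below relies on every rank drawing the same pseudo-random
 * sequence. A standalone sketch of that idiom, assuming only standard MPI;
 * pick_shared_seed and its arguments are illustrative names only. Rank 0 chooses
 * the seed when none was given (-1) and broadcasts it before srandom() is called.
 */
#include <mpi.h>
#include <stdlib.h>
#include <time.h>

static int pick_shared_seed(int requested_seed, MPI_Comm comm)
{
        int rank, seed = requested_seed;

        MPI_Comm_rank(comm, &rank);
        if (seed == -1) {                    /* no seed given on the command line */
                if (rank == 0)
                        seed = (int) time(NULL);
                MPI_Bcast(&seed, 1, MPI_INT, 0, comm);
        }
        srandom(seed);                       /* same sequence on every rank */
        return seed;
}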
+ if(rank == 0){ + seed = time(NULL); + } + MPI_CHECK(MPI_Bcast(& seed, 1, MPI_INT, 0, test->testComm), "cannot broadcast random seed value"); + test->randomSeed = seed; + }else{ + seed = test->randomSeed; + } } + srandom(seed); /* count needed offsets (pass 1) */ - for (i = 0; i < fileSize; i += test->transferSize) { - if (test->filePerProc == FALSE) { - // this counts which process get how many transferes in - // a shared file - if ((rand() % test->numTasks) == pretendRank) { - offsets++; - } - } else { - offsets++; - } + if (test->filePerProc) { + offsets = test->blockSize / test->transferSize; + }else{ + offsets = 0; + for (i = 0; i < test->blockSize * test->numTasks; i += test->transferSize) { + // this counts which process get how many transferes in the shared file + if ((rand() % test->numTasks) == pretendRank) { + offsets++; + } + } } /* setup empty array */ - offsetArray = - (IOR_offset_t *) malloc((offsets + 1) * sizeof(IOR_offset_t)); - if (offsetArray == NULL) - ERR("malloc() failed"); - offsetArray[offsets] = -1; /* set last offset with -1 */ + offsetArray = (IOR_offset_t *) safeMalloc(offsets * sizeof(IOR_offset_t)); + + *out_count = offsets; if (test->filePerProc) { - /* fill array */ - for (i = 0; i < offsets; i++) { - offsetArray[i] = i * test->transferSize; - } + /* fill array */ + for (i = 0; i < offsets; i++) { + offsetArray[i] = i * test->transferSize; + } } else { - /* fill with offsets (pass 2) */ - srand(seed); /* need same seed to get same transfers as counted in the beginning*/ - for (i = 0; i < fileSize; i += test->transferSize) { - if ((rand() % test->numTasks) == pretendRank) { - offsetArray[offsetCnt] = i; - offsetCnt++; - } + /* fill with offsets (pass 2) */ + srandom(seed); /* need same seed to get same transfers as counted in the beginning*/ + for (i = 0; i < test->blockSize * test->numTasks; i += test->transferSize) { + if ((rand() % test->numTasks) == pretendRank) { + offsetArray[offsetCnt] = i; + offsetCnt++; } + } } /* reorder array */ for (i = 0; i < offsets; i++) { + IOR_offset_t value, tmp; value = rand() % offsets; tmp = offsetArray[value]; offsetArray[value] = offsetArray[i]; offsetArray[i] = tmp; } - SeedRandGen(test->testComm); /* synchronize seeds across tasks */ return (offsetArray); } -static IOR_offset_t WriteOrReadSingle(IOR_offset_t pairCnt, IOR_offset_t *offsetArray, int pretendRank, - IOR_offset_t * transferCount, int * errors, IOR_param_t * test, aiori_fd_t * fd, IOR_io_buffers* ioBuffers, int access){ +static IOR_offset_t WriteOrReadSingle(IOR_offset_t offset, int pretendRank, IOR_offset_t transfer, IOR_offset_t * transferCount, int * errors, IOR_param_t * test, aiori_fd_t * fd, IOR_io_buffers* ioBuffers, int access){ IOR_offset_t amtXferred = 0; - IOR_offset_t transfer; void *buffer = ioBuffers->buffer; - void *checkBuffer = ioBuffers->checkBuffer; - void *readCheckBuffer = ioBuffers->readCheckBuffer; - - IOR_offset_t offset = offsetArray[pairCnt]; // this looks inappropriate - - transfer = test->transferSize; if (access == WRITE) { /* fills each transfer with a unique pattern * containing the offset into the file */ - if (test->storeFileOffset == TRUE) { - FillBuffer(buffer, test, offset, pretendRank); - } + update_write_memory_pattern(offset, ioBuffers->buffer, transfer, test->setTimeStampSignature, pretendRank, test->dataPacketType); amtXferred = backend->xfer(access, fd, buffer, transfer, offset, test->backend_options); if (amtXferred != transfer) ERR("cannot write to file"); @@ -1843,34 +1656,43 @@ static IOR_offset_t 
WriteOrReadSingle(IOR_offset_t pairCnt, IOR_offset_t *offset nanosleep( & wait, NULL); } } else if (access == WRITECHECK) { - memset(checkBuffer, 'a', transfer); - - if (test->storeFileOffset == TRUE) { - FillBuffer(readCheckBuffer, test, offset, pretendRank); - } - - amtXferred = backend->xfer(access, fd, checkBuffer, transfer, offset, test->backend_options); + ((long long int*) buffer)[0] = ~((long long int*) buffer)[0]; // changes the buffer, no memset to reduce the memory pressure + amtXferred = backend->xfer(access, fd, buffer, transfer, offset, test->backend_options); if (amtXferred != transfer) ERR("cannot read from file write check"); - (*transferCount)++; - *errors += CompareBuffers(readCheckBuffer, checkBuffer, transfer, - *transferCount, test, - WRITECHECK); + *errors += CompareData(buffer, transfer, *transferCount, test, offset, pretendRank, WRITECHECK); } else if (access == READCHECK) { - memset(checkBuffer, 'a', transfer); - - amtXferred = backend->xfer(access, fd, checkBuffer, transfer, offset, test->backend_options); + ((long long int*) buffer)[0] = ~((long long int*) buffer)[0]; // changes the buffer, no memset to reduce the memory pressure + amtXferred = backend->xfer(access, fd, buffer, transfer, offset, test->backend_options); if (amtXferred != transfer){ ERR("cannot read from file"); } - if (test->storeFileOffset == TRUE) { - FillBuffer(readCheckBuffer, test, offset, pretendRank); - } - *errors += CompareBuffers(readCheckBuffer, checkBuffer, transfer, *transferCount, test, READCHECK); + *errors += CompareData(buffer, transfer, *transferCount, test, offset, pretendRank, READCHECK); } return amtXferred; } +static void prefillSegment(IOR_param_t *test, void * randomPrefillBuffer, int pretendRank, aiori_fd_t *fd, IOR_io_buffers *ioBuffers, int startSegment, int endSegment){ + // prefill the whole file already with an invalid pattern + int offsets = test->blockSize / test->randomPrefillBlocksize; + void * oldBuffer = ioBuffers->buffer; + IOR_offset_t transferCount; + int errors; + ioBuffers->buffer = randomPrefillBuffer; + for (int i = startSegment; i < endSegment; i++){ + for (int j = 0; j < offsets; j++) { + IOR_offset_t offset = j * test->randomPrefillBlocksize; + if (test->filePerProc) { + offset += i * test->blockSize; + } else { + offset += (i * test->numTasks * test->blockSize) + (pretendRank * test->blockSize); + } + WriteOrReadSingle(offset, pretendRank, test->randomPrefillBlocksize, & transferCount, & errors, test, fd, ioBuffers, WRITE); + } + } + ioBuffers->buffer = oldBuffer; +} + /* * Write or Read data to file(s). This loops through the strides, writing * out the data to each block in transfer sizes, until the remainder left is 0. @@ -1881,41 +1703,87 @@ static IOR_offset_t WriteOrRead(IOR_param_t *test, IOR_results_t *results, int errors = 0; IOR_offset_t transferCount = 0; uint64_t pairCnt = 0; - IOR_offset_t *offsetArray; int pretendRank; IOR_offset_t dataMoved = 0; /* for data rate calculation */ double startForStonewall; int hitStonewall; + int i, j; IOR_point_t *point = ((access == WRITE) || (access == WRITECHECK)) ? 
&results->write : &results->read; /* initialize values */ pretendRank = (rank + rankOffset) % test->numTasks; + // offsetArray = GetOffsetArraySequential(test, pretendRank); + + IOR_offset_t offsets; + IOR_offset_t * offsets_rnd; if (test->randomOffset) { - offsetArray = GetOffsetArrayRandom(test, pretendRank, access); - } else { - offsetArray = GetOffsetArraySequential(test, pretendRank); + offsets_rnd = GetOffsetArrayRandom(test, pretendRank, & offsets); + }else{ + offsets = (test->blockSize / test->transferSize); } + void * randomPrefillBuffer = NULL; + if(test->randomPrefillBlocksize && (access == WRITE || access == WRITECHECK)){ + randomPrefillBuffer = aligned_buffer_alloc(test->randomPrefillBlocksize, test->gpuMemoryFlags); + // store invalid data into the buffer + memset(randomPrefillBuffer, -1, test->randomPrefillBlocksize); + } + + // start timer after random offset was generated startForStonewall = GetTimeStamp(); hitStonewall = 0; - /* loop over offsets to access */ - while ((offsetArray[pairCnt] != -1) && !hitStonewall ) { - dataMoved += WriteOrReadSingle(pairCnt, offsetArray, pretendRank, & transferCount, & errors, test, fd, ioBuffers, access); - pairCnt++; + if(randomPrefillBuffer && test->deadlineForStonewalling == 0){ + double t_start = GetTimeStamp(); + prefillSegment(test, randomPrefillBuffer, pretendRank, fd, ioBuffers, 0, test->segmentCount); + if(rank == 0 && verbose > VERBOSE_1){ + fprintf(out_logfile, "Random prefill took: %fs\n", GetTimeStamp() - t_start); + } + // must synchronize processes to ensure they are not running ahead + MPI_Barrier(test->testComm); + } - hitStonewall = ((test->deadlineForStonewalling != 0 - && (GetTimeStamp() - startForStonewall) - > test->deadlineForStonewalling)) || (test->stoneWallingWearOutIterations != 0 && pairCnt == test->stoneWallingWearOutIterations) ; + for (i = 0; i < test->segmentCount && !hitStonewall; i++) { + if(randomPrefillBuffer && test->deadlineForStonewalling != 0){ + // prefill the whole segment with data, this needs to be done collectively + double t_start = GetTimeStamp(); + prefillSegment(test, randomPrefillBuffer, pretendRank, fd, ioBuffers, i, i+1); + MPI_Barrier(test->testComm); + if(rank == 0 && verbose > VERBOSE_1){ + fprintf(out_logfile, "Random: synchronizing segment count with barrier and prefill took: %fs\n", GetTimeStamp() - t_start); + } + } + for (j = 0; j < offsets && !hitStonewall ; j++) { + IOR_offset_t offset; + if (test->randomOffset) { + if(test->filePerProc){ + offset = offsets_rnd[j] + (i * test->blockSize); + }else{ + offset = offsets_rnd[j] + (i * test->numTasks * test->blockSize); + } + }else{ + offset = j * test->transferSize; + if (test->filePerProc) { + offset += i * test->blockSize; + } else { + offset += (i * test->numTasks * test->blockSize) + (pretendRank * test->blockSize); + } + } + dataMoved += WriteOrReadSingle(offset, pretendRank, test->transferSize, & transferCount, & errors, test, fd, ioBuffers, access); + pairCnt++; - if ( test->collective && test->deadlineForStonewalling ) { - // if collective-mode, you'll get a HANG, if some rank 'accidentally' leave this loop - // it absolutely must be an 'all or none': - MPI_CHECK(MPI_Bcast(&hitStonewall, 1, MPI_INT, 0, MPI_COMM_WORLD), "hitStonewall broadcast failed"); - } + hitStonewall = ((test->deadlineForStonewalling != 0 + && (GetTimeStamp() - startForStonewall) > test->deadlineForStonewalling)) + || (test->stoneWallingWearOutIterations != 0 && pairCnt == test->stoneWallingWearOutIterations) ; + if ( test->collective && 
test->deadlineForStonewalling ) { + // in collective mode, you'll get a HANG if some rank 'accidentally' leaves this loop + // it absolutely must be an 'all or none': + MPI_CHECK(MPI_Bcast(&hitStonewall, 1, MPI_INT, 0, testComm), "hitStonewall broadcast failed"); + } + } } if (test->stoneWallingWearOut){ if (verbose >= VERBOSE_1){ @@ -1931,32 +1799,57 @@ static IOR_offset_t WriteOrRead(IOR_param_t *test, IOR_results_t *results, 1, MPI_LONG_LONG_INT, MPI_MIN, 0, testComm), "cannot reduce pairs moved"); MPI_CHECK(MPI_Reduce(& data_moved_ll, &point->stonewall_min_data_accessed, 1, MPI_LONG_LONG_INT, MPI_MIN, 0, testComm), "cannot reduce pairs moved"); - MPI_CHECK(MPI_Reduce(& data_moved_ll, &point->stonewall_avg_data_accessed, + MPI_CHECK(MPI_Reduce(& data_moved_ll, &point->stonewall_total_data_accessed, 1, MPI_LONG_LONG_INT, MPI_SUM, 0, testComm), "cannot reduce pairs moved"); if(rank == 0){ + point->stonewall_avg_data_accessed = point->stonewall_total_data_accessed / test->numTasks; fprintf(out_logfile, "stonewalling pairs accessed min: %lld max: %zu -- min data: %.1f GiB mean data: %.1f GiB time: %.1fs\n", pairs_accessed_min, point->pairs_accessed, - point->stonewall_min_data_accessed /1024.0 / 1024 / 1024, point->stonewall_avg_data_accessed / 1024.0 / 1024 / 1024 / test->numTasks , point->stonewall_time); - point->stonewall_min_data_accessed *= test->numTasks; + point->stonewall_min_data_accessed /1024.0 / 1024 / 1024, point->stonewall_avg_data_accessed / 1024.0 / 1024 / 1024 , point->stonewall_time); } if(pairCnt != point->pairs_accessed){ - // some work needs still to be done ! - for(; pairCnt < point->pairs_accessed; pairCnt++ ) { - dataMoved += WriteOrReadSingle(pairCnt, offsetArray, pretendRank, & transferCount, & errors, test, fd, ioBuffers, access); + // some work still needs to be done, complete the current block!
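+ // catch-up: rewind to the unfinished block, then keep issuing transfers until pairCnt reaches point->pairs_accessed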
+ i--; + if(j == offsets){ + j = 0; // current block is completed + i++; + } + for ( ; pairCnt < point->pairs_accessed; i++) { + for ( ; j < offsets && pairCnt < point->pairs_accessed ; j++) { + IOR_offset_t offset; + if (test->randomOffset) { + if(test->filePerProc){ + offset = offsets_rnd[j] + (i * test->blockSize); + }else{ + offset = offsets_rnd[j] + (i * test->numTasks * test->blockSize); + } + }else{ + offset = j * test->transferSize; + if (test->filePerProc) { + offset += i * test->blockSize; + } else { + offset += (i * test->numTasks * test->blockSize) + (pretendRank * test->blockSize); + } + } + dataMoved += WriteOrReadSingle(offset, pretendRank, test->transferSize, & transferCount, & errors, test, fd, ioBuffers, access); + pairCnt++; + } + j = 0; } } }else{ point->pairs_accessed = pairCnt; } - totalErrorCount += CountErrors(test, access, errors); - free(offsetArray); - if (access == WRITE && test->fsync == TRUE) { backend->fsync(fd, test->backend_options); /*fsync after all accesses */ } + if(randomPrefillBuffer){ + aligned_buffer_free(randomPrefillBuffer, test->gpuMemoryFlags); + } + return (dataMoved); } diff --git a/src/ior.h b/src/ior.h index c3d9ad4..2effa9a 100755 --- a/src/ior.h +++ b/src/ior.h @@ -39,19 +39,19 @@ #include "iordef.h" #include "aiori.h" +#include + +#ifndef MPI_FILE_NULL +# include +#endif /* not MPI_FILE_NULL */ + #define ISPOWEROFTWO(x) ((x != 0) && !(x & (x - 1))) -/******************** DATA Packet Type ***************************************/ -/* Holds the types of data packets: generic, offset, timestamp, incompressible */ - -enum PACKET_TYPE -{ - generic = 0, /* No packet type specified */ - timestamp=1, /* Timestamp packet set with -l */ - offset=2, /* Offset packet set with -l */ - incompressible=3 /* Incompressible packet set with -l */ - -}; +typedef enum{ + IOR_MEMORY_TYPE_CPU = 0, + IOR_MEMORY_TYPE_GPU_MANAGED = 1, + IOR_MEMORY_TYPE_GPU_DEVICE_ONLY = 2, +} ior_memory_flags; /***************** IOR_BUFFERS *************************************************/ @@ -92,9 +92,13 @@ typedef struct char * options; /* options string */ // intermediate options int collective; /* collective I/O */ - MPI_Comm testComm; /* MPI communicator */ + MPI_Comm testComm; /* Current MPI communicator */ + MPI_Comm mpi_comm_world; /* The global MPI communicator */ int dryRun; /* do not perform any I/Os just run evtl. 
inputs print dummy output */ - int dualMount; /* dual mount points */ + int dualMount; /* dual mount points */ + ior_memory_flags gpuMemoryFlags; /* use the GPU to store the data */ + int gpuDirect; /* use gpuDirect, this influences gpuMemoryFlags as well */ + int gpuID; /* the GPU to use for gpuDirect or memory options */ int numTasks; /* number of tasks for test */ int numNodes; /* number of nodes for test */ int numTasksOnNode0; /* number of tasks on node 0 (usually all the same, but don't have to be, use with caution) */ @@ -117,18 +121,18 @@ typedef struct int keepFile; /* don't delete the testfile on exit */ int keepFileWithError; /* don't delete the testfile with errors */ int errorFound; /* error found in data check */ - int quitOnError; /* quit code when error in check */ IOR_offset_t segmentCount; /* number of segments (or HDF5 datasets) */ IOR_offset_t blockSize; /* contiguous bytes to write per task */ IOR_offset_t transferSize; /* size of transfer in bytes */ IOR_offset_t expectedAggFileSize; /* calculated aggregate file size */ + IOR_offset_t randomPrefillBlocksize; /* prefill option for random IO, the amount of data used for prefill */ + char * saveRankDetailsCSV; /* save the details about the performance to a file */ int summary_every_test; /* flag to print summary every test, not just at end */ int uniqueDir; /* use unique directory for each fpp */ int useExistingTestFile; /* do not delete test file before access */ - int storeFileOffset; /* use file offset as stored signature */ int deadlineForStonewalling; /* max time in seconds to run any test phase */ - int stoneWallingWearOut; /* wear out the stonewalling, once the timout is over, each process has to write the same amount */ + int stoneWallingWearOut; /* wear out the stonewalling, once the timeout is over, each process has to write the same amount */ uint64_t stoneWallingWearOutIterations; /* the number of iterations for the stonewallingWearOut, needed for readBack */ char * stoneWallingStatusFile; @@ -145,7 +149,7 @@ typedef struct char * memoryPerNodeStr; /* for parsing */ char * testscripts; /* for parsing */ char * buffer_type; /* for parsing */ - enum PACKET_TYPE dataPacketType; /* The type of data packet. */ + ior_dataPacketType_e dataPacketType; /* The type of data packet. */ void * backend_options; /* Backend-specific options */ @@ -154,27 +158,15 @@ typedef struct int fsyncPerWrite; /* fsync() after each write */ int fsync; /* fsync() after write */ - /* HDFS variables */ - char * hdfs_user; /* copied from ENV, for now */ - const char* hdfs_name_node; - tPort hdfs_name_node_port; /* (uint16_t) */ - hdfsFS hdfs_fs; /* file-system handle */ - int hdfs_replicas; /* n block replicas. (0 gets default) */ - int hdfs_block_size; /* internal blk-size. (0 gets default) */ - char* URI; /* "path" to target object */ - size_t part_number; /* multi-part upload increment (PER-RANK!) 
*/ - char* UploadId; /* key for multi-part-uploads */ /* RADOS variables */ rados_t rados_cluster; /* RADOS cluster handle */ rados_ioctx_t rados_ioctx; /* I/O context for our pool in the RADOS cluster */ - /* NCMPI variables */ - int var_id; /* variable id handle for data set */ - int id; /* test's unique ID */ int intraTestBarriers; /* barriers between open/op and op/close */ + int warningAsErrors; /* treat any warning as an error */ aiori_xfer_hint_t hints; } IOR_param_t; @@ -185,8 +177,9 @@ typedef struct { size_t pairs_accessed; // number of I/Os done, useful for deadlineForStonewalling double stonewall_time; - long long stonewall_min_data_accessed; - long long stonewall_avg_data_accessed; + long long stonewall_min_data_accessed; // of all processes + long long stonewall_avg_data_accessed; // across all processes + long long stonewall_total_data_accessed; // sum accross all processes IOR_offset_t aggFileSizeFromStat; IOR_offset_t aggFileSizeFromXfer; @@ -210,7 +203,7 @@ IOR_test_t *CreateTest(IOR_param_t *init_params, int test_num); void AllocResults(IOR_test_t *test); char * GetPlatformName(void); -void init_IOR_Param_t(IOR_param_t *p); +void init_IOR_Param_t(IOR_param_t *p, MPI_Comm global_com); /* * This function runs IOR given by command line, useful for testing diff --git a/src/iordef.h b/src/iordef.h index 4c46b29..79f98f1 100755 --- a/src/iordef.h +++ b/src/iordef.h @@ -18,8 +18,12 @@ #include #include #include -#include -#include + +typedef enum { + DATA_TIMESTAMP, /* Will not include any offset, hence each buffer will be the same */ + DATA_OFFSET, + DATA_INCOMPRESSIBLE /* Will include the offset as well */ +} ior_dataPacketType_e; #ifdef _WIN32 # define _CRT_SECURE_NO_WARNINGS @@ -52,13 +56,6 @@ # include #endif -/************************** D E C L A R A T I O N S ***************************/ - -extern int numTasks; /* MPI variables */ -extern int rank; -extern int rankOffset; -extern int verbose; /* verbose output */ - /*************************** D E F I N I T I O N S ****************************/ enum OutputFormat_t{ @@ -115,117 +112,11 @@ enum OutputFormat_t{ #define DELIMITERS " \t\r\n=" /* ReadScript() */ #define FILENAME_DELIMITER '@' /* ParseFileName() */ -/* MACROs for debugging */ -#define HERE fprintf(stdout, "** LINE %d (TASK=%d) **\n", \ - __LINE__, rank); - typedef long long int IOR_offset_t; typedef long long int IOR_size_t; #define IOR_format "%016llx" - -/******************************** M A C R O S *********************************/ - -/******************************************************************************/ -/* - * WARN_RESET will display a custom error message and set value to default - */ -#define WARN_RESET(MSG, TO_STRUCT_PTR, FROM_STRUCT_PTR, MEMBER) do { \ - (TO_STRUCT_PTR)->MEMBER = (FROM_STRUCT_PTR)->MEMBER; \ - if (rank == 0) { \ - fprintf(stdout, "ior WARNING: %s. Using value of %d.\n", \ - MSG, (TO_STRUCT_PTR)->MEMBER); \ - } \ - fflush(stdout); \ -} while (0) - - -#define WARN(MSG) do { \ - if (verbose > VERBOSE_2) { \ - fprintf(stdout, "ior WARNING: %s, (%s:%d).\n", \ - MSG, __FILE__, __LINE__); \ - } else { \ - fprintf(stdout, "ior WARNING: %s.\n", MSG); \ - } \ - fflush(stdout); \ -} while (0) - - -/* warning with format string and errno printed */ -#define EWARNF(FORMAT, ...) 
do { \ - if (verbose > VERBOSE_2) { \ - fprintf(stdout, "ior WARNING: " FORMAT ", errno %d, %s (%s:%d).\n", \ - __VA_ARGS__, errno, strerror(errno), __FILE__, __LINE__); \ - } else { \ - fprintf(stdout, "ior WARNING: " FORMAT ", errno %d, %s \n", \ - __VA_ARGS__, errno, strerror(errno)); \ - } \ - fflush(stdout); \ -} while (0) - - -/* warning with errno printed */ -#define EWARN(MSG) do { \ - EWARNF("%s", MSG); \ -} while (0) - - -/* display error message with format string and terminate execution */ -#define ERRF(FORMAT, ...) do { \ - fprintf(stdout, "ior ERROR: " FORMAT ", errno %d, %s (%s:%d)\n", \ - __VA_ARGS__, errno, strerror(errno), __FILE__, __LINE__); \ - fflush(stdout); \ - MPI_Abort(MPI_COMM_WORLD, -1); \ -} while (0) - - -/* display error message and terminate execution */ -#define ERR_ERRNO(MSG) do { \ - ERRF("%s", MSG); \ -} while (0) - - -/* display a simple error message (i.e. errno is not set) and terminate execution */ -#define ERR(MSG) do { \ - fprintf(stdout, "ior ERROR: %s, (%s:%d)\n", \ - MSG, __FILE__, __LINE__); \ - fflush(stdout); \ - MPI_Abort(MPI_COMM_WORLD, -1); \ -} while (0) - - -/******************************************************************************/ -/* - * MPI_CHECKF will display a custom format string as well as an error string - * from the MPI_STATUS and then exit the program - */ - -#define MPI_CHECKF(MPI_STATUS, FORMAT, ...) do { \ - char resultString[MPI_MAX_ERROR_STRING]; \ - int resultLength; \ - \ - if (MPI_STATUS != MPI_SUCCESS) { \ - MPI_Error_string(MPI_STATUS, resultString, &resultLength); \ - fprintf(stdout, "ior ERROR: " FORMAT ", MPI %s, (%s:%d)\n", \ - __VA_ARGS__, resultString, __FILE__, __LINE__); \ - fflush(stdout); \ - MPI_Abort(MPI_COMM_WORLD, -1); \ - } \ -} while(0) - - -/******************************************************************************/ -/* - * MPI_CHECK will display a custom error message as well as an error string - * from the MPI_STATUS and then exit the program - */ - -#define MPI_CHECK(MPI_STATUS, MSG) do { \ - MPI_CHECKF(MPI_STATUS, "%s", MSG); \ -} while(0) - - /******************************************************************************/ /* * System info for Windows. diff --git a/src/md-workbench-main.c b/src/md-workbench-main.c new file mode 100644 index 0000000..bb94126 --- /dev/null +++ b/src/md-workbench-main.c @@ -0,0 +1,13 @@ +#include + +#include "md-workbench.h" + +int main(int argc, char ** argv){ + MPI_Init(& argc, & argv); + //phase_stat_t* results = + md_workbench_run(argc, argv, MPI_COMM_WORLD, stdout); + // API check, access the results of the first phase which is precrate. + //printf("Max op runtime: %f\n", results->max_op_time); + MPI_Finalize(); + return 0; +} diff --git a/src/md-workbench.c b/src/md-workbench.c new file mode 100644 index 0000000..4b3372d --- /dev/null +++ b/src/md-workbench.c @@ -0,0 +1,1053 @@ +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "md-workbench.h" +#include "config.h" +#include "aiori.h" +#include "utilities.h" +#include "parse_options.h" + +/* +This is the modified version md-workbench-fs that can utilize AIORI. +It follows the hierarchical file system semantics in contrast to the md-workbench (without -fs) which has dataset and object semantics. + */ + +#define DIRMODE S_IRUSR|S_IWUSR|S_IXUSR|S_IRGRP|S_IWGRP|S_IXGRP|S_IROTH|S_IXOTH + +#define CHECK_MPI_RET(ret) if (ret != MPI_SUCCESS){ printf("Unexpected error in MPI on Line %d\n", __LINE__);} +#define LLU (long long unsigned) +#define min(a,b) (a < b ? 
a : b) + +#define oprintf(...) do { fprintf(o.logfile, __VA_ARGS__); fflush(o.logfile); } while(0); + +// successfull, errors +typedef struct { + int suc; + int err; +} op_stat_t; + +// A runtime for an operation and when the operation was started +typedef struct{ + float time_since_app_start; + float runtime; +} time_result_t; + + +// statistics for running a single phase +typedef struct{ // NOTE: if this type is changed, adjust end_phase() !!! + double t; // maximum time + double * t_all; + + op_stat_t dset_create; + op_stat_t dset_delete; + + op_stat_t obj_create; + op_stat_t obj_read; + op_stat_t obj_stat; + op_stat_t obj_delete; + + // time measurements of individual runs, these are not returned for now by the API! + uint64_t repeats; + time_result_t * time_create; + time_result_t * time_read; + time_result_t * time_stat; + time_result_t * time_delete; + + time_statistics_t stats_create; + time_statistics_t stats_read; + time_statistics_t stats_stat; + time_statistics_t stats_delete; + + // the maximum time for any single operation + double max_op_time; + double phase_start_timer; + int stonewall_iterations; +} phase_stat_t; + +struct benchmark_options{ + ior_aiori_t const * backend; + void * backend_options; + aiori_xfer_hint_t hints; + MPI_Comm com; + FILE * logfile; + + char * interface; + int num; + int precreate; + int dset_count; + + mdworkbench_results_t * results; // the results + + ior_dataPacketType_e dataPacketType; + char * packetTypeStr; + int offset; + int iterations; + int global_iteration; + int file_size; + int read_only; + int stonewall_timer; + int stonewall_timer_wear_out; + int gpu_memory_flags; /* use the GPU to store the data */ + + char * latency_file_prefix; + int latency_keep_all; + + int phase_cleanup; + int phase_precreate; + int phase_benchmark; + + //int limit_memory; + //int limit_memory_between_phases; + + int verbosity; + int process_report; + + int print_detailed_stats; + int quiet_output; + + char * run_info_file; + char * prefix; // directory to work on + + int ignore_precreate_errors; + int rank; + int size; + int verify_read; + int random_seed; + + float relative_waiting_factor; + int adaptive_waiting_mode; + + uint64_t start_item_number; +}; + +struct benchmark_options o; + +static void def_dset_name(char * out_name, int n, int d){ + sprintf(out_name, "%s/%d_%d", o.prefix, n, d); +} + +static void def_obj_name(char * out_name, int n, int d, int i){ + sprintf(out_name, "%s/%d_%d/file-%d", o.prefix, n, d, i); +} + +void init_options(){ + o = (struct benchmark_options){ + .interface = "POSIX", + .prefix = "./out", + .num = 1000, + .random_seed = -1, + .precreate = 3000, + .dset_count = 10, + .offset = 1, + .iterations = 3, + .file_size = 3901, + .packetTypeStr = "t", + .run_info_file = "md-workbench.status"}; +} + +static void mdw_wait(double runtime){ + double waittime = runtime * o.relative_waiting_factor; + //printf("waittime: %e\n", waittime); + if(waittime < 0.01){ + double start; + start = GetTimeStamp(); + double cur = GetTimeStamp(); + double end = cur + waittime; + while (cur < end){ + cur = GetTimeStamp(); + } + }else{ + struct timespec w; + w.tv_sec = (time_t) (waittime); + w.tv_nsec = (long) ((waittime - w.tv_sec) * 1000 * 1000 * 1000); + nanosleep(& w, NULL); + } +} + +static void init_stats(phase_stat_t * p, size_t repeats){ + memset(p, 0, sizeof(phase_stat_t)); + p->repeats = repeats; + size_t timer_size = repeats * sizeof(time_result_t); + p->time_create = (time_result_t *) malloc(timer_size); + p->time_read = (time_result_t *) 
malloc(timer_size); + p->time_stat = (time_result_t *) malloc(timer_size); + p->time_delete = (time_result_t *) malloc(timer_size); +} + +static float add_timed_result(double start, double phase_start_timer, time_result_t * results, size_t pos, double * max_time, double * out_op_time){ + float curtime = start - phase_start_timer; + double op_time = GetTimeStamp() - start; + results[pos].runtime = (float) op_time; + results[pos].time_since_app_start = curtime; + if (op_time > *max_time){ + *max_time = op_time; + } + *out_op_time = op_time; + return curtime; +} + +static void print_detailed_stat_header(){ + printf("phase\t\td name\tcreate\tdelete\tob nam\tcreate\tread\tstat\tdelete\tt_inc_b\tt_no_bar\tthp\tmax_t\n"); +} + +static int sum_err(phase_stat_t * p){ + return p->dset_create.err + p->dset_delete.err + p->obj_create.err + p->obj_read.err + p->obj_stat.err + p->obj_delete.err; +} + +static double statistics_mean(int count, double * arr){ + double sum = 0; + for(int i=0; i < o.size; i++){ + sum += arr[i]; + } + return sum / o.size; +} + +static double statistics_std_dev(int count, double * arr){ + double mean = statistics_mean(count, arr); + double sum = 0; + for(int i=0; i < o.size; i++){ + sum += (mean - arr[i])*(mean - arr[i]); + } + return sqrt(sum / (o.size-1)); +} + +static void statistics_minmax(int count, double * arr, double * out_min, double * out_max){ + double min = 1e308; + double max = 0; + for(int i=0; i < o.size; i++){ + min = (arr[i] < min) ? arr[i] : min; + max = (arr[i] > max) ? arr[i] : max; + } + *out_min = min; + *out_max = max; +} + +static void print_p_stat(char * buff, const char * name, phase_stat_t * p, double t, int print_global){ + const double tp = (double)(p->obj_create.suc + p->obj_read.suc) * o.file_size / t / 1024 / 1024; + + const int errs = sum_err(p); + double r_min = 0; + double r_max = 0; + double r_mean = 0; + double r_std = 0; + + if(p->t_all){ + // we can compute several derived values that provide insight about quality of service, latency distribution and load balancing + statistics_minmax(o.size, p->t_all, & r_min, & r_max); + r_mean = statistics_mean(o.size, p->t_all); + r_std = statistics_std_dev(o.size, p->t_all); + } + + if (o.print_detailed_stats){ + sprintf(buff, "%s \t%d\t%d\t%d\t%d\t%d\t%d\t%.3fs\t%.3fs\t%.2f MiB/s %.4e", name, p->dset_create.suc, p->dset_delete.suc, p->obj_create.suc, p->obj_read.suc, p->obj_stat.suc, p->obj_delete.suc, p->t, t, tp, p->max_op_time); + + if (errs > 0){ + sprintf(buff, "%s err\t%d\t%d\t%d\t%d\t%d\t%d", name, p->dset_create.err, p->dset_delete.err, p->obj_create.err, p->obj_read.err, p->obj_stat.err, p->obj_delete.err); + } + }else{ + int pos = 0; + // single line + pos += sprintf(buff, "%s process max:%.2fs ", name, t); + if(print_global){ + pos += sprintf(buff + pos, "min:%.2fs mean: %.2fs balance:%.1f stddev:%.1f ", r_min, r_mean, r_min/r_max * 100.0, r_std); + } + int ioops_per_iter = 4; + if(o.read_only){ + ioops_per_iter = 2; + } + + double rate; + + switch(name[0]){ + case('b'): + rate = p->obj_read.suc * ioops_per_iter / t; + pos += sprintf(buff + pos, "rate:%.1f iops/s objects:%d rate:%.1f obj/s tp:%.1f MiB/s op-max:%.4es", + rate, // write, stat, read, delete + p->obj_read.suc, + p->obj_read.suc / t, + tp, + p->max_op_time); + + if(o.relative_waiting_factor > 1e-9){ + pos += sprintf(buff + pos, " waiting_factor:%.2f", o.relative_waiting_factor); + } + break; + case('p'): + rate = (p->dset_create.suc + p->obj_create.suc) / t; + pos += sprintf(buff + pos, "rate:%.1f iops/s dsets: %d objects:%d 
rate:%.3f dset/s rate:%.1f obj/s tp:%.1f MiB/s op-max:%.4es", + rate, + p->dset_create.suc, + p->obj_create.suc, + p->dset_create.suc / t, + p->obj_create.suc / t, + tp, + p->max_op_time); + break; + case('c'): + rate = (p->obj_delete.suc + p->dset_delete.suc) / t; + pos += sprintf(buff + pos, "rate:%.1f iops/s objects:%d dsets: %d rate:%.1f obj/s rate:%.3f dset/s op-max:%.4es", + rate, + p->obj_delete.suc, + p->dset_delete.suc, + p->obj_delete.suc / t, + p->dset_delete.suc / t, + p->max_op_time); + break; + default: + pos = sprintf(buff, "%s: unknown phase", name); + break; + } + + if(print_global){ + mdworkbench_result_t * res = & o.results->result[o.results->count]; + res->errors = errs; + o.results->errors += errs; + res->rate = rate; + res->max_op_time = p->max_op_time; + res->runtime = t; + res->iterations_done = p->repeats; + } + + if(! o.quiet_output || errs > 0){ + pos += sprintf(buff + pos, " (%d errs", errs); + if(errs > 0){ + pos += sprintf(buff + pos, "!!!)" ); + }else{ + pos += sprintf(buff + pos, ")" ); + } + } + if(! o.quiet_output && p->stonewall_iterations){ + pos += sprintf(buff + pos, " stonewall-iter:%d", p->stonewall_iterations); + } + + if(p->stats_read.max > 1e-9){ + time_statistics_t stat = p->stats_read; + pos += sprintf(buff + pos, " read(%.4es, %.4es, %.4es, %.4es, %.4es, %.4es, %.4es)", stat.min, stat.q1, stat.median, stat.q3, stat.q90, stat.q99, stat.max); + } + if(p->stats_stat.max > 1e-9){ + time_statistics_t stat = p->stats_stat; + pos += sprintf(buff + pos, " stat(%.4es, %.4es, %.4es, %.4es, %.4es, %.4es, %.4es)", stat.min, stat.q1, stat.median, stat.q3, stat.q90, stat.q99, stat.max); + } + if(p->stats_create.max > 1e-9){ + time_statistics_t stat = p->stats_create; + pos += sprintf(buff + pos, " create(%.4es, %.4es, %.4es, %.4es, %.4es, %.4es, %.4es)", stat.min, stat.q1, stat.median, stat.q3, stat.q90, stat.q99, stat.max); + } + if(p->stats_delete.max > 1e-9){ + time_statistics_t stat = p->stats_delete; + pos += sprintf(buff + pos, " delete(%.4es, %.4es, %.4es, %.4es, %.4es, %.4es, %.4es)", stat.min, stat.q1, stat.median, stat.q3, stat.q90, stat.q99, stat.max); + } + } +} + +static int compare_floats(time_result_t * x, time_result_t * y){ + return x->runtime < y->runtime ? -1 : (x->runtime > y->runtime ? 
+1 : 0); +} + +static double runtime_quantile(int repeats, time_result_t * times, float quantile){ + int pos = round(quantile * (repeats - 1) + 0.49); + assert(pos < repeats); + return times[pos].runtime; +} + +static uint64_t aggregate_timers(int repeats, int max_repeats, time_result_t * times, time_result_t * global_times){ + uint64_t count = 0; + int ret; + // due to stonewall, the number of repeats may be different per process + if(o.rank == 0){ + MPI_Status status; + memcpy(global_times, times, repeats * 2 * sizeof(float)); + count += repeats; + for(int i=1; i < o.size; i++){ + int cnt; + ret = MPI_Recv(& global_times[count], max_repeats*2, MPI_FLOAT, i, 888, o.com, & status); + CHECK_MPI_RET(ret) + MPI_Get_count(& status, MPI_FLOAT, & cnt); + count += cnt / 2; + } + }else{ + ret = MPI_Send(times, repeats * 2, MPI_FLOAT, 0, 888, o.com); + CHECK_MPI_RET(ret) + } + + return count; +} + +static void compute_histogram(const char * name, time_result_t * times, time_statistics_t * stats, size_t repeats, int writeLatencyFile){ + if(writeLatencyFile && o.latency_file_prefix ){ + char file[MAX_PATHLEN]; + sprintf(file, "%s-%.2f-%d-%s.csv", o.latency_file_prefix, o.relative_waiting_factor, o.global_iteration, name); + FILE * f = fopen(file, "w+"); + if(f == NULL){ + ERRF("%d: Error writing to latency file: %s", o.rank, file); + return; + } + fprintf(f, "time,runtime\n"); + for(size_t i = 0; i < repeats; i++){ + fprintf(f, "%.7f,%.4e\n", times[i].time_since_app_start, times[i].runtime); + } + fclose(f); + } + // now sort the times and pick the quantiles + qsort(times, repeats, sizeof(time_result_t), (int (*)(const void *, const void *)) compare_floats); + stats->min = times[0].runtime; + stats->q1 = runtime_quantile(repeats, times, 0.25); + if(repeats % 2 == 0){ + stats->median = (times[repeats/2].runtime + times[repeats/2 - 1].runtime)/2.0; + }else{ + stats->median = times[repeats/2].runtime; + } + stats->q3 = runtime_quantile(repeats, times, 0.75); + stats->q90 = runtime_quantile(repeats, times, 0.90); + stats->q99 = runtime_quantile(repeats, times, 0.99); + stats->max = times[repeats - 1].runtime; +} + +static void end_phase(const char * name, phase_stat_t * p){ + int ret; + char buff[MAX_PATHLEN]; + + //char * limit_memory_P = NULL; + MPI_Barrier(o.com); + + int max_repeats = o.precreate * o.dset_count; + if(strcmp(name,"benchmark") == 0){ + max_repeats = o.num * o.dset_count; + } + + // prepare the summarized report + phase_stat_t g_stat; + init_stats(& g_stat, (o.rank == 0 ? 1 : 0) * ((size_t) max_repeats) * o.size); + // reduce timers + ret = MPI_Reduce(& p->t, & g_stat.t, 2, MPI_DOUBLE, MPI_MAX, 0, o.com); + CHECK_MPI_RET(ret) + if(o.rank == 0) { + g_stat.t_all = (double*) malloc(sizeof(double) * o.size); + } + ret = MPI_Gather(& p->t, 1, MPI_DOUBLE, g_stat.t_all, 1, MPI_DOUBLE, 0, o.com); + CHECK_MPI_RET(ret) + ret = MPI_Reduce(& p->dset_create, & g_stat.dset_create, 2*(2+4), MPI_INT, MPI_SUM, 0, o.com); + CHECK_MPI_RET(ret) + ret = MPI_Reduce(& p->max_op_time, & g_stat.max_op_time, 1, MPI_DOUBLE, MPI_MAX, 0, o.com); + CHECK_MPI_RET(ret) + if( p->stonewall_iterations ){ + ret = MPI_Reduce(& p->repeats, & g_stat.repeats, 1, MPI_UINT64_T, MPI_MIN, 0, o.com); + CHECK_MPI_RET(ret) + g_stat.stonewall_iterations = p->stonewall_iterations; + } + int write_rank0_latency_file = (o.rank == 0) && ! 
o.latency_keep_all; + + if(strcmp(name,"precreate") == 0){ + uint64_t repeats = aggregate_timers(p->repeats, max_repeats, p->time_create, g_stat.time_create); + if(o.rank == 0){ + compute_histogram("precreate-all", g_stat.time_create, & g_stat.stats_create, repeats, o.latency_keep_all); + } + compute_histogram("precreate", p->time_create, & p->stats_create, p->repeats, write_rank0_latency_file); + }else if(strcmp(name,"cleanup") == 0){ + uint64_t repeats = aggregate_timers(p->repeats, max_repeats, p->time_delete, g_stat.time_delete); + if(o.rank == 0) { + compute_histogram("cleanup-all", g_stat.time_delete, & g_stat.stats_delete, repeats, o.latency_keep_all); + } + compute_histogram("cleanup", p->time_delete, & p->stats_delete, p->repeats, write_rank0_latency_file); + }else if(strcmp(name,"benchmark") == 0){ + uint64_t repeats = aggregate_timers(p->repeats, max_repeats, p->time_read, g_stat.time_read); + if(o.rank == 0) { + compute_histogram("read-all", g_stat.time_read, & g_stat.stats_read, repeats, o.latency_keep_all); + } + compute_histogram("read", p->time_read, & p->stats_read, p->repeats, write_rank0_latency_file); + + repeats = aggregate_timers(p->repeats, max_repeats, p->time_stat, g_stat.time_stat); + if(o.rank == 0) { + compute_histogram("stat-all", g_stat.time_stat, & g_stat.stats_stat, repeats, o.latency_keep_all); + } + compute_histogram("stat", p->time_stat, & p->stats_stat, p->repeats, write_rank0_latency_file); + + if(! o.read_only){ + repeats = aggregate_timers(p->repeats, max_repeats, p->time_create, g_stat.time_create); + if(o.rank == 0) { + compute_histogram("create-all", g_stat.time_create, & g_stat.stats_create, repeats, o.latency_keep_all); + } + compute_histogram("create", p->time_create, & p->stats_create, p->repeats, write_rank0_latency_file); + + repeats = aggregate_timers(p->repeats, max_repeats, p->time_delete, g_stat.time_delete); + if(o.rank == 0) { + compute_histogram("delete-all", g_stat.time_delete, & g_stat.stats_delete, repeats, o.latency_keep_all); + } + compute_histogram("delete", p->time_delete, & p->stats_delete, p->repeats, write_rank0_latency_file); + } + } + + if (o.rank == 0){ + //print the stats: + print_p_stat(buff, name, & g_stat, g_stat.t, 1); + oprintf("%s\n", buff); + } + + if(o.process_report){ + if(o.rank == 0){ + print_p_stat(buff, name, p, p->t, 0); + oprintf("0: %s\n", buff); + for(int i=1; i < o.size; i++){ + MPI_Recv(buff, MAX_PATHLEN, MPI_CHAR, i, 4711, o.com, MPI_STATUS_IGNORE); + oprintf("%d: %s\n", i, buff); + } + }else{ + print_p_stat(buff, name, p, p->t, 0); + MPI_Send(buff, MAX_PATHLEN, MPI_CHAR, 0, 4711, o.com); + } + } + + if(g_stat.t_all){ + free(g_stat.t_all); + } + if(p->time_create){ + free(p->time_create); + free(p->time_read); + free(p->time_stat); + free(p->time_delete); + } + if(g_stat.time_create){ + free(g_stat.time_create); + free(g_stat.time_read); + free(g_stat.time_stat); + free(g_stat.time_delete); + } + + // copy the result back for the API + mdworkbench_result_t * res = & o.results->result[o.results->count]; + memcpy(& res->stats_create, & g_stat.stats_create, sizeof(time_statistics_t)); + memcpy(& res->stats_read, & g_stat.stats_read, sizeof(time_statistics_t)); + memcpy(& res->stats_stat, & g_stat.stats_stat, sizeof(time_statistics_t)); + memcpy(& res->stats_delete, & g_stat.stats_delete, sizeof(time_statistics_t)); + + o.results->count++; + + // allocate memory if necessary + // ret = mem_preallocate(& limit_memory_P, o.limit_memory_between_phases, o.verbosity >= 3); + // if( ret != 0){ + // printf("%d: 
Error allocating memory!\n", o.rank); + // } + // mem_free_preallocated(& limit_memory_P); +} + +void run_precreate(phase_stat_t * s, int current_index){ + char dset[MAX_PATHLEN]; + char obj_name[MAX_PATHLEN]; + int ret; + + for(int i=0; i < o.dset_count; i++){ + def_dset_name(dset, o.rank, i); + + ret = o.backend->mkdir(dset, DIRMODE, o.backend_options); + if (ret == 0){ + s->dset_create.suc++; + }else{ + s->dset_create.err++; + if (! o.ignore_precreate_errors){ + ERRF("%d: Error while creating the dset: %s", o.rank, dset); + } + } + } + + char * buf = aligned_buffer_alloc(o.file_size, o.gpu_memory_flags); + generate_memory_pattern(buf, o.file_size, o.random_seed, o.rank, o.dataPacketType); + double op_timer; // timer for individual operations + size_t pos = -1; // position inside the individual measurement array + double op_time; + + // create the obj + for(int f=current_index; f < o.precreate; f++){ + for(int d=0; d < o.dset_count; d++){ + pos++; + def_obj_name(obj_name, o.rank, d, f); + + op_timer = GetTimeStamp(); + aiori_fd_t * aiori_fh = o.backend->create(obj_name, IOR_WRONLY | IOR_CREAT, o.backend_options); + if (NULL == aiori_fh){ + FAIL("Unable to open file %s", obj_name); + } + update_write_memory_pattern(f * o.dset_count + d, buf, o.file_size, o.random_seed, o.rank, o.dataPacketType); + if ( o.file_size == (int) o.backend->xfer(WRITE, aiori_fh, (IOR_size_t *) buf, o.file_size, 0, o.backend_options)) { + s->obj_create.suc++; + }else{ + s->obj_create.err++; + if (! o.ignore_precreate_errors){ + ERRF("%d: Error while creating the obj: %s", o.rank, obj_name); + } + } + o.backend->close(aiori_fh, o.backend_options); + + add_timed_result(op_timer, s->phase_start_timer, s->time_create, pos, & s->max_op_time, & op_time); + + if (o.verbosity >= 2){ + oprintf("%d: write %s:%s (%d) pretend: %d\n", o.rank, dset, obj_name, ret, o.rank); + } + } + } + aligned_buffer_free(buf, o.gpu_memory_flags); +} + +/* FIFO: create a new file, write to it. Then read from the first created file, delete it... */ +void run_benchmark(phase_stat_t * s, int * current_index_p){ + char obj_name[MAX_PATHLEN]; + int ret; + char * buf = aligned_buffer_alloc(o.file_size, o.gpu_memory_flags); + memset(buf, o.rank % 256, o.file_size); + double op_timer; // timer for individual operations + size_t pos = -1; // position inside the individual measurement array + int start_index = *current_index_p; + int total_num = o.num; + int armed_stone_wall = (o.stonewall_timer > 0); + int f; + double phase_allreduce_time = 0; + aiori_fd_t * aiori_fh; + + for(f=0; f < total_num; f++){ + float bench_runtime = 0; // the time since start + for(int d=0; d < o.dset_count; d++){ + double op_time; + struct stat stat_buf; + const int prevFile = f + start_index; + pos++; + + int readRank = (o.rank - o.offset * (d+1)) % o.size; + readRank = readRank < 0 ? 
readRank + o.size : readRank; + def_obj_name(obj_name, readRank, d, prevFile); + + op_timer = GetTimeStamp(); + + ret = o.backend->stat(obj_name, & stat_buf, o.backend_options); + // TODO potentially check return value must be identical to o.file_size + + bench_runtime = add_timed_result(op_timer, s->phase_start_timer, s->time_stat, pos, & s->max_op_time, & op_time); + if(o.relative_waiting_factor > 1e-9) { + mdw_wait(op_time); + } + + if (o.verbosity >= 2){ + oprintf("%d: stat %s (%d)\n", o.rank, obj_name, ret); + } + + if(ret != 0){ + if (o.verbosity) + ERRF("%d: Error while stating the obj: %s", o.rank, obj_name); + s->obj_stat.err++; + continue; + } + s->obj_stat.suc++; + + if (o.verbosity >= 2){ + oprintf("%d: read %s pretend: %d\n", o.rank, obj_name, readRank); + } + + op_timer = GetTimeStamp(); + aiori_fh = o.backend->open(obj_name, IOR_RDONLY, o.backend_options); + if (NULL == aiori_fh){ + FAIL("Unable to open file %s", obj_name); + } + if ( o.file_size == (int) o.backend->xfer(READ, aiori_fh, (IOR_size_t *) buf, o.file_size, 0, o.backend_options) ) { + if(o.verify_read){ + if(verify_memory_pattern(prevFile * o.dset_count + d, buf, o.file_size, o.random_seed, readRank, o.dataPacketType) == 0){ + s->obj_read.suc++; + }else{ + s->obj_read.err++; + } + }else{ + s->obj_read.suc++; + } + }else{ + s->obj_read.err++; + EWARNF("%d: Error while reading the obj: %s", o.rank, obj_name); + } + o.backend->close(aiori_fh, o.backend_options); + + bench_runtime = add_timed_result(op_timer, s->phase_start_timer, s->time_read, pos, & s->max_op_time, & op_time); + if(o.relative_waiting_factor > 1e-9) { + mdw_wait(op_time); + } + if(o.read_only){ + continue; + } + + op_timer = GetTimeStamp(); + o.backend->delete(obj_name, o.backend_options); + bench_runtime = add_timed_result(op_timer, s->phase_start_timer, s->time_delete, pos, & s->max_op_time, & op_time); + if(o.relative_waiting_factor > 1e-9) { + mdw_wait(op_time); + } + + if (o.verbosity >= 2){ + oprintf("%d: delete %s\n", o.rank, obj_name); + } + s->obj_delete.suc++; + + int writeRank = (o.rank + o.offset * (d+1)) % o.size; + const int newFileIndex = o.precreate + prevFile; + def_obj_name(obj_name, writeRank, d, newFileIndex); + + op_timer = GetTimeStamp(); + aiori_fh = o.backend->create(obj_name, IOR_WRONLY | IOR_CREAT, o.backend_options); + if (NULL != aiori_fh){ + generate_memory_pattern(buf, o.file_size, o.random_seed, writeRank, o.dataPacketType); + update_write_memory_pattern(newFileIndex * o.dset_count + d, buf, o.file_size, o.random_seed, writeRank, o.dataPacketType); + + if ( o.file_size == (int) o.backend->xfer(WRITE, aiori_fh, (IOR_size_t *) buf, o.file_size, 0, o.backend_options)) { + s->obj_create.suc++; + }else{ + s->obj_create.err++; + if (! o.ignore_precreate_errors){ + ERRF("%d: Error while creating the obj: %s\n", o.rank, obj_name); + } + } + o.backend->close(aiori_fh, o.backend_options); + }else{ + if (! 
o.ignore_precreate_errors){ + ERRF("%d: Error while creating the obj: %s", o.rank, obj_name); + } + EWARNF("Unable to open file %s", obj_name); + s->obj_create.err++; + } + bench_runtime = add_timed_result(op_timer, s->phase_start_timer, s->time_create, pos, & s->max_op_time, & op_time); + if(o.relative_waiting_factor > 1e-9) { + mdw_wait(op_time); + } + + if (o.verbosity >= 2){ + oprintf("%d: write %s (%d) pretend: %d\n", o.rank, obj_name, ret, writeRank); + } + } // end loop + + if(armed_stone_wall && bench_runtime >= o.stonewall_timer){ + if(o.verbosity){ + oprintf("%d: stonewall runtime %fs (%ds)\n", o.rank, bench_runtime, o.stonewall_timer); + } + if(! o.stonewall_timer_wear_out){ + s->stonewall_iterations = f; + break; + } + armed_stone_wall = 0; + // wear out mode, now reduce the maximum + int cur_pos = f + 1; + phase_allreduce_time = GetTimeStamp() - s->phase_start_timer; + int ret = MPI_Allreduce(& cur_pos, & total_num, 1, MPI_INT, MPI_MAX, o.com); + CHECK_MPI_RET(ret) + s->phase_start_timer = GetTimeStamp(); + s->stonewall_iterations = total_num; + if(o.rank == 0){ + oprintf("stonewall wear out %fs (%d iter)\n", bench_runtime, total_num); + } + if(f == total_num){ + break; + } + } + } + s->t = GetTimeStamp() - s->phase_start_timer + phase_allreduce_time; + if(armed_stone_wall && o.stonewall_timer_wear_out){ + int f = total_num; + int ret = MPI_Allreduce(& f, & total_num, 1, MPI_INT, MPI_MAX, o.com); + CHECK_MPI_RET(ret) + s->stonewall_iterations = total_num; + } + if(o.stonewall_timer && ! o.stonewall_timer_wear_out){ + // TODO FIXME + int sh = s->stonewall_iterations; + int ret = MPI_Allreduce(& sh, & s->stonewall_iterations, 1, MPI_INT, MPI_MAX, o.com); + CHECK_MPI_RET(ret) + } + + if(! o.read_only) { + *current_index_p += f; + } + s->repeats = pos + 1; + aligned_buffer_free(buf, o.gpu_memory_flags); +} + +void run_cleanup(phase_stat_t * s, int start_index){ + char dset[MAX_PATHLEN]; + char obj_name[MAX_PATHLEN]; + double op_timer; // timer for individual operations + size_t pos = -1; // position inside the individual measurement array + + for(int d=0; d < o.dset_count; d++){ + for(int f=0; f < o.precreate; f++){ + double op_time; + pos++; + def_obj_name(obj_name, o.rank, d, f + start_index); + + op_timer = GetTimeStamp(); + o.backend->delete(obj_name, o.backend_options); + add_timed_result(op_timer, s->phase_start_timer, s->time_delete, pos, & s->max_op_time, & op_time); + + if (o.verbosity >= 2){ + oprintf("%d: delete %s\n", o.rank, obj_name); + } + s->obj_delete.suc++; + } + + def_dset_name(dset, o.rank, d); + if (o.backend->rmdir(dset, o.backend_options) == 0) { + s->dset_delete.suc++; + }else{ + oprintf("Unable to remove directory %s\n", dset); + } + if (o.verbosity >= 2){ + oprintf("%d: delete dset %s\n", o.rank, dset); + } + } +} + + +static option_help options [] = { + {'O', "offset", "Offset in o.ranks between writers and readers. 
Writers and readers should be located on different nodes.", OPTION_OPTIONAL_ARGUMENT, 'd', & o.offset}, + {'a', "api", "The API (plugin) to use for the benchmark, use list to show all compiled plugins.", OPTION_OPTIONAL_ARGUMENT, 's', & o.interface}, + {'I', "obj-per-proc", "Number of I/O operations per data set.", OPTION_OPTIONAL_ARGUMENT, 'd', & o.num}, + {'L', "latency", "Measure the latency for individual operations, prefix the result files with the provided filename.", OPTION_OPTIONAL_ARGUMENT, 's', & o.latency_file_prefix}, + {0, "latency-all", "Keep the latency files from all ranks.", OPTION_FLAG, 'd', & o.latency_keep_all}, + {'P', "precreate-per-set", "Number of objects to precreate per data set.", OPTION_OPTIONAL_ARGUMENT, 'd', & o.precreate}, + {'D', "data-sets", "Number of data sets covered per process and iteration.", OPTION_OPTIONAL_ARGUMENT, 'd', & o.dset_count}, + {'G', NULL, "Timestamp/Random seed for access pattern, if not set, a random value is used", OPTION_OPTIONAL_ARGUMENT, 'd', & o.random_seed}, + {'o', NULL, "Output directory", OPTION_OPTIONAL_ARGUMENT, 's', & o.prefix}, + {'q', "quiet", "Avoid irrelevant printing.", OPTION_FLAG, 'd', & o.quiet_output}, + //{'m', "lim-free-mem", "Allocate memory until this limit (in MiB) is reached.", OPTION_OPTIONAL_ARGUMENT, 'd', & o.limit_memory}, + // {'M', "lim-free-mem-phase", "Allocate memory until this limit (in MiB) is reached between the phases, but free it before starting the next phase; the time is NOT included for the phase.", OPTION_OPTIONAL_ARGUMENT, 'd', & o.limit_memory_between_phases}, + {'S', "object-size", "Size for the created objects.", OPTION_OPTIONAL_ARGUMENT, 'd', & o.file_size}, + {'R', "iterations", "Number of times to rerun the main phase", OPTION_OPTIONAL_ARGUMENT, 'd', & o.iterations}, + {'t', "waiting-time", "Waiting time relative to runtime (1.0 is 100%%)", OPTION_OPTIONAL_ARGUMENT, 'f', & o.relative_waiting_factor}, + {'T', "adaptive-waiting", "Compute an adaptive waiting time", OPTION_FLAG, 'd', & o.adaptive_waiting_mode}, + {'1', "run-precreate", "Run precreate phase", OPTION_FLAG, 'd', & o.phase_precreate}, + {'2', "run-benchmark", "Run benchmark phase", OPTION_FLAG, 'd', & o.phase_benchmark}, + {'3', "run-cleanup", "Run cleanup phase (only run explicit phases)", OPTION_FLAG, 'd', & o.phase_cleanup}, + {'w', "stonewall-timer", "Stop each benchmark iteration after the specified seconds (if not used with -W this leads to process-specific progress!)", OPTION_OPTIONAL_ARGUMENT, 'd', & o.stonewall_timer}, + {'W', "stonewall-wear-out", "Stop with stonewall after specified time and use a soft wear-out phase -- all processes perform the same number of iterations", OPTION_FLAG, 'd', & o.stonewall_timer_wear_out}, + {'X', "verify-read", "Verify the data on read", OPTION_FLAG, 'd', & o.verify_read}, + {0, "dataPacketType", "type of packet that will be created [offset|incompressible|timestamp|o|i|t]", OPTION_OPTIONAL_ARGUMENT, 's', & o.packetTypeStr}, + {0, "allocateBufferOnGPU", "Allocate the buffer on the GPU.", OPTION_FLAG, 'd', & o.gpu_memory_flags}, + {0, "start-item", "The iteration number of the item to start with, allowing the operations to be offset", OPTION_OPTIONAL_ARGUMENT, 'l', & o.start_item_number}, + {0, "print-detailed-stats", "Print detailed machine-parsable statistics.", OPTION_FLAG, 'd', & o.print_detailed_stats}, + {0, "read-only", "Run read-only during benchmarking phase (no deletes/writes), probably use with -2", OPTION_FLAG, 'd', & o.read_only}, + {0, "ignore-precreate-errors", "Ignore
errors occurring during the pre-creation phase", OPTION_FLAG, 'd', & o.ignore_precreate_errors}, + {0, "process-reports", "Independent report per process/rank", OPTION_FLAG, 'd', & o.process_report}, + {'v', "verbose", "Increase the verbosity level", OPTION_FLAG, 'd', & o.verbosity}, + {0, "run-info-file", "The log file for resuming a previous run", OPTION_OPTIONAL_ARGUMENT, 's', & o.run_info_file}, + LAST_OPTION + }; + +static void printTime(){ + char buff[100]; + time_t now = time(0); + strftime (buff, 100, "%Y-%m-%d %H:%M:%S", localtime (&now)); + oprintf("%s\n", buff); +} + +static int return_position(){ + int position, ret; + if( o.rank == 0){ + FILE * f = fopen(o.run_info_file, "r"); + if(! f){ + ERRF("[ERROR] Could not open %s for restart", o.run_info_file); + exit(1); + } + ret = fscanf(f, "pos: %d", & position); + if (ret != 1){ + ERRF("Could not read from %s for restart", o.run_info_file); + exit(1); + } + fclose(f); + } + ret = MPI_Bcast( & position, 1, MPI_INT, 0, o.com ); + return position; +} + +static void store_position(int position){ + if (o.rank != 0){ + return; + } + FILE * f = fopen(o.run_info_file, "w"); + if(! f){ + ERRF("[ERROR] Could not open %s for saving data", o.run_info_file); + exit(1); + } + fprintf(f, "pos: %d\n", position); + fclose(f); +} + +mdworkbench_results_t* md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE * out_logfile){ + int ret; + int printhelp = 0; + char * limit_memory_P = NULL; + init_options(); + init_clock(world_com); + + o.com = world_com; + o.logfile = out_logfile; + + MPI_Comm_rank(o.com, & o.rank); + MPI_Comm_size(o.com, & o.size); + + if (o.rank == 0 && ! o.quiet_output){ + oprintf("Args: %s", argv[0]); + for(int i=1; i < argc; i++){ + oprintf(" \"%s\"", argv[i]); + } + oprintf("\n"); + } + + memset(& o.hints, 0, sizeof(o.hints)); + options_all_t * global_options = airoi_create_all_module_options(options); + int parsed = option_parse(argc, argv, global_options); + o.backend = aiori_select(o.interface); + if (o.backend == NULL){ + ERR("Unrecognized I/O API"); + } + if (! o.backend->enable_mdtest){ + ERR("Backend doesn't support MDWorkbench"); + } + o.backend_options = airoi_update_module_options(o.backend, global_options); + + o.dataPacketType = parsePacketType(o.packetTypeStr[0]); + + if (!(o.phase_cleanup || o.phase_precreate || o.phase_benchmark)){ + // enable all phases + o.phase_cleanup = o.phase_precreate = o.phase_benchmark = 1; + } + if (! o.phase_precreate && o.phase_benchmark && o.stonewall_timer && ! o.stonewall_timer_wear_out){ + if(o.rank == 0) + ERR("Invalid options: when running only the benchmark phase (-2) with the stonewall option, use stonewall wear-out"); + exit(1); + } + if( o.random_seed == -1 ){ + o.random_seed = time(NULL); + MPI_Bcast(& o.random_seed, 1, MPI_INT, 0, o.com); + } + + if(o.backend->xfer_hints){ + o.backend->xfer_hints(& o.hints); + } + if(o.backend->check_params){ + o.backend->check_params(o.backend_options); + } + if (o.backend->initialize){ + o.backend->initialize(o.backend_options); + } + + int current_index = 0; + + if ( (o.phase_cleanup || o.phase_benchmark) && ! o.phase_precreate ){ + current_index = return_position(); + } + + if(o.start_item_number){ + oprintf("Using start position %lld\n", (long long) o.start_item_number); + current_index = o.start_item_number; + } + + size_t total_obj_count = o.dset_count * (size_t) (o.num * o.iterations + o.precreate) * o.size; + if (o.rank == 0 && !
o.quiet_output){ + oprintf("MD-Workbench total objects: %zu workingset size: %.3f MiB (version: %s) time: ", total_obj_count, ((double) o.size) * o.dset_count * o.precreate * o.file_size / 1024.0 / 1024.0, PACKAGE_VERSION); + printTime(); + if(o.num > o.precreate){ + oprintf("WARNING: num > precreate, this may cause the situation that no objects are available to read\n"); + } + } + + if ( o.rank == 0 && ! o.quiet_output ){ + // print the set output options + // option_print_current(options); + // oprintf("\n"); + } + + // preallocate memory if necessary + //ret = mem_preallocate(& limit_memory_P, o.limit_memory, o.verbosity >= 3); + //if(ret != 0){ + // printf("%d: Error allocating memory\n", o.rank); + // MPI_Abort(o.com, 1); + //} + + double t_bench_start; + t_bench_start = GetTimeStamp(); + phase_stat_t phase_stats; + size_t result_count = (2 + o.iterations) * (o.adaptive_waiting_mode ? 7 : 1); + o.results = malloc(sizeof(mdworkbench_results_t) + sizeof(mdworkbench_result_t) * result_count); + memset(o.results, 0, sizeof(mdworkbench_results_t) + sizeof(mdworkbench_result_t) * result_count); + o.results->count = 0; + + if(o.rank == 0 && o.print_detailed_stats && ! o.quiet_output){ + print_detailed_stat_header(); + } + + if (o.phase_precreate){ + if (o.rank == 0){ + if (o.backend->mkdir(o.prefix, DIRMODE, o.backend_options) != 0) { + EWARNF("Unable to create test directory %s", o.prefix); + } + } + init_stats(& phase_stats, o.precreate * o.dset_count); + MPI_Barrier(o.com); + + // pre-creation phase + phase_stats.phase_start_timer = GetTimeStamp(); + run_precreate(& phase_stats, current_index); + phase_stats.t = GetTimeStamp() - phase_stats.phase_start_timer; + end_phase("precreate", & phase_stats); + } + + if (o.phase_benchmark){ + // benchmark phase + for(o.global_iteration = 0; o.global_iteration < o.iterations; o.global_iteration++){ + if(o.adaptive_waiting_mode){ + o.relative_waiting_factor = 0; + } + init_stats(& phase_stats, o.num * o.dset_count); + MPI_Barrier(o.com); + phase_stats.phase_start_timer = GetTimeStamp(); + run_benchmark(& phase_stats, & current_index); + end_phase("benchmark", & phase_stats); + + if(o.adaptive_waiting_mode){ + o.relative_waiting_factor = 0.0625; + for(int r=0; r <= 6; r++){ + init_stats(& phase_stats, o.num * o.dset_count); + MPI_Barrier(o.com); + phase_stats.phase_start_timer = GetTimeStamp(); + run_benchmark(& phase_stats, & current_index); + end_phase("benchmark", & phase_stats); + o.relative_waiting_factor *= 2; + } + } + } + } + + // cleanup phase + if (o.phase_cleanup){ + init_stats(& phase_stats, o.precreate * o.dset_count); + phase_stats.phase_start_timer = GetTimeStamp(); + run_cleanup(& phase_stats, current_index); + phase_stats.t = GetTimeStamp() - phase_stats.phase_start_timer; + end_phase("cleanup", & phase_stats); + + if (o.rank == 0){ + if (o.backend->rmdir(o.prefix, o.backend_options) != 0) { + oprintf("Unable to remove directory %s\n", o.prefix); + } + } + }else{ + store_position(current_index); + } + + double t_all = GetTimeStamp() - t_bench_start; + if(o.backend->finalize){ + o.backend->finalize(o.backend_options); + } + if (o.rank == 0 && ! 
o.quiet_output){ + oprintf("Total runtime: %.0fs time: ", t_all); + printTime(); + } + //mem_free_preallocated(& limit_memory_P); + return o.results; +} diff --git a/src/md-workbench.h b/src/md-workbench.h new file mode 100644 index 0000000..394a43c --- /dev/null +++ b/src/md-workbench.h @@ -0,0 +1,42 @@ +#ifndef IOR_MD_WORKBENCH_H +#define IOR_MD_WORKBENCH_H + +#include +#include +#include + +typedef struct{ + float min; + float q1; + float median; + float q3; + float q90; + float q99; + float max; +} time_statistics_t; + + +// statistics for running a single phase +typedef struct{ // NOTE: if this type is changed, adjust end_phase() !!! + time_statistics_t stats_create; + time_statistics_t stats_read; + time_statistics_t stats_stat; + time_statistics_t stats_delete; + + int errors; + double rate; + double max_op_time; + double runtime; + uint64_t iterations_done; +} mdworkbench_result_t; + +typedef struct{ + int count; // the number of results + int errors; + mdworkbench_result_t result[]; +} mdworkbench_results_t; + +// @Return The first statistics returned are precreate, then iteration many benchmark runs, the last is cleanup +mdworkbench_results_t* md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE * out_logfile); + +#endif diff --git a/src/mdtest.c b/src/mdtest.c index 5488834..3c49a85 100644 --- a/src/mdtest.c +++ b/src/mdtest.c @@ -76,6 +76,8 @@ #include +#pragma GCC diagnostic ignored "-Wformat-overflow" + #ifdef HAVE_LUSTRE_LUSTREAPI #include #endif /* HAVE_LUSTRE_LUSTREAPI */ @@ -88,87 +90,99 @@ #define LLU "%lu" -static int size; -static uint64_t *rand_array; -static char testdir[MAX_PATHLEN]; -static char testdirpath[MAX_PATHLEN]; -static char base_tree_name[MAX_PATHLEN]; -static char **filenames; -static char hostname[MAX_PATHLEN]; -static char mk_name[MAX_PATHLEN]; -static char stat_name[MAX_PATHLEN]; -static char read_name[MAX_PATHLEN]; -static char rm_name[MAX_PATHLEN]; -static char unique_mk_dir[MAX_PATHLEN]; -static char unique_chdir_dir[MAX_PATHLEN]; -static char unique_stat_dir[MAX_PATHLEN]; -static char unique_read_dir[MAX_PATHLEN]; -static char unique_rm_dir[MAX_PATHLEN]; -static char unique_rm_uni_dir[MAX_PATHLEN]; -static char *write_buffer; -static char *read_buffer; -static char *verify_read_buffer; -static char *stoneWallingStatusFile; +typedef struct { + int size; + uint64_t *rand_array; + char testdir[MAX_PATHLEN]; + char testdirpath[MAX_PATHLEN]; + char base_tree_name[MAX_PATHLEN]; + char **filenames; + char hostname[MAX_PATHLEN]; + char mk_name[MAX_PATHLEN]; + char stat_name[MAX_PATHLEN]; + char read_name[MAX_PATHLEN]; + char rm_name[MAX_PATHLEN]; + char unique_mk_dir[MAX_PATHLEN]; + char unique_chdir_dir[MAX_PATHLEN]; + char unique_stat_dir[MAX_PATHLEN]; + char unique_read_dir[MAX_PATHLEN]; + char unique_rm_dir[MAX_PATHLEN]; + char unique_rm_uni_dir[MAX_PATHLEN]; + char *write_buffer; + char *stoneWallingStatusFile; + int gpu_memory_flags; -static int barriers; -static int create_only; -static int stat_only; -static int read_only; -static int verify_read; -static int verification_error; -static int remove_only; -static int leaf_only; -static unsigned branch_factor; -static int depth; + int barriers; + int create_only; + int stat_only; + int read_only; + int verify_read; + int verify_write; + int verification_error; + int remove_only; + int rename_dirs; + int leaf_only; + unsigned branch_factor; + int depth; + int random_buffer_offset; /* user settable value, otherwise random */ -/* - * This is likely a small value, but it's sometimes computed 
by - * branch_factor^(depth+1), so we'll make it a larger variable, - * just in case. - */ -static uint64_t num_dirs_in_tree; -/* - * As we start moving towards Exascale, we could have billions - * of files in a directory. Make room for that possibility with - * a larger variable. - */ -static uint64_t items; -static uint64_t items_per_dir; -static uint64_t num_dirs_in_tree_calc; /* this is a workaround until the overal code is refactored */ -static int directory_loops; -static int print_time; -static int print_rate_and_time; -static int random_seed; -static int shared_file; -static int files_only; -static int dirs_only; -static int pre_delay; -static int unique_dir_per_task; -static int time_unique_dir_overhead; -static int throttle; -static int collective_creates; -static size_t write_bytes; -static int stone_wall_timer_seconds; -static size_t read_bytes; -static int sync_file; -static int call_sync; -static int path_count; -static int nstride; /* neighbor stride */ -static int make_node = 0; -#ifdef HAVE_LUSTRE_LUSTREAPI -static int global_dir_layout; -#endif /* HAVE_LUSTRE_LUSTREAPI */ + /* + * This is likely a small value, but it's sometimes computed by + * branch_factor^(depth+1), so we'll make it a larger variable, + * just in case. + */ + uint64_t num_dirs_in_tree; + /* + * As we start moving towards Exascale, we could have billions + * of files in a directory. Make room for that possibility with + * a larger variable. + */ + uint64_t items; + uint64_t items_per_dir; + uint64_t num_dirs_in_tree_calc; /* this is a workaround until the overal code is refactored */ + int directory_loops; + int print_time; + int print_rate_and_time; + int print_all_proc; + int show_perrank_statistics; + ior_dataPacketType_e dataPacketType; + int random_seed; + int shared_file; + int files_only; + int dirs_only; + int pre_delay; + int unique_dir_per_task; + int time_unique_dir_overhead; + int collective_creates; + size_t write_bytes; + int stone_wall_timer_seconds; + size_t read_bytes; + int sync_file; + int call_sync; + int path_count; + int nstride; /* neighbor stride */ + int make_node; + #ifdef HAVE_LUSTRE_LUSTREAPI + int global_dir_layout; + #endif /* HAVE_LUSTRE_LUSTREAPI */ + char * saveRankDetailsCSV; /* save the details about the performance to a file */ + const char *prologue; + const char *epilogue; -static mdtest_results_t * summary_table; -static pid_t pid; -static uid_t uid; + mdtest_results_t * summary_table; + pid_t pid; + uid_t uid; + + /* Use the POSIX backend by default */ + const ior_aiori_t *backend; + void * backend_options; + aiori_xfer_hint_t hints; + char * api; +} mdtest_options_t; + +static mdtest_options_t o; -/* Use the POSIX backend by default */ -static const ior_aiori_t *backend; -static void * backend_options; -static aiori_xfer_hint_t hints; -static char * api = NULL; /* This structure describes the processing status for stonewalling */ typedef struct{ @@ -187,6 +201,8 @@ typedef struct{ /* for making/removing unique directory && stating/deleting subdirectory */ enum {MK_UNI_DIR, STAT_SUB_DIR, READ_SUB_DIR, RM_SUB_DIR, RM_UNI_DIR}; +#define PRINT(...) fprintf(out_logfile, __VA_ARGS__); + /* a helper function for passing debug and verbose messages. use the MACRO as it will insert __LINE__ for you. Pass the verbose level for root to print, then the verbose level for anyone to print. @@ -212,25 +228,6 @@ void VerboseMessage (int root_level, int any_level, int line, char * format, ... 
} } -void generate_memory_pattern(char * buffer, size_t bytes){ - for(int i=0; i < bytes; i++){ - buffer[i] = i + 1; - } -} - -void offset_timers(double * t, int tcount) { - double toffset; - int i; - - - VERBOSE(1,-1,"V-1: Entering offset_timers..." ); - - toffset = GetTimeStamp() - t[tcount]; - for (i = 0; i < tcount+1; i++) { - t[i] += toffset; - } -} - void parse_dirpath(char *dirpath_arg) { char * tmp, * token; char delimiter_string[3] = { '@', '\n', '\0' }; @@ -241,46 +238,57 @@ void parse_dirpath(char *dirpath_arg) { tmp = dirpath_arg; - if (* tmp != '\0') path_count++; + if (* tmp != '\0') o.path_count++; while (* tmp != '\0') { if (* tmp == '@') { - path_count++; + o.path_count++; } tmp++; } // prevent changes to the original dirpath_arg dirpath_arg = strdup(dirpath_arg); - filenames = (char **)malloc(path_count * sizeof(char **)); - if (filenames == NULL || dirpath_arg == NULL) { - FAIL("out of memory"); - } + o.filenames = (char **) safeMalloc(o.path_count * sizeof(char **)); token = strtok(dirpath_arg, delimiter_string); while (token != NULL) { - filenames[i] = token; + o.filenames[i] = token; token = strtok(NULL, delimiter_string); i++; } } static void prep_testdir(int j, int dir_iter){ - int pos = sprintf(testdir, "%s", testdirpath); - if ( testdir[strlen( testdir ) - 1] != '/' ) { - pos += sprintf(& testdir[pos], "/"); + int pos = sprintf(o.testdir, "%s", o.testdirpath); + if ( o.testdir[strlen( o.testdir ) - 1] != '/' ) { + pos += sprintf(& o.testdir[pos], "/"); + } + pos += sprintf(& o.testdir[pos], "%s", TEST_DIR); + pos += sprintf(& o.testdir[pos], ".%d-%d", j, dir_iter); +} + +static void phase_prepare(){ + if (*o.prologue){ + VERBOSE(0,5,"calling prologue: \"%s\"", o.prologue); + system(o.prologue); + if (o.barriers) { + MPI_Barrier(testComm); + } } - pos += sprintf(& testdir[pos], "%s", TEST_DIR); - pos += sprintf(& testdir[pos], ".%d-%d", j, dir_iter); } static void phase_end(){ - if (call_sync){ - if(! backend->sync){ + if (o.call_sync){ + if(! o.backend->sync){ FAIL("Error, backend does not provide the sync method, but you requested to use sync.\n"); } - backend->sync(backend_options); + o.backend->sync(o.backend_options); + } + if (*o.epilogue){ + VERBOSE(0,5,"calling epilogue: \"%s\"", o.epilogue); + system(o.epilogue); } - if (barriers) { + if (o.barriers) { MPI_Barrier(testComm); } } @@ -293,15 +301,15 @@ static void phase_end(){ void unique_dir_access(int opt, char *to) { if (opt == MK_UNI_DIR) { MPI_Barrier(testComm); - sprintf( to, "%s/%s", testdir, unique_chdir_dir ); + sprintf( to, "%s/%s", o.testdir, o.unique_chdir_dir ); } else if (opt == STAT_SUB_DIR) { - sprintf( to, "%s/%s", testdir, unique_stat_dir ); + sprintf( to, "%s/%s", o.testdir, o.unique_stat_dir ); } else if (opt == READ_SUB_DIR) { - sprintf( to, "%s/%s", testdir, unique_read_dir ); + sprintf( to, "%s/%s", o.testdir, o.unique_read_dir ); } else if (opt == RM_SUB_DIR) { - sprintf( to, "%s/%s", testdir, unique_rm_dir ); + sprintf( to, "%s/%s", o.testdir, o.unique_rm_dir ); } else if (opt == RM_UNI_DIR) { - sprintf( to, "%s/%s", testdir, unique_rm_uni_dir ); + sprintf( to, "%s/%s", o.testdir, o.unique_rm_uni_dir ); } VERBOSE(1,-1,"Entering unique_dir_access, set it to %s", to ); } @@ -315,16 +323,16 @@ static void create_remove_dirs (const char *path, bool create, uint64_t itemNum) } //create dirs - sprintf(curr_item, "%s/dir.%s%" PRIu64, path, create ? mk_name : rm_name, itemNum); + sprintf(curr_item, "%s/dir.%s%" PRIu64, path, create ? 
o.mk_name : o.rm_name, itemNum); VERBOSE(3,5,"create_remove_items_helper (dirs %s): curr_item is '%s'", operation, curr_item); if (create) { - if (backend->mkdir(curr_item, DIRMODE, backend_options) == -1) { - FAIL("unable to create directory %s", curr_item); + if (o.backend->mkdir(curr_item, DIRMODE, o.backend_options) == -1) { + EWARNF("unable to create directory %s", curr_item); } } else { - if (backend->rmdir(curr_item, backend_options) == -1) { - FAIL("unable to remove directory %s", curr_item); + if (o.backend->rmdir(curr_item, o.backend_options) == -1) { + EWARNF("unable to remove directory %s", curr_item); } } } @@ -337,13 +345,14 @@ static void remove_file (const char *path, uint64_t itemNum) { } //remove files - sprintf(curr_item, "%s/file.%s"LLU"", path, rm_name, itemNum); + sprintf(curr_item, "%s/file.%s"LLU"", path, o.rm_name, itemNum); VERBOSE(3,5,"create_remove_items_helper (non-dirs remove): curr_item is '%s'", curr_item); - if (!(shared_file && rank != 0)) { - backend->delete (curr_item, backend_options); + if (!(o.shared_file && rank != 0)) { + o.backend->delete (curr_item, o.backend_options); } } + static void create_file (const char *path, uint64_t itemNum) { char curr_item[MAX_PATHLEN]; aiori_fd_t *aiori_fh = NULL; @@ -353,52 +362,66 @@ static void create_file (const char *path, uint64_t itemNum) { } //create files - sprintf(curr_item, "%s/file.%s"LLU"", path, mk_name, itemNum); + sprintf(curr_item, "%s/file.%s"LLU"", path, o.mk_name, itemNum); VERBOSE(3,5,"create_remove_items_helper (non-dirs create): curr_item is '%s'", curr_item); - if (make_node) { + if (o.make_node) { int ret; VERBOSE(3,5,"create_remove_items_helper : mknod..." ); - ret = backend->mknod (curr_item); + ret = o.backend->mknod (curr_item); if (ret != 0) - FAIL("unable to mknode file %s", curr_item); + EWARNF("unable to mknode file %s", curr_item); return; - } else if (collective_creates) { + } else if (o.collective_creates) { VERBOSE(3,5,"create_remove_items_helper (collective): open..." ); - aiori_fh = backend->open (curr_item, IOR_WRONLY | IOR_CREAT, backend_options); - if (NULL == aiori_fh) - FAIL("unable to open file %s", curr_item); + aiori_fh = o.backend->open (curr_item, IOR_WRONLY | IOR_CREAT, o.backend_options); + if (NULL == aiori_fh){ + EWARNF("unable to open file %s", curr_item); + return; + } /* * !collective_creates */ } else { - hints.filePerProc = !shared_file; + o.hints.filePerProc = ! o.shared_file; VERBOSE(3,5,"create_remove_items_helper (non-collective, shared): open..." ); - aiori_fh = backend->create (curr_item, IOR_WRONLY | IOR_CREAT, backend_options); - if (NULL == aiori_fh) - FAIL("unable to create file %s", curr_item); + aiori_fh = o.backend->create (curr_item, IOR_WRONLY | IOR_CREAT, o.backend_options); + if (NULL == aiori_fh){ + EWARNF("unable to create file %s", curr_item); + return; + } } - if (write_bytes > 0) { + if (o.write_bytes > 0) { VERBOSE(3,5,"create_remove_items_helper: write..." ); - /* - * According to Bill Loewe, writes are only done one time, so they are always at - * offset 0 (zero). 
- */ - hints.fsyncPerWrite = sync_file; - if ( write_bytes != (size_t) backend->xfer (WRITE, aiori_fh, (IOR_size_t *) write_buffer, write_bytes, 0, backend_options)) { - FAIL("unable to write file %s", curr_item); + o.hints.fsyncPerWrite = o.sync_file; + update_write_memory_pattern(itemNum, o.write_buffer, o.write_bytes, o.random_buffer_offset, rank, o.dataPacketType); + + if ( o.write_bytes != (size_t) o.backend->xfer(WRITE, aiori_fh, (IOR_size_t *) o.write_buffer, o.write_bytes, 0, o.backend_options)) { + EWARNF("unable to write file %s", curr_item); + } + + if (o.verify_write) { + o.write_buffer[0] = 42; + if (o.write_bytes != (size_t) o.backend->xfer(READ, aiori_fh, (IOR_size_t *) o.write_buffer, o.write_bytes, 0, o.backend_options)) { + EWARNF("unable to verify write (read/back) file %s", curr_item); + } + int error = verify_memory_pattern(itemNum, o.write_buffer, o.write_bytes, o.random_buffer_offset, rank, o.dataPacketType); + o.verification_error += error; + if(error){ + VERBOSE(1,1,"verification error in file: %s", curr_item); + } } } VERBOSE(3,5,"create_remove_items_helper: close..." ); - backend->close (aiori_fh, backend_options); + o.backend->close (aiori_fh, o.backend_options); } /* helper for creating/removing items */ @@ -438,22 +461,22 @@ void collective_helper(const int dirs, const int create, const char* path, uint6 continue; } - sprintf(curr_item, "%s/file.%s"LLU"", path, create ? mk_name : rm_name, itemNum+i); + sprintf(curr_item, "%s/file.%s"LLU"", path, create ? o.mk_name : o.rm_name, itemNum+i); VERBOSE(3,5,"create file: %s", curr_item); if (create) { aiori_fd_t *aiori_fh; //create files - aiori_fh = backend->create (curr_item, IOR_WRONLY | IOR_CREAT, backend_options); + aiori_fh = o.backend->create (curr_item, IOR_WRONLY | IOR_CREAT, o.backend_options); if (NULL == aiori_fh) { - FAIL("unable to create file %s", curr_item); + EWARNF("unable to create file %s", curr_item); + }else{ + o.backend->close (aiori_fh, o.backend_options); } - - backend->close (aiori_fh, backend_options); - } else if (!(shared_file && rank != 0)) { + } else if (!(o.shared_file && rank != 0)) { //remove files - backend->delete (curr_item, backend_options); + o.backend->delete (curr_item, o.backend_options); } if(CHECK_STONE_WALL(progress)){ progress->items_done = i + 1; @@ -463,7 +486,7 @@ void collective_helper(const int dirs, const int create, const char* path, uint6 progress->items_done = progress->items_per_dir; } -/* recusive function to create and remove files/directories from the +/* recursive function to create and remove files/directories from the directory tree */ void create_remove_items(int currDepth, const int dirs, const int create, const int collective, const char *path, uint64_t dirNum, rank_progress_t * progress) { unsigned i; @@ -482,7 +505,7 @@ void create_remove_items(int currDepth, const int dirs, const int create, const if (currDepth == 0) { /* create items at this depth */ - if (!leaf_only || (depth == 0 && leaf_only)) { + if (! 
o.leaf_only || (o.depth == 0 && o.leaf_only)) { if (collective) { collective_helper(dirs, create, temp_path, 0, progress); } else { @@ -490,28 +513,28 @@ void create_remove_items(int currDepth, const int dirs, const int create, const } } - if (depth > 0) { + if (o.depth > 0) { create_remove_items(++currDepth, dirs, create, collective, temp_path, ++dirNum, progress); } - } else if (currDepth <= depth) { + } else if (currDepth <= o.depth) { /* iterate through the branches */ - for (i=0; i 0) { //item is not in tree's root directory /* prepend parent directory to item's path */ - sprintf(temp, "%s."LLU"/%s", base_tree_name, parent_dir, item); + sprintf(temp, "%s."LLU"/%s", o.base_tree_name, parent_dir, item); strcpy(item, temp); //still not at the tree's root dir - while (parent_dir > branch_factor) { - parent_dir = (uint64_t) ((parent_dir-1) / branch_factor); - sprintf(temp, "%s."LLU"/%s", base_tree_name, parent_dir, item); + while (parent_dir > o.branch_factor) { + parent_dir = (uint64_t) ((parent_dir-1) / o.branch_factor); + sprintf(temp, "%s."LLU"/%s", o.base_tree_name, parent_dir, item); strcpy(item, temp); } } @@ -610,13 +633,12 @@ void mdtest_stat(const int random, const int dirs, const long dir_iter, const ch /* below temp used to be hiername */ VERBOSE(3,5,"mdtest_stat %4s: %s", (dirs ? "dir" : "file"), item); - if (-1 == backend->stat (item, &buf, backend_options)) { - FAIL("unable to stat %s %s", dirs ? "directory" : "file", item); + if (-1 == o.backend->stat (item, &buf, o.backend_options)) { + EWARNF("unable to stat %s %s", dirs ? "directory" : "file", item); } } } - /* reads all of the items created as specified by the input parameters */ void mdtest_read(int random, int dirs, const long dir_iter, char *path) { uint64_t parent_dir, item_num = 0; @@ -624,27 +646,18 @@ void mdtest_read(int random, int dirs, const long dir_iter, char *path) { aiori_fd_t *aiori_fh; VERBOSE(1,-1,"Entering mdtest_read on %s", path ); + char *read_buffer; /* allocate read buffer */ - if (read_bytes > 0) { - int alloc_res = posix_memalign((void**)&read_buffer, sysconf(_SC_PAGESIZE), read_bytes); - if (alloc_res) { - FAIL("out of memory"); - } - - if (verify_read > 0) { - verify_read_buffer = (char *)malloc(read_bytes); - if (verify_read_buffer == NULL) { - FAIL("out of memory"); - } - generate_memory_pattern(verify_read_buffer, read_bytes); - } + if (o.read_bytes > 0) { + read_buffer = aligned_buffer_alloc(o.read_bytes, o.gpu_memory_flags); + memset(read_buffer, -1, o.read_bytes); } - uint64_t stop_items = items; + uint64_t stop_items = o.items; - if( directory_loops != 1 ){ - stop_items = items_per_dir; + if( o.directory_loops != 1 ){ + stop_items = o.items_per_dir; } /* iterate over all of the item IDs */ @@ -663,15 +676,15 @@ void mdtest_read(int random, int dirs, const long dir_iter, char *path) { /* determine the item number to read */ if (random) { - item_num = rand_array[i]; + item_num = o.rand_array[i]; } else { item_num = i; } /* make adjustments if in leaf only mode*/ - if (leaf_only) { - item_num += items_per_dir * - (num_dirs_in_tree - (uint64_t) pow (branch_factor, depth)); + if (o.leaf_only) { + item_num += o.items_per_dir * + (o.num_dirs_in_tree - (uint64_t) pow (o.branch_factor, o.depth)); } /* create name of file to read */ @@ -679,22 +692,22 @@ void mdtest_read(int random, int dirs, const long dir_iter, char *path) { if ((i%ITEM_COUNT == 0) && (i != 0)) { VERBOSE(3,5,"read file: "LLU"", i); } - sprintf(item, "file.%s"LLU"", read_name, item_num); + sprintf(item, "file.%s"LLU"", 
o.read_name, item_num); } /* determine the path to the file/dir to be read'ed */ - parent_dir = item_num / items_per_dir; + parent_dir = item_num / o.items_per_dir; if (parent_dir > 0) { //item is not in tree's root directory /* prepend parent directory to item's path */ - sprintf(temp, "%s."LLU"/%s", base_tree_name, parent_dir, item); + sprintf(temp, "%s."LLU"/%s", o.base_tree_name, parent_dir, item); strcpy(item, temp); /* still not at the tree's root dir */ - while (parent_dir > branch_factor) { - parent_dir = (unsigned long long) ((parent_dir-1) / branch_factor); - sprintf(temp, "%s."LLU"/%s", base_tree_name, parent_dir, item); + while (parent_dir > o.branch_factor) { + parent_dir = (unsigned long long) ((parent_dir-1) / o.branch_factor); + sprintf(temp, "%s."LLU"/%s", o.base_tree_name, parent_dir, item); strcpy(item, temp); } } @@ -707,27 +720,37 @@ void mdtest_read(int random, int dirs, const long dir_iter, char *path) { VERBOSE(3,5,"mdtest_read file: %s", item); /* open file for reading */ - aiori_fh = backend->open (item, O_RDONLY, backend_options); + aiori_fh = o.backend->open (item, O_RDONLY, o.backend_options); if (NULL == aiori_fh) { - FAIL("unable to open file %s", item); + EWARNF("unable to open file %s", item); + continue; } /* read file */ - if (read_bytes > 0) { - read_buffer[0] = 42; /* use a random value to ensure that the read_buffer is now different from the expected buffer and read isn't sometimes NOOP */ - if (read_bytes != (size_t) backend->xfer (READ, aiori_fh, (IOR_size_t *) read_buffer, read_bytes, 0, backend_options)) { - FAIL("unable to read file %s", item); + if (o.read_bytes > 0) { + read_buffer[0] = 42; + if (o.read_bytes != (size_t) o.backend->xfer(READ, aiori_fh, (IOR_size_t *) read_buffer, o.read_bytes, 0, o.backend_options)) { + EWARNF("unable to read file %s", item); + continue; } - if(verify_read){ - if (memcmp(read_buffer, verify_read_buffer, read_bytes) != 0){ - VERBOSE(2, -1, "Error verifying %s", item); - verification_error++; + int pretend_rank = (2 * o.nstride + rank) % o.size; + if(o.verify_read){ + if (o.shared_file) { + pretend_rank = rank; + } + int error = verify_memory_pattern(item_num, read_buffer, o.read_bytes, o.random_buffer_offset, pretend_rank, o.dataPacketType); + o.verification_error += error; + if(error){ + VERBOSE(1,1,"verification error in file: %s", item); } } } /* close file */ - backend->close (aiori_fh, backend_options); + o.backend->close (aiori_fh, o.backend_options); + } + if(o.read_bytes){ + aligned_buffer_free(read_buffer, o.gpu_memory_flags); } } @@ -742,40 +765,40 @@ void collective_create_remove(const int create, const int dirs, const int ntasks for (int i = 0 ; i < ntasks ; ++i) { memset(temp, 0, MAX_PATHLEN); - strcpy(temp, testdir); + strcpy(temp, o.testdir); strcat(temp, "/"); /* set the base tree name appropriately */ - if (unique_dir_per_task) { - sprintf(base_tree_name, "mdtest_tree.%d", i); + if (o.unique_dir_per_task) { + sprintf(o.base_tree_name, "mdtest_tree.%d", i); } else { - sprintf(base_tree_name, "mdtest_tree"); + sprintf(o.base_tree_name, "mdtest_tree"); } /* Setup to do I/O to the appropriate test dir */ - strcat(temp, base_tree_name); + strcat(temp, o.base_tree_name); strcat(temp, ".0"); /* set all item names appropriately */ - if (!shared_file) { - sprintf(mk_name, "mdtest.%d.", (i+(0*nstride))%ntasks); - sprintf(stat_name, "mdtest.%d.", (i+(1*nstride))%ntasks); - sprintf(read_name, "mdtest.%d.", (i+(2*nstride))%ntasks); - sprintf(rm_name, "mdtest.%d.", (i+(3*nstride))%ntasks); + if (! 
o.shared_file) { + sprintf(o.mk_name, "mdtest.%d.", (i+(0*o.nstride))%ntasks); + sprintf(o.stat_name, "mdtest.%d.", (i+(1*o.nstride))%ntasks); + sprintf(o.read_name, "mdtest.%d.", (i+(2*o.nstride))%ntasks); + sprintf(o.rm_name, "mdtest.%d.", (i+(3*o.nstride))%ntasks); } - if (unique_dir_per_task) { - VERBOSE(3,5,"i %d nstride %d ntasks %d", i, nstride, ntasks); - sprintf(unique_mk_dir, "%s/mdtest_tree.%d.0", testdir, - (i+(0*nstride))%ntasks); - sprintf(unique_chdir_dir, "%s/mdtest_tree.%d.0", testdir, - (i+(1*nstride))%ntasks); - sprintf(unique_stat_dir, "%s/mdtest_tree.%d.0", testdir, - (i+(2*nstride))%ntasks); - sprintf(unique_read_dir, "%s/mdtest_tree.%d.0", testdir, - (i+(3*nstride))%ntasks); - sprintf(unique_rm_dir, "%s/mdtest_tree.%d.0", testdir, - (i+(4*nstride))%ntasks); - sprintf(unique_rm_uni_dir, "%s", testdir); + if (o.unique_dir_per_task) { + VERBOSE(3,5,"i %d nstride %d ntasks %d", i, o.nstride, ntasks); + sprintf(o.unique_mk_dir, "%s/mdtest_tree.%d.0", o.testdir, + (i+(0*o.nstride))%ntasks); + sprintf(o.unique_chdir_dir, "%s/mdtest_tree.%d.0", o.testdir, + (i+(1*o.nstride))%ntasks); + sprintf(o.unique_stat_dir, "%s/mdtest_tree.%d.0", o.testdir, + (i+(2*o.nstride))%ntasks); + sprintf(o.unique_read_dir, "%s/mdtest_tree.%d.0", o.testdir, + (i+(3*o.nstride))%ntasks); + sprintf(o.unique_rm_dir, "%s/mdtest_tree.%d.0", o.testdir, + (i+(4*o.nstride))%ntasks); + sprintf(o.unique_rm_uni_dir, "%s", o.testdir); } /* Now that everything is set up as it should be, do the create or remove */ @@ -785,61 +808,155 @@ void collective_create_remove(const int create, const int dirs, const int ntasks } /* reset all of the item names */ - if (unique_dir_per_task) { - sprintf(base_tree_name, "mdtest_tree.0"); + if (o.unique_dir_per_task) { + sprintf(o.base_tree_name, "mdtest_tree.0"); } else { - sprintf(base_tree_name, "mdtest_tree"); + sprintf(o.base_tree_name, "mdtest_tree"); } - if (!shared_file) { - sprintf(mk_name, "mdtest.%d.", (0+(0*nstride))%ntasks); - sprintf(stat_name, "mdtest.%d.", (0+(1*nstride))%ntasks); - sprintf(read_name, "mdtest.%d.", (0+(2*nstride))%ntasks); - sprintf(rm_name, "mdtest.%d.", (0+(3*nstride))%ntasks); + if (! 
o.shared_file) { + sprintf(o.mk_name, "mdtest.%d.", (0+(0*o.nstride))%ntasks); + sprintf(o.stat_name, "mdtest.%d.", (0+(1*o.nstride))%ntasks); + sprintf(o.read_name, "mdtest.%d.", (0+(2*o.nstride))%ntasks); + sprintf(o.rm_name, "mdtest.%d.", (0+(3*o.nstride))%ntasks); } - if (unique_dir_per_task) { - sprintf(unique_mk_dir, "%s/mdtest_tree.%d.0", testdir, - (0+(0*nstride))%ntasks); - sprintf(unique_chdir_dir, "%s/mdtest_tree.%d.0", testdir, - (0+(1*nstride))%ntasks); - sprintf(unique_stat_dir, "%s/mdtest_tree.%d.0", testdir, - (0+(2*nstride))%ntasks); - sprintf(unique_read_dir, "%s/mdtest_tree.%d.0", testdir, - (0+(3*nstride))%ntasks); - sprintf(unique_rm_dir, "%s/mdtest_tree.%d.0", testdir, - (0+(4*nstride))%ntasks); - sprintf(unique_rm_uni_dir, "%s", testdir); + if (o.unique_dir_per_task) { + sprintf(o.unique_mk_dir, "%s/mdtest_tree.%d.0", o.testdir, + (0+(0*o.nstride))%ntasks); + sprintf(o.unique_chdir_dir, "%s/mdtest_tree.%d.0", o.testdir, + (0+(1*o.nstride))%ntasks); + sprintf(o.unique_stat_dir, "%s/mdtest_tree.%d.0", o.testdir, + (0+(2*o.nstride))%ntasks); + sprintf(o.unique_read_dir, "%s/mdtest_tree.%d.0", o.testdir, + (0+(3*o.nstride))%ntasks); + sprintf(o.unique_rm_dir, "%s/mdtest_tree.%d.0", o.testdir, + (0+(4*o.nstride))%ntasks); + sprintf(o.unique_rm_uni_dir, "%s", o.testdir); } } +void rename_dir_test(const int dirs, const long dir_iter, const char *path, rank_progress_t * progress) { + uint64_t parent_dir, item_num = 0; + char item[MAX_PATHLEN], temp[MAX_PATHLEN]; + char item_last[MAX_PATHLEN]; + + if(o.backend->rename == NULL){ + WARN("Backend doesn't support rename\n"); + return; + } + + VERBOSE(1,-1,"Entering mdtest_rename on %s", path ); + + uint64_t stop_items = o.items; + + if( o.directory_loops != 1 ){ + stop_items = o.items_per_dir; + } + + if(stop_items == 1) return; + + /* iterate over all of the item IDs */ + char first_item_name[MAX_PATHLEN]; + for (uint64_t i = 0 ; i < stop_items; ++i) { + item_num = i; + /* make adjustments if in leaf only mode*/ + if (o.leaf_only) { + item_num += o.items_per_dir * (o.num_dirs_in_tree - (uint64_t) pow( o.branch_factor, o.depth )); + } + + /* create name of file/dir to stat */ + if (dirs) { + sprintf(item, "dir.%s"LLU"", o.stat_name, item_num); + } else { + sprintf(item, "file.%s"LLU"", o.stat_name, item_num); + } + + /* determine the path to the file/dir to be stat'ed */ + parent_dir = item_num / o.items_per_dir; + + if (parent_dir > 0) { //item is not in tree's root directory + /* prepend parent directory to item's path */ + sprintf(temp, "%s."LLU"/%s", o.base_tree_name, parent_dir, item); + strcpy(item, temp); + + //still not at the tree's root dir + while (parent_dir > o.branch_factor) { + parent_dir = (uint64_t) ((parent_dir-1) / o.branch_factor); + sprintf(temp, "%s."LLU"/%s", o.base_tree_name, parent_dir, item); + strcpy(item, temp); + } + } + + /* Now get item to have the full path */ + sprintf( temp, "%s/%s", path, item ); + strcpy( item, temp ); + + VERBOSE(3,5,"mdtest_rename %4s: %s", (dirs ? "dir" : "file"), item); + if(i == 0){ + sprintf(first_item_name, "%s-XX", item); + strcpy(item_last, first_item_name); + }else if(i == stop_items - 1){ + strcpy(item, first_item_name); + } + if (-1 == o.backend->rename(item, item_last, o.backend_options)) { + EWARNF("unable to rename %s %s", dirs ? 
"directory" : "file", item); + } + + strcpy(item_last, item); + } +} + +static void updateResult(mdtest_results_t * res, mdtest_test_num_t test, uint64_t item_count, double t_start, double t_end, double t_end_before_barrier){ + res->time[test] = t_end - t_start; + if(isfinite(t_end_before_barrier)){ + res->time_before_barrier[test] = t_end_before_barrier - t_start; + }else{ + res->time_before_barrier[test] = res->time[test]; + } + if(item_count == 0){ + res->rate[test] = 0.0; + res->rate_before_barrier[test] = 0.0; + }else{ + res->rate[test] = item_count/res->time[test]; + res->rate_before_barrier[test] = item_count/res->time_before_barrier[test]; + } + res->items[test] = item_count; + res->stonewall_last_item[test] = o.items; +} + void directory_test(const int iteration, const int ntasks, const char *path, rank_progress_t * progress) { int size; - double t[5] = {0}; + double t_start, t_end, t_end_before_barrier; char temp_path[MAX_PATHLEN]; + mdtest_results_t * res = & o.summary_table[iteration]; MPI_Comm_size(testComm, &size); VERBOSE(1,-1,"Entering directory_test on %s", path ); MPI_Barrier(testComm); - t[0] = GetTimeStamp(); /* create phase */ - if(create_only) { - for (int dir_iter = 0; dir_iter < directory_loops; dir_iter ++){ + if(o.create_only) { + phase_prepare(); + t_start = GetTimeStamp(); + progress->stone_wall_timer_seconds = o.stone_wall_timer_seconds; + progress->items_done = 0; + progress->start_time = GetTimeStamp(); + for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){ prep_testdir(iteration, dir_iter); - if (unique_dir_per_task) { + if (o.unique_dir_per_task) { unique_dir_access(MK_UNI_DIR, temp_path); - if (!time_unique_dir_overhead) { - offset_timers(t, 0); + if (! o.time_unique_dir_overhead) { + t_start = GetTimeStamp(); } } else { - sprintf( temp_path, "%s/%s", testdir, path ); + sprintf( temp_path, "%s/%s", o.testdir, path ); } VERBOSE(3,-1,"directory_test: create path is '%s'", temp_path ); /* "touch" the files */ - if (collective_creates) { + if (o.collective_creates) { if (rank == 0) { collective_create_remove(1, 1, ntasks, temp_path, progress); } @@ -848,80 +965,117 @@ void directory_test(const int iteration, const int ntasks, const char *path, ran create_remove_items(0, 1, 1, 0, temp_path, 0, progress); } } + progress->stone_wall_timer_seconds = 0; + t_end_before_barrier = GetTimeStamp(); + phase_end(); + t_end = GetTimeStamp(); + updateResult(res, MDTEST_DIR_CREATE_NUM, o.items, t_start, t_end, t_end_before_barrier); } - phase_end(); - t[1] = GetTimeStamp(); - /* stat phase */ - if (stat_only) { - for (int dir_iter = 0; dir_iter < directory_loops; dir_iter ++){ + if (o.stat_only) { + phase_prepare(); + t_start = GetTimeStamp(); + for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){ prep_testdir(iteration, dir_iter); - if (unique_dir_per_task) { + if (o.unique_dir_per_task) { unique_dir_access(STAT_SUB_DIR, temp_path); - if (!time_unique_dir_overhead) { - offset_timers(t, 1); + if (! 
o.time_unique_dir_overhead) { + t_start = GetTimeStamp(); } } else { - sprintf( temp_path, "%s/%s", testdir, path ); + sprintf( temp_path, "%s/%s", o.testdir, path ); } VERBOSE(3,5,"stat path is '%s'", temp_path ); /* stat directories */ - if (random_seed > 0) { + if (o.random_seed > 0) { mdtest_stat(1, 1, dir_iter, temp_path, progress); } else { mdtest_stat(0, 1, dir_iter, temp_path, progress); } } + t_end_before_barrier = GetTimeStamp(); + phase_end(); + t_end = GetTimeStamp(); + updateResult(res, MDTEST_DIR_STAT_NUM, o.items, t_start, t_end, t_end_before_barrier); } - phase_end(); - t[2] = GetTimeStamp(); /* read phase */ - if (read_only) { - for (int dir_iter = 0; dir_iter < directory_loops; dir_iter ++){ + if (o.read_only) { + phase_prepare(); + t_start = GetTimeStamp(); + for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){ prep_testdir(iteration, dir_iter); - if (unique_dir_per_task) { + if (o.unique_dir_per_task) { unique_dir_access(READ_SUB_DIR, temp_path); - if (!time_unique_dir_overhead) { - offset_timers(t, 2); + if (! o.time_unique_dir_overhead) { + t_start = GetTimeStamp(); } } else { - sprintf( temp_path, "%s/%s", testdir, path ); + sprintf( temp_path, "%s/%s", o.testdir, path ); } VERBOSE(3,5,"directory_test: read path is '%s'", temp_path ); /* read directories */ - if (random_seed > 0) { + if (o.random_seed > 0) { ; /* N/A */ } else { ; /* N/A */ } } + t_end_before_barrier = GetTimeStamp(); + phase_end(); + t_end = GetTimeStamp(); + updateResult(res, MDTEST_DIR_READ_NUM, o.items, t_start, t_end, t_end_before_barrier); } - phase_end(); - t[3] = GetTimeStamp(); - - if (remove_only) { - for (int dir_iter = 0; dir_iter < directory_loops; dir_iter ++){ + /* rename phase */ + if(o.rename_dirs && o.items > 1){ + phase_prepare(); + t_start = GetTimeStamp(); + for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){ prep_testdir(iteration, dir_iter); - if (unique_dir_per_task) { - unique_dir_access(RM_SUB_DIR, temp_path); - if (!time_unique_dir_overhead) { - offset_timers(t, 3); + if (o.unique_dir_per_task) { + unique_dir_access(STAT_SUB_DIR, temp_path); + if (! 
o.time_unique_dir_overhead) { + t_start = GetTimeStamp(); } } else { - sprintf( temp_path, "%s/%s", testdir, path ); + sprintf( temp_path, "%s/%s", o.testdir, path ); + } + + VERBOSE(3,5,"rename path is '%s'", temp_path ); + + rename_dir_test(1, dir_iter, temp_path, progress); + } + t_end_before_barrier = GetTimeStamp(); + phase_end(); + t_end = GetTimeStamp(); + updateResult(res, MDTEST_DIR_RENAME_NUM, o.items, t_start, t_end, t_end_before_barrier); + } + + /* remove phase */ + if (o.remove_only) { + phase_prepare(); + t_start = GetTimeStamp(); + for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){ + prep_testdir(iteration, dir_iter); + if (o.unique_dir_per_task) { + unique_dir_access(RM_SUB_DIR, temp_path); + if (!o.time_unique_dir_overhead) { + t_start = GetTimeStamp(); + } + } else { + sprintf( temp_path, "%s/%s", o.testdir, path ); } VERBOSE(3,5,"directory_test: remove directories path is '%s'", temp_path ); /* remove directories */ - if (collective_creates) { + if (o.collective_creates) { if (rank == 0) { collective_create_remove(0, 1, ntasks, temp_path, progress); } @@ -929,234 +1083,231 @@ void directory_test(const int iteration, const int ntasks, const char *path, ran create_remove_items(0, 1, 0, 0, temp_path, 0, progress); } } + t_end_before_barrier = GetTimeStamp(); + phase_end(); + t_end = GetTimeStamp(); + updateResult(res, MDTEST_DIR_REMOVE_NUM, o.items, t_start, t_end, t_end_before_barrier); } - phase_end(); - t[4] = GetTimeStamp(); - - if (remove_only) { - if (unique_dir_per_task) { + if (o.remove_only) { + if (o.unique_dir_per_task) { unique_dir_access(RM_UNI_DIR, temp_path); } else { - sprintf( temp_path, "%s/%s", testdir, path ); + sprintf( temp_path, "%s/%s", o.testdir, path ); } VERBOSE(3,5,"directory_test: remove unique directories path is '%s'\n", temp_path ); } - if (unique_dir_per_task && !time_unique_dir_overhead) { - offset_timers(t, 4); - } - - /* calculate times */ - if (create_only) { - summary_table[iteration].rate[0] = items*size/(t[1] - t[0]); - summary_table[iteration].time[0] = t[1] - t[0]; - summary_table[iteration].items[0] = items*size; - summary_table[iteration].stonewall_last_item[0] = items; - } - if (stat_only) { - summary_table[iteration].rate[1] = items*size/(t[2] - t[1]); - summary_table[iteration].time[1] = t[2] - t[1]; - summary_table[iteration].items[1] = items*size; - summary_table[iteration].stonewall_last_item[1] = items; - } - if (read_only) { - summary_table[iteration].rate[2] = items*size/(t[3] - t[2]); - summary_table[iteration].time[2] = t[3] - t[2]; - summary_table[iteration].items[2] = items*size; - summary_table[iteration].stonewall_last_item[2] = items; - } - if (remove_only) { - summary_table[iteration].rate[3] = items*size/(t[4] - t[3]); - summary_table[iteration].time[3] = t[4] - t[3]; - summary_table[iteration].items[3] = items*size; - summary_table[iteration].stonewall_last_item[3] = items; - } - - VERBOSE(1,-1," Directory creation: %14.3f sec, %14.3f ops/sec", t[1] - t[0], summary_table[iteration].rate[0]); - VERBOSE(1,-1," Directory stat : %14.3f sec, %14.3f ops/sec", t[2] - t[1], summary_table[iteration].rate[1]); - /* N/A - VERBOSE(1,-1," Directory read : %14.3f sec, %14.3f ops/sec", t[3] - t[2], summary_table[iteration].rate[2]); - */ - VERBOSE(1,-1," Directory removal : %14.3f sec, %14.3f ops/sec", t[4] - t[3], summary_table[iteration].rate[3]); + VERBOSE(1,-1," Directory creation: %14.3f sec, %14.3f ops/sec", res->time[MDTEST_DIR_CREATE_NUM], o.summary_table[iteration].rate[MDTEST_DIR_CREATE_NUM]); + 
VERBOSE(1,-1," Directory stat : %14.3f sec, %14.3f ops/sec", res->time[MDTEST_DIR_STAT_NUM], o.summary_table[iteration].rate[MDTEST_DIR_STAT_NUM]); + VERBOSE(1,-1," Directory rename : %14.3f sec, %14.3f ops/sec", res->time[MDTEST_DIR_RENAME_NUM], o.summary_table[iteration].rate[MDTEST_DIR_RENAME_NUM]); + VERBOSE(1,-1," Directory removal : %14.3f sec, %14.3f ops/sec", res->time[MDTEST_DIR_REMOVE_NUM], o.summary_table[iteration].rate[MDTEST_DIR_REMOVE_NUM]); } /* Returns if the stonewall was hit */ -int updateStoneWallIterations(int iteration, rank_progress_t * progress, double tstart){ +int updateStoneWallIterations(int iteration, uint64_t items_done, double tstart, uint64_t * out_max_iter){ int hit = 0; - uint64_t done = progress->items_done; long long unsigned max_iter = 0; - VERBOSE(1,1,"stonewall hit with %lld items", (long long) progress->items_done ); - MPI_Allreduce(& progress->items_done, & max_iter, 1, MPI_LONG_LONG_INT, MPI_MAX, testComm); - summary_table[iteration].stonewall_time[MDTEST_FILE_CREATE_NUM] = GetTimeStamp() - tstart; + VERBOSE(1,1,"stonewall hit with %lld items", (long long) items_done ); + MPI_Allreduce(& items_done, & max_iter, 1, MPI_LONG_LONG_INT, MPI_MAX, testComm); + o.summary_table[iteration].stonewall_time[MDTEST_FILE_CREATE_NUM] = GetTimeStamp() - tstart; + o.summary_table[iteration].stonewall_last_item[MDTEST_FILE_CREATE_NUM] = items_done; + *out_max_iter = max_iter; // continue to the maximum... long long min_accessed = 0; - MPI_Reduce(& progress->items_done, & min_accessed, 1, MPI_LONG_LONG_INT, MPI_MIN, 0, testComm); + MPI_Reduce(& items_done, & min_accessed, 1, MPI_LONG_LONG_INT, MPI_MIN, 0, testComm); long long sum_accessed = 0; - MPI_Reduce(& progress->items_done, & sum_accessed, 1, MPI_LONG_LONG_INT, MPI_SUM, 0, testComm); - summary_table[iteration].stonewall_item_sum[MDTEST_FILE_CREATE_NUM] = sum_accessed; - summary_table[iteration].stonewall_item_min[MDTEST_FILE_CREATE_NUM] = min_accessed * size; + MPI_Reduce(& items_done, & sum_accessed, 1, MPI_LONG_LONG_INT, MPI_SUM, 0, testComm); + o.summary_table[iteration].stonewall_item_sum[MDTEST_FILE_CREATE_NUM] = sum_accessed; + o.summary_table[iteration].stonewall_item_min[MDTEST_FILE_CREATE_NUM] = min_accessed * o.size; - if(items != (sum_accessed / size)){ - VERBOSE(0,-1, "Continue stonewall hit min: %lld max: %lld avg: %.1f \n", min_accessed, max_iter, ((double) sum_accessed) / size); + if(o.items != (sum_accessed / o.size)){ + VERBOSE(0,-1, "Continue stonewall hit min: %lld max: %lld avg: %.1f \n", min_accessed, max_iter, ((double) sum_accessed) / o.size); hit = 1; } - progress->items_start = done; - progress->items_per_dir = max_iter; return hit; } +void file_test_create(const int iteration, const int ntasks, const char *path, rank_progress_t * progress, double *t_start){ + char temp_path[MAX_PATHLEN]; + for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){ + prep_testdir(iteration, dir_iter); + + if (o.unique_dir_per_task) { + unique_dir_access(MK_UNI_DIR, temp_path); + VERBOSE(5,5,"operating on %s", temp_path); + if (! 
o.time_unique_dir_overhead) { + *t_start = GetTimeStamp(); + } + } else { + sprintf( temp_path, "%s/%s", o.testdir, path ); + } + + VERBOSE(3,-1,"file_test: create path is '%s'", temp_path ); + /* "touch" the files */ + if (o.collective_creates) { + if (rank == 0) { + collective_create_remove(1, 0, ntasks, temp_path, progress); + } + MPI_Barrier(testComm); + } + + /* create files */ + create_remove_items(0, 0, 1, 0, temp_path, 0, progress); + if(o.stone_wall_timer_seconds){ + // hit the stonewall + uint64_t max_iter = 0; + uint64_t items_done = progress->items_done + dir_iter * o.items_per_dir; + int hit = updateStoneWallIterations(iteration, items_done, *t_start, & max_iter); + progress->items_start = items_done; + progress->items_per_dir = max_iter; + if (hit){ + progress->stone_wall_timer_seconds = 0; + VERBOSE(1,1,"stonewall: %lld of %lld", (long long) progress->items_start, (long long) progress->items_per_dir); + create_remove_items(0, 0, 1, 0, temp_path, 0, progress); + // now reset the values + progress->stone_wall_timer_seconds = o.stone_wall_timer_seconds; + o.items = progress->items_done; + } + if (o.stoneWallingStatusFile){ + StoreStoneWallingIterations(o.stoneWallingStatusFile, max_iter); + } + // reset stone wall timer to allow proper cleanup + progress->stone_wall_timer_seconds = 0; + // at the moment, stonewall can be done only with one directory_loop, so we can return here safely + break; + } + } +} + void file_test(const int iteration, const int ntasks, const char *path, rank_progress_t * progress) { int size; - double t[5] = {0}; + double t_start, t_end, t_end_before_barrier; char temp_path[MAX_PATHLEN]; + mdtest_results_t * res = & o.summary_table[iteration]; + MPI_Comm_size(testComm, &size); VERBOSE(3,5,"Entering file_test on %s", path); MPI_Barrier(testComm); - t[0] = GetTimeStamp(); /* create phase */ - if (create_only ) { - for (int dir_iter = 0; dir_iter < directory_loops; dir_iter ++){ - prep_testdir(iteration, dir_iter); - - if (unique_dir_per_task) { - unique_dir_access(MK_UNI_DIR, temp_path); - VERBOSE(5,5,"operating on %s", temp_path); - if (!time_unique_dir_overhead) { - offset_timers(t, 0); - } - } else { - sprintf( temp_path, "%s/%s", testdir, path ); - } - - - - VERBOSE(3,-1,"file_test: create path is '%s'", temp_path ); - - /* "touch" the files */ - if (collective_creates) { - if (rank == 0) { - collective_create_remove(1, 0, ntasks, temp_path, progress); - } - MPI_Barrier(testComm); - } - - /* create files */ - create_remove_items(0, 0, 1, 0, temp_path, 0, progress); - if(stone_wall_timer_seconds){ - int hit = updateStoneWallIterations(iteration, progress, t[0]); - - if (hit){ - progress->stone_wall_timer_seconds = 0; - VERBOSE(1,1,"stonewall: %lld of %lld", (long long) progress->items_start, (long long) progress->items_per_dir); - create_remove_items(0, 0, 1, 0, temp_path, 0, progress); - // now reset the values - progress->stone_wall_timer_seconds = stone_wall_timer_seconds; - items = progress->items_done; - } - if (stoneWallingStatusFile){ - StoreStoneWallingIterations(stoneWallingStatusFile, progress->items_done); - } - // reset stone wall timer to allow proper cleanup - progress->stone_wall_timer_seconds = 0; - } - } + if (o.create_only ) { + phase_prepare(); + t_start = GetTimeStamp(); + progress->stone_wall_timer_seconds = o.stone_wall_timer_seconds; + progress->items_done = 0; + progress->start_time = GetTimeStamp(); + file_test_create(iteration, ntasks, path, progress, &t_start); + t_end_before_barrier = GetTimeStamp(); + phase_end(); + t_end = 
GetTimeStamp(); + updateResult(res, MDTEST_FILE_CREATE_NUM, o.items, t_start, t_end, t_end_before_barrier); }else{ - if (stoneWallingStatusFile){ + if (o.stoneWallingStatusFile){ int64_t expected_items; /* The number of items depends on the stonewalling file */ - expected_items = ReadStoneWallingIterations(stoneWallingStatusFile); + expected_items = ReadStoneWallingIterations(o.stoneWallingStatusFile, testComm); if(expected_items >= 0){ - items = expected_items; - progress->items_per_dir = items; + if(o.directory_loops > 1){ + o.directory_loops = expected_items / o.items_per_dir; + o.items = o.items_per_dir; + }else{ + o.items = expected_items; + progress->items_per_dir = o.items; + } } if (rank == 0) { if(expected_items == -1){ - fprintf(out_logfile, "WARNING: could not read stonewall status file\n"); + WARN("Could not read stonewall status file"); }else { - VERBOSE(1,1, "Read stonewall status; items: "LLU"\n", items); + VERBOSE(1,1, "Read stonewall status; items: "LLU"\n", o.items); } } } } - phase_end(); - t[1] = GetTimeStamp(); - /* stat phase */ - if (stat_only ) { - for (int dir_iter = 0; dir_iter < directory_loops; dir_iter ++){ + if (o.stat_only ) { + phase_prepare(); + t_start = GetTimeStamp(); + for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){ prep_testdir(iteration, dir_iter); - if (unique_dir_per_task) { + if (o.unique_dir_per_task) { unique_dir_access(STAT_SUB_DIR, temp_path); - if (!time_unique_dir_overhead) { - offset_timers(t, 1); + if (!o.time_unique_dir_overhead) { + t_start = GetTimeStamp(); } } else { - sprintf( temp_path, "%s/%s", testdir, path ); + sprintf( temp_path, "%s/%s", o.testdir, path ); } VERBOSE(3,5,"file_test: stat path is '%s'", temp_path ); /* stat files */ - mdtest_stat((random_seed > 0 ? 1 : 0), 0, dir_iter, temp_path, progress); + mdtest_stat((o.random_seed > 0 ? 1 : 0), 0, dir_iter, temp_path, progress); } + t_end_before_barrier = GetTimeStamp(); + phase_end(); + t_end = GetTimeStamp(); + updateResult(res, MDTEST_FILE_STAT_NUM, o.items, t_start, t_end, t_end_before_barrier); } - phase_end(); - t[2] = GetTimeStamp(); - /* read phase */ - if (read_only ) { - for (int dir_iter = 0; dir_iter < directory_loops; dir_iter ++){ + if (o.read_only ) { + phase_prepare(); + t_start = GetTimeStamp(); + for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){ prep_testdir(iteration, dir_iter); - if (unique_dir_per_task) { + if (o.unique_dir_per_task) { unique_dir_access(READ_SUB_DIR, temp_path); - if (!time_unique_dir_overhead) { - offset_timers(t, 2); + if (! 
o.time_unique_dir_overhead) { + t_start = GetTimeStamp(); } } else { - sprintf( temp_path, "%s/%s", testdir, path ); + sprintf( temp_path, "%s/%s", o.testdir, path ); } VERBOSE(3,5,"file_test: read path is '%s'", temp_path ); /* read files */ - if (random_seed > 0) { + if (o.random_seed > 0) { mdtest_read(1,0, dir_iter, temp_path); } else { mdtest_read(0,0, dir_iter, temp_path); } } + t_end_before_barrier = GetTimeStamp(); + phase_end(); + t_end = GetTimeStamp(); + updateResult(res, MDTEST_FILE_READ_NUM, o.items, t_start, t_end, t_end_before_barrier); } - phase_end(); - t[3] = GetTimeStamp(); - - if (remove_only) { + /* remove phase */ + if (o.remove_only) { + phase_prepare(); + t_start = GetTimeStamp(); progress->items_start = 0; - for (int dir_iter = 0; dir_iter < directory_loops; dir_iter ++){ + for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){ prep_testdir(iteration, dir_iter); - if (unique_dir_per_task) { + if (o.unique_dir_per_task) { unique_dir_access(RM_SUB_DIR, temp_path); - if (!time_unique_dir_overhead) { - offset_timers(t, 3); + if (! o.time_unique_dir_overhead) { + t_start = GetTimeStamp(); } } else { - sprintf( temp_path, "%s/%s", testdir, path ); + sprintf( temp_path, "%s/%s", o.testdir, path ); } VERBOSE(3,5,"file_test: rm directories path is '%s'", temp_path ); - if (collective_creates) { + if (o.collective_creates) { if (rank == 0) { collective_create_remove(0, 0, ntasks, temp_path, progress); } @@ -1165,12 +1316,14 @@ void file_test(const int iteration, const int ntasks, const char *path, rank_pro create_remove_items(0, 0, 0, 0, temp_path, 0, progress); } } + t_end_before_barrier = GetTimeStamp(); + phase_end(); + t_end = GetTimeStamp(); + updateResult(res, MDTEST_FILE_REMOVE_NUM, o.items, t_start, t_end, t_end_before_barrier); } - phase_end(); - t[4] = GetTimeStamp(); - if (remove_only) { - if (unique_dir_per_task) { + if (o.remove_only) { + if (o.unique_dir_per_task) { unique_dir_access(RM_UNI_DIR, temp_path); } else { strcpy( temp_path, path ); @@ -1179,225 +1332,391 @@ void file_test(const int iteration, const int ntasks, const char *path, rank_pro VERBOSE(3,5,"file_test: rm unique directories path is '%s'", temp_path ); } - if (unique_dir_per_task && !time_unique_dir_overhead) { - offset_timers(t, 4); + if(o.num_dirs_in_tree_calc){ /* this is temporary fix needed when using -n and -i together */ + o.items *= o.num_dirs_in_tree_calc; } - if(num_dirs_in_tree_calc){ /* this is temporary fix needed when using -n and -i together */ - items *= num_dirs_in_tree_calc; + VERBOSE(1,-1," File creation : %14.3f sec, %14.3f ops/sec", res->time[MDTEST_FILE_CREATE_NUM], o.summary_table[iteration].rate[4]); + if(o.summary_table[iteration].stonewall_time[MDTEST_FILE_CREATE_NUM]){ + VERBOSE(1,-1," File creation (stonewall): %14.3f sec, %14.3f ops/sec", o.summary_table[iteration].stonewall_time[MDTEST_FILE_CREATE_NUM], o.summary_table[iteration].stonewall_item_sum[MDTEST_FILE_CREATE_NUM]); } - - /* calculate times */ - if (create_only) { - summary_table[iteration].rate[4] = items*size/(t[1] - t[0]); - summary_table[iteration].time[4] = t[1] - t[0]; - summary_table[iteration].items[4] = items*size; - summary_table[iteration].stonewall_last_item[4] = items; - } - if (stat_only) { - summary_table[iteration].rate[5] = items*size/(t[2] - t[1]); - summary_table[iteration].time[5] = t[2] - t[1]; - summary_table[iteration].items[5] = items*size; - summary_table[iteration].stonewall_last_item[5] = items; - } - if (read_only) { - summary_table[iteration].rate[6] = 
items*size/(t[3] - t[2]); - summary_table[iteration].time[6] = t[3] - t[2]; - summary_table[iteration].items[6] = items*size; - summary_table[iteration].stonewall_last_item[6] = items; - } - if (remove_only) { - summary_table[iteration].rate[7] = items*size/(t[4] - t[3]); - summary_table[iteration].time[7] = t[4] - t[3]; - summary_table[iteration].items[7] = items*size; - summary_table[iteration].stonewall_last_item[7] = items; - } - - VERBOSE(1,-1," File creation : %14.3f sec, %14.3f ops/sec", t[1] - t[0], summary_table[iteration].rate[4]); - if(summary_table[iteration].stonewall_time[MDTEST_FILE_CREATE_NUM]){ - VERBOSE(1,-1," File creation (stonewall): %14.3f sec, %14.3f ops/sec", summary_table[iteration].stonewall_time[MDTEST_FILE_CREATE_NUM], summary_table[iteration].stonewall_item_sum[MDTEST_FILE_CREATE_NUM]); - } - VERBOSE(1,-1," File stat : %14.3f sec, %14.3f ops/sec", t[2] - t[1], summary_table[iteration].rate[5]); - VERBOSE(1,-1," File read : %14.3f sec, %14.3f ops/sec", t[3] - t[2], summary_table[iteration].rate[6]); - VERBOSE(1,-1," File removal : %14.3f sec, %14.3f ops/sec", t[4] - t[3], summary_table[iteration].rate[7]); + VERBOSE(1,-1," File stat : %14.3f sec, %14.3f ops/sec", res->time[MDTEST_FILE_STAT_NUM], o.summary_table[iteration].rate[5]); + VERBOSE(1,-1," File read : %14.3f sec, %14.3f ops/sec", res->time[MDTEST_FILE_READ_NUM], o.summary_table[iteration].rate[6]); + VERBOSE(1,-1," File removal : %14.3f sec, %14.3f ops/sec", res->time[MDTEST_FILE_REMOVE_NUM], o.summary_table[iteration].rate[7]); } -void summarize_results(int iterations, int print_time) { - char access[MAX_PATHLEN]; - int i, j, k; - int start, stop, tableSize = MDTEST_LAST_NUM; - double min, max, mean, sd, sum = 0, var = 0, curr = 0; +char const * mdtest_test_name(int i){ + switch (i) { + case MDTEST_DIR_CREATE_NUM: return "Directory creation"; + case MDTEST_DIR_STAT_NUM: return "Directory stat"; + case MDTEST_DIR_READ_NUM: return "Directory read"; + case MDTEST_DIR_REMOVE_NUM: return "Directory removal"; + case MDTEST_DIR_RENAME_NUM: return "Directory rename"; + case MDTEST_FILE_CREATE_NUM: return "File creation"; + case MDTEST_FILE_STAT_NUM: return "File stat"; + case MDTEST_FILE_READ_NUM: return "File read"; + case MDTEST_FILE_REMOVE_NUM: return "File removal"; + case MDTEST_TREE_CREATE_NUM: return "Tree creation"; + case MDTEST_TREE_REMOVE_NUM: return "Tree removal"; + default: return "ERR INVALID TESTNAME :"; + } + return NULL; +} - double all[iterations * size * tableSize]; +/* + * Store the results of each process in a file + */ +static void StoreRankInformation(int iterations, mdtest_results_t * agg){ + const size_t size = sizeof(mdtest_results_t) * iterations; + if(rank == 0){ + FILE* fd = fopen(o.saveRankDetailsCSV, "a"); + if (fd == NULL){ + FAIL("Cannot open saveRankPerformanceDetails file for writes!"); + } + mdtest_results_t * results = safeMalloc(size * o.size); + MPI_Gather(o.summary_table, size / sizeof(double), MPI_DOUBLE, results, size / sizeof(double), MPI_DOUBLE, 0, testComm); - VERBOSE(1,-1,"Entering summarize_results..." 
); - - MPI_Barrier(testComm); for(int i=0; i < iterations; i++){ if(print_time){ MPI_Gather(& summary_table[i].time[0], tableSize, MPI_DOUBLE, & all[i*tableSize*size], tableSize, MPI_DOUBLE, 0, testComm); + char buff[4096]; + char * cpos = buff; + cpos += sprintf(cpos, "all,%llu", (long long unsigned) o.items); + for(int e = 0; e < MDTEST_LAST_NUM; e++){ + if(agg->items[e] == 0){ + cpos += sprintf(cpos, ",,"); }else{ - MPI_Gather(& summary_table[i].rate[0], tableSize, MPI_DOUBLE, & all[i*tableSize*size], tableSize, MPI_DOUBLE, 0, testComm); + cpos += sprintf(cpos, ",%.10e,%.10e", agg->items[e] / agg->time[e], agg->time[e]); } } + cpos += sprintf(cpos, "\n"); + int ret = fwrite(buff, cpos - buff, 1, fd); - if (rank != 0) { - return; - } - - VERBOSE(0,-1,"\nSUMMARY %s: (of %d iterations)", print_time ? "time": "rate", iterations); - VERBOSE(0,-1," Operation Max Min Mean Std Dev"); - VERBOSE(0,-1," --------- --- --- ---- -------"); - - /* if files only access, skip entries 0-3 (the dir tests) */ - if (files_only && !dirs_only) { - start = 4; - } else { - start = 0; - } - - /* if directories only access, skip entries 4-7 (the file tests) */ - if (dirs_only && !files_only) { - stop = 4; - } else { - stop = 8; - } - - /* special case: if no directory or file tests, skip all */ - if (!dirs_only && !files_only) { - start = stop = 0; - } - - for (i = start; i < stop; i++) { - min = max = all[i]; - for (k=0; k < size; k++) { - for (j = 0; j < iterations; j++) { - curr = all[(k*tableSize*iterations) - + (j*tableSize) + i]; - if (min > curr) { - min = curr; - } - if (max < curr) { - max = curr; - } - sum += curr; - } - } - mean = sum / (iterations * size); - for (k=0; kitems[e] == 0){ + cpos += sprintf(cpos, ",,"); + }else{ + cpos += sprintf(cpos, ",%.10e,%.10e", cur->items[e] / cur->time_before_barrier[e], cur->time_before_barrier[e]); + } + } + cpos += sprintf(cpos, "\n"); + ret = fwrite(buff, cpos - buff, 1, fd); + if(ret != 1){ + WARN("Couldn't append to saveRankPerformanceDetailsCSV file\n"); + break; + } } } - if(stonewall_items != 0){ - fprintf(out_logfile, " File create (stonewall) : "); - fprintf(out_logfile, "%14s %14s %14.3f %14s\n", "NA", "NA", print_time ? stonewall_time : stonewall_items / stonewall_time, "NA"); + fclose(fd); + free(results); + }else{ + /* this is a hack for now assuming all datatypes in the structure are double */ + MPI_Gather(o.summary_table, size / sizeof(double), MPI_DOUBLE, NULL, size / sizeof(double), MPI_DOUBLE, 0, testComm); + } +} + +static mdtest_results_t* get_result_index(mdtest_results_t* all_results, int proc, int iter, int iteration_count){ + return & all_results[proc * iteration_count + iter]; +} + +static void summarize_results_rank0(int iterations, mdtest_results_t * all_results, int print_time) { + int start, stop; + double min, max, mean, sd, sum, var, curr = 0; + double imin, imax, imean, isum, icur; // calculation per iteration + char const * access; + /* if files only access, skip entries 0-3 (the dir tests) */ + if (o.files_only && ! o.dirs_only) { + start = MDTEST_FILE_CREATE_NUM; + } else { + start = 0; + } + + /* if directories only access, skip entries 4-7 (the file tests) */ + if (o.dirs_only && !o.files_only) { + stop = MDTEST_FILE_CREATE_NUM; + } else { + stop = MDTEST_TREE_CREATE_NUM; + } + + /* special case: if no directory or file tests, skip all */ + if (!o.dirs_only && !o.files_only) { + start = stop = 0; + } + + if(o.print_all_proc){ + fprintf(out_logfile, "\nPer process result (%s):\n", print_time ? 
"time" : "rate"); + for (int j = 0; j < iterations; j++) { + fprintf(out_logfile, "iteration: %d\n", j); + for (int i = start; i < MDTEST_LAST_NUM; i++) { + access = mdtest_test_name(i); + if(access == NULL){ + continue; + } + fprintf(out_logfile, "Test %s", access); + for (int k=0; k < o.size; k++) { + mdtest_results_t * cur = get_result_index(all_results, k, j, iterations); + if(print_time){ + curr = cur->time_before_barrier[i]; + }else{ + curr = cur->rate_before_barrier[i]; + } + fprintf(out_logfile, "%c%e", (k==0 ? ' ': ','), curr); + } + fprintf(out_logfile, "\n"); + } } + } - /* calculate tree create/remove rates */ - for (i = 8; i < tableSize; i++) { - min = max = all[i]; - for (j = 0; j < iterations; j++) { - if(print_time){ - curr = summary_table[j].time[i]; - }else{ - curr = summary_table[j].rate[i]; - } + VERBOSE(0, -1, "\nSUMMARY %s: (of %d iterations)", print_time ? "time" : "rate", iterations); + PRINT(" Operation "); + if(o.show_perrank_statistics){ + PRINT("per Rank: Max Min Mean per Iteration:"); + }else{ + PRINT(" "); + } + PRINT(" Max Min Mean Std Dev\n"); + PRINT(" --------- "); - if (min > curr) { - min = curr; - } - if (max < curr) { - max = curr; - } - sum += curr; + if(o.show_perrank_statistics){ + PRINT(" --- --- ---- "); + } + PRINT(" --- --- ---- -------\n"); + for (int i = start; i < stop; i++) { + min = 1e308; + max = 0; + sum = var = 0; + imin = 1e308; + isum = imax = 0; + double iter_result[iterations]; + for (int j = 0; j < iterations; j++) { + icur = print_time ? 0 : 1e308; + for (int k = 0; k < o.size; k++) { + mdtest_results_t * cur = get_result_index(all_results, k, j, iterations); + if(print_time){ + curr = cur->time_before_barrier[i]; + }else{ + curr = cur->rate_before_barrier[i]; } - mean = sum / (iterations); - for (j = 0; j < iterations; j++) { - if(print_time){ - curr = summary_table[j].time[i]; - }else{ - curr = summary_table[j].rate[i]; - } + if (min > curr) { + min = curr; + } + if (max < curr) { + max = curr; + } + sum += curr; - var += pow((mean - curr), 2); + if (print_time) { + curr = cur->time[i]; + if (icur < curr) { + icur = curr; + } + } else { + curr = cur->rate[i]; + if (icur > curr) { + icur = curr; + } } - var = var / (iterations); - sd = sqrt(var); - switch (i) { - case 8: strcpy(access, "Tree creation :"); break; - case 9: strcpy(access, "Tree removal :"); break; - default: strcpy(access, "ERR"); break; - } - fprintf(out_logfile, " %s ", access); + } + + if (icur > imax) { + imax = icur; + } + if (icur < imin) { + imin = icur; + } + isum += icur; + if(print_time){ + iter_result[j] = icur; + }else{ + iter_result[j] = icur * o.size; + } + } + mean = sum / iterations / o.size; + imean = isum / iterations; + if(! print_time){ + imax *= o.size; + imin *= o.size; + isum *= o.size; + imean *= o.size; + } + for (int j = 0; j < iterations; j++) { + var += (imean - iter_result[j]) * (imean - iter_result[j]); + } + var = var / (iterations - 1); + sd = sqrt(var); + access = mdtest_test_name(i); + if (i != 2) { + fprintf(out_logfile, " %-18s ", access); + + if(o.show_perrank_statistics){ fprintf(out_logfile, "%14.3f ", max); fprintf(out_logfile, "%14.3f ", min); fprintf(out_logfile, "%14.3f ", mean); - fprintf(out_logfile, "%14.3f\n", sd); - fflush(out_logfile); - sum = var = 0; + fprintf(out_logfile, " "); + } + fprintf(out_logfile, " "); + fprintf(out_logfile, "%14.3f ", imax); + fprintf(out_logfile, "%14.3f ", imin); + fprintf(out_logfile, "%14.3f ", imean); + fprintf(out_logfile, "%14.3f\n", iterations == 1 ? 
0 : sd); + fflush(out_logfile); } + } + + /* calculate tree create/remove rates, applies only to Rank 0 */ + for (int i = MDTEST_TREE_CREATE_NUM; i < MDTEST_LAST_NUM; i++) { + min = imin = 1e308; + max = imax = 0; + sum = var = 0; + for (int j = 0; j < iterations; j++) { + if(print_time){ + curr = o.summary_table[j].time[i]; + }else{ + curr = o.summary_table[j].rate[i]; + } + if (min > curr) { + min = curr; + } + if (max < curr) { + max = curr; + } + sum += curr; + if(curr > imax){ + imax = curr; + } + if(curr < imin){ + imin = curr; + } + } + + mean = sum / (iterations); + + for (int j = 0; j < iterations; j++) { + if(print_time){ + curr = o.summary_table[j].time[i]; + }else{ + curr = o.summary_table[j].rate[i]; + } + var += (mean - curr)*(mean - curr); + } + var = var / (iterations - 1); + sd = sqrt(var); + access = mdtest_test_name(i); + fprintf(out_logfile, " %-22s ", access); + if(o.show_perrank_statistics){ + fprintf(out_logfile, "%14.3f ", max); + fprintf(out_logfile, "%14.3f ", min); + fprintf(out_logfile, "%14.3f ", mean); + fprintf(out_logfile, " "); + } + fprintf(out_logfile, "%14.3f ", imax); + fprintf(out_logfile, "%14.3f ", imin); + fprintf(out_logfile, "%14.3f ", sum / iterations); + fprintf(out_logfile, "%14.3f\n", iterations == 1 ? 0 : sd); + fflush(out_logfile); + } +} + +/* + Output the results and summarize them into rank 0's o.summary_table + */ +void summarize_results(int iterations, mdtest_results_t * results) { + const size_t size = sizeof(mdtest_results_t) * iterations; + mdtest_results_t * all_results = NULL; + if(rank == 0){ + all_results = safeMalloc(size * o.size); + memset(all_results, 0, size * o.size); + MPI_Gather(o.summary_table, size / sizeof(double), MPI_DOUBLE, all_results, size / sizeof(double), MPI_DOUBLE, 0, testComm); + // calculate the aggregated values for all processes + for(int j=0; j < iterations; j++){ + for(int i=0; i < MDTEST_LAST_NUM; i++){ + //double sum_rate = 0; + double max_time = 0; + double max_stonewall_time = 0; + uint64_t sum_items = 0; + + // reduce over the processes + for(int p=0; p < o.size; p++){ + mdtest_results_t * cur = get_result_index(all_results, p, j, iterations); + //sum_rate += all_results[p + j*p]->rate[i]; + double t = cur->time[i]; + max_time = max_time < t ? t : max_time; + + sum_items += cur->items[i]; + + t = cur->stonewall_time[i]; + max_stonewall_time = max_stonewall_time < t ? 
t : max_stonewall_time; + } + + results[j].items[i] = sum_items; + results[j].time[i] = max_time; + results[j].stonewall_time[i] = max_stonewall_time; + if(sum_items == 0){ + results[j].rate[i] = 0.0; + }else{ + results[j].rate[i] = sum_items / max_time; + } + + /* These results have already been reduced to Rank 0 */ + results[j].stonewall_item_sum[i] = o.summary_table[j].stonewall_item_sum[i]; + results[j].stonewall_item_min[i] = o.summary_table[j].stonewall_item_min[i]; + results[j].stonewall_time[i] = o.summary_table[j].stonewall_time[i]; + } + } + }else{ + MPI_Gather(o.summary_table, size / sizeof(double), MPI_DOUBLE, NULL, size / sizeof(double), MPI_DOUBLE, 0, testComm); + } + + /* share global results across processes as these are returned by the API */ + MPI_Bcast(results, size / sizeof(double), MPI_DOUBLE, 0, testComm); + + /* update relevant result values with local values as these are returned by the API */ + for(int j=0; j < iterations; j++){ + for(int i=0; i < MDTEST_LAST_NUM; i++){ + results[j].time_before_barrier[i] = o.summary_table[j].time_before_barrier[i]; + results[j].stonewall_last_item[i] = o.summary_table[j].stonewall_last_item[i]; + } + } + + if(rank != 0){ + return; + } + + if (o.print_rate_and_time){ + summarize_results_rank0(iterations, all_results, 0); + summarize_results_rank0(iterations, all_results, 1); + }else{ + summarize_results_rank0(iterations, all_results, o.print_time); + } + + free(all_results); } /* Checks to see if the test setup is valid. If it isn't, fail. */ -void valid_tests() { +void md_validate_tests() { - if (((stone_wall_timer_seconds > 0) && (branch_factor > 1)) || ! barriers) { - FAIL( "Error, stone wall timer does only work with a branch factor <= 1 (current is %d) and with barriers\n", branch_factor); + if (((o.stone_wall_timer_seconds > 0) && (o.branch_factor > 1)) || ! o.barriers) { + FAIL( "Error, the stone wall timer only works with a branch factor <= 1 (current is %d) and with barriers\n", o.branch_factor); } - if (!create_only && !stat_only && !read_only && !remove_only) { - create_only = stat_only = read_only = remove_only = 1; + if (!o.create_only && ! o.stat_only && ! o.read_only && !o.remove_only && !o.rename_dirs) { + o.create_only = o.stat_only = o.read_only = o.remove_only = o.rename_dirs = 1; VERBOSE(1,-1,"main: Setting create/stat/read/remove_only to True" ); } - VERBOSE(1,-1,"Entering valid_tests..." ); + VERBOSE(1,-1,"Entering md_validate_tests..." ); /* if dirs_only and files_only were both left unset, set both now */ - if (!dirs_only && !files_only) { - dirs_only = files_only = 1; + if (!o.dirs_only && !o.files_only) { + o.dirs_only = o.files_only = 1; } /* if shared file 'S' access, no directory tests */ - if (shared_file) { - dirs_only = 0; + if (o.shared_file) { + o.dirs_only = 0; } /* check for no barriers with shifting processes for different phases. @@ -1405,63 +1724,95 @@ void valid_tests() { race conditions that may cause errors stat'ing or deleting after creates. 
*/ - if (( barriers == 0 ) && ( nstride != 0 ) && ( rank == 0 )) { + if (( o.barriers == 0 ) && ( o.nstride != 0 ) && ( rank == 0 )) { FAIL( "Possible race conditions will occur: -B not compatible with -N"); } /* check for collective_creates incompatibilities */ - if (shared_file && collective_creates && rank == 0) { + if (o.shared_file && o.collective_creates && rank == 0) { FAIL("-c not compatible with -S"); } - if (path_count > 1 && collective_creates && rank == 0) { + if (o.path_count > 1 && o.collective_creates && rank == 0) { FAIL("-c not compatible with multiple test directories"); } - if (collective_creates && !barriers) { + if (o.collective_creates && !o.barriers) { FAIL("-c not compatible with -B"); } /* check for shared file incompatibilities */ - if (unique_dir_per_task && shared_file && rank == 0) { + if (o.unique_dir_per_task && o.shared_file && rank == 0) { FAIL("-u not compatible with -S"); } /* check multiple directory paths and strided option */ - if (path_count > 1 && nstride > 0) { + if (o.path_count > 1 && o.nstride > 0) { FAIL("cannot have multiple directory paths with -N strides between neighbor tasks"); } /* check for shared directory and multiple directories incompatibility */ - if (path_count > 1 && unique_dir_per_task != 1) { + if (o.path_count > 1 && o.unique_dir_per_task != 1) { FAIL("shared directory mode is not compatible with multiple directory paths"); } /* check if more directory paths than ranks */ - if (path_count > size) { + if (o.path_count > o.size) { FAIL("cannot have more directory paths than MPI tasks"); } /* check depth */ - if (depth < 0) { + if (o.depth < 0) { FAIL("depth must be greater than or equal to zero"); } /* check branch_factor */ - if (branch_factor < 1 && depth > 0) { + if (o.branch_factor < 1 && o.depth > 0) { FAIL("branch factor must be greater than or equal to zero"); } /* check for valid number of items */ - if ((items > 0) && (items_per_dir > 0)) { - if(unique_dir_per_task){ + if ((o.items > 0) && (o.items_per_dir > 0)) { + if(o.unique_dir_per_task){ FAIL("only specify the number of items or the number of items per directory"); - }else if( items % items_per_dir != 0){ + }else if( o.items % o.items_per_dir != 0){ FAIL("items must be a multiple of items per directory"); - }else if( stone_wall_timer_seconds != 0){ - FAIL("items + items_per_dir can only be set without stonewalling"); } } /* check for using mknod */ - if (write_bytes > 0 && make_node) { + if (o.write_bytes > 0 && o.make_node) { FAIL("-k not compatible with -w"); } + + if(o.verify_read && ! 
o.read_only) + FAIL("Verify read requires that the read test is used"); + + if(o.verify_read && o.read_bytes <= 0) + FAIL("Verify read requires that read bytes is > 0"); + + if(o.read_only && o.read_bytes <= 0) + WARN("Read bytes is 0, thus, a read test will actually just open/close"); + + if(o.create_only && o.read_only && o.read_bytes > o.write_bytes) + FAIL("When writing and reading files, read bytes must be smaller than write bytes"); + + if (rank == 0 && o.saveRankDetailsCSV){ + // check that the file is writeable, truncate it and add header + FILE* fd = fopen(o.saveRankDetailsCSV, "w"); + if (fd == NULL){ + FAIL("Cannot open saveRankPerformanceDetails file for write!"); + } + char * head = "rank,items"; + int ret = fwrite(head, strlen(head), 1, fd); + for(int e = 0; e < MDTEST_LAST_NUM; e++){ + char buf[1024]; + const char * str = mdtest_test_name(e); + + sprintf(buf, ",rate-%s,time-%s", str, str); + ret = fwrite(buf, strlen(buf), 1, fd); + if(ret != 1){ + FAIL("Cannot write header to saveRankPerformanceDetails file"); + } + } + fwrite("\n", 1, 1, fd); + fclose(fd); + } } void show_file_system_size(char *file_system) { @@ -1482,7 +1833,7 @@ void show_file_system_size(char *file_system) { VERBOSE(1,-1,"Entering show_file_system_size on %s", file_system ); - ret = backend->statfs (file_system, &stat_buf, backend_options); + ret = o.backend->statfs (file_system, &stat_buf, o.backend_options); if (0 != ret) { FAIL("unable to stat file system %s", file_system); } @@ -1520,39 +1871,6 @@ void show_file_system_size(char *file_system) { return; } -void display_freespace(char *testdirpath) -{ - char dirpath[MAX_PATHLEN] = {0}; - int i; - int directoryFound = 0; - - - VERBOSE(3,5,"Entering display_freespace on %s...", testdirpath ); - - strcpy(dirpath, testdirpath); - - /* get directory for outfile */ - i = strlen(dirpath); - while (i-- > 0) { - if (dirpath[i] == '/') { - dirpath[i] = '\0'; - directoryFound = 1; - break; - } - } - - /* if no directory/, use '.' 
*/ - if (directoryFound == 0) { - strcpy(dirpath, "."); - } - - VERBOSE(3,5,"Before show_file_system_size, dirpath is '%s'", dirpath ); - show_file_system_size(dirpath); - VERBOSE(3,5, "After show_file_system_size, dirpath is '%s'\n", dirpath ); - - return; -} - void create_remove_directory_tree(int create, int currDepth, char* path, int dirNum, rank_progress_t * progress) { @@ -1563,16 +1881,16 @@ void create_remove_directory_tree(int create, VERBOSE(1,5,"Entering create_remove_directory_tree on %s, currDepth = %d...", path, currDepth ); if (currDepth == 0) { - sprintf(dir, "%s/%s.%d/", path, base_tree_name, dirNum); + sprintf(dir, "%s/%s.%d/", path, o.base_tree_name, dirNum); if (create) { VERBOSE(2,5,"Making directory '%s'", dir); - if (-1 == backend->mkdir (dir, DIRMODE, backend_options)) { - fprintf(out_logfile, "error could not create directory '%s'\n", dir); + if (-1 == o.backend->mkdir (dir, DIRMODE, o.backend_options)) { + EWARNF("unable to create tree directory '%s'", dir); } #ifdef HAVE_LUSTRE_LUSTREAPI /* internal node for branching, can be non-striped for children */ - if (global_dir_layout && \ + if (o.global_dir_layout && \ llapi_dir_set_default_lmv_stripe(dir, -1, 0, LMV_HASH_TYPE_FNV_1A_64, NULL) == -1) { @@ -1585,35 +1903,35 @@ void create_remove_directory_tree(int create, if (!create) { VERBOSE(2,5,"Remove directory '%s'", dir); - if (-1 == backend->rmdir(dir, backend_options)) { - FAIL("Unable to remove directory %s", dir); + if (-1 == o.backend->rmdir(dir, o.backend_options)) { + EWARNF("Unable to remove directory %s", dir); } } - } else if (currDepth <= depth) { + } else if (currDepth <= o.depth) { char temp_path[MAX_PATHLEN]; strcpy(temp_path, path); int currDir = dirNum; - for (i=0; imkdir(temp_path, DIRMODE, backend_options)) { - FAIL("Unable to create directory %s", temp_path); + if (-1 == o.backend->mkdir(temp_path, DIRMODE, o.backend_options)) { + EWARNF("Unable to create directory %s", temp_path); } } create_remove_directory_tree(create, ++currDepth, - temp_path, (branch_factor*currDir)+1, progress); + temp_path, (o.branch_factor*currDir)+1, progress); currDepth--; if (!create) { VERBOSE(2,5,"Remove directory '%s'", temp_path); - if (-1 == backend->rmdir(temp_path, backend_options)) { - FAIL("Unable to remove directory %s", temp_path); + if (-1 == o.backend->rmdir(temp_path, o.backend_options)) { + EWARNF("Unable to remove directory %s", temp_path); } } @@ -1623,12 +1941,11 @@ void create_remove_directory_tree(int create, } } -static void mdtest_iteration(int i, int j, MPI_Group testgroup, mdtest_results_t * summary_table){ +static void mdtest_iteration(int i, int j, mdtest_results_t * summary_table){ rank_progress_t progress_o; memset(& progress_o, 0 , sizeof(progress_o)); - progress_o.start_time = GetTimeStamp(); - progress_o.stone_wall_timer_seconds = stone_wall_timer_seconds; - progress_o.items_per_dir = items_per_dir; + progress_o.stone_wall_timer_seconds = 0; + progress_o.items_per_dir = o.items_per_dir; rank_progress_t * progress = & progress_o; /* start and end times of directory tree create/remove */ @@ -1637,255 +1954,229 @@ static void mdtest_iteration(int i, int j, MPI_Group testgroup, mdtest_results_t VERBOSE(1,-1,"main: * iteration %d *", j+1); - for (int dir_iter = 0; dir_iter < directory_loops; dir_iter ++){ - prep_testdir(j, dir_iter); + if(o.create_only){ + for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){ + if (rank >= o.path_count) { + continue; + } + prep_testdir(j, dir_iter); - VERBOSE(2,5,"main (for j loop): making 
testdir, '%s'", testdir ); - if ((rank < path_count) && backend->access(testdir, F_OK, backend_options) != 0) { - if (backend->mkdir(testdir, DIRMODE, backend_options) != 0) { - FAIL("Unable to create test directory %s", testdir); - } + VERBOSE(2,5,"main (for j loop): making o.testdir, '%s'", o.testdir ); + if (o.backend->access(o.testdir, F_OK, o.backend_options) != 0) { + if (o.backend->mkdir(o.testdir, DIRMODE, o.backend_options) != 0) { + EWARNF("Unable to create test directory %s", o.testdir); + } #ifdef HAVE_LUSTRE_LUSTREAPI - /* internal node for branching, can be non-striped for children */ - if (global_dir_layout && unique_dir_per_task && llapi_dir_set_default_lmv_stripe(testdir, -1, 0, LMV_HASH_TYPE_FNV_1A_64, NULL) == -1) { - FAIL("Unable to reset to global default directory layout"); - } + /* internal node for branching, can be non-striped for children */ + if (o.global_dir_layout && o.unique_dir_per_task && llapi_dir_set_default_lmv_stripe(o.testdir, -1, 0, LMV_HASH_TYPE_FNV_1A_64, NULL) == -1) { + EWARNF("Unable to reset to global default directory layout"); + } #endif /* HAVE_LUSTRE_LUSTREAPI */ + } } - } - if (create_only) { - /* create hierarchical directory structure */ - MPI_Barrier(testComm); + /* create hierarchical directory structure */ + MPI_Barrier(testComm); - startCreate = GetTimeStamp(); - for (int dir_iter = 0; dir_iter < directory_loops; dir_iter ++){ - prep_testdir(j, dir_iter); + startCreate = GetTimeStamp(); + for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){ + prep_testdir(j, dir_iter); - if (unique_dir_per_task) { - if (collective_creates && (rank == 0)) { - /* - * This is inside two loops, one of which already uses "i" and the other uses "j". - * I don't know how this ever worked. I'm changing this loop to use "k". - */ - for (k=0; krate[8] = - num_dirs_in_tree / (endCreate - startCreate); - summary_table->time[8] = (endCreate - startCreate); - summary_table->items[8] = num_dirs_in_tree; - summary_table->stonewall_last_item[8] = num_dirs_in_tree; - VERBOSE(1,-1,"V-1: main: Tree creation : %14.3f sec, %14.3f ops/sec", (endCreate - startCreate), summary_table->rate[8]); + } + MPI_Barrier(testComm); + endCreate = GetTimeStamp(); + summary_table->rate[MDTEST_TREE_CREATE_NUM] = o.num_dirs_in_tree / (endCreate - startCreate); + summary_table->time[MDTEST_TREE_CREATE_NUM] = (endCreate - startCreate); + summary_table->items[MDTEST_TREE_CREATE_NUM] = o.num_dirs_in_tree; + summary_table->stonewall_last_item[MDTEST_TREE_CREATE_NUM] = o.num_dirs_in_tree; + VERBOSE(1,-1,"V-1: main: Tree creation : %14.3f sec, %14.3f ops/sec", (endCreate - startCreate), summary_table->rate[MDTEST_TREE_CREATE_NUM]); } - sprintf(unique_mk_dir, "%s.0", base_tree_name); - sprintf(unique_chdir_dir, "%s.0", base_tree_name); - sprintf(unique_stat_dir, "%s.0", base_tree_name); - sprintf(unique_read_dir, "%s.0", base_tree_name); - sprintf(unique_rm_dir, "%s.0", base_tree_name); - unique_rm_uni_dir[0] = 0; - if (!unique_dir_per_task) { - VERBOSE(3,-1,"V-3: main: Using unique_mk_dir, '%s'", unique_mk_dir ); + sprintf(o.unique_mk_dir, "%s.0", o.base_tree_name); + sprintf(o.unique_chdir_dir, "%s.0", o.base_tree_name); + sprintf(o.unique_stat_dir, "%s.0", o.base_tree_name); + sprintf(o.unique_read_dir, "%s.0", o.base_tree_name); + sprintf(o.unique_rm_dir, "%s.0", o.base_tree_name); + o.unique_rm_uni_dir[0] = 0; + + if (! 
o.unique_dir_per_task) { + VERBOSE(3,-1,"V-3: main: Using unique_mk_dir, '%s'", o.unique_mk_dir ); } if (rank < i) { - if (!shared_file) { - sprintf(mk_name, "mdtest.%d.", (rank+(0*nstride))%i); - sprintf(stat_name, "mdtest.%d.", (rank+(1*nstride))%i); - sprintf(read_name, "mdtest.%d.", (rank+(2*nstride))%i); - sprintf(rm_name, "mdtest.%d.", (rank+(3*nstride))%i); + if (! o.shared_file) { + sprintf(o.mk_name, "mdtest.%d.", (rank+(0*o.nstride))%i); + sprintf(o.stat_name, "mdtest.%d.", (rank+(1*o.nstride))%i); + sprintf(o.read_name, "mdtest.%d.", (rank+(2*o.nstride))%i); + sprintf(o.rm_name, "mdtest.%d.", (rank+(3*o.nstride))%i); } - if (unique_dir_per_task) { - VERBOSE(3,5,"i %d nstride %d", i, nstride); - sprintf(unique_mk_dir, "mdtest_tree.%d.0", (rank+(0*nstride))%i); - sprintf(unique_chdir_dir, "mdtest_tree.%d.0", (rank+(1*nstride))%i); - sprintf(unique_stat_dir, "mdtest_tree.%d.0", (rank+(2*nstride))%i); - sprintf(unique_read_dir, "mdtest_tree.%d.0", (rank+(3*nstride))%i); - sprintf(unique_rm_dir, "mdtest_tree.%d.0", (rank+(4*nstride))%i); - unique_rm_uni_dir[0] = 0; - VERBOSE(5,5,"mk_dir %s chdir %s stat_dir %s read_dir %s rm_dir %s\n", unique_mk_dir,unique_chdir_dir,unique_stat_dir,unique_read_dir,unique_rm_dir); + if (o.unique_dir_per_task) { + VERBOSE(3,5,"i %d nstride %d", i, o.nstride); + sprintf(o.unique_mk_dir, "mdtest_tree.%d.0", (rank+(0*o.nstride))%i); + sprintf(o.unique_chdir_dir, "mdtest_tree.%d.0", (rank+(1*o.nstride))%i); + sprintf(o.unique_stat_dir, "mdtest_tree.%d.0", (rank+(2*o.nstride))%i); + sprintf(o.unique_read_dir, "mdtest_tree.%d.0", (rank+(3*o.nstride))%i); + sprintf(o.unique_rm_dir, "mdtest_tree.%d.0", (rank+(4*o.nstride))%i); + o.unique_rm_uni_dir[0] = 0; + VERBOSE(5,5,"mk_dir %s chdir %s stat_dir %s read_dir %s rm_dir %s\n", o.unique_mk_dir, o.unique_chdir_dir, o.unique_stat_dir, o.unique_read_dir, o.unique_rm_dir); } - VERBOSE(3,-1,"V-3: main: Copied unique_mk_dir, '%s', to topdir", unique_mk_dir ); + VERBOSE(3,-1,"V-3: main: Copied unique_mk_dir, '%s', to topdir", o.unique_mk_dir ); - if (dirs_only && !shared_file) { - if (pre_delay) { - DelaySecs(pre_delay); + if (o.dirs_only && ! o.shared_file) { + if (o.pre_delay) { + DelaySecs(o.pre_delay); } - directory_test(j, i, unique_mk_dir, progress); + directory_test(j, i, o.unique_mk_dir, progress); } - if (files_only) { - if (pre_delay) { - DelaySecs(pre_delay); + if (o.files_only) { + if (o.pre_delay) { + DelaySecs(o.pre_delay); } - VERBOSE(3,5,"will file_test on %s", unique_mk_dir); - file_test(j, i, unique_mk_dir, progress); + VERBOSE(3,5,"will file_test on %s", o.unique_mk_dir); + + file_test(j, i, o.unique_mk_dir, progress); } } /* remove directory structure */ - if (!unique_dir_per_task) { - VERBOSE(3,-1,"main: Using testdir, '%s'", testdir ); + if (! o.unique_dir_per_task) { + VERBOSE(3,-1,"main: Using o.testdir, '%s'", o.testdir ); } MPI_Barrier(testComm); - if (remove_only) { + if (o.remove_only) { progress->items_start = 0; startCreate = GetTimeStamp(); - for (int dir_iter = 0; dir_iter < directory_loops; dir_iter ++){ + for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){ prep_testdir(j, dir_iter); - if (unique_dir_per_task) { - if (collective_creates && (rank == 0)) { + if (o.unique_dir_per_task) { + if (o.collective_creates && (rank == 0)) { /* * This is inside two loops, one of which already uses "i" and the other uses "j". * I don't know how this ever worked. I'm changing this loop to use "k". 
*/ - for (k=0; krate[9] = num_dirs_in_tree / (endCreate - startCreate); - summary_table->time[9] = endCreate - startCreate; - summary_table->items[9] = num_dirs_in_tree; - summary_table->stonewall_last_item[8] = num_dirs_in_tree; - VERBOSE(1,-1,"main Tree removal : %14.3f sec, %14.3f ops/sec", (endCreate - startCreate), summary_table->rate[9]); - VERBOSE(2,-1,"main (at end of for j loop): Removing testdir of '%s'\n", testdir ); + summary_table->rate[MDTEST_TREE_REMOVE_NUM] = o.num_dirs_in_tree / (endCreate - startCreate); + summary_table->time[MDTEST_TREE_REMOVE_NUM] = endCreate - startCreate; + summary_table->items[MDTEST_TREE_REMOVE_NUM] = o.num_dirs_in_tree; + summary_table->stonewall_last_item[MDTEST_TREE_REMOVE_NUM] = o.num_dirs_in_tree; + VERBOSE(1,-1,"main Tree removal : %14.3f sec, %14.3f ops/sec", (endCreate - startCreate), summary_table->rate[MDTEST_TREE_REMOVE_NUM]); + VERBOSE(2,-1,"main (at end of for j loop): Removing o.testdir of '%s'\n", o.testdir ); - for (int dir_iter = 0; dir_iter < directory_loops; dir_iter ++){ + for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){ prep_testdir(j, dir_iter); - if ((rank < path_count) && backend->access(testdir, F_OK, backend_options) == 0) { - //if (( rank == 0 ) && access(testdir, F_OK) == 0) { - if (backend->rmdir(testdir, backend_options) == -1) { - FAIL("unable to remove directory %s", testdir); + if ((rank < o.path_count) && o.backend->access(o.testdir, F_OK, o.backend_options) == 0) { + //if (( rank == 0 ) && access(o.testdir, F_OK) == 0) { + if (o.backend->rmdir(o.testdir, o.backend_options) == -1) { + EWARNF("unable to remove directory %s", o.testdir); } } } } else { - summary_table->rate[9] = 0; + summary_table->rate[MDTEST_TREE_REMOVE_NUM] = 0; } } void mdtest_init_args(){ - barriers = 1; - branch_factor = 1; - throttle = 1; - stoneWallingStatusFile = NULL; - create_only = 0; - stat_only = 0; - read_only = 0; - verify_read = 0; - verification_error = 0; - remove_only = 0; - leaf_only = 0; - depth = 0; - num_dirs_in_tree = 0; - items_per_dir = 0; - random_seed = 0; - print_time = 0; - print_rate_and_time = 0; - shared_file = 0; - files_only = 0; - dirs_only = 0; - pre_delay = 0; - unique_dir_per_task = 0; - time_unique_dir_overhead = 0; - items = 0; - num_dirs_in_tree_calc = 0; - collective_creates = 0; - write_bytes = 0; - stone_wall_timer_seconds = 0; - read_bytes = 0; - sync_file = 0; - call_sync = 0; - path_count = 0; - nstride = 0; - make_node = 0; -#ifdef HAVE_LUSTRE_LUSTREAPI - global_dir_layout = 0; -#endif /* HAVE_LUSTRE_LUSTREAPI */ + o = (mdtest_options_t) { + .barriers = 1, + .branch_factor = 1, + .random_buffer_offset = -1, + .prologue = "", + .epilogue = "", + }; } mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE * world_out) { testComm = world_com; out_logfile = world_out; - mpi_comm_world = world_com; + out_resultfile = world_out; - init_clock(); + init_clock(world_com); mdtest_init_args(); int i, j; int numNodes; int numTasksOnNode0 = 0; - MPI_Group worldgroup, testgroup; + MPI_Group worldgroup; struct { int first; int last; @@ -1895,6 +2186,7 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE * int last = 0; int stride = 1; int iterations = 1; + int created_root_dir = 0; // was the root directory existing or newly created verbose = 0; int no_barriers = 0; @@ -1905,77 +2197,94 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE * aiori_supported_apis(APIs, APIs_legacy, MDTEST); char apiStr[1024]; 
sprintf(apiStr, "API for I/O [%s]", APIs); - memset(& hints, 0, sizeof(hints)); + memset(& o.hints, 0, sizeof(o.hints)); + + char * packetType = "t"; option_help options [] = { - {'a', NULL, apiStr, OPTION_OPTIONAL_ARGUMENT, 's', & api}, - {'b', NULL, "branching factor of hierarchical directory structure", OPTION_OPTIONAL_ARGUMENT, 'd', & branch_factor}, + {'a', NULL, apiStr, OPTION_OPTIONAL_ARGUMENT, 's', & o.api}, + {'b', NULL, "branching factor of hierarchical directory structure", OPTION_OPTIONAL_ARGUMENT, 'd', & o.branch_factor}, {'d', NULL, "the directory in which the tests will run", OPTION_OPTIONAL_ARGUMENT, 's', & path}, {'B', NULL, "no barriers between phases", OPTION_OPTIONAL_ARGUMENT, 'd', & no_barriers}, - {'C', NULL, "only create files/dirs", OPTION_FLAG, 'd', & create_only}, - {'T', NULL, "only stat files/dirs", OPTION_FLAG, 'd', & stat_only}, - {'E', NULL, "only read files/dir", OPTION_FLAG, 'd', & read_only}, - {'r', NULL, "only remove files or directories left behind by previous runs", OPTION_FLAG, 'd', & remove_only}, - {'D', NULL, "perform test on directories only (no files)", OPTION_FLAG, 'd', & dirs_only}, - {'e', NULL, "bytes to read from each file", OPTION_OPTIONAL_ARGUMENT, 'l', & read_bytes}, + {'C', NULL, "only create files/dirs", OPTION_FLAG, 'd', & o.create_only}, + {'T', NULL, "only stat files/dirs", OPTION_FLAG, 'd', & o.stat_only}, + {'E', NULL, "only read files/dir", OPTION_FLAG, 'd', & o.read_only}, + {'r', NULL, "only remove files or directories left behind by previous runs", OPTION_FLAG, 'd', & o.remove_only}, + {'D', NULL, "perform test on directories only (no files)", OPTION_FLAG, 'd', & o.dirs_only}, + {'e', NULL, "bytes to read from each file", OPTION_OPTIONAL_ARGUMENT, 'l', & o.read_bytes}, {'f', NULL, "first number of tasks on which the test will run", OPTION_OPTIONAL_ARGUMENT, 'd', & first}, - {'F', NULL, "perform test on files only (no directories)", OPTION_FLAG, 'd', & files_only}, + {'F', NULL, "perform test on files only (no directories)", OPTION_FLAG, 'd', & o.files_only}, #ifdef HAVE_LUSTRE_LUSTREAPI - {'g', NULL, "global default directory layout for test subdirectories (deletes inherited striping layout)", OPTION_FLAG, 'd', & global_dir_layout}, + {'g', NULL, "global default directory layout for test subdirectories (deletes inherited striping layout)", OPTION_FLAG, 'd', & o.global_dir_layout}, #endif /* HAVE_LUSTRE_LUSTREAPI */ + {'G', NULL, "Offset for the data in the read/write buffer, if not set, a random value is used", OPTION_OPTIONAL_ARGUMENT, 'd', & o.random_buffer_offset}, {'i', NULL, "number of iterations the test will run", OPTION_OPTIONAL_ARGUMENT, 'd', & iterations}, - {'I', NULL, "number of items per directory in tree", OPTION_OPTIONAL_ARGUMENT, 'l', & items_per_dir}, - {'k', NULL, "use mknod to create file", OPTION_FLAG, 'd', & make_node}, + {'I', NULL, "number of items per directory in tree", OPTION_OPTIONAL_ARGUMENT, 'l', & o.items_per_dir}, + {'k', NULL, "use mknod to create file", OPTION_FLAG, 'd', & o.make_node}, {'l', NULL, "last number of tasks on which the test will run", OPTION_OPTIONAL_ARGUMENT, 'd', & last}, - {'L', NULL, "files only at leaf level of tree", OPTION_FLAG, 'd', & leaf_only}, - {'n', NULL, "every process will creat/stat/read/remove # directories and files", OPTION_OPTIONAL_ARGUMENT, 'l', & items}, - {'N', NULL, "stride # between tasks for file/dir operation (local=0; set to 1 to avoid client cache)", OPTION_OPTIONAL_ARGUMENT, 'd', & nstride}, - {'p', NULL, "pre-iteration delay (in seconds)", 
OPTION_OPTIONAL_ARGUMENT, 'd', & pre_delay}, - {'P', NULL, "print rate AND time", OPTION_FLAG, 'd', & print_rate_and_time}, + {'L', NULL, "files only at leaf level of tree", OPTION_FLAG, 'd', & o.leaf_only}, + {'n', NULL, "every process will creat/stat/read/remove # directories and files", OPTION_OPTIONAL_ARGUMENT, 'l', & o.items}, + {'N', NULL, "stride # between tasks for file/dir operation (local=0; set to 1 to avoid client cache)", OPTION_OPTIONAL_ARGUMENT, 'd', & o.nstride}, + {'p', NULL, "pre-iteration delay (in seconds)", OPTION_OPTIONAL_ARGUMENT, 'd', & o.pre_delay}, + {'P', NULL, "print rate AND time", OPTION_FLAG, 'd', & o.print_rate_and_time}, + {0, "print-all-procs", "all processes print an excerpt of their results", OPTION_FLAG, 'd', & o.print_all_proc}, {'R', NULL, "random access to files (only for stat)", OPTION_FLAG, 'd', & randomize}, - {0, "random-seed", "random seed for -R", OPTION_OPTIONAL_ARGUMENT, 'd', & random_seed}, + {0, "random-seed", "random seed for -R", OPTION_OPTIONAL_ARGUMENT, 'd', & o.random_seed}, {'s', NULL, "stride between the number of tasks for each test", OPTION_OPTIONAL_ARGUMENT, 'd', & stride}, - {'S', NULL, "shared file access (file only, no directories)", OPTION_FLAG, 'd', & shared_file}, - {'c', NULL, "collective creates: task 0 does all creates", OPTION_FLAG, 'd', & collective_creates}, - {'t', NULL, "time unique working directory overhead", OPTION_FLAG, 'd', & time_unique_dir_overhead}, - {'u', NULL, "unique working directory for each task", OPTION_FLAG, 'd', & unique_dir_per_task}, + {'S', NULL, "shared file access (file only, no directories)", OPTION_FLAG, 'd', & o.shared_file}, + {'c', NULL, "collective creates: task 0 does all creates", OPTION_FLAG, 'd', & o.collective_creates}, + {'t', NULL, "time unique working directory overhead", OPTION_FLAG, 'd', & o.time_unique_dir_overhead}, + {'u', NULL, "unique working directory for each task", OPTION_FLAG, 'd', & o.unique_dir_per_task}, {'v', NULL, "verbosity (each instance of option increments by one)", OPTION_FLAG, 'd', & verbose}, {'V', NULL, "verbosity value", OPTION_OPTIONAL_ARGUMENT, 'd', & verbose}, - {'w', NULL, "bytes to write to each file after it is created", OPTION_OPTIONAL_ARGUMENT, 'l', & write_bytes}, - {'W', NULL, "number in seconds; stonewall timer, write as many seconds and ensure all processes did the same number of operations (currently only stops during create phase)", OPTION_OPTIONAL_ARGUMENT, 'd', & stone_wall_timer_seconds}, - {'x', NULL, "StoneWallingStatusFile; contains the number of iterations of the creation phase, can be used to split phases across runs", OPTION_OPTIONAL_ARGUMENT, 's', & stoneWallingStatusFile}, - {'X', "verify-read", "Verify the data read", OPTION_FLAG, 'd', & verify_read}, - {'y', NULL, "sync file after writing", OPTION_FLAG, 'd', & sync_file}, - {'Y', NULL, "call the sync command after each phase (included in the timing; note it causes all IO to be flushed from your node)", OPTION_FLAG, 'd', & call_sync}, - {'z', NULL, "depth of hierarchical directory structure", OPTION_OPTIONAL_ARGUMENT, 'd', & depth}, - {'Z', NULL, "print time instead of rate", OPTION_FLAG, 'd', & print_time}, + {'w', NULL, "bytes to write to each file after it is created", OPTION_OPTIONAL_ARGUMENT, 'l', & o.write_bytes}, + {'W', NULL, "number in seconds; stonewall timer, write as many seconds and ensure all processes did the same number of operations (currently only stops during create phase and files)", OPTION_OPTIONAL_ARGUMENT, 'd', & o.stone_wall_timer_seconds}, + {'x', NULL, 
"StoneWallingStatusFile; contains the number of iterations of the creation phase, can be used to split phases across runs", OPTION_OPTIONAL_ARGUMENT, 's', & o.stoneWallingStatusFile}, + {'X', "verify-read", "Verify the data read", OPTION_FLAG, 'd', & o.verify_read}, + {0, "verify-write", "Verify the data after a write by reading it back immediately", OPTION_FLAG, 'd', & o.verify_write}, + {'y', NULL, "sync file after writing", OPTION_FLAG, 'd', & o.sync_file}, + {'Y', NULL, "call the sync command after each phase (included in the timing; note it causes all IO to be flushed from your node)", OPTION_FLAG, 'd', & o.call_sync}, + {'z', NULL, "depth of hierarchical directory structure", OPTION_OPTIONAL_ARGUMENT, 'd', & o.depth}, + {'Z', NULL, "print time instead of rate", OPTION_FLAG, 'd', & o.print_time}, + {0, "run-cmd-before-phase", "call this external command before each phase (excluded from the timing)", OPTION_OPTIONAL_ARGUMENT, 's', & o.prologue}, + {0, "run-cmd-after-phase", "call this external command after each phase (included in the timing)", OPTION_OPTIONAL_ARGUMENT, 's', & o.epilogue}, + {0, "dataPacketType", "type of packet that will be created [offset|incompressible|timestamp|o|i|t]", OPTION_OPTIONAL_ARGUMENT, 's', & packetType}, + {0, "allocateBufferOnGPU", "Allocate the buffer on the GPU.", OPTION_FLAG, 'd', & o.gpu_memory_flags}, + {0, "warningAsErrors", "Any warning should lead to an error.", OPTION_FLAG, 'd', & aiori_warning_as_errors}, + {0, "saveRankPerformanceDetails", "Save the individual rank information into this CSV file.", OPTION_OPTIONAL_ARGUMENT, 's', & o.saveRankDetailsCSV}, + {0, "showRankStatistics", "Include statistics per rank", OPTION_FLAG, 'd', & o.show_perrank_statistics}, + LAST_OPTION }; options_all_t * global_options = airoi_create_all_module_options(options); option_parse(argc, argv, global_options); - backend = aiori_select(api); - if (backend == NULL) + o.backend = aiori_select(o.api); + if (o.backend == NULL) ERR("Unrecognized I/O API"); - backend_options = airoi_update_module_options(backend, global_options); + if (! o.backend->enable_mdtest) + ERR("Backend doesn't support MDTest"); + o.backend_options = airoi_update_module_options(o.backend, global_options); free(global_options->modules); free(global_options); + + o.dataPacketType = parsePacketType(packetType[0]); MPI_Comm_rank(testComm, &rank); - MPI_Comm_size(testComm, &size); + MPI_Comm_size(testComm, &o.size); - if (backend->initialize){ - backend->initialize(backend_options); + if(o.backend->xfer_hints){ + o.backend->xfer_hints(& o.hints); } - if(backend->xfer_hints){ - backend->xfer_hints(& hints); + if(o.backend->check_params){ + o.backend->check_params(o.backend_options); } - if(backend->check_params){ - backend->check_params(backend_options); + if (o.backend->initialize){ + o.backend->initialize(o.backend_options); } - pid = getpid(); - uid = getuid(); + o.pid = getpid(); + o.uid = getuid(); numNodes = GetNumNodes(testComm); numTasksOnNode0 = GetNumTasksOnNode0(testComm); @@ -1987,118 +2296,122 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE * } VERBOSE(0,-1,"-- started at %s --\n", PrintTimestamp()); - VERBOSE(0,-1,"mdtest-%s was launched with %d total task(s) on %d node(s)", RELEASE_VERS, size, numNodes); + VERBOSE(0,-1,"mdtest-%s was launched with %d total task(s) on %d node(s)", RELEASE_VERS, o.size, numNodes); VERBOSE(0,-1,"Command line used: %s", cmd_buffer); /* adjust special variables */ - barriers = ! no_barriers; + o.barriers = ! 
no_barriers; if (path != NULL){ parse_dirpath(path); } if( randomize > 0 ){ - if (random_seed == 0) { + if (o.random_seed == 0) { /* Ensure all procs have the same random number */ - random_seed = time(NULL); + o.random_seed = time(NULL); MPI_Barrier(testComm); - MPI_Bcast(&random_seed, 1, MPI_INT, 0, testComm); + MPI_Bcast(& o.random_seed, 1, MPI_INT, 0, testComm); } - random_seed += rank; + o.random_seed += rank; } - if ((items > 0) && (items_per_dir > 0) && (! unique_dir_per_task)) { - directory_loops = items / items_per_dir; + if( o.random_buffer_offset == -1 ){ + o.random_buffer_offset = time(NULL); + MPI_Bcast(& o.random_buffer_offset, 1, MPI_INT, 0, testComm); + } + if ((o.items > 0) && (o.items_per_dir > 0) && (! o.unique_dir_per_task)) { + o.directory_loops = o.items / o.items_per_dir; }else{ - directory_loops = 1; + o.directory_loops = 1; } - valid_tests(); + md_validate_tests(); // option_print_current(options); - VERBOSE(1,-1, "api : %s", api); - VERBOSE(1,-1, "barriers : %s", ( barriers ? "True" : "False" )); - VERBOSE(1,-1, "collective_creates : %s", ( collective_creates ? "True" : "False" )); - VERBOSE(1,-1, "create_only : %s", ( create_only ? "True" : "False" )); + VERBOSE(1,-1, "api : %s", o.api); + VERBOSE(1,-1, "barriers : %s", ( o.barriers ? "True" : "False" )); + VERBOSE(1,-1, "collective_creates : %s", ( o.collective_creates ? "True" : "False" )); + VERBOSE(1,-1, "create_only : %s", ( o.create_only ? "True" : "False" )); VERBOSE(1,-1, "dirpath(s):" ); - for ( i = 0; i < path_count; i++ ) { - VERBOSE(1,-1, "\t%s", filenames[i] ); + for ( i = 0; i < o.path_count; i++ ) { + VERBOSE(1,-1, "\t%s", o.filenames[i] ); } - VERBOSE(1,-1, "dirs_only : %s", ( dirs_only ? "True" : "False" )); - VERBOSE(1,-1, "read_bytes : "LLU"", read_bytes ); - VERBOSE(1,-1, "read_only : %s", ( read_only ? "True" : "False" )); + VERBOSE(1,-1, "dirs_only : %s", ( o.dirs_only ? "True" : "False" )); + VERBOSE(1,-1, "read_bytes : "LLU"", o.read_bytes ); + VERBOSE(1,-1, "read_only : %s", ( o.read_only ? "True" : "False" )); VERBOSE(1,-1, "first : %d", first ); - VERBOSE(1,-1, "files_only : %s", ( files_only ? "True" : "False" )); + VERBOSE(1,-1, "files_only : %s", ( o.files_only ? "True" : "False" )); #ifdef HAVE_LUSTRE_LUSTREAPI - VERBOSE(1,-1, "global_dir_layout : %s", ( global_dir_layout ? "True" : "False" )); + VERBOSE(1,-1, "global_dir_layout : %s", ( o.global_dir_layout ? "True" : "False" )); #endif /* HAVE_LUSTRE_LUSTREAPI */ VERBOSE(1,-1, "iterations : %d", iterations ); - VERBOSE(1,-1, "items_per_dir : "LLU"", items_per_dir ); + VERBOSE(1,-1, "items_per_dir : "LLU"", o.items_per_dir ); VERBOSE(1,-1, "last : %d", last ); - VERBOSE(1,-1, "leaf_only : %s", ( leaf_only ? "True" : "False" )); - VERBOSE(1,-1, "items : "LLU"", items ); - VERBOSE(1,-1, "nstride : %d", nstride ); - VERBOSE(1,-1, "pre_delay : %d", pre_delay ); - VERBOSE(1,-1, "remove_only : %s", ( leaf_only ? "True" : "False" )); - VERBOSE(1,-1, "random_seed : %d", random_seed ); + VERBOSE(1,-1, "leaf_only : %s", ( o.leaf_only ? "True" : "False" )); + VERBOSE(1,-1, "items : "LLU"", o.items ); + VERBOSE(1,-1, "nstride : %d", o.nstride ); + VERBOSE(1,-1, "pre_delay : %d", o.pre_delay ); + VERBOSE(1,-1, "remove_only : %s", ( o.remove_only ? "True" : "False" )); + VERBOSE(1,-1, "random_seed : %d", o.random_seed ); VERBOSE(1,-1, "stride : %d", stride ); - VERBOSE(1,-1, "shared_file : %s", ( shared_file ? "True" : "False" )); - VERBOSE(1,-1, "time_unique_dir_overhead: %s", ( time_unique_dir_overhead ?
"True" : "False" )); - VERBOSE(1,-1, "stone_wall_timer_seconds: %d", stone_wall_timer_seconds); - VERBOSE(1,-1, "stat_only : %s", ( stat_only ? "True" : "False" )); - VERBOSE(1,-1, "unique_dir_per_task : %s", ( unique_dir_per_task ? "True" : "False" )); - VERBOSE(1,-1, "write_bytes : "LLU"", write_bytes ); - VERBOSE(1,-1, "sync_file : %s", ( sync_file ? "True" : "False" )); - VERBOSE(1,-1, "call_sync : %s", ( call_sync ? "True" : "False" )); - VERBOSE(1,-1, "depth : %d", depth ); - VERBOSE(1,-1, "make_node : %d", make_node ); + VERBOSE(1,-1, "shared_file : %s", ( o.shared_file ? "True" : "False" )); + VERBOSE(1,-1, "time_unique_dir_overhead: %s", ( o.time_unique_dir_overhead ? "True" : "False" )); + VERBOSE(1,-1, "stone_wall_timer_seconds: %d", o.stone_wall_timer_seconds); + VERBOSE(1,-1, "stat_only : %s", ( o.stat_only ? "True" : "False" )); + VERBOSE(1,-1, "unique_dir_per_task : %s", ( o.unique_dir_per_task ? "True" : "False" )); + VERBOSE(1,-1, "write_bytes : "LLU"", o.write_bytes ); + VERBOSE(1,-1, "sync_file : %s", ( o.sync_file ? "True" : "False" )); + VERBOSE(1,-1, "call_sync : %s", ( o.call_sync ? "True" : "False" )); + VERBOSE(1,-1, "depth : %d", o.depth ); + VERBOSE(1,-1, "make_node : %d", o.make_node ); /* setup total number of items and number of items per dir */ - if (depth <= 0) { - num_dirs_in_tree = 1; + if (o.depth <= 0) { + o.num_dirs_in_tree = 1; } else { - if (branch_factor < 1) { - num_dirs_in_tree = 1; - } else if (branch_factor == 1) { - num_dirs_in_tree = depth + 1; + if (o.branch_factor < 1) { + o.num_dirs_in_tree = 1; + } else if (o.branch_factor == 1) { + o.num_dirs_in_tree = o.depth + 1; } else { - num_dirs_in_tree = (pow(branch_factor, depth+1) - 1) / (branch_factor - 1); + o.num_dirs_in_tree = (pow(o.branch_factor, o.depth+1) - 1) / (o.branch_factor - 1); } } - if (items_per_dir > 0) { - if(items == 0){ - if (leaf_only) { - items = items_per_dir * (uint64_t) pow(branch_factor, depth); + if (o.items_per_dir > 0) { + if(o.items == 0){ + if (o.leaf_only) { + o.items = o.items_per_dir * (uint64_t) pow(o.branch_factor, o.depth); } else { - items = items_per_dir * num_dirs_in_tree; + o.items = o.items_per_dir * o.num_dirs_in_tree; } }else{ - num_dirs_in_tree_calc = num_dirs_in_tree; + o.num_dirs_in_tree_calc = o.num_dirs_in_tree; } } else { - if (leaf_only) { - if (branch_factor <= 1) { - items_per_dir = items; + if (o.leaf_only) { + if (o.branch_factor <= 1) { + o.items_per_dir = o.items; } else { - items_per_dir = (uint64_t) (items / pow(branch_factor, depth)); - items = items_per_dir * (uint64_t) pow(branch_factor, depth); + o.items_per_dir = (uint64_t) (o.items / pow(o.branch_factor, o.depth)); + o.items = o.items_per_dir * (uint64_t) pow(o.branch_factor, o.depth); } } else { - items_per_dir = items / num_dirs_in_tree; - items = items_per_dir * num_dirs_in_tree; + o.items_per_dir = o.items / o.num_dirs_in_tree; + o.items = o.items_per_dir * o.num_dirs_in_tree; } } /* initialize rand_array */ - if (random_seed > 0) { - srand(random_seed); + if (o.random_seed > 0) { + srand(o.random_seed); uint64_t s; - rand_array = (uint64_t *) malloc( items * sizeof(*rand_array)); + o.rand_array = (uint64_t *) safeMalloc( o.items * sizeof(*o.rand_array)); - for (s=0; s < items; s++) { - rand_array[s] = s; + for (s=0; s < o.items; s++) { + o.rand_array[s] = s; } /* shuffle list randomly */ - uint64_t n = items; + uint64_t n = o.items; while (n>1) { n--; @@ -2117,121 +2430,132 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE * * element, and the kth 
element to the nth element. */ - uint64_t tmp = rand_array[k]; - rand_array[k] = rand_array[n]; - rand_array[n] = tmp; + uint64_t tmp = o.rand_array[k]; + o.rand_array[k] = o.rand_array[n]; + o.rand_array[n] = tmp; } } /* allocate and initialize write buffer with # */ - if (write_bytes > 0) { - int alloc_res = posix_memalign((void**)&write_buffer, sysconf(_SC_PAGESIZE), write_bytes); - if (alloc_res) { - FAIL("out of memory"); - } - generate_memory_pattern(write_buffer, write_bytes); + if (o.write_bytes > 0) { + o.write_buffer = aligned_buffer_alloc(o.write_bytes, o.gpu_memory_flags); + generate_memory_pattern(o.write_buffer, o.write_bytes, o.random_buffer_offset, rank, o.dataPacketType); } /* setup directory path to work in */ - if (path_count == 0) { /* special case where no directory path provided with '-d' option */ - char *ret = getcwd(testdirpath, MAX_PATHLEN); + if (o.path_count == 0) { /* special case where no directory path provided with '-d' option */ + char *ret = getcwd(o.testdirpath, MAX_PATHLEN); if (ret == NULL) { - FAIL("Unable to get current working directory on %s", testdirpath); + FAIL("Unable to get current working directory on %s", o.testdirpath); } - path_count = 1; + o.path_count = 1; } else { - strcpy(testdirpath, filenames[rank%path_count]); + strcpy(o.testdirpath, o.filenames[rank % o.path_count]); } /* if directory does not exist, create it */ - if ((rank < path_count) && backend->access(testdirpath, F_OK, backend_options) != 0) { - if (backend->mkdir(testdirpath, DIRMODE, backend_options) != 0) { - FAIL("Unable to create test directory path %s", testdirpath); + if ((rank < o.path_count) && o.backend->access(o.testdirpath, F_OK, o.backend_options) != 0) { + if (o.backend->mkdir(o.testdirpath, DIRMODE, o.backend_options) != 0) { + EWARNF("Unable to create test directory path %s", o.testdirpath); } + created_root_dir = 1; } /* display disk usage */ - VERBOSE(3,-1,"main (before display_freespace): testdirpath is '%s'", testdirpath ); + VERBOSE(3,-1,"main (before display_freespace): o.testdirpath is '%s'", o.testdirpath ); - if (rank == 0) display_freespace(testdirpath); + if (rank == 0) ShowFileSystemSize(o.testdirpath, o.backend, o.backend_options); int tasksBlockMapping = QueryNodeMapping(testComm, true); /* set the shift to mimic IOR and shift by procs per node */ - if (nstride > 0) { + if (o.nstride > 0) { if ( numNodes > 1 && tasksBlockMapping ) { /* the user set the stride presumably to get the consumer tasks on a different node than the producer tasks however, if the mpirun scheduler placed the tasks by-slot (in a contiguous block) then we need to adjust the shift by ppn */ - nstride *= numTasksOnNode0; + o.nstride *= numTasksOnNode0; } - VERBOSE(0,5,"Shifting ranks by %d for each phase.", nstride); + VERBOSE(0,5,"Shifting ranks by %d for each phase.", o.nstride); } - VERBOSE(3,-1,"main (after display_freespace): testdirpath is '%s'", testdirpath ); + VERBOSE(3,-1,"main (after display_freespace): o.testdirpath is '%s'", o.testdirpath ); if (rank == 0) { - if (random_seed > 0) { - VERBOSE(0,-1,"random seed: %d", random_seed); + if (o.random_seed > 0) { + VERBOSE(0,-1,"random seed: %d", o.random_seed); } } - if (gethostname(hostname, MAX_PATHLEN) == -1) { + if (gethostname(o.hostname, MAX_PATHLEN) == -1) { perror("gethostname"); MPI_Abort(testComm, 2); } if (last == 0) { - first = size; - last = size; + first = o.size; + last = o.size; + } + if(first > last){ + FAIL("process number: first > last doesn't make sense"); + } + if(last > o.size){ + FAIL("process 
number: last > number of processes doesn't make sense"); } /* setup summary table for recording results */ - summary_table = (mdtest_results_t *) malloc(iterations * sizeof(mdtest_results_t)); - memset(summary_table, 0, iterations * sizeof(mdtest_results_t)); - for(int i=0; i < iterations; i++){ - for(int j=0; j < MDTEST_LAST_NUM; j++){ - summary_table[i].rate[j] = 0.0; - summary_table[i].time[j] = 0.0; - } - } + o.summary_table = (mdtest_results_t *) safeMalloc(iterations * sizeof(mdtest_results_t)); + memset(o.summary_table, 0, iterations * sizeof(mdtest_results_t)); - if (summary_table == NULL) { - FAIL("out of memory"); - } - - if (unique_dir_per_task) { - sprintf(base_tree_name, "mdtest_tree.%d", rank); + if (o.unique_dir_per_task) { + sprintf(o.base_tree_name, "mdtest_tree.%d", rank); } else { - sprintf(base_tree_name, "mdtest_tree"); + sprintf(o.base_tree_name, "mdtest_tree"); } + mdtest_results_t * aggregated_results = safeMalloc(iterations * sizeof(mdtest_results_t)); + /* default use shared directory */ - strcpy(mk_name, "mdtest.shared."); - strcpy(stat_name, "mdtest.shared."); - strcpy(read_name, "mdtest.shared."); - strcpy(rm_name, "mdtest.shared."); + strcpy(o.mk_name, "mdtest.shared."); + strcpy(o.stat_name, "mdtest.shared."); + strcpy(o.read_name, "mdtest.shared."); + strcpy(o.rm_name, "mdtest.shared."); MPI_Comm_group(testComm, &worldgroup); + + last = o.size < last ? o.size : last; + + /* Run the tests */ + for (i = first; i <= last; i += stride) { + sleep(1); + + if(i < last){ + MPI_Group testgroup; + range.last = i - 1; + MPI_Group_range_incl(worldgroup, 1, (void *)&range, &testgroup); + MPI_Comm_create(world_com, testgroup, &testComm); + MPI_Group_free(&testgroup); + if(testComm == MPI_COMM_NULL){ + continue; + } + }else{ + MPI_Comm_dup(world_com, & testComm); + } + MPI_Comm_size(testComm, &o.size); - /* Run the tests */ - for (i = first; i <= last && i <= size; i += stride) { - range.last = i - 1; - MPI_Group_range_incl(worldgroup, 1, (void *)&range, &testgroup); - MPI_Comm_create(testComm, testgroup, &testComm); if (rank == 0) { - uint64_t items_all = i * items; - if(num_dirs_in_tree_calc){ - items_all *= num_dirs_in_tree_calc; + uint64_t items_all = i * o.items; + if(o.num_dirs_in_tree_calc){ + items_all *= o.num_dirs_in_tree_calc; } - if (files_only && dirs_only) { + if (o.files_only && o.dirs_only) { VERBOSE(0,-1,"%d tasks, "LLU" files/directories", i, items_all); - } else if (files_only) { - if (!shared_file) { + } else if (o.files_only) { + if (! 
o.shared_file) { VERBOSE(0,-1,"%d tasks, "LLU" files", i, items_all); } else { VERBOSE(0,-1,"%d tasks, 1 file", i); } - } else if (dirs_only) { + } else if (o.dirs_only) { VERBOSE(0,-1,"%d tasks, "LLU" directories", i, items_all); } } @@ -2241,31 +2565,42 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE * for (j = 0; j < iterations; j++) { // keep track of the current status for stonewalling - mdtest_iteration(i, j, testgroup, & summary_table[j]); + mdtest_iteration(i, j, & o.summary_table[j]); } - if (print_rate_and_time){ - summarize_results(iterations, 0); - summarize_results(iterations, 1); - }else{ - summarize_results(iterations, print_time); + summarize_results(iterations, aggregated_results); + if(o.saveRankDetailsCSV){ + StoreRankInformation(iterations, aggregated_results); } - if (i == 1 && stride > 1) { - i = 0; + int total_errors = 0; + MPI_Reduce(& o.verification_error, & total_errors, 1, MPI_INT, MPI_SUM, 0, testComm); + if(rank == 0 && total_errors){ + VERBOSE(0, -1, "\nERROR: verifying the data on read (%lld errors)! Take the performance values with care!\n", total_errors); } + + MPI_Comm_free(&testComm); + } + + MPI_Group_free(&worldgroup); + testComm = world_com; + + if (created_root_dir && o.remove_only && o.backend->rmdir(o.testdirpath, o.backend_options) != 0) { + FAIL("Unable to remove test directory path %s", o.testdirpath); } - if(verification_error){ - VERBOSE(0, -1, "\nERROR: verifying the data read! Take the performance values with care!\n"); - } VERBOSE(0,-1,"-- finished at %s --\n", PrintTimestamp()); - if (random_seed > 0) { - free(rand_array); + if (o.random_seed > 0) { + free(o.rand_array); } - if (backend->finalize){ - backend->finalize(backend_options); + if (o.backend->finalize){ + o.backend->finalize(o.backend_options); } - return summary_table; + if (o.write_bytes > 0) { + aligned_buffer_free(o.write_buffer, o.gpu_memory_flags); + } + free(o.summary_table); + + return aggregated_results; } diff --git a/src/mdtest.h b/src/mdtest.h index 6267282..09f14be 100644 --- a/src/mdtest.h +++ b/src/mdtest.h @@ -8,28 +8,31 @@ typedef enum { MDTEST_DIR_CREATE_NUM = 0, MDTEST_DIR_STAT_NUM = 1, - MDTEST_DIR_READ_NUM = 1, - MDTEST_DIR_REMOVE_NUM = 3, - MDTEST_FILE_CREATE_NUM = 4, - MDTEST_FILE_STAT_NUM = 5, - MDTEST_FILE_READ_NUM = 6, - MDTEST_FILE_REMOVE_NUM = 7, - MDTEST_TREE_CREATE_NUM = 8, - MDTEST_TREE_REMOVE_NUM = 9, + MDTEST_DIR_READ_NUM = 2, + MDTEST_DIR_RENAME_NUM = 3, + MDTEST_DIR_REMOVE_NUM = 4, + MDTEST_FILE_CREATE_NUM = 5, + MDTEST_FILE_STAT_NUM = 6, + MDTEST_FILE_READ_NUM = 7, + MDTEST_FILE_REMOVE_NUM = 8, + MDTEST_TREE_CREATE_NUM = 9, + MDTEST_TREE_REMOVE_NUM = 10, MDTEST_LAST_NUM } mdtest_test_num_t; typedef struct { - double rate[MDTEST_LAST_NUM]; /* Calculated throughput */ + double rate[MDTEST_LAST_NUM]; /* Calculated throughput after the barrier */ + double rate_before_barrier[MDTEST_LAST_NUM]; /* Calculated throughput before the barrier */ double time[MDTEST_LAST_NUM]; /* Time */ - uint64_t items[MDTEST_LAST_NUM]; /* Number of operations done */ + double time_before_barrier[MDTEST_TREE_CREATE_NUM]; /* individual time before executing the barrier */ + uint64_t items[MDTEST_LAST_NUM]; /* Number of operations done in this process*/ /* Statistics when hitting the stonewall */ - double stonewall_time[MDTEST_LAST_NUM]; /* runtime until completion / hit of the stonewall */ - uint64_t stonewall_last_item[MDTEST_LAST_NUM]; /* Max number of items a process has accessed */ - uint64_t stonewall_item_min[MDTEST_LAST_NUM]; /* Min 
number of items a process has accessed */ - uint64_t stonewall_item_sum[MDTEST_LAST_NUM]; /* Total number of items accessed until stonewall */ + double stonewall_time[MDTEST_LAST_NUM]; /* Max runtime of any process until completion / hit of the stonewall */ + uint64_t stonewall_last_item[MDTEST_LAST_NUM]; /* The number of items a process has accessed */ + uint64_t stonewall_item_min[MDTEST_LAST_NUM]; /* Min number of items any process has accessed */ + uint64_t stonewall_item_sum[MDTEST_LAST_NUM]; /* Total number of items accessed by all processes until stonewall */ } mdtest_results_t; mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE * out_logfile); diff --git a/src/option.c b/src/option.c index 618360f..80463d4 100644 --- a/src/option.c +++ b/src/option.c @@ -7,6 +7,23 @@ #include + +/* merge two option lists and return the total size */ +option_help * option_merge(option_help * a, option_help * b){ + int count_a = 0; + for(option_help * i = a; i->type != 0; i++){ + count_a++; + } + int count = count_a + 1; // LAST_OPTION is one + for(option_help * i = b; i->type != 0; i++){ + count++; + } + option_help * h = malloc(sizeof(option_help) * count); + memcpy(h, a, sizeof(option_help) * count_a); + memcpy(h + count_a, b, sizeof(option_help) * (count - count_a)); + return h; +} + /* * Takes a string of the form 64, 8m, 128k, 4g, etc. and converts to bytes. */ @@ -236,8 +253,10 @@ static void option_parse_token(char ** argv, int * flag_parsed_next, int * requi int i = 0; if(arg != NULL){ arg[0] = 0; - arg++; replaced_equal = 1; + + // Check empty value + arg = (arg[1] == 0) ? NULL : arg + 1; } *flag_parsed_next = 0; @@ -247,11 +266,13 @@ static void option_parse_token(char ** argv, int * flag_parsed_next, int * requi return; } txt++; - + int parsed = 0; + + // printf("Parsing: %s : %s\n", txt, arg); // support groups of multiple flags like -vvv or -vq for(int flag_index = 0; flag_index < strlen(txt); ++flag_index){ // don't loop looking for multiple flags if we already processed a long option - if(txt[0] == '-' && flag_index > 0) + if(txt[flag_index] == '=' || (txt[0] == '-' && flag_index > 0)) break; for(int m = 0; m < opt_all->module_count; m++ ){ @@ -264,6 +285,7 @@ static void option_parse_token(char ** argv, int * flag_parsed_next, int * requi continue; } if ( (o->shortVar == txt[flag_index]) || (strlen(txt) > 2 && txt[0] == '-' && o->longVar != NULL && strcmp(txt + 1, o->longVar) == 0)){ + // printf("Found %s %c=%c? %d %d\n", o->help, o->shortVar, txt[flag_index], (o->shortVar == txt[flag_index]), (strlen(txt) > 2 && txt[0] == '-' && o->longVar != NULL && strcmp(txt + 1, o->longVar) == 0)); // now process the option. 
switch(o->arg){ case (OPTION_FLAG):{ @@ -279,7 +301,7 @@ static void option_parse_token(char ** argv, int * flag_parsed_next, int * requi case (OPTION_OPTIONAL_ARGUMENT): case (OPTION_REQUIRED_ARGUMENT):{ // check if next is an argument - if(arg == NULL){ + if(arg == NULL && replaced_equal != 1){ if(o->shortVar == txt[0] && txt[1] != 0){ arg = & txt[1]; }else{ @@ -353,12 +375,13 @@ static void option_parse_token(char ** argv, int * flag_parsed_next, int * requi (*requiredArgsSeen)++; } - return; + parsed = 1; } } } } - + if(parsed) return; + if(strcmp(txt, "h") == 0 || strcmp(txt, "-help") == 0){ *print_help = 1; }else{ diff --git a/src/option.h b/src/option.h index 5ca305f..0afa519 100644 --- a/src/option.h +++ b/src/option.h @@ -43,6 +43,7 @@ void option_print_current(option_help * args); //@return the number of parsed arguments int option_parse(int argc, char ** argv, options_all_t * args); int option_parse_str(char*val, options_all_t * opt_all); +option_help * option_merge(option_help * a, option_help * b); /* Parse a single line */ int option_parse_key_value(char * key, char * value, options_all_t * opt_all); diff --git a/src/parse_options.c b/src/parse_options.c index ce5421c..9168778 100755 --- a/src/parse_options.c +++ b/src/parse_options.c @@ -32,7 +32,7 @@ #include "option.h" #include "aiori.h" -IOR_param_t initialTestParams; +static IOR_param_t initialTestParams; option_help * createGlobalOptions(IOR_param_t * params); @@ -62,7 +62,17 @@ static void CheckRunSettings(IOR_test_t *tests) } if(params->dualMount && !params->filePerProc) { - MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, -1), "Dual Mount can only be used with File Per Process"); + ERR("Dual Mount can only be used with File Per Process"); + } + + if(params->gpuDirect){ + if(params->gpuMemoryFlags == IOR_MEMORY_TYPE_GPU_MANAGED){ + ERR("GPUDirect cannot be used with managed memory"); + } + params->gpuMemoryFlags = IOR_MEMORY_TYPE_GPU_DEVICE_ONLY; + if(params->checkRead || params->checkWrite){ + ERR("GPUDirect data cannot yet be checked"); + } } } } @@ -103,6 +113,21 @@ void DecodeDirective(char *line, IOR_param_t *params, options_all_t * module_opt } printf("Writing output to %s\n", value); } + } else if (strcasecmp(option, "saveRankPerformanceDetailsCSV") == 0){ + if (rank == 0){ + // check that the file is writeable, truncate it and add header + FILE* fd = fopen(value, "w"); + if (fd == NULL){ + FAIL("Cannot open saveRankPerformanceDetailsCSV file for write!"); + } + char buff[] = "access,rank,runtime-with-openclose,runtime,throughput-withopenclose,throughput\n"; + int ret = fwrite(buff, strlen(buff), 1, fd); + if(ret != 1){ + FAIL("Cannot write header to saveRankPerformanceDetailsCSV file"); + } + fclose(fd); + } + params->saveRankDetailsCSV = strdup(value); } else if (strcasecmp(option, "summaryFormat") == 0) { if(strcasecmp(value, "default") == 0){ outputFormat = OUTPUT_DEFAULT; @@ -123,6 +148,12 @@ void DecodeDirective(char *line, IOR_param_t *params, options_all_t * module_opt params->testFileName = strdup(value); } else if (strcasecmp(option, "dualmount") == 0){ params->dualMount = atoi(value); + } else if (strcasecmp(option, "allocateBufferOnGPU") == 0) { + params->gpuMemoryFlags = atoi(value); + } else if (strcasecmp(option, "GPUid") == 0) { + params->gpuID = atoi(value); + } else if (strcasecmp(option, "GPUDirect") == 0) { + params->gpuDirect = atoi(value); } else if (strcasecmp(option, "deadlineforstonewalling") == 0) { params->deadlineForStonewalling = atoi(value); } else if (strcasecmp(option, "stoneWallingWearOut") 
== 0) { @@ -175,8 +206,8 @@ void DecodeDirective(char *line, IOR_param_t *params, options_all_t * module_opt params->keepFileWithError = atoi(value); } else if (strcasecmp(option, "multiFile") == 0) { params->multiFile = atoi(value); - } else if (strcasecmp(option, "quitonerror") == 0) { - params->quitOnError = atoi(value); + } else if (strcasecmp(option, "warningAsErrors") == 0) { + params->warningAsErrors = atoi(value); } else if (strcasecmp(option, "segmentcount") == 0) { params->segmentCount = string_to_bytes(value); } else if (strcasecmp(option, "blocksize") == 0) { @@ -191,8 +222,8 @@ void DecodeDirective(char *line, IOR_param_t *params, options_all_t * module_opt params->verbose = atoi(value); } else if (strcasecmp(option, "settimestampsignature") == 0) { params->setTimeStampSignature = atoi(value); - } else if (strcasecmp(option, "storefileoffset") == 0) { - params->storeFileOffset = atoi(value); + } else if (strcasecmp(option, "dataPacketType") == 0) { + params->dataPacketType = parsePacketType(value[0]); } else if (strcasecmp(option, "uniqueDir") == 0) { params->uniqueDir = atoi(value); } else if (strcasecmp(option, "useexistingtestfile") == 0) { @@ -282,7 +313,7 @@ int contains_only(char *haystack, char *needle) /* check for "needle" */ if (strncasecmp(ptr, needle, strlen(needle)) != 0) return 0; - /* make sure the rest of the line is only whitspace as well */ + /* make sure the rest of the line is only whitespace as well */ for (ptr += strlen(needle); ptr < end; ptr++) { if (!isspace(*ptr)) return 0; @@ -384,7 +415,7 @@ option_help * createGlobalOptions(IOR_param_t * params){ char APIs[1024]; char APIs_legacy[1024]; aiori_supported_apis(APIs, APIs_legacy, IOR); - char apiStr[1024]; + char * apiStr = safeMalloc(1024); sprintf(apiStr, "API for I/O [%s]", APIs); option_help o [] = { @@ -395,9 +426,16 @@ option_help * createGlobalOptions(IOR_param_t * params){ {'C', NULL, "reorderTasks -- changes task ordering for readback (useful to avoid client cache)", OPTION_FLAG, 'd', & params->reorderTasks}, {'d', NULL, "interTestDelay -- delay between reps in seconds", OPTION_OPTIONAL_ARGUMENT, 'd', & params->interTestDelay}, {'D', NULL, "deadlineForStonewalling -- seconds before stopping write or read phase", OPTION_OPTIONAL_ARGUMENT, 'd', & params->deadlineForStonewalling}, - {.help=" -O stoneWallingWearOut=1 -- once the stonewalling timout is over, all process finish to access the amount of data", .arg = OPTION_OPTIONAL_ARGUMENT}, + {.help=" -O stoneWallingWearOut=1 -- once the stonewalling timeout is over, all process finish to access the amount of data", .arg = OPTION_OPTIONAL_ARGUMENT}, {.help=" -O stoneWallingWearOutIterations=N -- stop after processing this number of iterations, needed for reading data back written with stoneWallingWearOut", .arg = OPTION_OPTIONAL_ARGUMENT}, {.help=" -O stoneWallingStatusFile=FILE -- this file keeps the number of iterations from stonewalling during write and allows to use them for read", .arg = OPTION_OPTIONAL_ARGUMENT}, +#ifdef HAVE_CUDA + {.help=" -O allocateBufferOnGPU=X -- allocate I/O buffers on the GPU: X=1 uses managed memory, X=2 device memory.", .arg = OPTION_OPTIONAL_ARGUMENT}, + {.help=" -O GPUid=X -- select the GPU to use.", .arg = OPTION_OPTIONAL_ARGUMENT}, +#ifdef HAVE_GPU_DIRECT + {0, "gpuDirect", "allocate I/O buffers on the GPU and use gpuDirect to store data; this option is incompatible with any option requiring CPU access to data.", OPTION_FLAG, 'd', & params->gpuDirect}, +#endif +#endif {'e', NULL, "fsync -- perform a fsync() 
operation at the end of each read/write phase", OPTION_FLAG, 'd', & params->fsync}, {'E', NULL, "useExistingTestFile -- do not remove test file before write access", OPTION_FLAG, 'd', & params->useExistingTestFile}, {'f', NULL, "scriptFile -- test script name", OPTION_OPTIONAL_ARGUMENT, 's', & params->testscripts}, @@ -412,13 +450,12 @@ option_help * createGlobalOptions(IOR_param_t * params){ {'j', NULL, "outlierThreshold -- warn on outlier N seconds from mean", OPTION_OPTIONAL_ARGUMENT, 'd', & params->outlierThreshold}, {'k', NULL, "keepFile -- don't remove the test file(s) on program exit", OPTION_FLAG, 'd', & params->keepFile}, {'K', NULL, "keepFileWithError -- keep error-filled file(s) after data-checking", OPTION_FLAG, 'd', & params->keepFileWithError}, - {'l', NULL, "datapacket type-- type of packet that will be created [offset|incompressible|timestamp|o|i|t]", OPTION_OPTIONAL_ARGUMENT, 's', & params->buffer_type}, + {'l', "dataPacketType", "datapacket type-- type of packet that will be created [offset|incompressible|timestamp|o|i|t]", OPTION_OPTIONAL_ARGUMENT, 's', & params->buffer_type}, {'m', NULL, "multiFile -- use number of reps (-i) for multiple file count", OPTION_FLAG, 'd', & params->multiFile}, {'M', NULL, "memoryPerNode -- hog memory on the node (e.g.: 2g, 75%)", OPTION_OPTIONAL_ARGUMENT, 's', & params->memoryPerNodeStr}, {'N', NULL, "numTasks -- number of tasks that are participating in the test (overrides MPI)", OPTION_OPTIONAL_ARGUMENT, 'd', & params->numTasks}, {'o', NULL, "testFile -- full name for test", OPTION_OPTIONAL_ARGUMENT, 's', & params->testFileName}, {'O', NULL, "string of IOR directives (e.g. -O checkRead=1,lustreStripeCount=32)", OPTION_OPTIONAL_ARGUMENT, 'p', & decodeDirectiveWrapper}, - {'q', NULL, "quitOnError -- during file error-checking, abort on error", OPTION_FLAG, 'd', & params->quitOnError}, {'Q', NULL, "taskPerNodeOffset for read tests use with -C & -Z options (-C constant N, -Z at least N)", OPTION_OPTIONAL_ARGUMENT, 'd', & params->taskPerNodeOffset}, {'r', NULL, "readFile -- read existing file", OPTION_FLAG, 'd', & params->readFile}, {'R', NULL, "checkRead -- verify that the output of read matches the expected signature (used with -G)", OPTION_FLAG, 'd', & params->checkRead}, @@ -434,9 +471,13 @@ option_help * createGlobalOptions(IOR_param_t * params){ {'y', NULL, "dualMount -- use dual mount points for a filesystem", OPTION_FLAG, 'd', & params->dualMount}, {'Y', NULL, "fsyncPerWrite -- perform sync operation after every write operation", OPTION_FLAG, 'd', & params->fsyncPerWrite}, {'z', NULL, "randomOffset -- access is to random, not sequential, offsets within a file", OPTION_FLAG, 'd', & params->randomOffset}, + {0, "randomPrefill", "For random -z access only: Prefill the file with this blocksize, e.g., 2m", OPTION_OPTIONAL_ARGUMENT, 'l', & params->randomPrefillBlocksize}, + {0, "random-offset-seed", "The seed for -z", OPTION_OPTIONAL_ARGUMENT, 'd', & params->randomSeed}, {'Z', NULL, "reorderTasksRandom -- changes task ordering to random ordering for readback", OPTION_FLAG, 'd', & params->reorderTasksRandom}, + {0, "warningAsErrors", "Any warning should lead to an error.", OPTION_FLAG, 'd', & params->warningAsErrors}, {.help=" -O summaryFile=FILE -- store result data into this file", .arg = OPTION_OPTIONAL_ARGUMENT}, - {.help=" -O summaryFormat=[default,JSON,CSV] -- use the format for outputing the summary", .arg = OPTION_OPTIONAL_ARGUMENT}, + {.help=" -O summaryFormat=[default,JSON,CSV] -- use the format for outputting the summary", .arg = 
OPTION_OPTIONAL_ARGUMENT}, + {.help=" -O saveRankPerformanceDetailsCSV= -- store the performance of each rank into the named CSV file.", .arg = OPTION_OPTIONAL_ARGUMENT}, {0, "dryRun", "do not perform any I/Os just run evtl. inputs print dummy output", OPTION_FLAG, 'd', & params->dryRun}, LAST_OPTION, }; @@ -449,9 +490,9 @@ option_help * createGlobalOptions(IOR_param_t * params){ /* * Parse Commandline. */ -IOR_test_t *ParseCommandLine(int argc, char **argv) +IOR_test_t *ParseCommandLine(int argc, char **argv, MPI_Comm com) { - init_IOR_Param_t(& initialTestParams); + init_IOR_Param_t(& initialTestParams, com); IOR_test_t *tests = NULL; diff --git a/src/parse_options.h b/src/parse_options.h index 45b93ca..b12dd78 100755 --- a/src/parse_options.h +++ b/src/parse_options.h @@ -13,8 +13,6 @@ #include "ior.h" -extern IOR_param_t initialTestParams; - -IOR_test_t *ParseCommandLine(int argc, char **argv); +IOR_test_t *ParseCommandLine(int argc, char **argv, MPI_Comm com); #endif /* !_PARSE_OPTIONS_H */ diff --git a/src/test/example.c b/src/test/example.c index 5bb4b2b..3b31066 100644 --- a/src/test/example.c +++ b/src/test/example.c @@ -1,8 +1,10 @@ #include -#include -#include +#include "../ior.h" +#include "../ior-internal.h" +// Run all tests via: +// make distcheck // build a single test via, e.g., mpicc example.c -I ../src/ ../src/libaiori.a -lm int main(){ @@ -16,16 +18,6 @@ int main(){ // having an individual file test.filePerProc = 1; - IOR_offset_t * offsets; - offsets = GetOffsetArraySequential(& test, 0); - assert(offsets[0] == 0); - assert(offsets[1] == 10); - assert(offsets[2] == 20); - assert(offsets[3] == 30); - assert(offsets[4] == 40); - // for(int i = 0; i < test.segmentCount; i++){ - // printf("%lld\n", (long long int) offsets[i]); - // } printf("OK\n"); return 0; } diff --git a/src/utilities.c b/src/utilities.c index 715e30d..c2ec6c9 100755 --- a/src/utilities.c +++ b/src/utilities.c @@ -16,6 +16,12 @@ # include "config.h" #endif +#ifdef HAVE_GETCPU_SYSCALL +# define _GNU_SOURCE +# include +# include +#endif + #ifdef __linux__ # define _GNU_SOURCE /* Needed for O_DIRECT in fcntl */ #endif /* __linux__ */ @@ -31,6 +37,10 @@ #include #include +#ifdef HAVE_CUDA +#include +#endif + #ifndef _WIN32 # include # ifdef __sun /* SunOS does not support statfs(), instead uses statvfs() */ @@ -59,13 +69,87 @@ int rank = 0; int rankOffset = 0; int verbose = VERBOSE_0; /* verbose output */ MPI_Comm testComm; -MPI_Comm mpi_comm_world; -FILE * out_logfile; -FILE * out_resultfile; +FILE * out_logfile = NULL; +FILE * out_resultfile = NULL; enum OutputFormat_t outputFormat; /***************************** F U N C T I O N S ******************************/ +void update_write_memory_pattern(uint64_t item, char * buf, size_t bytes, int rand_seed, int pretendRank, ior_dataPacketType_e dataPacketType){ + if(dataPacketType == DATA_TIMESTAMP || bytes < 8) return; + int k=1; + uint64_t * buffi = (uint64_t*) buf; + for(size_t i=0; i < bytes/sizeof(uint64_t); i+=512, k++){ + buffi[i] = ((uint32_t) item * k) | ((uint64_t) pretendRank) << 32; + } +} + +void generate_memory_pattern(char * buf, size_t bytes, int rand_seed, int pretendRank, ior_dataPacketType_e dataPacketType){ + uint64_t * buffi = (uint64_t*) buf; + // first half of 64 bits use the rank + const size_t size = bytes / 8; + // the first 8 bytes of each 4k block are updated at runtime + unsigned seed = rand_seed + pretendRank; + for(size_t i=0; i < size; i++){ + switch(dataPacketType){ + case(DATA_INCOMPRESSIBLE):{ + uint64_t hi = ((uint64_t) 
rand_r(& seed) << 32); + uint64_t lo = (uint64_t) rand_r(& seed); + buffi[i] = hi | lo; + break; + }case(DATA_OFFSET):{ + }case(DATA_TIMESTAMP):{ + buffi[i] = ((uint64_t) pretendRank) << 32 | rand_seed + i; + break; + } + } + } + + for(size_t i=size*8; i < bytes; i++){ + buf[i] = (char) i; + } +} + +int verify_memory_pattern(uint64_t item, char * buffer, size_t bytes, int rand_seed, int pretendRank, ior_dataPacketType_e dataPacketType){ + int error = 0; + // always read all data to ensure that performance numbers stay the same + uint64_t * buffi = (uint64_t*) buffer; + + // the first 8 bytes are set to item number + int k=1; + unsigned seed = rand_seed + pretendRank; + const size_t size = bytes / 8; + for(size_t i=0; i < size; i++){ + uint64_t exp; + + switch(dataPacketType){ + case(DATA_INCOMPRESSIBLE):{ + uint64_t hi = ((uint64_t) rand_r(& seed) << 32); + uint64_t lo = (uint64_t) rand_r(& seed); + exp = hi | lo; + break; + }case(DATA_OFFSET):{ + }case(DATA_TIMESTAMP):{ + exp = ((uint64_t) pretendRank) << 32 | rand_seed + i; + break; + } + } + if(i % 512 == 0 && dataPacketType != DATA_TIMESTAMP){ + exp = ((uint32_t) item * k) | ((uint64_t) pretendRank) << 32; + k++; + } + if(buffi[i] != exp){ + error = 1; + } + } + for(size_t i=size*8; i < bytes; i++){ + if(buffer[i] != (char) i){ + error = 1; + } + } + return error; +} + void* safeMalloc(uint64_t size){ void * d = malloc(size); if (d == NULL){ @@ -81,8 +165,8 @@ void FailMessage(int rank, const char *location, char *format, ...) { va_start(args, format); vsnprintf(msg, 4096, format, args); va_end(args); - fprintf(out_logfile, "%s: Process %d: FAILED in %s, %s: %s\n", - PrintTimestamp(), rank, location, msg, strerror(errno)); + fprintf(out_logfile, "%s: Process %d: FAILED in %s, %s\n", + PrintTimestamp(), rank, location, msg); fflush(out_logfile); MPI_Abort(testComm, 1); } @@ -119,28 +203,28 @@ size_t NodeMemoryStringToBytes(char *size_str) return mem / 100 * percent; } +ior_dataPacketType_e parsePacketType(char t){ + switch(t) { + case '\0': return DATA_TIMESTAMP; + case 'i': /* Incompressible */ + return DATA_INCOMPRESSIBLE; + case 't': /* timestamp */ + return DATA_TIMESTAMP; + case 'o': /* offset packet */ + return DATA_OFFSET; + default: + ERRF("Unknown packet type \"%c\"; generic assumed\n", t); + return DATA_OFFSET; + } +} + void updateParsedOptions(IOR_param_t * options, options_all_t * global_options){ if (options->setTimeStampSignature){ options->incompressibleSeed = options->setTimeStampSignature; } if (options->buffer_type && options->buffer_type[0] != 0){ - switch(options->buffer_type[0]) { - case 'i': /* Incompressible */ - options->dataPacketType = incompressible; - break; - case 't': /* timestamp */ - options->dataPacketType = timestamp; - break; - case 'o': /* offset packet */ - options->storeFileOffset = TRUE; - options->dataPacketType = offset; - break; - default: - fprintf(out_logfile, - "Unknown argument for -l %s; generic assumed\n", options->buffer_type); - break; - } + options->dataPacketType = parsePacketType(options->buffer_type[0]); } if (options->memoryPerNodeStr){ options->memoryPerNode = NodeMemoryStringToBytes(options->memoryPerNodeStr); @@ -158,7 +242,7 @@ void updateParsedOptions(IOR_param_t * options, options_all_t * global_options){ /* Used in aiori-POSIX.c and aiori-PLFS.c */ -void set_o_direct_flag(int *fd) +void set_o_direct_flag(int *flag) { /* note that TRU64 needs O_DIRECTIO, SunOS uses directio(), and everyone else needs O_DIRECT */ @@ -171,7 +255,7 @@ void set_o_direct_flag(int *fd) # endif /* 
not O_DIRECTIO */ #endif /* not O_DIRECT */ - *fd |= O_DIRECT; + *flag |= O_DIRECT; } @@ -566,16 +650,14 @@ IOR_offset_t StringToBytes(char *size_str) /* * Displays size of file system and percent of data blocks and inodes used. */ -void ShowFileSystemSize(IOR_param_t * test) // this might be converted to an AIORI call +void ShowFileSystemSize(char * filename, const struct ior_aiori * backend, void * backend_options) // this might be converted to an AIORI call { ior_aiori_statfs_t stat; - if(! test->backend->statfs){ + if(! backend->statfs){ WARN("Backend doesn't implement statfs"); return; } - char filename[MAX_PATHLEN]; - GetTestFileName(filename, test); - int ret = test->backend->statfs(filename, & stat, test->backend_options); + int ret = backend->statfs(filename, & stat, backend_options); if( ret != 0 ){ WARN("Backend returned error during statfs"); return; @@ -648,27 +730,6 @@ int Regex(char *string, char *pattern) return (retValue); } -/* - * Seed random generator. - */ -void SeedRandGen(MPI_Comm testComm) -{ - unsigned int randomSeed; - - if (rank == 0) { -#ifdef _WIN32 - rand_s(&randomSeed); -#else - struct timeval randGenTimer; - gettimeofday(&randGenTimer, (struct timezone *)NULL); - randomSeed = randGenTimer.tv_usec; -#endif - } - MPI_CHECK(MPI_Bcast(&randomSeed, 1, MPI_INT, 0, - testComm), "cannot broadcast random seed value"); - srandom(randomSeed); -} - /* * System info for Windows. */ @@ -691,10 +752,6 @@ int uname(struct utsname *name) } #endif /* _WIN32 */ - -double wall_clock_deviation; -double wall_clock_delta = 0; - /* * Get time stamp. Use MPI_Timer() unless _NO_MPI_TIMER is defined, * in which case use gettimeofday(). @@ -702,55 +759,46 @@ double wall_clock_delta = 0; double GetTimeStamp(void) { double timeVal; -#ifdef _NO_MPI_TIMER struct timeval timer; if (gettimeofday(&timer, (struct timezone *)NULL) != 0) ERR("cannot use gettimeofday()"); timeVal = (double)timer.tv_sec + ((double)timer.tv_usec / 1000000); -#else /* not _NO_MPI_TIMER */ - timeVal = MPI_Wtime(); /* no MPI_CHECK(), just check return value */ - if (timeVal < 0) - ERR("cannot use MPI_Wtime()"); -#endif /* _NO_MPI_TIMER */ - - /* wall_clock_delta is difference from root node's time */ - timeVal -= wall_clock_delta; return (timeVal); } /* * Determine any spread (range) between node times. 
+ * Obsolete */ -static double TimeDeviation(void) +static double TimeDeviation(MPI_Comm com) { double timestamp; double min = 0; double max = 0; double roottimestamp; - MPI_CHECK(MPI_Barrier(mpi_comm_world), "barrier error"); + MPI_CHECK(MPI_Barrier(com), "barrier error"); timestamp = GetTimeStamp(); MPI_CHECK(MPI_Reduce(&timestamp, &min, 1, MPI_DOUBLE, - MPI_MIN, 0, mpi_comm_world), + MPI_MIN, 0, com), "cannot reduce tasks' times"); MPI_CHECK(MPI_Reduce(&timestamp, &max, 1, MPI_DOUBLE, - MPI_MAX, 0, mpi_comm_world), + MPI_MAX, 0, com), "cannot reduce tasks' times"); /* delta between individual nodes' time and root node's time */ roottimestamp = timestamp; - MPI_CHECK(MPI_Bcast(&roottimestamp, 1, MPI_DOUBLE, 0, mpi_comm_world), + MPI_CHECK(MPI_Bcast(&roottimestamp, 1, MPI_DOUBLE, 0, com), "cannot broadcast root's time"); - wall_clock_delta = timestamp - roottimestamp; + // wall_clock_delta = timestamp - roottimestamp; return max - min; } -void init_clock(){ - /* check for skew between tasks' start times */ - wall_clock_deviation = TimeDeviation(); +void init_clock(MPI_Comm com){ + } char * PrintTimestamp() { @@ -768,16 +816,16 @@ char * PrintTimestamp() { return datestring; } -int64_t ReadStoneWallingIterations(char * const filename){ +int64_t ReadStoneWallingIterations(char * const filename, MPI_Comm com){ long long data; if(rank != 0){ - MPI_Bcast( & data, 1, MPI_LONG_LONG_INT, 0, mpi_comm_world); + MPI_Bcast( & data, 1, MPI_LONG_LONG_INT, 0, com); return data; }else{ FILE * out = fopen(filename, "r"); if (out == NULL){ data = -1; - MPI_Bcast( & data, 1, MPI_LONG_LONG_INT, 0, mpi_comm_world); + MPI_Bcast( & data, 1, MPI_LONG_LONG_INT, 0, com); return data; } int ret = fscanf(out, "%lld", & data); @@ -785,7 +833,7 @@ int64_t ReadStoneWallingIterations(char * const filename){ return -1; } fclose(out); - MPI_Bcast( & data, 1, MPI_LONG_LONG_INT, 0, mpi_comm_world); + MPI_Bcast( & data, 1, MPI_LONG_LONG_INT, 0, com); return data; } } @@ -869,17 +917,15 @@ char *HumanReadable(IOR_offset_t value, int base) return valueStr; } -#if defined(__aarch64__) -// TODO: This might be general enough to provide the functionality for any system -// regardless of processor type given we aren't worried about thread/process migration. +#if defined(HAVE_GETCPU_SYSCALL) +// Assume we aren't worried about thread/process migration. // Test on Intel systems and see if we can get rid of the architecture specificity // of the code. unsigned long GetProcessorAndCore(int *chip, int *core){ return syscall(SYS_getcpu, core, chip, NULL); } -// TODO: Add in AMD function -#else -// If we're not on an ARM processor assume we're on an intel processor and use the +#elif defined(HAVE_RDTSCP_ASM) +// We're on an intel processor and use the // rdtscp instruction. unsigned long GetProcessorAndCore(int *chip, int *core){ unsigned long a,d,c; @@ -888,5 +934,81 @@ unsigned long GetProcessorAndCore(int *chip, int *core){ *core = c & 0xFFF; return ((unsigned long)a) | (((unsigned long)d) << 32);; } +#else +// TODO: Add in AMD function +unsigned long GetProcessorAndCore(int *chip, int *core){ +#warning GetProcessorAndCore is implemented as a dummy + *chip = 0; + *core = 0; + return 1; +} #endif + + +/* + * Allocate a page-aligned (required by O_DIRECT) buffer. 
+ */ +void *aligned_buffer_alloc(size_t size, ior_memory_flags type) +{ + size_t pageMask; + char *buf, *tmp; + char *aligned; + + if(type == IOR_MEMORY_TYPE_GPU_MANAGED){ +#ifdef HAVE_CUDA + // use unified memory here to allow drop-in-replacement + if (cudaMallocManaged((void**) & buf, size, cudaMemAttachGlobal) != cudaSuccess){ + ERR("Cannot allocate buffer on GPU"); + } + return buf; +#else + ERR("No CUDA supported, cannot allocate on the GPU"); +#endif + }else if(type == IOR_MEMORY_TYPE_GPU_DEVICE_ONLY){ +#ifdef HAVE_GPU_DIRECT + if (cudaMalloc((void**) & buf, size) != cudaSuccess){ + ERR("Cannot allocate buffer on GPU"); + } + return buf; +#else + ERR("No GPUDirect supported, cannot allocate on the GPU"); +#endif + } + +#ifdef HAVE_SYSCONF + long pageSize = sysconf(_SC_PAGESIZE); +#else + size_t pageSize = getpagesize(); +#endif + + pageMask = pageSize - 1; + buf = safeMalloc(size + pageSize + sizeof(void *)); + /* find the aligned buffer */ + tmp = buf + sizeof(char *); + aligned = tmp + pageSize - ((size_t) tmp & pageMask); + /* write a pointer to the original malloc()ed buffer into the bytes + preceding "aligned", so that the aligned buffer can later be free()ed */ + tmp = aligned - sizeof(void *); + *(void **)tmp = buf; + + return (void *)aligned; +} + +/* + * Free a buffer allocated by aligned_buffer_alloc(). + */ +void aligned_buffer_free(void *buf, ior_memory_flags gpu) +{ + if(gpu){ +#ifdef HAVE_CUDA + if (cudaFree(buf) != cudaSuccess){ + WARN("Cannot free buffer on GPU"); + } + return; +#else + ERR("No CUDA supported, cannot free on the GPU"); +#endif + } + free(*(void **)((char *)buf - sizeof(char *))); +} diff --git a/src/utilities.h b/src/utilities.h index 32292a4..7e9f704 100755 --- a/src/utilities.h +++ b/src/utilities.h @@ -22,8 +22,6 @@ extern int rank; extern int rankOffset; extern int verbose; extern MPI_Comm testComm; -extern MPI_Comm mpi_comm_world; -extern FILE * out_logfile; extern FILE * out_resultfile; extern enum OutputFormat_t outputFormat; /* format of the output */ @@ -31,25 +29,22 @@ extern enum OutputFormat_t outputFormat; /* format of the output */ * Try using the system's PATH_MAX, which is what realpath and such use. */ #define MAX_PATHLEN PATH_MAX - - -#ifdef __linux__ #define ERROR_LOCATION __func__ -#else -#define ERROR_LOCATION __LINE__ -#endif -#define FAIL(...) 
FailMessage(rank, ERROR_LOCATION, __VA_ARGS__) -void FailMessage(int rank, const char *location, char *format, ...); void* safeMalloc(uint64_t size); void set_o_direct_flag(int *fd); +ior_dataPacketType_e parsePacketType(char t); +void update_write_memory_pattern(uint64_t item, char * buf, size_t bytes, int rand_seed, int rank, ior_dataPacketType_e dataPacketType); +void generate_memory_pattern(char * buf, size_t bytes, int rand_seed, int rank, ior_dataPacketType_e dataPacketType); +/* check a data buffer, @return 0 if all is correct, otherwise 1 */ +int verify_memory_pattern(uint64_t item, char * buffer, size_t bytes, int rand_seed, int pretendRank, ior_dataPacketType_e dataPacketType); + char *CurrentTimeString(void); int Regex(char *, char *); -void ShowFileSystemSize(IOR_param_t * test); +void ShowFileSystemSize(char * filename, const struct ior_aiori * backend, void * backend_options); void DumpBuffer(void *, size_t); -void SeedRandGen(MPI_Comm); void SetHints (MPI_Info *, char *); void ShowHints (MPI_Info *); char *HumanReadable(IOR_offset_t value, int base); @@ -62,14 +57,13 @@ void updateParsedOptions(IOR_param_t * options, options_all_t * global_options); size_t NodeMemoryStringToBytes(char *size_str); /* Returns -1, if cannot be read */ -int64_t ReadStoneWallingIterations(char * const filename); +int64_t ReadStoneWallingIterations(char * const filename, MPI_Comm com); void StoreStoneWallingIterations(char * const filename, int64_t count); -void init_clock(void); +void init_clock(MPI_Comm com); double GetTimeStamp(void); char * PrintTimestamp(); // TODO remove this function unsigned long GetProcessorAndCore(int *chip, int *core); - -extern double wall_clock_deviation; -extern double wall_clock_delta; +void *aligned_buffer_alloc(size_t size, ior_memory_flags type); +void aligned_buffer_free(void *buf, ior_memory_flags type); #endif /* !_UTILITIES_H */ diff --git a/testing/basic-tests.sh b/testing/basic-tests.sh index 91dba4b..2f82ced 100755 --- a/testing/basic-tests.sh +++ b/testing/basic-tests.sh @@ -15,18 +15,39 @@ MDTEST 1 -a POSIX MDTEST 2 -a POSIX -W 2 MDTEST 1 -C -T -r -F -I 1 -z 1 -b 1 -L -u MDTEST 1 -C -T -I 1 -z 1 -b 1 -u +MDTEST 2 -n 1 -f 1 -l 2 -IOR 1 -a POSIX -w -z -F -Y -e -i1 -m -t 100k -b 1000k -IOR 1 -a POSIX -w -z -F -k -e -i2 -m -t 100k -b 100k -IOR 1 -a MMAP -r -z -F -k -e -i1 -m -t 100k -b 100k +IOR 1 -a POSIX -w -z -F -Y -e -i1 -m -t 100k -b 2000k +IOR 1 -a POSIX -w -z -F -k -e -i2 -m -t 100k -b 200k +IOR 1 -a MMAP -r -z -F -k -e -i1 -m -t 100k -b 200k -IOR 2 -a POSIX -w -z -C -F -k -e -i1 -m -t 100k -b 100k -IOR 2 -a POSIX -w -z -C -Q 1 -F -k -e -i1 -m -t 100k -b 100k -IOR 2 -a POSIX -r -z -Z -Q 2 -F -k -e -i1 -m -t 100k -b 100k -IOR 2 -a POSIX -r -z -Z -Q 3 -X 13 -F -k -e -i1 -m -t 100k -b 100k -IOR 2 -a POSIX -w -z -Z -Q 1 -X -13 -F -e -i1 -m -t 100k -b 100k +IOR 2 -a POSIX -w -C -k -e -i1 -m -t 100k -b 200k +IOR 2 -a POSIX -w -z -C -F -k -e -i1 -m -t 100k -b 200k +IOR 2 -a POSIX -w -z -C -Q 1 -F -k -e -i1 -m -t 100k -b 200k +IOR 2 -a POSIX -r -z -Z -Q 2 -F -k -e -i1 -m -t 100k -b 200k +IOR 2 -a POSIX -r -z -Z -Q 3 -X 13 -F -k -e -i1 -m -t 100k -b 200k +IOR 3 -a POSIX -w -z -Z -Q 1 -X -13 -F -e -i1 -m -t 100k -b 200k IOR 2 -f "$ROOT/test_comments.ior" +# Test for JSON output +IOR 2 -a DUMMY -e -F -t 1m -b 1m -A 328883 -O summaryFormat=JSON -O summaryFile=OUT.json +python -mjson.tool OUT.json >/dev/null && echo "JSON OK" + +# MDWB +MDWB 3 -a POSIX -O=1 -D=1 -G=10 -P=1 -I=1 -R=2 -X +MDWB 3 -a POSIX -O=1 -D=4 -G=10 -P=4 -I=1 -R=2 -X -t=0.001 -L=latency.txt 
+MDWB 3 -a POSIX -O=1 -D=2 -G=10 -P=4 -I=3 -R=2 -X -W -w 1 +MDWB 3 -a POSIX -O=1 -D=2 -G=10 -P=4 -I=3 -1 -W -w 1 --run-info-file=mdw.tst --print-detailed-stats +MDWB 3 -a POSIX -O=1 -D=2 -G=10 -P=4 -I=3 -2 -W -w 1 --run-info-file=mdw.tst --print-detailed-stats +MDWB 3 -a POSIX -O=1 -D=2 -G=10 -P=4 -I=3 -2 -W -w 1 --read-only --run-info-file=mdw.tst --print-detailed-stats +MDWB 3 -a POSIX -O=1 -D=2 -G=10 -P=4 -I=3 -2 -W -w 1 --read-only --run-info-file=mdw.tst --print-detailed-stats +MDWB 3 -a POSIX -O=1 -D=2 -G=10 -P=4 -I=3 -3 -W -w 1 --run-info-file=mdw.tst --print-detailed-stats + +MDWB 2 -a POSIX -O=1 -D=1 -G=3 -P=2 -I=2 -R=2 -X -S 772 --dataPacketType=t +DELETE=0 +MDWB 2 -a POSIX -D=1 -P=2 -I=2 -R=2 -X -G=2252 -S 772 --dataPacketType=i -1 +MDWB 2 -a POSIX -D=1 -P=2 -I=2 -R=2 -X -G=2252 -S 772 --dataPacketType=i -2 +MDWB 2 -a POSIX -D=1 -P=2 -I=2 -R=2 -X -G=2252 -S 772 --dataPacketType=i -3 END diff --git a/testing/build-hdfs.sh b/testing/build-hdfs.sh new file mode 100755 index 0000000..0165dfb --- /dev/null +++ b/testing/build-hdfs.sh @@ -0,0 +1,18 @@ +#!/bin/bash +mkdir build-hdfs +cd build-hdfs + +VER=hadoop-3.2.1 +if [[ ! -e $VER.tar.gz ]] ; then + wget https://www.apache.org/dyn/closer.cgi/hadoop/common/$VER/$VER.tar.gz + tar -xf $VER.tar.gz +fi + +../configure --with-hdfs CFLAGS="-I$PWD/$VER/include/ -O0 -g3" LDFLAGS="-L$PWD/$VER/lib/native -Wl,-rpath=$PWD/$VER/lib/native" +make -j + + +echo "To run execute:" +echo export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/ +echo export CLASSPATH=$(find $VER/ -name "*.jar" -printf "%p:") +echo ./src/ior -a HDFS diff --git a/testing/complex-tests.sh b/testing/complex-tests.sh index c314cf9..e4692dd 100755 --- a/testing/complex-tests.sh +++ b/testing/complex-tests.sh @@ -10,22 +10,22 @@ TYPE="advanced" source $ROOT/test-lib.sh #stonewalling tests -IOR 2 -a DUMMY -w -O stoneWallingStatusFile=stonewall.log -O stoneWallingWearOut=1 -D 1 -t 1000 -b 1000 -s 15 -IOR 2 -a DUMMY -r -O stoneWallingStatusFile=stonewall.log -D 1 -t 1000 -b 1000 -s 30 # max 15 still! -IOR 2 -a DUMMY -r -O stoneWallingStatusFile=stonewall.log -t 1000 -b 1000 -s 30 +IOR 2 -a DUMMY -w -O stoneWallingStatusFile=stonewall.log -O stoneWallingWearOut=1 -D 1 -t 1000 -b 1000 -s 15 -k +IOR 2 -a DUMMY -r -O stoneWallingStatusFile=stonewall.log -D 1 -t 1000 -b 1000 -s 30 -k # max 15 still! +IOR 2 -a DUMMY -r -O stoneWallingStatusFile=stonewall.log -t 1000 -b 1000 -s 30 -k MDTEST 2 -I 20 -a DUMMY -W 1 -x stonewall-md.log -C MDTEST 2 -I 20 -a DUMMY -x stonewall-md.log -T -v MDTEST 2 -I 20 -a DUMMY -x stonewall-md.log -D -v #shared tests -IOR 2 -a POSIX -w -z -Y -e -i1 -m -t 100k -b 100k -IOR 2 -a POSIX -w -k -e -i1 -m -t 100k -b 100k -IOR 2 -a POSIX -r -z-k -e -i1 -m -t 100k -b 100k +IOR 2 -a POSIX -w -z -Y -e -i1 -m -t 100k -b 200k +IOR 2 -a POSIX -w -k -e -i1 -m -t 100k -b 200k +IOR 2 -a POSIX -r -z-k -e -i1 -m -t 100k -b 200k #test mutually exclusive options -IOR 2 -a POSIX -w -z -k -e -i1 -m -t 100k -b 100k -IOR 2 -a POSIX -w -z -k -e -i1 -m -t 100k -b 100k +IOR 2 -a POSIX -w -z -k -e -i1 -m -t 100k -b 200k +IOR 2 -a POSIX -w -z -k -e -i1 -m -t 100k -b 200k IOR 2 -a POSIX -w -Z -i1 -m -t 100k -b 100k -d 0.1 # Now set the num tasks per node to 1: diff --git a/testing/docker/ceph/NOTES b/testing/docker/ceph/NOTES index 2023922..398e4c4 100644 --- a/testing/docker/ceph/NOTES +++ b/testing/docker/ceph/NOTES @@ -7,7 +7,7 @@ Following are basic notes on how to deploy the 'ceph/demo' docker container. The Run `docker pull ceph/demo` to download the image to your system. 
################################ -# Deploy 'ceph/demo' conatiner # +# Deploy 'ceph/demo' container # ################################ To deploy the Ceph cluster, execute the following command: diff --git a/testing/docker/run-all-tests.sh b/testing/docker/run-all-tests.sh index 172576f..15d576d 100755 --- a/testing/docker/run-all-tests.sh +++ b/testing/docker/run-all-tests.sh @@ -46,7 +46,7 @@ for IMAGE in $(find -type d | cut -b 3- |grep -v "^$") ; do done if [[ $ERROR != 0 ]] ; then - echo "Errors occured!" + echo "Errors occurred!" else echo "OK: all tests passed!" fi diff --git a/testing/mdtest-patterns/advanced/3.txt b/testing/mdtest-patterns/advanced/3.txt index 4c45941..cff653e 100644 --- a/testing/mdtest-patterns/advanced/3.txt +++ b/testing/mdtest-patterns/advanced/3.txt @@ -1,95 +1,92 @@ -V-3: Rank 0 Line 2082 main (before display_freespace): testdirpath is '/dev/shm/mdest' -V-3: Rank 0 Line 1506 Entering display_freespace on /dev/shm/mdest... -V-3: Rank 0 Line 1525 Before show_file_system_size, dirpath is '/dev/shm' -V-3: Rank 0 Line 1527 After show_file_system_size, dirpath is '/dev/shm' -V-3: Rank 0 Line 2097 main (after display_freespace): testdirpath is '/dev/shm/mdest' -V-3: Rank 0 Line 1656 main (create hierarchical directory loop-!unque_dir_per_task): Calling create_remove_directory_tree with '/dev/shm/mdest/test-dir.0-0' -V-3: Rank 0 Line 1683 V-3: main: Using unique_mk_dir, 'mdtest_tree.0' -V-3: Rank 0 Line 1704 V-3: main: Copied unique_mk_dir, 'mdtest_tree.0', to topdir -V-3: Rank 0 Line 801 directory_test: create path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 457 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.0' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.1' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.2' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.3' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.4' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.5' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.6' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.7' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.8' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.9' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.10' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.11' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.12' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): 
curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.13' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.14' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.15' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.16' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.17' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.18' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.19' -V-3: Rank 0 Line 1716 will file_test on mdtest_tree.0 -V-3: Rank 0 Line 990 Entering file_test on mdtest_tree.0 -V-3: Rank 0 Line 1012 file_test: create path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 457 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 326 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.0' -V-3: Rank 0 Line 348 create_remove_items_helper (non-collective, shared): open... -V-3: Rank 0 Line 373 create_remove_items_helper: close... -V-3: Rank 0 Line 326 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.1' -V-3: Rank 0 Line 348 create_remove_items_helper (non-collective, shared): open... -V-3: Rank 0 Line 373 create_remove_items_helper: close... -V-3: Rank 0 Line 326 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.2' -V-3: Rank 0 Line 348 create_remove_items_helper (non-collective, shared): open... -V-3: Rank 0 Line 373 create_remove_items_helper: close... -V-3: Rank 0 Line 326 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.3' -V-3: Rank 0 Line 348 create_remove_items_helper (non-collective, shared): open... -V-3: Rank 0 Line 373 create_remove_items_helper: close... -V-3: Rank 0 Line 326 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.4' -V-3: Rank 0 Line 348 create_remove_items_helper (non-collective, shared): open... -V-3: Rank 0 Line 373 create_remove_items_helper: close... -V-3: Rank 0 Line 326 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.5' -V-3: Rank 0 Line 348 create_remove_items_helper (non-collective, shared): open... -V-3: Rank 0 Line 373 create_remove_items_helper: close... -V-3: Rank 0 Line 326 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.6' -V-3: Rank 0 Line 348 create_remove_items_helper (non-collective, shared): open... -V-3: Rank 0 Line 373 create_remove_items_helper: close... -V-3: Rank 0 Line 326 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.7' -V-3: Rank 0 Line 348 create_remove_items_helper (non-collective, shared): open... -V-3: Rank 0 Line 373 create_remove_items_helper: close... 
-V-3: Rank 0 Line 326 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.8' -V-3: Rank 0 Line 348 create_remove_items_helper (non-collective, shared): open... -V-3: Rank 0 Line 373 create_remove_items_helper: close... -V-3: Rank 0 Line 326 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.9' -V-3: Rank 0 Line 348 create_remove_items_helper (non-collective, shared): open... -V-3: Rank 0 Line 373 create_remove_items_helper: close... -V-3: Rank 0 Line 326 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.10' -V-3: Rank 0 Line 348 create_remove_items_helper (non-collective, shared): open... -V-3: Rank 0 Line 373 create_remove_items_helper: close... -V-3: Rank 0 Line 326 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.11' -V-3: Rank 0 Line 348 create_remove_items_helper (non-collective, shared): open... -V-3: Rank 0 Line 373 create_remove_items_helper: close... -V-3: Rank 0 Line 326 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.12' -V-3: Rank 0 Line 348 create_remove_items_helper (non-collective, shared): open... -V-3: Rank 0 Line 373 create_remove_items_helper: close... -V-3: Rank 0 Line 326 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.13' -V-3: Rank 0 Line 348 create_remove_items_helper (non-collective, shared): open... -V-3: Rank 0 Line 373 create_remove_items_helper: close... -V-3: Rank 0 Line 326 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.14' -V-3: Rank 0 Line 348 create_remove_items_helper (non-collective, shared): open... -V-3: Rank 0 Line 373 create_remove_items_helper: close... -V-3: Rank 0 Line 326 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.15' -V-3: Rank 0 Line 348 create_remove_items_helper (non-collective, shared): open... -V-3: Rank 0 Line 373 create_remove_items_helper: close... -V-3: Rank 0 Line 326 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.16' -V-3: Rank 0 Line 348 create_remove_items_helper (non-collective, shared): open... -V-3: Rank 0 Line 373 create_remove_items_helper: close... -V-3: Rank 0 Line 326 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.17' -V-3: Rank 0 Line 348 create_remove_items_helper (non-collective, shared): open... -V-3: Rank 0 Line 373 create_remove_items_helper: close... -V-3: Rank 0 Line 326 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.18' -V-3: Rank 0 Line 348 create_remove_items_helper (non-collective, shared): open... -V-3: Rank 0 Line 373 create_remove_items_helper: close... -V-3: Rank 0 Line 326 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.19' -V-3: Rank 0 Line 348 create_remove_items_helper (non-collective, shared): open... -V-3: Rank 0 Line 373 create_remove_items_helper: close... 
-V-3: Rank 0 Line 1723 main: Using testdir, '/dev/shm/mdest/test-dir.0-0' +V-3: Rank 0 main (before display_freespace): o.testdirpath is '/dev/shm/mdest' +V-3: Rank 0 main (after display_freespace): o.testdirpath is '/dev/shm/mdest' +V-3: Rank 0 main (create hierarchical directory loop-!unque_dir_per_task): Calling create_remove_directory_tree with '/dev/shm/mdest/test-dir.0-0' +V-3: Rank 0 V-3: main: Using unique_mk_dir, 'mdtest_tree.0' +V-3: Rank 0 V-3: main: Copied unique_mk_dir, 'mdtest_tree.0', to topdir +V-3: Rank 0 directory_test: create path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.0' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.1' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.2' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.3' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.4' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.5' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.6' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.7' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.8' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.9' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.10' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.11' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.12' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.13' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.14' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.15' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.16' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.17' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.18' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.19' +V-3: Rank 0 will file_test on mdtest_tree.0 +V-3: Rank 0 Entering file_test on mdtest_tree.0 +V-3: Rank 0 file_test: create path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 
create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.0' +V-3: Rank 0 create_remove_items_helper (non-collective, shared): open... +V-3: Rank 0 create_remove_items_helper: close... +V-3: Rank 0 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.1' +V-3: Rank 0 create_remove_items_helper (non-collective, shared): open... +V-3: Rank 0 create_remove_items_helper: close... +V-3: Rank 0 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.2' +V-3: Rank 0 create_remove_items_helper (non-collective, shared): open... +V-3: Rank 0 create_remove_items_helper: close... +V-3: Rank 0 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.3' +V-3: Rank 0 create_remove_items_helper (non-collective, shared): open... +V-3: Rank 0 create_remove_items_helper: close... +V-3: Rank 0 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.4' +V-3: Rank 0 create_remove_items_helper (non-collective, shared): open... +V-3: Rank 0 create_remove_items_helper: close... +V-3: Rank 0 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.5' +V-3: Rank 0 create_remove_items_helper (non-collective, shared): open... +V-3: Rank 0 create_remove_items_helper: close... +V-3: Rank 0 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.6' +V-3: Rank 0 create_remove_items_helper (non-collective, shared): open... +V-3: Rank 0 create_remove_items_helper: close... +V-3: Rank 0 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.7' +V-3: Rank 0 create_remove_items_helper (non-collective, shared): open... +V-3: Rank 0 create_remove_items_helper: close... +V-3: Rank 0 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.8' +V-3: Rank 0 create_remove_items_helper (non-collective, shared): open... +V-3: Rank 0 create_remove_items_helper: close... +V-3: Rank 0 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.9' +V-3: Rank 0 create_remove_items_helper (non-collective, shared): open... +V-3: Rank 0 create_remove_items_helper: close... +V-3: Rank 0 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.10' +V-3: Rank 0 create_remove_items_helper (non-collective, shared): open... +V-3: Rank 0 create_remove_items_helper: close... +V-3: Rank 0 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.11' +V-3: Rank 0 create_remove_items_helper (non-collective, shared): open... +V-3: Rank 0 create_remove_items_helper: close... +V-3: Rank 0 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.12' +V-3: Rank 0 create_remove_items_helper (non-collective, shared): open... +V-3: Rank 0 create_remove_items_helper: close... +V-3: Rank 0 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.13' +V-3: Rank 0 create_remove_items_helper (non-collective, shared): open... 
+V-3: Rank 0 create_remove_items_helper: close... +V-3: Rank 0 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.14' +V-3: Rank 0 create_remove_items_helper (non-collective, shared): open... +V-3: Rank 0 create_remove_items_helper: close... +V-3: Rank 0 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.15' +V-3: Rank 0 create_remove_items_helper (non-collective, shared): open... +V-3: Rank 0 create_remove_items_helper: close... +V-3: Rank 0 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.16' +V-3: Rank 0 create_remove_items_helper (non-collective, shared): open... +V-3: Rank 0 create_remove_items_helper: close... +V-3: Rank 0 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.17' +V-3: Rank 0 create_remove_items_helper (non-collective, shared): open... +V-3: Rank 0 create_remove_items_helper: close... +V-3: Rank 0 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.18' +V-3: Rank 0 create_remove_items_helper (non-collective, shared): open... +V-3: Rank 0 create_remove_items_helper: close... +V-3: Rank 0 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.19' +V-3: Rank 0 create_remove_items_helper (non-collective, shared): open... +V-3: Rank 0 create_remove_items_helper: close... +V-3: Rank 0 main: Using o.testdir, '/dev/shm/mdest/test-dir.0-0' diff --git a/testing/mdtest-patterns/advanced/4.txt b/testing/mdtest-patterns/advanced/4.txt index 5d3b7da..62548ae 100644 --- a/testing/mdtest-patterns/advanced/4.txt +++ b/testing/mdtest-patterns/advanced/4.txt @@ -1,52 +1,49 @@ -V-3: Rank 0 Line 2082 main (before display_freespace): testdirpath is '/dev/shm/mdest' -V-3: Rank 0 Line 1506 Entering display_freespace on /dev/shm/mdest... 
-V-3: Rank 0 Line 1525 Before show_file_system_size, dirpath is '/dev/shm' -V-3: Rank 0 Line 1527 After show_file_system_size, dirpath is '/dev/shm' -V-3: Rank 0 Line 2097 main (after display_freespace): testdirpath is '/dev/shm/mdest' -V-3: Rank 0 Line 1683 V-3: main: Using unique_mk_dir, 'mdtest_tree.0' -V-3: Rank 0 Line 1704 V-3: main: Copied unique_mk_dir, 'mdtest_tree.0', to topdir -V-3: Rank 0 Line 833 stat path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.0 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.1 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.2 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.3 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.4 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.5 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.6 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.7 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.8 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.9 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.10 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.11 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.12 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.13 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.14 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.15 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.16 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.17 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.18 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.19 -V-3: Rank 0 Line 1716 will file_test on mdtest_tree.0 -V-3: Rank 0 Line 990 Entering file_test on mdtest_tree.0 -V-3: Rank 0 Line 1079 file_test: stat path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 588 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.0 -V-3: Rank 0 Line 588 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.1 -V-3: Rank 0 Line 588 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.2 -V-3: Rank 0 Line 588 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.3 -V-3: Rank 0 Line 588 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.4 -V-3: Rank 0 Line 588 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.5 -V-3: Rank 0 Line 588 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.6 -V-3: Rank 0 Line 588 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.7 -V-3: Rank 0 Line 588 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.8 -V-3: Rank 0 Line 588 mdtest_stat file: 
/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.9 -V-3: Rank 0 Line 588 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.10 -V-3: Rank 0 Line 588 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.11 -V-3: Rank 0 Line 588 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.12 -V-3: Rank 0 Line 588 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.13 -V-3: Rank 0 Line 588 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.14 -V-3: Rank 0 Line 588 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.15 -V-3: Rank 0 Line 588 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.16 -V-3: Rank 0 Line 588 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.17 -V-3: Rank 0 Line 588 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.18 -V-3: Rank 0 Line 588 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.19 -V-3: Rank 0 Line 1723 main: Using testdir, '/dev/shm/mdest/test-dir.0-0' +V-3: Rank 0 main (before display_freespace): o.testdirpath is '/dev/shm/mdest' +V-3: Rank 0 main (after display_freespace): o.testdirpath is '/dev/shm/mdest' +V-3: Rank 0 V-3: main: Using unique_mk_dir, 'mdtest_tree.0' +V-3: Rank 0 V-3: main: Copied unique_mk_dir, 'mdtest_tree.0', to topdir +V-3: Rank 0 stat path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.0 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.1 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.2 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.3 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.4 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.5 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.6 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.7 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.8 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.9 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.10 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.11 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.12 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.13 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.14 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.15 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.16 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.17 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.18 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.19 +V-3: Rank 0 will file_test on mdtest_tree.0 +V-3: Rank 0 Entering file_test on mdtest_tree.0 +V-3: Rank 0 file_test: stat path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.0 +V-3: Rank 0 mdtest_stat file: 
/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.1 +V-3: Rank 0 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.2 +V-3: Rank 0 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.3 +V-3: Rank 0 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.4 +V-3: Rank 0 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.5 +V-3: Rank 0 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.6 +V-3: Rank 0 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.7 +V-3: Rank 0 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.8 +V-3: Rank 0 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.9 +V-3: Rank 0 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.10 +V-3: Rank 0 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.11 +V-3: Rank 0 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.12 +V-3: Rank 0 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.13 +V-3: Rank 0 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.14 +V-3: Rank 0 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.15 +V-3: Rank 0 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.16 +V-3: Rank 0 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.17 +V-3: Rank 0 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.18 +V-3: Rank 0 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/file.mdtest.0.19 +V-3: Rank 0 main: Using o.testdir, '/dev/shm/mdest/test-dir.0-0' diff --git a/testing/mdtest-patterns/advanced/5.txt b/testing/mdtest-patterns/advanced/5.txt index e87ae0a..7192c35 100644 --- a/testing/mdtest-patterns/advanced/5.txt +++ b/testing/mdtest-patterns/advanced/5.txt @@ -1,77 +1,95 @@ -V-3: Rank 0 Line 2082 main (before display_freespace): testdirpath is '/dev/shm/mdest' -V-3: Rank 0 Line 1506 Entering display_freespace on /dev/shm/mdest... 
-V-3: Rank 0 Line 1525 Before show_file_system_size, dirpath is '/dev/shm' -V-3: Rank 0 Line 1527 After show_file_system_size, dirpath is '/dev/shm' -V-3: Rank 0 Line 2097 main (after display_freespace): testdirpath is '/dev/shm/mdest' -V-3: Rank 0 Line 1656 main (create hierarchical directory loop-!unque_dir_per_task): Calling create_remove_directory_tree with '/dev/shm/mdest/test-dir.0-0' -V-3: Rank 0 Line 1683 V-3: main: Using unique_mk_dir, 'mdtest_tree.0' -V-3: Rank 0 Line 1704 V-3: main: Copied unique_mk_dir, 'mdtest_tree.0', to topdir -V-3: Rank 0 Line 801 directory_test: create path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 457 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.0' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.1' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.2' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.3' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.4' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.5' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.6' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.7' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.8' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.9' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.10' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.11' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.12' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.13' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.14' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.15' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.16' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.17' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.18' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.19' -V-3: Rank 0 Line 833 stat path is 
'/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.0 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.1 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.2 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.3 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.4 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.5 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.6 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.7 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.8 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.9 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.10 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.11 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.12 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.13 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.14 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.15 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.16 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.17 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.18 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.19 -V-3: Rank 0 Line 862 directory_test: read path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 890 directory_test: remove directories path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 457 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.0' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.1' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.2' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.3' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.4' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.5' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.6' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.7' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.8' -V-3: Rank 0 Line 288 
create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.9' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.10' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.11' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.12' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.13' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.14' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.15' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.16' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.17' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.18' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.19' -V-3: Rank 0 Line 915 directory_test: remove unique directories path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 1723 main: Using testdir, '/dev/shm/mdest/test-dir.0-0' -V-3: Rank 0 Line 1764 V-3: main (remove hierarchical directory loop-!unique_dir_per_task): Calling create_remove_directory_tree with '/dev/shm/mdest/test-dir.0-0' +V-3: Rank 0 main (before display_freespace): o.testdirpath is '/dev/shm/mdest' +V-3: Rank 0 main (after display_freespace): o.testdirpath is '/dev/shm/mdest' +V-3: Rank 0 main (create hierarchical directory loop-!unque_dir_per_task): Calling create_remove_directory_tree with '/dev/shm/mdest/test-dir.0-0' +V-3: Rank 0 V-3: main: Using unique_mk_dir, 'mdtest_tree.0' +V-3: Rank 0 V-3: main: Copied unique_mk_dir, 'mdtest_tree.0', to topdir +V-3: Rank 0 directory_test: create path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.0' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.1' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.2' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.3' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.4' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.5' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.6' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.7' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is 
'/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.8' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.9' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.10' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.11' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.12' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.13' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.14' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.15' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.16' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.17' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.18' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.19' +V-3: Rank 0 stat path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.0 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.1 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.2 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.3 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.4 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.5 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.6 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.7 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.8 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.9 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.10 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.11 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.12 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.13 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.14 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.15 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.16 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.17 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.18 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.19 +V-3: Rank 0 directory_test: read path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 rename path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 mdtest_rename dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.0 +V-3: Rank 0 
mdtest_rename dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.1 +V-3: Rank 0 mdtest_rename dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.2 +V-3: Rank 0 mdtest_rename dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.3 +V-3: Rank 0 mdtest_rename dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.4 +V-3: Rank 0 mdtest_rename dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.5 +V-3: Rank 0 mdtest_rename dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.6 +V-3: Rank 0 mdtest_rename dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.7 +V-3: Rank 0 mdtest_rename dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.8 +V-3: Rank 0 mdtest_rename dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.9 +V-3: Rank 0 mdtest_rename dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.10 +V-3: Rank 0 mdtest_rename dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.11 +V-3: Rank 0 mdtest_rename dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.12 +V-3: Rank 0 mdtest_rename dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.13 +V-3: Rank 0 mdtest_rename dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.14 +V-3: Rank 0 mdtest_rename dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.15 +V-3: Rank 0 mdtest_rename dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.16 +V-3: Rank 0 mdtest_rename dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.17 +V-3: Rank 0 mdtest_rename dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.18 +V-3: Rank 0 mdtest_rename dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.19 +V-3: Rank 0 directory_test: remove directories path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.0' +V-3: Rank 0 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.1' +V-3: Rank 0 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.2' +V-3: Rank 0 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.3' +V-3: Rank 0 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.4' +V-3: Rank 0 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.5' +V-3: Rank 0 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.6' +V-3: Rank 0 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.7' +V-3: Rank 0 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.8' +V-3: Rank 0 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.9' +V-3: Rank 0 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.10' +V-3: Rank 0 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.11' +V-3: Rank 0 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.12' +V-3: Rank 0 
create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.13' +V-3: Rank 0 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.14' +V-3: Rank 0 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.15' +V-3: Rank 0 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.16' +V-3: Rank 0 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.17' +V-3: Rank 0 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.18' +V-3: Rank 0 create_remove_items_helper (dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0/dir.mdtest.0.19' +V-3: Rank 0 directory_test: remove unique directories path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 main: Using o.testdir, '/dev/shm/mdest/test-dir.0-0' +V-3: Rank 0 V-3: main (remove hierarchical directory loop-!unique_dir_per_task): Calling create_remove_directory_tree with '/dev/shm/mdest/test-dir.0-0' diff --git a/testing/mdtest-patterns/basic/0.txt b/testing/mdtest-patterns/basic/0.txt index ebe0f14..4c816c5 100644 --- a/testing/mdtest-patterns/basic/0.txt +++ b/testing/mdtest-patterns/basic/0.txt @@ -1,27 +1,25 @@ -V-3: Rank 0 Line 2082 main (before display_freespace): testdirpath is '/dev/shm/mdest' -V-3: Rank 0 Line 1506 Entering display_freespace on /dev/shm/mdest... -V-3: Rank 0 Line 1525 Before show_file_system_size, dirpath is '/dev/shm' -V-3: Rank 0 Line 1527 After show_file_system_size, dirpath is '/dev/shm' -V-3: Rank 0 Line 2097 main (after display_freespace): testdirpath is '/dev/shm/mdest' -V-3: Rank 0 Line 1656 main (create hierarchical directory loop-!unque_dir_per_task): Calling create_remove_directory_tree with '/dev/shm/mdest/test-dir.0-0' -V-3: Rank 0 Line 1683 V-3: main: Using unique_mk_dir, 'mdtest_tree.0' -V-3: Rank 0 Line 1704 V-3: main: Copied unique_mk_dir, 'mdtest_tree.0', to topdir -V-3: Rank 0 Line 801 directory_test: create path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 457 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 833 stat path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 862 directory_test: read path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 890 directory_test: remove directories path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 457 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 915 directory_test: remove unique directories path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 1716 will file_test on mdtest_tree.0 -V-3: Rank 0 Line 990 Entering file_test on mdtest_tree.0 -V-3: Rank 0 Line 1012 file_test: create path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 457 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 1079 file_test: stat path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 1104 file_test: read path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 1134 file_test: rm directories path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 1141 gonna create /dev/shm/mdest/test-dir.0-0/mdtest_tree.0 -V-3: Rank 0 Line 457 
create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 1158 file_test: rm unique directories path is 'mdtest_tree.0' -V-3: Rank 0 Line 1723 main: Using testdir, '/dev/shm/mdest/test-dir.0-0' -V-3: Rank 0 Line 1764 V-3: main (remove hierarchical directory loop-!unique_dir_per_task): Calling create_remove_directory_tree with '/dev/shm/mdest/test-dir.0-0' +V-3: Rank 0 main (before display_freespace): o.testdirpath is '/dev/shm/mdest' +V-3: Rank 0 main (after display_freespace): o.testdirpath is '/dev/shm/mdest' +V-3: Rank 0 main (create hierarchical directory loop-!unque_dir_per_task): Calling create_remove_directory_tree with '/dev/shm/mdest/test-dir.0-0' +V-3: Rank 0 V-3: main: Using unique_mk_dir, 'mdtest_tree.0' +V-3: Rank 0 V-3: main: Copied unique_mk_dir, 'mdtest_tree.0', to topdir +V-3: Rank 0 directory_test: create path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 stat path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 directory_test: read path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 rename path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 directory_test: remove directories path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 directory_test: remove unique directories path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 will file_test on mdtest_tree.0 +V-3: Rank 0 Entering file_test on mdtest_tree.0 +V-3: Rank 0 file_test: create path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 file_test: stat path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 file_test: read path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 file_test: rm directories path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 gonna create /dev/shm/mdest/test-dir.0-0/mdtest_tree.0 +V-3: Rank 0 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 file_test: rm unique directories path is 'mdtest_tree.0' +V-3: Rank 0 main: Using o.testdir, '/dev/shm/mdest/test-dir.0-0' +V-3: Rank 0 V-3: main (remove hierarchical directory loop-!unique_dir_per_task): Calling create_remove_directory_tree with '/dev/shm/mdest/test-dir.0-0' diff --git a/testing/mdtest-patterns/basic/1.txt b/testing/mdtest-patterns/basic/1.txt index ebe0f14..4c816c5 100644 --- a/testing/mdtest-patterns/basic/1.txt +++ b/testing/mdtest-patterns/basic/1.txt @@ -1,27 +1,25 @@ -V-3: Rank 0 Line 2082 main (before display_freespace): testdirpath is '/dev/shm/mdest' -V-3: Rank 0 Line 1506 Entering display_freespace on /dev/shm/mdest... 
-V-3: Rank 0 Line 1525 Before show_file_system_size, dirpath is '/dev/shm' -V-3: Rank 0 Line 1527 After show_file_system_size, dirpath is '/dev/shm' -V-3: Rank 0 Line 2097 main (after display_freespace): testdirpath is '/dev/shm/mdest' -V-3: Rank 0 Line 1656 main (create hierarchical directory loop-!unque_dir_per_task): Calling create_remove_directory_tree with '/dev/shm/mdest/test-dir.0-0' -V-3: Rank 0 Line 1683 V-3: main: Using unique_mk_dir, 'mdtest_tree.0' -V-3: Rank 0 Line 1704 V-3: main: Copied unique_mk_dir, 'mdtest_tree.0', to topdir -V-3: Rank 0 Line 801 directory_test: create path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 457 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 833 stat path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 862 directory_test: read path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 890 directory_test: remove directories path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 457 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 915 directory_test: remove unique directories path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 1716 will file_test on mdtest_tree.0 -V-3: Rank 0 Line 990 Entering file_test on mdtest_tree.0 -V-3: Rank 0 Line 1012 file_test: create path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 457 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 1079 file_test: stat path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 1104 file_test: read path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 1134 file_test: rm directories path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 1141 gonna create /dev/shm/mdest/test-dir.0-0/mdtest_tree.0 -V-3: Rank 0 Line 457 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' -V-3: Rank 0 Line 1158 file_test: rm unique directories path is 'mdtest_tree.0' -V-3: Rank 0 Line 1723 main: Using testdir, '/dev/shm/mdest/test-dir.0-0' -V-3: Rank 0 Line 1764 V-3: main (remove hierarchical directory loop-!unique_dir_per_task): Calling create_remove_directory_tree with '/dev/shm/mdest/test-dir.0-0' +V-3: Rank 0 main (before display_freespace): o.testdirpath is '/dev/shm/mdest' +V-3: Rank 0 main (after display_freespace): o.testdirpath is '/dev/shm/mdest' +V-3: Rank 0 main (create hierarchical directory loop-!unque_dir_per_task): Calling create_remove_directory_tree with '/dev/shm/mdest/test-dir.0-0' +V-3: Rank 0 V-3: main: Using unique_mk_dir, 'mdtest_tree.0' +V-3: Rank 0 V-3: main: Copied unique_mk_dir, 'mdtest_tree.0', to topdir +V-3: Rank 0 directory_test: create path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 stat path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 directory_test: read path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 rename path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 directory_test: remove directories path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 directory_test: remove unique directories path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 will file_test on 
mdtest_tree.0 +V-3: Rank 0 Entering file_test on mdtest_tree.0 +V-3: Rank 0 file_test: create path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 file_test: stat path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 file_test: read path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 file_test: rm directories path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 gonna create /dev/shm/mdest/test-dir.0-0/mdtest_tree.0 +V-3: Rank 0 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0' +V-3: Rank 0 file_test: rm unique directories path is 'mdtest_tree.0' +V-3: Rank 0 main: Using o.testdir, '/dev/shm/mdest/test-dir.0-0' +V-3: Rank 0 V-3: main (remove hierarchical directory loop-!unique_dir_per_task): Calling create_remove_directory_tree with '/dev/shm/mdest/test-dir.0-0' diff --git a/testing/mdtest-patterns/basic/2.txt b/testing/mdtest-patterns/basic/2.txt index 77f5c78..099b265 100644 --- a/testing/mdtest-patterns/basic/2.txt +++ b/testing/mdtest-patterns/basic/2.txt @@ -1,29 +1,26 @@ -V-3: Rank 0 Line 2082 main (before display_freespace): testdirpath is '/dev/shm/mdest' -V-3: Rank 0 Line 1506 Entering display_freespace on /dev/shm/mdest... -V-3: Rank 0 Line 1525 Before show_file_system_size, dirpath is '/dev/shm' -V-3: Rank 0 Line 1527 After show_file_system_size, dirpath is '/dev/shm' -V-3: Rank 0 Line 2097 main (after display_freespace): testdirpath is '/dev/shm/mdest' -V-3: Rank 0 Line 1647 main (create hierarchical directory loop-!collective_creates): Calling create_remove_directory_tree with '/dev/shm/mdest/test-dir.0-0' -V-3: Rank 0 Line 1694 i 1 nstride 0 -V-3: Rank 0 Line 1704 V-3: main: Copied unique_mk_dir, 'mdtest_tree.0.0', to topdir -V-3: Rank 0 Line 1716 will file_test on mdtest_tree.0.0 -V-3: Rank 0 Line 990 Entering file_test on mdtest_tree.0.0 -V-3: Rank 0 Line 1012 file_test: create path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' -V-3: Rank 0 Line 457 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' -V-3: Rank 0 Line 457 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' -V-3: Rank 0 Line 483 create_remove_items (for loop): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1/' -V-3: Rank 0 Line 326 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1//file.mdtest.0.1' -V-3: Rank 0 Line 348 create_remove_items_helper (non-collective, shared): open... -V-3: Rank 0 Line 373 create_remove_items_helper: close... 
-V-3: Rank 0 Line 457 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1/' -V-3: Rank 0 Line 1079 file_test: stat path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' -V-3: Rank 0 Line 588 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1/file.mdtest.0.1 -V-3: Rank 0 Line 1134 file_test: rm directories path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' -V-3: Rank 0 Line 1141 gonna create /dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0 -V-3: Rank 0 Line 457 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' -V-3: Rank 0 Line 457 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' -V-3: Rank 0 Line 483 create_remove_items (for loop): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1/' -V-3: Rank 0 Line 310 create_remove_items_helper (non-dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1//file.mdtest.0.1' -V-3: Rank 0 Line 457 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1/' -V-3: Rank 0 Line 1158 file_test: rm unique directories path is '/dev/shm/mdest/test-dir.0-0/' -V-3: Rank 0 Line 1754 main (remove hierarchical directory loop-!collective): Calling create_remove_directory_tree with '/dev/shm/mdest/test-dir.0-0' +V-3: Rank 0 main (before display_freespace): o.testdirpath is '/dev/shm/mdest' +V-3: Rank 0 main (after display_freespace): o.testdirpath is '/dev/shm/mdest' +V-3: Rank 0 main (create hierarchical directory loop-!collective_creates): Calling create_remove_directory_tree with '/dev/shm/mdest/test-dir.0-0' +V-3: Rank 0 i 1 nstride 0 +V-3: Rank 0 V-3: main: Copied unique_mk_dir, 'mdtest_tree.0.0', to topdir +V-3: Rank 0 will file_test on mdtest_tree.0.0 +V-3: Rank 0 Entering file_test on mdtest_tree.0.0 +V-3: Rank 0 file_test: create path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' +V-3: Rank 0 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' +V-3: Rank 0 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' +V-3: Rank 0 create_remove_items (for loop): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1/' +V-3: Rank 0 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1//file.mdtest.0.1' +V-3: Rank 0 create_remove_items_helper (non-collective, shared): open... +V-3: Rank 0 create_remove_items_helper: close... 
+V-3: Rank 0 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1/' +V-3: Rank 0 file_test: stat path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' +V-3: Rank 0 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1/file.mdtest.0.1 +V-3: Rank 0 file_test: rm directories path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' +V-3: Rank 0 gonna create /dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0 +V-3: Rank 0 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' +V-3: Rank 0 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' +V-3: Rank 0 create_remove_items (for loop): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1/' +V-3: Rank 0 create_remove_items_helper (non-dirs remove): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1//file.mdtest.0.1' +V-3: Rank 0 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1/' +V-3: Rank 0 file_test: rm unique directories path is '/dev/shm/mdest/test-dir.0-0/' +V-3: Rank 0 main (remove hierarchical directory loop-!collective): Calling create_remove_directory_tree with '/dev/shm/mdest/test-dir.0-0' diff --git a/testing/mdtest-patterns/basic/3.txt b/testing/mdtest-patterns/basic/3.txt index eafadc1..cf925f8 100644 --- a/testing/mdtest-patterns/basic/3.txt +++ b/testing/mdtest-patterns/basic/3.txt @@ -1,34 +1,31 @@ -V-3: Rank 0 Line 2082 main (before display_freespace): testdirpath is '/dev/shm/mdest' -V-3: Rank 0 Line 1506 Entering display_freespace on /dev/shm/mdest... -V-3: Rank 0 Line 1525 Before show_file_system_size, dirpath is '/dev/shm' -V-3: Rank 0 Line 1527 After show_file_system_size, dirpath is '/dev/shm' -V-3: Rank 0 Line 2097 main (after display_freespace): testdirpath is '/dev/shm/mdest' -V-3: Rank 0 Line 1647 main (create hierarchical directory loop-!collective_creates): Calling create_remove_directory_tree with '/dev/shm/mdest/test-dir.0-0' -V-3: Rank 0 Line 1694 i 1 nstride 0 -V-3: Rank 0 Line 1704 V-3: main: Copied unique_mk_dir, 'mdtest_tree.0.0', to topdir -V-3: Rank 0 Line 801 directory_test: create path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' -V-3: Rank 0 Line 457 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/dir.mdtest.0.0' -V-3: Rank 0 Line 457 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' -V-3: Rank 0 Line 483 create_remove_items (for loop): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1/' -V-3: Rank 0 Line 288 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1//dir.mdtest.0.1' -V-3: Rank 0 Line 457 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1/' -V-3: Rank 0 Line 833 stat path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/dir.mdtest.0.0 -V-3: Rank 0 Line 588 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1/dir.mdtest.0.1 -V-3: Rank 0 Line 1716 will file_test on mdtest_tree.0.0 -V-3: Rank 0 Line 990 Entering file_test on mdtest_tree.0.0 -V-3: Rank 0 Line 1012 file_test: create path is 
'/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' -V-3: Rank 0 Line 457 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' -V-3: Rank 0 Line 326 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/file.mdtest.0.0' -V-3: Rank 0 Line 348 create_remove_items_helper (non-collective, shared): open... -V-3: Rank 0 Line 373 create_remove_items_helper: close... -V-3: Rank 0 Line 457 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' -V-3: Rank 0 Line 483 create_remove_items (for loop): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1/' -V-3: Rank 0 Line 326 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1//file.mdtest.0.1' -V-3: Rank 0 Line 348 create_remove_items_helper (non-collective, shared): open... -V-3: Rank 0 Line 373 create_remove_items_helper: close... -V-3: Rank 0 Line 457 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1/' -V-3: Rank 0 Line 1079 file_test: stat path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' -V-3: Rank 0 Line 588 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/file.mdtest.0.0 -V-3: Rank 0 Line 588 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1/file.mdtest.0.1 +V-3: Rank 0 main (before display_freespace): o.testdirpath is '/dev/shm/mdest' +V-3: Rank 0 main (after display_freespace): o.testdirpath is '/dev/shm/mdest' +V-3: Rank 0 main (create hierarchical directory loop-!collective_creates): Calling create_remove_directory_tree with '/dev/shm/mdest/test-dir.0-0' +V-3: Rank 0 i 1 nstride 0 +V-3: Rank 0 V-3: main: Copied unique_mk_dir, 'mdtest_tree.0.0', to topdir +V-3: Rank 0 directory_test: create path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' +V-3: Rank 0 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/dir.mdtest.0.0' +V-3: Rank 0 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' +V-3: Rank 0 create_remove_items (for loop): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1/' +V-3: Rank 0 create_remove_items_helper (dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1//dir.mdtest.0.1' +V-3: Rank 0 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1/' +V-3: Rank 0 stat path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/dir.mdtest.0.0 +V-3: Rank 0 mdtest_stat dir: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1/dir.mdtest.0.1 +V-3: Rank 0 will file_test on mdtest_tree.0.0 +V-3: Rank 0 Entering file_test on mdtest_tree.0.0 +V-3: Rank 0 file_test: create path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' +V-3: Rank 0 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' +V-3: Rank 0 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/file.mdtest.0.0' +V-3: Rank 0 create_remove_items_helper (non-collective, shared): open... +V-3: Rank 0 create_remove_items_helper: close... 
+V-3: Rank 0 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' +V-3: Rank 0 create_remove_items (for loop): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1/' +V-3: Rank 0 create_remove_items_helper (non-dirs create): curr_item is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1//file.mdtest.0.1' +V-3: Rank 0 create_remove_items_helper (non-collective, shared): open... +V-3: Rank 0 create_remove_items_helper: close... +V-3: Rank 0 create_remove_items (start): temp_path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1/' +V-3: Rank 0 file_test: stat path is '/dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0' +V-3: Rank 0 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/file.mdtest.0.0 +V-3: Rank 0 mdtest_stat file: /dev/shm/mdest/test-dir.0-0/mdtest_tree.0.0/mdtest_tree.0.1/file.mdtest.0.1 diff --git a/testing/s3.sh b/testing/s3.sh new file mode 100755 index 0000000..b38d339 --- /dev/null +++ b/testing/s3.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# Test basic S3 behavior using minio. + +ROOT="$(dirname ${BASH_SOURCE[0]})" +TYPE="basic" + +if [[ ! -e $ROOT/minio ]] ; then + wget https://dl.min.io/server/minio/release/linux-amd64/minio + mv minio $ROOT + chmod +x $ROOT/minio +fi + +export MINIO_ACCESS_KEY=accesskey +export MINIO_SECRET_KEY=secretkey + +$ROOT/minio --quiet server /dev/shm & + +export IOR_EXTRA="-o test" +export MDTEST_EXTRA="-d test" +source $ROOT/test-lib.sh + +I=100 # Start with this ID +IOR 2 -a S3-libs3 --S3.host=localhost:9000 --S3.secret-key=secretkey --S3.access-key=accesskey -b $((10*1024*1024)) -t $((10*1024*1024)) +MDTEST 2 -a S3-libs3 -L --S3.host=localhost:9000 --S3.secret-key=secretkey --S3.access-key=accesskey -n 10 +MDTEST 2 -a S3-libs3 --S3.host=localhost:9000 --S3.secret-key=secretkey --S3.access-key=accesskey -n 5 -w 1024 -e 1024 + +IOR 1 -a S3-libs3 --S3.host=localhost:9000 --S3.secret-key=secretkey --S3.access-key=accesskey -b $((10*1024)) -t $((10*1024)) --S3.bucket-per-file +MDTEST 1 -a S3-libs3 -L --S3.host=localhost:9000 --S3.secret-key=secretkey --S3.access-key=accesskey --S3.bucket-per-file -n 5 +MDTEST 1 -a S3-libs3 --S3.host=localhost:9000 --S3.secret-key=secretkey --S3.access-key=accesskey --S3.bucket-per-file -n 10 -w 1024 -e 1024 + + +kill -9 %1 diff --git a/testing/test-lib.sh b/testing/test-lib.sh index 444873d..a7e23fb 100644 --- a/testing/test-lib.sh +++ b/testing/test-lib.sh @@ -7,12 +7,17 @@ # Example: export IOR_EXTRA="-v -v -v" IOR_MPIRUN=${IOR_MPIRUN:-mpiexec -np} +if ${IOR_MPIRUN} 1 --oversubscribe true ; then + IOR_MPIRUN="mpiexec --oversubscribe -np" +fi IOR_BIN_DIR=${IOR_BIN_DIR:-./src} -IOR_OUT=${IOR_OUT:-./test_logs} +IOR_OUT=${IOR_OUT:-./test_logs/$TYPE} IOR_TMP=${IOR_TMP:-/dev/shm} IOR_EXTRA=${IOR_EXTRA:-} # Add global options like verbosity MDTEST_EXTRA=${MDTEST_EXTRA:-} MDTEST_TEST_PATTERNS=${MDTEST_TEST_PATTERNS:-../testing/mdtest-patterns/$TYPE} +MDWB_EXTRA=${MDWB_EXTRA:-} + ################################################################################ mkdir -p ${IOR_OUT} @@ -40,7 +45,7 @@ I=0 function IOR(){ RANKS=$1 shift - WHAT="${IOR_MPIRUN} $RANKS ${IOR_BIN_DIR}/ior ${@} ${IOR_EXTRA} -o ${IOR_TMP}/ior" + WHAT="${IOR_MPIRUN} $RANKS ${IOR_BIN_DIR}/ior ${@} -o ${IOR_TMP}/ior ${IOR_EXTRA}" $WHAT 1>"${IOR_OUT}/test_out.$I" 2>&1 if [[ $? 
!= 0 ]]; then echo -n "ERR" @@ -56,15 +61,15 @@ function MDTEST(){ RANKS=$1 shift rm -rf ${IOR_TMP}/mdest - WHAT="${IOR_MPIRUN} $RANKS ${IOR_BIN_DIR}/mdtest ${@} ${MDTEST_EXTRA} -d ${IOR_TMP}/mdest -V=4" + WHAT="${IOR_MPIRUN} $RANKS ${IOR_BIN_DIR}/mdtest ${@} -d ${IOR_TMP}/mdest ${MDTEST_EXTRA} -V=4" $WHAT 1>"${IOR_OUT}/test_out.$I" 2>&1 if [[ $? != 0 ]]; then echo -n "ERR" ERRORS=$(($ERRORS + 1)) else # compare basic pattern + grep "V-3" "${IOR_OUT}/test_out.$I" | sed "s/Line *[0-9]*//" > "${IOR_OUT}/tmp" if [[ -r ${MDTEST_TEST_PATTERNS}/$I.txt ]] ; then - grep "V-3" "${IOR_OUT}/test_out.$I" > "${IOR_OUT}/tmp" cmp -s "${IOR_OUT}/tmp" ${MDTEST_TEST_PATTERNS}/$I.txt if [[ $? != 0 ]]; then mv "${IOR_OUT}/tmp" ${IOR_OUT}/tmp.$I @@ -74,7 +79,7 @@ function MDTEST(){ if [[ ! -e ${MDTEST_TEST_PATTERNS} ]] ; then mkdir -p ${MDTEST_TEST_PATTERNS} fi - grep "V-3" "${IOR_OUT}/test_out.$I" > ${MDTEST_TEST_PATTERNS}/$I.txt + mv "${IOR_OUT}/tmp" ${MDTEST_TEST_PATTERNS}/$I.txt fi echo -n "OK " fi @@ -82,6 +87,25 @@ function MDTEST(){ I=$((${I}+1)) } +function MDWB(){ + RANKS=$1 + shift + if [[ "$DELETE" != "0" ]] ; then + rm -rf "${IOR_TMP}/md-workbench" + fi + WHAT="${IOR_MPIRUN} $RANKS ${IOR_BIN_DIR}/md-workbench ${@} -o ${IOR_TMP}/md-workbench ${MDWB_EXTRA}" + LOG="${IOR_OUT}/test_out.$I" + $WHAT 1>"$LOG" 2>&1 + if [[ $? != 0 ]] || grep '!!!' "$LOG" ; then + echo -n "ERR" + ERRORS=$(($ERRORS + 1)) + else + echo -n "OK " + fi + echo " $WHAT" + I=$((${I}+1)) +} + function END(){ if [[ ${ERRORS} == 0 ]] ; then echo "PASSED" diff --git a/testing/test_comments.ior b/testing/test_comments.ior index eaf7997..1472e8f 100644 --- a/testing/test_comments.ior +++ b/testing/test_comments.ior @@ -2,16 +2,16 @@ IOR START api=posix writeFile =1 - randomOffset=1 + randomOffset=1 reorderTasks=1 - filePerProc=1 + filePerProc=1 keepFile=1 fsync=1 repetitions=1 multiFile=1 # tab-prefixed comment -transferSize=100k -blockSize=100k +transferSize=10k +blockSize=20k # space-prefixed comment run --dummy.delay-create=1000