From 74df77430fbaaaadcd2ac0adb1f997fe9af84a9d Mon Sep 17 00:00:00 2001 From: Julian Kunkel Date: Thu, 18 Feb 2021 10:40:42 +0000 Subject: [PATCH] Feature ior gpu #284 (#323) * Basic support for memory allocation on GPU using CUDA unified memory. Partially addressing #284. IOR support completed. * Support for GPU alloc in MDTest and MD-Workbench * Option: support repeated parsing of same option (allows option sharing across modules). * Checks for gpuDirect * Integrate gpuDirect options and basic hooks, more testing to be done. * POSIX: basic gpuDirect implementation working with fake-gpudirect library. * CUDA allow setting of DeviceID for IOR (not yet MDTest). * CUDA/GPUDirect Support --with-X= * Bugfix in option parser for flags that are part of an argument for an option, e.g., -O=1, if 1 is a flag it is wrongly assumed to be a flag. --- configure.ac | 46 ++++++++++++ src/Makefile.am | 4 + src/aiori-POSIX.c | 179 +++++++++++++++++++++++++++++++++----------- src/aiori-POSIX.h | 2 +- src/ior.c | 57 ++++---------- src/ior.h | 10 ++- src/md-workbench.c | 26 ++++--- src/mdtest.c | 16 ++-- src/option.c | 12 ++- src/parse_options.c | 25 ++++++- src/utilities.c | 77 ++++++++++++++++++- src/utilities.h | 3 +- 12 files changed, 338 insertions(+), 119 deletions(-) diff --git a/configure.ac b/configure.ac index a9d106a..b758f5b 100755 --- a/configure.ac +++ b/configure.ac @@ -73,6 +73,52 @@ AS_IF([test "$ac_cv_header_gpfs_h" = "yes" -o "$ac_cv_header_gpfs_fcntl_h" = "ye ]) ]) +# Check for CUDA +AC_ARG_WITH([cuda], + [AS_HELP_STRING([--with-cuda], + [support configurable CUDA @<:@default=check@:>@])], + [], [with_cuda=check]) + +AS_IF([test "x$with_cuda" != xno], [ + LDFLAGS="$LDFLAGS -L$with_cuda/lib64 -Wl,--enable-new-dtags -Wl,-rpath=$with_cuda/lib64" + CPPFLAGS="$CPPFLAGS -I$with_cuda/include" + + AC_CHECK_HEADERS([cuda_runtime.h], [AC_DEFINE([HAVE_CUDA], [], [CUDA GPU API found])], [ + if test "x$with_cuda" != xcheck; then + AC_MSG_FAILURE([--with-cuda was given, not found]) + fi + ]) +AS_IF([test "$ac_cv_header_cuda_runtime_h" = "yes"], [ + AC_SEARCH_LIBS([cudaMalloc], [cudart cudart_static], [], + [AC_MSG_ERROR([Library containing cudaMalloc symbol not found])]) + ]) +]) +AM_CONDITIONAL([USE_CUDA], [test x$with_cuda = xyes]) + +# Check for GPUDirect +AC_ARG_WITH([gpuDirect], + [AS_HELP_STRING([--with-gpuDirect], + [support configurable GPUDirect @<:@default=check@:>@])], + [], [with_gpuDirect=check]) + +AS_IF([test "x$with_gpuDirect" != xno], [ + LDFLAGS="$LDFLAGS -L$with_gpuDirect/lib64 -Wl,--enable-new-dtags -Wl,-rpath=$with_gpuDirect/lib64" + CPPFLAGS="$CPPFLAGS -I$with_gpuDirect/include" + + AC_CHECK_HEADERS([cufile.h], [AC_DEFINE([HAVE_GPU_DIRECT], [], [GPUDirect API found])], [ + if test "x$with_gpuDirect" != xcheck; then + AC_MSG_FAILURE([--with-gpuDirect was given, not found]) + fi + ]) +AS_IF([test "$ac_cv_header_cufile_h" = "yes"], [ + AC_SEARCH_LIBS([cuFileDriverOpen], [cufile], [], + [AC_MSG_ERROR([Library containing cuFileDriverOpen symbol not found])]) + ]) +]) +AM_CONDITIONAL([HAVE_GPU_DIRECT], [test x$with_gpuDirect = xyes]) + + + # Check for system capabilities AC_SYS_LARGEFILE diff --git a/src/Makefile.am b/src/Makefile.am index 52461a1..fdf746f 100755 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -41,6 +41,10 @@ extraLDFLAGS += -L/opt/hadoop-2.2.0/lib/native extraLDADD += -lhdfs endif +if USE_CUDA +extraLDADD += -lcudart +endif + if USE_HDF5_AIORI extraSOURCES += aiori-HDF5.c extraLDADD += -lhdf5 -lz diff --git a/src/aiori-POSIX.c b/src/aiori-POSIX.c index 5f6261a..5040a53 100755 --- a/src/aiori-POSIX.c +++ b/src/aiori-POSIX.c @@ -57,6 +57,20 @@ #include "aiori-POSIX.h" +#ifdef HAVE_GPU_DIRECT +typedef long long loff_t; +#include +#include +#endif + +typedef struct { + int fd; +#ifdef HAVE_GPU_DIRECT + CUfileHandle_t cf_handle; +#endif +} posix_fd; + + #ifndef open64 /* necessary for TRU64 -- */ # define open64 open /* unlikely, but may pose */ #endif /* not open64 */ /* conflicting prototypes */ @@ -69,7 +83,30 @@ # define O_BINARY 0 #endif +#ifdef HAVE_GPU_DIRECT +static const char* cuFileGetErrorString(CUfileError_t status){ + if(IS_CUDA_ERR(status)){ + return cudaGetErrorString(status.err); + } + return strerror(status.err); +} + +static void init_cufile(posix_fd * pfd){ + CUfileDescr_t cf_descr = (CUfileDescr_t){ + .handle.fd = pfd->fd, + .type = CU_FILE_HANDLE_TYPE_OPAQUE_FD + }; + CUfileError_t status = cuFileHandleRegister(& pfd->cf_handle, & cf_descr); + if(status.err != CU_FILE_SUCCESS){ + EWARNF("Could not register handle %s", cuFileGetErrorString(status)); + } +} +#endif + /**************************** P R O T O T Y P E S *****************************/ +static void POSIX_Initialize(aiori_mod_opt_t * options); +static void POSIX_Finalize(aiori_mod_opt_t * options); + static IOR_offset_t POSIX_Xfer(int, aiori_fd_t *, IOR_size_t *, IOR_offset_t, IOR_offset_t, aiori_mod_opt_t *); @@ -105,6 +142,9 @@ option_help * POSIX_options(aiori_mod_opt_t ** init_backend_options, aiori_mod_o {0, "posix.lustre.stripesize", "", OPTION_OPTIONAL_ARGUMENT, 'd', & o->lustre_stripe_size}, {0, "posix.lustre.startost", "", OPTION_OPTIONAL_ARGUMENT, 'd', & o->lustre_start_ost}, {0, "posix.lustre.ignorelocks", "", OPTION_FLAG, 'd', & o->lustre_ignore_locks}, +#endif +#ifdef HAVE_GPU_DIRECT + {0, "gpuDirect", "allocate I/O buffers on the GPU", OPTION_FLAG, 'd', & o->gpuDirect}, #endif LAST_OPTION }; @@ -120,6 +160,8 @@ option_help * POSIX_options(aiori_mod_opt_t ** init_backend_options, aiori_mod_o ior_aiori_t posix_aiori = { .name = "POSIX", .name_legacy = NULL, + .initialize = POSIX_Initialize, + .finalize = POSIX_Finalize, .create = POSIX_Create, .mknod = POSIX_Mknod, .open = POSIX_Open, @@ -156,6 +198,14 @@ int POSIX_check_params(aiori_mod_opt_t * param){ ERR("beegfsChunkSize must be a power of two and >64k"); if(o->lustre_stripe_count != -1 || o->lustre_stripe_size != 0) o->lustre_set_striping = 1; + if(o->gpuDirect && ! o->direct_io){ + ERR("GPUDirect required direct I/O to be used!"); + } +#ifndef HAVE_GPU_DIRECT + if(o->gpuDirect){ + ERR("GPUDirect support is not compiled"); + } +#endif return 0; } @@ -352,14 +402,10 @@ aiori_fd_t *POSIX_Create(char *testFileName, int flags, aiori_mod_opt_t * param) { int fd_oflag = O_BINARY; int mode = 0664; - int *fd; - - fd = (int *)malloc(sizeof(int)); - if (fd == NULL) - ERR("Unable to malloc file descriptor"); + posix_fd * pfd = safeMalloc(sizeof(posix_fd)); posix_options_t * o = (posix_options_t*) param; if (o->direct_io == TRUE){ - set_o_direct_flag(&fd_oflag); + set_o_direct_flag(& fd_oflag); } if(hints->dryRun) @@ -378,8 +424,8 @@ aiori_fd_t *POSIX_Create(char *testFileName, int flags, aiori_mod_opt_t * param) if (!hints->filePerProc && rank != 0) { MPI_CHECK(MPI_Barrier(testComm), "barrier error"); fd_oflag |= O_RDWR; - *fd = open64(testFileName, fd_oflag, mode); - if (*fd < 0){ + pfd->fd = open64(testFileName, fd_oflag, mode); + if (pfd->fd < 0){ ERRF("open64(\"%s\", %d, %#o) failed. Error: %s", testFileName, fd_oflag, mode, strerror(errno)); } @@ -396,16 +442,16 @@ aiori_fd_t *POSIX_Create(char *testFileName, int flags, aiori_mod_opt_t * param) * Lustre striping information on a pre-existing file.*/ fd_oflag |= O_CREAT | O_EXCL | O_RDWR | O_LOV_DELAY_CREATE; - *fd = open64(testFileName, fd_oflag, mode); - if (*fd < 0) { + pfd->fd = open64(testFileName, fd_oflag, mode); + if (pfd->fd < 0) { ERRF("Unable to open '%s': %s\n", testFileName, strerror(errno)); - } else if (ioctl(*fd, LL_IOC_LOV_SETSTRIPE, &opts)) { + } else if (ioctl(pfd->fd, LL_IOC_LOV_SETSTRIPE, &opts)) { char *errmsg = "stripe already set"; if (errno != EEXIST && errno != EALREADY) errmsg = strerror(errno); ERRF("Error on ioctl for '%s' (%d): %s\n", - testFileName, *fd, errmsg); + testFileName, pfd->fd, errmsg); } if (!hints->filePerProc) MPI_CHECK(MPI_Barrier(testComm), @@ -431,8 +477,8 @@ aiori_fd_t *POSIX_Create(char *testFileName, int flags, aiori_mod_opt_t * param) } #endif /* HAVE_BEEGFS_BEEGFS_H */ - *fd = open64(testFileName, fd_oflag, mode); - if (*fd < 0){ + pfd->fd = open64(testFileName, fd_oflag, mode); + if (pfd->fd < 0){ ERRF("open64(\"%s\", %d, %#o) failed. Error: %s", testFileName, fd_oflag, mode, strerror(errno)); } @@ -442,8 +488,8 @@ aiori_fd_t *POSIX_Create(char *testFileName, int flags, aiori_mod_opt_t * param) if (o->lustre_ignore_locks) { int lustre_ioctl_flags = LL_FILE_IGNORE_LOCK; - if (ioctl(*fd, LL_IOC_SETFLAGS, &lustre_ioctl_flags) == -1) - ERRF("ioctl(%d, LL_IOC_SETFLAGS, ...) failed", *fd); + if (ioctl(pfd->fd, LL_IOC_SETFLAGS, &lustre_ioctl_flags) == -1) + ERRF("ioctl(%d, LL_IOC_SETFLAGS, ...) failed", pfd->fd); } #endif /* HAVE_LUSTRE_USER */ @@ -452,10 +498,15 @@ aiori_fd_t *POSIX_Create(char *testFileName, int flags, aiori_mod_opt_t * param) * the intent that we can avoid some byte range lock revocation: * everyone will be writing/reading from individual regions */ if (o->gpfs_release_token ) { - gpfs_free_all_locks(*fd); + gpfs_free_all_locks(pfd->fd); } #endif - return (aiori_fd_t*) fd; +#ifdef HAVE_GPU_DIRECT + if(o->gpuDirect){ + init_cufile(pfd); + } +#endif + return (aiori_fd_t*) pfd; } /* @@ -477,24 +528,18 @@ int POSIX_Mknod(char *testFileName) */ aiori_fd_t *POSIX_Open(char *testFileName, int flags, aiori_mod_opt_t * param) { - int fd_oflag = O_BINARY; - int *fd; - - fd = (int *)malloc(sizeof(int)); - if (fd == NULL) - ERR("Unable to malloc file descriptor"); - + int fd_oflag = O_BINARY | O_RDWR; + posix_fd * pfd = safeMalloc(sizeof(posix_fd)); posix_options_t * o = (posix_options_t*) param; - if (o->direct_io == TRUE) + if (o->direct_io == TRUE){ set_o_direct_flag(&fd_oflag); - - fd_oflag |= O_RDWR; + } if(hints->dryRun) return (aiori_fd_t*) 0; - *fd = open64(testFileName, fd_oflag); - if (*fd < 0) + pfd->fd = open64(testFileName, fd_oflag); + if (pfd->fd < 0) ERRF("open64(\"%s\", %d) failed: %s", testFileName, fd_oflag, strerror(errno)); #ifdef HAVE_LUSTRE_USER @@ -503,17 +548,22 @@ aiori_fd_t *POSIX_Open(char *testFileName, int flags, aiori_mod_opt_t * param) if (verbose >= VERBOSE_1) { EINFO("** Disabling lustre range locking **\n"); } - if (ioctl(*fd, LL_IOC_SETFLAGS, &lustre_ioctl_flags) == -1) - ERRF("ioctl(%d, LL_IOC_SETFLAGS, ...) failed", *fd); + if (ioctl(pfd->fd, LL_IOC_SETFLAGS, &lustre_ioctl_flags) == -1) + ERRF("ioctl(%d, LL_IOC_SETFLAGS, ...) failed", pfd->fd); } #endif /* HAVE_LUSTRE_USER */ #ifdef HAVE_GPFS_FCNTL_H if(o->gpfs_release_token) { - gpfs_free_all_locks(*fd); + gpfs_free_all_locks(pfd->fd); } #endif - return (aiori_fd_t*) fd; +#ifdef HAVE_GPU_DIRECT + if(o->gpuDirect){ + init_cufile(pfd); + } +#endif + return (aiori_fd_t*) pfd; } /* @@ -532,7 +582,8 @@ static IOR_offset_t POSIX_Xfer(int access, aiori_fd_t *file, IOR_size_t * buffer if(hints->dryRun) return length; - fd = *(int *)file; + posix_fd * pfd = (posix_fd *) file; + fd = pfd->fd; #ifdef HAVE_GPFS_FCNTL_H if (o->gpfs_hint_access) { @@ -544,7 +595,7 @@ static IOR_offset_t POSIX_Xfer(int access, aiori_fd_t *file, IOR_size_t * buffer /* seek to offset */ if (lseek64(fd, offset, SEEK_SET) == -1) ERRF("lseek64(%d, %lld, SEEK_SET) failed", fd, offset); - + off_t mem_offset = 0; while (remaining > 0) { /* write/read file */ if (access == WRITE) { /* WRITE */ @@ -553,7 +604,15 @@ static IOR_offset_t POSIX_Xfer(int access, aiori_fd_t *file, IOR_size_t * buffer rank, offset + length - remaining); } - rc = write(fd, ptr, remaining); +#ifdef HAVE_GPU_DIRECT + if(o->gpuDirect){ + rc = cuFileWrite(pfd->cf_handle, ptr, remaining, offset + mem_offset, mem_offset); + }else{ +#endif + rc = write(fd, ptr, remaining); +#ifdef HAVE_GPU_DIRECT + } +#endif if (rc == -1) ERRF("write(%d, %p, %lld) failed", fd, (void*)ptr, remaining); @@ -566,7 +625,15 @@ static IOR_offset_t POSIX_Xfer(int access, aiori_fd_t *file, IOR_size_t * buffer rank, offset + length - remaining); } - rc = read(fd, ptr, remaining); +#ifdef HAVE_GPU_DIRECT + if(o->gpuDirect){ + rc = cuFileRead(pfd->cf_handle, ptr, remaining, offset + mem_offset, mem_offset); + }else{ +#endif + rc = read(fd, ptr, remaining); +#ifdef HAVE_GPU_DIRECT + } +#endif if (rc == 0) ERRF("read(%d, %p, %lld) returned EOF prematurely", fd, (void*)ptr, remaining); @@ -587,6 +654,7 @@ static IOR_offset_t POSIX_Xfer(int access, aiori_fd_t *file, IOR_size_t * buffer assert(rc <= remaining); remaining -= rc; ptr += rc; + mem_offset += rc; xferRetries++; } #ifdef HAVE_GPFS_FCNTL_H @@ -597,10 +665,11 @@ static IOR_offset_t POSIX_Xfer(int access, aiori_fd_t *file, IOR_size_t * buffer return (length); } -void POSIX_Fsync(aiori_fd_t *fd, aiori_mod_opt_t * param) +void POSIX_Fsync(aiori_fd_t *afd, aiori_mod_opt_t * param) { - if (fsync(*(int *)fd) != 0) - EWARNF("fsync(%d) failed", *(int *)fd); + int fd = ((posix_fd*) afd)->fd; + if (fsync(fd) != 0) + EWARNF("fsync(%d) failed", fd); } @@ -616,13 +685,21 @@ void POSIX_Sync(aiori_mod_opt_t * param) /* * Close a file through the POSIX interface. */ -void POSIX_Close(aiori_fd_t *fd, aiori_mod_opt_t * param) +void POSIX_Close(aiori_fd_t *afd, aiori_mod_opt_t * param) { if(hints->dryRun) return; - if (close(*(int *)fd) != 0) - ERRF("close(%d) failed", *(int *)fd); - free(fd); + posix_options_t * o = (posix_options_t*) param; + int fd = ((posix_fd*) afd)->fd; +#ifdef HAVE_GPU_DIRECT + if(o->gpuDirect){ + cuFileHandleDeregister(((posix_fd*) afd)->cf_handle); + } +#endif + if (close(fd) != 0){ + ERRF("close(%d) failed", fd); + } + free(afd); } /* @@ -665,3 +742,15 @@ IOR_offset_t POSIX_GetFileSize(aiori_mod_opt_t * test, char *testFileName) return (aggFileSizeFromStat); } + +void POSIX_Initialize(aiori_mod_opt_t * options){ +#ifdef HAVE_GPU_DIRECT + CUfileError_t err = cuFileDriverOpen(); +#endif +} + +void POSIX_Finalize(aiori_mod_opt_t * options){ +#ifdef HAVE_GPU_DIRECT + CUfileError_t err = cuFileDriverClose(); +#endif +} diff --git a/src/aiori-POSIX.h b/src/aiori-POSIX.h index 8884a30..b2f556a 100644 --- a/src/aiori-POSIX.h +++ b/src/aiori-POSIX.h @@ -22,7 +22,7 @@ typedef struct{ /* beegfs variables */ int beegfs_numTargets; /* number storage targets to use */ int beegfs_chunkSize; /* srtipe pattern for new files */ - + int gpuDirect; } posix_options_t; void POSIX_Sync(aiori_mod_opt_t * param); diff --git a/src/ior.c b/src/ior.c index dd0f048..a591b18 100755 --- a/src/ior.c +++ b/src/ior.c @@ -33,6 +33,10 @@ # include /* uname() */ #endif +#ifdef HAVE_CUDA +#include +#endif + #include #include "ior.h" @@ -113,6 +117,13 @@ static int test_initialize(IOR_test_t * test){ verbose = test->params.verbose; backend = test->params.backend; +#ifdef HAVE_CUDA + cudaError_t cret = cudaSetDevice(test->params.gpuID); + if(cret != cudaSuccess){ + EWARNF("cudaSetDevice(%d) error: %s", test->params.gpuID, cudaGetErrorString(cret)); + } +#endif + if(backend->initialize){ backend->initialize(test->params.backend_options); } @@ -507,44 +518,6 @@ static int CountErrors(IOR_param_t * test, int access, int errors) return (allErrors); } -/* - * Allocate a page-aligned (required by O_DIRECT) buffer. - */ -static void *aligned_buffer_alloc(size_t size) -{ - size_t pageMask; - char *buf, *tmp; - char *aligned; - -#ifdef HAVE_SYSCONF - long pageSize = sysconf(_SC_PAGESIZE); -#else - size_t pageSize = getpagesize(); -#endif - - pageMask = pageSize - 1; - buf = malloc(size + pageSize + sizeof(void *)); - if (buf == NULL) - ERR("out of memory"); - /* find the alinged buffer */ - tmp = buf + sizeof(char *); - aligned = tmp + pageSize - ((size_t) tmp & pageMask); - /* write a pointer to the original malloc()ed buffer into the bytes - preceding "aligned", so that the aligned buffer can later be free()ed */ - tmp = aligned - sizeof(void *); - *(void **)tmp = buf; - - return (void *)aligned; -} - -/* - * Free a buffer allocated by aligned_buffer_alloc(). - */ -static void aligned_buffer_free(void *buf) -{ - free(*(void **)((char *)buf - sizeof(char *))); -} - void AllocResults(IOR_test_t *test) { int reps; @@ -1053,7 +1026,7 @@ static void InitTests(IOR_test_t *tests) static void XferBuffersSetup(IOR_io_buffers* ioBuffers, IOR_param_t* test, int pretendRank) { - ioBuffers->buffer = aligned_buffer_alloc(test->transferSize); + ioBuffers->buffer = aligned_buffer_alloc(test->transferSize, test->gpuMemoryFlags); } /* @@ -1062,7 +1035,7 @@ static void XferBuffersSetup(IOR_io_buffers* ioBuffers, IOR_param_t* test, static void XferBuffersFree(IOR_io_buffers* ioBuffers, IOR_param_t* test) { - aligned_buffer_free(ioBuffers->buffer); + aligned_buffer_free(ioBuffers->buffer, test->gpuMemoryFlags); } @@ -1878,7 +1851,7 @@ static IOR_offset_t WriteOrRead(IOR_param_t *test, IOR_results_t *results, void * randomPrefillBuffer = NULL; if(test->randomPrefillBlocksize && (access == WRITE || access == WRITECHECK)){ - randomPrefillBuffer = aligned_buffer_alloc(test->randomPrefillBlocksize); + randomPrefillBuffer = aligned_buffer_alloc(test->randomPrefillBlocksize, test->gpuMemoryFlags); // store invalid data into the buffer memset(randomPrefillBuffer, -1, test->randomPrefillBlocksize); } @@ -2000,7 +1973,7 @@ static IOR_offset_t WriteOrRead(IOR_param_t *test, IOR_results_t *results, backend->fsync(fd, test->backend_options); /*fsync after all accesses */ } if(randomPrefillBuffer){ - aligned_buffer_free(randomPrefillBuffer); + aligned_buffer_free(randomPrefillBuffer, test->gpuMemoryFlags); } return (dataMoved); diff --git a/src/ior.h b/src/ior.h index e4663db..c58b198 100755 --- a/src/ior.h +++ b/src/ior.h @@ -58,6 +58,11 @@ enum PACKET_TYPE }; +typedef enum{ + IOR_MEMORY_TYPE_CPU = 0, + IOR_MEMORY_TYPE_GPU_MANAGED = 1, + IOR_MEMORY_TYPE_GPU_DEVICE_ONLY = 2, +} ior_memory_flags; /***************** IOR_BUFFERS *************************************************/ @@ -101,7 +106,10 @@ typedef struct MPI_Comm testComm; /* Current MPI communicator */ MPI_Comm mpi_comm_world; /* The global MPI communicator */ int dryRun; /* do not perform any I/Os just run evtl. inputs print dummy output */ - int dualMount; /* dual mount points */ + int dualMount; /* dual mount points */ + ior_memory_flags gpuMemoryFlags; /* use the GPU to store the data */ + int gpuDirect; /* use gpuDirect, this influences gpuMemoryFlags as well */ + int gpuID; /* the GPU to use for gpuDirect or memory options */ int numTasks; /* number of tasks for test */ int numNodes; /* number of nodes for test */ int numTasksOnNode0; /* number of tasks on node 0 (usually all the same, but don't have to be, use with caution) */ diff --git a/src/md-workbench.c b/src/md-workbench.c index 7f08611..fc51800 100644 --- a/src/md-workbench.c +++ b/src/md-workbench.c @@ -92,6 +92,7 @@ struct benchmark_options{ int read_only; int stonewall_timer; int stonewall_timer_wear_out; + int gpu_memory_flags; /* use the GPU to store the data */ char * latency_file_prefix; int latency_keep_all; @@ -381,7 +382,7 @@ static void compute_histogram(const char * name, time_result_t * times, time_sta sprintf(file, "%s-%.2f-%d-%s.csv", o.latency_file_prefix, o.relative_waiting_factor, o.global_iteration, name); FILE * f = fopen(file, "w+"); if(f == NULL){ - ERRF("%d: Error writing to latency file: %s\n", o.rank, file); + ERRF("%d: Error writing to latency file: %s", o.rank, file); return; } fprintf(f, "time,runtime\n"); @@ -546,12 +547,12 @@ void run_precreate(phase_stat_t * s, int current_index){ }else{ s->dset_create.err++; if (! o.ignore_precreate_errors){ - ERRF("%d: Error while creating the dset: %s\n", o.rank, dset); + ERRF("%d: Error while creating the dset: %s", o.rank, dset); } } } - char * buf = malloc(o.file_size); + char * buf = aligned_buffer_alloc(o.file_size, o.gpu_memory_flags); generate_memory_pattern(buf, o.file_size, o.random_buffer_offset, o.rank); double op_timer; // timer for individual operations size_t pos = -1; // position inside the individual measurement array @@ -574,7 +575,7 @@ void run_precreate(phase_stat_t * s, int current_index){ }else{ s->obj_create.err++; if (! o.ignore_precreate_errors){ - ERRF("%d: Error while creating the obj: %s\n", o.rank, obj_name); + ERRF("%d: Error while creating the obj: %s", o.rank, obj_name); } } o.backend->close(aiori_fh, o.backend_options); @@ -586,14 +587,14 @@ void run_precreate(phase_stat_t * s, int current_index){ } } } - free(buf); + aligned_buffer_free(buf, o.gpu_memory_flags); } /* FIFO: create a new file, write to it. Then read from the first created file, delete it... */ void run_benchmark(phase_stat_t * s, int * current_index_p){ char obj_name[MAX_PATHLEN]; int ret; - char * buf = malloc(o.file_size); + char * buf = aligned_buffer_alloc(o.file_size, o.gpu_memory_flags); memset(buf, o.rank % 256, o.file_size); double op_timer; // timer for individual operations size_t pos = -1; // position inside the individual measurement array @@ -632,7 +633,7 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ if(ret != 0){ if (o.verbosity) - ERRF("%d: Error while stating the obj: %s\n", o.rank, obj_name); + ERRF("%d: Error while stating the obj: %s", o.rank, obj_name); s->obj_stat.err++; continue; } @@ -704,7 +705,7 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ o.backend->close(aiori_fh, o.backend_options); }else{ if (! o.ignore_precreate_errors){ - ERRF("Unable to open file %s", obj_name); + ERRF("%d: Error while creating the obj: %s", o.rank, obj_name); } EWARNF("Unable to open file %s", obj_name); s->obj_create.err++; @@ -761,7 +762,7 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ *current_index_p += f; } s->repeats = pos + 1; - free(buf); + aligned_buffer_free(buf, o.gpu_memory_flags); } void run_cleanup(phase_stat_t * s, int start_index){ @@ -822,6 +823,7 @@ static option_help options [] = { {'w', "stonewall-timer", "Stop each benchmark iteration after the specified seconds (if not used with -W this leads to process-specific progress!)", OPTION_OPTIONAL_ARGUMENT, 'd', & o.stonewall_timer}, {'W', "stonewall-wear-out", "Stop with stonewall after specified time and use a soft wear-out phase -- all processes perform the same number of iterations", OPTION_FLAG, 'd', & o.stonewall_timer_wear_out}, {'X', "verify-read", "Verify the data on read", OPTION_FLAG, 'd', & o.verify_read}, + {0, "allocateBufferOnGPU", "Allocate the buffer on the GPU.", OPTION_FLAG, 'd', & o.gpu_memory_flags}, {0, "start-item", "The iteration number of the item to start with, allowing to offset the operations", OPTION_OPTIONAL_ARGUMENT, 'l', & o.start_item_number}, {0, "print-detailed-stats", "Print detailed machine parsable statistics.", OPTION_FLAG, 'd', & o.print_detailed_stats}, {0, "read-only", "Run read-only during benchmarking phase (no deletes/writes), probably use with -2", OPTION_FLAG, 'd', & o.read_only}, @@ -844,12 +846,12 @@ static int return_position(){ if( o.rank == 0){ FILE * f = fopen(o.run_info_file, "r"); if(! f){ - ERRF("[ERROR] Could not open %s for restart\n", o.run_info_file); + ERRF("[ERROR] Could not open %s for restart", o.run_info_file); exit(1); } ret = fscanf(f, "pos: %d", & position); if (ret != 1){ - ERRF("Could not read from %s for restart\n", o.run_info_file); + ERRF("Could not read from %s for restart", o.run_info_file); exit(1); } fclose(f); @@ -864,7 +866,7 @@ static void store_position(int position){ } FILE * f = fopen(o.run_info_file, "w"); if(! f){ - ERRF("[ERROR] Could not open %s for saving data\n", o.run_info_file); + ERRF("[ERROR] Could not open %s for saving data", o.run_info_file); exit(1); } fprintf(f, "pos: %d\n", position); diff --git a/src/mdtest.c b/src/mdtest.c index 3394675..558bbf4 100644 --- a/src/mdtest.c +++ b/src/mdtest.c @@ -110,6 +110,7 @@ typedef struct { char unique_rm_uni_dir[MAX_PATHLEN]; char *write_buffer; char *stoneWallingStatusFile; + int gpu_memory_flags; int barriers; @@ -641,10 +642,7 @@ void mdtest_read(int random, int dirs, const long dir_iter, char *path) { /* allocate read buffer */ if (o.read_bytes > 0) { - int alloc_res = posix_memalign((void**)&read_buffer, sysconf(_SC_PAGESIZE), o.read_bytes); - if (alloc_res) { - FAIL("out of memory"); - } + read_buffer = aligned_buffer_alloc(o.read_bytes, o.gpu_memory_flags); memset(read_buffer, -1, o.read_bytes); } @@ -743,7 +741,7 @@ void mdtest_read(int random, int dirs, const long dir_iter, char *path) { o.backend->close (aiori_fh, o.backend_options); } if(o.read_bytes){ - free(read_buffer); + aligned_buffer_free(read_buffer, o.gpu_memory_flags); } } @@ -2236,6 +2234,7 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE * {'Y', NULL, "call the sync command after each phase (included in the timing; note it causes all IO to be flushed from your node)", OPTION_FLAG, 'd', & o.call_sync}, {'z', NULL, "depth of hierarchical directory structure", OPTION_OPTIONAL_ARGUMENT, 'd', & o.depth}, {'Z', NULL, "print time instead of rate", OPTION_FLAG, 'd', & o.print_time}, + {0, "allocateBufferOnGPU", "Allocate the buffer on the GPU.", OPTION_FLAG, 'd', & o.gpu_memory_flags}, {0, "warningAsErrors", "Any warning should lead to an error.", OPTION_FLAG, 'd', & aiori_warning_as_errors}, {0, "saveRankPerformanceDetails", "Save the individual rank information into this CSV file.", OPTION_OPTIONAL_ARGUMENT, 's', & o.saveRankDetailsCSV}, LAST_OPTION @@ -2420,10 +2419,7 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE * /* allocate and initialize write buffer with # */ if (o.write_bytes > 0) { - int alloc_res = posix_memalign((void**)& o.write_buffer, sysconf(_SC_PAGESIZE), o.write_bytes); - if (alloc_res) { - FAIL("out of memory"); - } + o.write_buffer = aligned_buffer_alloc(o.write_bytes, o.gpu_memory_flags); generate_memory_pattern(o.write_buffer, o.write_bytes, o.random_buffer_offset, rank); } @@ -2560,7 +2556,7 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE * } if (o.write_bytes > 0) { - free(o.write_buffer); + aligned_buffer_free(o.write_buffer, o.gpu_memory_flags); } free(o.summary_table); diff --git a/src/option.c b/src/option.c index c44dc9b..7be3df6 100644 --- a/src/option.c +++ b/src/option.c @@ -264,11 +264,13 @@ static void option_parse_token(char ** argv, int * flag_parsed_next, int * requi return; } txt++; - + int parsed = 0; + + // printf("Parsing: %s : %s\n", txt, arg); // support groups of multiple flags like -vvv or -vq for(int flag_index = 0; flag_index < strlen(txt); ++flag_index){ // don't loop looking for multiple flags if we already processed a long option - if(txt[0] == '-' && flag_index > 0) + if(txt[flag_index] == '=' || (txt[0] == '-' && flag_index > 0)) break; for(int m = 0; m < opt_all->module_count; m++ ){ @@ -281,6 +283,7 @@ static void option_parse_token(char ** argv, int * flag_parsed_next, int * requi continue; } if ( (o->shortVar == txt[flag_index]) || (strlen(txt) > 2 && txt[0] == '-' && o->longVar != NULL && strcmp(txt + 1, o->longVar) == 0)){ + // printf("Found %s %c=%c? %d %d\n", o->help, o->shortVar, txt[flag_index], (o->shortVar == txt[flag_index]), (strlen(txt) > 2 && txt[0] == '-' && o->longVar != NULL && strcmp(txt + 1, o->longVar) == 0)); // now process the option. switch(o->arg){ case (OPTION_FLAG):{ @@ -370,12 +373,13 @@ static void option_parse_token(char ** argv, int * flag_parsed_next, int * requi (*requiredArgsSeen)++; } - return; + parsed = 1; } } } } - + if(parsed) return; + if(strcmp(txt, "h") == 0 || strcmp(txt, "-help") == 0){ *print_help = 1; }else{ diff --git a/src/parse_options.c b/src/parse_options.c index 82fab98..605de91 100755 --- a/src/parse_options.c +++ b/src/parse_options.c @@ -62,7 +62,17 @@ static void CheckRunSettings(IOR_test_t *tests) } if(params->dualMount && !params->filePerProc) { - MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, -1), "Dual Mount can only be used with File Per Process"); + ERR("Dual Mount can only be used with File Per Process"); + } + + if(params->gpuDirect){ + if(params->gpuMemoryFlags == IOR_MEMORY_TYPE_GPU_MANAGED){ + ERR("GPUDirect cannot be used with managed memory"); + } + params->gpuMemoryFlags = IOR_MEMORY_TYPE_GPU_DEVICE_ONLY; + if(params->checkRead || params->checkWrite){ + ERR("GPUDirect data cannot yet be checked"); + } } } } @@ -138,6 +148,12 @@ void DecodeDirective(char *line, IOR_param_t *params, options_all_t * module_opt params->testFileName = strdup(value); } else if (strcasecmp(option, "dualmount") == 0){ params->dualMount = atoi(value); + } else if (strcasecmp(option, "allocateBufferOnGPU") == 0) { + params->gpuMemoryFlags = atoi(value); + } else if (strcasecmp(option, "GPUid") == 0) { + params->gpuID = atoi(value); + } else if (strcasecmp(option, "GPUDirect") == 0) { + params->gpuDirect = atoi(value); } else if (strcasecmp(option, "deadlineforstonewalling") == 0) { params->deadlineForStonewalling = atoi(value); } else if (strcasecmp(option, "stoneWallingWearOut") == 0) { @@ -413,6 +429,13 @@ option_help * createGlobalOptions(IOR_param_t * params){ {.help=" -O stoneWallingWearOut=1 -- once the stonewalling timeout is over, all process finish to access the amount of data", .arg = OPTION_OPTIONAL_ARGUMENT}, {.help=" -O stoneWallingWearOutIterations=N -- stop after processing this number of iterations, needed for reading data back written with stoneWallingWearOut", .arg = OPTION_OPTIONAL_ARGUMENT}, {.help=" -O stoneWallingStatusFile=FILE -- this file keeps the number of iterations from stonewalling during write and allows to use them for read", .arg = OPTION_OPTIONAL_ARGUMENT}, +#ifdef HAVE_CUDA + {.help=" -O allocateBufferOnGPU=X -- allocate I/O buffers on the GPU: X=1 uses managed memory, X=2 device memory.", .arg = OPTION_OPTIONAL_ARGUMENT}, + {.help=" -O GPUid=X -- select the GPU to use.", .arg = OPTION_OPTIONAL_ARGUMENT}, +#ifdef HAVE_GPU_DIRECT + {0, "gpuDirect", "allocate I/O buffers on the GPU and use gpuDirect to store data; this option is incompatible with any option requiring CPU access to data.", OPTION_FLAG, 'd', & params->gpuDirect}, +#endif +#endif {'e', NULL, "fsync -- perform a fsync() operation at the end of each read/write phase", OPTION_FLAG, 'd', & params->fsync}, {'E', NULL, "useExistingTestFile -- do not remove test file before write access", OPTION_FLAG, 'd', & params->useExistingTestFile}, {'f', NULL, "scriptFile -- test script name", OPTION_OPTIONAL_ARGUMENT, 's', & params->testscripts}, diff --git a/src/utilities.c b/src/utilities.c index 16a31b0..9ab5432 100755 --- a/src/utilities.c +++ b/src/utilities.c @@ -37,6 +37,10 @@ #include #include +#ifdef HAVE_CUDA +#include +#endif + #ifndef _WIN32 # include # ifdef __sun /* SunOS does not support statfs(), instead uses statvfs() */ @@ -210,7 +214,7 @@ void updateParsedOptions(IOR_param_t * options, options_all_t * global_options){ /* Used in aiori-POSIX.c and aiori-PLFS.c */ -void set_o_direct_flag(int *fd) +void set_o_direct_flag(int *flag) { /* note that TRU64 needs O_DIRECTIO, SunOS uses directio(), and everyone else needs O_DIRECT */ @@ -223,7 +227,7 @@ void set_o_direct_flag(int *fd) # endif /* not O_DIRECTIO */ #endif /* not O_DIRECT */ - *fd |= O_DIRECT; + *flag |= O_DIRECT; } @@ -911,3 +915,72 @@ unsigned long GetProcessorAndCore(int *chip, int *core){ return 1; } #endif + + + +/* + * Allocate a page-aligned (required by O_DIRECT) buffer. + */ +void *aligned_buffer_alloc(size_t size, ior_memory_flags type) +{ + size_t pageMask; + char *buf, *tmp; + char *aligned; + + if(type == IOR_MEMORY_TYPE_GPU_MANAGED){ +#ifdef HAVE_CUDA + // use unified memory here to allow drop-in-replacement + if (cudaMallocManaged((void**) & buf, size, cudaMemAttachGlobal) != cudaSuccess){ + ERR("Cannot allocate buffer on GPU"); + } + return buf; +#else + ERR("No CUDA supported, cannot allocate on the GPU"); +#endif + }else if(type == IOR_MEMORY_TYPE_GPU_DEVICE_ONLY){ +#ifdef HAVE_GPU_DIRECT + if (cudaMalloc((void**) & buf, size) != cudaSuccess){ + ERR("Cannot allocate buffer on GPU"); + } + return buf; +#else + ERR("No GPUDirect supported, cannot allocate on the GPU"); +#endif + } + +#ifdef HAVE_SYSCONF + long pageSize = sysconf(_SC_PAGESIZE); +#else + size_t pageSize = getpagesize(); +#endif + + pageMask = pageSize - 1; + buf = safeMalloc(size + pageSize + sizeof(void *)); + /* find the alinged buffer */ + tmp = buf + sizeof(char *); + aligned = tmp + pageSize - ((size_t) tmp & pageMask); + /* write a pointer to the original malloc()ed buffer into the bytes + preceding "aligned", so that the aligned buffer can later be free()ed */ + tmp = aligned - sizeof(void *); + *(void **)tmp = buf; + + return (void *)aligned; +} + +/* + * Free a buffer allocated by aligned_buffer_alloc(). + */ +void aligned_buffer_free(void *buf, ior_memory_flags gpu) +{ + if(gpu){ +#ifdef HAVE_CUDA + if (cudaFree(buf) != cudaSuccess){ + WARN("Cannot free buffer on GPU"); + } + return; +#else + ERR("No CUDA supported, cannot free on the GPU"); +#endif + } + free(*(void **)((char *)buf - sizeof(char *))); +} diff --git a/src/utilities.h b/src/utilities.h index 202bcad..97dc2c0 100755 --- a/src/utilities.h +++ b/src/utilities.h @@ -63,5 +63,6 @@ void init_clock(MPI_Comm com); double GetTimeStamp(void); char * PrintTimestamp(); // TODO remove this function unsigned long GetProcessorAndCore(int *chip, int *core); - +void *aligned_buffer_alloc(size_t size, ior_memory_flags type); +void aligned_buffer_free(void *buf, ior_memory_flags type); #endif /* !_UTILITIES_H */