From 74df77430fbaaaadcd2ac0adb1f997fe9af84a9d Mon Sep 17 00:00:00 2001
From: Julian Kunkel <JulianKunkel@users.noreply.github.com>
Date: Thu, 18 Feb 2021 10:40:42 +0000
Subject: [PATCH] Feature ior gpu #284 (#323)

* Basic support for memory allocation on GPU using CUDA unified memory. Partially addressing #284. IOR support completed.
* Support for GPU alloc in MDTest and MD-Workbench
* Option: support repeated parsing of same option (allows option sharing across modules).
* Checks for gpuDirect
* Integrate gpuDirect options and basic hooks, more testing to be done.
* POSIX: basic gpuDirect implementation working with fake-gpudirect library.
* CUDA allow setting of DeviceID for IOR (not yet MDTest).
* CUDA/GPUDirect Support --with-X=<path>
* Bugfix in option parser for flags that are part of an argument for an option, e.g., -O=1, if 1 is a flag it is wrongly assumed to be a flag.
---
 configure.ac        |  46 ++++++++++++
 src/Makefile.am     |   4 +
 src/aiori-POSIX.c   | 179 +++++++++++++++++++++++++++++++++-----------
 src/aiori-POSIX.h   |   2 +-
 src/ior.c           |  57 ++++----------
 src/ior.h           |  10 ++-
 src/md-workbench.c  |  26 ++++---
 src/mdtest.c        |  16 ++--
 src/option.c        |  12 ++-
 src/parse_options.c |  25 ++++++-
 src/utilities.c     |  77 ++++++++++++++++++-
 src/utilities.h     |   3 +-
 12 files changed, 338 insertions(+), 119 deletions(-)
diff --git a/configure.ac b/configure.ac
index a9d106a..b758f5b 100755
--- a/configure.ac
+++ b/configure.ac
@@ -73,6 +73,52 @@ AS_IF([test "$ac_cv_header_gpfs_h" = "yes" -o "$ac_cv_header_gpfs_fcntl_h" = "ye
     ])
 ])
 
+# Check for CUDA
+AC_ARG_WITH([cuda],
+        [AS_HELP_STRING([--with-cuda],
+                [support configurable CUDA @<:@default=check@:>@])],
+        [], [with_cuda=check])
+
+AS_IF([test "x$with_cuda" != xno], [
+        LDFLAGS="$LDFLAGS -L$with_cuda/lib64 -Wl,--enable-new-dtags -Wl,-rpath=$with_cuda/lib64"
+        CPPFLAGS="$CPPFLAGS -I$with_cuda/include"
+
+        AC_CHECK_HEADERS([cuda_runtime.h], [AC_DEFINE([HAVE_CUDA], [], [CUDA GPU API found])], [
+                if test "x$with_cuda" != xcheck; then
+                        AC_MSG_FAILURE([--with-cuda was given, <cuda_runtime.h> not found])
+                fi
+        ])
+AS_IF([test "$ac_cv_header_cuda_runtime_h" = "yes"], [
+        AC_SEARCH_LIBS([cudaMalloc], [cudart cudart_static], [],
+        [AC_MSG_ERROR([Library containing cudaMalloc symbol not found])])
+    ])
+])
+AM_CONDITIONAL([USE_CUDA], [test x$with_cuda = xyes])
+
+# Check for GPUDirect
+AC_ARG_WITH([gpuDirect],
+        [AS_HELP_STRING([--with-gpuDirect],
+                [support configurable GPUDirect @<:@default=check@:>@])],
+        [], [with_gpuDirect=check])
+
+AS_IF([test "x$with_gpuDirect" != xno], [
+        LDFLAGS="$LDFLAGS -L$with_gpuDirect/lib64 -Wl,--enable-new-dtags -Wl,-rpath=$with_gpuDirect/lib64"
+        CPPFLAGS="$CPPFLAGS -I$with_gpuDirect/include"
+
+        AC_CHECK_HEADERS([cufile.h], [AC_DEFINE([HAVE_GPU_DIRECT], [], [GPUDirect API found])], [
+                if test "x$with_gpuDirect" != xcheck; then
+                        AC_MSG_FAILURE([--with-gpuDirect was given, <cufile.h> not found])
+                fi
+        ])
+AS_IF([test "$ac_cv_header_cufile_h" = "yes"], [
+        AC_SEARCH_LIBS([cuFileDriverOpen], [cufile], [],
+        [AC_MSG_ERROR([Library containing cuFileDriverOpen symbol not found])])
+    ])
+])
+AM_CONDITIONAL([HAVE_GPU_DIRECT], [test x$with_gpuDirect = xyes])
+
+
+
 # Check for system capabilities
 AC_SYS_LARGEFILE
 
diff --git a/src/Makefile.am b/src/Makefile.am
index 52461a1..fdf746f 100755
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -41,6 +41,10 @@ extraLDFLAGS  += -L/opt/hadoop-2.2.0/lib/native
 extraLDADD    += -lhdfs
 endif
 
+if USE_CUDA
+extraLDADD    += -lcudart
+endif
+
 if USE_HDF5_AIORI
 extraSOURCES  += aiori-HDF5.c
 extraLDADD    += -lhdf5 -lz
diff --git a/src/aiori-POSIX.c b/src/aiori-POSIX.c
index 5f6261a..5040a53 100755
--- a/src/aiori-POSIX.c
+++ b/src/aiori-POSIX.c
@@ -57,6 +57,20 @@
 
 #include "aiori-POSIX.h"
 
+#ifdef HAVE_GPU_DIRECT
+typedef long long loff_t;
+#include <cuda_runtime.h>
+#include <cufile.h>
+#endif
+
+typedef struct {
+  int fd;
+#ifdef HAVE_GPU_DIRECT
+  CUfileHandle_t cf_handle;
+#endif
+} posix_fd;
+
+
 #ifndef   open64                /* necessary for TRU64 -- */
 #  define open64  open            /* unlikely, but may pose */
 #endif  /* not open64 */                        /* conflicting prototypes */
@@ -69,7 +83,30 @@
 #  define O_BINARY 0
 #endif
 
+#ifdef HAVE_GPU_DIRECT
+static const char* cuFileGetErrorString(CUfileError_t status){
+  if(IS_CUDA_ERR(status)){
+    return cudaGetErrorString(status.err);
+  }
+  return strerror(status.err);
+}
+
+static void init_cufile(posix_fd * pfd){
+  CUfileDescr_t cf_descr = (CUfileDescr_t){
+    .handle.fd = pfd->fd,
+    .type = CU_FILE_HANDLE_TYPE_OPAQUE_FD
+  };
+  CUfileError_t status = cuFileHandleRegister(& pfd->cf_handle, & cf_descr);
+  if(status.err != CU_FILE_SUCCESS){
+    EWARNF("Could not register handle %s", cuFileGetErrorString(status));
+  }
+}
+#endif
+
 /**************************** P R O T O T Y P E S *****************************/
+static void POSIX_Initialize(aiori_mod_opt_t * options);
+static void POSIX_Finalize(aiori_mod_opt_t * options);
+
 static IOR_offset_t POSIX_Xfer(int, aiori_fd_t *, IOR_size_t *,
                                IOR_offset_t, IOR_offset_t, aiori_mod_opt_t *);
 
@@ -105,6 +142,9 @@ option_help * POSIX_options(aiori_mod_opt_t ** init_backend_options, aiori_mod_o
     {0, "posix.lustre.stripesize", "", OPTION_OPTIONAL_ARGUMENT, 'd', & o->lustre_stripe_size},
     {0, "posix.lustre.startost", "", OPTION_OPTIONAL_ARGUMENT, 'd', & o->lustre_start_ost},
     {0, "posix.lustre.ignorelocks", "", OPTION_FLAG, 'd', & o->lustre_ignore_locks},
+#endif
+#ifdef HAVE_GPU_DIRECT
+    {0, "gpuDirect",        "allocate I/O buffers on the GPU", OPTION_FLAG, 'd', & o->gpuDirect},
 #endif
     LAST_OPTION
   };
@@ -120,6 +160,8 @@ option_help * POSIX_options(aiori_mod_opt_t ** init_backend_options, aiori_mod_o
 ior_aiori_t posix_aiori = {
         .name = "POSIX",
         .name_legacy = NULL,
+        .initialize = POSIX_Initialize,
+        .finalize = POSIX_Finalize,
         .create = POSIX_Create,
         .mknod = POSIX_Mknod,
         .open = POSIX_Open,
@@ -156,6 +198,14 @@ int POSIX_check_params(aiori_mod_opt_t * param){
         ERR("beegfsChunkSize must be a power of two and >64k");
   if(o->lustre_stripe_count != -1 || o->lustre_stripe_size != 0)
     o->lustre_set_striping = 1;
+  if(o->gpuDirect && ! o->direct_io){
+    ERR("GPUDirect required direct I/O to be used!");
+  }
+#ifndef HAVE_GPU_DIRECT
+  if(o->gpuDirect){
+    ERR("GPUDirect support is not compiled");
+  }
+#endif
   return 0;
 }
 
@@ -352,14 +402,10 @@ aiori_fd_t *POSIX_Create(char *testFileName, int flags, aiori_mod_opt_t * param)
 {
         int fd_oflag = O_BINARY;
         int mode = 0664;
-        int *fd;
-
-        fd = (int *)malloc(sizeof(int));
-        if (fd == NULL)
-                ERR("Unable to malloc file descriptor");
+        posix_fd * pfd = safeMalloc(sizeof(posix_fd));
         posix_options_t * o = (posix_options_t*) param;
         if (o->direct_io == TRUE){
-          set_o_direct_flag(&fd_oflag);
+          set_o_direct_flag(& fd_oflag);
         }
 
         if(hints->dryRun)
@@ -378,8 +424,8 @@ aiori_fd_t *POSIX_Create(char *testFileName, int flags, aiori_mod_opt_t * param)
                 if (!hints->filePerProc && rank != 0) {
                         MPI_CHECK(MPI_Barrier(testComm), "barrier error");
                         fd_oflag |= O_RDWR;
-                        *fd = open64(testFileName, fd_oflag, mode);
-                        if (*fd < 0){
+                        pfd->fd = open64(testFileName, fd_oflag, mode);
+                        if (pfd->fd < 0){
                                 ERRF("open64(\"%s\", %d, %#o) failed. Error: %s",
                                         testFileName, fd_oflag, mode, strerror(errno));
                         }
@@ -396,16 +442,16 @@ aiori_fd_t *POSIX_Create(char *testFileName, int flags, aiori_mod_opt_t * param)
                          * Lustre striping information on a pre-existing file.*/
 
                         fd_oflag |= O_CREAT | O_EXCL | O_RDWR | O_LOV_DELAY_CREATE;
-                        *fd = open64(testFileName, fd_oflag, mode);
-                        if (*fd < 0) {
+                        pfd->fd = open64(testFileName, fd_oflag, mode);
+                        if (pfd->fd < 0) {
                                 ERRF("Unable to open '%s': %s\n",
                                         testFileName, strerror(errno));
-                        } else if (ioctl(*fd, LL_IOC_LOV_SETSTRIPE, &opts)) {
+                        } else if (ioctl(pfd->fd, LL_IOC_LOV_SETSTRIPE, &opts)) {
                                 char *errmsg = "stripe already set";
                                 if (errno != EEXIST && errno != EALREADY)
                                         errmsg = strerror(errno);
                                 ERRF("Error on ioctl for '%s' (%d): %s\n",
-                                        testFileName, *fd, errmsg);
+                                        testFileName, pfd->fd, errmsg);
                         }
                         if (!hints->filePerProc)
                                 MPI_CHECK(MPI_Barrier(testComm),
@@ -431,8 +477,8 @@ aiori_fd_t *POSIX_Create(char *testFileName, int flags, aiori_mod_opt_t * param)
                  }
 #endif /* HAVE_BEEGFS_BEEGFS_H */
 
-                *fd = open64(testFileName, fd_oflag, mode);
-                if (*fd < 0){
+                pfd->fd = open64(testFileName, fd_oflag, mode);
+                if (pfd->fd < 0){
                         ERRF("open64(\"%s\", %d, %#o) failed. Error: %s",
                                 testFileName, fd_oflag, mode, strerror(errno));
                 }
@@ -442,8 +488,8 @@ aiori_fd_t *POSIX_Create(char *testFileName, int flags, aiori_mod_opt_t * param)
 
         if (o->lustre_ignore_locks) {
                 int lustre_ioctl_flags = LL_FILE_IGNORE_LOCK;
-                if (ioctl(*fd, LL_IOC_SETFLAGS, &lustre_ioctl_flags) == -1)
-                        ERRF("ioctl(%d, LL_IOC_SETFLAGS, ...) failed", *fd);
+                if (ioctl(pfd->fd, LL_IOC_SETFLAGS, &lustre_ioctl_flags) == -1)
+                        ERRF("ioctl(%d, LL_IOC_SETFLAGS, ...) failed", pfd->fd);
         }
 #endif                          /* HAVE_LUSTRE_USER */
 
@@ -452,10 +498,15 @@ aiori_fd_t *POSIX_Create(char *testFileName, int flags, aiori_mod_opt_t * param)
          * the intent that we can avoid some byte range lock revocation:
          * everyone will be writing/reading from individual regions */
         if (o->gpfs_release_token ) {
-                gpfs_free_all_locks(*fd);
+                gpfs_free_all_locks(pfd->fd);
         }
 #endif
-        return (aiori_fd_t*) fd;
+#ifdef HAVE_GPU_DIRECT
+  if(o->gpuDirect){
+    init_cufile(pfd);
+  }
+#endif
+        return (aiori_fd_t*) pfd;
 }
 
 /*
@@ -477,24 +528,18 @@ int POSIX_Mknod(char *testFileName)
  */
 aiori_fd_t *POSIX_Open(char *testFileName, int flags, aiori_mod_opt_t * param)
 {
-        int fd_oflag = O_BINARY;
-        int *fd;
-
-        fd = (int *)malloc(sizeof(int));
-        if (fd == NULL)
-                ERR("Unable to malloc file descriptor");
-
+        int fd_oflag = O_BINARY | O_RDWR;
+        posix_fd * pfd = safeMalloc(sizeof(posix_fd));
         posix_options_t * o = (posix_options_t*) param;
-        if (o->direct_io == TRUE)
+        if (o->direct_io == TRUE){
                 set_o_direct_flag(&fd_oflag);
-
-        fd_oflag |= O_RDWR;
+        }
 
         if(hints->dryRun)
           return (aiori_fd_t*) 0;
 
-        *fd = open64(testFileName, fd_oflag);
-        if (*fd < 0)
+        pfd->fd = open64(testFileName, fd_oflag);
+        if (pfd->fd < 0)
                 ERRF("open64(\"%s\", %d) failed: %s", testFileName, fd_oflag, strerror(errno));
 
 #ifdef HAVE_LUSTRE_USER
@@ -503,17 +548,22 @@ aiori_fd_t *POSIX_Open(char *testFileName, int flags, aiori_mod_opt_t * param)
                 if (verbose >= VERBOSE_1) {
                         EINFO("** Disabling lustre range locking **\n");
                 }
-                if (ioctl(*fd, LL_IOC_SETFLAGS, &lustre_ioctl_flags) == -1)
-                        ERRF("ioctl(%d, LL_IOC_SETFLAGS, ...) failed", *fd);
+                if (ioctl(pfd->fd, LL_IOC_SETFLAGS, &lustre_ioctl_flags) == -1)
+                        ERRF("ioctl(%d, LL_IOC_SETFLAGS, ...) failed", pfd->fd);
         }
 #endif                          /* HAVE_LUSTRE_USER */
 
 #ifdef HAVE_GPFS_FCNTL_H
         if(o->gpfs_release_token) {
-                gpfs_free_all_locks(*fd);
+                gpfs_free_all_locks(pfd->fd);
         }
 #endif
-        return (aiori_fd_t*) fd;
+#ifdef HAVE_GPU_DIRECT
+        if(o->gpuDirect){
+          init_cufile(pfd);
+        }
+#endif
+        return (aiori_fd_t*) pfd;
 }
 
 /*
@@ -532,7 +582,8 @@ static IOR_offset_t POSIX_Xfer(int access, aiori_fd_t *file, IOR_size_t * buffer
         if(hints->dryRun)
           return length;
 
-        fd = *(int *)file;
+        posix_fd * pfd = (posix_fd *) file;
+        fd = pfd->fd;
 
 #ifdef HAVE_GPFS_FCNTL_H
         if (o->gpfs_hint_access) {
@@ -544,7 +595,7 @@ static IOR_offset_t POSIX_Xfer(int access, aiori_fd_t *file, IOR_size_t * buffer
         /* seek to offset */
         if (lseek64(fd, offset, SEEK_SET) == -1)
                 ERRF("lseek64(%d, %lld, SEEK_SET) failed", fd, offset);
-
+        off_t mem_offset = 0;
         while (remaining > 0) {
                 /* write/read file */
                 if (access == WRITE) {  /* WRITE */
@@ -553,7 +604,15 @@ static IOR_offset_t POSIX_Xfer(int access, aiori_fd_t *file, IOR_size_t * buffer
                                         rank,
                                         offset + length - remaining);
                         }
-                        rc = write(fd, ptr, remaining);
+#ifdef HAVE_GPU_DIRECT
+                        if(o->gpuDirect){
+                          rc = cuFileWrite(pfd->cf_handle, ptr, remaining, offset + mem_offset, mem_offset);
+                        }else{
+#endif
+                          rc = write(fd, ptr, remaining);
+#ifdef HAVE_GPU_DIRECT
+                        }
+#endif
                         if (rc == -1)
                                 ERRF("write(%d, %p, %lld) failed",
                                         fd, (void*)ptr, remaining);
@@ -566,7 +625,15 @@ static IOR_offset_t POSIX_Xfer(int access, aiori_fd_t *file, IOR_size_t * buffer
                                         rank,
                                         offset + length - remaining);
                         }
-                        rc = read(fd, ptr, remaining);
+#ifdef HAVE_GPU_DIRECT
+                        if(o->gpuDirect){
+                          rc = cuFileRead(pfd->cf_handle, ptr, remaining, offset + mem_offset, mem_offset);
+                        }else{
+#endif
+                          rc = read(fd, ptr, remaining);
+#ifdef HAVE_GPU_DIRECT
+                        }
+#endif
                         if (rc == 0)
                                 ERRF("read(%d, %p, %lld) returned EOF prematurely",
                                         fd, (void*)ptr, remaining);
@@ -587,6 +654,7 @@ static IOR_offset_t POSIX_Xfer(int access, aiori_fd_t *file, IOR_size_t * buffer
                 assert(rc <= remaining);
                 remaining -= rc;
                 ptr += rc;
+                mem_offset += rc;
                 xferRetries++;
         }
 #ifdef HAVE_GPFS_FCNTL_H
@@ -597,10 +665,11 @@ static IOR_offset_t POSIX_Xfer(int access, aiori_fd_t *file, IOR_size_t * buffer
         return (length);
 }
 
-void POSIX_Fsync(aiori_fd_t *fd, aiori_mod_opt_t * param)
+void POSIX_Fsync(aiori_fd_t *afd, aiori_mod_opt_t * param)
 {
-        if (fsync(*(int *)fd) != 0)
-                EWARNF("fsync(%d) failed", *(int *)fd);
+    int fd = ((posix_fd*) afd)->fd;
+    if (fsync(fd) != 0)
+        EWARNF("fsync(%d) failed", fd);
 }
 
 
@@ -616,13 +685,21 @@ void POSIX_Sync(aiori_mod_opt_t * param)
 /*
  * Close a file through the POSIX interface.
  */
-void POSIX_Close(aiori_fd_t *fd, aiori_mod_opt_t * param)
+void POSIX_Close(aiori_fd_t *afd, aiori_mod_opt_t * param)
 {
         if(hints->dryRun)
           return;
-        if (close(*(int *)fd) != 0)
-                ERRF("close(%d) failed", *(int *)fd);
-        free(fd);
+        posix_options_t * o = (posix_options_t*) param;
+        int fd = ((posix_fd*) afd)->fd;
+#ifdef HAVE_GPU_DIRECT
+        if(o->gpuDirect){
+          cuFileHandleDeregister(((posix_fd*) afd)->cf_handle);
+        }
+#endif
+        if (close(fd) != 0){
+                ERRF("close(%d) failed", fd);
+        }
+        free(afd);
 }
 
 /*
@@ -665,3 +742,15 @@ IOR_offset_t POSIX_GetFileSize(aiori_mod_opt_t * test, char *testFileName)
 
         return (aggFileSizeFromStat);
 }
+
+void POSIX_Initialize(aiori_mod_opt_t * options){
+#ifdef HAVE_GPU_DIRECT
+  CUfileError_t err = cuFileDriverOpen();
+#endif
+}
+
+void POSIX_Finalize(aiori_mod_opt_t * options){
+#ifdef HAVE_GPU_DIRECT
+  CUfileError_t err = cuFileDriverClose();
+#endif
+}
diff --git a/src/aiori-POSIX.h b/src/aiori-POSIX.h
index 8884a30..b2f556a 100644
--- a/src/aiori-POSIX.h
+++ b/src/aiori-POSIX.h
@@ -22,7 +22,7 @@ typedef struct{
   /* beegfs variables */
   int beegfs_numTargets;           /* number storage targets to use */
   int beegfs_chunkSize;            /* srtipe pattern for new files */
-
+  int gpuDirect;
 } posix_options_t;
 
 void POSIX_Sync(aiori_mod_opt_t * param);
diff --git a/src/ior.c b/src/ior.c
index dd0f048..a591b18 100755
--- a/src/ior.c
+++ b/src/ior.c
@@ -33,6 +33,10 @@
 # include <sys/utsname.h>        /* uname() */
 #endif
 
+#ifdef HAVE_CUDA
+#include <cuda_runtime.h>
+#endif
+
 #include <assert.h>
 
 #include "ior.h"
@@ -113,6 +117,13 @@ static int test_initialize(IOR_test_t * test){
   verbose = test->params.verbose;
   backend = test->params.backend;
 
+#ifdef HAVE_CUDA
+  cudaError_t cret = cudaSetDevice(test->params.gpuID);
+  if(cret != cudaSuccess){
+    EWARNF("cudaSetDevice(%d) error: %s", test->params.gpuID, cudaGetErrorString(cret));
+  }
+#endif
+
   if(backend->initialize){
     backend->initialize(test->params.backend_options);
   }
@@ -507,44 +518,6 @@ static int CountErrors(IOR_param_t * test, int access, int errors)
         return (allErrors);
 }
 
-/*
- * Allocate a page-aligned (required by O_DIRECT) buffer.
- */
-static void *aligned_buffer_alloc(size_t size)
-{
-        size_t pageMask;
-        char *buf, *tmp;
-        char *aligned;
-
-#ifdef HAVE_SYSCONF
-        long pageSize = sysconf(_SC_PAGESIZE);
-#else
-        size_t pageSize = getpagesize();
-#endif
-
-        pageMask = pageSize - 1;
-        buf = malloc(size + pageSize + sizeof(void *));
-        if (buf == NULL)
-                ERR("out of memory");
-        /* find the alinged buffer */
-        tmp = buf + sizeof(char *);
-        aligned = tmp + pageSize - ((size_t) tmp & pageMask);
-        /* write a pointer to the original malloc()ed buffer into the bytes
-           preceding "aligned", so that the aligned buffer can later be free()ed */
-        tmp = aligned - sizeof(void *);
-        *(void **)tmp = buf;
-
-        return (void *)aligned;
-}
-
-/*
- * Free a buffer allocated by aligned_buffer_alloc().
- */
-static void aligned_buffer_free(void *buf)
-{
-        free(*(void **)((char *)buf - sizeof(char *)));
-}
-
 void AllocResults(IOR_test_t *test)
 {
   int reps;
@@ -1053,7 +1026,7 @@ static void InitTests(IOR_test_t *tests)
 static void XferBuffersSetup(IOR_io_buffers* ioBuffers, IOR_param_t* test,
                              int pretendRank)
 {
-        ioBuffers->buffer = aligned_buffer_alloc(test->transferSize);
+        ioBuffers->buffer = aligned_buffer_alloc(test->transferSize, test->gpuMemoryFlags);
 }
 
 /*
@@ -1062,7 +1035,7 @@ static void XferBuffersSetup(IOR_io_buffers* ioBuffers, IOR_param_t* test,
 static void XferBuffersFree(IOR_io_buffers* ioBuffers, IOR_param_t* test)
 
 {
-        aligned_buffer_free(ioBuffers->buffer);
+        aligned_buffer_free(ioBuffers->buffer, test->gpuMemoryFlags);
 }
 
 
@@ -1878,7 +1851,7 @@ static IOR_offset_t WriteOrRead(IOR_param_t *test, IOR_results_t *results,
 
         void * randomPrefillBuffer = NULL;
         if(test->randomPrefillBlocksize && (access == WRITE || access == WRITECHECK)){
-          randomPrefillBuffer = aligned_buffer_alloc(test->randomPrefillBlocksize);
+          randomPrefillBuffer = aligned_buffer_alloc(test->randomPrefillBlocksize, test->gpuMemoryFlags);
           // store invalid data into the buffer
           memset(randomPrefillBuffer, -1, test->randomPrefillBlocksize);
         }
@@ -2000,7 +1973,7 @@ static IOR_offset_t WriteOrRead(IOR_param_t *test, IOR_results_t *results,
                 backend->fsync(fd, test->backend_options);       /*fsync after all accesses */
         }
         if(randomPrefillBuffer){
-          aligned_buffer_free(randomPrefillBuffer);
+          aligned_buffer_free(randomPrefillBuffer, test->gpuMemoryFlags);
         }
 
         return (dataMoved);
diff --git a/src/ior.h b/src/ior.h
index e4663db..c58b198 100755
--- a/src/ior.h
+++ b/src/ior.h
@@ -58,6 +58,11 @@ enum PACKET_TYPE
 
 };
 
+typedef enum{
+    IOR_MEMORY_TYPE_CPU = 0,
+    IOR_MEMORY_TYPE_GPU_MANAGED = 1,
+    IOR_MEMORY_TYPE_GPU_DEVICE_ONLY = 2,
+} ior_memory_flags;
 
 
 /***************** IOR_BUFFERS *************************************************/
@@ -101,7 +106,10 @@ typedef struct
     MPI_Comm     testComm;           /* Current MPI communicator */
     MPI_Comm     mpi_comm_world;           /* The global MPI communicator */
     int dryRun;                      /* do not perform any I/Os just run evtl. inputs print dummy output */
-  int dualMount;                   /* dual mount points */
+    int dualMount;                   /* dual mount points */
+    ior_memory_flags gpuMemoryFlags;  /* use the GPU to store the data */
+    int gpuDirect;                /* use gpuDirect, this influences gpuMemoryFlags as well */
+    int gpuID;                       /* the GPU to use for gpuDirect or memory options */
     int numTasks;                    /* number of tasks for test */
     int numNodes;                    /* number of nodes for test */
     int numTasksOnNode0;             /* number of tasks on node 0 (usually all the same, but don't have to be, use with caution) */
diff --git a/src/md-workbench.c b/src/md-workbench.c
index 7f08611..fc51800 100644
--- a/src/md-workbench.c
+++ b/src/md-workbench.c
@@ -92,6 +92,7 @@ struct benchmark_options{
   int read_only;
   int stonewall_timer;
   int stonewall_timer_wear_out;
+  int gpu_memory_flags;              /* use the GPU to store the data */
 
   char * latency_file_prefix;
   int latency_keep_all;
@@ -381,7 +382,7 @@ static void compute_histogram(const char * name, time_result_t * times, time_sta
     sprintf(file, "%s-%.2f-%d-%s.csv", o.latency_file_prefix, o.relative_waiting_factor, o.global_iteration, name);
     FILE * f = fopen(file, "w+");
     if(f == NULL){
-      ERRF("%d: Error writing to latency file: %s\n", o.rank, file);
+      ERRF("%d: Error writing to latency file: %s", o.rank, file);
       return;
     }
     fprintf(f, "time,runtime\n");
@@ -546,12 +547,12 @@ void run_precreate(phase_stat_t * s, int current_index){
     }else{
       s->dset_create.err++;
       if (! o.ignore_precreate_errors){
-        ERRF("%d: Error while creating the dset: %s\n", o.rank, dset);
+        ERRF("%d: Error while creating the dset: %s", o.rank, dset);
       }
     }
   }
 
-  char * buf = malloc(o.file_size);
+  char * buf = aligned_buffer_alloc(o.file_size, o.gpu_memory_flags);
   generate_memory_pattern(buf, o.file_size, o.random_buffer_offset, o.rank);
   double op_timer; // timer for individual operations
   size_t pos = -1; // position inside the individual measurement array
@@ -574,7 +575,7 @@ void run_precreate(phase_stat_t * s, int current_index){
       }else{
         s->obj_create.err++;
         if (! o.ignore_precreate_errors){
-         ERRF("%d: Error while creating the obj: %s\n", o.rank, obj_name);
+         ERRF("%d: Error while creating the obj: %s", o.rank, obj_name);
         }
       }
       o.backend->close(aiori_fh, o.backend_options);
@@ -586,14 +587,14 @@ void run_precreate(phase_stat_t * s, int current_index){
       }
     }
   }
-  free(buf);
+  aligned_buffer_free(buf, o.gpu_memory_flags);
 }
 
 /* FIFO: create a new file, write to it. Then read from the first created file, delete it... */
 void run_benchmark(phase_stat_t * s, int * current_index_p){
   char obj_name[MAX_PATHLEN];
   int ret;
-  char * buf = malloc(o.file_size);
+  char * buf = aligned_buffer_alloc(o.file_size, o.gpu_memory_flags);
   memset(buf, o.rank % 256, o.file_size);
   double op_timer; // timer for individual operations
   size_t pos = -1; // position inside the individual measurement array
@@ -632,7 +633,7 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){
 
       if(ret != 0){
         if (o.verbosity)
-          ERRF("%d: Error while stating the obj: %s\n", o.rank, obj_name);
+          ERRF("%d: Error while stating the obj: %s", o.rank, obj_name);
         s->obj_stat.err++;
         continue;
       }
@@ -704,7 +705,7 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){
         o.backend->close(aiori_fh, o.backend_options);
       }else{
         if (! o.ignore_precreate_errors){
-          ERRF("Unable to open file %s", obj_name);
+         ERRF("%d: Error while creating the obj: %s", o.rank, obj_name);
         }
         EWARNF("Unable to open file %s", obj_name);
         s->obj_create.err++;
@@ -761,7 +762,7 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){
     *current_index_p += f;
   }
   s->repeats = pos + 1;
-  free(buf);
+  aligned_buffer_free(buf, o.gpu_memory_flags);
 }
 
 void run_cleanup(phase_stat_t * s, int start_index){
@@ -822,6 +823,7 @@ static option_help options [] = {
   {'w', "stonewall-timer", "Stop each benchmark iteration after the specified seconds (if not used with -W this leads to process-specific progress!)", OPTION_OPTIONAL_ARGUMENT, 'd', & o.stonewall_timer},
   {'W', "stonewall-wear-out", "Stop with stonewall after specified time and use a soft wear-out phase -- all processes perform the same number of iterations", OPTION_FLAG, 'd', & o.stonewall_timer_wear_out},
   {'X', "verify-read", "Verify the data on read", OPTION_FLAG, 'd', & o.verify_read},
+  {0, "allocateBufferOnGPU", "Allocate the buffer on the GPU.", OPTION_FLAG, 'd', & o.gpu_memory_flags},
   {0, "start-item", "The iteration number of the item to start with, allowing to offset the operations", OPTION_OPTIONAL_ARGUMENT, 'l', & o.start_item_number},
   {0, "print-detailed-stats", "Print detailed machine parsable statistics.", OPTION_FLAG, 'd', & o.print_detailed_stats},
   {0, "read-only", "Run read-only during benchmarking phase (no deletes/writes), probably use with -2", OPTION_FLAG, 'd', & o.read_only},
@@ -844,12 +846,12 @@ static int return_position(){
   if( o.rank == 0){
     FILE * f = fopen(o.run_info_file, "r");
     if(! f){
-      ERRF("[ERROR] Could not open %s for restart\n", o.run_info_file);
+      ERRF("[ERROR] Could not open %s for restart", o.run_info_file);
       exit(1);
     }
     ret = fscanf(f, "pos: %d", & position);
     if (ret != 1){
-      ERRF("Could not read from %s for restart\n", o.run_info_file);
+      ERRF("Could not read from %s for restart", o.run_info_file);
       exit(1);
     }
     fclose(f);
@@ -864,7 +866,7 @@ static void store_position(int position){
   }
   FILE * f = fopen(o.run_info_file, "w");
   if(! f){
-    ERRF("[ERROR] Could not open %s for saving data\n", o.run_info_file);
+    ERRF("[ERROR] Could not open %s for saving data", o.run_info_file);
     exit(1);
   }
   fprintf(f, "pos: %d\n", position);
diff --git a/src/mdtest.c b/src/mdtest.c
index 3394675..558bbf4 100644
--- a/src/mdtest.c
+++ b/src/mdtest.c
@@ -110,6 +110,7 @@ typedef struct {
   char unique_rm_uni_dir[MAX_PATHLEN];
   char *write_buffer;
   char *stoneWallingStatusFile;
+  int gpu_memory_flags;
 
 
   int barriers;
@@ -641,10 +642,7 @@ void mdtest_read(int random, int dirs, const long dir_iter, char *path) {
 
     /* allocate read buffer */
     if (o.read_bytes > 0) {
-        int alloc_res = posix_memalign((void**)&read_buffer, sysconf(_SC_PAGESIZE), o.read_bytes);
-        if (alloc_res) {
-            FAIL("out of memory");
-        }
+        read_buffer = aligned_buffer_alloc(o.read_bytes, o.gpu_memory_flags);
         memset(read_buffer, -1, o.read_bytes);
     }
 
@@ -743,7 +741,7 @@ void mdtest_read(int random, int dirs, const long dir_iter, char *path) {
         o.backend->close (aiori_fh, o.backend_options);
     }
     if(o.read_bytes){
-      free(read_buffer);
+      aligned_buffer_free(read_buffer, o.gpu_memory_flags);
     }
 }
 
@@ -2236,6 +2234,7 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE *
       {'Y', NULL,        "call the sync command after each phase (included in the timing; note it causes all IO to be flushed from your node)", OPTION_FLAG, 'd', & o.call_sync},
       {'z', NULL,        "depth of hierarchical directory structure", OPTION_OPTIONAL_ARGUMENT, 'd', & o.depth},
       {'Z', NULL,        "print time instead of rate", OPTION_FLAG, 'd', & o.print_time},
+      {0, "allocateBufferOnGPU", "Allocate the buffer on the GPU.", OPTION_FLAG, 'd', & o.gpu_memory_flags},
       {0, "warningAsErrors",        "Any warning should lead to an error.", OPTION_FLAG, 'd', & aiori_warning_as_errors},
       {0, "saveRankPerformanceDetails", "Save the individual rank information into this CSV file.", OPTION_OPTIONAL_ARGUMENT, 's', & o.saveRankDetailsCSV},
       LAST_OPTION
@@ -2420,10 +2419,7 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE *
 
     /* allocate and initialize write buffer with # */
     if (o.write_bytes > 0) {
-        int alloc_res = posix_memalign((void**)& o.write_buffer, sysconf(_SC_PAGESIZE), o.write_bytes);
-        if (alloc_res) {
-            FAIL("out of memory");
-        }
+        o.write_buffer = aligned_buffer_alloc(o.write_bytes, o.gpu_memory_flags);
         generate_memory_pattern(o.write_buffer, o.write_bytes, o.random_buffer_offset, rank);
     }
 
@@ -2560,7 +2556,7 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE *
     }
 
     if (o.write_bytes > 0) {
-      free(o.write_buffer);
+      aligned_buffer_free(o.write_buffer, o.gpu_memory_flags);
     }
     free(o.summary_table);
 
diff --git a/src/option.c b/src/option.c
index c44dc9b..7be3df6 100644
--- a/src/option.c
+++ b/src/option.c
@@ -264,11 +264,13 @@ static void option_parse_token(char ** argv, int * flag_parsed_next, int * requi
       return;
   }
   txt++;
-
+  int parsed = 0;
+  
+  // printf("Parsing: %s : %s\n", txt, arg);
   // support groups of multiple flags like -vvv or -vq
   for(int flag_index = 0; flag_index < strlen(txt); ++flag_index){
     // don't loop looking for multiple flags if we already processed a long option
-    if(txt[0] == '-' && flag_index > 0)
+    if(txt[flag_index] == '=' || (txt[0] == '-' && flag_index > 0))
         break;
 
     for(int m = 0; m < opt_all->module_count; m++ ){
@@ -281,6 +283,7 @@ static void option_parse_token(char ** argv, int * flag_parsed_next, int * requi
           continue;
         }
         if ( (o->shortVar == txt[flag_index]) || (strlen(txt) > 2 && txt[0] == '-' && o->longVar != NULL && strcmp(txt + 1, o->longVar) == 0)){
+          //  printf("Found %s %c=%c? %d %d\n", o->help, o->shortVar, txt[flag_index], (o->shortVar == txt[flag_index]), (strlen(txt) > 2 && txt[0] == '-' && o->longVar != NULL && strcmp(txt + 1, o->longVar) == 0));
           // now process the option.
           switch(o->arg){
             case (OPTION_FLAG):{
@@ -370,12 +373,13 @@ static void option_parse_token(char ** argv, int * flag_parsed_next, int * requi
             (*requiredArgsSeen)++;
           }
 
-          return;
+          parsed = 1;
         }
       }
     }
   }
-
+  if(parsed) return;
+  
   if(strcmp(txt, "h") == 0 || strcmp(txt, "-help") == 0){
     *print_help = 1;
   }else{
diff --git a/src/parse_options.c b/src/parse_options.c
index 82fab98..605de91 100755
--- a/src/parse_options.c
+++ b/src/parse_options.c
@@ -62,7 +62,17 @@ static void CheckRunSettings(IOR_test_t *tests)
                 }
 
                 if(params->dualMount && !params->filePerProc) {
-                        MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, -1), "Dual Mount can only be used with File Per Process");
+                  ERR("Dual Mount can only be used with File Per Process");
+                }
+
+                if(params->gpuDirect){
+                  if(params->gpuMemoryFlags == IOR_MEMORY_TYPE_GPU_MANAGED){
+                    ERR("GPUDirect cannot be used with managed memory");
+                  }
+                  params->gpuMemoryFlags = IOR_MEMORY_TYPE_GPU_DEVICE_ONLY;
+                  if(params->checkRead || params->checkWrite){
+                    ERR("GPUDirect data cannot yet be checked");
+                  }
                 }
         }
 }
@@ -138,6 +148,12 @@ void DecodeDirective(char *line, IOR_param_t *params, options_all_t * module_opt
                 params->testFileName  = strdup(value);
         } else if (strcasecmp(option, "dualmount") == 0){
                 params->dualMount = atoi(value);
+        } else if (strcasecmp(option, "allocateBufferOnGPU") == 0) {
+                params->gpuMemoryFlags = atoi(value);
+        } else if (strcasecmp(option, "GPUid") == 0) {
+                params->gpuID = atoi(value);
+        } else if (strcasecmp(option, "GPUDirect") == 0) {
+                params->gpuDirect  = atoi(value);
         } else if (strcasecmp(option, "deadlineforstonewalling") == 0) {
                 params->deadlineForStonewalling = atoi(value);
         } else if (strcasecmp(option, "stoneWallingWearOut") == 0) {
@@ -413,6 +429,13 @@ option_help * createGlobalOptions(IOR_param_t * params){
     {.help="  -O stoneWallingWearOut=1           -- once the stonewalling timeout is over, all process finish to access the amount of data", .arg = OPTION_OPTIONAL_ARGUMENT},
     {.help="  -O stoneWallingWearOutIterations=N -- stop after processing this number of iterations, needed for reading data back written with stoneWallingWearOut", .arg = OPTION_OPTIONAL_ARGUMENT},
     {.help="  -O stoneWallingStatusFile=FILE     -- this file keeps the number of iterations from stonewalling during write and allows to use them for read", .arg = OPTION_OPTIONAL_ARGUMENT},
+#ifdef HAVE_CUDA
+    {.help="  -O allocateBufferOnGPU=X           -- allocate I/O buffers on the GPU: X=1 uses managed memory, X=2 device memory.", .arg = OPTION_OPTIONAL_ARGUMENT},
+    {.help="  -O GPUid=X                         -- select the GPU to use.", .arg = OPTION_OPTIONAL_ARGUMENT},
+#ifdef HAVE_GPU_DIRECT
+    {0, "gpuDirect",        "allocate I/O buffers on the GPU and use gpuDirect to store data; this option is incompatible with any option requiring CPU access to data.", OPTION_FLAG, 'd', & params->gpuDirect},
+#endif
+#endif
     {'e', NULL,        "fsync -- perform a fsync() operation at the end of each read/write phase", OPTION_FLAG, 'd', & params->fsync},
     {'E', NULL,        "useExistingTestFile -- do not remove test file before write access", OPTION_FLAG, 'd', & params->useExistingTestFile},
     {'f', NULL,        "scriptFile -- test script name", OPTION_OPTIONAL_ARGUMENT, 's', & params->testscripts},
diff --git a/src/utilities.c b/src/utilities.c
index 16a31b0..9ab5432 100755
--- a/src/utilities.c
+++ b/src/utilities.c
@@ -37,6 +37,10 @@
 #include <sys/types.h>
 #include <time.h>
 
+#ifdef HAVE_CUDA
+#include <cuda_runtime.h>
+#endif
+
 #ifndef _WIN32
 #  include <regex.h>
 #  ifdef __sun                    /* SunOS does not support statfs(), instead uses statvfs() */
@@ -210,7 +214,7 @@ void updateParsedOptions(IOR_param_t * options, options_all_t * global_options){
 /* Used in aiori-POSIX.c and aiori-PLFS.c
  */
 
-void set_o_direct_flag(int *fd)
+void set_o_direct_flag(int *flag)
 {
 /* note that TRU64 needs O_DIRECTIO, SunOS uses directio(),
    and everyone else needs O_DIRECT */
@@ -223,7 +227,7 @@ void set_o_direct_flag(int *fd)
 #  endif                          /* not O_DIRECTIO */
 #endif                            /* not O_DIRECT */
 
-        *fd |= O_DIRECT;
+        *flag |= O_DIRECT;
 }
 
 
@@ -911,3 +915,72 @@ unsigned long GetProcessorAndCore(int *chip, int *core){
 	return 1;
 }
 #endif
+
+
+
+/*
+ * Allocate a page-aligned (required by O_DIRECT) buffer.
+ */
+void *aligned_buffer_alloc(size_t size, ior_memory_flags type)
+{
+  size_t pageMask;
+  char *buf, *tmp;
+  char *aligned;
+
+  if(type == IOR_MEMORY_TYPE_GPU_MANAGED){
+#ifdef HAVE_CUDA
+    // use unified memory here to allow drop-in-replacement
+    if (cudaMallocManaged((void**) & buf, size, cudaMemAttachGlobal) != cudaSuccess){
+      ERR("Cannot allocate buffer on GPU");
+    }
+    return buf;
+#else
+    ERR("No CUDA supported, cannot allocate on the GPU");
+#endif
+  }else if(type == IOR_MEMORY_TYPE_GPU_DEVICE_ONLY){
+#ifdef HAVE_GPU_DIRECT
+      if (cudaMalloc((void**) & buf, size) != cudaSuccess){
+        ERR("Cannot allocate buffer on GPU");
+      }
+      return buf;
+#else
+      ERR("No GPUDirect supported, cannot allocate on the GPU");
+#endif
+    }
+
+#ifdef HAVE_SYSCONF
+  long pageSize = sysconf(_SC_PAGESIZE);
+#else
+  size_t pageSize = getpagesize();
+#endif
+
+  pageMask = pageSize - 1;
+  buf = safeMalloc(size + pageSize + sizeof(void *));
+  /* find the alinged buffer */
+  tmp = buf + sizeof(char *);
+  aligned = tmp + pageSize - ((size_t) tmp & pageMask);
+  /* write a pointer to the original malloc()ed buffer into the bytes
+     preceding "aligned", so that the aligned buffer can later be free()ed */
+  tmp = aligned - sizeof(void *);
+  *(void **)tmp = buf;
+
+  return (void *)aligned;
+}
+
+/*
+ * Free a buffer allocated by aligned_buffer_alloc().
+ */
+void aligned_buffer_free(void *buf, ior_memory_flags gpu)
+{
+  if(gpu){
+#ifdef HAVE_CUDA
+    if (cudaFree(buf) != cudaSuccess){
+      WARN("Cannot free buffer on GPU");
+    }
+    return;
+#else
+    ERR("No CUDA supported, cannot free on the GPU");
+#endif
+  }
+  free(*(void **)((char *)buf - sizeof(char *)));
+}
diff --git a/src/utilities.h b/src/utilities.h
index 202bcad..97dc2c0 100755
--- a/src/utilities.h
+++ b/src/utilities.h
@@ -63,5 +63,6 @@ void init_clock(MPI_Comm com);
 double GetTimeStamp(void);
 char * PrintTimestamp(); // TODO remove this function
 unsigned long GetProcessorAndCore(int *chip, int *core);
-
+void *aligned_buffer_alloc(size_t size, ior_memory_flags type);
+void aligned_buffer_free(void *buf, ior_memory_flags type);
 #endif  /* !_UTILITIES_H */