diff --git a/configure.ac b/configure.ac index cd71b39..d9f302d 100755 --- a/configure.ac +++ b/configure.ac @@ -186,6 +186,21 @@ AM_COND_IF([USE_POSIX_AIORI],[ AC_DEFINE([USE_POSIX_AIORI], [], [Build POSIX backend AIORI]) ]) +# PMDK IO support +AC_ARG_WITH([pmdk], + [AS_HELP_STRING([--with-pmdk], + [support IO with PMDK backend @<:@default=no@:>@])], + [], + [with_pmdk=no]) +AM_CONDITIONAL([USE_PMDK_AIORI], [test x$with_pmdk = xyes]) +AS_IF([test "x$with_pmdk" != xno], [ + AC_DEFINE([USE_PMDK_AIORI], [], [Build PMDK backend AIORI]) + AC_CHECK_HEADERS(libpmem.h,, [unset PMDK]) + AC_SEARCH_LIBS([pmem_map_file], [pmdk], + [AC_MSG_ERROR([Library containing pmdk symbols not found])]) +]) + + # RADOS support AC_ARG_WITH([rados], [AS_HELP_STRING([--with-rados], diff --git a/src/Makefile.am b/src/Makefile.am index 3786560..567d9ce 100755 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -65,6 +65,11 @@ if USE_POSIX_AIORI extraSOURCES += aiori-POSIX.c endif +if USE_PMDK_AIORI +extraSOURCES += aiori-PMDK.c +extraLDADD += -lpmem +endif + if USE_RADOS_AIORI extraSOURCES += aiori-RADOS.c extraLDADD += -lrados diff --git a/src/aiori-PMDK.c b/src/aiori-PMDK.c new file mode 100644 index 0000000..4a3953b --- /dev/null +++ b/src/aiori-PMDK.c @@ -0,0 +1,257 @@ +/******************************************************************************\ + * * + * Copyright (c) 2019 EPCC, The University of Edinburgh * + * Written by Adrian Jackson a.jackson@epcc.ed.ac.uk * + * * + ******************************************************************************* + * * + * * + * This file implements the abstract I/O interface for the low-level PMDK API * + * * +\******************************************************************************/ + + +#include "aiori.h" /* abstract IOR interface */ +#include /* sys_errlist */ +#include /* only for fprintf() */ +#include +#include +#include + + + +static option_help options [] = { + LAST_OPTION +}; + + +/**************************** P R O T O T Y P E S *****************************/ + +static option_help * PMDK_options(); +static void *PMDK_Create(char *, IOR_param_t *); +static void *PMDK_Open(char *, IOR_param_t *); +static IOR_offset_t PMDK_Xfer(int, void *, IOR_size_t *, IOR_offset_t, IOR_param_t *); +static void PMDK_Fsync(void *, IOR_param_t *); +static void PMDK_Close(void *, IOR_param_t *); +static void PMDK_Delete(char *, IOR_param_t *); +static IOR_offset_t PMDK_GetFileSize(IOR_param_t *, MPI_Comm, char *); + + +/************************** D E C L A R A T I O N S ***************************/ + +extern int errno; +extern int rank; +extern int rankOffset; +extern int verbose; +extern MPI_Comm testComm; + +ior_aiori_t pmdk_aiori = { + .name = "PMDK", + .name_legacy = NULL, + .create = PMDK_Create, + .open = PMDK_Open, + .xfer = PMDK_Xfer, + .close = PMDK_Close, + .delete = PMDK_Delete, + .get_version = aiori_get_version, + .fsync = PMDK_Fsync, + .get_file_size = PMDK_GetFileSize, + .statfs = aiori_posix_statfs, + .mkdir = aiori_posix_mkdir, + .rmdir = aiori_posix_rmdir, + .access = aiori_posix_access, + .stat = aiori_posix_stat, + .get_options = PMDK_options, + .enable_mdtest = false, +}; + + +/***************************** F U N C T I O N S ******************************/ + +/******************************************************************************/ + +static option_help * PMDK_options(){ + return options; +} + + +/* + * Create and open a memory space through the PMDK interface. + */ +static void *PMDK_Create(char * testFileName, IOR_param_t * param){ + char *pmemaddr = NULL; + int is_pmem; + size_t mapped_len; + size_t open_length; + + if(!param->filePerProc){ + fprintf(stdout, "\nPMDK functionality can only be used with filePerProc functionality\n"); + MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, -1), "MPI_Abort() error"); + } + + open_length = param->blockSize * param->segmentCount; + + if((pmemaddr = pmem_map_file(testFileName, open_length, + PMEM_FILE_CREATE|PMEM_FILE_EXCL, + 0666, &mapped_len, &is_pmem)) == NULL) { + fprintf(stdout, "\nFailed to pmem_map_file for filename: %s in IOR_Create_PMDK\n", testFileName); + perror("pmem_map_file"); + MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, -1), "MPI_Abort() error"); + } + + if(!is_pmem){ + fprintf(stdout, "\n is_pmem is %d\n",is_pmem); + fprintf(stdout, "\npmem_map_file thinks the hardware being used is not pmem\n"); + MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, -1), "MPI_Abort() error"); + } + + + + return((void *)pmemaddr); +} /* PMDK_Create() */ + + +/******************************************************************************/ +/* + * Open a memory space through the PMDK interface. + */ + +static void *PMDK_Open(char * testFileName, IOR_param_t * param){ + + char *pmemaddr = NULL; + int is_pmem; + size_t mapped_len; + size_t open_length; + + if(!param->filePerProc){ + fprintf(stdout, "\nPMDK functionality can only be used with filePerProc functionality\n"); + MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, -1), "MPI_Abort() error"); + } + + open_length = param->blockSize * param->segmentCount; + + if((pmemaddr = pmem_map_file(testFileName, 0, + PMEM_FILE_EXCL, + 0666, &mapped_len, &is_pmem)) == NULL) { + fprintf(stdout, "\nFailed to pmem_map_file for filename: %s\n in IOR_Open_PMDK", testFileName); + perror("pmem_map_file"); + fprintf(stdout, "\n %ld %ld\n",open_length, mapped_len); + MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, -1), "MPI_Abort() error"); + } + + if(!is_pmem){ + fprintf(stdout, "pmem_map_file thinks the hardware being used is not pmem\n"); + MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, -1), "MPI_Abort() error"); + } + + return((void *)pmemaddr); +} /* PMDK_Open() */ + + +/******************************************************************************/ +/* + * Write or read access to a memory space created with PMDK. Include drain/flush functionality. + */ + +static IOR_offset_t PMDK_Xfer(int access, void *file, IOR_size_t * buffer, + IOR_offset_t length, IOR_param_t * param){ + int xferRetries = 0; + long long remaining = (long long)length; + char * ptr = (char *)buffer; + long long rc; + long long i; + long long offset_size; + + offset_size = param->offset; + + if(access == WRITE){ + if(param->fsync){ + pmem_memcpy_nodrain(&file[offset_size], ptr, length); + }else{ + pmem_memcpy_persist(&file[offset_size], ptr, length); + } + }else{ + memcpy(ptr, &file[offset_size], length); + } + + return(length); +} /* PMDK_Xfer() */ + + +/******************************************************************************/ +/* + * Perform fsync(). + */ + +static void PMDK_Fsync(void *fd, IOR_param_t * param) +{ + pmem_drain(); +} /* PMDK_Fsync() */ + + +/******************************************************************************/ +/* + * Stub for close functionality that is not required for PMDK + */ + +static void PMDK_Close(void *fd, IOR_param_t * param){ + size_t open_length; + open_length = param->transferSize; + pmem_unmap(fd, open_length); + +} /* PMDK_Close() */ + + +/******************************************************************************/ +/* + * Delete the file backing a memory space through PMDK + */ + +static void PMDK_Delete(char *testFileName, IOR_param_t * param) +{ + char errmsg[256]; + sprintf(errmsg,"[RANK %03d]:cannot delete file %s\n",rank,testFileName); + if (unlink(testFileName) != 0) WARN(errmsg); +} /* PMDK_Delete() */ + + +/******************************************************************************/ +/* + * Determine api version. + */ + +static void PMDK_SetVersion(IOR_param_t *test) +{ + strcpy(test->apiVersion, test->api); +} /* PMDK_SetVersion() */ + + +/******************************************************************************/ +/* + * Use POSIX stat() to return aggregate file size. + */ + +static IOR_offset_t PMDK_GetFileSize(IOR_param_t * test, + MPI_Comm testComm, + char * testFileName) +{ + struct stat stat_buf; + IOR_offset_t aggFileSizeFromStat, + tmpMin, tmpMax, tmpSum; + if (test->filePerProc == FALSE) { + fprintf(stdout, "\nPMDK functionality can only be used with filePerProc functionality\n"); + MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, -1), "MPI_Abort() error"); + } + + if (stat(testFileName, &stat_buf) != 0) { + ERR("cannot get status of written file"); + } + aggFileSizeFromStat = stat_buf.st_size; + + MPI_CHECK(MPI_Allreduce(&aggFileSizeFromStat, &tmpSum, 1, + MPI_LONG_LONG_INT, MPI_SUM, testComm), + "cannot total data moved"); + aggFileSizeFromStat = tmpSum; + + return(aggFileSizeFromStat); +} /* PMDK_GetFileSize() */ diff --git a/src/aiori.c b/src/aiori.c index f66b9bc..303f367 100644 --- a/src/aiori.c +++ b/src/aiori.c @@ -42,6 +42,9 @@ ior_aiori_t *available_aiori[] = { #ifdef USE_POSIX_AIORI &posix_aiori, #endif +#ifdef USE_PMDK_AIORI + &pmdk_aiori, +#endif #ifdef USE_DAOS_AIORI &daos_aiori, &dfs_aiori, diff --git a/src/aiori.h b/src/aiori.h index 37bc3c0..ad10e4d 100755 --- a/src/aiori.h +++ b/src/aiori.h @@ -133,6 +133,7 @@ extern ior_aiori_t ime_aiori; extern ior_aiori_t mpiio_aiori; extern ior_aiori_t ncmpi_aiori; extern ior_aiori_t posix_aiori; +extern ior_aiori_t pmdk_aiori; extern ior_aiori_t mmap_aiori; extern ior_aiori_t s3_aiori; extern ior_aiori_t s3_plus_aiori; diff --git a/src/ior.c b/src/ior.c index f05defb..08f95ef 100755 --- a/src/ior.c +++ b/src/ior.c @@ -771,10 +771,17 @@ void GetTestFileName(char *testFileName, IOR_param_t * test) char initialTestFileName[MAX_PATHLEN]; char testFileNameRoot[MAX_STR]; char tmpString[MAX_STR]; - int count; + int count; + int socket, core; /* parse filename for multiple file systems */ strcpy(initialTestFileName, test->testFileName); + if(test->dualMount){ + GetProcessorAndCore(&socket, &core); + sprintf(tmpString, "%s%d/%s",initialTestFileName, + socket, "data"); + strcpy(initialTestFileName, tmpString); + } fileNames = ParseFileName(initialTestFileName, &count); if (count > 1 && test->uniqueDir == TRUE) ERR("cannot use multiple file names with unique directories"); diff --git a/src/ior.h b/src/ior.h index 202fffb..c3d9ad4 100755 --- a/src/ior.h +++ b/src/ior.h @@ -94,6 +94,7 @@ typedef struct int collective; /* collective I/O */ MPI_Comm testComm; /* MPI communicator */ int dryRun; /* do not perform any I/Os just run evtl. inputs print dummy output */ + int dualMount; /* dual mount points */ int numTasks; /* number of tasks for test */ int numNodes; /* number of nodes for test */ int numTasksOnNode0; /* number of tasks on node 0 (usually all the same, but don't have to be, use with caution) */ diff --git a/src/parse_options.c b/src/parse_options.c index 97e10dc..ce5421c 100755 --- a/src/parse_options.c +++ b/src/parse_options.c @@ -60,6 +60,10 @@ static void CheckRunSettings(IOR_test_t *tests) params->readFile = TRUE; params->writeFile = TRUE; } + + if(params->dualMount && !params->filePerProc) { + MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, -1), "Dual Mount can only be used with File Per Process"); + } } } @@ -117,6 +121,8 @@ void DecodeDirective(char *line, IOR_param_t *params, options_all_t * module_opt params->platform = strdup(value); } else if (strcasecmp(option, "testfile") == 0) { params->testFileName = strdup(value); + } else if (strcasecmp(option, "dualmount") == 0){ + params->dualMount = atoi(value); } else if (strcasecmp(option, "deadlineforstonewalling") == 0) { params->deadlineForStonewalling = atoi(value); } else if (strcasecmp(option, "stoneWallingWearOut") == 0) { @@ -425,6 +431,7 @@ option_help * createGlobalOptions(IOR_param_t * params){ {'W', NULL, "checkWrite -- check read after write", OPTION_FLAG, 'd', & params->checkWrite}, {'x', NULL, "singleXferAttempt -- do not retry transfer if incomplete", OPTION_FLAG, 'd', & params->singleXferAttempt}, {'X', NULL, "reorderTasksRandomSeed -- random seed for -Z option", OPTION_OPTIONAL_ARGUMENT, 'd', & params->reorderTasksRandomSeed}, + {'y', NULL, "dualMount -- use dual mount points for a filesystem", OPTION_FLAG, 'd', & params->dualMount}, {'Y', NULL, "fsyncPerWrite -- perform sync operation after every write operation", OPTION_FLAG, 'd', & params->fsyncPerWrite}, {'z', NULL, "randomOffset -- access is to random, not sequential, offsets within a file", OPTION_FLAG, 'd', & params->randomOffset}, {'Z', NULL, "reorderTasksRandom -- changes task ordering to random ordering for readback", OPTION_FLAG, 'd', & params->reorderTasksRandom}, diff --git a/src/utilities.c b/src/utilities.c index 33b23d8..715e30d 100755 --- a/src/utilities.c +++ b/src/utilities.c @@ -868,3 +868,25 @@ char *HumanReadable(IOR_offset_t value, int base) } return valueStr; } + +#if defined(__aarch64__) +// TODO: This might be general enough to provide the functionality for any system +// regardless of processor type given we aren't worried about thread/process migration. +// Test on Intel systems and see if we can get rid of the architecture specificity +// of the code. +unsigned long GetProcessorAndCore(int *chip, int *core){ + return syscall(SYS_getcpu, core, chip, NULL); +} +// TODO: Add in AMD function +#else +// If we're not on an ARM processor assume we're on an intel processor and use the +// rdtscp instruction. +unsigned long GetProcessorAndCore(int *chip, int *core){ + unsigned long a,d,c; + __asm__ volatile("rdtscp" : "=a" (a), "=d" (d), "=c" (c)); + *chip = (c & 0xFFF000)>>12; + *core = c & 0xFFF; + return ((unsigned long)a) | (((unsigned long)d) << 32);; +} +#endif + diff --git a/src/utilities.h b/src/utilities.h index a03e17c..32292a4 100755 --- a/src/utilities.h +++ b/src/utilities.h @@ -68,6 +68,7 @@ void StoreStoneWallingIterations(char * const filename, int64_t count); void init_clock(void); double GetTimeStamp(void); char * PrintTimestamp(); // TODO remove this function +unsigned long GetProcessorAndCore(int *chip, int *core); extern double wall_clock_deviation; extern double wall_clock_delta;