/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: */ /******************************************************************************\ * * * Copyright (c) 2003, The Regents of the University of California * * See the file COPYRIGHT for a complete copyright notice and license. * * * \******************************************************************************/ #ifdef HAVE_CONFIG_H # include "config.h" #endif #include #include #include #include /* tolower() */ #include #include #include #include #if defined(HAVE_STRINGS_H) #include #endif #include /* struct stat */ #include #ifndef _WIN32 # include /* gettimeofday() */ # include /* uname() */ #endif #include #include "ior.h" #include "ior-internal.h" #include "aiori.h" #include "utilities.h" #include "parse_options.h" #define IOR_NB_TIMERS 6 /* file scope globals */ extern char **environ; static int totalErrorCount; static const ior_aiori_t *backend; static void DestroyTests(IOR_test_t *tests_head); static char *PrependDir(IOR_param_t *, char *); static char **ParseFileName(char *, int *); static void InitTests(IOR_test_t * , MPI_Comm); static void TestIoSys(IOR_test_t *); static void ValidateTests(IOR_param_t *); static IOR_offset_t WriteOrRead(IOR_param_t *test, IOR_results_t *results, aiori_fd_t *fd, const int access, IOR_io_buffers *ioBuffers); static void ior_set_xfer_hints(IOR_param_t * p){ aiori_xfer_hint_t * hints = & p->hints; hints->dryRun = p->dryRun; hints->filePerProc = p->filePerProc; hints->collective = p->collective; hints->numTasks = p->numTasks; hints->numNodes = p->numNodes; hints->randomOffset = p->randomOffset; hints->fsyncPerWrite = p->fsyncPerWrite; hints->segmentCount = p->segmentCount; hints->blockSize = p->blockSize; hints->transferSize = p->transferSize; hints->expectedAggFileSize = p->expectedAggFileSize; hints->singleXferAttempt = p->singleXferAttempt; if(backend->xfer_hints){ backend->xfer_hints(hints); } } int aiori_warning_as_errors 
= 0; static void test_initialize(IOR_test_t * test){ verbose = test->params.verbose; backend = test->params.backend; if(backend->initialize){ backend->initialize(test->params.backend_options); } ior_set_xfer_hints(& test->params); aiori_warning_as_errors = test->params.warningAsErrors; if (rank == 0 && verbose >= VERBOSE_0) { ShowTestStart(& test->params); } } static void test_finalize(IOR_test_t * test){ backend = test->params.backend; if(backend->finalize){ backend->finalize(test->params.backend_options); } } IOR_test_t * ior_run(int argc, char **argv, MPI_Comm world_com, FILE * world_out){ IOR_test_t *tests_head; IOR_test_t *tptr; out_logfile = world_out; out_resultfile = world_out; mpi_comm_world = world_com; MPI_CHECK(MPI_Comm_rank(mpi_comm_world, &rank), "cannot get rank"); /* setup tests, and validate parameters */ tests_head = ParseCommandLine(argc, argv); InitTests(tests_head, world_com); PrintHeader(argc, argv); /* perform each test */ for (tptr = tests_head; tptr != NULL; tptr = tptr->next) { test_initialize(tptr); totalErrorCount = 0; TestIoSys(tptr); tptr->results->errors = totalErrorCount; ShowTestEnd(tptr); test_finalize(tptr); } PrintLongSummaryAllTests(tests_head); /* display finish time */ PrintTestEnds(); return tests_head; } int ior_main(int argc, char **argv) { IOR_test_t *tests_head; IOR_test_t *tptr; out_logfile = stdout; out_resultfile = stdout; /* * check -h option from commandline without starting MPI; */ tests_head = ParseCommandLine(argc, argv); /* start the MPI code */ MPI_CHECK(MPI_Init(&argc, &argv), "cannot initialize MPI"); mpi_comm_world = MPI_COMM_WORLD; MPI_CHECK(MPI_Comm_rank(mpi_comm_world, &rank), "cannot get rank"); /* set error-handling */ /*MPI_CHECK(MPI_Errhandler_set(mpi_comm_world, MPI_ERRORS_RETURN), "cannot set errhandler"); */ /* setup tests, and validate parameters */ InitTests(tests_head, mpi_comm_world); PrintHeader(argc, argv); /* perform each test */ for (tptr = tests_head; tptr != NULL; tptr = tptr->next) { 
test_initialize(tptr); // This is useful for trapping a running MPI process. While // this is sleeping, run the script 'testing/hdfs/gdb.attach' if (verbose >= VERBOSE_4) { fprintf(out_logfile, "\trank %d: sleeping\n", rank); sleep(5); fprintf(out_logfile, "\trank %d: awake.\n", rank); } TestIoSys(tptr); ShowTestEnd(tptr); test_finalize(tptr); } if (verbose <= VERBOSE_0) /* always print final summary */ verbose = VERBOSE_1; PrintLongSummaryAllTests(tests_head); /* display finish time */ PrintTestEnds(); MPI_CHECK(MPI_Finalize(), "cannot finalize MPI"); DestroyTests(tests_head); return totalErrorCount; } /***************************** F U N C T I O N S ******************************/ /* * Initialize an IOR_param_t structure to the defaults */ void init_IOR_Param_t(IOR_param_t * p) { const char *default_aiori = aiori_default (); char *hdfs_user; assert (NULL != default_aiori); memset(p, 0, sizeof(IOR_param_t)); p->api = strdup(default_aiori); p->platform = strdup("HOST(OSTYPE)"); p->testFileName = strdup("testFile"); p->writeFile = p->readFile = FALSE; p->checkWrite = p->checkRead = FALSE; /* * These can be overridden from the command-line but otherwise will be * set from MPI. */ p->numTasks = -1; p->numNodes = -1; p->numTasksOnNode0 = -1; p->repetitions = 1; p->repCounter = -1; p->open = WRITE; p->taskPerNodeOffset = 1; p->segmentCount = 1; p->blockSize = 1048576; p->transferSize = 262144; p->randomSeed = -1; p->incompressibleSeed = 573; p->testComm = mpi_comm_world; hdfs_user = getenv("USER"); if (!hdfs_user) hdfs_user = ""; p->hdfs_user = strdup(hdfs_user); p->hdfs_name_node = "default"; p->hdfs_name_node_port = 0; /* ??? 
*/ p->hdfs_fs = NULL; p->hdfs_replicas = 0; /* invokes the default */ p->hdfs_block_size = 0; p->URI = NULL; p->part_number = 0; } static void DisplayOutliers(int numTasks, double timerVal, char *timeString, int access, int outlierThreshold) { char accessString[MAX_STR]; double sum, mean, sqrDiff, var, sd; /* for local timerVal, don't compensate for wall clock delta */ timerVal += wall_clock_delta; MPI_CHECK(MPI_Allreduce (&timerVal, &sum, 1, MPI_DOUBLE, MPI_SUM, testComm), "MPI_Allreduce()"); mean = sum / numTasks; sqrDiff = pow((mean - timerVal), 2); MPI_CHECK(MPI_Allreduce (&sqrDiff, &var, 1, MPI_DOUBLE, MPI_SUM, testComm), "MPI_Allreduce()"); var = var / numTasks; sd = sqrt(var); if (access == WRITE) { strcpy(accessString, "write"); } else { /* READ */ strcpy(accessString, "read"); } if (fabs(timerVal - mean) > (double)outlierThreshold) { char hostname[MAX_STR]; int ret = gethostname(hostname, MAX_STR); if (ret != 0) strcpy(hostname, "unknown"); EWARNF("for %s, task %d, %s %s is %f (mean=%f, stddev=%f)\n", hostname, rank, accessString, timeString, timerVal, mean, sd); } } /* * Check for outliers in start/end times and elapsed create/xfer/close times. */ static void CheckForOutliers(IOR_param_t *test, const double *timer, const int access) { DisplayOutliers(test->numTasks, timer[0], "start time", access, test->outlierThreshold); DisplayOutliers(test->numTasks, timer[1] - timer[0], "elapsed create time", access, test->outlierThreshold); DisplayOutliers(test->numTasks, timer[3] - timer[2], "elapsed transfer time", access, test->outlierThreshold); DisplayOutliers(test->numTasks, timer[5] - timer[4], "elapsed close time", access, test->outlierThreshold); DisplayOutliers(test->numTasks, timer[5], "end time", access, test->outlierThreshold); } /* * Check if actual file size equals expected size; if not use actual for * calculating performance rate. 
*/
static void
CheckFileSize(IOR_test_t *test, IOR_offset_t dataMoved, int rep,
              const int access)
{
        IOR_param_t *params = &test->params;
        IOR_point_t *point;
        int apiIsChecked;

        /* pick the result slot of this repetition for the given access */
        if (access == WRITE) {
                point = &test->results[rep].write;
        } else {
                point = &test->results[rep].read;
        }

        /* total the bytes moved by all tasks of the test communicator */
        MPI_CHECK(MPI_Allreduce(&dataMoved, &point->aggFileSizeFromXfer,
                                1, MPI_LONG_LONG_INT, MPI_SUM, testComm),
                  "cannot total data moved");

        /* the size comparison is skipped for the HDF5, NCMPI and DAOS APIs */
        apiIsChecked = strcasecmp(params->api, "HDF5") != 0
                    && strcasecmp(params->api, "NCMPI") != 0
                    && strcasecmp(params->api, "DAOS") != 0;

        if (apiIsChecked && verbose >= VERBOSE_0 && rank == 0) {
                int sizeMismatch =
                        (params->expectedAggFileSize != point->aggFileSizeFromXfer)
                        || (point->aggFileSizeFromStat != point->aggFileSizeFromXfer);

                if (sizeMismatch) {
                        EWARNF("Expected aggregate file size       = %lld",
                               (long long) params->expectedAggFileSize);
                        EWARNF("Stat() of aggregate file size      = %lld",
                               (long long) point->aggFileSizeFromStat);
                        EWARNF("Using actual aggregate bytes moved = %lld",
                               (long long) point->aggFileSizeFromXfer);
                        if (params->deadlineForStonewalling) {
                                EWARN("Maybe caused by deadlineForStonewalling");
                        }
                }
        }

        /* bandwidth is always computed from the bytes actually transferred */
        point->aggFileSizeForBW = point->aggFileSizeFromXfer;
}

/*
 * Compare buffers after reading/writing each transfer.  Displays only first
 * difference in buffers and returns total errors counted.
*/ static size_t CompareBuffers(void *expectedBuffer, void *unknownBuffer, size_t size, IOR_offset_t transferCount, IOR_param_t *test, int access) { char testFileName[MAX_PATHLEN]; char bufferLabel1[MAX_STR]; char bufferLabel2[MAX_STR]; size_t i, j, length, first, last; size_t errorCount = 0; int inError = 0; unsigned long long *goodbuf = (unsigned long long *)expectedBuffer; unsigned long long *testbuf = (unsigned long long *)unknownBuffer; if (access == WRITECHECK || access == READCHECK) { strcpy(bufferLabel1, "Expected: "); strcpy(bufferLabel2, "Actual: "); } else { ERR("incorrect argument for CompareBuffers()"); } length = size / sizeof(IOR_size_t); first = -1; if (verbose >= VERBOSE_3) { fprintf(out_logfile, "[%d] At file byte offset %lld, comparing %llu-byte transfer\n", rank, (long long) offset, (long long)size); } for (i = 0; i < length; i++) { if (testbuf[i] != goodbuf[i]) { errorCount++; if (verbose >= VERBOSE_2) { fprintf(out_logfile, "[%d] At transfer buffer #%lld, index #%lld (file byte offset %lld):\n", rank, transferCount - 1, (long long)i, (long long) offset + (IOR_size_t) (i * sizeof(IOR_size_t))); fprintf(out_logfile, "[%d] %s0x", rank, bufferLabel1); fprintf(out_logfile, "%016llx\n", goodbuf[i]); fprintf(out_logfile, "[%d] %s0x", rank, bufferLabel2); fprintf(out_logfile, "%016llx\n", testbuf[i]); } if (!inError) { inError = 1; first = i; last = i; } else { last = i; } } else if (verbose >= VERBOSE_5 && i % 4 == 0) { fprintf(out_logfile, "[%d] PASSED offset = %lu bytes, transfer %lld\n", rank, ((i * sizeof(unsigned long long)) + offset), transferCount); fprintf(out_logfile, "[%d] GOOD %s0x", rank, bufferLabel1); for (j = 0; j < 4; j++) fprintf(out_logfile, "%016llx ", goodbuf[i + j]); fprintf(out_logfile, "\n[%d] GOOD %s0x", rank, bufferLabel2); for (j = 0; j < 4; j++) fprintf(out_logfile, "%016llx ", testbuf[i + j]); fprintf(out_logfile, "\n"); } } if (inError) { inError = 0; GetTestFileName(testFileName, test); EWARNF("[%d] FAILED comparison of 
buffer containing %d-byte ints:\n", rank, (int)sizeof(unsigned long long int)); fprintf(out_logfile, "[%d] File name = %s\n", rank, testFileName); fprintf(out_logfile, "[%d] In transfer %lld, ", rank, transferCount); fprintf(out_logfile, "%lld errors between buffer indices %lld and %lld.\n", (long long)errorCount, (long long)first, (long long)last); fprintf(out_logfile, "[%d] File byte offset = %lu:\n", rank, ((first * sizeof(unsigned long long)) + offset)); fprintf(out_logfile, "[%d] %s0x", rank, bufferLabel1); for (j = first; j < length && j < first + 4; j++) fprintf(out_logfile, "%016llx ", goodbuf[j]); if (j == length) fprintf(out_logfile, "[end of buffer]"); fprintf(out_logfile, "\n[%d] %s0x", rank, bufferLabel2); for (j = first; j < length && j < first + 4; j++) fprintf(out_logfile, "%016llx ", testbuf[j]); if (j == length) fprintf(out_logfile, "[end of buffer]"); fprintf(out_logfile, "\n"); } return (errorCount); } /* * Count all errors across all tasks; report errors found. */ static int CountErrors(IOR_param_t * test, int access, int errors) { int allErrors = 0; if (test->checkWrite || test->checkRead) { MPI_CHECK(MPI_Reduce(&errors, &allErrors, 1, MPI_INT, MPI_SUM, 0, testComm), "cannot reduce errors"); MPI_CHECK(MPI_Bcast(&allErrors, 1, MPI_INT, 0, testComm), "cannot broadcast allErrors value"); if (allErrors != 0) { totalErrorCount += allErrors; test->errorFound = TRUE; } if (rank == 0 && allErrors != 0) { if (allErrors < 0) { WARN("overflow in errors counted"); allErrors = -1; } EWARNF("Incorrect data on %s (%d errors found).\n", access == WRITECHECK ? "write" : "read", allErrors); fprintf(out_logfile, "Used Time Stamp %u (0x%x) for Data Signature\n", test->timeStampSignatureValue, test->timeStampSignatureValue); } } return (allErrors); } /* * Allocate a page-aligned (required by O_DIRECT) buffer. 
*/ static void *aligned_buffer_alloc(size_t size) { size_t pageMask; char *buf, *tmp; char *aligned; #ifdef HAVE_SYSCONF long pageSize = sysconf(_SC_PAGESIZE); #else size_t pageSize = getpagesize(); #endif pageMask = pageSize - 1; buf = malloc(size + pageSize + sizeof(void *)); if (buf == NULL) ERR("out of memory"); /* find the alinged buffer */ tmp = buf + sizeof(char *); aligned = tmp + pageSize - ((size_t) tmp & pageMask); /* write a pointer to the original malloc()ed buffer into the bytes preceding "aligned", so that the aligned buffer can later be free()ed */ tmp = aligned - sizeof(void *); *(void **)tmp = buf; return (void *)aligned; } /* * Free a buffer allocated by aligned_buffer_alloc(). */ static void aligned_buffer_free(void *buf) { free(*(void **)((char *)buf - sizeof(char *))); } void AllocResults(IOR_test_t *test) { int reps; if (test->results != NULL) return; reps = test->params.repetitions; test->results = (IOR_results_t *) safeMalloc(sizeof(IOR_results_t) * reps); } void FreeResults(IOR_test_t *test) { if (test->results != NULL) { free(test->results); } } /** * Create new test for list of tests. */ IOR_test_t *CreateTest(IOR_param_t *init_params, int test_num) { IOR_test_t *newTest = NULL; newTest = (IOR_test_t *) malloc(sizeof(IOR_test_t)); if (newTest == NULL) ERR("malloc() of IOR_test_t failed"); newTest->params = *init_params; newTest->params.platform = GetPlatformName(); newTest->params.id = test_num; newTest->next = NULL; newTest->results = NULL; return newTest; } static void DestroyTest(IOR_test_t *test) { FreeResults(test); free(test); } static void DestroyTests(IOR_test_t *tests_head) { IOR_test_t *tptr, *next; for (tptr = tests_head; tptr != NULL; tptr = next) { next = tptr->next; DestroyTest(tptr); } } /* * Distribute IOR_HINTs to all tasks' environments. 
*/
/*
 * Rank 0 collects every environment entry whose name starts with "IOR_HINT"
 * (at most MAX_HINTS of them) and broadcasts them over mpi_comm_world.  Each
 * task then sets the variables that are missing from its own environment.
 */
void DistributeHints(void)
{
        char hint[MAX_HINTS][MAX_STR], fullHint[MAX_STR];
        int hintCount = 0, i;

        if (rank == 0) {
                for (i = 0; environ[i] != NULL; i++) {
                        if (strncmp(environ[i], "IOR_HINT",
                                    strlen("IOR_HINT")) != 0)
                                continue;
                        /* check for room BEFORE storing, so that hintCount
                           never counts a slot that was not filled in */
                        if (hintCount == MAX_HINTS) {
                                WARN("exceeded max hints; reset MAX_HINTS and recompile");
                                break;
                        }
                        /* hints longer than MAX_STR - 1 bytes are truncated;
                           strncpy() zero-pads, the last byte is forced to
                           NUL so the string is always terminated */
                        strncpy(hint[hintCount], environ[i], MAX_STR - 1);
                        hint[hintCount][MAX_STR - 1] = '\0';
                        hintCount++;
                }
        }

        /* rank is relative to mpi_comm_world, so broadcast on that communicator */
        MPI_CHECK(MPI_Bcast(&hintCount, sizeof(hintCount),
                            MPI_BYTE, 0, mpi_comm_world), "cannot broadcast hints");
        for (i = 0; i < hintCount; i++) {
                char *separator;

                MPI_CHECK(MPI_Bcast(&hint[i], MAX_STR,
                                    MPI_BYTE, 0, mpi_comm_world),
                          "cannot broadcast hints");
                hint[i][MAX_STR - 1] = '\0';

                /* split "NAME=value" into name and value */
                strcpy(fullHint, hint[i]);
                separator = strchr(fullHint, '=');
                if (separator == NULL)
                        continue;       /* not of the form NAME=value */
                *separator = '\0';

                if (getenv(fullHint) == NULL) {
                        /* doesn't exist in this task's environment; better set it.
                           setenv() copies its arguments, so nothing in the
                           environment refers to this function's stack. */
                        if (setenv(fullHint, separator + 1, 0) != 0)
                                WARN("cannot set environment variable");
                }
        }
}

/*
 * Fill buffer, which is transfer size bytes long, with known 8-byte long long
 * int values.  In even-numbered 8-byte long long ints, store MPI task in high
 * bits and timestamp signature in low bits.  In odd-numbered 8-byte long long
 * ints, store transfer offset.  If storeFileOffset option is used, the file
 * (not transfer) offset is stored instead.
*/
/*
 * Fill the transfer-sized buffer with pseudo-random 8-byte words.  Each word
 * is assembled from two consecutive rand_r() draws on
 * test->incompressibleSeed: the first forms the upper half, the second the
 * lower half.
 */
static void
FillIncompressibleBuffer(void* buffer, IOR_param_t * test)
{
        unsigned long long *words = (unsigned long long *)buffer;
        const size_t wordCount = test->transferSize / sizeof(unsigned long long);
        size_t w;

        for (w = 0; w < wordCount; w++) {
                unsigned long long upper;
                unsigned long long lower;

                /* keep the draw order: upper half first, then lower half */
                upper = (unsigned long long) rand_r(&test->incompressibleSeed);
                lower = (unsigned long long) rand_r(&test->incompressibleSeed);
                words[w] = (upper << 32) | lower;
        }
}

unsigned int reseed_incompressible_prng = TRUE;

/*
 * Fill the transfer-sized buffer with the data pattern of the test.
 *
 * For the incompressible packet type the buffer receives pseudo-random data;
 * the generator is re-seeded once whenever reseed_incompressible_prng is set.
 * Otherwise even words hold fillrank (upper 32 bits) combined with the
 * timestamp signature, and odd words hold offset plus the byte position of
 * the word inside the buffer.
 */
static void
FillBuffer(void *buffer,
           IOR_param_t * test, unsigned long long offset, int fillrank)
{
        unsigned long long *words = (unsigned long long *)buffer;
        unsigned long long signature;
        size_t wordCount;
        size_t w;

        if (test->dataPacketType == incompressible) {
                /* Make for some non compressible buffers with randomish data */

                /* In order for write checks to work, we have to restart the
                   pseudo random sequence */
                if (reseed_incompressible_prng == TRUE) {
                        /* We copied seed into timestampSignature at
                           initialization, also add the rank to add
                           randomness between processes */
                        test->incompressibleSeed = test->setTimeStampSignature + rank;
                        reseed_incompressible_prng = FALSE;
                }
                FillIncompressibleBuffer(buffer, test);
                return;
        }

        /* rank in the upper half, time stamp signature in the lower half */
        signature = (((unsigned long long)fillrank) << 32)
                    | (unsigned long long)test->timeStampSignatureValue;
        wordCount = test->transferSize / sizeof(unsigned long long);

        for (w = 0; w < wordCount; w++) {
                if ((w % 2) != 0) {
                        /* odds contain offset */
                        words[w] = offset + (w * sizeof(unsigned long long));
                } else {
                        /* evens contain MPI rank and time in seconds */
                        words[w] = signature;
                }
        }
}

/*
 * Return string describing machine name and type.
*/
/*
 * The result has the form "nodename(sysname)" with trailing digits of the
 * node name removed; both parts are "Unknown" when uname() fails.  The string
 * is allocated with strdup(); the caller owns it.
 */
char * GetPlatformName()
{
        char nodeName[MAX_STR], *p, *start, sysName[MAX_STR];
        char platformName[MAX_STR];
        struct utsname name;

        if (uname(&name) != 0) {
                EWARN("cannot get platform name");
                snprintf(sysName, sizeof(sysName), "%s", "Unknown");
                snprintf(nodeName, sizeof(nodeName), "%s", "Unknown");
        } else {
                /* bounded copies: the utsname fields may exceed MAX_STR */
                snprintf(sysName, sizeof(sysName), "%s", name.sysname);
                snprintf(nodeName, sizeof(nodeName), "%s", name.nodename);
        }

        start = nodeName;
        if (strlen(nodeName) == 0) {
                p = start;
        } else {
                /* point to one character back from '\0' */
                p = start + strlen(nodeName) - 1;
        }
        /*
         * to cut off trailing node number, search backwards
         * for the first non-numeric character
         */
        while (p != start) {
                if (*p < '0' || *p > '9') {
                        *(p + 1) = '\0';
                        break;
                } else {
                        p--;
                }
        }

        /* node and system name together may not fit; truncate, don't overflow */
        snprintf(platformName, sizeof(platformName), "%s(%s)", nodeName, sysName);
        return strdup(platformName);
}



/*
 * Parse file name.
 *
 * Splits `name` in place at FILENAME_DELIMITER (and newline) characters.
 * Returns a malloc()ed array of pointers into `name`; *count receives the
 * number of valid entries, which is always at least 1 (a name without any
 * token yields a single empty string).  The caller frees the array.
 */
static char **ParseFileName(char *name, int *count)
{
        char **fileNames, *tmp, *token;
        char delimiterString[3] = { FILENAME_DELIMITER, '\n', '\0' };
        int capacity = 1;
        int i = 0;

        /* pass one */
        /* every delimiter can start at most one more token */
        for (tmp = name; *tmp != '\0'; tmp++) {
                if (*tmp == delimiterString[0] || *tmp == delimiterString[1]) {
                        capacity++;
                }
        }
        fileNames = (char **)malloc((size_t)capacity * sizeof(*fileNames));
        if (fileNames == NULL)
                ERR("out of memory");

        /* pass two */
        token = strtok(name, delimiterString);
        while (token != NULL && i < capacity) {
                fileNames[i] = token;
                token = strtok(NULL, delimiterString);
                i++;
        }

        if (i == 0) {
                /* empty name or delimiters only: hand back one empty string
                   so that callers can always use fileNames[0] */
                name[0] = '\0';
                fileNames[0] = name;
                i = 1;
        }
        /* report the tokens actually stored, not the delimiter count */
        *count = i;
        return (fileNames);
}


/*
 * Return test file name to access.
* for single shared file, fileNames[0] is returned in testFileName */ void GetTestFileName(char *testFileName, IOR_param_t * test) { char **fileNames; char initialTestFileName[MAX_PATHLEN]; char testFileNameRoot[MAX_STR]; char tmpString[MAX_STR]; int count; int socket, core; /* parse filename for multiple file systems */ strcpy(initialTestFileName, test->testFileName); if(test->dualMount){ GetProcessorAndCore(&socket, &core); sprintf(tmpString, "%s%d/%s",initialTestFileName, socket, "data"); strcpy(initialTestFileName, tmpString); } fileNames = ParseFileName(initialTestFileName, &count); if (count > 1 && test->uniqueDir == TRUE) ERR("cannot use multiple file names with unique directories"); if (test->filePerProc) { strcpy(testFileNameRoot, fileNames[((rank + rankOffset) % test->numTasks) % count]); } else { strcpy(testFileNameRoot, fileNames[0]); } /* give unique name if using multiple files */ if (test->filePerProc) { /* * prepend rank subdirectory before filename * e.g., /dir/file => /dir//file */ if (test->uniqueDir == TRUE) { strcpy(testFileNameRoot, PrependDir(test, testFileNameRoot)); } sprintf(testFileName, "%s.%08d", testFileNameRoot, (rank + rankOffset) % test->numTasks); } else { strcpy(testFileName, testFileNameRoot); } /* add suffix for multiple files */ if (test->repCounter > -1) { sprintf(tmpString, ".%d", test->repCounter); strcat(testFileName, tmpString); } free (fileNames); } /* * From absolute directory, insert rank as subdirectory. Allows each task * to write to its own directory. E.g., /dir/file => /dir//file. 
*/
/*
 * The rank directory is created through the backend when it does not exist;
 * an existing directory must be readable, writable and searchable.
 * Returns a malloc()ed string of at most MAX_STR characters that the caller
 * must free().
 */
static char *PrependDir(IOR_param_t * test, char *rootDir)
{
        const size_t dirSize = MAX_STR + 1;
        char *dir;
        const char *slash;
        const char *fname;
        size_t prefixLen;
        int n;

        dir = (char *)malloc(dirSize);
        if (dir == NULL)
                ERR("out of memory");

        /* get dir name: everything up to and including the last '/' */
        slash = strrchr(rootDir, '/');
        prefixLen = (slash != NULL) ? (size_t)(slash - rootDir) + 1 : 0;

        /* get file name */
        fname = rootDir + prefixLen;

        /* create directory with rank as subdirectory */
        n = snprintf(dir, dirSize, "%.*s%d", (int)prefixLen, rootDir,
                     (rank + rankOffset) % test->numTasks);
        if (n < 0 || (size_t)n >= dirSize)
                ERRF("directory name too long: %s", rootDir);

        /* dir doesn't exist, so create */
        if (backend->access(dir, F_OK, test->backend_options) != 0) {
                if (backend->mkdir(dir, S_IRWXU, test->backend_options) < 0) {
                        ERRF("cannot create directory: %s", dir);
                }

                /* check if correct permissions */
        } else if (backend->access(dir, R_OK, test->backend_options) != 0 ||
                   backend->access(dir, W_OK, test->backend_options) != 0 ||
                   backend->access(dir, X_OK, test->backend_options) != 0) {
                ERRF("invalid directory permissions: %s", dir);
        }

        /* concatenate dir and file names */
        if ((size_t)n + 1 + strlen(fname) >= dirSize)
                ERRF("file name too long: %s", rootDir);
        strcat(dir, "/");
        strcat(dir, fname);

        return dir;
}

/******************************************************************************/
/*
 * Reduce test results, and show if verbose set.
 *
 *   test    test whose result slot for repetition `rep` is updated
 *   timer   the IOR_NB_TIMERS time stamps taken by this task
 *   rep     repetition index
 *   access  WRITE or READ
 *
 * Collective over testComm; only rank 0 prints.
 */
static void
ReduceIterResults(IOR_test_t *test, double *timer, const int rep, const int access)
{
        double reduced[IOR_NB_TIMERS] = { 0 };
        double diff[IOR_NB_TIMERS / 2 + 1] = { 0 };
        double totalTime, accessTime;
        IOR_param_t *params = &test->params;
        double bw, iops, latency, minlatency;
        int i;
        MPI_Op op;

        assert(access == WRITE || access == READ);

        /* Find the minimum start time of the even numbered timers, and the
           maximum finish time for the odd numbered timers */
        for (i = 0; i < IOR_NB_TIMERS; i++) {
                op = i % 2 ? MPI_MAX : MPI_MIN;
                MPI_CHECK(MPI_Reduce(&timer[i], &reduced[i], 1, MPI_DOUBLE,
                                     op, 0, testComm), "MPI_Reduce()");
        }

        /* Calculate elapsed times and throughput numbers */
        for (i = 0; i < IOR_NB_TIMERS / 2; i++)
                diff[i] = reduced[2 * i + 1] - reduced[2 * i];

        totalTime = reduced[5] - reduced[0];
        accessTime = reduced[3] - reduced[2];

        IOR_point_t *point = (access == WRITE) ? &test->results[rep].write :
                                                 &test->results[rep].read;

        point->time = totalTime;

        if (verbose < VERBOSE_0)
                return;

        bw = (double)point->aggFileSizeForBW / totalTime;

        /* For IOPS in this iteration, we divide the total amount of IOs from
         * all ranks over the entire access time (first start -> last end). */
        iops = (point->aggFileSizeForBW / params->transferSize) / accessTime;

        /* For Latency, we divide the total access time for each task over the
         * number of I/Os issued from that task; then reduce and display the
         * minimum (best) latency achieved. So what is reported is the average
         * latency of all ops from a single task, then taking the minimum of
         * that between all tasks. */
        latency = (timer[3] - timer[2]) / (params->blockSize / params->transferSize);
        MPI_CHECK(MPI_Reduce(&latency, &minlatency, 1, MPI_DOUBLE,
                             MPI_MIN, 0, testComm), "MPI_Reduce()");

        /* Only rank 0 tallies and prints the results. */
        if (rank != 0)
                return;

        /* report the reduced minimum, not rank 0's own latency */
        PrintReducedResult(test, access, bw, iops, minlatency, diff, totalTime, rep);
}

/*
 * Check for file(s), then remove all files if file-per-proc, else single file.
* */ static void RemoveFile(char *testFileName, int filePerProc, IOR_param_t * test) { int tmpRankOffset = 0; if (filePerProc) { /* in random tasks, delete own file */ if (test->reorderTasksRandom == TRUE) { tmpRankOffset = rankOffset; rankOffset = 0; GetTestFileName(testFileName, test); } if (backend->access(testFileName, F_OK, test->backend_options) == 0) { if (verbose >= VERBOSE_3) { fprintf(out_logfile, "task %d removing %s\n", rank, testFileName); } backend->delete(testFileName, test->backend_options); } if (test->reorderTasksRandom == TRUE) { rankOffset = tmpRankOffset; GetTestFileName(testFileName, test); } } else { if ((rank == 0) && (backend->access(testFileName, F_OK, test->backend_options) == 0)) { if (verbose >= VERBOSE_3) { fprintf(out_logfile, "task %d removing %s\n", rank, testFileName); } backend->delete(testFileName, test->backend_options); } } } /* * Setup tests by parsing commandline and creating test script. * Perform a sanity-check on the configured parameters. */ static void InitTests(IOR_test_t *tests, MPI_Comm com) { int mpiNumNodes = 0; int mpiNumTasks = 0; int mpiNumTasksOnNode0 = 0; verbose = tests->params.verbose; aiori_warning_as_errors = tests->params.warningAsErrors; /* * These default values are the same for every test and expensive to * retrieve so just do it once. 
*/ mpiNumNodes = GetNumNodes(com); mpiNumTasks = GetNumTasks(com); mpiNumTasksOnNode0 = GetNumTasksOnNode0(com); /* * Since there is no guarantee that anyone other than * task 0 has the environment settings for the hints, pass * the hint=value pair to everyone else in mpi_comm_world */ DistributeHints(); /* check validity of tests and create test queue */ while (tests != NULL) { IOR_param_t *params = & tests->params; params->testComm = com; /* use MPI values if not overridden on command-line */ if (params->numNodes == -1) { params->numNodes = mpiNumNodes; } if (params->numTasks == -1) { params->numTasks = mpiNumTasks; } else if (params->numTasks > mpiNumTasks) { if (rank == 0) { EWARNF("More tasks requested (%d) than available (%d),", params->numTasks, mpiNumTasks); EWARNF(" running with %d tasks.\n", mpiNumTasks); } params->numTasks = mpiNumTasks; } if (params->numTasksOnNode0 == -1) { params->numTasksOnNode0 = mpiNumTasksOnNode0; } params->tasksBlockMapping = QueryNodeMapping(com,false); params->expectedAggFileSize = params->blockSize * params->segmentCount * params->numTasks; ValidateTests(&tests->params); tests = tests->next; } init_clock(); /* seed random number generator */ SeedRandGen(mpi_comm_world); } /* * Setup transfer buffers, creating and filling as needed. */ static void XferBuffersSetup(IOR_io_buffers* ioBuffers, IOR_param_t* test, int pretendRank) { ioBuffers->buffer = aligned_buffer_alloc(test->transferSize); if (test->checkWrite || test->checkRead) { ioBuffers->checkBuffer = aligned_buffer_alloc(test->transferSize); } if (test->checkRead || test->checkWrite) { ioBuffers->readCheckBuffer = aligned_buffer_alloc(test->transferSize); } return; } /* * Free transfer buffers. 
*/ static void XferBuffersFree(IOR_io_buffers* ioBuffers, IOR_param_t* test) { aligned_buffer_free(ioBuffers->buffer); if (test->checkWrite || test->checkRead) { aligned_buffer_free(ioBuffers->checkBuffer); } if (test->checkRead) { aligned_buffer_free(ioBuffers->readCheckBuffer); } return; } /* * malloc a buffer, touching every page in an attempt to defeat lazy allocation. */ static void *malloc_and_touch(size_t size) { size_t page_size; char *buf; char *ptr; if (size == 0) return NULL; page_size = sysconf(_SC_PAGESIZE); buf = (char *)malloc(size); if (buf == NULL) return NULL; for (ptr = buf; ptr < buf+size; ptr += page_size) { *ptr = (char)1; } return (void *)buf; } static void file_hits_histogram(IOR_param_t *params) { int *rankoffs = NULL; int *filecont = NULL; int *filehits = NULL; int ifile; int jfile; if (rank == 0) { rankoffs = (int *)malloc(params->numTasks * sizeof(int)); filecont = (int *)malloc(params->numTasks * sizeof(int)); filehits = (int *)malloc(params->numTasks * sizeof(int)); } MPI_CHECK(MPI_Gather(&rankOffset, 1, MPI_INT, rankoffs, 1, MPI_INT, 0, mpi_comm_world), "MPI_Gather error"); if (rank != 0) return; memset((void *)filecont, 0, params->numTasks * sizeof(int)); for (ifile = 0; ifile < params->numTasks; ifile++) { filecont[(ifile + rankoffs[ifile]) % params->numTasks]++; } memset((void *)filehits, 0, params->numTasks * sizeof(int)); for (ifile = 0; ifile < params->numTasks; ifile++) for (jfile = 0; jfile < params->numTasks; jfile++) { if (ifile == filecont[jfile]) filehits[ifile]++; } fprintf(out_logfile, "#File Hits Dist:"); jfile = 0; ifile = 0; while (jfile < params->numTasks && ifile < params->numTasks) { fprintf(out_logfile, " %d", filehits[ifile]); jfile += filehits[ifile], ifile++; } fprintf(out_logfile, "\n"); free(rankoffs); free(filecont); free(filehits); } int test_time_elapsed(IOR_param_t *params, double startTime) { double endTime; if (params->maxTimeDuration == 0) return 0; endTime = startTime + (params->maxTimeDuration * 60); 
return GetTimeStamp() >= endTime; } /* * hog some memory as a rough simulation of a real application's memory use */ static void *HogMemory(IOR_param_t *params) { size_t size; void *buf; if (params->memoryPerTask != 0) { size = params->memoryPerTask; } else if (params->memoryPerNode != 0) { if (verbose >= VERBOSE_3) fprintf(out_logfile, "This node hogging %ld bytes of memory\n", params->memoryPerNode); size = params->memoryPerNode / params->numTasksOnNode0; } else { return NULL; } if (verbose >= VERBOSE_3) fprintf(out_logfile, "This task hogging %ld bytes of memory\n", size); buf = malloc_and_touch(size); if (buf == NULL) ERR("malloc of simulated applciation buffer failed"); return buf; } /* * Write times taken during each iteration of the test. */ static void WriteTimes(IOR_param_t *test, const double *timer, const int iteration, const int access) { char timerName[MAX_STR]; for (int i = 0; i < IOR_NB_TIMERS; i++) { if (access == WRITE) { switch (i) { case 0: strcpy(timerName, "write open start"); break; case 1: strcpy(timerName, "write open stop"); break; case 2: strcpy(timerName, "write start"); break; case 3: strcpy(timerName, "write stop"); break; case 4: strcpy(timerName, "write close start"); break; case 5: strcpy(timerName, "write close stop"); break; default: strcpy(timerName, "invalid timer"); break; } } else { switch (i) { case 0: strcpy(timerName, "read open start"); break; case 1: strcpy(timerName, "read open stop"); break; case 2: strcpy(timerName, "read start"); break; case 3: strcpy(timerName, "read stop"); break; case 4: strcpy(timerName, "read close start"); break; case 5: strcpy(timerName, "read close stop"); break; default: strcpy(timerName, "invalid timer"); break; } } fprintf(out_logfile, "Test %d: Iter=%d, Task=%d, Time=%f, %s\n", test->id, iteration, (int)rank, timer[i], timerName); } } /* * Using the test parameters, run iteration(s) of single test. 
*/ static void TestIoSys(IOR_test_t *test) { IOR_param_t *params = &test->params; IOR_results_t *results = test->results; char testFileName[MAX_STR]; double timer[IOR_NB_TIMERS]; double startTime; int pretendRank; int rep; aiori_fd_t *fd; MPI_Group orig_group, new_group; int range[3]; IOR_offset_t dataMoved; /* for data rate calculation */ void *hog_buf; IOR_io_buffers ioBuffers; /* set up communicator for test */ MPI_CHECK(MPI_Comm_group(mpi_comm_world, &orig_group), "MPI_Comm_group() error"); range[0] = 0; /* first rank */ range[1] = params->numTasks - 1; /* last rank */ range[2] = 1; /* stride */ MPI_CHECK(MPI_Group_range_incl(orig_group, 1, &range, &new_group), "MPI_Group_range_incl() error"); MPI_CHECK(MPI_Comm_create(mpi_comm_world, new_group, &testComm), "MPI_Comm_create() error"); MPI_CHECK(MPI_Group_free(&orig_group), "MPI_Group_Free() error"); MPI_CHECK(MPI_Group_free(&new_group), "MPI_Group_Free() error"); params->testComm = testComm; if (testComm == MPI_COMM_NULL) { /* tasks not in the group do not participate in this test */ MPI_CHECK(MPI_Barrier(mpi_comm_world), "barrier error"); return; } if (rank == 0 && verbose >= VERBOSE_1) { fprintf(out_logfile, "Participating tasks : %d\n", params->numTasks); fflush(out_logfile); } if (rank == 0 && params->reorderTasks == TRUE && verbose >= VERBOSE_1) { fprintf(out_logfile, "Using reorderTasks '-C' (useful to avoid read cache in client)\n"); fflush(out_logfile); } /* show test setup */ if (rank == 0 && verbose >= VERBOSE_0) ShowSetup(params); hog_buf = HogMemory(params); pretendRank = (rank + rankOffset) % params->numTasks; /* IO Buffer Setup */ if (params->setTimeStampSignature) { // initialize the buffer properly params->timeStampSignatureValue = (unsigned int) params->setTimeStampSignature; } XferBuffersSetup(&ioBuffers, params, pretendRank); reseed_incompressible_prng = TRUE; // reset pseudo random generator, necessary to guarantee the next call to FillBuffer produces the same value as it is right now /* 
Initial time stamp */ startTime = GetTimeStamp(); /* loop over test iterations */ uint64_t params_saved_wearout = params->stoneWallingWearOutIterations; for (rep = 0; rep < params->repetitions; rep++) { /* Get iteration start time in seconds in task 0 and broadcast to all tasks */ if (rank == 0) { if (! params->setTimeStampSignature) { time_t currentTime; if ((currentTime = time(NULL)) == -1) { ERR("cannot get current time"); } params->timeStampSignatureValue = (unsigned int)currentTime; } if (verbose >= VERBOSE_2) { fprintf(out_logfile, "Using Time Stamp %u (0x%x) for Data Signature\n", params->timeStampSignatureValue, params->timeStampSignatureValue); } if (rep == 0 && verbose >= VERBOSE_0) { PrintTableHeader(); } } MPI_CHECK(MPI_Bcast (¶ms->timeStampSignatureValue, 1, MPI_UNSIGNED, 0, testComm), "cannot broadcast start time value"); FillBuffer(ioBuffers.buffer, params, 0, pretendRank); /* use repetition count for number of multiple files */ if (params->multiFile) params->repCounter = rep; /* * write the file(s), getting timing between I/O calls */ if (params->writeFile && !test_time_elapsed(params, startTime)) { GetTestFileName(testFileName, params); if (verbose >= VERBOSE_3) { fprintf(out_logfile, "task %d writing %s\n", rank, testFileName); } DelaySecs(params->interTestDelay); if (params->useExistingTestFile == FALSE) { RemoveFile(testFileName, params->filePerProc, params); } params->stoneWallingWearOutIterations = params_saved_wearout; MPI_CHECK(MPI_Barrier(testComm), "barrier error"); params->open = WRITE; timer[0] = GetTimeStamp(); fd = backend->create(testFileName, IOR_WRONLY | IOR_CREAT | IOR_TRUNC, params->backend_options); timer[1] = GetTimeStamp(); if (params->intraTestBarriers) MPI_CHECK(MPI_Barrier(testComm), "barrier error"); if (rank == 0 && verbose >= VERBOSE_1) { fprintf(out_logfile, "Commencing write performance test: %s", CurrentTimeString()); } timer[2] = GetTimeStamp(); dataMoved = WriteOrRead(params, &results[rep], fd, WRITE, &ioBuffers); if 
(params->verbose >= VERBOSE_4) { fprintf(out_logfile, "* data moved = %llu\n", dataMoved); fflush(out_logfile); } timer[3] = GetTimeStamp(); if (params->intraTestBarriers) MPI_CHECK(MPI_Barrier(testComm), "barrier error"); timer[4] = GetTimeStamp(); backend->close(fd, params->backend_options); timer[5] = GetTimeStamp(); MPI_CHECK(MPI_Barrier(testComm), "barrier error"); /* get the size of the file just written */ results[rep].write.aggFileSizeFromStat = backend->get_file_size(params->backend_options, testComm, testFileName); /* check if stat() of file doesn't equal expected file size, use actual amount of byte moved */ CheckFileSize(test, dataMoved, rep, WRITE); if (verbose >= VERBOSE_3) WriteTimes(params, timer, rep, WRITE); ReduceIterResults(test, timer, rep, WRITE); if (params->outlierThreshold) { CheckForOutliers(params, timer, WRITE); } /* check if in this round we run write with stonewalling */ if(params->deadlineForStonewalling > 0){ params->stoneWallingWearOutIterations = results[rep].write.pairs_accessed; } } /* * perform a check of data, reading back data and comparing * against what was expected to be written */ if (params->checkWrite && !test_time_elapsed(params, startTime)) { MPI_CHECK(MPI_Barrier(testComm), "barrier error"); if (rank == 0 && verbose >= VERBOSE_1) { fprintf(out_logfile, "Verifying contents of the file(s) just written.\n"); fprintf(out_logfile, "%s\n", CurrentTimeString()); } if (params->reorderTasks) { /* move two nodes away from writing node */ int shift = 1; /* assume a by-node (round-robin) mapping of tasks to nodes */ if (params->tasksBlockMapping) { shift = params->numTasksOnNode0; /* switch to by-slot (contiguous block) mapping */ } rankOffset = (2 * shift) % params->numTasks; } // update the check buffer FillBuffer(ioBuffers.readCheckBuffer, params, 0, (rank + rankOffset) % params->numTasks); reseed_incompressible_prng = TRUE; /* Re-Seed the PRNG to get same sequence back, if random */ GetTestFileName(testFileName, params); 
params->open = WRITECHECK; fd = backend->open(testFileName, IOR_RDONLY, params->backend_options); dataMoved = WriteOrRead(params, &results[rep], fd, WRITECHECK, &ioBuffers); backend->close(fd, params->backend_options); rankOffset = 0; } /* * read the file(s), getting timing between I/O calls */ if ((params->readFile || params->checkRead ) && !test_time_elapsed(params, startTime)) { /* check for stonewall */ if(params->stoneWallingStatusFile){ params->stoneWallingWearOutIterations = ReadStoneWallingIterations(params->stoneWallingStatusFile); if(params->stoneWallingWearOutIterations == -1 && rank == 0){ WARN("Could not read back the stonewalling status from the file!"); params->stoneWallingWearOutIterations = 0; } } int operation_flag = READ; if ( params->checkRead ){ // actually read and then compare the buffer operation_flag = READCHECK; } /* Get rankOffset [file offset] for this process to read, based on -C,-Z,-Q,-X options */ /* Constant process offset reading */ if (params->reorderTasks) { /* move one node away from writing node */ int shift = 1; /* assume a by-node (round-robin) mapping of tasks to nodes */ if (params->tasksBlockMapping) { shift=params->numTasksOnNode0; /* switch to a by-slot (contiguous block) mapping */ } rankOffset = (params->taskPerNodeOffset * shift) % params->numTasks; } /* random process offset reading */ if (params->reorderTasksRandom) { /* this should not intefere with randomOffset within a file because GetOffsetArrayRandom */ /* seeds every rand() call */ int nodeoffset; unsigned int iseed0; nodeoffset = params->taskPerNodeOffset; nodeoffset = (nodeoffset < params->numNodes) ? 
nodeoffset : params->numNodes - 1; if (params->reorderTasksRandomSeed < 0) iseed0 = -1 * params->reorderTasksRandomSeed + rep; else iseed0 = params->reorderTasksRandomSeed; srand(rank + iseed0); { rankOffset = rand() % params->numTasks; } while (rankOffset < (nodeoffset * params->numTasksOnNode0)) { rankOffset = rand() % params->numTasks; } /* Get more detailed stats if requested by verbose level */ if (verbose >= VERBOSE_2) { file_hits_histogram(params); } } if(operation_flag == READCHECK){ FillBuffer(ioBuffers.readCheckBuffer, params, 0, (rank + rankOffset) % params->numTasks); } /* Using globally passed rankOffset, following function generates testFileName to read */ GetTestFileName(testFileName, params); if (verbose >= VERBOSE_3) { fprintf(out_logfile, "task %d reading %s\n", rank, testFileName); } DelaySecs(params->interTestDelay); MPI_CHECK(MPI_Barrier(testComm), "barrier error"); params->open = READ; timer[0] = GetTimeStamp(); fd = backend->open(testFileName, IOR_RDONLY, params->backend_options); timer[1] = GetTimeStamp(); if (params->intraTestBarriers) MPI_CHECK(MPI_Barrier(testComm), "barrier error"); if (rank == 0 && verbose >= VERBOSE_1) { fprintf(out_logfile, "Commencing read performance test: %s\n", CurrentTimeString()); } timer[2] = GetTimeStamp(); dataMoved = WriteOrRead(params, &results[rep], fd, operation_flag, &ioBuffers); timer[3] = GetTimeStamp(); if (params->intraTestBarriers) MPI_CHECK(MPI_Barrier(testComm), "barrier error"); timer[4] = GetTimeStamp(); backend->close(fd, params->backend_options); timer[5] = GetTimeStamp(); /* get the size of the file just read */ results[rep].read.aggFileSizeFromStat = backend->get_file_size(params->backend_options, testComm, testFileName); /* check if stat() of file doesn't equal expected file size, use actual amount of byte moved */ CheckFileSize(test, dataMoved, rep, READ); if (verbose >= VERBOSE_3) WriteTimes(params, timer, rep, READ); ReduceIterResults(test, timer, rep, READ); if 
(params->outlierThreshold) { CheckForOutliers(params, timer, READ); } } if (!params->keepFile && !(params->errorFound && params->keepFileWithError)) { double start, finish; start = GetTimeStamp(); MPI_CHECK(MPI_Barrier(testComm), "barrier error"); RemoveFile(testFileName, params->filePerProc, params); MPI_CHECK(MPI_Barrier(testComm), "barrier error"); finish = GetTimeStamp(); PrintRemoveTiming(start, finish, rep); } else { MPI_CHECK(MPI_Barrier(testComm), "barrier error"); } params->errorFound = FALSE; rankOffset = 0; PrintRepeatEnd(); } MPI_CHECK(MPI_Comm_free(&testComm), "MPI_Comm_free() error"); if (params->summary_every_test) { PrintLongSummaryHeader(); PrintLongSummaryOneTest(test); } else { PrintShortSummary(test); } XferBuffersFree(&ioBuffers, params); if (hog_buf != NULL) free(hog_buf); /* Sync with the tasks that did not participate in this test */ MPI_CHECK(MPI_Barrier(mpi_comm_world), "barrier error"); } /* * Determine if valid tests from parameters. */ static void ValidateTests(IOR_param_t * test) { IOR_param_t defaults; init_IOR_Param_t(&defaults); if (test->repetitions <= 0) WARN_RESET("too few test repetitions", test, &defaults, repetitions); if (test->numTasks <= 0) ERR("too few tasks for testing"); if (test->interTestDelay < 0) WARN_RESET("inter-test delay must be nonnegative value", test, &defaults, interTestDelay); if (test->readFile != TRUE && test->writeFile != TRUE && test->checkRead != TRUE && test->checkWrite != TRUE) ERR("test must write, read, or check read/write file"); if(! 
test->setTimeStampSignature && test->writeFile != TRUE && test->checkRead == TRUE) ERR("using readCheck only requires to write a timeStampSignature -- use -G"); if (test->segmentCount < 0) ERR("segment count must be positive value"); if ((test->blockSize % sizeof(IOR_size_t)) != 0) ERR("block size must be a multiple of access size"); if (test->blockSize < 0) ERR("block size must be non-negative integer"); if ((test->transferSize % sizeof(IOR_size_t)) != 0) ERR("transfer size must be a multiple of access size"); if (test->transferSize < 0) ERR("transfer size must be non-negative integer"); if (test->transferSize == 0) { ERR("test will not complete with zero transfer size"); } else { if ((test->blockSize % test->transferSize) != 0) ERR("block size must be a multiple of transfer size"); } if (test->blockSize < test->transferSize) ERR("block size must not be smaller than transfer size"); /* specific APIs */ if ((strcasecmp(test->api, "MPIIO") == 0) && (test->blockSize < sizeof(IOR_size_t) || test->transferSize < sizeof(IOR_size_t))) ERR("block/transfer size may not be smaller than IOR_size_t for MPIIO"); if ((strcasecmp(test->api, "HDF5") == 0) && (test->blockSize < sizeof(IOR_size_t) || test->transferSize < sizeof(IOR_size_t))) ERR("block/transfer size may not be smaller than IOR_size_t for HDF5"); if ((strcasecmp(test->api, "NCMPI") == 0) && (test->blockSize < sizeof(IOR_size_t) || test->transferSize < sizeof(IOR_size_t))) ERR("block/transfer size may not be smaller than IOR_size_t for NCMPI"); if (((strcasecmp(test->api, "POSIX") != 0) && (strcasecmp(test->api, "MPIIO") != 0) && (strcasecmp(test->api, "MMAP") != 0) && (strcasecmp(test->api, "HDFS") != 0) && (strcasecmp(test->api, "DFS") != 0) && (strcasecmp(test->api, "DAOS") != 0) && (strcasecmp(test->api, "Gfarm") != 0) && (strcasecmp(test->api, "RADOS") != 0) && (strcasecmp(test->api, "CEPHFS") != 0)) && test->fsync) WARN_RESET("fsync() not supported in selected backend", test, &defaults, fsync); /* parameter 
consistency */ if (test->reorderTasks == TRUE && test->reorderTasksRandom == TRUE) ERR("Both Constant and Random task re-ordering specified. Choose one and resubmit"); if (test->randomOffset && test->reorderTasksRandom && test->filePerProc == FALSE) ERR("random offset and random reorder tasks specified with single-shared-file. Choose one and resubmit"); if (test->randomOffset && test->reorderTasks && test->filePerProc == FALSE) ERR("random offset and constant reorder tasks specified with single-shared-file. Choose one and resubmit"); if (test->randomOffset && test->checkRead) ERR("random offset not available with read check option (use write check)"); if (test->randomOffset && test->storeFileOffset) ERR("random offset not available with store file offset option)"); if ((strcasecmp(test->api, "HDF5") == 0) && test->randomOffset) ERR("random offset not available with HDF5"); if ((strcasecmp(test->api, "NCMPI") == 0) && test->randomOffset) ERR("random offset not available with NCMPI"); if ((strcasecmp(test->api, "NCMPI") == 0) && test->filePerProc) ERR("file-per-proc not available in current NCMPI"); backend = test->backend; ior_set_xfer_hints(test); /* allow the backend to validate the options */ if(test->backend->check_params){ int check = test->backend->check_params(test->backend_options); if (check){ ERR("The backend returned that the test parameters are invalid."); } } } /** * Returns a precomputed array of IOR_offset_t for the inner benchmark loop. * They are sequential and the last element is set to -1 as end marker. 
* @param test IOR_param_t for getting transferSize, blocksize and SegmentCount * @param pretendRank int pretended Rank for shifting the offsets correctly * @return IOR_offset_t */ IOR_offset_t *GetOffsetArraySequential(IOR_param_t * test, int pretendRank) { IOR_offset_t i, j, k = 0; IOR_offset_t offsets; IOR_offset_t *offsetArray; /* count needed offsets */ offsets = (test->blockSize / test->transferSize) * test->segmentCount; /* setup empty array */ offsetArray = (IOR_offset_t *) malloc((offsets + 1) * sizeof(IOR_offset_t)); if (offsetArray == NULL) ERR("malloc() failed"); offsetArray[offsets] = -1; /* set last offset with -1 */ /* fill with offsets */ for (i = 0; i < test->segmentCount; i++) { for (j = 0; j < (test->blockSize / test->transferSize); j++) { offsetArray[k] = j * test->transferSize; if (test->filePerProc) { offsetArray[k] += i * test->blockSize; } else { offsetArray[k] += (i * test->numTasks * test->blockSize) + (pretendRank * test->blockSize); } k++; } } return (offsetArray); } /** * Returns a precomputed array of IOR_offset_t for the inner benchmark loop. * They get created sequentially and mixed up in the end. The last array element * is set to -1 as end marker. * It should be noted that as the seeds get synchronised across all processes * every process computes the same random order if used with filePerProc. * For a shared file all transfers get randomly assigned to ranks. The processes * can also have differen't numbers of transfers. This might lead to a bigger * diversion in accesse as it dose with filePerProc. This is expected but * should be mined. 
* @param test IOR_param_t for getting transferSize, blocksize and SegmentCount * @param pretendRank int pretended Rank for shifting the offsets correctly * @return IOR_offset_t * @return */ IOR_offset_t *GetOffsetArrayRandom(IOR_param_t * test, int pretendRank, int access) { int seed; IOR_offset_t i, value, tmp; IOR_offset_t offsets = 0; IOR_offset_t offsetCnt = 0; IOR_offset_t fileSize; IOR_offset_t *offsetArray; /* set up seed for random() */ if (access == WRITE || access == READ) { test->randomSeed = seed = rand(); } else { seed = test->randomSeed; } srand(seed); fileSize = test->blockSize * test->segmentCount; if (test->filePerProc == FALSE) { fileSize *= test->numTasks; } /* count needed offsets (pass 1) */ for (i = 0; i < fileSize; i += test->transferSize) { if (test->filePerProc == FALSE) { // this counts which process get how many transferes in // a shared file if ((rand() % test->numTasks) == pretendRank) { offsets++; } } else { offsets++; } } /* setup empty array */ offsetArray = (IOR_offset_t *) malloc((offsets + 1) * sizeof(IOR_offset_t)); if (offsetArray == NULL) ERR("malloc() failed"); offsetArray[offsets] = -1; /* set last offset with -1 */ if (test->filePerProc) { /* fill array */ for (i = 0; i < offsets; i++) { offsetArray[i] = i * test->transferSize; } } else { /* fill with offsets (pass 2) */ srand(seed); /* need same seed to get same transfers as counted in the beginning*/ for (i = 0; i < fileSize; i += test->transferSize) { if ((rand() % test->numTasks) == pretendRank) { offsetArray[offsetCnt] = i; offsetCnt++; } } } /* reorder array */ for (i = 0; i < offsets; i++) { value = rand() % offsets; tmp = offsetArray[value]; offsetArray[value] = offsetArray[i]; offsetArray[i] = tmp; } SeedRandGen(test->testComm); /* synchronize seeds across tasks */ return (offsetArray); } static IOR_offset_t WriteOrReadSingle(IOR_offset_t pairCnt, IOR_offset_t *offsetArray, int pretendRank, IOR_offset_t * transferCount, int * errors, IOR_param_t * test, aiori_fd_t * 
fd, IOR_io_buffers* ioBuffers, int access){ IOR_offset_t amtXferred = 0; IOR_offset_t transfer; void *buffer = ioBuffers->buffer; void *checkBuffer = ioBuffers->checkBuffer; void *readCheckBuffer = ioBuffers->readCheckBuffer; IOR_offset_t offset = offsetArray[pairCnt]; // this looks inappropriate transfer = test->transferSize; if (access == WRITE) { /* fills each transfer with a unique pattern * containing the offset into the file */ if (test->storeFileOffset == TRUE) { FillBuffer(buffer, test, offset, pretendRank); } amtXferred = backend->xfer(access, fd, buffer, transfer, offset, test->backend_options); if (amtXferred != transfer) ERR("cannot write to file"); if (test->fsyncPerWrite) backend->fsync(fd, test->backend_options); if (test->interIODelay > 0){ struct timespec wait = {test->interIODelay / 1000 / 1000, 1000l * (test->interIODelay % 1000000)}; nanosleep( & wait, NULL); } } else if (access == READ) { amtXferred = backend->xfer(access, fd, buffer, transfer, offset, test->backend_options); if (amtXferred != transfer) ERR("cannot read from file"); if (test->interIODelay > 0){ struct timespec wait = {test->interIODelay / 1000 / 1000, 1000l * (test->interIODelay % 1000000)}; nanosleep( & wait, NULL); } } else if (access == WRITECHECK) { memset(checkBuffer, 'a', transfer); if (test->storeFileOffset == TRUE) { FillBuffer(readCheckBuffer, test, offset, pretendRank); } amtXferred = backend->xfer(access, fd, checkBuffer, transfer, offset, test->backend_options); if (amtXferred != transfer) ERR("cannot read from file write check"); (*transferCount)++; *errors += CompareBuffers(readCheckBuffer, checkBuffer, transfer, *transferCount, test, WRITECHECK); } else if (access == READCHECK) { memset(checkBuffer, 'a', transfer); amtXferred = backend->xfer(access, fd, checkBuffer, transfer, offset, test->backend_options); if (amtXferred != transfer){ ERR("cannot read from file"); } if (test->storeFileOffset == TRUE) { FillBuffer(readCheckBuffer, test, offset, pretendRank); } 
*errors += CompareBuffers(readCheckBuffer, checkBuffer, transfer, *transferCount, test, READCHECK); } return amtXferred; } /* * Write or Read data to file(s). This loops through the strides, writing * out the data to each block in transfer sizes, until the remainder left is 0. */ static IOR_offset_t WriteOrRead(IOR_param_t *test, IOR_results_t *results, aiori_fd_t *fd, const int access, IOR_io_buffers *ioBuffers) { int errors = 0; IOR_offset_t transferCount = 0; uint64_t pairCnt = 0; IOR_offset_t *offsetArray; int pretendRank; IOR_offset_t dataMoved = 0; /* for data rate calculation */ double startForStonewall; int hitStonewall; IOR_point_t *point = ((access == WRITE) || (access == WRITECHECK)) ? &results->write : &results->read; /* initialize values */ pretendRank = (rank + rankOffset) % test->numTasks; if (test->randomOffset) { offsetArray = GetOffsetArrayRandom(test, pretendRank, access); } else { offsetArray = GetOffsetArraySequential(test, pretendRank); } startForStonewall = GetTimeStamp(); hitStonewall = 0; /* loop over offsets to access */ while ((offsetArray[pairCnt] != -1) && !hitStonewall ) { dataMoved += WriteOrReadSingle(pairCnt, offsetArray, pretendRank, & transferCount, & errors, test, fd, ioBuffers, access); pairCnt++; hitStonewall = ((test->deadlineForStonewalling != 0 && (GetTimeStamp() - startForStonewall) > test->deadlineForStonewalling)) || (test->stoneWallingWearOutIterations != 0 && pairCnt == test->stoneWallingWearOutIterations) ; if ( test->collective && test->deadlineForStonewalling ) { // if collective-mode, you'll get a HANG, if some rank 'accidentally' leave this loop // it absolutely must be an 'all or none': MPI_CHECK(MPI_Bcast(&hitStonewall, 1, MPI_INT, 0, MPI_COMM_WORLD), "hitStonewall broadcast failed"); } } if (test->stoneWallingWearOut){ if (verbose >= VERBOSE_1){ fprintf(out_logfile, "%d: stonewalling pairs accessed: %lld\n", rank, (long long) pairCnt); } long long data_moved_ll = (long long) dataMoved; long long 
pairs_accessed_min = 0; MPI_CHECK(MPI_Allreduce(& pairCnt, &point->pairs_accessed, 1, MPI_LONG_LONG_INT, MPI_MAX, testComm), "cannot reduce pairs moved"); double stonewall_runtime = GetTimeStamp() - startForStonewall; point->stonewall_time = stonewall_runtime; MPI_CHECK(MPI_Reduce(& pairCnt, & pairs_accessed_min, 1, MPI_LONG_LONG_INT, MPI_MIN, 0, testComm), "cannot reduce pairs moved"); MPI_CHECK(MPI_Reduce(& data_moved_ll, &point->stonewall_min_data_accessed, 1, MPI_LONG_LONG_INT, MPI_MIN, 0, testComm), "cannot reduce pairs moved"); MPI_CHECK(MPI_Reduce(& data_moved_ll, &point->stonewall_avg_data_accessed, 1, MPI_LONG_LONG_INT, MPI_SUM, 0, testComm), "cannot reduce pairs moved"); if(rank == 0){ fprintf(out_logfile, "stonewalling pairs accessed min: %lld max: %zu -- min data: %.1f GiB mean data: %.1f GiB time: %.1fs\n", pairs_accessed_min, point->pairs_accessed, point->stonewall_min_data_accessed /1024.0 / 1024 / 1024, point->stonewall_avg_data_accessed / 1024.0 / 1024 / 1024 / test->numTasks , point->stonewall_time); point->stonewall_min_data_accessed *= test->numTasks; } if(pairCnt != point->pairs_accessed){ // some work needs still to be done ! for(; pairCnt < point->pairs_accessed; pairCnt++ ) { dataMoved += WriteOrReadSingle(pairCnt, offsetArray, pretendRank, & transferCount, & errors, test, fd, ioBuffers, access); } } }else{ point->pairs_accessed = pairCnt; } totalErrorCount += CountErrors(test, access, errors); free(offsetArray); if (access == WRITE && test->fsync == TRUE) { backend->fsync(fd, test->backend_options); /*fsync after all accesses */ } return (dataMoved); }