/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
* vim:expandtab:shiftwidth=8:tabstop=8:
*/
/******************************************************************************\
* *
* Copyright (c) 2003, The Regents of the University of California *
* See the file COPYRIGHT for a complete copyright notice and license. *
* *
\******************************************************************************/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <ctype.h> /* tolower() */
#include <errno.h>
#include <math.h>
#include <mpi.h>
#include <string.h>
#if defined(HAVE_STRINGS_H)
#include <strings.h>
#endif
#include <sys/stat.h> /* struct stat */
#include <time.h>
#ifndef _WIN32
# include <sys/time.h> /* gettimeofday() */
# include <sys/utsname.h> /* uname() */
#endif
#ifdef HAVE_CUDA
#include <cuda_runtime.h>
#endif
#include <assert.h>
#include "ior.h"
#include "ior-internal.h"
#include "aiori.h"
#include "utilities.h"
#include "parse_options.h"
#define IOR_NB_TIMERS 6
/* file scope globals */
extern char **environ;
static int totalErrorCount;
static const ior_aiori_t *backend;
static void DestroyTests(IOR_test_t *tests_head);
static char *PrependDir(IOR_param_t *, char *);
static char **ParseFileName(char *, int *);
static void InitTests(IOR_test_t *);
static void TestIoSys(IOR_test_t *);
static void ValidateTests(IOR_param_t * params, MPI_Comm com);
static IOR_offset_t WriteOrRead(IOR_param_t *test, IOR_results_t *results,
aiori_fd_t *fd, const int access,
IOR_io_buffers *ioBuffers);
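/*
 * Copy the per-test parameters that a backend may care about into the
 * aiori_xfer_hint_t structure and hand them to the selected backend,
 * if it implements xfer_hints().
 */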
static void ior_set_xfer_hints(IOR_param_t * p){
aiori_xfer_hint_t * hints = & p->hints;
hints->dryRun = p->dryRun;
hints->filePerProc = p->filePerProc;
hints->collective = p->collective;
hints->numTasks = p->numTasks;
hints->numNodes = p->numNodes;
hints->randomOffset = p->randomOffset;
hints->fsyncPerWrite = p->fsyncPerWrite;
hints->segmentCount = p->segmentCount;
hints->blockSize = p->blockSize;
hints->transferSize = p->transferSize;
hints->expectedAggFileSize = p->expectedAggFileSize;
hints->singleXferAttempt = p->singleXferAttempt;
if(backend->xfer_hints){
backend->xfer_hints(hints);
}
}
int aiori_warning_as_errors = 0;
/*
Returns 1 if the process participates in the test
*/
static int test_initialize(IOR_test_t * test){
int range[3];
IOR_param_t *params = &test->params;
MPI_Group orig_group, new_group;
/* set up communicator for test */
MPI_CHECK(MPI_Comm_group(params->mpi_comm_world, &orig_group),
"MPI_Comm_group() error");
range[0] = 0; /* first rank */
range[1] = params->numTasks - 1; /* last rank */
range[2] = 1; /* stride */
MPI_CHECK(MPI_Group_range_incl(orig_group, 1, &range, &new_group),
"MPI_Group_range_incl() error");
MPI_CHECK(MPI_Comm_create(params->mpi_comm_world, new_group, & params->testComm),
"MPI_Comm_create() error");
MPI_CHECK(MPI_Group_free(&orig_group), "MPI_Group_Free() error");
MPI_CHECK(MPI_Group_free(&new_group), "MPI_Group_Free() error");
if (params->testComm == MPI_COMM_NULL) {
/* tasks not in the group do not participate in this test; this matches the processes that participate in test_finalize() */
MPI_CHECK(MPI_Barrier(params->mpi_comm_world), "barrier error");
return 0;
}
/* Setup global variables */
testComm = params->testComm;
verbose = test->params.verbose;
backend = test->params.backend;
#ifdef HAVE_CUDA
cudaError_t cret = cudaSetDevice(test->params.gpuID);
if(cret != cudaSuccess){
EWARNF("cudaSetDevice(%d) error: %s", test->params.gpuID, cudaGetErrorString(cret));
}
#endif
if(backend->initialize){
backend->initialize(test->params.backend_options);
}
ior_set_xfer_hints(& test->params);
aiori_warning_as_errors = test->params.warningAsErrors;
if (rank == 0 && verbose >= VERBOSE_0) {
ShowTestStart(& test->params);
}
return 1;
}
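/*
 * Counterpart to test_initialize(): let the backend clean up, synchronize
 * all tasks in mpi_comm_world, and release the per-test communicator.
 */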
static void test_finalize(IOR_test_t * test){
backend = test->params.backend;
if(backend->finalize){
backend->finalize(test->params.backend_options);
}
MPI_CHECK(MPI_Barrier(test->params.mpi_comm_world), "barrier error");
MPI_CHECK(MPI_Comm_free(& testComm), "MPI_Comm_free() error");
}
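/*
 * Run IOR as a library call: parse the argv-style options, execute every
 * configured test on world_com, and return the head of the test list with
 * results attached. MPI must already be initialized by the caller.
 * A minimal embedding might look like (sketch only, error handling omitted):
 *
 *   MPI_Init(&argc, &argv);
 *   IOR_test_t *results = ior_run(argc, argv, MPI_COMM_WORLD, stdout);
 *   MPI_Finalize();
 */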
IOR_test_t * ior_run(int argc, char **argv, MPI_Comm world_com, FILE * world_out){
IOR_test_t *tests_head;
IOR_test_t *tptr;
out_logfile = world_out;
out_resultfile = world_out;
MPI_CHECK(MPI_Comm_rank(world_com, &rank), "cannot get rank");
/* setup tests, and validate parameters */
tests_head = ParseCommandLine(argc, argv, world_com);
InitTests(tests_head);
PrintHeader(argc, argv);
/* perform each test */
for (tptr = tests_head; tptr != NULL; tptr = tptr->next) {
int participate = test_initialize(tptr);
if( ! participate ) continue;
totalErrorCount = 0;
TestIoSys(tptr);
tptr->results->errors = totalErrorCount;
ShowTestEnd(tptr);
test_finalize(tptr);
}
PrintLongSummaryAllTests(tests_head);
/* display finish time */
PrintTestEnds();
return tests_head;
}
int ior_main(int argc, char **argv)
{
IOR_test_t *tests_head;
IOR_test_t *tptr;
out_logfile = stdout;
out_resultfile = stdout;
/*
* check -h option from commandline without starting MPI;
*/
tests_head = ParseCommandLine(argc, argv, MPI_COMM_WORLD);
/* start the MPI code */
MPI_CHECK(MPI_Init(&argc, &argv), "cannot initialize MPI");
MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &rank), "cannot get rank");
/* set error-handling */
/*MPI_CHECK(MPI_Errhandler_set(mpi_comm_world, MPI_ERRORS_RETURN),
"cannot set errhandler"); */
/* setup tests, and validate parameters */
InitTests(tests_head);
PrintHeader(argc, argv);
/* perform each test */
for (tptr = tests_head; tptr != NULL; tptr = tptr->next) {
int participate = test_initialize(tptr);
if( ! participate ) continue;
// This is useful for trapping a running MPI process. While
// this is sleeping, run the script 'testing/hdfs/gdb.attach'
if (verbose >= VERBOSE_4) {
fprintf(out_logfile, "\trank %d: sleeping\n", rank);
sleep(5);
fprintf(out_logfile, "\trank %d: awake.\n", rank);
}
TestIoSys(tptr);
ShowTestEnd(tptr);
test_finalize(tptr);
}
if (verbose <= VERBOSE_0)
/* always print final summary */
verbose = VERBOSE_1;
PrintLongSummaryAllTests(tests_head);
/* display finish time */
PrintTestEnds();
MPI_CHECK(MPI_Finalize(), "cannot finalize MPI");
DestroyTests(tests_head);
return totalErrorCount;
}
/***************************** F U N C T I O N S ******************************/
/*
* Initialize an IOR_param_t structure to the defaults
*/
void init_IOR_Param_t(IOR_param_t * p, MPI_Comm com)
{
const char *default_aiori = aiori_default ();
assert (NULL != default_aiori);
memset(p, 0, sizeof(IOR_param_t));
p->api = strdup(default_aiori);
p->platform = strdup("HOST(OSTYPE)");
p->testFileName = strdup("testFile");
p->writeFile = p->readFile = FALSE;
p->checkWrite = p->checkRead = FALSE;
/*
* These can be overridden from the command-line but otherwise will be
* set from MPI.
*/
p->numTasks = -1;
p->numNodes = -1;
p->numTasksOnNode0 = -1;
p->repetitions = 1;
p->repCounter = -1;
p->open = WRITE;
p->taskPerNodeOffset = 1;
p->segmentCount = 1;
p->blockSize = 1048576;
p->transferSize = 262144;
p->randomSeed = -1;
p->incompressibleSeed = 573;
p->testComm = com; // this com might change for smaller tests
p->mpi_comm_world = com;
p->URI = NULL;
}
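/*
 * Compute the mean and standard deviation of timerVal across all tasks and
 * warn if this task's value deviates from the mean by more than
 * outlierThreshold seconds.
 */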
static void
DisplayOutliers(int numTasks,
double timerVal,
char *timeString, int access, int outlierThreshold)
{
char accessString[MAX_STR];
double sum, mean, sqrDiff, var, sd;
/* for local timerVal, don't compensate for wall clock delta */
//timerVal += wall_clock_delta;
MPI_CHECK(MPI_Allreduce
(&timerVal, &sum, 1, MPI_DOUBLE, MPI_SUM, testComm),
"MPI_Allreduce()");
mean = sum / numTasks;
sqrDiff = pow((mean - timerVal), 2);
MPI_CHECK(MPI_Allreduce
(&sqrDiff, &var, 1, MPI_DOUBLE, MPI_SUM, testComm),
"MPI_Allreduce()");
var = var / numTasks;
sd = sqrt(var);
if (access == WRITE) {
strcpy(accessString, "write");
} else { /* READ */
strcpy(accessString, "read");
}
if (fabs(timerVal - mean) > (double)outlierThreshold) {
char hostname[MAX_STR];
int ret = gethostname(hostname, MAX_STR);
if (ret != 0)
strcpy(hostname, "unknown");
EWARNF("for %s, task %d, %s %s is %f (mean=%f, stddev=%f)\n",
hostname, rank, accessString, timeString, timerVal, mean, sd);
}
}
/*
* Check for outliers in start/end times and elapsed create/xfer/close times.
*/
static void
CheckForOutliers(IOR_param_t *test, const double *timer, const int access)
{
DisplayOutliers(test->numTasks, timer[0],
"start time", access, test->outlierThreshold);
DisplayOutliers(test->numTasks,
timer[1] - timer[0],
"elapsed create time", access, test->outlierThreshold);
DisplayOutliers(test->numTasks,
timer[3] - timer[2],
"elapsed transfer time", access,
test->outlierThreshold);
DisplayOutliers(test->numTasks,
timer[5] - timer[4],
"elapsed close time", access, test->outlierThreshold);
DisplayOutliers(test->numTasks, timer[5], "end time",
access, test->outlierThreshold);
}
/*
* Check if actual file size equals expected size; if not use actual for
* calculating performance rate.
*/
static void CheckFileSize(IOR_test_t *test, char * testFilename, IOR_offset_t dataMoved, int rep, const int access)
{
IOR_param_t *params = &test->params;
IOR_results_t *results = test->results;
IOR_point_t *point = (access == WRITE) ? &results[rep].write :
&results[rep].read;
/* get the size of the file */
IOR_offset_t aggFileSizeFromStat, tmpMin, tmpMax, tmpSum;
aggFileSizeFromStat = backend->get_file_size(params->backend_options, testFilename);
if (params->hints.filePerProc == TRUE) {
MPI_CHECK(MPI_Allreduce(&aggFileSizeFromStat, &tmpSum, 1,
MPI_LONG_LONG_INT, MPI_SUM, testComm),
"cannot reduce total data moved");
aggFileSizeFromStat = tmpSum;
} else {
MPI_CHECK(MPI_Allreduce(&aggFileSizeFromStat, &tmpMin, 1,
MPI_LONG_LONG_INT, MPI_MIN, testComm),
"cannot reduce total data moved");
MPI_CHECK(MPI_Allreduce(&aggFileSizeFromStat, &tmpMax, 1,
MPI_LONG_LONG_INT, MPI_MAX, testComm),
"cannot reduce total data moved");
if (tmpMin != tmpMax) {
if (rank == 0) {
WARN("inconsistent file size by different tasks");
}
/* incorrect, but now consistent across tasks */
aggFileSizeFromStat = tmpMin;
}
}
point->aggFileSizeFromStat = aggFileSizeFromStat;
MPI_CHECK(MPI_Allreduce(&dataMoved, &point->aggFileSizeFromXfer,
1, MPI_LONG_LONG_INT, MPI_SUM, testComm),
"cannot total data moved");
if (strcasecmp(params->api, "HDF5") != 0 && strcasecmp(params->api, "NCMPI") != 0) {
if (verbose >= VERBOSE_0 && rank == 0) {
if ((params->expectedAggFileSize
!= point->aggFileSizeFromXfer)
|| (point->aggFileSizeFromStat
!= point->aggFileSizeFromXfer)) {
EWARNF("Expected aggregate file size = %lld", (long long) params->expectedAggFileSize);
EWARNF("Stat() of aggregate file size = %lld", (long long) point->aggFileSizeFromStat);
EWARNF("Using actual aggregate bytes moved = %lld", (long long) point->aggFileSizeFromXfer);
if(params->deadlineForStonewalling){
EWARN("Maybe caused by deadlineForStonewalling");
}
}
}
}
point->aggFileSizeForBW = point->aggFileSizeFromXfer;
}
/*
* Compare buffers after reading/writing each transfer. Displays only first
* difference in buffers and returns total errors counted.
*/
static size_t
CompareData(void *expectedBuffer, size_t size, IOR_offset_t transferCount, IOR_param_t *test, IOR_offset_t offset, int fillrank, int access)
{
assert(access == WRITECHECK || access == READCHECK);
return verify_memory_pattern(offset, expectedBuffer, transferCount, test->setTimeStampSignature, fillrank, test->dataPacketType);
}
/*
* Count all errors across all tasks; report errors found.
*/
static int CountErrors(IOR_param_t * test, int access, int errors)
{
int allErrors = 0;
if (test->checkWrite || test->checkRead) {
MPI_CHECK(MPI_Reduce(&errors, &allErrors, 1, MPI_INT, MPI_SUM,
0, testComm), "cannot reduce errors");
MPI_CHECK(MPI_Bcast(&allErrors, 1, MPI_INT, 0, testComm),
"cannot broadcast allErrors value");
if (allErrors != 0) {
totalErrorCount += allErrors;
test->errorFound = TRUE;
}
if (rank == 0 && allErrors != 0) {
if (allErrors < 0) {
WARN("overflow in errors counted");
allErrors = -1;
}
EWARNF("Incorrect data on %s (%d errors found).\n",
access == WRITECHECK ? "write" : "read", allErrors);
fprintf(out_logfile,
"Used Time Stamp %u (0x%x) for Data Signature\n",
test->timeStampSignatureValue,
test->timeStampSignatureValue);
}
}
return (allErrors);
}
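/*
 * Lazily allocate (and later free) the per-repetition results array of a test.
 */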
void AllocResults(IOR_test_t *test)
{
int reps;
if (test->results != NULL)
return;
reps = test->params.repetitions;
test->results = (IOR_results_t *) safeMalloc(sizeof(IOR_results_t) * reps);
}
void FreeResults(IOR_test_t *test)
{
if (test->results != NULL) {
free(test->results);
}
}
/**
* Create new test for list of tests.
*/
IOR_test_t *CreateTest(IOR_param_t *init_params, int test_num)
{
IOR_test_t *newTest = NULL;
newTest = (IOR_test_t *) malloc(sizeof(IOR_test_t));
if (newTest == NULL)
ERR("malloc() of IOR_test_t failed");
newTest->params = *init_params;
newTest->params.platform = GetPlatformName();
newTest->params.id = test_num;
newTest->next = NULL;
newTest->results = NULL;
return newTest;
}
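/*
 * Free a single test and its results; DestroyTests() walks the whole list.
 */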
static void DestroyTest(IOR_test_t *test)
{
FreeResults(test);
free(test);
}
static void DestroyTests(IOR_test_t *tests_head)
{
IOR_test_t *tptr, *next;
for (tptr = tests_head; tptr != NULL; tptr = next) {
next = tptr->next;
DestroyTest(tptr);
}
}
/*
* Distribute IOR_HINTs to all tasks' environments.
*/
static void DistributeHints(MPI_Comm com)
{
char hint[MAX_HINTS][MAX_STR], fullHint[MAX_STR], hintVariable[MAX_STR];
int hintCount = 0, i;
if (rank == 0) {
for (i = 0; environ[i] != NULL; i++) {
if (strncmp(environ[i], "IOR_HINT", strlen("IOR_HINT"))
== 0) {
hintCount++;
if (hintCount == MAX_HINTS) {
WARN("exceeded max hints; reset MAX_HINTS and recompile");
hintCount = MAX_HINTS;
break;
}
/* assume no IOR_HINT is greater than MAX_STR in length */
strncpy(hint[hintCount - 1], environ[i],
MAX_STR - 1);
}
}
}
MPI_CHECK(MPI_Bcast(&hintCount, sizeof(hintCount), MPI_BYTE, 0, com), "cannot broadcast hints");
for (i = 0; i < hintCount; i++) {
MPI_CHECK(MPI_Bcast(&hint[i], MAX_STR, MPI_BYTE, 0, com),
"cannot broadcast hints");
strcpy(fullHint, hint[i]);
strcpy(hintVariable, strtok(fullHint, "="));
if (getenv(hintVariable) == NULL) {
/* doesn't exist in this task's environment; better set it */
if (putenv(hint[i]) != 0)
WARN("cannot set environment variable");
}
}
}
/*
* Return string describing machine name and type.
*/
char * GetPlatformName()
{
char nodeName[MAX_STR], *p, *start, sysName[MAX_STR];
char platformName[MAX_STR];
struct utsname name;
if (uname(&name) != 0) {
EWARN("cannot get platform name");
sprintf(sysName, "%s", "Unknown");
sprintf(nodeName, "%s", "Unknown");
} else {
sprintf(sysName, "%s", name.sysname);
sprintf(nodeName, "%s", name.nodename);
}
start = nodeName;
if (strlen(nodeName) == 0) {
p = start;
} else {
/* point to one character back from '\0' */
p = start + strlen(nodeName) - 1;
}
/*
* to cut off trailing node number, search backwards
* for the first non-numeric character
*/
while (p != start) {
if (*p < '0' || *p > '9') {
*(p + 1) = '\0';
break;
} else {
p--;
}
}
sprintf(platformName, "%s(%s)", nodeName, sysName);
return strdup(platformName);
}
/*
* Parse file name.
*/
static char **ParseFileName(char *name, int *count)
{
char **fileNames, *tmp, *token;
char delimiterString[3] = { FILENAME_DELIMITER, '\n', '\0' };
int i = 0;
*count = 0;
tmp = name;
/* pass one */
/* if something there, count the first item */
if (*tmp != '\0') {
(*count)++;
}
/* count the rest of the filenames */
while (*tmp != '\0') {
if (*tmp == FILENAME_DELIMITER) {
(*count)++;
}
tmp++;
}
fileNames = (char **)malloc((*count) * sizeof(char **));
if (fileNames == NULL)
ERR("out of memory");
/* pass two */
token = strtok(name, delimiterString);
while (token != NULL) {
fileNames[i] = token;
token = strtok(NULL, delimiterString);
i++;
}
return (fileNames);
}
/*
* Return test file name to access.
* For a single shared file, fileNames[0] is returned in testFileName.
*/
void GetTestFileName(char *testFileName, IOR_param_t * test)
{
char **fileNames;
char initialTestFileName[MAX_PATHLEN];
char testFileNameRoot[MAX_STR];
char tmpString[MAX_STR];
int count;
int socket, core;
/* parse filename for multiple file systems */
strcpy(initialTestFileName, test->testFileName);
if(test->dualMount){
GetProcessorAndCore(&socket, &core);
sprintf(tmpString, "%s%d/%s",initialTestFileName, socket, "data");
strcpy(initialTestFileName, tmpString);
}
fileNames = ParseFileName(initialTestFileName, &count);
if (count > 1 && test->uniqueDir == TRUE)
ERR("cannot use multiple file names with unique directories");
if (test->filePerProc) {
strcpy(testFileNameRoot,
fileNames[((rank +
rankOffset) % test->numTasks) % count]);
} else {
strcpy(testFileNameRoot, fileNames[0]);
}
/* give unique name if using multiple files */
if (test->filePerProc) {
/*
* prepend rank subdirectory before filename
* e.g., /dir/file => /dir/<rank>/file
*/
if (test->uniqueDir == TRUE) {
strcpy(testFileNameRoot,
PrependDir(test, testFileNameRoot));
}
sprintf(testFileName, "%s.%08d", testFileNameRoot,
(rank + rankOffset) % test->numTasks);
} else {
strcpy(testFileName, testFileNameRoot);
}
/* add suffix for multiple files */
if (test->repCounter > -1) {
sprintf(tmpString, ".%d", test->repCounter);
strcat(testFileName, tmpString);
}
free (fileNames);
}
/*
* From absolute directory, insert rank as subdirectory. Allows each task
* to write to its own directory. E.g., /dir/file => /dir/<rank>/file.
*/
static char *PrependDir(IOR_param_t * test, char *rootDir)
{
char *dir;
char *fname;
int i;
dir = (char *)malloc(MAX_STR + 1);
if (dir == NULL)
ERR("out of memory");
/* get dir name */
strcpy(dir, rootDir);
i = strlen(dir) - 1;
while (i > 0) {
if (dir[i] == '\0' || dir[i] == '/') {
dir[i] = '/';
dir[i + 1] = '\0';
break;
}
i--;
}
/* get file name */
fname = rootDir + i + 1;
/* create directory with rank as subdirectory */
sprintf(dir + i + 1, "%d", (rank + rankOffset) % test->numTasks);
/* dir doesn't exist, so create */
if (backend->access(dir, F_OK, test->backend_options) != 0) {
if (backend->mkdir(dir, S_IRWXU, test->backend_options) < 0) {
ERRF("cannot create directory: %s", dir);
}
/* check if correct permissions */
} else if (backend->access(dir, R_OK, test->backend_options) != 0 ||
backend->access(dir, W_OK, test->backend_options) != 0 ||
backend->access(dir, X_OK, test->backend_options) != 0) {
ERRF("invalid directory permissions: %s", dir);
}
/* concatenate dir and file names */
strcat(dir, "/");
strcat(dir, fname);
return dir;
}
/******************************************************************************/
/*
* Reduce test results, and show if verbose set.
*/
static void
ReduceIterResults(IOR_test_t *test, double *timer, const int rep, const int access)
{
double reduced[IOR_NB_TIMERS] = { 0 };
double diff[IOR_NB_TIMERS / 2 + 1];
double totalTime, accessTime;
IOR_param_t *params = &test->params;
double bw, iops, latency, minlatency;
int i;
MPI_Op op;
assert(access == WRITE || access == READ);
/* Find the minimum start time of the even numbered timers, and the
maximum finish time for the odd numbered timers */
for (i = 0; i < IOR_NB_TIMERS; i++) {
op = i % 2 ? MPI_MAX : MPI_MIN;
MPI_CHECK(MPI_Reduce(&timer[i], &reduced[i], 1, MPI_DOUBLE,
op, 0, testComm), "MPI_Reduce()");
}
/* Calculate elapsed times and throughput numbers */
for (i = 0; i < IOR_NB_TIMERS / 2; i++)
diff[i] = reduced[2 * i + 1] - reduced[2 * i];
totalTime = reduced[5] - reduced[0];
accessTime = reduced[3] - reduced[2];
IOR_point_t *point = (access == WRITE) ? &test->results[rep].write :
&test->results[rep].read;
point->time = totalTime;
if (verbose < VERBOSE_0)
return;
bw = (double)point->aggFileSizeForBW / totalTime;
/* For IOPS in this iteration, we divide the total amount of IOs from
* all ranks over the entire access time (first start -> last end). */
iops = (point->aggFileSizeForBW / params->transferSize) / accessTime;
/* For Latency, we divide the total access time for each task over the
* number of I/Os issued from that task; then reduce and display the
* minimum (best) latency achieved. So what is reported is the average
* latency of all ops from a single task, then taking the minimum of
* that between all tasks. */
latency = (timer[3] - timer[2]) / (params->blockSize / params->transferSize);
MPI_CHECK(MPI_Reduce(&latency, &minlatency, 1, MPI_DOUBLE,
MPI_MIN, 0, testComm), "MPI_Reduce()");
/* Only rank 0 tallies and prints the results. */
if (rank != 0)
return;
PrintReducedResult(test, access, bw, iops, latency, diff, totalTime, rep);
}
/*
* Check for file(s), then remove all files if file-per-proc, else single file.
*
*/
static void RemoveFile(char *testFileName, int filePerProc, IOR_param_t * test)
{
int tmpRankOffset = 0;
if (filePerProc) {
/* in random tasks, delete own file */
if (test->reorderTasksRandom == TRUE) {
tmpRankOffset = rankOffset;
rankOffset = 0;
GetTestFileName(testFileName, test);
}
if (backend->access(testFileName, F_OK, test->backend_options) == 0) {
if (verbose >= VERBOSE_3) {
fprintf(out_logfile, "task %d removing %s\n", rank,
testFileName);
}
backend->delete(testFileName, test->backend_options);
}
if (test->reorderTasksRandom == TRUE) {
rankOffset = tmpRankOffset;
GetTestFileName(testFileName, test);
}
} else {
if ((rank == 0) && (backend->access(testFileName, F_OK, test->backend_options) == 0)) {
if (verbose >= VERBOSE_3) {
fprintf(out_logfile, "task %d removing %s\n", rank,
testFileName);
}
backend->delete(testFileName, test->backend_options);
}
}
}
/*
* Setup tests by parsing commandline and creating test script.
* Perform a sanity-check on the configured parameters.
*/
static void InitTests(IOR_test_t *tests)
{
if(tests == NULL){
return;
}
MPI_Comm com = tests->params.mpi_comm_world;
int mpiNumNodes = 0;
int mpiNumTasks = 0;
int mpiNumTasksOnNode0 = 0;
verbose = tests->params.verbose;
aiori_warning_as_errors = tests->params.warningAsErrors;
/*
* These default values are the same for every test and expensive to
* retrieve so just do it once.
*/
mpiNumNodes = GetNumNodes(com);
mpiNumTasks = GetNumTasks(com);
mpiNumTasksOnNode0 = GetNumTasksOnNode0(com);
/*
* Since there is no guarantee that anyone other than
* task 0 has the environment settings for the hints, pass
* the hint=value pair to everyone else in mpi_comm_world
*/
DistributeHints(com);
/* check validity of tests and create test queue */
while (tests != NULL) {
IOR_param_t *params = & tests->params;
params->testComm = com;
/* use MPI values if not overridden on command-line */
if (params->numNodes == -1) {
params->numNodes = mpiNumNodes;
}
if (params->numTasks == -1) {
params->numTasks = mpiNumTasks;
} else if (params->numTasks > mpiNumTasks) {
if (rank == 0) {
EWARNF("More tasks requested (%d) than available (%d),",
params->numTasks, mpiNumTasks);
EWARNF(" running with %d tasks.\n", mpiNumTasks);
}
params->numTasks = mpiNumTasks;
}
if (params->numTasksOnNode0 == -1) {
params->numTasksOnNode0 = mpiNumTasksOnNode0;
}
params->tasksBlockMapping = QueryNodeMapping(com,false);
params->expectedAggFileSize =
params->blockSize * params->segmentCount * params->numTasks;
ValidateTests(&tests->params, com);
tests = tests->next;
}
init_clock(com);
}
/*
* Setup transfer buffers, creating and filling as needed.
*/
static void XferBuffersSetup(IOR_io_buffers* ioBuffers, IOR_param_t* test,
int pretendRank)
{
ioBuffers->buffer = aligned_buffer_alloc(test->transferSize, test->gpuMemoryFlags);
}
/*
* Free transfer buffers.
*/
static void XferBuffersFree(IOR_io_buffers* ioBuffers, IOR_param_t* test)
{
aligned_buffer_free(ioBuffers->buffer, test->gpuMemoryFlags);
}
/*
* malloc a buffer, touching every page in an attempt to defeat lazy allocation.
*/
static void *malloc_and_touch(size_t size)
{
size_t page_size;
char *buf;
char *ptr;
if (size == 0)
return NULL;
page_size = sysconf(_SC_PAGESIZE);
buf = (char *)malloc(size);
if (buf == NULL)
return NULL;
for (ptr = buf; ptr < buf+size; ptr += page_size) {
*ptr = (char)1;
}
return (void *)buf;
}
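/*
 * Gather each task's rankOffset on rank 0 and print the "#File Hits Dist"
 * line: a histogram of how many files are read by 0, 1, 2, ... tasks after
 * task reordering.
 */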
static void file_hits_histogram(IOR_param_t *params)
{
int *rankoffs = NULL;
int *filecont = NULL;
int *filehits = NULL;
int ifile;
int jfile;
if (rank == 0) {
rankoffs = (int *)malloc(params->numTasks * sizeof(int));
filecont = (int *)malloc(params->numTasks * sizeof(int));
filehits = (int *)malloc(params->numTasks * sizeof(int));
}
MPI_CHECK(MPI_Gather(&rankOffset, 1, MPI_INT, rankoffs,
1, MPI_INT, 0, params->testComm),
"MPI_Gather error");
if (rank != 0)
return;
memset((void *)filecont, 0, params->numTasks * sizeof(int));
for (ifile = 0; ifile < params->numTasks; ifile++) {
filecont[(ifile + rankoffs[ifile]) % params->numTasks]++;
}
memset((void *)filehits, 0, params->numTasks * sizeof(int));
for (ifile = 0; ifile < params->numTasks; ifile++)
for (jfile = 0; jfile < params->numTasks; jfile++) {
if (ifile == filecont[jfile])
filehits[ifile]++;
}
fprintf(out_logfile, "#File Hits Dist:");
jfile = 0;
ifile = 0;
while (jfile < params->numTasks && ifile < params->numTasks) {
fprintf(out_logfile, " %d", filehits[ifile]);
jfile += filehits[ifile], ifile++;
}
fprintf(out_logfile, "\n");
free(rankoffs);
free(filecont);
free(filehits);
}
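/*
 * Return nonzero once maxTimeDuration (in minutes) has elapsed since
 * startTime; a maxTimeDuration of 0 disables the time limit.
 */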
int test_time_elapsed(IOR_param_t *params, double startTime)
{
double endTime;
if (params->maxTimeDuration == 0)
return 0;
endTime = startTime + (params->maxTimeDuration * 60);
return GetTimeStamp() >= endTime;
}
/*
* hog some memory as a rough simulation of a real application's memory use
*/
static void *HogMemory(IOR_param_t *params)
{
size_t size;
void *buf;
if (params->memoryPerTask != 0) {
size = params->memoryPerTask;
} else if (params->memoryPerNode != 0) {
if (verbose >= VERBOSE_3)
fprintf(out_logfile, "This node hogging %ld bytes of memory\n",
params->memoryPerNode);
size = params->memoryPerNode / params->numTasksOnNode0;
} else {
return NULL;
}
if (verbose >= VERBOSE_3)
fprintf(out_logfile, "This task hogging %ld bytes of memory\n", size);
buf = malloc_and_touch(size);
if (buf == NULL)
ERR("malloc of simulated applciation buffer failed");
return buf;
}
/*
* Write times taken during each iteration of the test.
*/
static void
WriteTimes(IOR_param_t *test, const double *timer, const int iteration,
const int access)
{
char timerName[MAX_STR];
for (int i = 0; i < IOR_NB_TIMERS; i++) {
if (access == WRITE) {
switch (i) {
case 0:
strcpy(timerName, "write open start");
break;
case 1:
strcpy(timerName, "write open stop");
break;
case 2:
strcpy(timerName, "write start");
break;
case 3:
strcpy(timerName, "write stop");
break;
case 4:
strcpy(timerName, "write close start");
break;
case 5:
strcpy(timerName, "write close stop");
break;
default:
strcpy(timerName, "invalid timer");
break;
}
}
else {
switch (i) {
case 0:
strcpy(timerName, "read open start");
break;
case 1:
strcpy(timerName, "read open stop");
break;
case 2:
strcpy(timerName, "read start");
break;
case 3:
strcpy(timerName, "read stop");
break;
case 4:
strcpy(timerName, "read close start");
break;
case 5:
strcpy(timerName, "read close stop");
break;
default:
strcpy(timerName, "invalid timer");
break;
}
}
fprintf(out_logfile, "Test %d: Iter=%d, Task=%d, Time=%f, %s\n",
test->id, iteration, (int)rank, timer[i],
timerName);
}
}
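/*
 * Gather each rank's total and transfer times on rank 0 and append one CSV
 * line per rank (access type, rank, times, derived bandwidths) to the file
 * named by saveRankDetailsCSV.
 */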
static void StoreRankInformation(IOR_test_t *test, double *timer, const int rep, const int access){
IOR_param_t *params = &test->params;
double totalTime = timer[5] - timer[0];
double accessTime = timer[3] - timer[2];
double times[] = {totalTime, accessTime};
if(rank == 0){
FILE* fd = fopen(params->saveRankDetailsCSV, "a");
if (fd == NULL){
FAIL("Cannot open saveRankPerformanceDetailsCSV file for writes!");
}
int size;
MPI_Comm_size(params->testComm, & size);
double *all_times = malloc(2* size * sizeof(double));
MPI_Gather(times, 2, MPI_DOUBLE, all_times, 2, MPI_DOUBLE, 0, params->testComm);
IOR_point_t *point = (access == WRITE) ? &test->results[rep].write : &test->results[rep].read;
double file_size = ((double) point->aggFileSizeForBW) / size;
for(int i=0; i < size; i++){
char buff[1024];
sprintf(buff, "%s,%d,%.10e,%.10e,%.10e,%.10e\n", access==WRITE ? "write" : "read", i, all_times[i*2], all_times[i*2+1], file_size/all_times[i*2], file_size/all_times[i*2+1] );
int ret = fwrite(buff, strlen(buff), 1, fd);
if(ret != 1){
WARN("Couln't append to saveRankPerformanceDetailsCSV file\n");
break;
}
}
fclose(fd);
}else{
MPI_Gather(& times, 2, MPI_DOUBLE, NULL, 2, MPI_DOUBLE, 0, testComm);
}
}
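/*
 * Post-process one repetition: optionally dump raw timer values, reduce the
 * results across tasks, flag outliers if a threshold is set, and store
 * per-rank details when requested.
 */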
static void ProcessIterResults(IOR_test_t *test, double *timer, const int rep, const int access){
IOR_param_t *params = &test->params;
if (verbose >= VERBOSE_3)
WriteTimes(params, timer, rep, access);
ReduceIterResults(test, timer, rep, access);
if (params->outlierThreshold) {
CheckForOutliers(params, timer, access);
}
if(params->saveRankDetailsCSV){
StoreRankInformation(test, timer, rep, access);
}
}
/*
* Using the test parameters, run iteration(s) of single test.
*/
static void TestIoSys(IOR_test_t *test)
{
IOR_param_t *params = &test->params;
IOR_results_t *results = test->results;
char testFileName[MAX_STR];
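/* timer[0..5]: open start/stop, transfer start/stop, close start/stop */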
double timer[IOR_NB_TIMERS];
double startTime;
int pretendRank;
int rep;
aiori_fd_t *fd;
IOR_offset_t dataMoved; /* for data rate calculation */
void *hog_buf;
IOR_io_buffers ioBuffers;
if (rank == 0 && verbose >= VERBOSE_1) {
fprintf(out_logfile, "Participating tasks : %d\n", params->numTasks);
fflush(out_logfile);
}
if (rank == 0 && params->reorderTasks == TRUE && verbose >= VERBOSE_1) {
fprintf(out_logfile,
"Using reorderTasks '-C' (useful to avoid read cache in client)\n");
fflush(out_logfile);
}
/* show test setup */
if (rank == 0 && verbose >= VERBOSE_0)
ShowSetup(params);
hog_buf = HogMemory(params);
pretendRank = (rank + rankOffset) % params->numTasks;
/* IO Buffer Setup */
if (params->setTimeStampSignature) { // initialize the buffer properly
params->timeStampSignatureValue = (unsigned int) params->setTimeStampSignature;
}
XferBuffersSetup(&ioBuffers, params, pretendRank);
/* Initial time stamp */
startTime = GetTimeStamp();
/* loop over test iterations */
uint64_t params_saved_wearout = params->stoneWallingWearOutIterations;
/* Check if the file exists and warn users */
if((params->writeFile || params->checkWrite) && (params->hints.filePerProc || rank == 0)){
struct stat sb;
GetTestFileName(testFileName, params);
int ret = backend->stat(testFileName, & sb, params->backend_options);
if(ret == 0) {
EWARNF("The file \"%s\" exists already and will be overwritten", testFileName);
}
}
for (rep = 0; rep < params->repetitions; rep++) {
/* Get iteration start time in seconds in task 0 and broadcast to
all tasks */
if (rank == 0) {
if (! params->setTimeStampSignature) {
time_t currentTime;
if ((currentTime = time(NULL)) == -1) {
ERR("cannot get current time");
}
params->timeStampSignatureValue =
(unsigned int)currentTime;
}
if (verbose >= VERBOSE_2) {
fprintf(out_logfile,
"Using Time Stamp %u (0x%x) for Data Signature\n",
params->timeStampSignatureValue,
params->timeStampSignatureValue);
}
if (rep == 0 && verbose >= VERBOSE_0) {
PrintTableHeader();
}
}
MPI_CHECK(MPI_Bcast
(&params->timeStampSignatureValue, 1, MPI_UNSIGNED, 0,
testComm), "cannot broadcast start time value");
generate_memory_pattern((char*) ioBuffers.buffer, params->transferSize, params->setTimeStampSignature, pretendRank, params->dataPacketType);
/* use repetition count for number of multiple files */
if (params->multiFile)
params->repCounter = rep;
/*
* write the file(s), getting timing between I/O calls
*/
if (params->writeFile && !test_time_elapsed(params, startTime)) {
GetTestFileName(testFileName, params);
if (verbose >= VERBOSE_3) {
fprintf(out_logfile, "task %d writing %s\n", rank,
testFileName);
}
DelaySecs(params->interTestDelay);
if (params->useExistingTestFile == FALSE) {
RemoveFile(testFileName, params->filePerProc,
params);
}
params->stoneWallingWearOutIterations = params_saved_wearout;
MPI_CHECK(MPI_Barrier(testComm), "barrier error");
params->open = WRITE;
timer[0] = GetTimeStamp();
fd = backend->create(testFileName, IOR_WRONLY | IOR_CREAT | IOR_TRUNC, params->backend_options);
if(fd == NULL) FAIL("Cannot create file");
timer[1] = GetTimeStamp();
if (params->intraTestBarriers)
MPI_CHECK(MPI_Barrier(testComm),
"barrier error");
if (rank == 0 && verbose >= VERBOSE_1) {
fprintf(out_logfile,
"Commencing write performance test: %s",
CurrentTimeString());
}
timer[2] = GetTimeStamp();
dataMoved = WriteOrRead(params, &results[rep], fd, WRITE, &ioBuffers);
if (params->verbose >= VERBOSE_4) {
fprintf(out_logfile, "* data moved = %llu\n", dataMoved);
fflush(out_logfile);
}
timer[3] = GetTimeStamp();
if (params->intraTestBarriers)
MPI_CHECK(MPI_Barrier(testComm),
"barrier error");
timer[4] = GetTimeStamp();
backend->close(fd, params->backend_options);
timer[5] = GetTimeStamp();
MPI_CHECK(MPI_Barrier(testComm), "barrier error");
/* check if stat() of file doesn't equal expected file size,
use actual amount of byte moved */
CheckFileSize(test, testFileName, dataMoved, rep, WRITE);
ProcessIterResults(test, timer, rep, WRITE);
/* check if in this round we run write with stonewalling */
if(params->deadlineForStonewalling > 0){
params->stoneWallingWearOutIterations = results[rep].write.pairs_accessed;
}
}
/*
* perform a check of data, reading back data and comparing
* against what was expected to be written
*/
if (params->checkWrite && !test_time_elapsed(params, startTime)) {
MPI_CHECK(MPI_Barrier(testComm), "barrier error");
if (rank == 0 && verbose >= VERBOSE_1) {
fprintf(out_logfile,
"Verifying contents of the file(s) just written.\n");
fprintf(out_logfile, "%s\n", CurrentTimeString());
}
if (params->reorderTasks) {
/* move two nodes away from writing node */
int shift = 1; /* assume a by-node (round-robin) mapping of tasks to nodes */
if (params->tasksBlockMapping) {
shift = params->numTasksOnNode0; /* switch to by-slot (contiguous block) mapping */
}
rankOffset = (2 * shift) % params->numTasks;
}
GetTestFileName(testFileName, params);
params->open = WRITECHECK;
fd = backend->open(testFileName, IOR_RDONLY, params->backend_options);
if(fd == NULL) FAIL("Cannot open file");
dataMoved = WriteOrRead(params, &results[rep], fd, WRITECHECK, &ioBuffers);
backend->close(fd, params->backend_options);
rankOffset = 0;
}
/*
* read the file(s), getting timing between I/O calls
*/
if ((params->readFile || params->checkRead ) && !test_time_elapsed(params, startTime)) {
/* check for stonewall */
if(params->stoneWallingStatusFile){
params->stoneWallingWearOutIterations = ReadStoneWallingIterations(params->stoneWallingStatusFile, params->testComm);
if(params->stoneWallingWearOutIterations == -1 && rank == 0){
WARN("Could not read back the stonewalling status from the file!");
params->stoneWallingWearOutIterations = 0;
}
}
int operation_flag = READ;
if ( params->checkRead ){
// actually read and then compare the buffer
operation_flag = READCHECK;
}
/* Get rankOffset [file offset] for this process to read, based on -C,-Z,-Q,-X options */
/* Constant process offset reading */
if (params->reorderTasks) {
/* move one node away from writing node */
int shift = 1; /* assume a by-node (round-robin) mapping of tasks to nodes */
if (params->tasksBlockMapping) {
shift=params->numTasksOnNode0; /* switch to a by-slot (contiguous block) mapping */
}
rankOffset = (params->taskPerNodeOffset * shift) % params->numTasks;
}
/* random process offset reading */
if (params->reorderTasksRandom) {
                        /* this should not interfere with randomOffset within a file because GetOffsetArrayRandom */
                        /* seeds every rand() call */
int nodeoffset;
unsigned int iseed0;
nodeoffset = params->taskPerNodeOffset;
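                        /* clamp the node offset so it stays within the number of nodes in the run */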
nodeoffset = (nodeoffset < params->numNodes) ? nodeoffset : params->numNodes - 1;
if (params->reorderTasksRandomSeed < 0)
iseed0 = -1 * params->reorderTasksRandomSeed + rep;
else
iseed0 = params->reorderTasksRandomSeed;
srand(rank + iseed0);
{
rankOffset = rand() % params->numTasks;
}
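                        /* re-draw until the offset is at least 'nodeoffset' nodes' worth of tasks away from the writing rank */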
while (rankOffset <
(nodeoffset * params->numTasksOnNode0)) {
rankOffset = rand() % params->numTasks;
}
/* Get more detailed stats if requested by verbose level */
if (verbose >= VERBOSE_2) {
file_hits_histogram(params);
}
}
                /* Using the globally passed rankOffset, the following call generates the testFileName to read */
GetTestFileName(testFileName, params);
if (verbose >= VERBOSE_3) {
fprintf(out_logfile, "task %d reading %s\n", rank,
testFileName);
}
DelaySecs(params->interTestDelay);
MPI_CHECK(MPI_Barrier(testComm), "barrier error");
params->open = READ;
timer[0] = GetTimeStamp();
fd = backend->open(testFileName, IOR_RDONLY, params->backend_options);
if(fd == NULL) FAIL("Cannot open file");
timer[1] = GetTimeStamp();
if (params->intraTestBarriers)
MPI_CHECK(MPI_Barrier(testComm),
"barrier error");
if (rank == 0 && verbose >= VERBOSE_1) {
fprintf(out_logfile,
"Commencing read performance test: %s\n",
CurrentTimeString());
}
timer[2] = GetTimeStamp();
dataMoved = WriteOrRead(params, &results[rep], fd, operation_flag, &ioBuffers);
timer[3] = GetTimeStamp();
if (params->intraTestBarriers)
MPI_CHECK(MPI_Barrier(testComm),
"barrier error");
timer[4] = GetTimeStamp();
backend->close(fd, params->backend_options);
timer[5] = GetTimeStamp();
                        /* if stat() of the file does not match the expected file size,
                           use the actual number of bytes moved */
CheckFileSize(test, testFileName, dataMoved, rep, READ);
ProcessIterResults(test, timer, rep, READ);
}
if (!params->keepFile
&& !(params->errorFound && params->keepFileWithError)) {
double start, finish;
start = GetTimeStamp();
MPI_CHECK(MPI_Barrier(testComm), "barrier error");
RemoveFile(testFileName, params->filePerProc, params);
MPI_CHECK(MPI_Barrier(testComm), "barrier error");
finish = GetTimeStamp();
PrintRemoveTiming(start, finish, rep);
} else {
MPI_CHECK(MPI_Barrier(testComm), "barrier error");
}
params->errorFound = FALSE;
rankOffset = 0;
}
PrintRepeatEnd();
if (params->summary_every_test) {
PrintLongSummaryHeader();
PrintLongSummaryOneTest(test);
} else {
PrintShortSummary(test);
}
XferBuffersFree(&ioBuffers, params);
if (hog_buf != NULL)
free(hog_buf);
}
/*
 * Validate the test parameters.
*/
static void ValidateTests(IOR_param_t * test, MPI_Comm com)
{
IOR_param_t defaults;
init_IOR_Param_t(&defaults, com);
if (test->stoneWallingStatusFile && test->keepFile == 0)
ERR("a StoneWallingStatusFile is only sensible when splitting write/read into multiple executions of ior, please use -k");
if (test->stoneWallingStatusFile && test->stoneWallingWearOut == 0 && test->writeFile)
ERR("the StoneWallingStatusFile is only sensible for a write test when using stoneWallingWearOut");
if (test->deadlineForStonewalling == 0 && test->stoneWallingWearOut > 0)
ERR("the stoneWallingWearOut is only sensible when setting a stonewall deadline with -D");
if (test->stoneWallingStatusFile && test->testscripts)
WARN("the StoneWallingStatusFile only preserves the last experiment, make sure that each run uses a separate status file!");
if (test->repetitions <= 0)
WARN_RESET("too few test repetitions",
test, &defaults, repetitions);
if (test->numTasks <= 0)
ERR("too few tasks for testing");
if (test->interTestDelay < 0)
WARN_RESET("inter-test delay must be nonnegative value",
test, &defaults, interTestDelay);
if (test->readFile != TRUE && test->writeFile != TRUE
&& test->checkRead != TRUE && test->checkWrite != TRUE)
ERR("test must write, read, or check read/write file");
if(! test->setTimeStampSignature && test->writeFile != TRUE && test->checkRead == TRUE)
                ERR("using readCheck without a write requires setting a timeStampSignature -- use -G");
if (test->segmentCount < 0)
ERR("segment count must be positive value");
if ((test->blockSize % sizeof(IOR_size_t)) != 0)
ERR("block size must be a multiple of access size");
if (test->blockSize < 0)
ERR("block size must be non-negative integer");
if ((test->transferSize % sizeof(IOR_size_t)) != 0)
ERR("transfer size must be a multiple of access size");
if (test->transferSize < 0)
ERR("transfer size must be non-negative integer");
if (test->transferSize == 0) {
ERR("test will not complete with zero transfer size");
} else {
if ((test->blockSize % test->transferSize) != 0)
ERR("block size must be a multiple of transfer size");
}
if (test->blockSize < test->transferSize)
ERR("block size must not be smaller than transfer size");
if (test->randomOffset && test->blockSize == test->transferSize)
                ERR("IOR will randomize access within a block and repeat the same pattern for all segments, therefore choose blockSize > transferSize");
if (! test->randomOffset && test->randomPrefillBlocksize)
ERR("Setting the randomPrefill option without using random is not useful");
if (test->randomPrefillBlocksize && (test->blockSize % test->randomPrefillBlocksize != 0))
ERR("The randomPrefill option must divide the blockSize");
/* specific APIs */
if ((strcasecmp(test->api, "MPIIO") == 0)
&& (test->blockSize < sizeof(IOR_size_t)
|| test->transferSize < sizeof(IOR_size_t)))
ERR("block/transfer size may not be smaller than IOR_size_t for MPIIO");
if ((strcasecmp(test->api, "HDF5") == 0)
&& (test->blockSize < sizeof(IOR_size_t)
|| test->transferSize < sizeof(IOR_size_t)))
ERR("block/transfer size may not be smaller than IOR_size_t for HDF5");
if ((strcasecmp(test->api, "NCMPI") == 0)
&& (test->blockSize < sizeof(IOR_size_t)
|| test->transferSize < sizeof(IOR_size_t)))
ERR("block/transfer size may not be smaller than IOR_size_t for NCMPI");
if (((strcasecmp(test->api, "POSIX") != 0)
&& (strcasecmp(test->api, "MPIIO") != 0)
&& (strcasecmp(test->api, "MMAP") != 0)
&& (strcasecmp(test->api, "HDFS") != 0)
&& (strcasecmp(test->api, "DFS") != 0)
&& (strcasecmp(test->api, "Gfarm") != 0)
&& (strcasecmp(test->api, "RADOS") != 0)
&& (strcasecmp(test->api, "CEPHFS") != 0)) && test->fsync)
WARN_RESET("fsync() not supported in selected backend",
test, &defaults, fsync);
/* parameter consistency */
if (test->reorderTasks == TRUE && test->reorderTasksRandom == TRUE)
ERR("Both Constant and Random task re-ordering specified. Choose one and resubmit");
if (test->randomOffset && test->reorderTasksRandom
&& test->filePerProc == FALSE)
ERR("random offset and random reorder tasks specified with single-shared-file. Choose one and resubmit");
if (test->randomOffset && test->reorderTasks
&& test->filePerProc == FALSE)
ERR("random offset and constant reorder tasks specified with single-shared-file. Choose one and resubmit");
if (test->randomOffset && test->checkRead && test->randomSeed == -1)
ERR("random offset with read check option requires to set the random seed");
if (test->randomOffset && test->dataPacketType == DATA_OFFSET)
                ERR("random offset not available with store file offset option");
if ((strcasecmp(test->api, "HDF5") == 0) && test->randomOffset)
ERR("random offset not available with HDF5");
if ((strcasecmp(test->api, "NCMPI") == 0) && test->randomOffset)
ERR("random offset not available with NCMPI");
if ((strcasecmp(test->api, "NCMPI") == 0) && test->filePerProc)
ERR("file-per-proc not available in current NCMPI");
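        /* record the backend selected for this test in the file-scope pointer and pass it the transfer hints */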
backend = test->backend;
ior_set_xfer_hints(test);
/* allow the backend to validate the options */
if(test->backend->check_params){
int check = test->backend->check_params(test->backend_options);
if (check){
ERR("The backend returned that the test parameters are invalid.");
}
}
}
/**
* Returns a precomputed array of IOR_offset_t for the inner benchmark loop.
 * The offsets are created sequentially and shuffled at the end.
 * Note that, because the seeds are synchronized across all processes when filePerProc is not set,
 * every process computes the same random order.
 * For a shared file, all transfers are randomly assigned to ranks, so processes
 * may end up with different numbers of transfers. This can lead to a larger
 * spread in accesses than with filePerProc. This is expected but
 * should be kept in mind.
 * @param test IOR_param_t for getting transferSize, blockSize and segmentCount
 * @param pretendRank int pretended rank for shifting the offsets correctly
 * @param out_count output parameter receiving the number of offsets in the returned array
 * @return pointer to the offset array
*/
IOR_offset_t *GetOffsetArrayRandom(IOR_param_t * test, int pretendRank, IOR_offset_t * out_count)
{
int seed;
IOR_offset_t i;
IOR_offset_t offsets;
IOR_offset_t offsetCnt = 0;
IOR_offset_t *offsetArray;
if (test->filePerProc) {
/* set up seed, each process can determine which regions to access individually */
if (test->randomSeed == -1) {
seed = time(NULL);
test->randomSeed = seed;
} else {
seed = test->randomSeed + pretendRank;
}
}else{
/* Shared file requires that the seed is synchronized */
if (test->randomSeed == -1) {
// all processes need to have the same seed.
if(rank == 0){
seed = time(NULL);
}
MPI_CHECK(MPI_Bcast(& seed, 1, MPI_INT, 0, test->testComm), "cannot broadcast random seed value");
test->randomSeed = seed;
}else{
seed = test->randomSeed;
}
}
srandom(seed);
/* count needed offsets (pass 1) */
if (test->filePerProc) {
offsets = test->blockSize / test->transferSize;
}else{
offsets = 0;
for (i = 0; i < test->blockSize * test->numTasks; i += test->transferSize) {
                        // count how many transfers each process gets in the shared file
if ((rand() % test->numTasks) == pretendRank) {
offsets++;
}
}
}
/* setup empty array */
offsetArray = (IOR_offset_t *) safeMalloc(offsets * sizeof(IOR_offset_t));
*out_count = offsets;
if (test->filePerProc) {
/* fill array */
for (i = 0; i < offsets; i++) {
offsetArray[i] = i * test->transferSize;
}
} else {
/* fill with offsets (pass 2) */
srandom(seed); /* need same seed to get same transfers as counted in the beginning*/
for (i = 0; i < test->blockSize * test->numTasks; i += test->transferSize) {
if ((rand() % test->numTasks) == pretendRank) {
offsetArray[offsetCnt] = i;
offsetCnt++;
}
}
}
/* reorder array */
for (i = 0; i < offsets; i++) {
IOR_offset_t value, tmp;
value = rand() % offsets;
tmp = offsetArray[value];
offsetArray[value] = offsetArray[i];
offsetArray[i] = tmp;
}
return (offsetArray);
}
static IOR_offset_t WriteOrReadSingle(IOR_offset_t offset, int pretendRank, IOR_offset_t transfer, IOR_offset_t * transferCount, int * errors, IOR_param_t * test, aiori_fd_t * fd, IOR_io_buffers* ioBuffers, int access){
IOR_offset_t amtXferred = 0;
void *buffer = ioBuffers->buffer;
if (access == WRITE) {
/* fills each transfer with a unique pattern
* containing the offset into the file */
update_write_memory_pattern(offset, ioBuffers->buffer, transfer, test->setTimeStampSignature, pretendRank, test->dataPacketType);
amtXferred = backend->xfer(access, fd, buffer, transfer, offset, test->backend_options);
if (amtXferred != transfer)
ERR("cannot write to file");
if (test->fsyncPerWrite)
backend->fsync(fd, test->backend_options);
if (test->interIODelay > 0){
struct timespec wait = {test->interIODelay / 1000 / 1000, 1000l * (test->interIODelay % 1000000)};
nanosleep( & wait, NULL);
}
} else if (access == READ) {
amtXferred = backend->xfer(access, fd, buffer, transfer, offset, test->backend_options);
if (amtXferred != transfer)
ERR("cannot read from file");
if (test->interIODelay > 0){
struct timespec wait = {test->interIODelay / 1000 / 1000, 1000l * (test->interIODelay % 1000000)};
nanosleep( & wait, NULL);
}
} else if (access == WRITECHECK) {
((long long int*) buffer)[0] = ~((long long int*) buffer)[0]; // changes the buffer, no memset to reduce the memory pressure
amtXferred = backend->xfer(access, fd, buffer, transfer, offset, test->backend_options);
if (amtXferred != transfer)
ERR("cannot read from file write check");
*errors += CompareData(buffer, transfer, *transferCount, test, offset, pretendRank, WRITECHECK);
} else if (access == READCHECK) {
((long long int*) buffer)[0] = ~((long long int*) buffer)[0]; // changes the buffer, no memset to reduce the memory pressure
amtXferred = backend->xfer(access, fd, buffer, transfer, offset, test->backend_options);
if (amtXferred != transfer){
ERR("cannot read from file");
}
*errors += CompareData(buffer, transfer, *transferCount, test, offset, pretendRank, READCHECK);
}
return amtXferred;
}
static void prefillSegment(IOR_param_t *test, void * randomPrefillBuffer, int pretendRank, aiori_fd_t *fd, IOR_io_buffers *ioBuffers, int startSegment, int endSegment){
        // prefill the given segment range with an invalid pattern
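        // number of prefill-sized transfers needed to cover one block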
int offsets = test->blockSize / test->randomPrefillBlocksize;
void * oldBuffer = ioBuffers->buffer;
IOR_offset_t transferCount;
int errors;
ioBuffers->buffer = randomPrefillBuffer;
for (IOR_offset_t i = startSegment; i < endSegment; i++){
for (int j = 0; j < offsets; j++) {
IOR_offset_t offset = j * test->randomPrefillBlocksize;
if (test->filePerProc) {
offset += i * test->blockSize;
} else {
offset += (i * test->numTasks * test->blockSize) + (pretendRank * test->blockSize);
}
WriteOrReadSingle(offset, pretendRank, test->randomPrefillBlocksize, & transferCount, & errors, test, fd, ioBuffers, WRITE);
}
}
ioBuffers->buffer = oldBuffer;
}
/*
 * Write or read data to the file(s). This loops over the segments, writing
 * out the data to each block in transfer-sized accesses until nothing remains.
*/
static IOR_offset_t WriteOrRead(IOR_param_t *test, IOR_results_t *results,
aiori_fd_t *fd, const int access, IOR_io_buffers *ioBuffers)
{
int errors = 0;
IOR_offset_t transferCount = 0;
uint64_t pairCnt = 0;
int pretendRank;
IOR_offset_t dataMoved = 0; /* for data rate calculation */
double startForStonewall;
int hitStonewall;
IOR_offset_t i, j;
IOR_point_t *point = ((access == WRITE) || (access == WRITECHECK)) ?
&results->write : &results->read;
/* initialize values */
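        /* when rankOffset is nonzero, pretendRank differs from rank, so this task accesses offsets that belong to another task's data */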
pretendRank = (rank + rankOffset) % test->numTasks;
// offsetArray = GetOffsetArraySequential(test, pretendRank);
IOR_offset_t offsets;
IOR_offset_t * offsets_rnd;
if (test->randomOffset) {
offsets_rnd = GetOffsetArrayRandom(test, pretendRank, & offsets);
}else{
offsets = (test->blockSize / test->transferSize);
}
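        /* optional prefill: the accessed file regions are first overwritten with an invalid pattern before the random-offset accesses */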
void * randomPrefillBuffer = NULL;
if(test->randomPrefillBlocksize && (access == WRITE || access == WRITECHECK)){
randomPrefillBuffer = aligned_buffer_alloc(test->randomPrefillBlocksize, test->gpuMemoryFlags);
// store invalid data into the buffer
memset(randomPrefillBuffer, -1, test->randomPrefillBlocksize);
}
// start timer after random offset was generated
startForStonewall = GetTimeStamp();
hitStonewall = 0;
if(randomPrefillBuffer && test->deadlineForStonewalling == 0){
double t_start = GetTimeStamp();
prefillSegment(test, randomPrefillBuffer, pretendRank, fd, ioBuffers, 0, test->segmentCount);
if(rank == 0 && verbose > VERBOSE_1){
fprintf(out_logfile, "Random prefill took: %fs\n", GetTimeStamp() - t_start);
}
// must synchronize processes to ensure they are not running ahead
MPI_Barrier(test->testComm);
}
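        /* main access loop: iterate over segments and, within each segment, over this task's transfer offsets */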
for (i = 0; i < test->segmentCount && !hitStonewall; i++) {
if(randomPrefillBuffer && test->deadlineForStonewalling != 0){
// prefill the whole segment with data, this needs to be done collectively
double t_start = GetTimeStamp();
prefillSegment(test, randomPrefillBuffer, pretendRank, fd, ioBuffers, i, i+1);
MPI_Barrier(test->testComm);
if(rank == 0 && verbose > VERBOSE_1){
fprintf(out_logfile, "Random: synchronizing segment count with barrier and prefill took: %fs\n", GetTimeStamp() - t_start);
}
}
for (j = 0; j < offsets && !hitStonewall ; j++) {
IOR_offset_t offset;
if (test->randomOffset) {
if(test->filePerProc){
offset = offsets_rnd[j] + (i * test->blockSize);
}else{
offset = offsets_rnd[j] + (i * test->numTasks * test->blockSize);
}
}else{
offset = j * test->transferSize;
if (test->filePerProc) {
offset += i * test->blockSize;
} else {
offset += (i * test->numTasks * test->blockSize) + (pretendRank * test->blockSize);
}
}
dataMoved += WriteOrReadSingle(offset, pretendRank, test->transferSize, & transferCount, & errors, test, fd, ioBuffers, access);
pairCnt++;
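                        /* the stonewall triggers when the time deadline (-D) expires or the wear-out pair count is reached */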
hitStonewall = ((test->deadlineForStonewalling != 0
&& (GetTimeStamp() - startForStonewall) > test->deadlineForStonewalling))
|| (test->stoneWallingWearOutIterations != 0 && pairCnt == test->stoneWallingWearOutIterations) ;
if ( test->collective && test->deadlineForStonewalling ) {
                                // in collective mode the run will hang if some ranks 'accidentally' leave this loop early,
                                // so stopping must be all-or-nothing:
MPI_CHECK(MPI_Bcast(&hitStonewall, 1, MPI_INT, 0, testComm), "hitStonewall broadcast failed");
}
}
}
if (test->stoneWallingWearOut){
if (verbose >= VERBOSE_1){
fprintf(out_logfile, "%d: stonewalling pairs accessed: %lld\n", rank, (long long) pairCnt);
}
long long data_moved_ll = (long long) dataMoved;
long long pairs_accessed_min = 0;
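                /* pairs_accessed is the maximum pair count over all ranks; ranks that did less work catch up to it below */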
MPI_CHECK(MPI_Allreduce(& pairCnt, &point->pairs_accessed,
1, MPI_LONG_LONG_INT, MPI_MAX, testComm), "cannot reduce pairs moved");
double stonewall_runtime = GetTimeStamp() - startForStonewall;
point->stonewall_time = stonewall_runtime;
MPI_CHECK(MPI_Reduce(& pairCnt, & pairs_accessed_min,
1, MPI_LONG_LONG_INT, MPI_MIN, 0, testComm), "cannot reduce pairs moved");
MPI_CHECK(MPI_Reduce(& data_moved_ll, &point->stonewall_min_data_accessed,
1, MPI_LONG_LONG_INT, MPI_MIN, 0, testComm), "cannot reduce pairs moved");
MPI_CHECK(MPI_Reduce(& data_moved_ll, &point->stonewall_total_data_accessed,
1, MPI_LONG_LONG_INT, MPI_SUM, 0, testComm), "cannot reduce pairs moved");
if(rank == 0){
point->stonewall_avg_data_accessed = point->stonewall_total_data_accessed / test->numTasks;
fprintf(out_logfile, "stonewalling pairs accessed min: %lld max: %zu -- min data: %.1f GiB mean data: %.1f GiB time: %.1fs\n",
pairs_accessed_min, point->pairs_accessed,
point->stonewall_min_data_accessed /1024.0 / 1024 / 1024, point->stonewall_avg_data_accessed / 1024.0 / 1024 / 1024 , point->stonewall_time);
}
if(pairCnt != point->pairs_accessed){
                        // this rank performed fewer pairs than the global maximum; keep going until it catches up, starting by completing the current block
i--;
if(j == offsets){
j = 0; // current block is completed
i++;
}
for ( ; pairCnt < point->pairs_accessed; i++) {
for ( ; j < offsets && pairCnt < point->pairs_accessed ; j++) {
IOR_offset_t offset;
if (test->randomOffset) {
if(test->filePerProc){
offset = offsets_rnd[j] + (i * test->blockSize);
}else{
offset = offsets_rnd[j] + (i * test->numTasks * test->blockSize);
}
}else{
offset = j * test->transferSize;
if (test->filePerProc) {
offset += i * test->blockSize;
} else {
offset += (i * test->numTasks * test->blockSize) + (pretendRank * test->blockSize);
}
}
dataMoved += WriteOrReadSingle(offset, pretendRank, test->transferSize, & transferCount, & errors, test, fd, ioBuffers, access);
pairCnt++;
}
j = 0;
}
}
}else{
point->pairs_accessed = pairCnt;
}
totalErrorCount += CountErrors(test, access, errors);
if (access == WRITE && test->fsync == TRUE) {
backend->fsync(fd, test->backend_options); /*fsync after all accesses */
}
if(randomPrefillBuffer){
aligned_buffer_free(randomPrefillBuffer, test->gpuMemoryFlags);
}
return (dataMoved);
}