/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
* vim:expandtab:shiftwidth=8:tabstop=8:
*/
/******************************************************************************\
* *
* Copyright (c) 2003, The Regents of the University of California *
* See the file COPYRIGHT for a complete copyright notice and license. *
* *
\******************************************************************************/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <ctype.h>              /* tolower() */
#include <errno.h>
#include <math.h>
#include <mpi.h>
#include <string.h>
#if defined(HAVE_STRINGS_H)
#include <strings.h>
#endif
#include <sys/stat.h>           /* struct stat */
#include <time.h>

#ifndef _WIN32
# include <sys/time.h>          /* gettimeofday() */
# include <sys/utsname.h>       /* uname() */
#endif

#include <assert.h>

#include "ior.h"
#include "ior-internal.h"
#include "aiori.h"
#include "utilities.h"
#include "parse_options.h"
#define IOR_NB_TIMERS 6
/* file scope globals */
extern char **environ;
static int totalErrorCount;
static const ior_aiori_t *backend;
static void DestroyTests(IOR_test_t *tests_head);
static char *PrependDir(IOR_param_t *, char *);
static char **ParseFileName(char *, int *);
static void InitTests(IOR_test_t * , MPI_Comm);
static void TestIoSys(IOR_test_t *);
static void ValidateTests(IOR_param_t *);
static IOR_offset_t WriteOrRead(IOR_param_t *test, IOR_results_t *results,
aiori_fd_t *fd, const int access,
IOR_io_buffers *ioBuffers);
/*
 * Copy the per-test transfer-related parameters into the aiori hint
 * structure and hand them to the selected backend, if it accepts hints.
 */
static void ior_set_xfer_hints(IOR_param_t * p){
        aiori_xfer_hint_t *hints = &p->hints;

        /* run-shape hints */
        hints->dryRun = p->dryRun;
        hints->filePerProc = p->filePerProc;
        hints->collective = p->collective;
        hints->numTasks = p->numTasks;
        hints->numNodes = p->numNodes;

        /* access-pattern hints */
        hints->randomOffset = p->randomOffset;
        hints->fsyncPerWrite = p->fsyncPerWrite;
        hints->singleXferAttempt = p->singleXferAttempt;

        /* size hints */
        hints->segmentCount = p->segmentCount;
        hints->blockSize = p->blockSize;
        hints->transferSize = p->transferSize;
        hints->expectedAggFileSize = p->expectedAggFileSize;

        if (backend->xfer_hints != NULL)
                backend->xfer_hints(hints);
}
/* When non-zero, aiori backends escalate warnings to fatal errors. */
int aiori_warning_as_errors = 0;

/*
 * Per-test setup: publish this test's verbosity and backend into the
 * file-scope globals, initialize the backend, hand it the transfer hints,
 * and announce the test start on rank 0.  Order matters: `backend` must be
 * assigned before ior_set_xfer_hints() dereferences it.
 */
static void test_initialize(IOR_test_t * test){
        verbose = test->params.verbose;
        backend = test->params.backend;
        if(backend->initialize){
                backend->initialize(test->params.backend_options);
        }
        ior_set_xfer_hints(& test->params);
        aiori_warning_as_errors = test->params.warningAsErrors;
        if (rank == 0 && verbose >= VERBOSE_0) {
                ShowTestStart(& test->params);
        }
}
/* Per-test teardown: let this test's backend release its resources. */
static void test_finalize(IOR_test_t * test){
        backend = test->params.backend;
        if(backend->finalize){
                backend->finalize(test->params.backend_options);
        }
}
/*
 * Library entry point: run the full set of IOR tests described by the
 * command line, sending all output to `world_out` and using `world_com`
 * as the world communicator.  Returns the head of the heap-allocated test
 * list (with per-test results filled in); the caller owns the list.
 */
IOR_test_t * ior_run(int argc, char **argv, MPI_Comm world_com, FILE * world_out){
        IOR_test_t *tests_head;
        IOR_test_t *tptr;

        out_logfile = world_out;
        out_resultfile = world_out;
        mpi_comm_world = world_com;

        MPI_CHECK(MPI_Comm_rank(mpi_comm_world, &rank), "cannot get rank");

        /* setup tests, and validate parameters */
        tests_head = ParseCommandLine(argc, argv);
        InitTests(tests_head, world_com);
        PrintHeader(argc, argv);

        /* perform each test */
        for (tptr = tests_head; tptr != NULL; tptr = tptr->next) {
                test_initialize(tptr);
                /* totalErrorCount is accumulated by CountErrors() during the run */
                totalErrorCount = 0;
                TestIoSys(tptr);
                tptr->results->errors = totalErrorCount;
                ShowTestEnd(tptr);
                test_finalize(tptr);
        }
        PrintLongSummaryAllTests(tests_head);

        /* display finish time */
        PrintTestEnds();
        return tests_head;
}
/*
 * Standalone entry point: parse the command line first (so `-h` works
 * without starting MPI), then initialize MPI, run every configured test,
 * print the summaries, finalize MPI, and return the accumulated error
 * count as the exit status.
 */
int ior_main(int argc, char **argv)
{
        IOR_test_t *tests_head;
        IOR_test_t *tptr;

        out_logfile = stdout;
        out_resultfile = stdout;

        /*
         * check -h option from commandline without starting MPI;
         */
        tests_head = ParseCommandLine(argc, argv);

        /* start the MPI code */
        MPI_CHECK(MPI_Init(&argc, &argv), "cannot initialize MPI");
        mpi_comm_world = MPI_COMM_WORLD;
        MPI_CHECK(MPI_Comm_rank(mpi_comm_world, &rank), "cannot get rank");

        /* set error-handling */
        /*MPI_CHECK(MPI_Errhandler_set(mpi_comm_world, MPI_ERRORS_RETURN),
           "cannot set errhandler"); */

        /* setup tests, and validate parameters */
        InitTests(tests_head, mpi_comm_world);
        PrintHeader(argc, argv);

        /* perform each test */
        for (tptr = tests_head; tptr != NULL; tptr = tptr->next) {
                test_initialize(tptr);

                // This is useful for trapping a running MPI process.  While
                // this is sleeping, run the script 'testing/hdfs/gdb.attach'
                if (verbose >= VERBOSE_4) {
                        fprintf(out_logfile, "\trank %d: sleeping\n", rank);
                        sleep(5);
                        fprintf(out_logfile, "\trank %d: awake.\n", rank);
                }

                TestIoSys(tptr);
                ShowTestEnd(tptr);
                test_finalize(tptr);
        }
        if (verbose <= VERBOSE_0)
                /* always print final summary */
                verbose = VERBOSE_1;
        PrintLongSummaryAllTests(tests_head);

        /* display finish time */
        PrintTestEnds();

        MPI_CHECK(MPI_Finalize(), "cannot finalize MPI");
        DestroyTests(tests_head);

        return totalErrorCount;
}
/***************************** F U N C T I O N S ******************************/
/*
 * Initialize an IOR_param_t structure to the defaults.  The structure is
 * zeroed first, then the non-zero defaults are filled in; string members
 * are heap-allocated (strdup) so they can be replaced/freed uniformly.
 */
void init_IOR_Param_t(IOR_param_t * p)
{
        const char *default_aiori = aiori_default();
        char *user_env;

        assert(NULL != default_aiori);
        memset(p, 0, sizeof(IOR_param_t));

        p->api = strdup(default_aiori);
        p->platform = strdup("HOST(OSTYPE)");
        p->testFileName = strdup("testFile");

        p->writeFile = p->readFile = FALSE;
        p->checkWrite = p->checkRead = FALSE;

        /*
         * These can be overridden from the command-line but otherwise will
         * be set from MPI.
         */
        p->numTasks = -1;
        p->numNodes = -1;
        p->numTasksOnNode0 = -1;

        p->repetitions = 1;
        p->repCounter = -1;
        p->open = WRITE;
        p->taskPerNodeOffset = 1;
        p->segmentCount = 1;
        p->blockSize = 1048576;
        p->transferSize = 262144;
        p->randomSeed = -1;
        p->incompressibleSeed = 573;
        p->testComm = mpi_comm_world;

        /* HDFS backend defaults */
        user_env = getenv("USER");
        if (!user_env)
                user_env = "";
        p->hdfs_user = strdup(user_env);
        p->hdfs_name_node = "default";
        p->hdfs_name_node_port = 0; /* ??? */
        p->hdfs_fs = NULL;
        p->hdfs_replicas = 0; /* invokes the default */
        p->hdfs_block_size = 0;

        p->URI = NULL;
        p->part_number = 0;
}
/*
 * Collective check of one timer value against the all-task mean: computes
 * the mean and standard deviation over `testComm`, then warns (with host
 * name and rank) when this task's value deviates from the mean by more
 * than `outlierThreshold` seconds.  Must be called by every task.
 */
static void
DisplayOutliers(int numTasks,
                double timerVal,
                char *timeString, int access, int outlierThreshold)
{
        char accessString[MAX_STR];
        double sum, mean, sqrDiff, var, sd;

        /* for local timerVal, don't compensate for wall clock delta */
        timerVal += wall_clock_delta;

        /* mean across all tasks */
        MPI_CHECK(MPI_Allreduce
                  (&timerVal, &sum, 1, MPI_DOUBLE, MPI_SUM, testComm),
                  "MPI_Allreduce()");
        mean = sum / numTasks;

        /* population variance and standard deviation */
        sqrDiff = pow((mean - timerVal), 2);
        MPI_CHECK(MPI_Allreduce
                  (&sqrDiff, &var, 1, MPI_DOUBLE, MPI_SUM, testComm),
                  "MPI_Allreduce()");
        var = var / numTasks;
        sd = sqrt(var);

        if (access == WRITE) {
                strcpy(accessString, "write");
        } else {                /* READ */
                strcpy(accessString, "read");
        }
        if (fabs(timerVal - mean) > (double)outlierThreshold) {
                char hostname[MAX_STR];
                int ret = gethostname(hostname, MAX_STR);
                if (ret != 0)
                        strcpy(hostname, "unknown");
                EWARNF("for %s, task %d, %s %s is %f (mean=%f, stddev=%f)\n",
                       hostname, rank, accessString, timeString, timerVal, mean, sd);
        }
}
/*
 * Check for outliers in start/end times and elapsed create/xfer/close
 * times.  Timer layout: even indices are phase starts, odd indices phase
 * ends (0/1 create, 2/3 transfer, 4/5 close).
 */
static void
CheckForOutliers(IOR_param_t *test, const double *timer, const int access)
{
        const int tasks = test->numTasks;
        const int threshold = test->outlierThreshold;

        DisplayOutliers(tasks, timer[0], "start time", access, threshold);
        DisplayOutliers(tasks, timer[1] - timer[0],
                        "elapsed create time", access, threshold);
        DisplayOutliers(tasks, timer[3] - timer[2],
                        "elapsed transfer time", access, threshold);
        DisplayOutliers(tasks, timer[5] - timer[4],
                        "elapsed close time", access, threshold);
        DisplayOutliers(tasks, timer[5], "end time", access, threshold);
}
/*
 * Check if actual file size equals expected size; if not use actual for
 * calculating performance rate.  Collective: sums the bytes each task
 * moved into point->aggFileSizeFromXfer, then (rank 0) warns on mismatch
 * with the expected or stat()ed size — except for APIs whose on-media
 * size is not directly comparable (HDF5, NCMPI, DAOS).
 */
static void CheckFileSize(IOR_test_t *test, IOR_offset_t dataMoved, int rep,
                          const int access)
{
        IOR_param_t *params = &test->params;
        IOR_results_t *results = test->results;
        IOR_point_t *point = (access == WRITE) ? &results[rep].write :
                             &results[rep].read;

        MPI_CHECK(MPI_Allreduce(&dataMoved, &point->aggFileSizeFromXfer,
                                1, MPI_LONG_LONG_INT, MPI_SUM, testComm),
                  "cannot total data moved");

        if (strcasecmp(params->api, "HDF5") != 0 && strcasecmp(params->api, "NCMPI") != 0 &&
            strcasecmp(params->api, "DAOS") != 0) {
                if (verbose >= VERBOSE_0 && rank == 0) {
                        if ((params->expectedAggFileSize
                             != point->aggFileSizeFromXfer)
                            || (point->aggFileSizeFromStat
                                != point->aggFileSizeFromXfer)) {
                                EWARNF("Expected aggregate file size = %lld", (long long) params->expectedAggFileSize);
                                EWARNF("Stat() of aggregate file size = %lld", (long long) point->aggFileSizeFromStat);
                                EWARNF("Using actual aggregate bytes moved = %lld", (long long) point->aggFileSizeFromXfer);
                                if(params->deadlineForStonewalling){
                                        /* stonewalling stops I/O early, so a short file is expected */
                                        EWARN("Maybe caused by deadlineForStonewalling");
                                }
                        }
                }
        }
        /* bandwidth is always computed from the bytes actually moved */
        point->aggFileSizeForBW = point->aggFileSizeFromXfer;
}
/*
 * Compare buffers after reading/writing each transfer.  Displays only first
 * difference in buffers and returns total errors counted (in units of
 * 8-byte words).  Used by the WRITECHECK and READCHECK passes.
 *
 * NOTE(review): `offset` is referenced below but is not a parameter or a
 * local in this chunk — presumably a file-scope variable holding the
 * current file offset; confirm against the full source file.
 */
static size_t
CompareBuffers(void *expectedBuffer,
               void *unknownBuffer,
               size_t size,
               IOR_offset_t transferCount, IOR_param_t *test, int access)
{
        char testFileName[MAX_PATHLEN];
        char bufferLabel1[MAX_STR];
        char bufferLabel2[MAX_STR];
        size_t i, j, length, first, last;
        size_t errorCount = 0;
        int inError = 0;
        unsigned long long *goodbuf = (unsigned long long *)expectedBuffer;
        unsigned long long *testbuf = (unsigned long long *)unknownBuffer;

        /* only the two check passes are legal here */
        if (access == WRITECHECK || access == READCHECK) {
                strcpy(bufferLabel1, "Expected: ");
                strcpy(bufferLabel2, "Actual: ");
        } else {
                ERR("incorrect argument for CompareBuffers()");
        }

        length = size / sizeof(IOR_size_t);  /* compare whole 8-byte words */
        first = -1;                          /* sentinel: no mismatch seen yet */
        if (verbose >= VERBOSE_3) {
                fprintf(out_logfile,
                        "[%d] At file byte offset %lld, comparing %llu-byte transfer\n",
                        rank, (long long) offset, (long long)size);
        }
        for (i = 0; i < length; i++) {
                if (testbuf[i] != goodbuf[i]) {
                        errorCount++;
                        if (verbose >= VERBOSE_2) {
                                fprintf(out_logfile,
                                        "[%d] At transfer buffer #%lld, index #%lld (file byte offset %lld):\n",
                                        rank, transferCount - 1, (long long)i,
                                        (long long) offset +
                                        (IOR_size_t) (i * sizeof(IOR_size_t)));
                                fprintf(out_logfile, "[%d] %s0x", rank, bufferLabel1);
                                fprintf(out_logfile, "%016llx\n", goodbuf[i]);
                                fprintf(out_logfile, "[%d] %s0x", rank, bufferLabel2);
                                fprintf(out_logfile, "%016llx\n", testbuf[i]);
                        }
                        if (!inError) {
                                /* start of a (possibly multi-word) error run */
                                inError = 1;
                                first = i;
                                last = i;
                        } else {
                                last = i;
                        }
                } else if (verbose >= VERBOSE_5 && i % 4 == 0) {
                        /* at highest verbosity, periodically show matching words */
                        fprintf(out_logfile,
                                "[%d] PASSED offset = %lu bytes, transfer %lld\n",
                                rank,
                                ((i * sizeof(unsigned long long)) +
                                 offset), transferCount);
                        fprintf(out_logfile, "[%d] GOOD %s0x", rank, bufferLabel1);
                        for (j = 0; j < 4; j++)
                                fprintf(out_logfile, "%016llx ", goodbuf[i + j]);
                        fprintf(out_logfile, "\n[%d] GOOD %s0x", rank, bufferLabel2);
                        for (j = 0; j < 4; j++)
                                fprintf(out_logfile, "%016llx ", testbuf[i + j]);
                        fprintf(out_logfile, "\n");
                }
        }
        if (inError) {
                /* summarize the first error run: file, transfer, index range */
                inError = 0;
                GetTestFileName(testFileName, test);
                EWARNF("[%d] FAILED comparison of buffer containing %d-byte ints:\n",
                       rank, (int)sizeof(unsigned long long int));
                fprintf(out_logfile, "[%d] File name = %s\n", rank, testFileName);
                fprintf(out_logfile, "[%d] In transfer %lld, ", rank,
                        transferCount);
                fprintf(out_logfile,
                        "%lld errors between buffer indices %lld and %lld.\n",
                        (long long)errorCount, (long long)first,
                        (long long)last);
                fprintf(out_logfile, "[%d] File byte offset = %lu:\n", rank,
                        ((first * sizeof(unsigned long long)) + offset));
                fprintf(out_logfile, "[%d] %s0x", rank, bufferLabel1);
                for (j = first; j < length && j < first + 4; j++)
                        fprintf(out_logfile, "%016llx ", goodbuf[j]);
                if (j == length)
                        fprintf(out_logfile, "[end of buffer]");
                fprintf(out_logfile, "\n[%d] %s0x", rank, bufferLabel2);
                for (j = first; j < length && j < first + 4; j++)
                        fprintf(out_logfile, "%016llx ", testbuf[j]);
                if (j == length)
                        fprintf(out_logfile, "[end of buffer]");
                fprintf(out_logfile, "\n");
        }
        return (errorCount);
}
/*
 * Count all errors across all tasks; report errors found.  Collective:
 * sums `errors` over testComm, broadcasts the total to every task, bumps
 * the global totalErrorCount and sets test->errorFound when non-zero.
 * Returns the all-task total (0 when checking is disabled).
 */
static int CountErrors(IOR_param_t * test, int access, int errors)
{
        int allErrors = 0;

        if (test->checkWrite || test->checkRead) {
                MPI_CHECK(MPI_Reduce(&errors, &allErrors, 1, MPI_INT, MPI_SUM,
                                     0, testComm), "cannot reduce errors");
                MPI_CHECK(MPI_Bcast(&allErrors, 1, MPI_INT, 0, testComm),
                          "cannot broadcast allErrors value");
                if (allErrors != 0) {
                        totalErrorCount += allErrors;
                        test->errorFound = TRUE;
                }
                if (rank == 0 && allErrors != 0) {
                        if (allErrors < 0) {
                                /* the int sum wrapped around */
                                WARN("overflow in errors counted");
                                allErrors = -1;
                        }
                        EWARNF("Incorrect data on %s (%d errors found).\n",
                               access == WRITECHECK ? "write" : "read", allErrors);
                        fprintf(out_logfile,
                                "Used Time Stamp %u (0x%x) for Data Signature\n",
                                test->timeStampSignatureValue,
                                test->timeStampSignatureValue);
                }
        }
        return (allErrors);
}
/*
 * Allocate a page-aligned buffer (required by O_DIRECT).  The raw
 * malloc() pointer is stashed in the word immediately preceding the
 * returned aligned address, so aligned_buffer_free() can recover and
 * free it.  Aborts via ERR() on allocation failure.
 */
static void *aligned_buffer_alloc(size_t size)
{
#ifdef HAVE_SYSCONF
        long pageSize = sysconf(_SC_PAGESIZE);
#else
        size_t pageSize = getpagesize();
#endif
        size_t pageMask = pageSize - 1;
        char *raw, *probe, *aligned;

        /* room for the data, worst-case alignment slack, and the stash word */
        raw = malloc(size + pageSize + sizeof(void *));
        if (raw == NULL)
                ERR("out of memory");

        /* first candidate past the stash word, rounded up to a page boundary */
        probe = raw + sizeof(void *);
        aligned = probe + pageSize - ((size_t) probe & pageMask);

        /* stash the raw pointer just below the aligned address for free() */
        *(void **)(aligned - sizeof(void *)) = raw;
        return (void *)aligned;
}
/*
 * Free a buffer allocated by aligned_buffer_alloc().  The original
 * malloc() pointer lives in the word immediately preceding the aligned
 * address; recover it and free that.
 */
static void aligned_buffer_free(void *buf)
{
        void **stash = (void **)((char *)buf - sizeof(void *));
        free(*stash);
}
/*
 * Ensure test->results holds one IOR_results_t per repetition.
 * No-op when the array has already been allocated.
 */
void AllocResults(IOR_test_t *test)
{
        if (test->results != NULL)
                return;  /* already allocated */

        int reps = test->params.repetitions;
        test->results = (IOR_results_t *) safeMalloc(sizeof(IOR_results_t) * reps);
}
/*
 * Release the per-repetition results array of a test.
 * Safe to call when no results were ever allocated.
 */
void FreeResults(IOR_test_t *test)
{
        /* free(NULL) is a no-op, so no guard is needed */
        free(test->results);
        /* reset so stale pointers are never double-freed and a later
           AllocResults() can safely allocate again */
        test->results = NULL;
}
/*
 * Create a new node for the list of tests, copying the supplied default
 * parameters.  The caller owns the returned node (freed via DestroyTests).
 */
IOR_test_t *CreateTest(IOR_param_t *init_params, int test_num)
{
        IOR_test_t *node = (IOR_test_t *) malloc(sizeof(IOR_test_t));

        if (node == NULL)
                ERR("malloc() of IOR_test_t failed");
        node->params = *init_params;    /* struct copy of the defaults */
        node->params.platform = GetPlatformName();
        node->params.id = test_num;
        node->next = NULL;
        node->results = NULL;
        return node;
}
/* Free one test node: its results array first, then the node itself. */
static void DestroyTest(IOR_test_t *test)
{
        FreeResults(test);
        free(test);
}
/* Walk the test list from the head, destroying every node. */
static void DestroyTests(IOR_test_t *tests_head)
{
        IOR_test_t *node = tests_head;

        while (node != NULL) {
                IOR_test_t *successor = node->next;  /* save before freeing */
                DestroyTest(node);
                node = successor;
        }
}
/*
* Distribute IOR_HINTs to all tasks' environments.
*/
void DistributeHints(void)
2011-06-17 23:20:43 +04:00
{
char hint[MAX_HINTS][MAX_STR], fullHint[MAX_STR], hintVariable[MAX_STR];
int hintCount = 0, i;
if (rank == 0) {
for (i = 0; environ[i] != NULL; i++) {
if (strncmp(environ[i], "IOR_HINT", strlen("IOR_HINT"))
== 0) {
hintCount++;
if (hintCount == MAX_HINTS) {
WARN("exceeded max hints; reset MAX_HINTS and recompile");
hintCount = MAX_HINTS;
break;
}
/* assume no IOR_HINT is greater than MAX_STR in length */
strncpy(hint[hintCount - 1], environ[i],
MAX_STR - 1);
}
2011-06-17 23:20:43 +04:00
}
}
MPI_CHECK(MPI_Bcast(&hintCount, sizeof(hintCount), MPI_BYTE,
2011-06-17 23:20:43 +04:00
0, MPI_COMM_WORLD), "cannot broadcast hints");
for (i = 0; i < hintCount; i++) {
MPI_CHECK(MPI_Bcast(&hint[i], MAX_STR, MPI_BYTE,
0, MPI_COMM_WORLD),
"cannot broadcast hints");
strcpy(fullHint, hint[i]);
strcpy(hintVariable, strtok(fullHint, "="));
if (getenv(hintVariable) == NULL) {
/* doesn't exist in this task's environment; better set it */
if (putenv(hint[i]) != 0)
WARN("cannot set environment variable");
}
2011-06-17 23:20:43 +04:00
}
2011-11-12 03:11:28 +04:00
}
/*
 * Fill `buffer` (test->transferSize bytes) with pseudo-random,
 * incompressible 64-bit words drawn from rand_r() seeded by
 * test->incompressibleSeed.  The seed advances as a side effect, so
 * repeating the sequence requires re-seeding (see FillBuffer).
 */
static void
FillIncompressibleBuffer(void* buffer, IOR_param_t * test)
{
        unsigned long long *words = (unsigned long long *)buffer;
        const size_t nwords = test->transferSize / sizeof(unsigned long long);
        size_t k;

        for (k = 0; k < nwords; k++) {
                /* rand_r() yields at most 31 random bits; combine two draws
                 * to cover all 64 bits of the word */
                unsigned long long hi =
                        ((unsigned long long) rand_r(&test->incompressibleSeed)) << 32;
                unsigned long long lo =
                        (unsigned long long) rand_r(&test->incompressibleSeed);
                words[k] = hi | lo;
        }
}
/* When TRUE, the next FillBuffer() call restarts the incompressible PRNG
 * sequence (needed so check passes regenerate identical data). */
unsigned int reseed_incompressible_prng = TRUE;

/*
 * Fill a transfer-sized buffer with verifiable data.  For the
 * incompressible packet type, pseudo-random words are generated (after
 * optionally re-seeding from the configured timestamp signature plus
 * rank).  Otherwise even-numbered 8-byte words hold the fill rank in the
 * high 32 bits and the timestamp signature in the low 32 bits, and
 * odd-numbered words hold the byte offset of that word.
 */
static void
FillBuffer(void *buffer,
           IOR_param_t * test, unsigned long long offset, int fillrank)
{
        unsigned long long *words = (unsigned long long *)buffer;
        const size_t nwords = test->transferSize / sizeof(unsigned long long);
        size_t k;

        if (test->dataPacketType == incompressible) {
                /* In order for write checks to work, we have to restart the
                 * pseudo random sequence */
                if (reseed_incompressible_prng == TRUE) {
                        /* seed was copied into setTimeStampSignature at init;
                         * add the rank for per-process variation */
                        test->incompressibleSeed = test->setTimeStampSignature + rank;
                        reseed_incompressible_prng = FALSE;
                }
                FillIncompressibleBuffer(buffer, test);
                return;
        }

        const unsigned long long evenWord =
                (((unsigned long long)fillrank) << 32)
                | (unsigned long long)test->timeStampSignatureValue;
        for (k = 0; k < nwords; k++) {
                if ((k % 2) == 0) {
                        /* evens contain MPI rank and time in seconds */
                        words[k] = evenWord;
                } else {
                        /* odds contain offset */
                        words[k] = offset + (k * sizeof(unsigned long long));
                }
        }
}
/*
* Return string describing machine name and type.
*/
char * GetPlatformName()
2011-06-17 23:20:43 +04:00
{
char nodeName[MAX_STR], *p, *start, sysName[MAX_STR];
char platformName[MAX_STR];
struct utsname name;
if (uname(&name) != 0) {
2011-12-15 01:40:25 +04:00
EWARN("cannot get platform name");
sprintf(sysName, "%s", "Unknown");
sprintf(nodeName, "%s", "Unknown");
2011-06-17 23:20:43 +04:00
} else {
sprintf(sysName, "%s", name.sysname);
sprintf(nodeName, "%s", name.nodename);
2011-06-17 23:20:43 +04:00
}
start = nodeName;
if (strlen(nodeName) == 0) {
p = start;
} else {
/* point to one character back from '\0' */
p = start + strlen(nodeName) - 1;
}
/*
* to cut off trailing node number, search backwards
* for the first non-numeric character
*/
while (p != start) {
if (*p < '0' || *p > '9') {
*(p + 1) = '\0';
break;
} else {
p--;
}
}
2011-06-17 23:20:43 +04:00
sprintf(platformName, "%s(%s)", nodeName, sysName);
return strdup(platformName);
2011-11-12 03:11:28 +04:00
}
/*
* Parse file name.
*/
static char **ParseFileName(char *name, int *count)
2011-06-17 23:20:43 +04:00
{
char **fileNames, *tmp, *token;
char delimiterString[3] = { FILENAME_DELIMITER, '\n', '\0' };
int i = 0;
*count = 0;
tmp = name;
/* pass one */
/* if something there, count the first item */
if (*tmp != '\0') {
(*count)++;
}
/* count the rest of the filenames */
while (*tmp != '\0') {
if (*tmp == FILENAME_DELIMITER) {
(*count)++;
}
tmp++;
}
2011-06-17 23:20:43 +04:00
fileNames = (char **)malloc((*count) * sizeof(char **));
if (fileNames == NULL)
ERR("out of memory");
/* pass two */
token = strtok(name, delimiterString);
while (token != NULL) {
fileNames[i] = token;
token = strtok(NULL, delimiterString);
i++;
}
return (fileNames);
2011-11-12 03:11:28 +04:00
}
/*
 * Return test file name to access.
 * For single shared file, fileNames[0] is returned in testFileName; for
 * file-per-process runs the (possibly reordered) rank is appended, and
 * with uniqueDir the rank is also inserted as a subdirectory.  A ".<rep>"
 * suffix is added when repCounter is in use (multi-file mode).
 */
void GetTestFileName(char *testFileName, IOR_param_t * test)
{
        char **fileNames;
        char initialTestFileName[MAX_PATHLEN];
        char testFileNameRoot[MAX_STR];
        char tmpString[MAX_STR];
        int count;
        int socket, core;

        /* parse filename for multiple file systems */
        strcpy(initialTestFileName, test->testFileName);
        if(test->dualMount){
                /* dual mount: pick the mount matching this task's socket */
                GetProcessorAndCore(&socket, &core);
                sprintf(tmpString, "%s%d/%s",initialTestFileName,
                        socket, "data");
                strcpy(initialTestFileName, tmpString);
        }
        fileNames = ParseFileName(initialTestFileName, &count);
        if (count > 1 && test->uniqueDir == TRUE)
                ERR("cannot use multiple file names with unique directories");
        if (test->filePerProc) {
                /* spread tasks round-robin across the supplied file systems */
                strcpy(testFileNameRoot,
                       fileNames[((rank +
                                   rankOffset) % test->numTasks) % count]);
        } else {
                strcpy(testFileNameRoot, fileNames[0]);
        }

        /* give unique name if using multiple files */
        if (test->filePerProc) {
                /*
                 * prepend rank subdirectory before filename
                 * e.g., /dir/file => /dir/<rank>/file
                 */
                if (test->uniqueDir == TRUE) {
                        strcpy(testFileNameRoot,
                               PrependDir(test, testFileNameRoot));
                }
                sprintf(testFileName, "%s.%08d", testFileNameRoot,
                        (rank + rankOffset) % test->numTasks);
        } else {
                strcpy(testFileName, testFileNameRoot);
        }

        /* add suffix for multiple files */
        if (test->repCounter > -1) {
                sprintf(tmpString, ".%d", test->repCounter);
                strcat(testFileName, tmpString);
        }
        /* frees only the pointer array; its entries point into
           initialTestFileName (stack) and must not be freed */
        free (fileNames);
}
/*
 * From absolute directory, insert rank as subdirectory.  Allows each task
 * to write to its own directory.  E.g., /dir/file => /dir/<rank>/file.
 * Creates the rank directory through the backend if absent, validates its
 * permissions otherwise, and returns the combined path in a heap-allocated
 * buffer (callers strcpy the result; the buffer itself is not freed here).
 */
static char *PrependDir(IOR_param_t * test, char *rootDir)
{
        char *dir;
        char *fname;
        int i;

        dir = (char *)malloc(MAX_STR + 1);
        if (dir == NULL)
                ERR("out of memory");

        /* get dir name: truncate just after the last '/' */
        strcpy(dir, rootDir);
        i = strlen(dir) - 1;
        while (i > 0) {
                if (dir[i] == '\0' || dir[i] == '/') {
                        dir[i] = '/';
                        dir[i + 1] = '\0';
                        break;
                }
                i--;
        }

        /* get file name (component following the '/' found above) */
        fname = rootDir + i + 1;

        /* create directory with rank as subdirectory */
        sprintf(dir + i + 1, "%d", (rank + rankOffset) % test->numTasks);

        /* dir doesn't exist, so create */
        if (backend->access(dir, F_OK, test->backend_options) != 0) {
                if (backend->mkdir(dir, S_IRWXU, test->backend_options) < 0) {
                        ERRF("cannot create directory: %s", dir);
                }
                /* check if correct permissions */
        } else if (backend->access(dir, R_OK, test->backend_options) != 0 ||
                   backend->access(dir, W_OK, test->backend_options) != 0 ||
                   backend->access(dir, X_OK, test->backend_options) != 0) {
                ERRF("invalid directory permissions: %s", dir);
        }

        /* concatenate dir and file names */
        strcat(dir, "/");
        strcat(dir, fname);
        return dir;
}
/******************************************************************************/
/*
 * Reduce test results, and show if verbose set.  Collective: reduces the
 * six phase timers to rank 0 (min of the even-indexed start times, max of
 * the odd-indexed end times), derives elapsed open/transfer/close times,
 * bandwidth, IOPS, and the best per-task latency, stores the total time
 * in the results point, and has rank 0 print the reduced line.
 */
static void
ReduceIterResults(IOR_test_t *test, double *timer, const int rep, const int access)
{
        double reduced[IOR_NB_TIMERS] = { 0 };
        double diff[IOR_NB_TIMERS / 2 + 1];
        double totalTime, accessTime;
        IOR_param_t *params = &test->params;
        double bw, iops, latency, minlatency;
        int i;
        MPI_Op op;

        assert(access == WRITE || access == READ);

        /* Find the minimum start time of the even numbered timers, and the
           maximum finish time for the odd numbered timers */
        for (i = 0; i < IOR_NB_TIMERS; i++) {
                op = i % 2 ? MPI_MAX : MPI_MIN;
                MPI_CHECK(MPI_Reduce(&timer[i], &reduced[i], 1, MPI_DOUBLE,
                                     op, 0, testComm), "MPI_Reduce()");
        }

        /* Calculate elapsed times and throughput numbers */
        for (i = 0; i < IOR_NB_TIMERS / 2; i++)
                diff[i] = reduced[2 * i + 1] - reduced[2 * i];
        totalTime = reduced[5] - reduced[0];    /* open start -> close end */
        accessTime = reduced[3] - reduced[2];   /* xfer start -> xfer end */
        IOR_point_t *point = (access == WRITE) ? &test->results[rep].write :
                             &test->results[rep].read;
        point->time = totalTime;

        if (verbose < VERBOSE_0)
                return;

        bw = (double)point->aggFileSizeForBW / totalTime;

        /* For IOPS in this iteration, we divide the total amount of IOs from
         * all ranks over the entire access time (first start -> last end). */
        iops = (point->aggFileSizeForBW / params->transferSize) / accessTime;

        /* For Latency, we divide the total access time for each task over the
         * number of I/Os issued from that task; then reduce and display the
         * minimum (best) latency achieved. So what is reported is the average
         * latency of all ops from a single task, then taking the minimum of
         * that between all tasks. */
        latency = (timer[3] - timer[2]) / (params->blockSize / params->transferSize);
        MPI_CHECK(MPI_Reduce(&latency, &minlatency, 1, MPI_DOUBLE,
                             MPI_MIN, 0, testComm), "MPI_Reduce()");

        /* Only rank 0 tallies and prints the results. */
        if (rank != 0)
                return;
        PrintReducedResult(test, access, bw, iops, latency, diff, totalTime, rep);
}
/*
 * Check for file(s), then remove all files if file-per-proc, else single file.
 * With random task reordering, each task temporarily resets rankOffset so
 * that it deletes its *own* file rather than the reordered one.
 */
static void RemoveFile(char *testFileName, int filePerProc, IOR_param_t * test)
{
        int tmpRankOffset = 0;
        if (filePerProc) {
                /* in random tasks, delete own file */
                if (test->reorderTasksRandom == TRUE) {
                        tmpRankOffset = rankOffset;
                        rankOffset = 0;
                        GetTestFileName(testFileName, test);
                }
                if (backend->access(testFileName, F_OK, test->backend_options) == 0) {
                        if (verbose >= VERBOSE_3) {
                                fprintf(out_logfile, "task %d removing %s\n", rank,
                                        testFileName);
                        }
                        backend->delete(testFileName, test->backend_options);
                }
                if (test->reorderTasksRandom == TRUE) {
                        /* restore the reordered name for subsequent use */
                        rankOffset = tmpRankOffset;
                        GetTestFileName(testFileName, test);
                }
        } else {
                /* shared file: only rank 0 performs the delete */
                if ((rank == 0) && (backend->access(testFileName, F_OK, test->backend_options) == 0)) {
                        if (verbose >= VERBOSE_3) {
                                fprintf(out_logfile, "task %d removing %s\n", rank,
                                        testFileName);
                        }
                        backend->delete(testFileName, test->backend_options);
                }
        }
}
2011-06-17 23:20:43 +04:00
/*
* Setup tests by parsing commandline and creating test script.
* Perform a sanity-check on the configured parameters.
2011-06-17 23:20:43 +04:00
*/
static void InitTests(IOR_test_t *tests, MPI_Comm com)
2011-06-17 23:20:43 +04:00
{
int mpiNumNodes = 0;
int mpiNumTasks = 0;
int mpiNumTasksOnNode0 = 0;
verbose = tests->params.verbose;
aiori_warning_as_errors = tests->params.warningAsErrors;
Fix #181. On systems where numTasks is not evenly divisible by 'tasksPerNode' we were seeing some nodes reading multiple files while others read none after reordering. Commonly all nodes have the same number of tasks but there is nothing requiring that to be the case. Imagine having 64 tasks running against 4 nodes which can run 20 tasks each. Here you get three groups of 20 and one group of 4. On this sytem nodes running in the group of 4 were previously getting tasksPerNode of 4 which meant they reordered tasks differently than the nodes which got tasksPerNode of 20. The key to fixing this is ensuring that every node reorders tasks the same way, which means ensuring they all use the same input values. Obviously on systems where the number of tasks per node is inconsistent the reordering will also be inconsistent (some tasks may end up on the same node, or not as far separated as desired, etc.) but at least this way you'll always end up with a 1:1 reordering. - Renamed nodes/nodeCount to numNodes - Renamed tasksPerNode to numTasksOnNode0 - Ensured that numTasksOnNode0 will always have the same value regardless of which node you're on - Removed inconsistently used globals numTasksWorld and tasksPerNode and replaced with per-test params equivalents - Added utility functions for setting these values: - numNodes -> GetNumNodes - numTasks -> GetNumTasks - numTasksOnNode0 -> GetNumNodesOnTask0 - Improved MPI_VERSION < 3 logic for GetNumNodes so it works when numTasks is not evenly divisible by numTasksOnNode0 - Left 'nodes' and 'tasksPerNode' in output alone to not break compatibility - Allowed command-line params to override numTasks, numNodes, and numTasksOnNode0 but default to using the MPI-calculated values
2019-08-31 01:45:03 +03:00
/*
* These default values are the same for every test and expensive to
* retrieve so just do it once.
*/
mpiNumNodes = GetNumNodes(com);
mpiNumTasks = GetNumTasks(com);
mpiNumTasksOnNode0 = GetNumTasksOnNode0(com);
2011-06-17 23:20:43 +04:00
/*
* Since there is no guarantee that anyone other than
* task 0 has the environment settings for the hints, pass
2018-07-07 13:42:21 +03:00
* the hint=value pair to everyone else in mpi_comm_world
*/
DistributeHints();
2011-06-17 23:20:43 +04:00
/* check validity of tests and create test queue */
while (tests != NULL) {
IOR_param_t *params = & tests->params;
params->testComm = com;
Fix #181. On systems where numTasks is not evenly divisible by 'tasksPerNode' we were seeing some nodes reading multiple files while others read none after reordering. Commonly all nodes have the same number of tasks but there is nothing requiring that to be the case. Imagine having 64 tasks running against 4 nodes which can run 20 tasks each. Here you get three groups of 20 and one group of 4. On this sytem nodes running in the group of 4 were previously getting tasksPerNode of 4 which meant they reordered tasks differently than the nodes which got tasksPerNode of 20. The key to fixing this is ensuring that every node reorders tasks the same way, which means ensuring they all use the same input values. Obviously on systems where the number of tasks per node is inconsistent the reordering will also be inconsistent (some tasks may end up on the same node, or not as far separated as desired, etc.) but at least this way you'll always end up with a 1:1 reordering. - Renamed nodes/nodeCount to numNodes - Renamed tasksPerNode to numTasksOnNode0 - Ensured that numTasksOnNode0 will always have the same value regardless of which node you're on - Removed inconsistently used globals numTasksWorld and tasksPerNode and replaced with per-test params equivalents - Added utility functions for setting these values: - numNodes -> GetNumNodes - numTasks -> GetNumTasks - numTasksOnNode0 -> GetNumNodesOnTask0 - Improved MPI_VERSION < 3 logic for GetNumNodes so it works when numTasks is not evenly divisible by numTasksOnNode0 - Left 'nodes' and 'tasksPerNode' in output alone to not break compatibility - Allowed command-line params to override numTasks, numNodes, and numTasksOnNode0 but default to using the MPI-calculated values
2019-08-31 01:45:03 +03:00
/* use MPI values if not overridden on command-line */
if (params->numNodes == -1) {
params->numNodes = mpiNumNodes;
}
if (params->numTasks == -1) {
params->numTasks = mpiNumTasks;
} else if (params->numTasks > mpiNumTasks) {
if (rank == 0) {
EWARNF("More tasks requested (%d) than available (%d),",
Fix #181. On systems where numTasks is not evenly divisible by 'tasksPerNode' we were seeing some nodes reading multiple files while others read none after reordering. Commonly all nodes have the same number of tasks but there is nothing requiring that to be the case. Imagine having 64 tasks running against 4 nodes which can run 20 tasks each. Here you get three groups of 20 and one group of 4. On this sytem nodes running in the group of 4 were previously getting tasksPerNode of 4 which meant they reordered tasks differently than the nodes which got tasksPerNode of 20. The key to fixing this is ensuring that every node reorders tasks the same way, which means ensuring they all use the same input values. Obviously on systems where the number of tasks per node is inconsistent the reordering will also be inconsistent (some tasks may end up on the same node, or not as far separated as desired, etc.) but at least this way you'll always end up with a 1:1 reordering. - Renamed nodes/nodeCount to numNodes - Renamed tasksPerNode to numTasksOnNode0 - Ensured that numTasksOnNode0 will always have the same value regardless of which node you're on - Removed inconsistently used globals numTasksWorld and tasksPerNode and replaced with per-test params equivalents - Added utility functions for setting these values: - numNodes -> GetNumNodes - numTasks -> GetNumTasks - numTasksOnNode0 -> GetNumNodesOnTask0 - Improved MPI_VERSION < 3 logic for GetNumNodes so it works when numTasks is not evenly divisible by numTasksOnNode0 - Left 'nodes' and 'tasksPerNode' in output alone to not break compatibility - Allowed command-line params to override numTasks, numNodes, and numTasksOnNode0 but default to using the MPI-calculated values
2019-08-31 01:45:03 +03:00
params->numTasks, mpiNumTasks);
EWARNF(" running with %d tasks.\n", mpiNumTasks);
Fix #181. On systems where numTasks is not evenly divisible by 'tasksPerNode' we were seeing some nodes reading multiple files while others read none after reordering. Commonly all nodes have the same number of tasks but there is nothing requiring that to be the case. Imagine having 64 tasks running against 4 nodes which can run 20 tasks each. Here you get three groups of 20 and one group of 4. On this sytem nodes running in the group of 4 were previously getting tasksPerNode of 4 which meant they reordered tasks differently than the nodes which got tasksPerNode of 20. The key to fixing this is ensuring that every node reorders tasks the same way, which means ensuring they all use the same input values. Obviously on systems where the number of tasks per node is inconsistent the reordering will also be inconsistent (some tasks may end up on the same node, or not as far separated as desired, etc.) but at least this way you'll always end up with a 1:1 reordering. - Renamed nodes/nodeCount to numNodes - Renamed tasksPerNode to numTasksOnNode0 - Ensured that numTasksOnNode0 will always have the same value regardless of which node you're on - Removed inconsistently used globals numTasksWorld and tasksPerNode and replaced with per-test params equivalents - Added utility functions for setting these values: - numNodes -> GetNumNodes - numTasks -> GetNumTasks - numTasksOnNode0 -> GetNumNodesOnTask0 - Improved MPI_VERSION < 3 logic for GetNumNodes so it works when numTasks is not evenly divisible by numTasksOnNode0 - Left 'nodes' and 'tasksPerNode' in output alone to not break compatibility - Allowed command-line params to override numTasks, numNodes, and numTasksOnNode0 but default to using the MPI-calculated values
2019-08-31 01:45:03 +03:00
}
params->numTasks = mpiNumTasks;
}
if (params->numTasksOnNode0 == -1) {
params->numTasksOnNode0 = mpiNumTasksOnNode0;
}
Fix #181. On systems where numTasks is not evenly divisible by 'tasksPerNode' we were seeing some nodes reading multiple files while others read none after reordering. Commonly all nodes have the same number of tasks but there is nothing requiring that to be the case. Imagine having 64 tasks running against 4 nodes which can run 20 tasks each. Here you get three groups of 20 and one group of 4. On this sytem nodes running in the group of 4 were previously getting tasksPerNode of 4 which meant they reordered tasks differently than the nodes which got tasksPerNode of 20. The key to fixing this is ensuring that every node reorders tasks the same way, which means ensuring they all use the same input values. Obviously on systems where the number of tasks per node is inconsistent the reordering will also be inconsistent (some tasks may end up on the same node, or not as far separated as desired, etc.) but at least this way you'll always end up with a 1:1 reordering. - Renamed nodes/nodeCount to numNodes - Renamed tasksPerNode to numTasksOnNode0 - Ensured that numTasksOnNode0 will always have the same value regardless of which node you're on - Removed inconsistently used globals numTasksWorld and tasksPerNode and replaced with per-test params equivalents - Added utility functions for setting these values: - numNodes -> GetNumNodes - numTasks -> GetNumTasks - numTasksOnNode0 -> GetNumNodesOnTask0 - Improved MPI_VERSION < 3 logic for GetNumNodes so it works when numTasks is not evenly divisible by numTasksOnNode0 - Left 'nodes' and 'tasksPerNode' in output alone to not break compatibility - Allowed command-line params to override numTasks, numNodes, and numTasksOnNode0 but default to using the MPI-calculated values
2019-08-31 01:45:03 +03:00
params->tasksBlockMapping = QueryNodeMapping(com,false);
params->expectedAggFileSize =
params->blockSize * params->segmentCount * params->numTasks;
ValidateTests(&tests->params);
tests = tests->next;
}
2011-06-17 23:20:43 +04:00
2018-07-07 13:42:21 +03:00
init_clock();
2011-06-17 23:20:43 +04:00
/* seed random number generator */
2018-07-07 13:42:21 +03:00
SeedRandGen(mpi_comm_world);
2011-11-12 03:11:28 +04:00
}
2011-06-17 23:20:43 +04:00
2011-11-12 03:11:28 +04:00
/*
 * Setup transfer buffers, creating and filling as needed.
 */
static void XferBuffersSetup(IOR_io_buffers* ioBuffers, IOR_param_t* test,
                             int pretendRank)
{
        /* both check buffers are needed whenever any verification pass runs */
        int needCheckBuffers = test->checkWrite || test->checkRead;

        ioBuffers->buffer = aligned_buffer_alloc(test->transferSize);
        if (needCheckBuffers) {
                ioBuffers->checkBuffer = aligned_buffer_alloc(test->transferSize);
                ioBuffers->readCheckBuffer = aligned_buffer_alloc(test->transferSize);
        }
}
/*
 * Free transfer buffers.
 *
 * Must mirror the allocation conditions in XferBuffersSetup(): the
 * readCheckBuffer is allocated when either checkRead or checkWrite is
 * set, so it must be freed under the same condition.  (Previously it
 * was freed only for checkRead, leaking it on write-check-only runs.)
 */
static void XferBuffersFree(IOR_io_buffers* ioBuffers, IOR_param_t* test)
{
        aligned_buffer_free(ioBuffers->buffer);
        if (test->checkWrite || test->checkRead) {
                aligned_buffer_free(ioBuffers->checkBuffer);
        }
        if (test->checkRead || test->checkWrite) {
                aligned_buffer_free(ioBuffers->readCheckBuffer);
        }
        return;
}
2011-06-17 23:20:43 +04:00
/*
 * malloc a buffer, touching every page in an attempt to defeat lazy allocation.
 *
 * Returns NULL when size is 0 or when malloc() fails; otherwise returns a
 * buffer whose first byte of every page has been written so the kernel
 * actually backs the memory.
 */
static void *malloc_and_touch(size_t size)
{
        long page_size;
        char *buf;
        char *ptr;

        if (size == 0)
                return NULL;

        /*
         * sysconf() returns long and may return -1 on error; storing that
         * in a size_t would wrap to SIZE_MAX and make the pointer stride
         * below undefined.  Fall back to a common page size on failure.
         */
        page_size = sysconf(_SC_PAGESIZE);
        if (page_size <= 0)
                page_size = 4096;

        buf = (char *)malloc(size);
        if (buf == NULL)
                return NULL;

        /* touch one byte per page */
        for (ptr = buf; ptr < buf + size; ptr += page_size) {
                *ptr = (char)1;
        }
        return (void *)buf;
}
/*
 * Gather every task's rankOffset and, on rank 0, print a histogram of how
 * many files were read by 0, 1, 2, ... tasks after the reorder shift.
 */
static void file_hits_histogram(IOR_param_t *params)
{
        const int ntasks = params->numTasks;
        int *offsets = NULL;
        int *filecounts = NULL;
        int *hitcounts = NULL;

        if (rank == 0) {
                offsets = (int *)malloc(ntasks * sizeof(int));
                filecounts = (int *)malloc(ntasks * sizeof(int));
                hitcounts = (int *)malloc(ntasks * sizeof(int));
        }

        MPI_CHECK(MPI_Gather(&rankOffset, 1, MPI_INT, offsets,
                             1, MPI_INT, 0, mpi_comm_world),
                  "MPI_Gather error");

        if (rank != 0)
                return;

        /* filecounts[f]: number of tasks whose shifted rank maps to file f */
        memset((void *)filecounts, 0, ntasks * sizeof(int));
        for (int t = 0; t < ntasks; t++) {
                filecounts[(t + offsets[t]) % ntasks]++;
        }

        /* hitcounts[n]: number of files that were hit by exactly n tasks */
        memset((void *)hitcounts, 0, ntasks * sizeof(int));
        for (int n = 0; n < ntasks; n++) {
                for (int f = 0; f < ntasks; f++) {
                        if (filecounts[f] == n)
                                hitcounts[n]++;
                }
        }

        fprintf(out_logfile, "#File Hits Dist:");
        int covered = 0;
        for (int n = 0; covered < ntasks && n < ntasks; n++) {
                fprintf(out_logfile, " %d", hitcounts[n]);
                covered += hitcounts[n];
        }
        fprintf(out_logfile, "\n");

        free(offsets);
        free(filecounts);
        free(hitcounts);
}
2012-01-09 00:30:05 +04:00
/*
 * Return nonzero once the test's maximum run time (maxTimeDuration, in
 * minutes) has elapsed since startTime; 0 means no limit is configured.
 */
int test_time_elapsed(IOR_param_t *params, double startTime)
{
        if (params->maxTimeDuration == 0)
                return 0;

        double deadline = startTime + (params->maxTimeDuration * 60);
        return GetTimeStamp() >= deadline;
}
2012-01-09 06:41:30 +04:00
/*
 * hog some memory as a rough simulation of a real application's memory use
 *
 * Returns the hogged buffer (caller owns it), or NULL when neither
 * memoryPerTask nor memoryPerNode is configured.  Aborts via ERR() if the
 * allocation itself fails.
 */
static void *HogMemory(IOR_param_t *params)
{
        size_t size;
        void *buf;

        if (params->memoryPerTask != 0) {
                size = params->memoryPerTask;
        } else if (params->memoryPerNode != 0) {
                if (verbose >= VERBOSE_3)
                        fprintf(out_logfile, "This node hogging %ld bytes of memory\n",
                                params->memoryPerNode);
                /* split the per-node budget evenly across this node's tasks */
                size = params->memoryPerNode / params->numTasksOnNode0;
        } else {
                return NULL;
        }

        if (verbose >= VERBOSE_3)
                /* %zu: 'size' is a size_t; %ld mismatched the argument type */
                fprintf(out_logfile, "This task hogging %zu bytes of memory\n", size);

        buf = malloc_and_touch(size);
        if (buf == NULL)
                ERR("malloc of simulated application buffer failed");

        return buf;
}
/*
 * Write times taken during each iteration of the test.
 *
 * Emits one log line per timer slot (open start/stop, I/O start/stop,
 * close start/stop) for either the write or the read phase.
 */
static void
WriteTimes(IOR_param_t *test, const double *timer, const int iteration,
           const int access)
{
        /* timer-slot labels, indexed to match the timer[] array layout */
        static const char * const writeTimerNames[IOR_NB_TIMERS] = {
                "write open start",
                "write open stop",
                "write start",
                "write stop",
                "write close start",
                "write close stop",
        };
        static const char * const readTimerNames[IOR_NB_TIMERS] = {
                "read open start",
                "read open stop",
                "read start",
                "read stop",
                "read close start",
                "read close stop",
        };

        for (int i = 0; i < IOR_NB_TIMERS; i++) {
                const char *timerName = (access == WRITE)
                        ? writeTimerNames[i] : readTimerNames[i];
                fprintf(out_logfile, "Test %d: Iter=%d, Task=%d, Time=%f, %s\n",
                        test->id, iteration, (int)rank, timer[i],
                        timerName);
        }
}
2011-06-17 23:20:43 +04:00
/*
* Using the test parameters, run iteration(s) of single test.
*/
static void TestIoSys(IOR_test_t *test)
2011-06-17 23:20:43 +04:00
{
IOR_param_t *params = &test->params;
IOR_results_t *results = test->results;
char testFileName[MAX_STR];
double timer[IOR_NB_TIMERS];
double startTime;
int pretendRank;
int rep;
aiori_fd_t *fd;
MPI_Group orig_group, new_group;
int range[3];
IOR_offset_t dataMoved; /* for data rate calculation */
void *hog_buf;
IOR_io_buffers ioBuffers;
/* set up communicator for test */
2018-07-07 13:42:21 +03:00
MPI_CHECK(MPI_Comm_group(mpi_comm_world, &orig_group),
"MPI_Comm_group() error");
range[0] = 0; /* first rank */
range[1] = params->numTasks - 1; /* last rank */
range[2] = 1; /* stride */
MPI_CHECK(MPI_Group_range_incl(orig_group, 1, &range, &new_group),
"MPI_Group_range_incl() error");
2018-07-07 13:42:21 +03:00
MPI_CHECK(MPI_Comm_create(mpi_comm_world, new_group, &testComm),
"MPI_Comm_create() error");
MPI_CHECK(MPI_Group_free(&orig_group), "MPI_Group_Free() error");
MPI_CHECK(MPI_Group_free(&new_group), "MPI_Group_Free() error");
params->testComm = testComm;
if (testComm == MPI_COMM_NULL) {
/* tasks not in the group do not participate in this test */
2018-07-07 13:42:21 +03:00
MPI_CHECK(MPI_Barrier(mpi_comm_world), "barrier error");
return;
}
if (rank == 0 && verbose >= VERBOSE_1) {
fprintf(out_logfile, "Participating tasks : %d\n", params->numTasks);
2018-07-07 13:42:21 +03:00
fflush(out_logfile);
}
if (rank == 0 && params->reorderTasks == TRUE && verbose >= VERBOSE_1) {
2018-07-07 13:42:21 +03:00
fprintf(out_logfile,
"Using reorderTasks '-C' (useful to avoid read cache in client)\n");
2018-07-07 13:42:21 +03:00
fflush(out_logfile);
2011-06-17 23:20:43 +04:00
}
/* show test setup */
if (rank == 0 && verbose >= VERBOSE_0)
ShowSetup(params);
2011-06-17 23:20:43 +04:00
2012-01-09 06:41:30 +04:00
hog_buf = HogMemory(params);
pretendRank = (rank + rankOffset) % params->numTasks;
/* IO Buffer Setup */
if (params->setTimeStampSignature) { // initialize the buffer properly
2019-12-22 14:21:40 +03:00
params->timeStampSignatureValue = (unsigned int) params->setTimeStampSignature;
}
XferBuffersSetup(&ioBuffers, params, pretendRank);
reseed_incompressible_prng = TRUE; // reset pseudo random generator, necessary to guarantee the next call to FillBuffer produces the same value as it is right now
/* Initial time stamp */
startTime = GetTimeStamp();
2011-06-17 23:20:43 +04:00
/* loop over test iterations */
uint64_t params_saved_wearout = params->stoneWallingWearOutIterations;
for (rep = 0; rep < params->repetitions; rep++) {
/* Get iteration start time in seconds in task 0 and broadcast to
all tasks */
if (rank == 0) {
if (! params->setTimeStampSignature) {
time_t currentTime;
if ((currentTime = time(NULL)) == -1) {
ERR("cannot get current time");
}
params->timeStampSignatureValue =
(unsigned int)currentTime;
}
if (verbose >= VERBOSE_2) {
fprintf(out_logfile,
"Using Time Stamp %u (0x%x) for Data Signature\n",
params->timeStampSignatureValue,
params->timeStampSignatureValue);
}
if (rep == 0 && verbose >= VERBOSE_0) {
2018-07-08 15:47:55 +03:00
PrintTableHeader();
}
}
MPI_CHECK(MPI_Bcast
(&params->timeStampSignatureValue, 1, MPI_UNSIGNED, 0,
testComm), "cannot broadcast start time value");
2017-12-09 13:52:13 +03:00
FillBuffer(ioBuffers.buffer, params, 0, pretendRank);
/* use repetition count for number of multiple files */
if (params->multiFile)
params->repCounter = rep;
/*
* write the file(s), getting timing between I/O calls
*/
2012-01-09 00:30:05 +04:00
if (params->writeFile && !test_time_elapsed(params, startTime)) {
GetTestFileName(testFileName, params);
if (verbose >= VERBOSE_3) {
2018-07-07 13:42:21 +03:00
fprintf(out_logfile, "task %d writing %s\n", rank,
testFileName);
}
DelaySecs(params->interTestDelay);
if (params->useExistingTestFile == FALSE) {
RemoveFile(testFileName, params->filePerProc,
params);
}
params->stoneWallingWearOutIterations = params_saved_wearout;
MPI_CHECK(MPI_Barrier(testComm), "barrier error");
params->open = WRITE;
timer[0] = GetTimeStamp();
fd = backend->create(testFileName, IOR_WRONLY | IOR_CREAT | IOR_TRUNC, params->backend_options);
timer[1] = GetTimeStamp();
if (params->intraTestBarriers)
MPI_CHECK(MPI_Barrier(testComm),
"barrier error");
if (rank == 0 && verbose >= VERBOSE_1) {
2018-07-07 13:42:21 +03:00
fprintf(out_logfile,
"Commencing write performance test: %s",
CurrentTimeString());
}
timer[2] = GetTimeStamp();
dataMoved = WriteOrRead(params, &results[rep], fd, WRITE, &ioBuffers);
if (params->verbose >= VERBOSE_4) {
2018-07-07 13:42:21 +03:00
fprintf(out_logfile, "* data moved = %llu\n", dataMoved);
fflush(out_logfile);
}
timer[3] = GetTimeStamp();
if (params->intraTestBarriers)
MPI_CHECK(MPI_Barrier(testComm),
"barrier error");
timer[4] = GetTimeStamp();
backend->close(fd, params->backend_options);
timer[5] = GetTimeStamp();
MPI_CHECK(MPI_Barrier(testComm), "barrier error");
/* get the size of the file just written */
results[rep].write.aggFileSizeFromStat =
backend->get_file_size(params->backend_options, testComm, testFileName);
/* check if stat() of file doesn't equal expected file size,
use actual amount of byte moved */
CheckFileSize(test, dataMoved, rep, WRITE);
if (verbose >= VERBOSE_3)
WriteTimes(params, timer, rep, WRITE);
ReduceIterResults(test, timer, rep, WRITE);
if (params->outlierThreshold) {
CheckForOutliers(params, timer, WRITE);
}
/* check if in this round we run write with stonewalling */
if(params->deadlineForStonewalling > 0){
params->stoneWallingWearOutIterations = results[rep].write.pairs_accessed;
}
}
2011-06-17 23:20:43 +04:00
/*
* perform a check of data, reading back data and comparing
* against what was expected to be written
*/
2012-01-09 00:30:05 +04:00
if (params->checkWrite && !test_time_elapsed(params, startTime)) {
MPI_CHECK(MPI_Barrier(testComm), "barrier error");
if (rank == 0 && verbose >= VERBOSE_1) {
2018-07-07 13:42:21 +03:00
fprintf(out_logfile,
"Verifying contents of the file(s) just written.\n");
2018-07-07 13:42:21 +03:00
fprintf(out_logfile, "%s\n", CurrentTimeString());
}
if (params->reorderTasks) {
/* move two nodes away from writing node */
int shift = 1; /* assume a by-node (round-robin) mapping of tasks to nodes */
if (params->tasksBlockMapping) {
Fix #181. On systems where numTasks is not evenly divisible by 'tasksPerNode' we were seeing some nodes reading multiple files while others read none after reordering. Commonly all nodes have the same number of tasks but there is nothing requiring that to be the case. Imagine having 64 tasks running against 4 nodes which can run 20 tasks each. Here you get three groups of 20 and one group of 4. On this sytem nodes running in the group of 4 were previously getting tasksPerNode of 4 which meant they reordered tasks differently than the nodes which got tasksPerNode of 20. The key to fixing this is ensuring that every node reorders tasks the same way, which means ensuring they all use the same input values. Obviously on systems where the number of tasks per node is inconsistent the reordering will also be inconsistent (some tasks may end up on the same node, or not as far separated as desired, etc.) but at least this way you'll always end up with a 1:1 reordering. - Renamed nodes/nodeCount to numNodes - Renamed tasksPerNode to numTasksOnNode0 - Ensured that numTasksOnNode0 will always have the same value regardless of which node you're on - Removed inconsistently used globals numTasksWorld and tasksPerNode and replaced with per-test params equivalents - Added utility functions for setting these values: - numNodes -> GetNumNodes - numTasks -> GetNumTasks - numTasksOnNode0 -> GetNumNodesOnTask0 - Improved MPI_VERSION < 3 logic for GetNumNodes so it works when numTasks is not evenly divisible by numTasksOnNode0 - Left 'nodes' and 'tasksPerNode' in output alone to not break compatibility - Allowed command-line params to override numTasks, numNodes, and numTasksOnNode0 but default to using the MPI-calculated values
2019-08-31 01:45:03 +03:00
shift = params->numTasksOnNode0; /* switch to by-slot (contiguous block) mapping */
}
rankOffset = (2 * shift) % params->numTasks;
}
// update the check buffer
FillBuffer(ioBuffers.readCheckBuffer, params, 0, (rank + rankOffset) % params->numTasks);
reseed_incompressible_prng = TRUE; /* Re-Seed the PRNG to get same sequence back, if random */
GetTestFileName(testFileName, params);
params->open = WRITECHECK;
fd = backend->open(testFileName, IOR_RDONLY, params->backend_options);
dataMoved = WriteOrRead(params, &results[rep], fd, WRITECHECK, &ioBuffers);
backend->close(fd, params->backend_options);
rankOffset = 0;
}
/*
* read the file(s), getting timing between I/O calls
*/
if ((params->readFile || params->checkRead ) && !test_time_elapsed(params, startTime)) {
/* check for stonewall */
if(params->stoneWallingStatusFile){
params->stoneWallingWearOutIterations = ReadStoneWallingIterations(params->stoneWallingStatusFile);
if(params->stoneWallingWearOutIterations == -1 && rank == 0){
WARN("Could not read back the stonewalling status from the file!");
params->stoneWallingWearOutIterations = 0;
}
}
int operation_flag = READ;
if ( params->checkRead ){
// actually read and then compare the buffer
operation_flag = READCHECK;
}
/* Get rankOffset [file offset] for this process to read, based on -C,-Z,-Q,-X options */
/* Constant process offset reading */
if (params->reorderTasks) {
/* move one node away from writing node */
int shift = 1; /* assume a by-node (round-robin) mapping of tasks to nodes */
if (params->tasksBlockMapping) {
Fix #181. On systems where numTasks is not evenly divisible by 'tasksPerNode' we were seeing some nodes reading multiple files while others read none after reordering. Commonly all nodes have the same number of tasks but there is nothing requiring that to be the case. Imagine having 64 tasks running against 4 nodes which can run 20 tasks each. Here you get three groups of 20 and one group of 4. On this sytem nodes running in the group of 4 were previously getting tasksPerNode of 4 which meant they reordered tasks differently than the nodes which got tasksPerNode of 20. The key to fixing this is ensuring that every node reorders tasks the same way, which means ensuring they all use the same input values. Obviously on systems where the number of tasks per node is inconsistent the reordering will also be inconsistent (some tasks may end up on the same node, or not as far separated as desired, etc.) but at least this way you'll always end up with a 1:1 reordering. - Renamed nodes/nodeCount to numNodes - Renamed tasksPerNode to numTasksOnNode0 - Ensured that numTasksOnNode0 will always have the same value regardless of which node you're on - Removed inconsistently used globals numTasksWorld and tasksPerNode and replaced with per-test params equivalents - Added utility functions for setting these values: - numNodes -> GetNumNodes - numTasks -> GetNumTasks - numTasksOnNode0 -> GetNumNodesOnTask0 - Improved MPI_VERSION < 3 logic for GetNumNodes so it works when numTasks is not evenly divisible by numTasksOnNode0 - Left 'nodes' and 'tasksPerNode' in output alone to not break compatibility - Allowed command-line params to override numTasks, numNodes, and numTasksOnNode0 but default to using the MPI-calculated values
2019-08-31 01:45:03 +03:00
shift=params->numTasksOnNode0; /* switch to a by-slot (contiguous block) mapping */
}
rankOffset = (params->taskPerNodeOffset * shift) % params->numTasks;
}
/* random process offset reading */
if (params->reorderTasksRandom) {
/* this should not intefere with randomOffset within a file because GetOffsetArrayRandom */
/* seeds every rand() call */
int nodeoffset;
unsigned int iseed0;
nodeoffset = params->taskPerNodeOffset;
Fix #181. On systems where numTasks is not evenly divisible by 'tasksPerNode' we were seeing some nodes reading multiple files while others read none after reordering. Commonly all nodes have the same number of tasks but there is nothing requiring that to be the case. Imagine having 64 tasks running against 4 nodes which can run 20 tasks each. Here you get three groups of 20 and one group of 4. On this sytem nodes running in the group of 4 were previously getting tasksPerNode of 4 which meant they reordered tasks differently than the nodes which got tasksPerNode of 20. The key to fixing this is ensuring that every node reorders tasks the same way, which means ensuring they all use the same input values. Obviously on systems where the number of tasks per node is inconsistent the reordering will also be inconsistent (some tasks may end up on the same node, or not as far separated as desired, etc.) but at least this way you'll always end up with a 1:1 reordering. - Renamed nodes/nodeCount to numNodes - Renamed tasksPerNode to numTasksOnNode0 - Ensured that numTasksOnNode0 will always have the same value regardless of which node you're on - Removed inconsistently used globals numTasksWorld and tasksPerNode and replaced with per-test params equivalents - Added utility functions for setting these values: - numNodes -> GetNumNodes - numTasks -> GetNumTasks - numTasksOnNode0 -> GetNumNodesOnTask0 - Improved MPI_VERSION < 3 logic for GetNumNodes so it works when numTasks is not evenly divisible by numTasksOnNode0 - Left 'nodes' and 'tasksPerNode' in output alone to not break compatibility - Allowed command-line params to override numTasks, numNodes, and numTasksOnNode0 but default to using the MPI-calculated values
2019-08-31 01:45:03 +03:00
nodeoffset = (nodeoffset < params->numNodes) ? nodeoffset : params->numNodes - 1;
if (params->reorderTasksRandomSeed < 0)
iseed0 = -1 * params->reorderTasksRandomSeed + rep;
else
iseed0 = params->reorderTasksRandomSeed;
srand(rank + iseed0);
{
rankOffset = rand() % params->numTasks;
}
while (rankOffset <
Fix #181. On systems where numTasks is not evenly divisible by 'tasksPerNode' we were seeing some nodes reading multiple files while others read none after reordering. Commonly all nodes have the same number of tasks but there is nothing requiring that to be the case. Imagine having 64 tasks running against 4 nodes which can run 20 tasks each. Here you get three groups of 20 and one group of 4. On this sytem nodes running in the group of 4 were previously getting tasksPerNode of 4 which meant they reordered tasks differently than the nodes which got tasksPerNode of 20. The key to fixing this is ensuring that every node reorders tasks the same way, which means ensuring they all use the same input values. Obviously on systems where the number of tasks per node is inconsistent the reordering will also be inconsistent (some tasks may end up on the same node, or not as far separated as desired, etc.) but at least this way you'll always end up with a 1:1 reordering. - Renamed nodes/nodeCount to numNodes - Renamed tasksPerNode to numTasksOnNode0 - Ensured that numTasksOnNode0 will always have the same value regardless of which node you're on - Removed inconsistently used globals numTasksWorld and tasksPerNode and replaced with per-test params equivalents - Added utility functions for setting these values: - numNodes -> GetNumNodes - numTasks -> GetNumTasks - numTasksOnNode0 -> GetNumNodesOnTask0 - Improved MPI_VERSION < 3 logic for GetNumNodes so it works when numTasks is not evenly divisible by numTasksOnNode0 - Left 'nodes' and 'tasksPerNode' in output alone to not break compatibility - Allowed command-line params to override numTasks, numNodes, and numTasksOnNode0 but default to using the MPI-calculated values
2019-08-31 01:45:03 +03:00
(nodeoffset * params->numTasksOnNode0)) {
rankOffset = rand() % params->numTasks;
}
/* Get more detailed stats if requested by verbose level */
if (verbose >= VERBOSE_2) {
file_hits_histogram(params);
}
}
if(operation_flag == READCHECK){
2017-10-25 16:57:50 +03:00
FillBuffer(ioBuffers.readCheckBuffer, params, 0, (rank + rankOffset) % params->numTasks);
}
/* Using globally passed rankOffset, following function generates testFileName to read */
GetTestFileName(testFileName, params);
if (verbose >= VERBOSE_3) {
2018-07-07 13:42:21 +03:00
fprintf(out_logfile, "task %d reading %s\n", rank,
testFileName);
}
DelaySecs(params->interTestDelay);
MPI_CHECK(MPI_Barrier(testComm), "barrier error");
params->open = READ;
timer[0] = GetTimeStamp();
fd = backend->open(testFileName, IOR_RDONLY, params->backend_options);
timer[1] = GetTimeStamp();
if (params->intraTestBarriers)
MPI_CHECK(MPI_Barrier(testComm),
"barrier error");
if (rank == 0 && verbose >= VERBOSE_1) {
2018-07-07 13:42:21 +03:00
fprintf(out_logfile,
Fix #181. On systems where numTasks is not evenly divisible by 'tasksPerNode' we were seeing some nodes reading multiple files while others read none after reordering. Commonly all nodes have the same number of tasks but there is nothing requiring that to be the case. Imagine having 64 tasks running against 4 nodes which can run 20 tasks each. Here you get three groups of 20 and one group of 4. On this sytem nodes running in the group of 4 were previously getting tasksPerNode of 4 which meant they reordered tasks differently than the nodes which got tasksPerNode of 20. The key to fixing this is ensuring that every node reorders tasks the same way, which means ensuring they all use the same input values. Obviously on systems where the number of tasks per node is inconsistent the reordering will also be inconsistent (some tasks may end up on the same node, or not as far separated as desired, etc.) but at least this way you'll always end up with a 1:1 reordering. - Renamed nodes/nodeCount to numNodes - Renamed tasksPerNode to numTasksOnNode0 - Ensured that numTasksOnNode0 will always have the same value regardless of which node you're on - Removed inconsistently used globals numTasksWorld and tasksPerNode and replaced with per-test params equivalents - Added utility functions for setting these values: - numNodes -> GetNumNodes - numTasks -> GetNumTasks - numTasksOnNode0 -> GetNumNodesOnTask0 - Improved MPI_VERSION < 3 logic for GetNumNodes so it works when numTasks is not evenly divisible by numTasksOnNode0 - Left 'nodes' and 'tasksPerNode' in output alone to not break compatibility - Allowed command-line params to override numTasks, numNodes, and numTasksOnNode0 but default to using the MPI-calculated values
2019-08-31 01:45:03 +03:00
"Commencing read performance test: %s\n",
CurrentTimeString());
}
timer[2] = GetTimeStamp();
dataMoved = WriteOrRead(params, &results[rep], fd, operation_flag, &ioBuffers);
timer[3] = GetTimeStamp();
if (params->intraTestBarriers)
MPI_CHECK(MPI_Barrier(testComm),
"barrier error");
timer[4] = GetTimeStamp();
backend->close(fd, params->backend_options);
timer[5] = GetTimeStamp();
/* get the size of the file just read */
results[rep].read.aggFileSizeFromStat =
backend->get_file_size(params->backend_options, testComm,
testFileName);
/* check if stat() of file doesn't equal expected file size,
use actual amount of byte moved */
CheckFileSize(test, dataMoved, rep, READ);
if (verbose >= VERBOSE_3)
WriteTimes(params, timer, rep, READ);
ReduceIterResults(test, timer, rep, READ);
if (params->outlierThreshold) {
CheckForOutliers(params, timer, READ);
}
}
if (!params->keepFile
2012-01-09 06:55:46 +04:00
&& !(params->errorFound && params->keepFileWithError)) {
double start, finish;
start = GetTimeStamp();
MPI_CHECK(MPI_Barrier(testComm), "barrier error");
RemoveFile(testFileName, params->filePerProc, params);
2012-01-09 06:55:46 +04:00
MPI_CHECK(MPI_Barrier(testComm), "barrier error");
finish = GetTimeStamp();
PrintRemoveTiming(start, finish, rep);
} else {
MPI_CHECK(MPI_Barrier(testComm), "barrier error");
}
params->errorFound = FALSE;
rankOffset = 0;
2018-07-08 18:47:23 +03:00
PrintRepeatEnd();
}
2011-06-17 23:20:43 +04:00
MPI_CHECK(MPI_Comm_free(&testComm), "MPI_Comm_free() error");
if (params->summary_every_test) {
PrintLongSummaryHeader();
PrintLongSummaryOneTest(test);
} else {
PrintShortSummary(test);
}
XferBuffersFree(&ioBuffers, params);
if (hog_buf != NULL)
free(hog_buf);
/* Sync with the tasks that did not participate in this test */
2018-07-07 13:42:21 +03:00
MPI_CHECK(MPI_Barrier(mpi_comm_world), "barrier error");
2011-11-12 03:11:28 +04:00
}
2011-06-17 23:20:43 +04:00
/*
 * Determine if valid tests from parameters.
 *
 * Validates one test's parameters, aborting via ERR() on fatal
 * inconsistencies or resetting individual fields to their defaults via
 * WARN_RESET() for recoverable ones.  Checks are ordered; the first fatal
 * violation wins.  Also installs the test's backend into the file-scope
 * 'backend' pointer and pushes the transfer hints to the backend.
 */
static void ValidateTests(IOR_param_t * test)
{
        IOR_param_t defaults;
        init_IOR_Param_t(&defaults);

        /* basic run parameters */
        if (test->repetitions <= 0)
                WARN_RESET("too few test repetitions",
                           test, &defaults, repetitions);
        if (test->numTasks <= 0)
                ERR("too few tasks for testing");
        if (test->interTestDelay < 0)
                WARN_RESET("inter-test delay must be nonnegative value",
                           test, &defaults, interTestDelay);
        if (test->readFile != TRUE && test->writeFile != TRUE
            && test->checkRead != TRUE && test->checkWrite != TRUE)
                ERR("test must write, read, or check read/write file");
        if(! test->setTimeStampSignature && test->writeFile != TRUE && test->checkRead == TRUE)
                ERR("using readCheck only requires to write a timeStampSignature -- use -G");
        /* NOTE: only negative counts are rejected (zero passes), so the
         * message says "non-negative"; it previously claimed "positive",
         * contradicting the actual check. */
        if (test->segmentCount < 0)
                ERR("segment count must be non-negative value");

        /* block / transfer size geometry */
        if ((test->blockSize % sizeof(IOR_size_t)) != 0)
                ERR("block size must be a multiple of access size");
        if (test->blockSize < 0)
                ERR("block size must be non-negative integer");
        if ((test->transferSize % sizeof(IOR_size_t)) != 0)
                ERR("transfer size must be a multiple of access size");
        if (test->transferSize < 0)
                ERR("transfer size must be non-negative integer");
        if (test->transferSize == 0) {
                ERR("test will not complete with zero transfer size");
        } else {
                if ((test->blockSize % test->transferSize) != 0)
                        ERR("block size must be a multiple of transfer size");
        }
        if (test->blockSize < test->transferSize)
                ERR("block size must not be smaller than transfer size");

        /* specific APIs */
        if ((strcasecmp(test->api, "MPIIO") == 0)
            && (test->blockSize < sizeof(IOR_size_t)
                || test->transferSize < sizeof(IOR_size_t)))
                ERR("block/transfer size may not be smaller than IOR_size_t for MPIIO");
        if ((strcasecmp(test->api, "HDF5") == 0)
            && (test->blockSize < sizeof(IOR_size_t)
                || test->transferSize < sizeof(IOR_size_t)))
                ERR("block/transfer size may not be smaller than IOR_size_t for HDF5");
        if ((strcasecmp(test->api, "NCMPI") == 0)
            && (test->blockSize < sizeof(IOR_size_t)
                || test->transferSize < sizeof(IOR_size_t)))
                ERR("block/transfer size may not be smaller than IOR_size_t for NCMPI");
        /* fsync is only meaningful for the backends listed below */
        if (((strcasecmp(test->api, "POSIX") != 0)
             && (strcasecmp(test->api, "MPIIO") != 0)
             && (strcasecmp(test->api, "MMAP") != 0)
             && (strcasecmp(test->api, "HDFS") != 0)
             && (strcasecmp(test->api, "DFS") != 0)
             && (strcasecmp(test->api, "DAOS") != 0)
             && (strcasecmp(test->api, "Gfarm") != 0)
             && (strcasecmp(test->api, "RADOS") != 0)
             && (strcasecmp(test->api, "CEPHFS") != 0)) && test->fsync)
                WARN_RESET("fsync() not supported in selected backend",
                           test, &defaults, fsync);

        /* parameter consistency */
        if (test->reorderTasks == TRUE && test->reorderTasksRandom == TRUE)
                ERR("Both Constant and Random task re-ordering specified. Choose one and resubmit");
        if (test->randomOffset && test->reorderTasksRandom
            && test->filePerProc == FALSE)
                ERR("random offset and random reorder tasks specified with single-shared-file. Choose one and resubmit");
        if (test->randomOffset && test->reorderTasks
            && test->filePerProc == FALSE)
                ERR("random offset and constant reorder tasks specified with single-shared-file. Choose one and resubmit");
        if (test->randomOffset && test->checkRead)
                ERR("random offset not available with read check option (use write check)");
        if (test->randomOffset && test->storeFileOffset)
                ERR("random offset not available with store file offset option)");
        if ((strcasecmp(test->api, "HDF5") == 0) && test->randomOffset)
                ERR("random offset not available with HDF5");
        if ((strcasecmp(test->api, "NCMPI") == 0) && test->randomOffset)
                ERR("random offset not available with NCMPI");
        if ((strcasecmp(test->api, "NCMPI") == 0) && test->filePerProc)
                ERR("file-per-proc not available in current NCMPI");

        /* install the backend for this test and push the transfer hints */
        backend = test->backend;
        ior_set_xfer_hints(test);
        /* allow the backend to validate the options */
        if(test->backend->check_params){
                int check = test->backend->check_params(test->backend_options);
                if (check){
                        ERR("The backend returned that the test parameters are invalid.");
                }
        }
}
/**
 * Build the precomputed offset array used by the inner benchmark loop for a
 * sequential access pattern.  Offsets are emitted in order and the array is
 * terminated with a -1 sentinel.
 *
 * @param test        IOR_param_t supplying transferSize, blockSize and segmentCount
 * @param pretendRank rank used to shift the offsets inside a shared file
 * @return malloc'd IOR_offset_t array (caller frees); last element is -1
 */
IOR_offset_t *GetOffsetArraySequential(IOR_param_t * test, int pretendRank)
{
        IOR_offset_t transfersPerBlock = test->blockSize / test->transferSize;
        IOR_offset_t total = transfersPerBlock * test->segmentCount;
        IOR_offset_t *offsets;
        IOR_offset_t seg, xfer, idx = 0;

        /* allocate one extra slot for the end-of-array sentinel */
        offsets = (IOR_offset_t *) malloc((total + 1) * sizeof(IOR_offset_t));
        if (offsets == NULL)
                ERR("malloc() failed");
        offsets[total] = -1;    /* sentinel marking the end of the array */

        for (seg = 0; seg < test->segmentCount; seg++) {
                for (xfer = 0; xfer < transfersPerBlock; xfer++) {
                        IOR_offset_t off = xfer * test->transferSize;
                        if (test->filePerProc) {
                                /* private file: segments are back to back */
                                off += seg * test->blockSize;
                        } else {
                                /* shared file: each segment holds one block
                                 * per task, strided by (pretend) rank */
                                off += (seg * test->numTasks * test->blockSize)
                                       + (pretendRank * test->blockSize);
                        }
                        offsets[idx++] = off;
                }
        }
        return (offsets);
}
/**
 * Build the precomputed offset array used by the inner benchmark loop for a
 * random access pattern.  Offsets are generated sequentially, then shuffled;
 * the last array element is a -1 end marker.
 *
 * Because the seeds are synchronized across all processes, every process
 * computes the same random order when used with filePerProc.  For a shared
 * file each transfer is randomly assigned to a rank, so ranks may receive
 * different numbers of transfers; that spreads accesses more than
 * filePerProc does, which is expected and should be kept in mind.
 *
 * @param test        IOR_param_t supplying transferSize, blockSize and segmentCount
 * @param pretendRank rank used to select this rank's share of a shared file
 * @param access      WRITE/READ draw a fresh seed; check phases reuse it
 * @return malloc'd IOR_offset_t array (caller frees); last element is -1
 */
IOR_offset_t *GetOffsetArrayRandom(IOR_param_t * test, int pretendRank, int access)
{
        int seed;
        IOR_offset_t pos, swapIdx, swapVal;
        IOR_offset_t count = 0;
        IOR_offset_t fillIdx = 0;
        IOR_offset_t totalBytes;
        IOR_offset_t *offsets;

        /* pick the seed for random(): WRITE/READ draw a fresh one and store
         * it so the matching check phase replays the identical sequence */
        if (access == WRITE || access == READ) {
                seed = rand();
                test->randomSeed = seed;
        } else {
                seed = test->randomSeed;
        }
        srand(seed);

        totalBytes = test->blockSize * test->segmentCount;
        if (test->filePerProc == FALSE) {
                totalBytes *= test->numTasks;
        }

        /* pass 1: count how many transfers this rank will perform */
        for (pos = 0; pos < totalBytes; pos += test->transferSize) {
                if (test->filePerProc == FALSE) {
                        /* shared file: raffle each transfer off to one rank */
                        if ((rand() % test->numTasks) == pretendRank) {
                                count++;
                        }
                } else {
                        count++;
                }
        }

        /* allocate one extra slot for the end-of-array sentinel */
        offsets = (IOR_offset_t *) malloc((count + 1) * sizeof(IOR_offset_t));
        if (offsets == NULL)
                ERR("malloc() failed");
        offsets[count] = -1;    /* sentinel marking the end of the array */

        if (test->filePerProc) {
                /* private file: start from the plain sequential offsets */
                for (pos = 0; pos < count; pos++) {
                        offsets[pos] = pos * test->transferSize;
                }
        } else {
                /* pass 2: replay the raffle with the same seed to collect
                 * exactly the offsets counted above */
                srand(seed);
                for (pos = 0; pos < totalBytes; pos += test->transferSize) {
                        if ((rand() % test->numTasks) == pretendRank) {
                                offsets[fillIdx++] = pos;
                        }
                }
        }

        /* shuffle the offsets in place */
        for (pos = 0; pos < count; pos++) {
                swapIdx = rand() % count;
                swapVal = offsets[swapIdx];
                offsets[swapIdx] = offsets[pos];
                offsets[pos] = swapVal;
        }

        SeedRandGen(test->testComm);    /* synchronize seeds across tasks */
        return (offsets);
}
2011-06-17 23:20:43 +04:00
/*
 * Perform one transfer at offsetArray[pairCnt] for the given access mode
 * (WRITE, READ, WRITECHECK or READCHECK) and return the number of bytes
 * moved.  For the check modes, the transfer is re-read into the check
 * buffer and compared; mismatches are accumulated into *errors and
 * *transferCount is advanced for WRITECHECK.
 */
static IOR_offset_t WriteOrReadSingle(IOR_offset_t pairCnt, IOR_offset_t *offsetArray, int pretendRank,
  IOR_offset_t * transferCount, int * errors, IOR_param_t * test, aiori_fd_t * fd, IOR_io_buffers* ioBuffers, int access){
        IOR_offset_t bytesDone = 0;
        IOR_offset_t xferSize = test->transferSize;
        IOR_offset_t fileOffset = offsetArray[pairCnt];
        void *dataBuf = ioBuffers->buffer;
        void *cmpBuf = ioBuffers->checkBuffer;
        void *refBuf = ioBuffers->readCheckBuffer;

        switch (access) {
        case WRITE:
                /* fill each transfer with a unique pattern containing the
                 * offset into the file, when requested */
                if (test->storeFileOffset == TRUE) {
                        FillBuffer(dataBuf, test, fileOffset, pretendRank);
                }
                bytesDone = backend->xfer(access, fd, dataBuf, xferSize, fileOffset, test->backend_options);
                if (bytesDone != xferSize)
                        ERR("cannot write to file");
                if (test->fsyncPerWrite)
                        backend->fsync(fd, test->backend_options);
                if (test->interIODelay > 0){
                        /* interIODelay is in microseconds */
                        struct timespec wait = {test->interIODelay / 1000 / 1000, 1000l * (test->interIODelay % 1000000)};
                        nanosleep( & wait, NULL);
                }
                break;
        case READ:
                bytesDone = backend->xfer(access, fd, dataBuf, xferSize, fileOffset, test->backend_options);
                if (bytesDone != xferSize)
                        ERR("cannot read from file");
                if (test->interIODelay > 0){
                        /* interIODelay is in microseconds */
                        struct timespec wait = {test->interIODelay / 1000 / 1000, 1000l * (test->interIODelay % 1000000)};
                        nanosleep( & wait, NULL);
                }
                break;
        case WRITECHECK:
                /* poison the check buffer so a short read is detectable */
                memset(cmpBuf, 'a', xferSize);
                if (test->storeFileOffset == TRUE) {
                        FillBuffer(refBuf, test, fileOffset, pretendRank);
                }
                bytesDone = backend->xfer(access, fd, cmpBuf, xferSize, fileOffset, test->backend_options);
                if (bytesDone != xferSize)
                        ERR("cannot read from file write check");
                (*transferCount)++;
                *errors += CompareBuffers(refBuf, cmpBuf, xferSize,
                                          *transferCount, test,
                                          WRITECHECK);
                break;
        case READCHECK:
                /* poison the check buffer so a short read is detectable */
                memset(cmpBuf, 'a', xferSize);
                bytesDone = backend->xfer(access, fd, cmpBuf, xferSize, fileOffset, test->backend_options);
                if (bytesDone != xferSize){
                        ERR("cannot read from file");
                }
                if (test->storeFileOffset == TRUE) {
                        FillBuffer(refBuf, test, fileOffset, pretendRank);
                }
                *errors += CompareBuffers(refBuf, cmpBuf, xferSize, *transferCount, test, READCHECK);
                break;
        default:
                /* unknown access mode: nothing transferred (matches the
                 * original if/else-if chain, which fell through silently) */
                break;
        }
        return bytesDone;
}
2011-06-17 23:20:43 +04:00
/*
 * Write or Read data to file(s). This loops through the strides, writing
 * out the data to each block in transfer sizes, until the remainder left is 0.
 *
 * Returns the number of bytes moved by this rank (for data-rate
 * calculation).  Honors stonewalling: the loop stops once the deadline or
 * the wear-out iteration count is reached, and with stoneWallingWearOut the
 * slower ranks catch up to the per-iteration maximum afterwards.  Errors
 * found by the check modes are accumulated into the file-scope
 * totalErrorCount.
 */
static IOR_offset_t WriteOrRead(IOR_param_t *test, IOR_results_t *results,
                                aiori_fd_t *fd, const int access, IOR_io_buffers *ioBuffers)
{
        int errors = 0;
        IOR_offset_t transferCount = 0;
        uint64_t pairCnt = 0;
        IOR_offset_t *offsetArray;
        int pretendRank;
        IOR_offset_t dataMoved = 0;     /* for data rate calculation */
        double startForStonewall;
        int hitStonewall;
        /* write phases record into results->write, read phases into ->read */
        IOR_point_t *point = ((access == WRITE) || (access == WRITECHECK)) ?
                             &results->write : &results->read;

        /* initialize values */
        pretendRank = (rank + rankOffset) % test->numTasks;

        if (test->randomOffset) {
                offsetArray = GetOffsetArrayRandom(test, pretendRank, access);
        } else {
                offsetArray = GetOffsetArraySequential(test, pretendRank);
        }

        startForStonewall = GetTimeStamp();
        hitStonewall = 0;
        /* loop over offsets to access */
        while ((offsetArray[pairCnt] != -1) && !hitStonewall ) {
                dataMoved += WriteOrReadSingle(pairCnt, offsetArray, pretendRank, & transferCount, & errors, test, fd, ioBuffers, access);
                pairCnt++;

                hitStonewall = ((test->deadlineForStonewalling != 0
                    && (GetTimeStamp() - startForStonewall)
                    > test->deadlineForStonewalling)) || (test->stoneWallingWearOutIterations != 0 && pairCnt == test->stoneWallingWearOutIterations) ;
                if ( test->collective && test->deadlineForStonewalling ) {
                        // if collective-mode, you'll get a HANG, if some rank 'accidentally' leave this loop
                        // it absolutely must be an 'all or none':
                        // BUGFIX: broadcast on testComm, not MPI_COMM_WORLD -- ranks that
                        // do not participate in this test never enter this function (they
                        // wait at a world barrier in TestIoSys), so a world-wide
                        // collective here deadlocks whenever numTasks < world size.
                        MPI_CHECK(MPI_Bcast(&hitStonewall, 1, MPI_INT, 0, testComm), "hitStonewall broadcast failed");
                }
        }
        if (test->stoneWallingWearOut){
                if (verbose >= VERBOSE_1){
                        fprintf(out_logfile, "%d: stonewalling pairs accessed: %lld\n", rank, (long long) pairCnt);
                }
                long long data_moved_ll = (long long) dataMoved;
                long long pairs_accessed_min = 0;
                /* pairs_accessed becomes the per-rank maximum; all ranks will
                 * catch up to it below */
                MPI_CHECK(MPI_Allreduce(& pairCnt, &point->pairs_accessed,
                                        1, MPI_LONG_LONG_INT, MPI_MAX, testComm), "cannot reduce pairs moved");
                double stonewall_runtime = GetTimeStamp() - startForStonewall;
                point->stonewall_time = stonewall_runtime;
                MPI_CHECK(MPI_Reduce(& pairCnt, & pairs_accessed_min,
                                     1, MPI_LONG_LONG_INT, MPI_MIN, 0, testComm), "cannot reduce pairs moved");
                MPI_CHECK(MPI_Reduce(& data_moved_ll, &point->stonewall_min_data_accessed,
                                     1, MPI_LONG_LONG_INT, MPI_MIN, 0, testComm), "cannot reduce pairs moved");
                MPI_CHECK(MPI_Reduce(& data_moved_ll, &point->stonewall_avg_data_accessed,
                                     1, MPI_LONG_LONG_INT, MPI_SUM, 0, testComm), "cannot reduce pairs moved");

                if(rank == 0){
                        fprintf(out_logfile, "stonewalling pairs accessed min: %lld max: %zu -- min data: %.1f GiB mean data: %.1f GiB time: %.1fs\n",
                                pairs_accessed_min, point->pairs_accessed,
                                point->stonewall_min_data_accessed /1024.0 / 1024 / 1024, point->stonewall_avg_data_accessed / 1024.0 / 1024 / 1024 / test->numTasks , point->stonewall_time);
                        point->stonewall_min_data_accessed *= test->numTasks;
                }
                if(pairCnt != point->pairs_accessed){
                        // some work needs still to be done !
                        for(; pairCnt < point->pairs_accessed; pairCnt++ ) {
                                dataMoved += WriteOrReadSingle(pairCnt, offsetArray, pretendRank, & transferCount, & errors, test, fd, ioBuffers, access);
                        }
                }
        }else{
                point->pairs_accessed = pairCnt;
        }

        totalErrorCount += CountErrors(test, access, errors);

        free(offsetArray);

        if (access == WRITE && test->fsync == TRUE) {
                backend->fsync(fd, test->backend_options);      /*fsync after all accesses */
        }
        return (dataMoved);
}