Merge pull request #182 from jschwartz-cray/fix-181

Fix #181.
master
Julian Kunkel, 2019-09-19 16:53:41 +01:00, committed by GitHub
commit 604598ab2f
9 changed files with 164 additions and 76 deletions

NEWS (2 changed lines)
View File

@ -120,7 +120,7 @@ Version 2.10.1
- Corrected IOR_GetFileSize() function to point to HDF5 and NCMPI versions of
IOR_GetFileSize() calls
- Changed the netcdf dataset from 1D array to 4D array, where the 4 dimensions
are: [segmentCount][numTasksWorld][numTransfers][transferSize]
are: [segmentCount][numTasks][numTransfers][transferSize]
This patch from Wei-keng Liao allows for file sizes > 4GB (provided no
single dimension is > 4GB).
- Finalized random-capability release

View File

@ -216,7 +216,7 @@ static IOR_offset_t NCMPI_Xfer(int access, void *fd, IOR_size_t * buffer,
param->blockSize / param->transferSize;
/* reshape 1D array to 3D array:
[segmentCount*numTasksWorld][numTransfers][transferSize]
[segmentCount*numTasks][numTransfers][transferSize]
Requirement: none of these dimensions should be > 4G,
*/
NCMPI_CHECK(ncmpi_def_dim
@ -267,7 +267,7 @@ static IOR_offset_t NCMPI_Xfer(int access, void *fd, IOR_size_t * buffer,
bufSize[1] = 1;
bufSize[2] = param->transferSize;
offset[0] = segmentNum * numTasksWorld + rank;
offset[0] = segmentNum * param->numTasks + rank;
offset[1] = transferNum;
offset[2] = 0;
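
Illustrative sketch (not part of the patch; all concrete values below are assumed, not taken from IOR): a minimal, self-contained C program showing the same start-offset arithmetic used by the reshaped [segmentCount*numTasks][numTransfers][transferSize] array above.

#include <stdio.h>

int main(void)
{
    /* illustrative inputs only */
    long long numTasks     = 4;        /* tasks participating in the test */
    long long rank         = 1;        /* calling task */
    long long segmentNum   = 2;        /* segment being written/read */
    long long transferNum  = 3;        /* transfer within the block */
    long long transferSize = 1 << 20;  /* bytes per transfer */

    /* same indexing as the NCMPI_Xfer() change above */
    long long offset[3], bufSize[3];
    bufSize[0] = 1;
    bufSize[1] = 1;
    bufSize[2] = transferSize;
    offset[0]  = segmentNum * numTasks + rank;
    offset[1]  = transferNum;
    offset[2]  = 0;

    printf("start [%lld][%lld][%lld], count [%lld][%lld][%lld]\n",
           offset[0], offset[1], offset[2], bufSize[0], bufSize[1], bufSize[2]);
    return 0;
}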

View File

@ -340,10 +340,10 @@ void ShowTestStart(IOR_param_t *test)
PrintKeyVal("options", test->options);
PrintKeyValInt("dryRun", test->dryRun);
PrintKeyValInt("nodes", test->nodes);
PrintKeyValInt("nodes", test->numNodes);
PrintKeyValInt("memoryPerTask", (unsigned long) test->memoryPerTask);
PrintKeyValInt("memoryPerNode", (unsigned long) test->memoryPerNode);
PrintKeyValInt("tasksPerNode", tasksPerNode);
PrintKeyValInt("tasksPerNode", test->numTasksOnNode0);
PrintKeyValInt("repetitions", test->repetitions);
PrintKeyValInt("multiFile", test->multiFile);
PrintKeyValInt("interTestDelay", test->interTestDelay);
@ -431,8 +431,9 @@ void ShowSetup(IOR_param_t *params)
PrintKeyValInt("task offset", params->taskPerNodeOffset);
PrintKeyValInt("reorder random seed", params->reorderTasksRandomSeed);
}
PrintKeyValInt("nodes", params->numNodes);
PrintKeyValInt("tasks", params->numTasks);
PrintKeyValInt("clients per node", params->tasksPerNode);
PrintKeyValInt("clients per node", params->numTasksOnNode0);
if (params->memoryPerTask != 0){
PrintKeyVal("memoryPerTask", HumanReadable(params->memoryPerTask, BASE_TWO));
}
@ -572,7 +573,7 @@ static void PrintLongSummaryOneOperation(IOR_test_t *test, const int access)
}
fprintf(out_resultfile, "%5d ", params->id);
fprintf(out_resultfile, "%6d ", params->numTasks);
fprintf(out_resultfile, "%3d ", params->tasksPerNode);
fprintf(out_resultfile, "%3d ", params->numTasksOnNode0);
fprintf(out_resultfile, "%4d ", params->repetitions);
fprintf(out_resultfile, "%3d ", params->filePerProc);
fprintf(out_resultfile, "%5d ", params->reorderTasks);
@ -596,7 +597,7 @@ static void PrintLongSummaryOneOperation(IOR_test_t *test, const int access)
PrintKeyValInt("blockSize", params->blockSize);
PrintKeyValInt("transferSize", params->transferSize);
PrintKeyValInt("numTasks", params->numTasks);
PrintKeyValInt("tasksPerNode", params->tasksPerNode);
PrintKeyValInt("tasksPerNode", params->numTasksOnNode0);
PrintKeyValInt("repetitions", params->repetitions);
PrintKeyValInt("filePerProc", params->filePerProc);
PrintKeyValInt("reorderTasks", params->reorderTasks);

View File

@ -65,7 +65,6 @@ IOR_test_t * ior_run(int argc, char **argv, MPI_Comm world_com, FILE * world_out
out_resultfile = world_out;
mpi_comm_world = world_com;
MPI_CHECK(MPI_Comm_size(mpi_comm_world, &numTasksWorld), "cannot get number of tasks");
MPI_CHECK(MPI_Comm_rank(mpi_comm_world, &rank), "cannot get rank");
/* setup tests, and validate parameters */
@ -113,8 +112,6 @@ int ior_main(int argc, char **argv)
MPI_CHECK(MPI_Init(&argc, &argv), "cannot initialize MPI");
mpi_comm_world = MPI_COMM_WORLD;
MPI_CHECK(MPI_Comm_size(mpi_comm_world, &numTasksWorld),
"cannot get number of tasks");
MPI_CHECK(MPI_Comm_rank(mpi_comm_world, &rank), "cannot get rank");
/* set error-handling */
@ -189,8 +186,14 @@ void init_IOR_Param_t(IOR_param_t * p)
p->writeFile = p->readFile = FALSE;
p->checkWrite = p->checkRead = FALSE;
p->nodes = 1;
p->tasksPerNode = 1;
/*
* These can be overridden from the command-line but otherwise will be
* set from MPI.
*/
p->numTasks = -1;
p->numNodes = -1;
p->numTasksOnNode0 = -1;
p->repetitions = 1;
p->repCounter = -1;
p->open = WRITE;
@ -920,12 +923,17 @@ static void RemoveFile(char *testFileName, int filePerProc, IOR_param_t * test)
*/
static void InitTests(IOR_test_t *tests, MPI_Comm com)
{
int size;
int mpiNumNodes = 0;
int mpiNumTasks = 0;
int mpiNumTasksOnNode0 = 0;
MPI_CHECK(MPI_Comm_size(com, & size), "MPI_Comm_size() error");
/* count the tasks per node */
tasksPerNode = CountTasksPerNode(com);
/*
* These default values are the same for every test and expensive to
* retrieve so just do it once.
*/
mpiNumNodes = GetNumNodes(com);
mpiNumTasks = GetNumTasks(com);
mpiNumTasksOnNode0 = GetNumTasksOnNode0(com);
/*
* Since there is no guarantee that anyone other than
@ -938,12 +946,28 @@ static void InitTests(IOR_test_t *tests, MPI_Comm com)
while (tests != NULL) {
IOR_param_t *params = & tests->params;
params->testComm = com;
params->nodes = params->numTasks / tasksPerNode;
params->tasksPerNode = tasksPerNode;
params->tasksBlockMapping = QueryNodeMapping(com,false);
if (params->numTasks == 0) {
params->numTasks = size;
/* use MPI values if not overridden on command-line */
if (params->numNodes == -1) {
params->numNodes = mpiNumNodes;
}
if (params->numTasks == -1) {
params->numTasks = mpiNumTasks;
} else if (params->numTasks > mpiNumTasks) {
if (rank == 0) {
fprintf(out_logfile,
"WARNING: More tasks requested (%d) than available (%d),",
params->numTasks, mpiNumTasks);
fprintf(out_logfile, " running with %d tasks.\n",
mpiNumTasks);
}
params->numTasks = mpiNumTasks;
}
if (params->numTasksOnNode0 == -1) {
params->numTasksOnNode0 = mpiNumTasksOnNode0;
}
params->tasksBlockMapping = QueryNodeMapping(com,false);
params->expectedAggFileSize =
params->blockSize * params->segmentCount * params->numTasks;
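
Worked example (illustrative numbers, not from the patch): with blockSize = 1 MiB, segmentCount = 16 and numTasks = 64, the expected aggregate file size computed above is 1 MiB x 16 x 64 = 1 GiB. Any of numNodes, numTasks or numTasksOnNode0 still at the -1 sentinel set in init_IOR_Param_t() is filled from MPI here; values supplied on the command line are kept, and numTasks is capped at the MPI task count with the warning shown above.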
@ -1091,7 +1115,7 @@ static void *HogMemory(IOR_param_t *params)
if (verbose >= VERBOSE_3)
fprintf(out_logfile, "This node hogging %ld bytes of memory\n",
params->memoryPerNode);
size = params->memoryPerNode / params->tasksPerNode;
size = params->memoryPerNode / params->numTasksOnNode0;
} else {
return NULL;
}
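
Worked example (illustrative): with memoryPerNode = 2 GiB (e.g. from -M 2g) and 8 tasks on node 0, each task hogs 2 GiB / 8 = 256 MiB.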
@ -1191,16 +1215,6 @@ static void TestIoSys(IOR_test_t *test)
IOR_io_buffers ioBuffers;
/* set up communicator for test */
if (params->numTasks > numTasksWorld) {
if (rank == 0) {
fprintf(out_logfile,
"WARNING: More tasks requested (%d) than available (%d),",
params->numTasks, numTasksWorld);
fprintf(out_logfile, " running on %d tasks.\n",
numTasksWorld);
}
params->numTasks = numTasksWorld;
}
MPI_CHECK(MPI_Comm_group(mpi_comm_world, &orig_group),
"MPI_Comm_group() error");
range[0] = 0; /* first rank */
@ -1227,7 +1241,6 @@ static void TestIoSys(IOR_test_t *test)
"Using reorderTasks '-C' (useful to avoid read cache in client)\n");
fflush(out_logfile);
}
params->tasksPerNode = CountTasksPerNode(testComm);
backend = params->backend;
/* show test setup */
if (rank == 0 && verbose >= VERBOSE_0)
@ -1364,7 +1377,7 @@ static void TestIoSys(IOR_test_t *test)
/* move two nodes away from writing node */
int shift = 1; /* assume a by-node (round-robin) mapping of tasks to nodes */
if (params->tasksBlockMapping) {
shift = params->tasksPerNode; /* switch to by-slot (contiguous block) mapping */
shift = params->numTasksOnNode0; /* switch to by-slot (contiguous block) mapping */
}
rankOffset = (2 * shift) % params->numTasks;
}
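
Worked example (illustrative): with a by-slot (block) mapping, numTasksOnNode0 = 16 and numTasks = 64, shift = 16 and rankOffset = (2 x 16) % 64 = 32, so each reader lands two nodes past the rank that wrote its data.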
@ -1389,7 +1402,7 @@ static void TestIoSys(IOR_test_t *test)
if(params->stoneWallingStatusFile){
params->stoneWallingWearOutIterations = ReadStoneWallingIterations(params->stoneWallingStatusFile);
if(params->stoneWallingWearOutIterations == -1 && rank == 0){
fprintf(out_logfile, "WARNING: Could not read back the stonewalling status from the file!");
fprintf(out_logfile, "WARNING: Could not read back the stonewalling status from the file!\n");
params->stoneWallingWearOutIterations = 0;
}
}
@ -1404,7 +1417,7 @@ static void TestIoSys(IOR_test_t *test)
/* move one node away from writing node */
int shift = 1; /* assume a by-node (round-robin) mapping of tasks to nodes */
if (params->tasksBlockMapping) {
shift=params->tasksPerNode; /* switch to a by-slot (contiguous block) mapping */
shift=params->numTasksOnNode0; /* switch to a by-slot (contiguous block) mapping */
}
rankOffset = (params->taskPerNodeOffset * shift) % params->numTasks;
}
@ -1415,7 +1428,7 @@ static void TestIoSys(IOR_test_t *test)
int nodeoffset;
unsigned int iseed0;
nodeoffset = params->taskPerNodeOffset;
nodeoffset = (nodeoffset < params->nodes) ? nodeoffset : params->nodes - 1;
nodeoffset = (nodeoffset < params->numNodes) ? nodeoffset : params->numNodes - 1;
if (params->reorderTasksRandomSeed < 0)
iseed0 = -1 * params->reorderTasksRandomSeed + rep;
else
@ -1425,7 +1438,7 @@ static void TestIoSys(IOR_test_t *test)
rankOffset = rand() % params->numTasks;
}
while (rankOffset <
(nodeoffset * params->tasksPerNode)) {
(nodeoffset * params->numTasksOnNode0)) {
rankOffset = rand() % params->numTasks;
}
/* Get more detailed stats if requested by verbose level */
@ -1455,7 +1468,7 @@ static void TestIoSys(IOR_test_t *test)
"barrier error");
if (rank == 0 && verbose >= VERBOSE_1) {
fprintf(out_logfile,
"Commencing read performance test: %s",
"Commencing read performance test: %s\n",
CurrentTimeString());
}
timer[2] = GetTimeStamp();

View File

@ -98,8 +98,8 @@ typedef struct
// intermediate options
int dryRun; /* do not perform any I/Os just run evtl. inputs print dummy output */
int numTasks; /* number of tasks for test */
int nodes; /* number of nodes for test */
int tasksPerNode; /* number of tasks per node */
int numNodes; /* number of nodes for test */
int numTasksOnNode0; /* number of tasks on node 0 (usually all the same, but don't have to be, use with caution) */
int tasksBlockMapping; /* are the tasks in contiguous blocks across nodes or round-robin */
int repetitions; /* number of repetitions of test */
int repCounter; /* rep counter */

View File

@ -1870,7 +1870,8 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE *
mdtest_init_args();
int i, j;
int nodeCount;
int numNodes;
int numTasksOnNode0 = 0;
MPI_Group worldgroup, testgroup;
struct {
int first;
@ -1950,8 +1951,8 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE *
pid = getpid();
uid = getuid();
tasksPerNode = CountTasksPerNode(testComm);
nodeCount = size / tasksPerNode;
numNodes = GetNumNodes(testComm);
numTasksOnNode0 = GetNumTasksOnNode0(testComm);
char cmd_buffer[4096];
strncpy(cmd_buffer, argv[0], 4096);
@ -1960,7 +1961,7 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE *
}
VERBOSE(0,-1,"-- started at %s --\n", PrintTimestamp());
VERBOSE(0,-1,"mdtest-%s was launched with %d total task(s) on %d node(s)", RELEASE_VERS, size, nodeCount);
VERBOSE(0,-1,"mdtest-%s was launched with %d total task(s) on %d node(s)", RELEASE_VERS, size, numNodes);
VERBOSE(0,-1,"Command line used: %s", cmd_buffer);
/* adjust special variables */
@ -2128,10 +2129,10 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE *
/* set the shift to mimic IOR and shift by procs per node */
if (nstride > 0) {
if ( nodeCount > 1 && tasksBlockMapping ) {
if ( numNodes > 1 && tasksBlockMapping ) {
/* the user set the stride presumably to get the consumer tasks on a different node than the producer tasks
however, if the mpirun scheduler placed the tasks by-slot (in a contiguous block) then we need to adjust the shift by ppn */
nstride *= tasksPerNode;
nstride *= numTasksOnNode0;
}
VERBOSE(0,5,"Shifting ranks by %d for each phase.", nstride);
}
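
Worked example (illustrative): with a block mapping and numTasksOnNode0 = 8, a user-supplied nstride of 1 becomes an effective shift of 8 ranks, i.e. one whole node.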

View File

@ -151,8 +151,12 @@ void DecodeDirective(char *line, IOR_param_t *params, options_all_t * module_opt
params->maxTimeDuration = atoi(value);
} else if (strcasecmp(option, "outlierthreshold") == 0) {
params->outlierThreshold = atoi(value);
} else if (strcasecmp(option, "nodes") == 0) {
params->nodes = atoi(value);
} else if (strcasecmp(option, "numnodes") == 0) {
params->numNodes = atoi(value);
} else if (strcasecmp(option, "numtasks") == 0) {
params->numTasks = atoi(value);
} else if (strcasecmp(option, "numtasksonnode0") == 0) {
params->numTasksOnNode0 = atoi(value);
} else if (strcasecmp(option, "repetitions") == 0) {
params->repetitions = atoi(value);
} else if (strcasecmp(option, "intertestdelay") == 0) {
@ -286,8 +290,6 @@ void DecodeDirective(char *line, IOR_param_t *params, options_all_t * module_opt
params->beegfs_chunkSize = string_to_bytes(value);
if (!ISPOWEROFTWO(params->beegfs_chunkSize) || params->beegfs_chunkSize < (1<<16))
ERR("beegfsChunkSize must be a power of two and >64k");
} else if (strcasecmp(option, "numtasks") == 0) {
params->numTasks = atoi(value);
} else if (strcasecmp(option, "summaryalways") == 0) {
params->summary_every_test = atoi(value);
} else {
@ -498,7 +500,7 @@ option_help * createGlobalOptions(IOR_param_t * params){
{'m', NULL, "multiFile -- use number of reps (-i) for multiple file count", OPTION_FLAG, 'd', & params->multiFile},
{'M', NULL, "memoryPerNode -- hog memory on the node (e.g.: 2g, 75%)", OPTION_OPTIONAL_ARGUMENT, 's', & params->memoryPerNodeStr},
{'n', NULL, "noFill -- no fill in HDF5 file creation", OPTION_FLAG, 'd', & params->noFill},
{'N', NULL, "numTasks -- number of tasks that should participate in the test", OPTION_OPTIONAL_ARGUMENT, 'd', & params->numTasks},
{'N', NULL, "numTasks -- number of tasks that are participating in the test (overrides MPI)", OPTION_OPTIONAL_ARGUMENT, 'd', & params->numTasks},
{'o', NULL, "testFile -- full name for test", OPTION_OPTIONAL_ARGUMENT, 's', & params->testFileName},
{'O', NULL, "string of IOR directives (e.g. -O checkRead=1,lustreStripeCount=32)", OPTION_OPTIONAL_ARGUMENT, 'p', & decodeDirectiveWrapper},
{'p', NULL, "preallocate -- preallocate file size", OPTION_FLAG, 'd', & params->preallocate},
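
Usage note (hypothetical values): with this change -N explicitly overrides the MPI-derived count, so a launch with 16 MPI tasks and -N 8 runs the test on 8 of them, while -N 32 is capped to 16 with the warning added in InitTests() above.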

View File

@ -53,11 +53,9 @@
extern int errno;
extern int numTasks;
/* globals used by other files, also defined "extern" in ior.h */
int numTasksWorld = 0;
/* globals used by other files, also defined "extern" in utilities.h */
int rank = 0;
int rankOffset = 0;
int tasksPerNode = 0; /* tasks per node */
int verbose = VERBOSE_0; /* verbose output */
MPI_Comm testComm;
MPI_Comm mpi_comm_world;
@ -265,35 +263,108 @@ int QueryNodeMapping(MPI_Comm comm, int print_nodemap) {
return ret;
}
int CountTasksPerNode(MPI_Comm comm) {
    /* modern MPI provides a simple way to get the local process count */
    MPI_Comm shared_comm;
    int count;
    MPI_Comm_split_type (comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shared_comm);
    MPI_Comm_size (shared_comm, &count);
    MPI_Comm_free (&shared_comm);
    return count;
/*
 * There is a more direct way to determine the node count in modern MPI
 * versions so we use that if possible.
 *
 * For older versions we use a method which should still provide accurate
 * results even if the total number of tasks is not evenly divisible by the
 * tasks on node rank 0.
 */
int GetNumNodes(MPI_Comm comm) {
#if MPI_VERSION >= 3
    MPI_Comm shared_comm;
    int shared_rank = 0;
    int local_result = 0;
    int numNodes = 0;
    MPI_CHECK(MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shared_comm),
              "MPI_Comm_split_type() error");
    MPI_CHECK(MPI_Comm_rank(shared_comm, &shared_rank), "MPI_Comm_rank() error");
    local_result = shared_rank == 0? 1 : 0;
    MPI_CHECK(MPI_Allreduce(&local_result, &numNodes, 1, MPI_INT, MPI_SUM, comm),
              "MPI_Allreduce() error");
    MPI_CHECK(MPI_Comm_free(&shared_comm), "MPI_Comm_free() error");
    return numNodes;
#else
    int numTasks = 0;
    int numTasksOnNode0 = 0;
    numTasks = GetNumTasks(comm);
    numTasksOnNode0 = GetNumTasksOnNode0(comm);
    return ((numTasks - 1) / numTasksOnNode0) + 1;
#endif
}
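
/*
 * Illustrative aside, not part of this patch: a quick check of the
 * pre-MPI-3 fallback above, using the 64-task / 20-tasks-on-node-0
 * scenario described in the comment before GetNumTasksOnNode0() below.
 * Plain C, no MPI; the inputs are example values.
 */
#include <stdio.h>

static int fallback_num_nodes(int numTasks, int numTasksOnNode0)
{
    /* same round-up division as the #else branch of GetNumNodes() */
    return ((numTasks - 1) / numTasksOnNode0) + 1;
}

int main(void)
{
    printf("%d\n", fallback_num_nodes(64, 20)); /* 3 full nodes + 1 partial = 4 */
    printf("%d\n", fallback_num_nodes(64, 16)); /* evenly divisible case    = 4 */
    return 0;
}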
int GetNumTasks(MPI_Comm comm) {
int numTasks = 0;
MPI_CHECK(MPI_Comm_size(comm, &numTasks), "cannot get number of tasks");
return numTasks;
}
/*
* It's very important that this method provide the same result to every
* process as it's used for redistributing which jobs read from which files.
* It was renamed accordingly.
*
* If different nodes get different results from this method then jobs get
* redistributed unevenly and you no longer have a 1:1 relationship with some
* nodes reading multiple files while others read none.
*
* In the common case the number of tasks on each node (MPI_Comm_size on an
* MPI_COMM_TYPE_SHARED communicator) will be the same. However, there is
* nothing which guarantees this. It's valid to have, for example, 64 jobs
* across 4 systems which can run 20 jobs each. In that scenario you end up
* with 3 MPI_COMM_TYPE_SHARED groups of 20, and one group of 4.
*
* In the (MPI_VERSION < 3) implementation of this method consistency is
* ensured by asking specifically about the number of tasks on the node with
* rank 0. In the original implementation for (MPI_VERSION >= 3) this was
* broken by using the LOCAL process count which differed depending on which
* node you were on.
*
* This was corrected below by first splitting the comm into groups by node
* (MPI_COMM_TYPE_SHARED) and then having only the node with world rank 0 and
* shared rank 0 return the MPI_Comm_size of its shared subgroup. This yields
* the original consistent behavior no matter which node asks.
*
* In the common case where every node has the same number of tasks this
* method will return the same value it always has.
*/
int GetNumTasksOnNode0(MPI_Comm comm) {
#if MPI_VERSION >= 3
MPI_Comm shared_comm;
int shared_rank = 0;
int tasks_on_node_rank0 = 0;
int local_result = 0;
MPI_CHECK(MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shared_comm),
"MPI_Comm_split_type() error");
MPI_CHECK(MPI_Comm_rank(shared_comm, &shared_rank), "MPI_Comm_rank() error");
if (rank == 0 && shared_rank == 0) {
MPI_CHECK(MPI_Comm_size(shared_comm, &local_result), "MPI_Comm_size() error");
}
MPI_CHECK(MPI_Allreduce(&local_result, &tasks_on_node_rank0, 1, MPI_INT, MPI_SUM, comm),
"MPI_Allreduce() error");
MPI_CHECK(MPI_Comm_free(&shared_comm), "MPI_Comm_free() error");
return tasks_on_node_rank0;
#else
/*
* Count the number of tasks that share a host.
*
* This function employees the gethostname() call, rather than using
* This version employs the gethostname() call, rather than using
* MPI_Get_processor_name(). We are interested in knowing the number
* of tasks that share a file system client (I/O node, compute node,
* whatever that may be). However on machines like BlueGene/Q,
* MPI_Get_processor_name() uniquely identifies a cpu in a compute node,
* not the node where the I/O is function shipped to. gethostname()
* is assumed to identify the shared filesystem client in more situations.
*
* NOTE: This also assumes that the task count on all nodes is equal
* to the task count on the host running MPI task 0.
*/
int CountTasksPerNode(MPI_Comm comm) {
int size;
MPI_Comm_size(comm, & size);
/* for debugging and testing */
@ -336,8 +407,8 @@ int CountTasksPerNode(MPI_Comm comm) {
MPI_Bcast(&count, 1, MPI_INT, 0, comm);
return(count);
}
#endif
}
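
/*
 * Illustrative aside, not part of this patch (no MPI): why the node-0 count
 * is used.  With 64 tasks over nodes that hold 20 tasks each (the scenario
 * from the comment above), the per-node "local" shared-communicator size
 * differs, but GetNumTasksOnNode0() reports node 0's count (20) everywhere.
 */
#include <stdio.h>

int main(void)
{
    int tasks = 64, perNode = 20;   /* example values from the comment above */
    int node, placed = 0;

    for (node = 0; placed < tasks; node++) {
        int local = (tasks - placed < perNode) ? (tasks - placed) : perNode;
        printf("node %d: local shared-comm size = %2d, GetNumTasksOnNode0() = %d\n",
               node, local, perNode);
        placed += local;
    }
    return 0;
}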
/*

View File

@ -18,10 +18,8 @@
#include <mpi.h>
#include "ior.h"
extern int numTasksWorld;
extern int rank;
extern int rankOffset;
extern int tasksPerNode;
extern int verbose;
extern MPI_Comm testComm;
extern MPI_Comm mpi_comm_world;
@ -55,8 +53,10 @@ void SeedRandGen(MPI_Comm);
void SetHints (MPI_Info *, char *);
void ShowHints (MPI_Info *);
char *HumanReadable(IOR_offset_t value, int base);
int CountTasksPerNode(MPI_Comm comm);
int QueryNodeMapping(MPI_Comm comm, int print_nodemap);
int GetNumNodes(MPI_Comm);
int GetNumTasks(MPI_Comm);
int GetNumTasksOnNode0(MPI_Comm);
void DelaySecs(int delay);
void updateParsedOptions(IOR_param_t * options, options_all_t * global_options);
size_t NodeMemoryStringToBytes(char *size_str);