commit
604598ab2f
2
NEWS
2
NEWS
|
@ -120,7 +120,7 @@ Version 2.10.1
|
|||
- Corrected IOR_GetFileSize() function to point to HDF5 and NCMPI versions of
|
||||
IOR_GetFileSize() calls
|
||||
- Changed the netcdf dataset from 1D array to 4D array, where the 4 dimensions
|
||||
are: [segmentCount][numTasksWorld][numTransfers][transferSize]
|
||||
are: [segmentCount][numTasks][numTransfers][transferSize]
|
||||
This patch from Wei-keng Liao allows for file sizes > 4GB (provided no
|
||||
single dimension is > 4GB).
|
||||
- Finalized random-capability release
|
||||
|
|
|
@ -216,7 +216,7 @@ static IOR_offset_t NCMPI_Xfer(int access, void *fd, IOR_size_t * buffer,
|
|||
param->blockSize / param->transferSize;
|
||||
|
||||
/* reshape 1D array to 3D array:
|
||||
[segmentCount*numTasksWorld][numTransfers][transferSize]
|
||||
[segmentCount*numTasks][numTransfers][transferSize]
|
||||
Requirement: none of these dimensions should be > 4G,
|
||||
*/
|
||||
NCMPI_CHECK(ncmpi_def_dim
|
||||
|
@ -267,7 +267,7 @@ static IOR_offset_t NCMPI_Xfer(int access, void *fd, IOR_size_t * buffer,
|
|||
bufSize[1] = 1;
|
||||
bufSize[2] = param->transferSize;
|
||||
|
||||
offset[0] = segmentNum * numTasksWorld + rank;
|
||||
offset[0] = segmentNum * param->numTasks + rank;
|
||||
offset[1] = transferNum;
|
||||
offset[2] = 0;
|
||||
|
||||
|
|
|
@ -340,10 +340,10 @@ void ShowTestStart(IOR_param_t *test)
|
|||
|
||||
PrintKeyVal("options", test->options);
|
||||
PrintKeyValInt("dryRun", test->dryRun);
|
||||
PrintKeyValInt("nodes", test->nodes);
|
||||
PrintKeyValInt("nodes", test->numNodes);
|
||||
PrintKeyValInt("memoryPerTask", (unsigned long) test->memoryPerTask);
|
||||
PrintKeyValInt("memoryPerNode", (unsigned long) test->memoryPerNode);
|
||||
PrintKeyValInt("tasksPerNode", tasksPerNode);
|
||||
PrintKeyValInt("tasksPerNode", test->numTasksOnNode0);
|
||||
PrintKeyValInt("repetitions", test->repetitions);
|
||||
PrintKeyValInt("multiFile", test->multiFile);
|
||||
PrintKeyValInt("interTestDelay", test->interTestDelay);
|
||||
|
@ -431,8 +431,9 @@ void ShowSetup(IOR_param_t *params)
|
|||
PrintKeyValInt("task offset", params->taskPerNodeOffset);
|
||||
PrintKeyValInt("reorder random seed", params->reorderTasksRandomSeed);
|
||||
}
|
||||
PrintKeyValInt("nodes", params->numNodes);
|
||||
PrintKeyValInt("tasks", params->numTasks);
|
||||
PrintKeyValInt("clients per node", params->tasksPerNode);
|
||||
PrintKeyValInt("clients per node", params->numTasksOnNode0);
|
||||
if (params->memoryPerTask != 0){
|
||||
PrintKeyVal("memoryPerTask", HumanReadable(params->memoryPerTask, BASE_TWO));
|
||||
}
|
||||
|
@ -572,7 +573,7 @@ static void PrintLongSummaryOneOperation(IOR_test_t *test, const int access)
|
|||
}
|
||||
fprintf(out_resultfile, "%5d ", params->id);
|
||||
fprintf(out_resultfile, "%6d ", params->numTasks);
|
||||
fprintf(out_resultfile, "%3d ", params->tasksPerNode);
|
||||
fprintf(out_resultfile, "%3d ", params->numTasksOnNode0);
|
||||
fprintf(out_resultfile, "%4d ", params->repetitions);
|
||||
fprintf(out_resultfile, "%3d ", params->filePerProc);
|
||||
fprintf(out_resultfile, "%5d ", params->reorderTasks);
|
||||
|
@ -596,7 +597,7 @@ static void PrintLongSummaryOneOperation(IOR_test_t *test, const int access)
|
|||
PrintKeyValInt("blockSize", params->blockSize);
|
||||
PrintKeyValInt("transferSize", params->transferSize);
|
||||
PrintKeyValInt("numTasks", params->numTasks);
|
||||
PrintKeyValInt("tasksPerNode", params->tasksPerNode);
|
||||
PrintKeyValInt("tasksPerNode", params->numTasksOnNode0);
|
||||
PrintKeyValInt("repetitions", params->repetitions);
|
||||
PrintKeyValInt("filePerProc", params->filePerProc);
|
||||
PrintKeyValInt("reorderTasks", params->reorderTasks);
|
||||
|
|
79
src/ior.c
79
src/ior.c
|
@ -65,7 +65,6 @@ IOR_test_t * ior_run(int argc, char **argv, MPI_Comm world_com, FILE * world_out
|
|||
out_resultfile = world_out;
|
||||
mpi_comm_world = world_com;
|
||||
|
||||
MPI_CHECK(MPI_Comm_size(mpi_comm_world, &numTasksWorld), "cannot get number of tasks");
|
||||
MPI_CHECK(MPI_Comm_rank(mpi_comm_world, &rank), "cannot get rank");
|
||||
|
||||
/* setup tests, and validate parameters */
|
||||
|
@ -113,8 +112,6 @@ int ior_main(int argc, char **argv)
|
|||
MPI_CHECK(MPI_Init(&argc, &argv), "cannot initialize MPI");
|
||||
|
||||
mpi_comm_world = MPI_COMM_WORLD;
|
||||
MPI_CHECK(MPI_Comm_size(mpi_comm_world, &numTasksWorld),
|
||||
"cannot get number of tasks");
|
||||
MPI_CHECK(MPI_Comm_rank(mpi_comm_world, &rank), "cannot get rank");
|
||||
|
||||
/* set error-handling */
|
||||
|
@ -189,8 +186,14 @@ void init_IOR_Param_t(IOR_param_t * p)
|
|||
p->writeFile = p->readFile = FALSE;
|
||||
p->checkWrite = p->checkRead = FALSE;
|
||||
|
||||
p->nodes = 1;
|
||||
p->tasksPerNode = 1;
|
||||
/*
|
||||
* These can be overridden from the command-line but otherwise will be
|
||||
* set from MPI.
|
||||
*/
|
||||
p->numTasks = -1;
|
||||
p->numNodes = -1;
|
||||
p->numTasksOnNode0 = -1;
|
||||
|
||||
p->repetitions = 1;
|
||||
p->repCounter = -1;
|
||||
p->open = WRITE;
|
||||
|
@ -920,12 +923,17 @@ static void RemoveFile(char *testFileName, int filePerProc, IOR_param_t * test)
|
|||
*/
|
||||
static void InitTests(IOR_test_t *tests, MPI_Comm com)
|
||||
{
|
||||
int size;
|
||||
int mpiNumNodes = 0;
|
||||
int mpiNumTasks = 0;
|
||||
int mpiNumTasksOnNode0 = 0;
|
||||
|
||||
MPI_CHECK(MPI_Comm_size(com, & size), "MPI_Comm_size() error");
|
||||
|
||||
/* count the tasks per node */
|
||||
tasksPerNode = CountTasksPerNode(com);
|
||||
/*
|
||||
* These default values are the same for every test and expensive to
|
||||
* retrieve so just do it once.
|
||||
*/
|
||||
mpiNumNodes = GetNumNodes(com);
|
||||
mpiNumTasks = GetNumTasks(com);
|
||||
mpiNumTasksOnNode0 = GetNumTasksOnNode0(com);
|
||||
|
||||
/*
|
||||
* Since there is no guarantee that anyone other than
|
||||
|
@ -938,12 +946,28 @@ static void InitTests(IOR_test_t *tests, MPI_Comm com)
|
|||
while (tests != NULL) {
|
||||
IOR_param_t *params = & tests->params;
|
||||
params->testComm = com;
|
||||
params->nodes = params->numTasks / tasksPerNode;
|
||||
params->tasksPerNode = tasksPerNode;
|
||||
params->tasksBlockMapping = QueryNodeMapping(com,false);
|
||||
if (params->numTasks == 0) {
|
||||
params->numTasks = size;
|
||||
|
||||
/* use MPI values if not overridden on command-line */
|
||||
if (params->numNodes == -1) {
|
||||
params->numNodes = mpiNumNodes;
|
||||
}
|
||||
if (params->numTasks == -1) {
|
||||
params->numTasks = mpiNumTasks;
|
||||
} else if (params->numTasks > mpiNumTasks) {
|
||||
if (rank == 0) {
|
||||
fprintf(out_logfile,
|
||||
"WARNING: More tasks requested (%d) than available (%d),",
|
||||
params->numTasks, mpiNumTasks);
|
||||
fprintf(out_logfile, " running with %d tasks.\n",
|
||||
mpiNumTasks);
|
||||
}
|
||||
params->numTasks = mpiNumTasks;
|
||||
}
|
||||
if (params->numTasksOnNode0 == -1) {
|
||||
params->numTasksOnNode0 = mpiNumTasksOnNode0;
|
||||
}
|
||||
|
||||
params->tasksBlockMapping = QueryNodeMapping(com,false);
|
||||
params->expectedAggFileSize =
|
||||
params->blockSize * params->segmentCount * params->numTasks;
|
||||
|
||||
|
@ -1091,7 +1115,7 @@ static void *HogMemory(IOR_param_t *params)
|
|||
if (verbose >= VERBOSE_3)
|
||||
fprintf(out_logfile, "This node hogging %ld bytes of memory\n",
|
||||
params->memoryPerNode);
|
||||
size = params->memoryPerNode / params->tasksPerNode;
|
||||
size = params->memoryPerNode / params->numTasksOnNode0;
|
||||
} else {
|
||||
return NULL;
|
||||
}
|
||||
|
@ -1191,16 +1215,6 @@ static void TestIoSys(IOR_test_t *test)
|
|||
IOR_io_buffers ioBuffers;
|
||||
|
||||
/* set up communicator for test */
|
||||
if (params->numTasks > numTasksWorld) {
|
||||
if (rank == 0) {
|
||||
fprintf(out_logfile,
|
||||
"WARNING: More tasks requested (%d) than available (%d),",
|
||||
params->numTasks, numTasksWorld);
|
||||
fprintf(out_logfile, " running on %d tasks.\n",
|
||||
numTasksWorld);
|
||||
}
|
||||
params->numTasks = numTasksWorld;
|
||||
}
|
||||
MPI_CHECK(MPI_Comm_group(mpi_comm_world, &orig_group),
|
||||
"MPI_Comm_group() error");
|
||||
range[0] = 0; /* first rank */
|
||||
|
@ -1227,7 +1241,6 @@ static void TestIoSys(IOR_test_t *test)
|
|||
"Using reorderTasks '-C' (useful to avoid read cache in client)\n");
|
||||
fflush(out_logfile);
|
||||
}
|
||||
params->tasksPerNode = CountTasksPerNode(testComm);
|
||||
backend = params->backend;
|
||||
/* show test setup */
|
||||
if (rank == 0 && verbose >= VERBOSE_0)
|
||||
|
@ -1364,7 +1377,7 @@ static void TestIoSys(IOR_test_t *test)
|
|||
/* move two nodes away from writing node */
|
||||
int shift = 1; /* assume a by-node (round-robin) mapping of tasks to nodes */
|
||||
if (params->tasksBlockMapping) {
|
||||
shift = params->tasksPerNode; /* switch to by-slot (contiguous block) mapping */
|
||||
shift = params->numTasksOnNode0; /* switch to by-slot (contiguous block) mapping */
|
||||
}
|
||||
rankOffset = (2 * shift) % params->numTasks;
|
||||
}
|
||||
|
@ -1389,7 +1402,7 @@ static void TestIoSys(IOR_test_t *test)
|
|||
if(params->stoneWallingStatusFile){
|
||||
params->stoneWallingWearOutIterations = ReadStoneWallingIterations(params->stoneWallingStatusFile);
|
||||
if(params->stoneWallingWearOutIterations == -1 && rank == 0){
|
||||
fprintf(out_logfile, "WARNING: Could not read back the stonewalling status from the file!");
|
||||
fprintf(out_logfile, "WARNING: Could not read back the stonewalling status from the file!\n");
|
||||
params->stoneWallingWearOutIterations = 0;
|
||||
}
|
||||
}
|
||||
|
@ -1404,7 +1417,7 @@ static void TestIoSys(IOR_test_t *test)
|
|||
/* move one node away from writing node */
|
||||
int shift = 1; /* assume a by-node (round-robin) mapping of tasks to nodes */
|
||||
if (params->tasksBlockMapping) {
|
||||
shift=params->tasksPerNode; /* switch to a by-slot (contiguous block) mapping */
|
||||
shift=params->numTasksOnNode0; /* switch to a by-slot (contiguous block) mapping */
|
||||
}
|
||||
rankOffset = (params->taskPerNodeOffset * shift) % params->numTasks;
|
||||
}
|
||||
|
@ -1415,7 +1428,7 @@ static void TestIoSys(IOR_test_t *test)
|
|||
int nodeoffset;
|
||||
unsigned int iseed0;
|
||||
nodeoffset = params->taskPerNodeOffset;
|
||||
nodeoffset = (nodeoffset < params->nodes) ? nodeoffset : params->nodes - 1;
|
||||
nodeoffset = (nodeoffset < params->numNodes) ? nodeoffset : params->numNodes - 1;
|
||||
if (params->reorderTasksRandomSeed < 0)
|
||||
iseed0 = -1 * params->reorderTasksRandomSeed + rep;
|
||||
else
|
||||
|
@ -1425,7 +1438,7 @@ static void TestIoSys(IOR_test_t *test)
|
|||
rankOffset = rand() % params->numTasks;
|
||||
}
|
||||
while (rankOffset <
|
||||
(nodeoffset * params->tasksPerNode)) {
|
||||
(nodeoffset * params->numTasksOnNode0)) {
|
||||
rankOffset = rand() % params->numTasks;
|
||||
}
|
||||
/* Get more detailed stats if requested by verbose level */
|
||||
|
@ -1455,7 +1468,7 @@ static void TestIoSys(IOR_test_t *test)
|
|||
"barrier error");
|
||||
if (rank == 0 && verbose >= VERBOSE_1) {
|
||||
fprintf(out_logfile,
|
||||
"Commencing read performance test: %s",
|
||||
"Commencing read performance test: %s\n",
|
||||
CurrentTimeString());
|
||||
}
|
||||
timer[2] = GetTimeStamp();
|
||||
|
|
|
@ -98,8 +98,8 @@ typedef struct
|
|||
// intermediate options
|
||||
int dryRun; /* do not perform any I/Os just run evtl. inputs print dummy output */
|
||||
int numTasks; /* number of tasks for test */
|
||||
int nodes; /* number of nodes for test */
|
||||
int tasksPerNode; /* number of tasks per node */
|
||||
int numNodes; /* number of nodes for test */
|
||||
int numTasksOnNode0; /* number of tasks on node 0 (usually all the same, but don't have to be, use with caution) */
|
||||
int tasksBlockMapping; /* are the tasks in contiguous blocks across nodes or round-robin */
|
||||
int repetitions; /* number of repetitions of test */
|
||||
int repCounter; /* rep counter */
|
||||
|
|
13
src/mdtest.c
13
src/mdtest.c
|
@ -1870,7 +1870,8 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE *
|
|||
|
||||
mdtest_init_args();
|
||||
int i, j;
|
||||
int nodeCount;
|
||||
int numNodes;
|
||||
int numTasksOnNode0 = 0;
|
||||
MPI_Group worldgroup, testgroup;
|
||||
struct {
|
||||
int first;
|
||||
|
@ -1950,8 +1951,8 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE *
|
|||
pid = getpid();
|
||||
uid = getuid();
|
||||
|
||||
tasksPerNode = CountTasksPerNode(testComm);
|
||||
nodeCount = size / tasksPerNode;
|
||||
numNodes = GetNumNodes(testComm);
|
||||
numTasksOnNode0 = GetNumTasksOnNode0(testComm);
|
||||
|
||||
char cmd_buffer[4096];
|
||||
strncpy(cmd_buffer, argv[0], 4096);
|
||||
|
@ -1960,7 +1961,7 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE *
|
|||
}
|
||||
|
||||
VERBOSE(0,-1,"-- started at %s --\n", PrintTimestamp());
|
||||
VERBOSE(0,-1,"mdtest-%s was launched with %d total task(s) on %d node(s)", RELEASE_VERS, size, nodeCount);
|
||||
VERBOSE(0,-1,"mdtest-%s was launched with %d total task(s) on %d node(s)", RELEASE_VERS, size, numNodes);
|
||||
VERBOSE(0,-1,"Command line used: %s", cmd_buffer);
|
||||
|
||||
/* adjust special variables */
|
||||
|
@ -2128,10 +2129,10 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE *
|
|||
|
||||
/* set the shift to mimic IOR and shift by procs per node */
|
||||
if (nstride > 0) {
|
||||
if ( nodeCount > 1 && tasksBlockMapping ) {
|
||||
if ( numNodes > 1 && tasksBlockMapping ) {
|
||||
/* the user set the stride presumably to get the consumer tasks on a different node than the producer tasks
|
||||
however, if the mpirun scheduler placed the tasks by-slot (in a contiguous block) then we need to adjust the shift by ppn */
|
||||
nstride *= tasksPerNode;
|
||||
nstride *= numTasksOnNode0;
|
||||
}
|
||||
VERBOSE(0,5,"Shifting ranks by %d for each phase.", nstride);
|
||||
}
|
||||
|
|
|
@ -151,8 +151,12 @@ void DecodeDirective(char *line, IOR_param_t *params, options_all_t * module_opt
|
|||
params->maxTimeDuration = atoi(value);
|
||||
} else if (strcasecmp(option, "outlierthreshold") == 0) {
|
||||
params->outlierThreshold = atoi(value);
|
||||
} else if (strcasecmp(option, "nodes") == 0) {
|
||||
params->nodes = atoi(value);
|
||||
} else if (strcasecmp(option, "numnodes") == 0) {
|
||||
params->numNodes = atoi(value);
|
||||
} else if (strcasecmp(option, "numtasks") == 0) {
|
||||
params->numTasks = atoi(value);
|
||||
} else if (strcasecmp(option, "numtasksonnode0") == 0) {
|
||||
params->numTasksOnNode0 = atoi(value);
|
||||
} else if (strcasecmp(option, "repetitions") == 0) {
|
||||
params->repetitions = atoi(value);
|
||||
} else if (strcasecmp(option, "intertestdelay") == 0) {
|
||||
|
@ -286,8 +290,6 @@ void DecodeDirective(char *line, IOR_param_t *params, options_all_t * module_opt
|
|||
params->beegfs_chunkSize = string_to_bytes(value);
|
||||
if (!ISPOWEROFTWO(params->beegfs_chunkSize) || params->beegfs_chunkSize < (1<<16))
|
||||
ERR("beegfsChunkSize must be a power of two and >64k");
|
||||
} else if (strcasecmp(option, "numtasks") == 0) {
|
||||
params->numTasks = atoi(value);
|
||||
} else if (strcasecmp(option, "summaryalways") == 0) {
|
||||
params->summary_every_test = atoi(value);
|
||||
} else {
|
||||
|
@ -498,7 +500,7 @@ option_help * createGlobalOptions(IOR_param_t * params){
|
|||
{'m', NULL, "multiFile -- use number of reps (-i) for multiple file count", OPTION_FLAG, 'd', & params->multiFile},
|
||||
{'M', NULL, "memoryPerNode -- hog memory on the node (e.g.: 2g, 75%)", OPTION_OPTIONAL_ARGUMENT, 's', & params->memoryPerNodeStr},
|
||||
{'n', NULL, "noFill -- no fill in HDF5 file creation", OPTION_FLAG, 'd', & params->noFill},
|
||||
{'N', NULL, "numTasks -- number of tasks that should participate in the test", OPTION_OPTIONAL_ARGUMENT, 'd', & params->numTasks},
|
||||
{'N', NULL, "numTasks -- number of tasks that are participating in the test (overrides MPI)", OPTION_OPTIONAL_ARGUMENT, 'd', & params->numTasks},
|
||||
{'o', NULL, "testFile -- full name for test", OPTION_OPTIONAL_ARGUMENT, 's', & params->testFileName},
|
||||
{'O', NULL, "string of IOR directives (e.g. -O checkRead=1,lustreStripeCount=32)", OPTION_OPTIONAL_ARGUMENT, 'p', & decodeDirectiveWrapper},
|
||||
{'p', NULL, "preallocate -- preallocate file size", OPTION_FLAG, 'd', & params->preallocate},
|
||||
|
|
109
src/utilities.c
109
src/utilities.c
|
@ -53,11 +53,9 @@
|
|||
extern int errno;
|
||||
extern int numTasks;
|
||||
|
||||
/* globals used by other files, also defined "extern" in ior.h */
|
||||
int numTasksWorld = 0;
|
||||
/* globals used by other files, also defined "extern" in utilities.h */
|
||||
int rank = 0;
|
||||
int rankOffset = 0;
|
||||
int tasksPerNode = 0; /* tasks per node */
|
||||
int verbose = VERBOSE_0; /* verbose output */
|
||||
MPI_Comm testComm;
|
||||
MPI_Comm mpi_comm_world;
|
||||
|
@ -265,35 +263,108 @@ int QueryNodeMapping(MPI_Comm comm, int print_nodemap) {
|
|||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* There is a more direct way to determine the node count in modern MPI
|
||||
* versions so we use that if possible.
|
||||
*
|
||||
* For older versions we use a method which should still provide accurate
|
||||
* results even if the total number of tasks is not evenly divisible by the
|
||||
* tasks on node rank 0.
|
||||
*/
|
||||
int GetNumNodes(MPI_Comm comm) {
|
||||
#if MPI_VERSION >= 3
|
||||
int CountTasksPerNode(MPI_Comm comm) {
|
||||
/* modern MPI provides a simple way to get the local process count */
|
||||
MPI_Comm shared_comm;
|
||||
int count;
|
||||
MPI_Comm shared_comm;
|
||||
int shared_rank = 0;
|
||||
int local_result = 0;
|
||||
int numNodes = 0;
|
||||
|
||||
MPI_CHECK(MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shared_comm),
|
||||
"MPI_Comm_split_type() error");
|
||||
MPI_CHECK(MPI_Comm_rank(shared_comm, &shared_rank), "MPI_Comm_rank() error");
|
||||
local_result = shared_rank == 0? 1 : 0;
|
||||
MPI_CHECK(MPI_Allreduce(&local_result, &numNodes, 1, MPI_INT, MPI_SUM, comm),
|
||||
"MPI_Allreduce() error");
|
||||
MPI_CHECK(MPI_Comm_free(&shared_comm), "MPI_Comm_free() error");
|
||||
|
||||
MPI_Comm_split_type (comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shared_comm);
|
||||
MPI_Comm_size (shared_comm, &count);
|
||||
MPI_Comm_free (&shared_comm);
|
||||
return numNodes;
|
||||
#else
|
||||
int numTasks = 0;
|
||||
int numTasksOnNode0 = 0;
|
||||
|
||||
return count;
|
||||
numTasks = GetNumTasks(comm);
|
||||
numTasksOnNode0 = GetNumTasksOnNode0(comm);
|
||||
|
||||
return ((numTasks - 1) / numTasksOnNode0) + 1;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
int GetNumTasks(MPI_Comm comm) {
|
||||
int numTasks = 0;
|
||||
|
||||
MPI_CHECK(MPI_Comm_size(comm, &numTasks), "cannot get number of tasks");
|
||||
|
||||
return numTasks;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* It's very important that this method provide the same result to every
|
||||
* process as it's used for redistributing which jobs read from which files.
|
||||
* It was renamed accordingly.
|
||||
*
|
||||
* If different nodes get different results from this method then jobs get
|
||||
* redistributed unevenly and you no longer have a 1:1 relationship with some
|
||||
* nodes reading multiple files while others read none.
|
||||
*
|
||||
* In the common case the number of tasks on each node (MPI_Comm_size on an
|
||||
* MPI_COMM_TYPE_SHARED communicator) will be the same. However, there is
|
||||
* nothing which guarantees this. It's valid to have, for example, 64 jobs
|
||||
* across 4 systems which can run 20 jobs each. In that scenario you end up
|
||||
* with 3 MPI_COMM_TYPE_SHARED groups of 20, and one group of 4.
|
||||
*
|
||||
* In the (MPI_VERSION < 3) implementation of this method consistency is
|
||||
* ensured by asking specifically about the number of tasks on the node with
|
||||
* rank 0. In the original implementation for (MPI_VERSION >= 3) this was
|
||||
* broken by using the LOCAL process count which differed depending on which
|
||||
* node you were on.
|
||||
*
|
||||
* This was corrected below by first splitting the comm into groups by node
|
||||
* (MPI_COMM_TYPE_SHARED) and then having only the node with world rank 0 and
|
||||
* shared rank 0 return the MPI_Comm_size of its shared subgroup. This yields
|
||||
* the original consistent behavior no matter which node asks.
|
||||
*
|
||||
* In the common case where every node has the same number of tasks this
|
||||
* method will return the same value it always has.
|
||||
*/
|
||||
int GetNumTasksOnNode0(MPI_Comm comm) {
|
||||
#if MPI_VERSION >= 3
|
||||
MPI_Comm shared_comm;
|
||||
int shared_rank = 0;
|
||||
int tasks_on_node_rank0 = 0;
|
||||
int local_result = 0;
|
||||
|
||||
MPI_CHECK(MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shared_comm),
|
||||
"MPI_Comm_split_type() error");
|
||||
MPI_CHECK(MPI_Comm_rank(shared_comm, &shared_rank), "MPI_Comm_rank() error");
|
||||
if (rank == 0 && shared_rank == 0) {
|
||||
MPI_CHECK(MPI_Comm_size(shared_comm, &local_result), "MPI_Comm_size() error");
|
||||
}
|
||||
MPI_CHECK(MPI_Allreduce(&local_result, &tasks_on_node_rank0, 1, MPI_INT, MPI_SUM, comm),
|
||||
"MPI_Allreduce() error");
|
||||
MPI_CHECK(MPI_Comm_free(&shared_comm), "MPI_Comm_free() error");
|
||||
|
||||
return tasks_on_node_rank0;
|
||||
#else
|
||||
/*
|
||||
* Count the number of tasks that share a host.
|
||||
*
|
||||
* This function employees the gethostname() call, rather than using
|
||||
* This version employs the gethostname() call, rather than using
|
||||
* MPI_Get_processor_name(). We are interested in knowing the number
|
||||
* of tasks that share a file system client (I/O node, compute node,
|
||||
* whatever that may be). However on machines like BlueGene/Q,
|
||||
* MPI_Get_processor_name() uniquely identifies a cpu in a compute node,
|
||||
* not the node where the I/O is function shipped to. gethostname()
|
||||
* is assumed to identify the shared filesystem client in more situations.
|
||||
*
|
||||
* NOTE: This also assumes that the task count on all nodes is equal
|
||||
* to the task count on the host running MPI task 0.
|
||||
*/
|
||||
int CountTasksPerNode(MPI_Comm comm) {
|
||||
int size;
|
||||
MPI_Comm_size(comm, & size);
|
||||
/* for debugging and testing */
|
||||
|
@ -336,8 +407,8 @@ int CountTasksPerNode(MPI_Comm comm) {
|
|||
MPI_Bcast(&count, 1, MPI_INT, 0, comm);
|
||||
|
||||
return(count);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
|
|
|
@ -18,10 +18,8 @@
|
|||
#include <mpi.h>
|
||||
#include "ior.h"
|
||||
|
||||
extern int numTasksWorld;
|
||||
extern int rank;
|
||||
extern int rankOffset;
|
||||
extern int tasksPerNode;
|
||||
extern int verbose;
|
||||
extern MPI_Comm testComm;
|
||||
extern MPI_Comm mpi_comm_world;
|
||||
|
@ -55,8 +53,10 @@ void SeedRandGen(MPI_Comm);
|
|||
void SetHints (MPI_Info *, char *);
|
||||
void ShowHints (MPI_Info *);
|
||||
char *HumanReadable(IOR_offset_t value, int base);
|
||||
int CountTasksPerNode(MPI_Comm comm);
|
||||
int QueryNodeMapping(MPI_Comm comm, int print_nodemap);
|
||||
int GetNumNodes(MPI_Comm);
|
||||
int GetNumTasks(MPI_Comm);
|
||||
int GetNumTasksOnNode0(MPI_Comm);
|
||||
void DelaySecs(int delay);
|
||||
void updateParsedOptions(IOR_param_t * options, options_all_t * global_options);
|
||||
size_t NodeMemoryStringToBytes(char *size_str);
|
||||
|
|
Loading…
Reference in New Issue