From 0e952f0f8cc56aad2967d61e3d8777131fa2a040 Mon Sep 17 00:00:00 2001 From: Josh Schwartz <52082483+jschwartz-cray@users.noreply.github.com> Date: Fri, 30 Aug 2019 16:45:03 -0600 Subject: [PATCH] Fix #181. On systems where numTasks is not evenly divisible by 'tasksPerNode' we were seeing some nodes reading multiple files while others read none after reordering. Commonly all nodes have the same number of tasks but there is nothing requiring that to be the case. Imagine having 64 tasks running against 4 nodes which can run 20 tasks each. Here you get three groups of 20 and one group of 4. On this system nodes running in the group of 4 were previously getting tasksPerNode of 4 which meant they reordered tasks differently than the nodes which got tasksPerNode of 20. The key to fixing this is ensuring that every node reorders tasks the same way, which means ensuring they all use the same input values. Obviously on systems where the number of tasks per node is inconsistent the reordering will also be inconsistent (some tasks may end up on the same node, or not as far separated as desired, etc.) but at least this way you'll always end up with a 1:1 reordering. 
- Renamed nodes/nodeCount to numNodes - Renamed tasksPerNode to numTasksOnNode0 - Ensured that numTasksOnNode0 will always have the same value regardless of which node you're on - Removed inconsistently used globals numTasksWorld and tasksPerNode and replaced with per-test params equivalents - Added utility functions for setting these values: - numNodes -> GetNumNodes - numTasks -> GetNumTasks - numTasksOnNode0 -> GetNumTasksOnNode0 - Improved MPI_VERSION < 3 logic for GetNumNodes so it works when numTasks is not evenly divisible by numTasksOnNode0 - Left 'nodes' and 'tasksPerNode' in output alone to not break compatibility - Allowed command-line params to override numTasks, numNodes, and numTasksOnNode0 but default to using the MPI-calculated values --- NEWS | 2 +- src/aiori-NCMPI.c | 4 +- src/ior-output.c | 11 +++-- src/ior.c | 79 ++++++++++++++++++-------------- src/ior.h | 4 +- src/mdtest.c | 13 +++--- src/parse_options.c | 12 +++-- src/utilities.c | 109 ++++++++++++++++++++++++++++++++++++-------- src/utilities.h | 6 +-- 9 files changed, 164 insertions(+), 76 deletions(-) diff --git a/NEWS b/NEWS index 1f2cc74..9367112 100644 --- a/NEWS +++ b/NEWS @@ -120,7 +120,7 @@ Version 2.10.1 - Corrected IOR_GetFileSize() function to point to HDF5 and NCMPI versions of IOR_GetFileSize() calls - Changed the netcdf dataset from 1D array to 4D array, where the 4 dimensions - are: [segmentCount][numTasksWorld][numTransfers][transferSize] + are: [segmentCount][numTasks][numTransfers][transferSize] This patch from Wei-keng Liao allows for file sizes > 4GB (provided no single dimension is > 4GB). 
- Finalized random-capability release diff --git a/src/aiori-NCMPI.c b/src/aiori-NCMPI.c index 3607466..5fc1375 100755 --- a/src/aiori-NCMPI.c +++ b/src/aiori-NCMPI.c @@ -216,7 +216,7 @@ static IOR_offset_t NCMPI_Xfer(int access, void *fd, IOR_size_t * buffer, param->blockSize / param->transferSize; /* reshape 1D array to 3D array: - [segmentCount*numTasksWorld][numTransfers][transferSize] + [segmentCount*numTasks][numTransfers][transferSize] Requirement: none of these dimensions should be > 4G, */ NCMPI_CHECK(ncmpi_def_dim @@ -267,7 +267,7 @@ static IOR_offset_t NCMPI_Xfer(int access, void *fd, IOR_size_t * buffer, bufSize[1] = 1; bufSize[2] = param->transferSize; - offset[0] = segmentNum * numTasksWorld + rank; + offset[0] = segmentNum * param->numTasks + rank; offset[1] = transferNum; offset[2] = 0; diff --git a/src/ior-output.c b/src/ior-output.c index 7049a97..26600a3 100644 --- a/src/ior-output.c +++ b/src/ior-output.c @@ -339,10 +339,10 @@ void ShowTestStart(IOR_param_t *test) PrintKeyVal("options", test->options); PrintKeyValInt("dryRun", test->dryRun); - PrintKeyValInt("nodes", test->nodes); + PrintKeyValInt("nodes", test->numNodes); PrintKeyValInt("memoryPerTask", (unsigned long) test->memoryPerTask); PrintKeyValInt("memoryPerNode", (unsigned long) test->memoryPerNode); - PrintKeyValInt("tasksPerNode", tasksPerNode); + PrintKeyValInt("tasksPerNode", test->numTasksOnNode0); PrintKeyValInt("repetitions", test->repetitions); PrintKeyValInt("multiFile", test->multiFile); PrintKeyValInt("interTestDelay", test->interTestDelay); @@ -430,8 +430,9 @@ void ShowSetup(IOR_param_t *params) PrintKeyValInt("task offset", params->taskPerNodeOffset); PrintKeyValInt("reorder random seed", params->reorderTasksRandomSeed); } + PrintKeyValInt("nodes", params->numNodes); PrintKeyValInt("tasks", params->numTasks); - PrintKeyValInt("clients per node", params->tasksPerNode); + PrintKeyValInt("clients per node", params->numTasksOnNode0); if (params->memoryPerTask != 0){ 
PrintKeyVal("memoryPerTask", HumanReadable(params->memoryPerTask, BASE_TWO)); } @@ -571,7 +572,7 @@ static void PrintLongSummaryOneOperation(IOR_test_t *test, const int access) } fprintf(out_resultfile, "%5d ", params->id); fprintf(out_resultfile, "%6d ", params->numTasks); - fprintf(out_resultfile, "%3d ", params->tasksPerNode); + fprintf(out_resultfile, "%3d ", params->numTasksOnNode0); fprintf(out_resultfile, "%4d ", params->repetitions); fprintf(out_resultfile, "%3d ", params->filePerProc); fprintf(out_resultfile, "%5d ", params->reorderTasks); @@ -595,7 +596,7 @@ static void PrintLongSummaryOneOperation(IOR_test_t *test, const int access) PrintKeyValInt("blockSize", params->blockSize); PrintKeyValInt("transferSize", params->transferSize); PrintKeyValInt("numTasks", params->numTasks); - PrintKeyValInt("tasksPerNode", params->tasksPerNode); + PrintKeyValInt("tasksPerNode", params->numTasksOnNode0); PrintKeyValInt("repetitions", params->repetitions); PrintKeyValInt("filePerProc", params->filePerProc); PrintKeyValInt("reorderTasks", params->reorderTasks); diff --git a/src/ior.c b/src/ior.c index 4cd5571..b694c51 100755 --- a/src/ior.c +++ b/src/ior.c @@ -65,7 +65,6 @@ IOR_test_t * ior_run(int argc, char **argv, MPI_Comm world_com, FILE * world_out out_resultfile = world_out; mpi_comm_world = world_com; - MPI_CHECK(MPI_Comm_size(mpi_comm_world, &numTasksWorld), "cannot get number of tasks"); MPI_CHECK(MPI_Comm_rank(mpi_comm_world, &rank), "cannot get rank"); /* setup tests, and validate parameters */ @@ -113,8 +112,6 @@ int ior_main(int argc, char **argv) MPI_CHECK(MPI_Init(&argc, &argv), "cannot initialize MPI"); mpi_comm_world = MPI_COMM_WORLD; - MPI_CHECK(MPI_Comm_size(mpi_comm_world, &numTasksWorld), - "cannot get number of tasks"); MPI_CHECK(MPI_Comm_rank(mpi_comm_world, &rank), "cannot get rank"); /* set error-handling */ @@ -188,8 +185,14 @@ void init_IOR_Param_t(IOR_param_t * p) p->writeFile = p->readFile = FALSE; p->checkWrite = p->checkRead = FALSE; - 
p->nodes = 1; - p->tasksPerNode = 1; + /* + * These can be overridden from the command-line but otherwise will be + * set from MPI. + */ + p->numTasks = -1; + p->numNodes = -1; + p->numTasksOnNode0 = -1; + p->repetitions = 1; p->repCounter = -1; p->open = WRITE; @@ -919,12 +922,17 @@ static void RemoveFile(char *testFileName, int filePerProc, IOR_param_t * test) */ static void InitTests(IOR_test_t *tests, MPI_Comm com) { - int size; + int mpiNumNodes = 0; + int mpiNumTasks = 0; + int mpiNumTasksOnNode0 = 0; - MPI_CHECK(MPI_Comm_size(com, & size), "MPI_Comm_size() error"); - - /* count the tasks per node */ - tasksPerNode = CountTasksPerNode(com); + /* + * These default values are the same for every test and expensive to + * retrieve so just do it once. + */ + mpiNumNodes = GetNumNodes(com); + mpiNumTasks = GetNumTasks(com); + mpiNumTasksOnNode0 = GetNumTasksOnNode0(com); /* * Since there is no guarantee that anyone other than @@ -937,12 +945,28 @@ static void InitTests(IOR_test_t *tests, MPI_Comm com) while (tests != NULL) { IOR_param_t *params = & tests->params; params->testComm = com; - params->nodes = params->numTasks / tasksPerNode; - params->tasksPerNode = tasksPerNode; - params->tasksBlockMapping = QueryNodeMapping(com,false); - if (params->numTasks == 0) { - params->numTasks = size; + + /* use MPI values if not overridden on command-line */ + if (params->numNodes == -1) { + params->numNodes = mpiNumNodes; } + if (params->numTasks == -1) { + params->numTasks = mpiNumTasks; + } else if (params->numTasks > mpiNumTasks) { + if (rank == 0) { + fprintf(out_logfile, + "WARNING: More tasks requested (%d) than available (%d),", + params->numTasks, mpiNumTasks); + fprintf(out_logfile, " running with %d tasks.\n", + mpiNumTasks); + } + params->numTasks = mpiNumTasks; + } + if (params->numTasksOnNode0 == -1) { + params->numTasksOnNode0 = mpiNumTasksOnNode0; + } + + params->tasksBlockMapping = QueryNodeMapping(com,false); params->expectedAggFileSize = params->blockSize * 
params->segmentCount * params->numTasks; @@ -1090,7 +1114,7 @@ static void *HogMemory(IOR_param_t *params) if (verbose >= VERBOSE_3) fprintf(out_logfile, "This node hogging %ld bytes of memory\n", params->memoryPerNode); - size = params->memoryPerNode / params->tasksPerNode; + size = params->memoryPerNode / params->numTasksOnNode0; } else { return NULL; } @@ -1190,16 +1214,6 @@ static void TestIoSys(IOR_test_t *test) IOR_io_buffers ioBuffers; /* set up communicator for test */ - if (params->numTasks > numTasksWorld) { - if (rank == 0) { - fprintf(out_logfile, - "WARNING: More tasks requested (%d) than available (%d),", - params->numTasks, numTasksWorld); - fprintf(out_logfile, " running on %d tasks.\n", - numTasksWorld); - } - params->numTasks = numTasksWorld; - } MPI_CHECK(MPI_Comm_group(mpi_comm_world, &orig_group), "MPI_Comm_group() error"); range[0] = 0; /* first rank */ @@ -1226,7 +1240,6 @@ static void TestIoSys(IOR_test_t *test) "Using reorderTasks '-C' (useful to avoid read cache in client)\n"); fflush(out_logfile); } - params->tasksPerNode = CountTasksPerNode(testComm); backend = params->backend; /* show test setup */ if (rank == 0 && verbose >= VERBOSE_0) @@ -1363,7 +1376,7 @@ static void TestIoSys(IOR_test_t *test) /* move two nodes away from writing node */ int shift = 1; /* assume a by-node (round-robin) mapping of tasks to nodes */ if (params->tasksBlockMapping) { - shift = params->tasksPerNode; /* switch to by-slot (contiguous block) mapping */ + shift = params->numTasksOnNode0; /* switch to by-slot (contiguous block) mapping */ } rankOffset = (2 * shift) % params->numTasks; } @@ -1388,7 +1401,7 @@ static void TestIoSys(IOR_test_t *test) if(params->stoneWallingStatusFile){ params->stoneWallingWearOutIterations = ReadStoneWallingIterations(params->stoneWallingStatusFile); if(params->stoneWallingWearOutIterations == -1 && rank == 0){ - fprintf(out_logfile, "WARNING: Could not read back the stonewalling status from the file!"); + fprintf(out_logfile, 
"WARNING: Could not read back the stonewalling status from the file!\n"); params->stoneWallingWearOutIterations = 0; } } @@ -1403,7 +1416,7 @@ static void TestIoSys(IOR_test_t *test) /* move one node away from writing node */ int shift = 1; /* assume a by-node (round-robin) mapping of tasks to nodes */ if (params->tasksBlockMapping) { - shift=params->tasksPerNode; /* switch to a by-slot (contiguous block) mapping */ + shift=params->numTasksOnNode0; /* switch to a by-slot (contiguous block) mapping */ } rankOffset = (params->taskPerNodeOffset * shift) % params->numTasks; } @@ -1414,7 +1427,7 @@ static void TestIoSys(IOR_test_t *test) int nodeoffset; unsigned int iseed0; nodeoffset = params->taskPerNodeOffset; - nodeoffset = (nodeoffset < params->nodes) ? nodeoffset : params->nodes - 1; + nodeoffset = (nodeoffset < params->numNodes) ? nodeoffset : params->numNodes - 1; if (params->reorderTasksRandomSeed < 0) iseed0 = -1 * params->reorderTasksRandomSeed + rep; else @@ -1424,7 +1437,7 @@ static void TestIoSys(IOR_test_t *test) rankOffset = rand() % params->numTasks; } while (rankOffset < - (nodeoffset * params->tasksPerNode)) { + (nodeoffset * params->numTasksOnNode0)) { rankOffset = rand() % params->numTasks; } /* Get more detailed stats if requested by verbose level */ @@ -1454,7 +1467,7 @@ static void TestIoSys(IOR_test_t *test) "barrier error"); if (rank == 0 && verbose >= VERBOSE_1) { fprintf(out_logfile, - "Commencing read performance test: %s", + "Commencing read performance test: %s\n", CurrentTimeString()); } timer[2] = GetTimeStamp(); diff --git a/src/ior.h b/src/ior.h index e245b08..ccf47fa 100755 --- a/src/ior.h +++ b/src/ior.h @@ -98,8 +98,8 @@ typedef struct // intermediate options int dryRun; /* do not perform any I/Os just run evtl. 
inputs print dummy output */ int numTasks; /* number of tasks for test */ - int nodes; /* number of nodes for test */ - int tasksPerNode; /* number of tasks per node */ + int numNodes; /* number of nodes for test */ + int numTasksOnNode0; /* number of tasks on node 0 (usually all the same, but don't have to be, use with caution) */ int tasksBlockMapping; /* are the tasks in contiguous blocks across nodes or round-robin */ int repetitions; /* number of repetitions of test */ int repCounter; /* rep counter */ diff --git a/src/mdtest.c b/src/mdtest.c index 77b2759..39edf8c 100644 --- a/src/mdtest.c +++ b/src/mdtest.c @@ -1867,7 +1867,8 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE * mdtest_init_args(); int i, j; - int nodeCount; + int numNodes; + int numTasksOnNode0 = 0; MPI_Group worldgroup, testgroup; struct { int first; @@ -1943,8 +1944,8 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE * pid = getpid(); uid = getuid(); - tasksPerNode = CountTasksPerNode(testComm); - nodeCount = size / tasksPerNode; + numNodes = GetNumNodes(testComm); + numTasksOnNode0 = GetNumTasksOnNode0(testComm); char cmd_buffer[4096]; strncpy(cmd_buffer, argv[0], 4096); @@ -1953,7 +1954,7 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE * } VERBOSE(0,-1,"-- started at %s --\n", PrintTimestamp()); - VERBOSE(0,-1,"mdtest-%s was launched with %d total task(s) on %d node(s)", RELEASE_VERS, size, nodeCount); + VERBOSE(0,-1,"mdtest-%s was launched with %d total task(s) on %d node(s)", RELEASE_VERS, size, numNodes); VERBOSE(0,-1,"Command line used: %s", cmd_buffer); /* adjust special variables */ @@ -2120,10 +2121,10 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE * /* set the shift to mimic IOR and shift by procs per node */ if (nstride > 0) { - if ( nodeCount > 1 && tasksBlockMapping ) { + if ( numNodes > 1 && tasksBlockMapping ) { /* the user set the stride presumably 
to get the consumer tasks on a different node than the producer tasks however, if the mpirun scheduler placed the tasks by-slot (in a contiguous block) then we need to adjust the shift by ppn */ - nstride *= tasksPerNode; + nstride *= numTasksOnNode0; } VERBOSE(0,5,"Shifting ranks by %d for each phase.", nstride); } diff --git a/src/parse_options.c b/src/parse_options.c index af30c36..ab9509d 100755 --- a/src/parse_options.c +++ b/src/parse_options.c @@ -151,8 +151,12 @@ void DecodeDirective(char *line, IOR_param_t *params, options_all_t * module_opt params->maxTimeDuration = atoi(value); } else if (strcasecmp(option, "outlierthreshold") == 0) { params->outlierThreshold = atoi(value); - } else if (strcasecmp(option, "nodes") == 0) { - params->nodes = atoi(value); + } else if (strcasecmp(option, "numnodes") == 0) { + params->numNodes = atoi(value); + } else if (strcasecmp(option, "numtasks") == 0) { + params->numTasks = atoi(value); + } else if (strcasecmp(option, "numtasksonnode0") == 0) { + params->numTasksOnNode0 = atoi(value); } else if (strcasecmp(option, "repetitions") == 0) { params->repetitions = atoi(value); } else if (strcasecmp(option, "intertestdelay") == 0) { @@ -286,8 +290,6 @@ void DecodeDirective(char *line, IOR_param_t *params, options_all_t * module_opt params->beegfs_chunkSize = string_to_bytes(value); if (!ISPOWEROFTWO(params->beegfs_chunkSize) || params->beegfs_chunkSize < (1<<16)) ERR("beegfsChunkSize must be a power of two and >64k"); - } else if (strcasecmp(option, "numtasks") == 0) { - params->numTasks = atoi(value); } else if (strcasecmp(option, "summaryalways") == 0) { params->summary_every_test = atoi(value); } else { @@ -498,7 +500,7 @@ option_help * createGlobalOptions(IOR_param_t * params){ {'m', NULL, "multiFile -- use number of reps (-i) for multiple file count", OPTION_FLAG, 'd', & params->multiFile}, {'M', NULL, "memoryPerNode -- hog memory on the node (e.g.: 2g, 75%)", OPTION_OPTIONAL_ARGUMENT, 's', & params->memoryPerNodeStr}, 
{'n', NULL, "noFill -- no fill in HDF5 file creation", OPTION_FLAG, 'd', & params->noFill}, - {'N', NULL, "numTasks -- number of tasks that should participate in the test", OPTION_OPTIONAL_ARGUMENT, 'd', & params->numTasks}, + {'N', NULL, "numTasks -- number of tasks that are participating in the test (overrides MPI)", OPTION_OPTIONAL_ARGUMENT, 'd', & params->numTasks}, {'o', NULL, "testFile -- full name for test", OPTION_OPTIONAL_ARGUMENT, 's', & params->testFileName}, {'O', NULL, "string of IOR directives (e.g. -O checkRead=1,lustreStripeCount=32)", OPTION_OPTIONAL_ARGUMENT, 'p', & decodeDirectiveWrapper}, {'p', NULL, "preallocate -- preallocate file size", OPTION_FLAG, 'd', & params->preallocate}, diff --git a/src/utilities.c b/src/utilities.c index cdb090e..f7c073d 100755 --- a/src/utilities.c +++ b/src/utilities.c @@ -53,11 +53,9 @@ extern int errno; extern int numTasks; -/* globals used by other files, also defined "extern" in ior.h */ -int numTasksWorld = 0; +/* globals used by other files, also defined "extern" in utilities.h */ int rank = 0; int rankOffset = 0; -int tasksPerNode = 0; /* tasks per node */ int verbose = VERBOSE_0; /* verbose output */ MPI_Comm testComm; MPI_Comm mpi_comm_world; @@ -265,35 +263,108 @@ int QueryNodeMapping(MPI_Comm comm, int print_nodemap) { return ret; } +/* + * There is a more direct way to determine the node count in modern MPI + * versions so we use that if possible. + * + * For older versions we use a method which should still provide accurate + * results even if the total number of tasks is not evenly divisible by the + * tasks on node rank 0. 
+ */ +int GetNumNodes(MPI_Comm comm) { #if MPI_VERSION >= 3 -int CountTasksPerNode(MPI_Comm comm) { - /* modern MPI provides a simple way to get the local process count */ - MPI_Comm shared_comm; - int count; + MPI_Comm shared_comm; + int shared_rank = 0; + int local_result = 0; + int numNodes = 0; + MPI_CHECK(MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shared_comm), + "MPI_Comm_split_type() error"); + MPI_CHECK(MPI_Comm_rank(shared_comm, &shared_rank), "MPI_Comm_rank() error"); + local_result = shared_rank == 0? 1 : 0; + MPI_CHECK(MPI_Allreduce(&local_result, &numNodes, 1, MPI_INT, MPI_SUM, comm), + "MPI_Allreduce() error"); + MPI_CHECK(MPI_Comm_free(&shared_comm), "MPI_Comm_free() error"); - MPI_Comm_split_type (comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shared_comm); - MPI_Comm_size (shared_comm, &count); - MPI_Comm_free (&shared_comm); + return numNodes; +#else + int numTasks = 0; + int numTasksOnNode0 = 0; - return count; + numTasks = GetNumTasks(comm); + numTasksOnNode0 = GetNumTasksOnNode0(comm); + + return ((numTasks - 1) / numTasksOnNode0) + 1; +#endif } + + +int GetNumTasks(MPI_Comm comm) { + int numTasks = 0; + + MPI_CHECK(MPI_Comm_size(comm, &numTasks), "cannot get number of tasks"); + + return numTasks; +} + + +/* + * It's very important that this method provide the same result to every + * process as it's used for redistributing which jobs read from which files. + * It was renamed accordingly. + * + * If different nodes get different results from this method then jobs get + * redistributed unevenly and you no longer have a 1:1 relationship with some + * nodes reading multiple files while others read none. + * + * In the common case the number of tasks on each node (MPI_Comm_size on an + * MPI_COMM_TYPE_SHARED communicator) will be the same. However, there is + * nothing which guarantees this. It's valid to have, for example, 64 jobs + * across 4 systems which can run 20 jobs each. 
In that scenario you end up + * with 3 MPI_COMM_TYPE_SHARED groups of 20, and one group of 4. + * + * In the (MPI_VERSION < 3) implementation of this method consistency is + * ensured by asking specifically about the number of tasks on the node with + * rank 0. In the original implementation for (MPI_VERSION >= 3) this was + * broken by using the LOCAL process count which differed depending on which + * node you were on. + * + * This was corrected below by first splitting the comm into groups by node + * (MPI_COMM_TYPE_SHARED) and then having only the node with world rank 0 and + * shared rank 0 return the MPI_Comm_size of its shared subgroup. This yields + * the original consistent behavior no matter which node asks. + * + * In the common case where every node has the same number of tasks this + * method will return the same value it always has. + */ +int GetNumTasksOnNode0(MPI_Comm comm) { +#if MPI_VERSION >= 3 + MPI_Comm shared_comm; + int shared_rank = 0; + int tasks_on_node_rank0 = 0; + int local_result = 0; + + MPI_CHECK(MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shared_comm), + "MPI_Comm_split_type() error"); + MPI_CHECK(MPI_Comm_rank(shared_comm, &shared_rank), "MPI_Comm_rank() error"); + if (rank == 0 && shared_rank == 0) { + MPI_CHECK(MPI_Comm_size(shared_comm, &local_result), "MPI_Comm_size() error"); + } + MPI_CHECK(MPI_Allreduce(&local_result, &tasks_on_node_rank0, 1, MPI_INT, MPI_SUM, comm), + "MPI_Allreduce() error"); + MPI_CHECK(MPI_Comm_free(&shared_comm), "MPI_Comm_free() error"); + + return tasks_on_node_rank0; #else /* - * Count the number of tasks that share a host. - * - * This function employees the gethostname() call, rather than using + * This version employs the gethostname() call, rather than using * MPI_Get_processor_name(). We are interested in knowing the number * of tasks that share a file system client (I/O node, compute node, * whatever that may be). 
However on machines like BlueGene/Q, * MPI_Get_processor_name() uniquely identifies a cpu in a compute node, * not the node where the I/O is function shipped to. gethostname() * is assumed to identify the shared filesystem client in more situations. - * - * NOTE: This also assumes that the task count on all nodes is equal - * to the task count on the host running MPI task 0. */ -int CountTasksPerNode(MPI_Comm comm) { int size; MPI_Comm_size(comm, & size); /* for debugging and testing */ @@ -336,8 +407,8 @@ int CountTasksPerNode(MPI_Comm comm) { MPI_Bcast(&count, 1, MPI_INT, 0, comm); return(count); -} #endif +} /* diff --git a/src/utilities.h b/src/utilities.h index d2c9962..2a9abe3 100755 --- a/src/utilities.h +++ b/src/utilities.h @@ -18,10 +18,8 @@ #include #include "ior.h" -extern int numTasksWorld; extern int rank; extern int rankOffset; -extern int tasksPerNode; extern int verbose; extern MPI_Comm testComm; extern MPI_Comm mpi_comm_world; @@ -55,8 +53,10 @@ void SeedRandGen(MPI_Comm); void SetHints (MPI_Info *, char *); void ShowHints (MPI_Info *); char *HumanReadable(IOR_offset_t value, int base); -int CountTasksPerNode(MPI_Comm comm); int QueryNodeMapping(MPI_Comm comm, int print_nodemap); +int GetNumNodes(MPI_Comm); +int GetNumTasks(MPI_Comm); +int GetNumTasksOnNode0(MPI_Comm); void DelaySecs(int delay); void updateParsedOptions(IOR_param_t * options, options_all_t * global_options); size_t NodeMemoryStringToBytes(char *size_str);