diff --git a/NEWS b/NEWS
index 1f2cc74..9367112 100644
--- a/NEWS
+++ b/NEWS
@@ -120,7 +120,7 @@ Version 2.10.1
 - Corrected IOR_GetFileSize() function to point to HDF5 and NCMPI
   versions of IOR_GetFileSize() calls
 - Changed the netcdf dataset from 1D array to 4D array, where the 4 dimensions
-  are: [segmentCount][numTasksWorld][numTransfers][transferSize]
+  are: [segmentCount][numTasks][numTransfers][transferSize]
   This patch from Wei-keng Liao allows for file sizes > 4GB (provided no
   single dimension is > 4GB).
 - Finalized random-capability release
diff --git a/src/aiori-NCMPI.c b/src/aiori-NCMPI.c
index 3607466..5fc1375 100755
--- a/src/aiori-NCMPI.c
+++ b/src/aiori-NCMPI.c
@@ -216,7 +216,7 @@ static IOR_offset_t NCMPI_Xfer(int access, void *fd, IOR_size_t * buffer,
             param->blockSize / param->transferSize;

         /* reshape 1D array to 3D array:
-           [segmentCount*numTasksWorld][numTransfers][transferSize]
+           [segmentCount*numTasks][numTransfers][transferSize]
            Requirement: none of these dimensions should be > 4G,
          */
         NCMPI_CHECK(ncmpi_def_dim
@@ -267,7 +267,7 @@ static IOR_offset_t NCMPI_Xfer(int access, void *fd, IOR_size_t * buffer,
             bufSize[1] = 1;
             bufSize[2] = param->transferSize;

-            offset[0] = segmentNum * numTasksWorld + rank;
+            offset[0] = segmentNum * param->numTasks + rank;
             offset[1] = transferNum;
             offset[2] = 0;

diff --git a/src/ior-output.c b/src/ior-output.c
index 560d995..8afa9fa 100644
--- a/src/ior-output.c
+++ b/src/ior-output.c
@@ -340,10 +340,10 @@ void ShowTestStart(IOR_param_t *test)
   PrintKeyVal("options", test->options);
   PrintKeyValInt("dryRun", test->dryRun);
-  PrintKeyValInt("nodes", test->nodes);
+  PrintKeyValInt("nodes", test->numNodes);
   PrintKeyValInt("memoryPerTask", (unsigned long) test->memoryPerTask);
   PrintKeyValInt("memoryPerNode", (unsigned long) test->memoryPerNode);
-  PrintKeyValInt("tasksPerNode", tasksPerNode);
+  PrintKeyValInt("tasksPerNode", test->numTasksOnNode0);
   PrintKeyValInt("repetitions", test->repetitions);
   PrintKeyValInt("multiFile", test->multiFile);
   PrintKeyValInt("interTestDelay", test->interTestDelay);
@@ -431,8 +431,9 @@ void ShowSetup(IOR_param_t *params)
     PrintKeyValInt("task offset", params->taskPerNodeOffset);
     PrintKeyValInt("reorder random seed", params->reorderTasksRandomSeed);
   }
+  PrintKeyValInt("nodes", params->numNodes);
   PrintKeyValInt("tasks", params->numTasks);
-  PrintKeyValInt("clients per node", params->tasksPerNode);
+  PrintKeyValInt("clients per node", params->numTasksOnNode0);
   if (params->memoryPerTask != 0){
     PrintKeyVal("memoryPerTask", HumanReadable(params->memoryPerTask, BASE_TWO));
   }
@@ -572,7 +573,7 @@ static void PrintLongSummaryOneOperation(IOR_test_t *test, const int access)
         }
         fprintf(out_resultfile, "%5d ", params->id);
         fprintf(out_resultfile, "%6d ", params->numTasks);
-        fprintf(out_resultfile, "%3d ", params->tasksPerNode);
+        fprintf(out_resultfile, "%3d ", params->numTasksOnNode0);
         fprintf(out_resultfile, "%4d ", params->repetitions);
         fprintf(out_resultfile, "%3d ", params->filePerProc);
         fprintf(out_resultfile, "%5d ", params->reorderTasks);
@@ -596,7 +597,7 @@ static void PrintLongSummaryOneOperation(IOR_test_t *test, const int access)
         PrintKeyValInt("blockSize", params->blockSize);
         PrintKeyValInt("transferSize", params->transferSize);
         PrintKeyValInt("numTasks", params->numTasks);
-        PrintKeyValInt("tasksPerNode", params->tasksPerNode);
+        PrintKeyValInt("tasksPerNode", params->numTasksOnNode0);
         PrintKeyValInt("repetitions", params->repetitions);
         PrintKeyValInt("filePerProc", params->filePerProc);
PrintKeyValInt("reorderTasks", params->reorderTasks); diff --git a/src/ior.c b/src/ior.c index 2d08234..e110ebc 100755 --- a/src/ior.c +++ b/src/ior.c @@ -65,7 +65,6 @@ IOR_test_t * ior_run(int argc, char **argv, MPI_Comm world_com, FILE * world_out out_resultfile = world_out; mpi_comm_world = world_com; - MPI_CHECK(MPI_Comm_size(mpi_comm_world, &numTasksWorld), "cannot get number of tasks"); MPI_CHECK(MPI_Comm_rank(mpi_comm_world, &rank), "cannot get rank"); /* setup tests, and validate parameters */ @@ -113,8 +112,6 @@ int ior_main(int argc, char **argv) MPI_CHECK(MPI_Init(&argc, &argv), "cannot initialize MPI"); mpi_comm_world = MPI_COMM_WORLD; - MPI_CHECK(MPI_Comm_size(mpi_comm_world, &numTasksWorld), - "cannot get number of tasks"); MPI_CHECK(MPI_Comm_rank(mpi_comm_world, &rank), "cannot get rank"); /* set error-handling */ @@ -189,8 +186,14 @@ void init_IOR_Param_t(IOR_param_t * p) p->writeFile = p->readFile = FALSE; p->checkWrite = p->checkRead = FALSE; - p->nodes = 1; - p->tasksPerNode = 1; + /* + * These can be overridden from the command-line but otherwise will be + * set from MPI. + */ + p->numTasks = -1; + p->numNodes = -1; + p->numTasksOnNode0 = -1; + p->repetitions = 1; p->repCounter = -1; p->open = WRITE; @@ -920,12 +923,17 @@ static void RemoveFile(char *testFileName, int filePerProc, IOR_param_t * test) */ static void InitTests(IOR_test_t *tests, MPI_Comm com) { - int size; + int mpiNumNodes = 0; + int mpiNumTasks = 0; + int mpiNumTasksOnNode0 = 0; - MPI_CHECK(MPI_Comm_size(com, & size), "MPI_Comm_size() error"); - - /* count the tasks per node */ - tasksPerNode = CountTasksPerNode(com); + /* + * These default values are the same for every test and expensive to + * retrieve so just do it once. + */ + mpiNumNodes = GetNumNodes(com); + mpiNumTasks = GetNumTasks(com); + mpiNumTasksOnNode0 = GetNumTasksOnNode0(com); /* * Since there is no guarantee that anyone other than @@ -938,12 +946,28 @@ static void InitTests(IOR_test_t *tests, MPI_Comm com) while (tests != NULL) { IOR_param_t *params = & tests->params; params->testComm = com; - params->nodes = params->numTasks / tasksPerNode; - params->tasksPerNode = tasksPerNode; - params->tasksBlockMapping = QueryNodeMapping(com,false); - if (params->numTasks == 0) { - params->numTasks = size; + + /* use MPI values if not overridden on command-line */ + if (params->numNodes == -1) { + params->numNodes = mpiNumNodes; } + if (params->numTasks == -1) { + params->numTasks = mpiNumTasks; + } else if (params->numTasks > mpiNumTasks) { + if (rank == 0) { + fprintf(out_logfile, + "WARNING: More tasks requested (%d) than available (%d),", + params->numTasks, mpiNumTasks); + fprintf(out_logfile, " running with %d tasks.\n", + mpiNumTasks); + } + params->numTasks = mpiNumTasks; + } + if (params->numTasksOnNode0 == -1) { + params->numTasksOnNode0 = mpiNumTasksOnNode0; + } + + params->tasksBlockMapping = QueryNodeMapping(com,false); params->expectedAggFileSize = params->blockSize * params->segmentCount * params->numTasks; @@ -1091,7 +1115,7 @@ static void *HogMemory(IOR_param_t *params) if (verbose >= VERBOSE_3) fprintf(out_logfile, "This node hogging %ld bytes of memory\n", params->memoryPerNode); - size = params->memoryPerNode / params->tasksPerNode; + size = params->memoryPerNode / params->numTasksOnNode0; } else { return NULL; } @@ -1191,16 +1215,6 @@ static void TestIoSys(IOR_test_t *test) IOR_io_buffers ioBuffers; /* set up communicator for test */ - if (params->numTasks > numTasksWorld) { - if (rank == 0) { - fprintf(out_logfile, - "WARNING: 
More tasks requested (%d) than available (%d),", - params->numTasks, numTasksWorld); - fprintf(out_logfile, " running on %d tasks.\n", - numTasksWorld); - } - params->numTasks = numTasksWorld; - } MPI_CHECK(MPI_Comm_group(mpi_comm_world, &orig_group), "MPI_Comm_group() error"); range[0] = 0; /* first rank */ @@ -1227,7 +1241,6 @@ static void TestIoSys(IOR_test_t *test) "Using reorderTasks '-C' (useful to avoid read cache in client)\n"); fflush(out_logfile); } - params->tasksPerNode = CountTasksPerNode(testComm); backend = params->backend; /* show test setup */ if (rank == 0 && verbose >= VERBOSE_0) @@ -1364,7 +1377,7 @@ static void TestIoSys(IOR_test_t *test) /* move two nodes away from writing node */ int shift = 1; /* assume a by-node (round-robin) mapping of tasks to nodes */ if (params->tasksBlockMapping) { - shift = params->tasksPerNode; /* switch to by-slot (contiguous block) mapping */ + shift = params->numTasksOnNode0; /* switch to by-slot (contiguous block) mapping */ } rankOffset = (2 * shift) % params->numTasks; } @@ -1389,7 +1402,7 @@ static void TestIoSys(IOR_test_t *test) if(params->stoneWallingStatusFile){ params->stoneWallingWearOutIterations = ReadStoneWallingIterations(params->stoneWallingStatusFile); if(params->stoneWallingWearOutIterations == -1 && rank == 0){ - fprintf(out_logfile, "WARNING: Could not read back the stonewalling status from the file!"); + fprintf(out_logfile, "WARNING: Could not read back the stonewalling status from the file!\n"); params->stoneWallingWearOutIterations = 0; } } @@ -1404,7 +1417,7 @@ static void TestIoSys(IOR_test_t *test) /* move one node away from writing node */ int shift = 1; /* assume a by-node (round-robin) mapping of tasks to nodes */ if (params->tasksBlockMapping) { - shift=params->tasksPerNode; /* switch to a by-slot (contiguous block) mapping */ + shift=params->numTasksOnNode0; /* switch to a by-slot (contiguous block) mapping */ } rankOffset = (params->taskPerNodeOffset * shift) % params->numTasks; } @@ -1415,7 +1428,7 @@ static void TestIoSys(IOR_test_t *test) int nodeoffset; unsigned int iseed0; nodeoffset = params->taskPerNodeOffset; - nodeoffset = (nodeoffset < params->nodes) ? nodeoffset : params->nodes - 1; + nodeoffset = (nodeoffset < params->numNodes) ? nodeoffset : params->numNodes - 1; if (params->reorderTasksRandomSeed < 0) iseed0 = -1 * params->reorderTasksRandomSeed + rep; else @@ -1425,7 +1438,7 @@ static void TestIoSys(IOR_test_t *test) rankOffset = rand() % params->numTasks; } while (rankOffset < - (nodeoffset * params->tasksPerNode)) { + (nodeoffset * params->numTasksOnNode0)) { rankOffset = rand() % params->numTasks; } /* Get more detailed stats if requested by verbose level */ @@ -1455,7 +1468,7 @@ static void TestIoSys(IOR_test_t *test) "barrier error"); if (rank == 0 && verbose >= VERBOSE_1) { fprintf(out_logfile, - "Commencing read performance test: %s", + "Commencing read performance test: %s\n", CurrentTimeString()); } timer[2] = GetTimeStamp(); diff --git a/src/ior.h b/src/ior.h index e245b08..ccf47fa 100755 --- a/src/ior.h +++ b/src/ior.h @@ -98,8 +98,8 @@ typedef struct // intermediate options int dryRun; /* do not perform any I/Os just run evtl. 
inputs print dummy output */ int numTasks; /* number of tasks for test */ - int nodes; /* number of nodes for test */ - int tasksPerNode; /* number of tasks per node */ + int numNodes; /* number of nodes for test */ + int numTasksOnNode0; /* number of tasks on node 0 (usually all the same, but don't have to be, use with caution) */ int tasksBlockMapping; /* are the tasks in contiguous blocks across nodes or round-robin */ int repetitions; /* number of repetitions of test */ int repCounter; /* rep counter */ diff --git a/src/mdtest.c b/src/mdtest.c index 17f6e5c..eb7ee0f 100644 --- a/src/mdtest.c +++ b/src/mdtest.c @@ -1870,7 +1870,8 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE * mdtest_init_args(); int i, j; - int nodeCount; + int numNodes; + int numTasksOnNode0 = 0; MPI_Group worldgroup, testgroup; struct { int first; @@ -1950,8 +1951,8 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE * pid = getpid(); uid = getuid(); - tasksPerNode = CountTasksPerNode(testComm); - nodeCount = size / tasksPerNode; + numNodes = GetNumNodes(testComm); + numTasksOnNode0 = GetNumTasksOnNode0(testComm); char cmd_buffer[4096]; strncpy(cmd_buffer, argv[0], 4096); @@ -1960,7 +1961,7 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE * } VERBOSE(0,-1,"-- started at %s --\n", PrintTimestamp()); - VERBOSE(0,-1,"mdtest-%s was launched with %d total task(s) on %d node(s)", RELEASE_VERS, size, nodeCount); + VERBOSE(0,-1,"mdtest-%s was launched with %d total task(s) on %d node(s)", RELEASE_VERS, size, numNodes); VERBOSE(0,-1,"Command line used: %s", cmd_buffer); /* adjust special variables */ @@ -2128,10 +2129,10 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE * /* set the shift to mimic IOR and shift by procs per node */ if (nstride > 0) { - if ( nodeCount > 1 && tasksBlockMapping ) { + if ( numNodes > 1 && tasksBlockMapping ) { /* the user set the stride presumably to get the consumer tasks on a different node than the producer tasks however, if the mpirun scheduler placed the tasks by-slot (in a contiguous block) then we need to adjust the shift by ppn */ - nstride *= tasksPerNode; + nstride *= numTasksOnNode0; } VERBOSE(0,5,"Shifting ranks by %d for each phase.", nstride); } diff --git a/src/parse_options.c b/src/parse_options.c index 47f9920..74a7b54 100755 --- a/src/parse_options.c +++ b/src/parse_options.c @@ -151,8 +151,12 @@ void DecodeDirective(char *line, IOR_param_t *params, options_all_t * module_opt params->maxTimeDuration = atoi(value); } else if (strcasecmp(option, "outlierthreshold") == 0) { params->outlierThreshold = atoi(value); - } else if (strcasecmp(option, "nodes") == 0) { - params->nodes = atoi(value); + } else if (strcasecmp(option, "numnodes") == 0) { + params->numNodes = atoi(value); + } else if (strcasecmp(option, "numtasks") == 0) { + params->numTasks = atoi(value); + } else if (strcasecmp(option, "numtasksonnode0") == 0) { + params->numTasksOnNode0 = atoi(value); } else if (strcasecmp(option, "repetitions") == 0) { params->repetitions = atoi(value); } else if (strcasecmp(option, "intertestdelay") == 0) { @@ -286,8 +290,6 @@ void DecodeDirective(char *line, IOR_param_t *params, options_all_t * module_opt params->beegfs_chunkSize = string_to_bytes(value); if (!ISPOWEROFTWO(params->beegfs_chunkSize) || params->beegfs_chunkSize < (1<<16)) ERR("beegfsChunkSize must be a power of two and >64k"); - } else if (strcasecmp(option, "numtasks") == 0) { - params->numTasks = 
atoi(value); } else if (strcasecmp(option, "summaryalways") == 0) { params->summary_every_test = atoi(value); } else { @@ -498,7 +500,7 @@ option_help * createGlobalOptions(IOR_param_t * params){ {'m', NULL, "multiFile -- use number of reps (-i) for multiple file count", OPTION_FLAG, 'd', & params->multiFile}, {'M', NULL, "memoryPerNode -- hog memory on the node (e.g.: 2g, 75%)", OPTION_OPTIONAL_ARGUMENT, 's', & params->memoryPerNodeStr}, {'n', NULL, "noFill -- no fill in HDF5 file creation", OPTION_FLAG, 'd', & params->noFill}, - {'N', NULL, "numTasks -- number of tasks that should participate in the test", OPTION_OPTIONAL_ARGUMENT, 'd', & params->numTasks}, + {'N', NULL, "numTasks -- number of tasks that are participating in the test (overrides MPI)", OPTION_OPTIONAL_ARGUMENT, 'd', & params->numTasks}, {'o', NULL, "testFile -- full name for test", OPTION_OPTIONAL_ARGUMENT, 's', & params->testFileName}, {'O', NULL, "string of IOR directives (e.g. -O checkRead=1,lustreStripeCount=32)", OPTION_OPTIONAL_ARGUMENT, 'p', & decodeDirectiveWrapper}, {'p', NULL, "preallocate -- preallocate file size", OPTION_FLAG, 'd', & params->preallocate}, diff --git a/src/utilities.c b/src/utilities.c index c7e1c8c..a657d9f 100755 --- a/src/utilities.c +++ b/src/utilities.c @@ -53,11 +53,9 @@ extern int errno; extern int numTasks; -/* globals used by other files, also defined "extern" in ior.h */ -int numTasksWorld = 0; +/* globals used by other files, also defined "extern" in utilities.h */ int rank = 0; int rankOffset = 0; -int tasksPerNode = 0; /* tasks per node */ int verbose = VERBOSE_0; /* verbose output */ MPI_Comm testComm; MPI_Comm mpi_comm_world; @@ -265,35 +263,108 @@ int QueryNodeMapping(MPI_Comm comm, int print_nodemap) { return ret; } +/* + * There is a more direct way to determine the node count in modern MPI + * versions so we use that if possible. + * + * For older versions we use a method which should still provide accurate + * results even if the total number of tasks is not evenly divisible by the + * tasks on node rank 0. + */ +int GetNumNodes(MPI_Comm comm) { #if MPI_VERSION >= 3 -int CountTasksPerNode(MPI_Comm comm) { - /* modern MPI provides a simple way to get the local process count */ - MPI_Comm shared_comm; - int count; + MPI_Comm shared_comm; + int shared_rank = 0; + int local_result = 0; + int numNodes = 0; + MPI_CHECK(MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shared_comm), + "MPI_Comm_split_type() error"); + MPI_CHECK(MPI_Comm_rank(shared_comm, &shared_rank), "MPI_Comm_rank() error"); + local_result = shared_rank == 0? 1 : 0; + MPI_CHECK(MPI_Allreduce(&local_result, &numNodes, 1, MPI_INT, MPI_SUM, comm), + "MPI_Allreduce() error"); + MPI_CHECK(MPI_Comm_free(&shared_comm), "MPI_Comm_free() error"); - MPI_Comm_split_type (comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shared_comm); - MPI_Comm_size (shared_comm, &count); - MPI_Comm_free (&shared_comm); + return numNodes; +#else + int numTasks = 0; + int numTasksOnNode0 = 0; - return count; + numTasks = GetNumTasks(comm); + numTasksOnNode0 = GetNumTasksOnNode0(comm); + + return ((numTasks - 1) / numTasksOnNode0) + 1; +#endif } + + +int GetNumTasks(MPI_Comm comm) { + int numTasks = 0; + + MPI_CHECK(MPI_Comm_size(comm, &numTasks), "cannot get number of tasks"); + + return numTasks; +} + + +/* + * It's very important that this method provide the same result to every + * process as it's used for redistributing which jobs read from which files. + * It was renamed accordingly. 
+ *
+ * If different nodes get different results from this method then jobs get
+ * redistributed unevenly and you no longer have a 1:1 relationship with some
+ * nodes reading multiple files while others read none.
+ *
+ * In the common case the number of tasks on each node (MPI_Comm_size on an
+ * MPI_COMM_TYPE_SHARED communicator) will be the same. However, there is
+ * nothing which guarantees this. It's valid to have, for example, 64 jobs
+ * across 4 systems which can run 20 jobs each. In that scenario you end up
+ * with 3 MPI_COMM_TYPE_SHARED groups of 20, and one group of 4.
+ *
+ * In the (MPI_VERSION < 3) implementation of this method consistency is
+ * ensured by asking specifically about the number of tasks on the node with
+ * rank 0. In the original implementation for (MPI_VERSION >= 3) this was
+ * broken by using the LOCAL process count which differed depending on which
+ * node you were on.
+ *
+ * This was corrected below by first splitting the comm into groups by node
+ * (MPI_COMM_TYPE_SHARED) and then having only the node with world rank 0 and
+ * shared rank 0 return the MPI_Comm_size of its shared subgroup. This yields
+ * the original consistent behavior no matter which node asks.
+ *
+ * In the common case where every node has the same number of tasks this
+ * method will return the same value it always has.
+ */
+int GetNumTasksOnNode0(MPI_Comm comm) {
+#if MPI_VERSION >= 3
+    MPI_Comm shared_comm;
+    int shared_rank = 0;
+    int tasks_on_node_rank0 = 0;
+    int local_result = 0;
+
+    MPI_CHECK(MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shared_comm),
+              "MPI_Comm_split_type() error");
+    MPI_CHECK(MPI_Comm_rank(shared_comm, &shared_rank), "MPI_Comm_rank() error");
+    if (rank == 0 && shared_rank == 0) {
+        MPI_CHECK(MPI_Comm_size(shared_comm, &local_result), "MPI_Comm_size() error");
+    }
+    MPI_CHECK(MPI_Allreduce(&local_result, &tasks_on_node_rank0, 1, MPI_INT, MPI_SUM, comm),
+              "MPI_Allreduce() error");
+    MPI_CHECK(MPI_Comm_free(&shared_comm), "MPI_Comm_free() error");
+
+    return tasks_on_node_rank0;
 #else
 /*
- * Count the number of tasks that share a host.
- *
- * This function employees the gethostname() call, rather than using
+ * This version employs the gethostname() call, rather than using
  * MPI_Get_processor_name().  We are interested in knowing the number
  * of tasks that share a file system client (I/O node, compute node,
  * whatever that may be).  However on machines like BlueGene/Q,
  * MPI_Get_processor_name() uniquely identifies a cpu in a compute node,
  * not the node where the I/O is function shipped to.  gethostname()
  * is assumed to identify the shared filesystem client in more situations.
- *
- * NOTE: This also assumes that the task count on all nodes is equal
- * to the task count on the host running MPI task 0.
  */
-int CountTasksPerNode(MPI_Comm comm) {
     int size;
     MPI_Comm_size(comm, & size);
     /* for debugging and testing */
@@ -336,8 +407,8 @@ int CountTasksPerNode(MPI_Comm comm) {
     MPI_Bcast(&count, 1, MPI_INT, 0, comm);

     return(count);
-}
 #endif
+}


 /*
diff --git a/src/utilities.h b/src/utilities.h
index d2c9962..2a9abe3 100755
--- a/src/utilities.h
+++ b/src/utilities.h
@@ -18,10 +18,8 @@
 #include <mpi.h>
 #include "ior.h"

-extern int numTasksWorld;
 extern int rank;
 extern int rankOffset;
-extern int tasksPerNode;
 extern int verbose;
 extern MPI_Comm testComm;
 extern MPI_Comm mpi_comm_world;
@@ -55,8 +53,10 @@ void SeedRandGen(MPI_Comm);
 void SetHints (MPI_Info *, char *);
 void ShowHints (MPI_Info *);
 char *HumanReadable(IOR_offset_t value, int base);
-int CountTasksPerNode(MPI_Comm comm);
 int QueryNodeMapping(MPI_Comm comm, int print_nodemap);
+int GetNumNodes(MPI_Comm);
+int GetNumTasks(MPI_Comm);
+int GetNumTasksOnNode0(MPI_Comm);
 void DelaySecs(int delay);
 void updateParsedOptions(IOR_param_t * options, options_all_t * global_options);
 size_t NodeMemoryStringToBytes(char *size_str);
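The counting scheme that GetNumNodes() and GetNumTasksOnNode0() introduce above can be exercised on its own. Below is a minimal standalone sketch (not part of the patch; all names in it are illustrative) of the same MPI-3 technique: split the communicator into per-node shared-memory groups, then combine the per-node contributions with MPI_Allreduce so that every rank ends up with the same answer.

/*
 * Illustrative sketch only: mirrors the MPI-3 path of GetNumNodes() and
 * GetNumTasksOnNode0() without IOR's MPI_CHECK wrapper.
 */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
        int world_rank, num_tasks, shared_rank, shared_size;
        int one_per_node, num_nodes, node0_size, tasks_on_node0;
        MPI_Comm shared_comm;

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
        MPI_Comm_size(MPI_COMM_WORLD, &num_tasks);

        /* one shared-memory (per-node) communicator per node */
        MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                            MPI_INFO_NULL, &shared_comm);
        MPI_Comm_rank(shared_comm, &shared_rank);
        MPI_Comm_size(shared_comm, &shared_size);

        /* node count: every node contributes exactly one "1" */
        one_per_node = (shared_rank == 0) ? 1 : 0;
        MPI_Allreduce(&one_per_node, &num_nodes, 1, MPI_INT, MPI_SUM,
                      MPI_COMM_WORLD);

        /* tasks on node 0: only world rank 0 contributes its local size */
        node0_size = (world_rank == 0 && shared_rank == 0) ? shared_size : 0;
        MPI_Allreduce(&node0_size, &tasks_on_node0, 1, MPI_INT, MPI_SUM,
                      MPI_COMM_WORLD);

        if (world_rank == 0)
                printf("tasks=%d nodes=%d tasks_on_node0=%d\n",
                       num_tasks, num_nodes, tasks_on_node0);

        MPI_Comm_free(&shared_comm);
        MPI_Finalize();
        return 0;
}

Compiled with mpicc and launched across several nodes, every rank computes the same node count and the same tasks-on-node-0 value, which is the consistency property the GetNumTasksOnNode0() comment insists on.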
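The (MPI_VERSION < 3) fallback in GetNumNodes() rounds up rather than assuming every node holds the same number of tasks. A small worked check of that arithmetic, reusing the 64-task scenario from the GetNumTasksOnNode0() comment and assuming node 0 holds 20 of the tasks (the concrete numbers are illustrative only):

#include <assert.h>

int main(void)
{
        int numTasks = 64;          /* total tasks, as in the comment's example */
        int numTasksOnNode0 = 20;   /* assumed count on node 0 (illustrative)   */

        /* same round-up division used by the (MPI_VERSION < 3) fallback */
        int numNodes = ((numTasks - 1) / numTasksOnNode0) + 1;

        assert(numNodes == 4);      /* correct even though 64 % 20 != 0 */
        return 0;
}

For comparison, the computation the patch removes from mdtest, nodeCount = size / tasksPerNode, would report 3 nodes for this same layout.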