mdtest/src/utilities.c

895 lines
29 KiB
C
Raw Normal View History

/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
* vim:expandtab:shiftwidth=8:tabstop=8:
*/
2011-06-17 23:20:43 +04:00
/******************************************************************************\
* *
* Copyright (c) 2003, The Regents of the University of California *
* See the file COPYRIGHT for a complete copyright notice and license. *
* *
********************************************************************************
*
* Additional utilities
2011-06-17 23:20:43 +04:00
*
\******************************************************************************/
#ifdef HAVE_CONFIG_H
2014-07-31 03:17:21 +04:00
# include "config.h"
#endif
2018-09-18 18:48:59 +03:00
#ifdef __linux__
# define _GNU_SOURCE /* Needed for O_DIRECT in fcntl */
#endif /* __linux__ */
#include <stdarg.h>
#include <stdio.h>
2011-06-17 23:20:43 +04:00
#include <stdlib.h>
#include <errno.h>
#include <fcntl.h>
#include <math.h> /* pow() */
2011-06-17 23:20:43 +04:00
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <time.h>
2014-07-31 03:17:21 +04:00
2011-06-17 23:20:43 +04:00
#ifndef _WIN32
2014-07-31 03:17:21 +04:00
# include <regex.h>
# ifdef __sun /* SunOS does not support statfs(), instead uses statvfs() */
# include <sys/statvfs.h>
# elif (defined __APPLE__)
# include <sys/param.h>
# include <sys/mount.h>
# else /* ! __sun or __APPLE__ */
# include <sys/statfs.h>
# endif /* __sun */
# include <sys/time.h> /* gettimeofday() */
2011-06-17 23:20:43 +04:00
#endif
#include "utilities.h"
#include "aiori.h"
#include "ior.h"
2011-06-17 23:20:43 +04:00
/************************** D E C L A R A T I O N S ***************************/
2011-11-12 03:11:28 +04:00
extern int errno;
extern int numTasks;
Fix #181. On systems where numTasks is not evenly divisible by 'tasksPerNode' we were seeing some nodes reading multiple files while others read none after reordering. Commonly all nodes have the same number of tasks but there is nothing requiring that to be the case. Imagine having 64 tasks running against 4 nodes which can run 20 tasks each. Here you get three groups of 20 and one group of 4. On this sytem nodes running in the group of 4 were previously getting tasksPerNode of 4 which meant they reordered tasks differently than the nodes which got tasksPerNode of 20. The key to fixing this is ensuring that every node reorders tasks the same way, which means ensuring they all use the same input values. Obviously on systems where the number of tasks per node is inconsistent the reordering will also be inconsistent (some tasks may end up on the same node, or not as far separated as desired, etc.) but at least this way you'll always end up with a 1:1 reordering. - Renamed nodes/nodeCount to numNodes - Renamed tasksPerNode to numTasksOnNode0 - Ensured that numTasksOnNode0 will always have the same value regardless of which node you're on - Removed inconsistently used globals numTasksWorld and tasksPerNode and replaced with per-test params equivalents - Added utility functions for setting these values: - numNodes -> GetNumNodes - numTasks -> GetNumTasks - numTasksOnNode0 -> GetNumNodesOnTask0 - Improved MPI_VERSION < 3 logic for GetNumNodes so it works when numTasks is not evenly divisible by numTasksOnNode0 - Left 'nodes' and 'tasksPerNode' in output alone to not break compatibility - Allowed command-line params to override numTasks, numNodes, and numTasksOnNode0 but default to using the MPI-calculated values
2019-08-31 01:45:03 +03:00
/* globals used by other files, also defined "extern" in utilities.h */
int rank = 0;
int rankOffset = 0;
int verbose = VERBOSE_0; /* verbose output */
MPI_Comm testComm;
MPI_Comm mpi_comm_world;
FILE * out_logfile;
2018-07-08 15:47:55 +03:00
FILE * out_resultfile;
2018-07-08 15:07:32 +03:00
enum OutputFormat_t outputFormat;
2011-06-17 23:20:43 +04:00
/***************************** F U N C T I O N S ******************************/
void* safeMalloc(uint64_t size){
void * d = malloc(size);
if (d == NULL){
ERR("Could not malloc an array");
}
memset(d, 0, size);
return d;
}
void FailMessage(int rank, const char *location, char *format, ...) {
char msg[4096];
va_list args;
va_start(args, format);
vsnprintf(msg, 4096, format, args);
va_end(args);
fprintf(out_logfile, "%s: Process %d: FAILED in %s, %s: %s\n",
PrintTimestamp(), rank, location, msg, strerror(errno));
fflush(out_logfile);
MPI_Abort(testComm, 1);
}
size_t NodeMemoryStringToBytes(char *size_str)
{
int percent;
int rc;
long page_size;
long num_pages;
long long mem;
rc = sscanf(size_str, " %d %% ", &percent);
if (rc == 0)
return (size_t) string_to_bytes(size_str);
if (percent > 100 || percent < 0)
ERR("percentage must be between 0 and 100");
#ifdef HAVE_SYSCONF
page_size = sysconf(_SC_PAGESIZE);
#else
page_size = getpagesize();
#endif
#ifdef _SC_PHYS_PAGES
num_pages = sysconf(_SC_PHYS_PAGES);
if (num_pages == -1)
ERR("sysconf(_SC_PHYS_PAGES) is not supported");
#else
ERR("sysconf(_SC_PHYS_PAGES) is not supported");
#endif
mem = page_size * num_pages;
return mem / 100 * percent;
}
void updateParsedOptions(IOR_param_t * options, options_all_t * global_options){
if (options->setTimeStampSignature){
options->incompressibleSeed = options->setTimeStampSignature;
}
if (options->buffer_type && options->buffer_type[0] != 0){
switch(options->buffer_type[0]) {
case 'i': /* Incompressible */
options->dataPacketType = incompressible;
break;
case 't': /* timestamp */
options->dataPacketType = timestamp;
break;
case 'o': /* offset packet */
options->storeFileOffset = TRUE;
options->dataPacketType = offset;
break;
default:
fprintf(out_logfile,
"Unknown argument for -l %s; generic assumed\n", options->buffer_type);
break;
}
}
if (options->memoryPerNodeStr){
options->memoryPerNode = NodeMemoryStringToBytes(options->memoryPerNodeStr);
}
const ior_aiori_t * backend = aiori_select(options->api);
if (backend == NULL)
ERR("Unrecognized I/O API");
options->backend = backend;
/* copy the actual module options into the test */
options->backend_options = airoi_update_module_options(backend, global_options);
options->apiVersion = backend->get_version();
}
/* Used in aiori-POSIX.c and aiori-PLFS.c
*/
void set_o_direct_flag(int *fd)
{
/* note that TRU64 needs O_DIRECTIO, SunOS uses directio(),
and everyone else needs O_DIRECT */
#ifndef O_DIRECT
# ifndef O_DIRECTIO
WARN("cannot use O_DIRECT");
# define O_DIRECT 000000
# else /* O_DIRECTIO */
# define O_DIRECT O_DIRECTIO
# endif /* not O_DIRECTIO */
#endif /* not O_DIRECT */
*fd |= O_DIRECT;
}
2011-06-17 23:20:43 +04:00
/*
* Returns string containing the current time.
*
* NOTE: On some systems, MPI jobs hang while ctime() waits for a lock.
* This is true even though CurrentTimeString() is only called for rank==0.
* ctime_r() fixes this.
2011-06-17 23:20:43 +04:00
*/
char *CurrentTimeString(void)
2011-06-17 23:20:43 +04:00
{
static time_t currentTime;
char* currentTimePtr;
2011-06-17 23:20:43 +04:00
if ((currentTime = time(NULL)) == -1)
ERR("cannot get current time");
#if (_POSIX_C_SOURCE >= 1 || _XOPEN_SOURCE || _BSD_SOURCE || _SVID_SOURCE || _POSIX_SOURCE)
static char threadSafeBuff[32]; /* "must be at least 26 characters long" */
if ((currentTimePtr = ctime_r(&currentTime, threadSafeBuff)) == NULL) {
ERR("cannot read current time");
}
#else
if ((currentTimePtr = ctime(&currentTime)) == NULL) {
ERR("cannot read current time");
}
#endif
/* ctime string ends in \n */
return (currentTimePtr);
2011-11-12 03:11:28 +04:00
}
2011-06-17 23:20:43 +04:00
/*
* Dump transfer buffer.
*/
void DumpBuffer(void *buffer,
size_t size) /* <size> in bytes */
2011-06-17 23:20:43 +04:00
{
size_t i, j;
IOR_size_t *dumpBuf = (IOR_size_t *)buffer;
2011-06-17 23:20:43 +04:00
/* Turns out, IOR_size_t is unsigned long long, but we don't want
to assume that it must always be */
for (i = 0; i < ((size / sizeof(IOR_size_t)) / 4); i++) {
for (j = 0; j < 4; j++) {
fprintf(out_logfile, IOR_format" ", dumpBuf[4 * i + j]);
}
fprintf(out_logfile, "\n");
2011-06-17 23:20:43 +04:00
}
return;
} /* DumpBuffer() */
2011-06-17 23:20:43 +04:00
/* a function that prints an int array where each index corresponds to a rank
and the value is whether that rank is on the same host as root.
Also returns 1 if rank 1 is on same host and 0 otherwise
*/
int QueryNodeMapping(MPI_Comm comm, int print_nodemap) {
char localhost[MAX_PATHLEN], roothost[MAX_PATHLEN];
int num_ranks;
MPI_Comm_size(comm, &num_ranks);
int *node_map = (int*)malloc(sizeof(int) * num_ranks);
if ( ! node_map ) {
FAIL("malloc");
}
if (gethostname(localhost, MAX_PATHLEN) != 0) {
FAIL("gethostname()");
}
if (rank==0) {
strncpy(roothost,localhost,MAX_PATHLEN);
}
/* have rank 0 broadcast out its hostname */
MPI_Bcast(roothost, MAX_PATHLEN, MPI_CHAR, 0, comm);
//printf("Rank %d received root host as %s\n", rank, roothost);
/* then every rank figures out whether it is same host as root and then gathers that */
int same_as_root = strcmp(roothost,localhost) == 0;
MPI_Gather( &same_as_root, 1, MPI_INT, node_map, 1, MPI_INT, 0, comm);
if ( print_nodemap && rank==0) {
fprintf( out_logfile, "Nodemap: " );
for ( int i = 0; i < num_ranks; i++ ) {
fprintf( out_logfile, "%d", node_map[i] );
}
fprintf( out_logfile, "\n" );
}
int ret = 1;
if(num_ranks>1)
ret = node_map[1] == 1;
MPI_Bcast(&ret, 1, MPI_INT, 0, comm);
free(node_map);
return ret;
}
Fix #181. On systems where numTasks is not evenly divisible by 'tasksPerNode' we were seeing some nodes reading multiple files while others read none after reordering. Commonly all nodes have the same number of tasks but there is nothing requiring that to be the case. Imagine having 64 tasks running against 4 nodes which can run 20 tasks each. Here you get three groups of 20 and one group of 4. On this sytem nodes running in the group of 4 were previously getting tasksPerNode of 4 which meant they reordered tasks differently than the nodes which got tasksPerNode of 20. The key to fixing this is ensuring that every node reorders tasks the same way, which means ensuring they all use the same input values. Obviously on systems where the number of tasks per node is inconsistent the reordering will also be inconsistent (some tasks may end up on the same node, or not as far separated as desired, etc.) but at least this way you'll always end up with a 1:1 reordering. - Renamed nodes/nodeCount to numNodes - Renamed tasksPerNode to numTasksOnNode0 - Ensured that numTasksOnNode0 will always have the same value regardless of which node you're on - Removed inconsistently used globals numTasksWorld and tasksPerNode and replaced with per-test params equivalents - Added utility functions for setting these values: - numNodes -> GetNumNodes - numTasks -> GetNumTasks - numTasksOnNode0 -> GetNumNodesOnTask0 - Improved MPI_VERSION < 3 logic for GetNumNodes so it works when numTasks is not evenly divisible by numTasksOnNode0 - Left 'nodes' and 'tasksPerNode' in output alone to not break compatibility - Allowed command-line params to override numTasks, numNodes, and numTasksOnNode0 but default to using the MPI-calculated values
2019-08-31 01:45:03 +03:00
/*
* There is a more direct way to determine the node count in modern MPI
* versions so we use that if possible.
*
* For older versions we use a method which should still provide accurate
* results even if the total number of tasks is not evenly divisible by the
* tasks on node rank 0.
*/
int GetNumNodes(MPI_Comm comm) {
if (getenv("IOR_FAKE_NODES")){
int numNodes = atoi(getenv("IOR_FAKE_NODES"));
int rank;
MPI_Comm_rank(comm, & rank);
if(rank == 0){
printf("Fake number of node: using %d\n", numNodes);
}
return numNodes;
}
#if MPI_VERSION >= 3
Fix #181. On systems where numTasks is not evenly divisible by 'tasksPerNode' we were seeing some nodes reading multiple files while others read none after reordering. Commonly all nodes have the same number of tasks but there is nothing requiring that to be the case. Imagine having 64 tasks running against 4 nodes which can run 20 tasks each. Here you get three groups of 20 and one group of 4. On this sytem nodes running in the group of 4 were previously getting tasksPerNode of 4 which meant they reordered tasks differently than the nodes which got tasksPerNode of 20. The key to fixing this is ensuring that every node reorders tasks the same way, which means ensuring they all use the same input values. Obviously on systems where the number of tasks per node is inconsistent the reordering will also be inconsistent (some tasks may end up on the same node, or not as far separated as desired, etc.) but at least this way you'll always end up with a 1:1 reordering. - Renamed nodes/nodeCount to numNodes - Renamed tasksPerNode to numTasksOnNode0 - Ensured that numTasksOnNode0 will always have the same value regardless of which node you're on - Removed inconsistently used globals numTasksWorld and tasksPerNode and replaced with per-test params equivalents - Added utility functions for setting these values: - numNodes -> GetNumNodes - numTasks -> GetNumTasks - numTasksOnNode0 -> GetNumNodesOnTask0 - Improved MPI_VERSION < 3 logic for GetNumNodes so it works when numTasks is not evenly divisible by numTasksOnNode0 - Left 'nodes' and 'tasksPerNode' in output alone to not break compatibility - Allowed command-line params to override numTasks, numNodes, and numTasksOnNode0 but default to using the MPI-calculated values
2019-08-31 01:45:03 +03:00
MPI_Comm shared_comm;
int shared_rank = 0;
int local_result = 0;
int numNodes = 0;
MPI_CHECK(MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shared_comm),
"MPI_Comm_split_type() error");
MPI_CHECK(MPI_Comm_rank(shared_comm, &shared_rank), "MPI_Comm_rank() error");
local_result = shared_rank == 0? 1 : 0;
MPI_CHECK(MPI_Allreduce(&local_result, &numNodes, 1, MPI_INT, MPI_SUM, comm),
"MPI_Allreduce() error");
MPI_CHECK(MPI_Comm_free(&shared_comm), "MPI_Comm_free() error");
return numNodes;
#else
int numTasks = 0;
int numTasksOnNode0 = 0;
Fix #181. On systems where numTasks is not evenly divisible by 'tasksPerNode' we were seeing some nodes reading multiple files while others read none after reordering. Commonly all nodes have the same number of tasks but there is nothing requiring that to be the case. Imagine having 64 tasks running against 4 nodes which can run 20 tasks each. Here you get three groups of 20 and one group of 4. On this sytem nodes running in the group of 4 were previously getting tasksPerNode of 4 which meant they reordered tasks differently than the nodes which got tasksPerNode of 20. The key to fixing this is ensuring that every node reorders tasks the same way, which means ensuring they all use the same input values. Obviously on systems where the number of tasks per node is inconsistent the reordering will also be inconsistent (some tasks may end up on the same node, or not as far separated as desired, etc.) but at least this way you'll always end up with a 1:1 reordering. - Renamed nodes/nodeCount to numNodes - Renamed tasksPerNode to numTasksOnNode0 - Ensured that numTasksOnNode0 will always have the same value regardless of which node you're on - Removed inconsistently used globals numTasksWorld and tasksPerNode and replaced with per-test params equivalents - Added utility functions for setting these values: - numNodes -> GetNumNodes - numTasks -> GetNumTasks - numTasksOnNode0 -> GetNumNodesOnTask0 - Improved MPI_VERSION < 3 logic for GetNumNodes so it works when numTasks is not evenly divisible by numTasksOnNode0 - Left 'nodes' and 'tasksPerNode' in output alone to not break compatibility - Allowed command-line params to override numTasks, numNodes, and numTasksOnNode0 but default to using the MPI-calculated values
2019-08-31 01:45:03 +03:00
numTasks = GetNumTasks(comm);
numTasksOnNode0 = GetNumTasksOnNode0(comm);
return ((numTasks - 1) / numTasksOnNode0) + 1;
#endif
}
2011-06-17 23:20:43 +04:00
Fix #181. On systems where numTasks is not evenly divisible by 'tasksPerNode' we were seeing some nodes reading multiple files while others read none after reordering. Commonly all nodes have the same number of tasks but there is nothing requiring that to be the case. Imagine having 64 tasks running against 4 nodes which can run 20 tasks each. Here you get three groups of 20 and one group of 4. On this sytem nodes running in the group of 4 were previously getting tasksPerNode of 4 which meant they reordered tasks differently than the nodes which got tasksPerNode of 20. The key to fixing this is ensuring that every node reorders tasks the same way, which means ensuring they all use the same input values. Obviously on systems where the number of tasks per node is inconsistent the reordering will also be inconsistent (some tasks may end up on the same node, or not as far separated as desired, etc.) but at least this way you'll always end up with a 1:1 reordering. - Renamed nodes/nodeCount to numNodes - Renamed tasksPerNode to numTasksOnNode0 - Ensured that numTasksOnNode0 will always have the same value regardless of which node you're on - Removed inconsistently used globals numTasksWorld and tasksPerNode and replaced with per-test params equivalents - Added utility functions for setting these values: - numNodes -> GetNumNodes - numTasks -> GetNumTasks - numTasksOnNode0 -> GetNumNodesOnTask0 - Improved MPI_VERSION < 3 logic for GetNumNodes so it works when numTasks is not evenly divisible by numTasksOnNode0 - Left 'nodes' and 'tasksPerNode' in output alone to not break compatibility - Allowed command-line params to override numTasks, numNodes, and numTasksOnNode0 but default to using the MPI-calculated values
2019-08-31 01:45:03 +03:00
int GetNumTasks(MPI_Comm comm) {
int numTasks = 0;
MPI_CHECK(MPI_Comm_size(comm, &numTasks), "cannot get number of tasks");
return numTasks;
}
Fix #181. On systems where numTasks is not evenly divisible by 'tasksPerNode' we were seeing some nodes reading multiple files while others read none after reordering. Commonly all nodes have the same number of tasks but there is nothing requiring that to be the case. Imagine having 64 tasks running against 4 nodes which can run 20 tasks each. Here you get three groups of 20 and one group of 4. On this sytem nodes running in the group of 4 were previously getting tasksPerNode of 4 which meant they reordered tasks differently than the nodes which got tasksPerNode of 20. The key to fixing this is ensuring that every node reorders tasks the same way, which means ensuring they all use the same input values. Obviously on systems where the number of tasks per node is inconsistent the reordering will also be inconsistent (some tasks may end up on the same node, or not as far separated as desired, etc.) but at least this way you'll always end up with a 1:1 reordering. - Renamed nodes/nodeCount to numNodes - Renamed tasksPerNode to numTasksOnNode0 - Ensured that numTasksOnNode0 will always have the same value regardless of which node you're on - Removed inconsistently used globals numTasksWorld and tasksPerNode and replaced with per-test params equivalents - Added utility functions for setting these values: - numNodes -> GetNumNodes - numTasks -> GetNumTasks - numTasksOnNode0 -> GetNumNodesOnTask0 - Improved MPI_VERSION < 3 logic for GetNumNodes so it works when numTasks is not evenly divisible by numTasksOnNode0 - Left 'nodes' and 'tasksPerNode' in output alone to not break compatibility - Allowed command-line params to override numTasks, numNodes, and numTasksOnNode0 but default to using the MPI-calculated values
2019-08-31 01:45:03 +03:00
/*
Fix #181. On systems where numTasks is not evenly divisible by 'tasksPerNode' we were seeing some nodes reading multiple files while others read none after reordering. Commonly all nodes have the same number of tasks but there is nothing requiring that to be the case. Imagine having 64 tasks running against 4 nodes which can run 20 tasks each. Here you get three groups of 20 and one group of 4. On this sytem nodes running in the group of 4 were previously getting tasksPerNode of 4 which meant they reordered tasks differently than the nodes which got tasksPerNode of 20. The key to fixing this is ensuring that every node reorders tasks the same way, which means ensuring they all use the same input values. Obviously on systems where the number of tasks per node is inconsistent the reordering will also be inconsistent (some tasks may end up on the same node, or not as far separated as desired, etc.) but at least this way you'll always end up with a 1:1 reordering. - Renamed nodes/nodeCount to numNodes - Renamed tasksPerNode to numTasksOnNode0 - Ensured that numTasksOnNode0 will always have the same value regardless of which node you're on - Removed inconsistently used globals numTasksWorld and tasksPerNode and replaced with per-test params equivalents - Added utility functions for setting these values: - numNodes -> GetNumNodes - numTasks -> GetNumTasks - numTasksOnNode0 -> GetNumNodesOnTask0 - Improved MPI_VERSION < 3 logic for GetNumNodes so it works when numTasks is not evenly divisible by numTasksOnNode0 - Left 'nodes' and 'tasksPerNode' in output alone to not break compatibility - Allowed command-line params to override numTasks, numNodes, and numTasksOnNode0 but default to using the MPI-calculated values
2019-08-31 01:45:03 +03:00
* It's very important that this method provide the same result to every
* process as it's used for redistributing which jobs read from which files.
* It was renamed accordingly.
*
* If different nodes get different results from this method then jobs get
* redistributed unevenly and you no longer have a 1:1 relationship with some
* nodes reading multiple files while others read none.
*
Fix #181. On systems where numTasks is not evenly divisible by 'tasksPerNode' we were seeing some nodes reading multiple files while others read none after reordering. Commonly all nodes have the same number of tasks but there is nothing requiring that to be the case. Imagine having 64 tasks running against 4 nodes which can run 20 tasks each. Here you get three groups of 20 and one group of 4. On this sytem nodes running in the group of 4 were previously getting tasksPerNode of 4 which meant they reordered tasks differently than the nodes which got tasksPerNode of 20. The key to fixing this is ensuring that every node reorders tasks the same way, which means ensuring they all use the same input values. Obviously on systems where the number of tasks per node is inconsistent the reordering will also be inconsistent (some tasks may end up on the same node, or not as far separated as desired, etc.) but at least this way you'll always end up with a 1:1 reordering. - Renamed nodes/nodeCount to numNodes - Renamed tasksPerNode to numTasksOnNode0 - Ensured that numTasksOnNode0 will always have the same value regardless of which node you're on - Removed inconsistently used globals numTasksWorld and tasksPerNode and replaced with per-test params equivalents - Added utility functions for setting these values: - numNodes -> GetNumNodes - numTasks -> GetNumTasks - numTasksOnNode0 -> GetNumNodesOnTask0 - Improved MPI_VERSION < 3 logic for GetNumNodes so it works when numTasks is not evenly divisible by numTasksOnNode0 - Left 'nodes' and 'tasksPerNode' in output alone to not break compatibility - Allowed command-line params to override numTasks, numNodes, and numTasksOnNode0 but default to using the MPI-calculated values
2019-08-31 01:45:03 +03:00
* In the common case the number of tasks on each node (MPI_Comm_size on an
* MPI_COMM_TYPE_SHARED communicator) will be the same. However, there is
* nothing which guarantees this. It's valid to have, for example, 64 jobs
* across 4 systems which can run 20 jobs each. In that scenario you end up
* with 3 MPI_COMM_TYPE_SHARED groups of 20, and one group of 4.
*
* In the (MPI_VERSION < 3) implementation of this method consistency is
* ensured by asking specifically about the number of tasks on the node with
* rank 0. In the original implementation for (MPI_VERSION >= 3) this was
* broken by using the LOCAL process count which differed depending on which
* node you were on.
*
* This was corrected below by first splitting the comm into groups by node
* (MPI_COMM_TYPE_SHARED) and then having only the node with world rank 0 and
* shared rank 0 return the MPI_Comm_size of its shared subgroup. This yields
* the original consistent behavior no matter which node asks.
*
* In the common case where every node has the same number of tasks this
* method will return the same value it always has.
*/
int GetNumTasksOnNode0(MPI_Comm comm) {
if (getenv("IOR_FAKE_TASK_PER_NODES")){
int tasksPerNode = atoi(getenv("IOR_FAKE_TASK_PER_NODES"));
int rank;
MPI_Comm_rank(comm, & rank);
if(rank == 0){
printf("Fake tasks per node: using %d\n", tasksPerNode);
}
return tasksPerNode;
}
Fix #181. On systems where numTasks is not evenly divisible by 'tasksPerNode' we were seeing some nodes reading multiple files while others read none after reordering. Commonly all nodes have the same number of tasks but there is nothing requiring that to be the case. Imagine having 64 tasks running against 4 nodes which can run 20 tasks each. Here you get three groups of 20 and one group of 4. On this sytem nodes running in the group of 4 were previously getting tasksPerNode of 4 which meant they reordered tasks differently than the nodes which got tasksPerNode of 20. The key to fixing this is ensuring that every node reorders tasks the same way, which means ensuring they all use the same input values. Obviously on systems where the number of tasks per node is inconsistent the reordering will also be inconsistent (some tasks may end up on the same node, or not as far separated as desired, etc.) but at least this way you'll always end up with a 1:1 reordering. - Renamed nodes/nodeCount to numNodes - Renamed tasksPerNode to numTasksOnNode0 - Ensured that numTasksOnNode0 will always have the same value regardless of which node you're on - Removed inconsistently used globals numTasksWorld and tasksPerNode and replaced with per-test params equivalents - Added utility functions for setting these values: - numNodes -> GetNumNodes - numTasks -> GetNumTasks - numTasksOnNode0 -> GetNumNodesOnTask0 - Improved MPI_VERSION < 3 logic for GetNumNodes so it works when numTasks is not evenly divisible by numTasksOnNode0 - Left 'nodes' and 'tasksPerNode' in output alone to not break compatibility - Allowed command-line params to override numTasks, numNodes, and numTasksOnNode0 but default to using the MPI-calculated values
2019-08-31 01:45:03 +03:00
#if MPI_VERSION >= 3
MPI_Comm shared_comm;
int shared_rank = 0;
int tasks_on_node_rank0 = 0;
int local_result = 0;
MPI_CHECK(MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shared_comm),
"MPI_Comm_split_type() error");
MPI_CHECK(MPI_Comm_rank(shared_comm, &shared_rank), "MPI_Comm_rank() error");
if (rank == 0 && shared_rank == 0) {
MPI_CHECK(MPI_Comm_size(shared_comm, &local_result), "MPI_Comm_size() error");
}
MPI_CHECK(MPI_Allreduce(&local_result, &tasks_on_node_rank0, 1, MPI_INT, MPI_SUM, comm),
"MPI_Allreduce() error");
MPI_CHECK(MPI_Comm_free(&shared_comm), "MPI_Comm_free() error");
return tasks_on_node_rank0;
#else
/*
* This version employs the gethostname() call, rather than using
* MPI_Get_processor_name(). We are interested in knowing the number
* of tasks that share a file system client (I/O node, compute node,
* whatever that may be). However on machines like BlueGene/Q,
* MPI_Get_processor_name() uniquely identifies a cpu in a compute node,
* not the node where the I/O is function shipped to. gethostname()
* is assumed to identify the shared filesystem client in more situations.
*/
2018-07-08 00:46:13 +03:00
int size;
MPI_Comm_size(comm, & size);
/* for debugging and testing */
2018-07-08 00:42:05 +03:00
char localhost[MAX_PATHLEN],
hostname[MAX_PATHLEN];
int count = 1,
i;
MPI_Status status;
if (( rank == 0 ) && ( verbose >= 1 )) {
fprintf( out_logfile, "V-1: Entering count_tasks_per_node...\n" );
fflush( out_logfile );
}
2018-07-08 00:42:05 +03:00
if (gethostname(localhost, MAX_PATHLEN) != 0) {
FAIL("gethostname()");
}
if (rank == 0) {
2018-07-08 00:46:13 +03:00
/* MPI_receive all hostnames, and compares them to the local hostname */
for (i = 0; i < size-1; i++) {
2018-07-08 00:42:05 +03:00
MPI_Recv(hostname, MAX_PATHLEN, MPI_CHAR, MPI_ANY_SOURCE,
MPI_ANY_TAG, comm, &status);
if (strcmp(hostname, localhost) == 0) {
count++;
}
}
} else {
/* MPI_send hostname to root node */
MPI_Send(localhost, MAX_PATHLEN, MPI_CHAR, 0, 0, comm);
}
MPI_Bcast(&count, 1, MPI_INT, 0, comm);
2011-06-17 23:20:43 +04:00
return(count);
#endif
Fix #181. On systems where numTasks is not evenly divisible by 'tasksPerNode' we were seeing some nodes reading multiple files while others read none after reordering. Commonly all nodes have the same number of tasks but there is nothing requiring that to be the case. Imagine having 64 tasks running against 4 nodes which can run 20 tasks each. Here you get three groups of 20 and one group of 4. On this sytem nodes running in the group of 4 were previously getting tasksPerNode of 4 which meant they reordered tasks differently than the nodes which got tasksPerNode of 20. The key to fixing this is ensuring that every node reorders tasks the same way, which means ensuring they all use the same input values. Obviously on systems where the number of tasks per node is inconsistent the reordering will also be inconsistent (some tasks may end up on the same node, or not as far separated as desired, etc.) but at least this way you'll always end up with a 1:1 reordering. - Renamed nodes/nodeCount to numNodes - Renamed tasksPerNode to numTasksOnNode0 - Ensured that numTasksOnNode0 will always have the same value regardless of which node you're on - Removed inconsistently used globals numTasksWorld and tasksPerNode and replaced with per-test params equivalents - Added utility functions for setting these values: - numNodes -> GetNumNodes - numTasks -> GetNumTasks - numTasksOnNode0 -> GetNumNodesOnTask0 - Improved MPI_VERSION < 3 logic for GetNumNodes so it works when numTasks is not evenly divisible by numTasksOnNode0 - Left 'nodes' and 'tasksPerNode' in output alone to not break compatibility - Allowed command-line params to override numTasks, numNodes, and numTasksOnNode0 but default to using the MPI-calculated values
2019-08-31 01:45:03 +03:00
}
2011-06-17 23:20:43 +04:00
/*
* Extract key/value pair from hint string.
*/
void ExtractHint(char *settingVal, char *valueVal, char *hintString)
{
char *settingPtr, *valuePtr, *tmpPtr2;
/* find the value */
settingPtr = (char *)strtok(hintString, " =");
valuePtr = (char *)strtok(NULL, " =\t\r\n");
/* is this an MPI hint? */
tmpPtr2 = (char *) strstr(settingPtr, "IOR_HINT__MPI__");
if (settingPtr == tmpPtr2) {
settingPtr += strlen("IOR_HINT__MPI__");
} else {
tmpPtr2 = (char *) strstr(hintString, "IOR_HINT__GPFS__");
/* is it an GPFS hint? */
if (settingPtr == tmpPtr2) {
settingPtr += strlen("IOR_HINT__GPFS__");
}else{
fprintf(out_logfile, "WARNING: Unable to set unknown hint type (not implemented.)\n");
return;
}
}
strcpy(settingVal, settingPtr);
strcpy(valueVal, valuePtr);
}
2011-06-17 23:20:43 +04:00
/*
* Set hints for MPIIO, HDF5, or NCMPI.
*/
void SetHints(MPI_Info * mpiHints, char *hintsFileName)
2011-06-17 23:20:43 +04:00
{
2011-11-12 03:11:28 +04:00
char hintString[MAX_STR];
char settingVal[MAX_STR];
char valueVal[MAX_STR];
extern char **environ;
int i;
FILE *fd;
/*
* This routine checks for hints from the environment and/or from the
* hints files. The hints are of the form:
* 'IOR_HINT__<layer>__<hint>=<value>', where <layer> is either 'MPI'
* or 'GPFS', <hint> is the full name of the hint to be set, and <value>
* is the hint value. E.g., 'setenv IOR_HINT__MPI__IBM_largeblock_io true'
* or 'IOR_HINT__GPFS__hint=value' in the hints file.
*/
MPI_CHECK(MPI_Info_create(mpiHints), "cannot create info object");
/* get hints from environment */
for (i = 0; environ[i] != NULL; i++) {
/* if this is an IOR_HINT, pass the hint to the info object */
if (strncmp(environ[i], "IOR_HINT", strlen("IOR_HINT")) == 0) {
strcpy(hintString, environ[i]);
ExtractHint(settingVal, valueVal, hintString);
MPI_CHECK(MPI_Info_set(*mpiHints, settingVal, valueVal),
"cannot set info object");
}
2011-06-17 23:20:43 +04:00
}
/* get hints from hints file */
if (hintsFileName != NULL && strcmp(hintsFileName, "") != 0) {
/* open the hint file */
fd = fopen(hintsFileName, "r");
if (fd == NULL) {
WARN("cannot open hints file");
} else {
/* iterate over hints file */
while (fgets(hintString, MAX_STR, fd) != NULL) {
if (strncmp
(hintString, "IOR_HINT",
strlen("IOR_HINT")) == 0) {
ExtractHint(settingVal, valueVal,
hintString);
MPI_CHECK(MPI_Info_set
(*mpiHints, settingVal,
valueVal),
"cannot set info object");
}
}
/* close the hints files */
if (fclose(fd) != 0)
ERR("cannot close hints file");
2011-06-17 23:20:43 +04:00
}
}
2011-11-12 03:11:28 +04:00
}
2011-06-17 23:20:43 +04:00
/*
* Show all hints (key/value pairs) in an MPI_Info object.
*/
void ShowHints(MPI_Info * mpiHints)
{
2011-11-12 03:11:28 +04:00
char key[MPI_MAX_INFO_VAL];
char value[MPI_MAX_INFO_VAL];
int flag, i, nkeys;
MPI_CHECK(MPI_Info_get_nkeys(*mpiHints, &nkeys),
"cannot get info object keys");
for (i = 0; i < nkeys; i++) {
MPI_CHECK(MPI_Info_get_nthkey(*mpiHints, i, key),
"cannot get info object key");
MPI_CHECK(MPI_Info_get(*mpiHints, key, MPI_MAX_INFO_VAL - 1,
value, &flag),
"cannot get info object value");
fprintf(out_logfile, "\t%s = %s\n", key, value);
}
2011-11-12 03:11:28 +04:00
}
2011-06-17 23:20:43 +04:00
/*
* Takes a string of the form 64, 8m, 128k, 4g, etc. and converts to bytes.
*/
IOR_offset_t StringToBytes(char *size_str)
2011-06-17 23:20:43 +04:00
{
IOR_offset_t size = 0;
char range;
int rc;
rc = sscanf(size_str, "%lld%c", &size, &range);
if (rc == 2) {
switch ((int)range) {
case 'k':
case 'K':
size <<= 10;
break;
case 'm':
case 'M':
size <<= 20;
break;
case 'g':
case 'G':
size <<= 30;
break;
}
} else if (rc == 0) {
size = -1;
2011-06-17 23:20:43 +04:00
}
return (size);
2011-11-12 03:11:28 +04:00
}
2011-06-17 23:20:43 +04:00
/*
* Displays size of file system and percent of data blocks and inodes used.
*/
void ShowFileSystemSize(char *fileSystem) // this might be converted to an AIORI call
2011-06-17 23:20:43 +04:00
{
#ifndef _WIN32 /* FIXME */
char realPath[PATH_MAX];
2011-11-12 03:11:28 +04:00
char *fileSystemUnitStr;
long long int totalFileSystemSize;
long long int freeFileSystemSize;
long long int totalInodes;
long long int freeInodes;
double totalFileSystemSizeHR;
double usedFileSystemPercentage;
double usedInodePercentage;
#ifdef __sun /* SunOS does not support statfs(), instead uses statvfs() */
struct statvfs statusBuffer;
#else /* !__sun */
struct statfs statusBuffer;
#endif /* __sun */
2011-06-17 23:20:43 +04:00
#ifdef __sun
if (statvfs(fileSystem, &statusBuffer) != 0) {
WARN("unable to statvfs() file system");
return;
}
#else /* !__sun */
if (statfs(fileSystem, &statusBuffer) != 0) {
WARN("unable to statfs() file system");
return;
}
#endif /* __sun */
/* data blocks */
2011-06-17 23:20:43 +04:00
#ifdef __sun
totalFileSystemSize = statusBuffer.f_blocks * statusBuffer.f_frsize;
freeFileSystemSize = statusBuffer.f_bfree * statusBuffer.f_frsize;
#else /* !__sun */
totalFileSystemSize = statusBuffer.f_blocks * statusBuffer.f_bsize;
freeFileSystemSize = statusBuffer.f_bfree * statusBuffer.f_bsize;
#endif /* __sun */
usedFileSystemPercentage = (1 - ((double)freeFileSystemSize
/ (double)totalFileSystemSize)) * 100;
totalFileSystemSizeHR =
2011-11-12 03:11:28 +04:00
(double)totalFileSystemSize / (double)(1<<30);
fileSystemUnitStr = "GiB";
if (totalFileSystemSizeHR > 1024) {
totalFileSystemSizeHR = (double)totalFileSystemSize / (double)((long long)1<<40);
fileSystemUnitStr = "TiB";
}
/* inodes */
totalInodes = statusBuffer.f_files;
freeInodes = statusBuffer.f_ffree;
usedInodePercentage =
(1 - ((double)freeInodes / (double)totalInodes)) * 100;
2011-06-17 23:20:43 +04:00
/* show results */
if (realpath(fileSystem, realPath) == NULL) {
WARN("unable to use realpath()");
return;
}
if(outputFormat == OUTPUT_DEFAULT){
2018-07-08 18:56:04 +03:00
fprintf(out_resultfile, "%-20s: %s\n", "Path", realPath);
fprintf(out_resultfile, "%-20s: %.1f %s Used FS: %2.1f%% ",
"FS", totalFileSystemSizeHR, fileSystemUnitStr,
usedFileSystemPercentage);
fprintf(out_resultfile, "Inodes: %.1f Mi Used Inodes: %2.1f%%\n",
(double)totalInodes / (double)(1<<20),
usedInodePercentage);
fflush(out_logfile);
}else if(outputFormat == OUTPUT_JSON){
2018-07-08 18:47:23 +03:00
fprintf(out_resultfile, " , \"Path\": \"%s\",", realPath);
fprintf(out_resultfile, "\"Capacity\": \"%.1f %s\", \"Used Capacity\": \"%2.1f%%\",",
totalFileSystemSizeHR, fileSystemUnitStr,
usedFileSystemPercentage);
2018-07-08 18:47:23 +03:00
fprintf(out_resultfile, "\"Inodes\": \"%.1f Mi\", \"Used Inodes\" : \"%2.1f%%\"\n",
(double)totalInodes / (double)(1<<20),
usedInodePercentage);
}else if(outputFormat == OUTPUT_CSV){
}
2011-11-12 03:11:28 +04:00
#endif /* !_WIN32 */
return;
2011-11-12 03:11:28 +04:00
}
2011-06-17 23:20:43 +04:00
/*
* Return match of regular expression -- 0 is failure, 1 is success.
*/
int Regex(char *string, char *pattern)
2011-06-17 23:20:43 +04:00
{
int retValue = 0;
#ifndef _WIN32 /* Okay to always not match */
regex_t regEx;
regmatch_t regMatch;
regcomp(&regEx, pattern, REG_EXTENDED);
if (regexec(&regEx, string, 1, &regMatch, 0) == 0) {
retValue = 1;
}
regfree(&regEx);
2011-06-17 23:20:43 +04:00
#endif
return (retValue);
2011-11-12 03:11:28 +04:00
}
2011-06-17 23:20:43 +04:00
/*
* Seed random generator.
*/
void SeedRandGen(MPI_Comm testComm)
2011-06-17 23:20:43 +04:00
{
unsigned int randomSeed;
2011-06-17 23:20:43 +04:00
if (rank == 0) {
2011-06-17 23:20:43 +04:00
#ifdef _WIN32
rand_s(&randomSeed);
2011-06-17 23:20:43 +04:00
#else
struct timeval randGenTimer;
gettimeofday(&randGenTimer, (struct timezone *)NULL);
randomSeed = randGenTimer.tv_usec;
2011-06-17 23:20:43 +04:00
#endif
}
MPI_CHECK(MPI_Bcast(&randomSeed, 1, MPI_INT, 0,
testComm), "cannot broadcast random seed value");
srandom(randomSeed);
2011-11-12 03:11:28 +04:00
}
2011-06-17 23:20:43 +04:00
/*
* System info for Windows.
*/
#ifdef _WIN32
int uname(struct utsname *name)
{
DWORD nodeNameSize = sizeof(name->nodename) - 1;
memset(name, 0, sizeof(struct utsname));
if (!GetComputerNameEx
(ComputerNameDnsFullyQualified, name->nodename, &nodeNameSize))
ERR("GetComputerNameEx failed");
strncpy(name->sysname, "Windows", sizeof(name->sysname) - 1);
/* FIXME - these should be easy to fetch */
strncpy(name->release, "-", sizeof(name->release) - 1);
strncpy(name->version, "-", sizeof(name->version) - 1);
strncpy(name->machine, "-", sizeof(name->machine) - 1);
return 0;
2011-06-17 23:20:43 +04:00
}
2011-11-12 03:11:28 +04:00
#endif /* _WIN32 */
double wall_clock_deviation;
double wall_clock_delta = 0;
/*
* Get time stamp. Use MPI_Timer() unless _NO_MPI_TIMER is defined,
* in which case use gettimeofday().
*/
double GetTimeStamp(void)
{
double timeVal;
#ifdef _NO_MPI_TIMER
struct timeval timer;
if (gettimeofday(&timer, (struct timezone *)NULL) != 0)
ERR("cannot use gettimeofday()");
timeVal = (double)timer.tv_sec + ((double)timer.tv_usec / 1000000);
#else /* not _NO_MPI_TIMER */
timeVal = MPI_Wtime(); /* no MPI_CHECK(), just check return value */
if (timeVal < 0)
ERR("cannot use MPI_Wtime()");
#endif /* _NO_MPI_TIMER */
/* wall_clock_delta is difference from root node's time */
timeVal -= wall_clock_delta;
return (timeVal);
}
/*
* Determine any spread (range) between node times.
*/
static double TimeDeviation(void)
{
double timestamp;
double min = 0;
double max = 0;
double roottimestamp;
MPI_CHECK(MPI_Barrier(mpi_comm_world), "barrier error");
timestamp = GetTimeStamp();
MPI_CHECK(MPI_Reduce(&timestamp, &min, 1, MPI_DOUBLE,
MPI_MIN, 0, mpi_comm_world),
"cannot reduce tasks' times");
MPI_CHECK(MPI_Reduce(&timestamp, &max, 1, MPI_DOUBLE,
MPI_MAX, 0, mpi_comm_world),
"cannot reduce tasks' times");
/* delta between individual nodes' time and root node's time */
roottimestamp = timestamp;
MPI_CHECK(MPI_Bcast(&roottimestamp, 1, MPI_DOUBLE, 0, mpi_comm_world),
"cannot broadcast root's time");
wall_clock_delta = timestamp - roottimestamp;
return max - min;
}
void init_clock(){
/* check for skew between tasks' start times */
wall_clock_deviation = TimeDeviation();
}
char * PrintTimestamp() {
static char datestring[80];
time_t cur_timestamp;
if (( rank == 0 ) && ( verbose >= 1 )) {
fprintf( out_logfile, "V-1: Entering PrintTimestamp...\n" );
}
fflush(out_logfile);
cur_timestamp = time(NULL);
strftime(datestring, 80, "%m/%d/%Y %T", localtime(&cur_timestamp));
return datestring;
}
int64_t ReadStoneWallingIterations(char * const filename){
long long data;
if(rank != 0){
MPI_Bcast( & data, 1, MPI_LONG_LONG_INT, 0, mpi_comm_world);
return data;
}else{
FILE * out = fopen(filename, "r");
if (out == NULL){
2018-07-14 14:22:36 +03:00
data = -1;
MPI_Bcast( & data, 1, MPI_LONG_LONG_INT, 0, mpi_comm_world);
return data;
}
int ret = fscanf(out, "%lld", & data);
if (ret != 1){
return -1;
}
fclose(out);
MPI_Bcast( & data, 1, MPI_LONG_LONG_INT, 0, mpi_comm_world);
return data;
}
}
void StoreStoneWallingIterations(char * const filename, int64_t count){
if(rank != 0){
return;
}
FILE * out = fopen(filename, "w");
if (out == NULL){
FAIL("Cannot write to the stonewalling file!");
}
fprintf(out, "%lld", (long long) count);
fclose(out);
}
/*
* Sleep for 'delay' seconds.
*/
void DelaySecs(int delay){
if (rank == 0 && delay > 0) {
if (verbose >= VERBOSE_1)
fprintf(out_logfile, "delaying %d seconds . . .\n", delay);
sleep(delay);
}
}
2018-07-08 15:38:05 +03:00
/*
* Convert IOR_offset_t value to human readable string. This routine uses a
* statically-allocated buffer internally and so is not re-entrant.
*/
char *HumanReadable(IOR_offset_t value, int base)
{
static char valueStr[MAX_STR];
IOR_offset_t m = 0, g = 0, t = 0;
char m_str[8], g_str[8], t_str[8];
if (base == BASE_TWO) {
m = MEBIBYTE;
g = GIBIBYTE;
t = GIBIBYTE * 1024llu;
strcpy(m_str, "MiB");
strcpy(g_str, "GiB");
strcpy(t_str, "TiB");
} else if (base == BASE_TEN) {
m = MEGABYTE;
g = GIGABYTE;
t = GIGABYTE * 1000llu;
strcpy(m_str, "MB");
strcpy(g_str, "GB");
strcpy(t_str, "TB");
}
if (value >= t) {
if (value % t) {
snprintf(valueStr, MAX_STR-1, "%.2f %s",
(double)((double)value / t), t_str);
} else {
snprintf(valueStr, MAX_STR-1, "%d %s", (int)(value / t), t_str);
}
}else if (value >= g) {
if (value % g) {
snprintf(valueStr, MAX_STR-1, "%.2f %s",
(double)((double)value / g), g_str);
} else {
snprintf(valueStr, MAX_STR-1, "%d %s", (int)(value / g), g_str);
}
} else if (value >= m) {
if (value % m) {
snprintf(valueStr, MAX_STR-1, "%.2f %s",
(double)((double)value / m), m_str);
} else {
snprintf(valueStr, MAX_STR-1, "%d %s", (int)(value / m), m_str);
}
} else if (value >= 0) {
snprintf(valueStr, MAX_STR-1, "%d bytes", (int)value);
} else {
snprintf(valueStr, MAX_STR-1, "-");
}
return valueStr;
}