mdtest/src/ior.h

217 lines
9.1 KiB
C
Raw Normal View History

/* -*- mode: c; indent-tabs-mode: nil; -*-
* vim:expandtab:
*
* NOTE: Someone was setting indent-sizes in the mode-line. Don't do that.
* 8-chars of indenting is ridiculous. If you really want 8-spaces,
* then change the mode-line to use tabs, and configure your personal
* editor environment to use 8-space tab-stops.
*
*/
2011-06-17 23:20:43 +04:00
/******************************************************************************\
* *
* Copyright (c) 2003, The Regents of the University of California *
* See the file COPYRIGHT for a complete copyright notice and license. *
* *
\******************************************************************************/
#ifndef _IOR_H
#define _IOR_H
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#ifdef USE_HDFS_AIORI
# include <hdfs.h> /* hdfsFS */
#else
# include <stdint.h>
typedef uint16_t tPort; /* unused, but needs a type */
typedef void* hdfsFS; /* unused, but needs a type */
#endif
#ifdef USE_RADOS_AIORI
# include <rados/librados.h>
#else
typedef void *rados_t;
typedef void *rados_ioctx_t;
#endif
#include "option.h"
#include "iordef.h"
#include "aiori.h"
#include <mpi.h>
#ifndef MPI_FILE_NULL
# include <mpio.h>
#endif /* not MPI_FILE_NULL */
#define ISPOWEROFTWO(x) ((x != 0) && !(x & (x - 1)))
typedef enum{
IOR_MEMORY_TYPE_CPU = 0,
IOR_MEMORY_TYPE_GPU_MANAGED = 1,
IOR_MEMORY_TYPE_GPU_DEVICE_ONLY = 2,
} ior_memory_flags;
/***************** IOR_BUFFERS *************************************************/
/* A struct to hold the buffers so we can pass 1 pointer around instead of 3
*/
typedef struct IO_BUFFERS
{
void* buffer;
void* checkBuffer;
void* readCheckBuffer;
} IOR_io_buffers;
/******************************************************************************/
/*
* The parameter struct holds all of the "global" data to be passed,
* as well as results to be parsed.
*
* NOTE: If IOR_Param_t is changed, also change:
* init_IOR_Param_t() [ior.c]
* DisplayUsage() [ior.c]
* ShowTest() [ior.c]
* DecodeDirective() [parse_options.c]
* ParseCommandLine() [parse_options.c]
* USER_GUIDE
*/
typedef struct
{
const struct ior_aiori * backend;
char * debug; /* debug info string */
2012-01-14 02:05:13 +04:00
int referenceNumber; /* user supplied reference number */
char * api; /* API for I/O */
char * apiVersion; /* API version */
char * platform; /* platform type */
char * testFileName; /* full name for test */
char * options; /* options string */
// intermediate options
int collective; /* collective I/O */
MPI_Comm testComm; /* Current MPI communicator */
MPI_Comm mpi_comm_world; /* The global MPI communicator */
int dryRun; /* do not perform any I/Os just run evtl. inputs print dummy output */
int dualMount; /* dual mount points */
ior_memory_flags gpuMemoryFlags; /* use the GPU to store the data */
int gpuDirect; /* use gpuDirect, this influences gpuMemoryFlags as well */
int gpuID; /* the GPU to use for gpuDirect or memory options */
int numTasks; /* number of tasks for test */
Fix #181. On systems where numTasks is not evenly divisible by 'tasksPerNode' we were seeing some nodes reading multiple files while others read none after reordering. Commonly all nodes have the same number of tasks but there is nothing requiring that to be the case. Imagine having 64 tasks running against 4 nodes which can run 20 tasks each. Here you get three groups of 20 and one group of 4. On this sytem nodes running in the group of 4 were previously getting tasksPerNode of 4 which meant they reordered tasks differently than the nodes which got tasksPerNode of 20. The key to fixing this is ensuring that every node reorders tasks the same way, which means ensuring they all use the same input values. Obviously on systems where the number of tasks per node is inconsistent the reordering will also be inconsistent (some tasks may end up on the same node, or not as far separated as desired, etc.) but at least this way you'll always end up with a 1:1 reordering. - Renamed nodes/nodeCount to numNodes - Renamed tasksPerNode to numTasksOnNode0 - Ensured that numTasksOnNode0 will always have the same value regardless of which node you're on - Removed inconsistently used globals numTasksWorld and tasksPerNode and replaced with per-test params equivalents - Added utility functions for setting these values: - numNodes -> GetNumNodes - numTasks -> GetNumTasks - numTasksOnNode0 -> GetNumNodesOnTask0 - Improved MPI_VERSION < 3 logic for GetNumNodes so it works when numTasks is not evenly divisible by numTasksOnNode0 - Left 'nodes' and 'tasksPerNode' in output alone to not break compatibility - Allowed command-line params to override numTasks, numNodes, and numTasksOnNode0 but default to using the MPI-calculated values
2019-08-31 01:45:03 +03:00
int numNodes; /* number of nodes for test */
int numTasksOnNode0; /* number of tasks on node 0 (usually all the same, but don't have to be, use with caution) */
int tasksBlockMapping; /* are the tasks in contiguous blocks across nodes or round-robin */
int repetitions; /* number of repetitions of test */
int repCounter; /* rep counter */
int multiFile; /* multiple files */
int interTestDelay; /* delay between reps in seconds */
int interIODelay; /* delay after each I/O in us */
int open; /* flag for writing or reading */
int readFile; /* read of existing file */
int writeFile; /* write of file */
int filePerProc; /* single file or file-per-process */
int reorderTasks; /* reorder tasks for read back and check */
int taskPerNodeOffset; /* task node offset for reading files */
int reorderTasksRandom; /* reorder tasks for random file read back */
int reorderTasksRandomSeed; /* reorder tasks for random file read seed */
int checkWrite; /* check read after write */
int checkRead; /* check read after read */
int keepFile; /* don't delete the testfile on exit */
int keepFileWithError; /* don't delete the testfile with errors */
int errorFound; /* error found in data check */
IOR_offset_t segmentCount; /* number of segments (or HDF5 datasets) */
IOR_offset_t blockSize; /* contiguous bytes to write per task */
IOR_offset_t transferSize; /* size of transfer in bytes */
IOR_offset_t expectedAggFileSize; /* calculated aggregate file size */
IOR_offset_t randomPrefillBlocksize; /* prefill option for random IO, the amount of data used for prefill */
char * saveRankDetailsCSV; /* save the details about the performance to a file */
int summary_every_test; /* flag to print summary every test, not just at end */
int uniqueDir; /* use unique directory for each fpp */
int useExistingTestFile; /* do not delete test file before access */
int deadlineForStonewalling; /* max time in seconds to run any test phase */
2020-07-03 10:09:40 +03:00
int stoneWallingWearOut; /* wear out the stonewalling, once the timeout is over, each process has to write the same amount */
uint64_t stoneWallingWearOutIterations; /* the number of iterations for the stonewallingWearOut, needed for readBack */
char * stoneWallingStatusFile;
2012-01-09 00:30:05 +04:00
int maxTimeDuration; /* max time in minutes to run each test */
int outlierThreshold; /* warn on outlier N seconds from mean */
int verbose; /* verbosity */
int setTimeStampSignature; /* set time stamp signature */
unsigned int timeStampSignatureValue; /* value for time stamp signature */
int randomSeed; /* random seed for write/read check */
unsigned int incompressibleSeed; /* random seed for incompressible file creation */
int randomOffset; /* access is to random offsets */
2012-01-09 06:41:30 +04:00
size_t memoryPerTask; /* additional memory used per task */
size_t memoryPerNode; /* additional memory used per node */
char * memoryPerNodeStr; /* for parsing */
char * testscripts; /* for parsing */
char * buffer_type; /* for parsing */
ior_dataPacketType_e dataPacketType; /* The type of data packet. */
void * backend_options; /* Backend-specific options */
/* POSIX variables */
int singleXferAttempt; /* do not retry transfer if incomplete */
int fsyncPerWrite; /* fsync() after each write */
int fsync; /* fsync() after write */
char* URI; /* "path" to target object */
/* RADOS variables */
rados_t rados_cluster; /* RADOS cluster handle */
rados_ioctx_t rados_ioctx; /* I/O context for our pool in the RADOS cluster */
int id; /* test's unique ID */
int intraTestBarriers; /* barriers between open/op and op/close */
int warningAsErrors; /* treat any warning as an error */
aiori_xfer_hint_t hints;
} IOR_param_t;
/* each pointer for a single test */
typedef struct {
double time;
size_t pairs_accessed; // number of I/Os done, useful for deadlineForStonewalling
double stonewall_time;
2020-12-04 00:07:45 +03:00
long long stonewall_min_data_accessed; // of all processes
long long stonewall_avg_data_accessed; // across all processes
long long stonewall_total_data_accessed; // sum accross all processes
IOR_offset_t aggFileSizeFromStat;
IOR_offset_t aggFileSizeFromXfer;
IOR_offset_t aggFileSizeForBW;
} IOR_point_t;
typedef struct {
int errors;
IOR_point_t write;
IOR_point_t read;
} IOR_results_t;
/* define the queuing structure for the test parameters */
typedef struct IOR_test_t {
IOR_param_t params;
IOR_results_t *results;
struct IOR_test_t *next;
} IOR_test_t;
IOR_test_t *CreateTest(IOR_param_t *init_params, int test_num);
2018-08-29 19:49:41 +03:00
void AllocResults(IOR_test_t *test);
char * GetPlatformName(void);
void init_IOR_Param_t(IOR_param_t *p, MPI_Comm global_com);
2011-06-17 23:20:43 +04:00
/*
* This function runs IOR given by command line, useful for testing
*/
IOR_test_t * ior_run(int argc, char **argv, MPI_Comm world_com, FILE * out_logfile);
2011-06-17 23:20:43 +04:00
2018-07-07 13:42:21 +03:00
/* Actual IOR Main function, renamed to allow library usage */
int ior_main(int argc, char **argv);
#endif /* !_IOR_H */