mdtest/src/mdtest.c


/*
* Copyright (C) 2003, The Regents of the University of California.
* Produced at the Lawrence Livermore National Laboratory.
* Written by Christopher J. Morrone <morrone@llnl.gov>,
* Bill Loewe <loewe@loewe.net>, Tyce McLarty <mclarty@llnl.gov>,
* and Ryan Kroiss <rrkroiss@lanl.gov>.
* All rights reserved.
* UCRL-CODE-155800
*
* Please read the COPYRIGHT file.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License (as published by
* the Free Software Foundation) version 2, dated June 1991.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the IMPLIED WARRANTY OF
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* terms and conditions of the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* CVS info:
* $RCSfile: mdtest.c,v $
* $Revision: 1.4 $
* $Date: 2013/11/27 17:05:31 $
* $Author: brettkettering $
*/
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <inttypes.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <stdarg.h>
#include "option.h"
#include "utilities.h"
#if HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#if HAVE_SYS_MOUNT_H
#include <sys/mount.h>
#endif
#if HAVE_SYS_STATFS_H
#include <sys/statfs.h>
#endif
#if HAVE_SYS_STATVFS_H
#include <sys/statvfs.h>
#endif
#include <fcntl.h>
#include <string.h>
#if HAVE_STRINGS_H
#include <strings.h>
#endif
#include <unistd.h>
#include <dirent.h>
#include <errno.h>
#include <time.h>
#include <sys/time.h>
#include "aiori.h"
#include "ior.h"
#include "mdtest.h"
#include <mpi.h>
#pragma GCC diagnostic ignored "-Wformat-overflow"
#ifdef HAVE_LUSTRE_LUSTREAPI
#include <lustre/lustreapi.h>
#endif /* HAVE_LUSTRE_LUSTREAPI */
#define FILEMODE S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH
#define DIRMODE S_IRUSR|S_IWUSR|S_IXUSR|S_IRGRP|S_IWGRP|S_IXGRP|S_IROTH|S_IXOTH
#define RELEASE_VERS META_VERSION
#define TEST_DIR "test-dir"
#define ITEM_COUNT 25000
#define LLU "%lu"
typedef struct {
int size;
uint64_t *rand_array;
char testdir[MAX_PATHLEN];
char testdirpath[MAX_PATHLEN];
char base_tree_name[MAX_PATHLEN];
char **filenames;
char hostname[MAX_PATHLEN];
char mk_name[MAX_PATHLEN];
char stat_name[MAX_PATHLEN];
char read_name[MAX_PATHLEN];
char rm_name[MAX_PATHLEN];
char unique_mk_dir[MAX_PATHLEN];
char unique_chdir_dir[MAX_PATHLEN];
char unique_stat_dir[MAX_PATHLEN];
char unique_read_dir[MAX_PATHLEN];
char unique_rm_dir[MAX_PATHLEN];
char unique_rm_uni_dir[MAX_PATHLEN];
char *write_buffer;
char *stoneWallingStatusFile;
int gpu_memory_flags;
int barriers;
int create_only;
int stat_only;
int read_only;
int verify_read;
int verify_write;
int verification_error;
int remove_only;
int rename_dirs;
int leaf_only;
unsigned branch_factor;
int depth;
int random_buffer_offset; /* user settable value, otherwise random */
/*
* This is likely a small value, but it is sometimes computed as
* branch_factor^(depth+1), so we make it a larger type,
* just in case.
*/
uint64_t num_dirs_in_tree;
/*
* As we start moving towards Exascale, we could have billions
* of files in a directory. Make room for that possibility with
* a larger variable.
*/
uint64_t items;
uint64_t items_per_dir;
uint64_t num_dirs_in_tree_calc; /* this is a workaround until the overall code is refactored */
int directory_loops;
int print_time;
int print_rate_and_time;
int print_all_proc;
int show_perrank_statistics;
ior_dataPacketType_e dataPacketType;
int random_seed;
int shared_file;
int files_only;
int dirs_only;
int pre_delay;
int unique_dir_per_task;
int time_unique_dir_overhead;
int collective_creates;
size_t write_bytes;
int stone_wall_timer_seconds;
size_t read_bytes;
int sync_file;
int call_sync;
int path_count;
int nstride; /* neighbor stride */
int make_node;
#ifdef HAVE_LUSTRE_LUSTREAPI
int global_dir_layout;
#endif /* HAVE_LUSTRE_LUSTREAPI */
char * saveRankDetailsCSV; /* save the details about the performance to a file */
const char *prologue;
const char *epilogue;
mdtest_results_t * summary_table;
pid_t pid;
uid_t uid;
/* Use the POSIX backend by default */
const ior_aiori_t *backend;
void * backend_options;
aiori_xfer_hint_t hints;
char * api;
} mdtest_options_t;
static mdtest_options_t o;
/* This structure describes the processing status for stonewalling */
typedef struct{
double start_time;
int stone_wall_timer_seconds;
uint64_t items_start;
uint64_t items_done;
uint64_t items_per_dir;
} rank_progress_t;
#define CHECK_STONE_WALL(p) (((p)->stone_wall_timer_seconds != 0) && ((GetTimeStamp() - (p)->start_time) > (p)->stone_wall_timer_seconds))
/* for making/removing unique directory && stating/deleting subdirectory */
enum {MK_UNI_DIR, STAT_SUB_DIR, READ_SUB_DIR, RM_SUB_DIR, RM_UNI_DIR};
#define PRINT(...) fprintf(out_logfile, __VA_ARGS__);
/* A helper function for printing debug and verbose messages.
Use the VERBOSE macro, which inserts __LINE__ for you.
Pass the verbose level required for root to print, then the verbose level required for any rank to print.
Pass -1 as the second level to restrict the message to root only.
Then pass the usual printf arguments. A trailing newline is added for you.
*/
#define VERBOSE(root,any,...) VerboseMessage(root,any,__LINE__,__VA_ARGS__)
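/* Example: VERBOSE(3,5,"stat file: %llu", (unsigned long long) i) prints when rank 0 has
verbose >= 3, or when any rank has verbose >= 5. */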
void VerboseMessage (int root_level, int any_level, int line, char * format, ...) {
if ((rank==0 && verbose >= root_level) || (any_level > 0 && verbose >= any_level)) {
char buffer[1024];
va_list args;
va_start (args, format);
vsnprintf (buffer, 1024, format, args);
va_end (args);
if (root_level == 0 && any_level == -1) {
/* No header when it is just the standard output */
fprintf( out_logfile, "%s\n", buffer );
} else {
/* add a header when the verbose is greater than 0 */
fprintf( out_logfile, "V-%d: Rank %3d Line %5d %s\n", root_level, rank, line, buffer );
}
fflush(out_logfile);
}
}
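/* Split the '@'-separated list of test directories in dirpath_arg into o.filenames and
count the entries in o.path_count, e.g. "/mnt/fs1@/mnt/fs2" yields two paths. */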
void parse_dirpath(char *dirpath_arg) {
char * tmp, * token;
char delimiter_string[3] = { '@', '\n', '\0' };
int i = 0;
VERBOSE(1,-1, "Entering parse_dirpath on %s...", dirpath_arg );
tmp = dirpath_arg;
if (* tmp != '\0') o.path_count++;
while (* tmp != '\0') {
if (* tmp == '@') {
o.path_count++;
}
tmp++;
}
// prevent changes to the original dirpath_arg
dirpath_arg = strdup(dirpath_arg);
o.filenames = (char **) safeMalloc(o.path_count * sizeof(char *));
token = strtok(dirpath_arg, delimiter_string);
while (token != NULL) {
o.filenames[i] = token;
token = strtok(NULL, delimiter_string);
i++;
}
}
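/* Build o.testdir as "<testdirpath>/test-dir.<j>-<dir_iter>" for test iteration j and
directory loop dir_iter. */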
static void prep_testdir(int j, int dir_iter){
int pos = sprintf(o.testdir, "%s", o.testdirpath);
if ( o.testdir[strlen( o.testdir ) - 1] != '/' ) {
pos += sprintf(& o.testdir[pos], "/");
}
pos += sprintf(& o.testdir[pos], "%s", TEST_DIR);
pos += sprintf(& o.testdir[pos], ".%d-%d", j, dir_iter);
}
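/* Run the user-supplied prologue command (if any) before a phase, followed by an optional barrier. */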
static void phase_prepare(){
if (*o.prologue){
VERBOSE(0,5,"calling prologue: \"%s\"", o.prologue);
system(o.prologue);
if (o.barriers) {
MPI_Barrier(testComm);
}
}
}
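/* Finish a phase: optionally sync the backend, run the user-supplied epilogue command,
and barrier if barriers are enabled. */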
static void phase_end(){
if (o.call_sync){
if(! o.backend->sync){
FAIL("Error, backend does not provide the sync method, but you requested to use sync.\n");
}
o.backend->sync(o.backend_options);
}
if (*o.epilogue){
VERBOSE(0,5,"calling epilogue: \"%s\"", o.epilogue);
system(o.epilogue);
}
if (o.barriers) {
MPI_Barrier(testComm);
}
}
/*
* This function copies the unique directory name for a given option into
* the "to" parameter. The caller must allocate the memory for "to".
*/
void unique_dir_access(int opt, char *to) {
if (opt == MK_UNI_DIR) {
MPI_Barrier(testComm);
sprintf( to, "%s/%s", o.testdir, o.unique_chdir_dir );
} else if (opt == STAT_SUB_DIR) {
sprintf( to, "%s/%s", o.testdir, o.unique_stat_dir );
} else if (opt == READ_SUB_DIR) {
sprintf( to, "%s/%s", o.testdir, o.unique_read_dir );
} else if (opt == RM_SUB_DIR) {
sprintf( to, "%s/%s", o.testdir, o.unique_rm_dir );
} else if (opt == RM_UNI_DIR) {
sprintf( to, "%s/%s", o.testdir, o.unique_rm_uni_dir );
}
VERBOSE(1,-1,"Entering unique_dir_access, set it to %s", to );
}
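/* Create or remove a single directory named "dir.<mk_name|rm_name><itemNum>" below path. */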
static void create_remove_dirs (const char *path, bool create, uint64_t itemNum) {
char curr_item[MAX_PATHLEN];
const char *operation = create ? "create" : "remove";
if ( (itemNum % ITEM_COUNT==0 && (itemNum != 0))) {
VERBOSE(3,5,"dir: "LLU"", operation, itemNum);
}
//create dirs
sprintf(curr_item, "%s/dir.%s%" PRIu64, path, create ? o.mk_name : o.rm_name, itemNum);
VERBOSE(3,5,"create_remove_items_helper (dirs %s): curr_item is '%s'", operation, curr_item);
if (create) {
if (o.backend->mkdir(curr_item, DIRMODE, o.backend_options) == -1) {
EWARNF("unable to create directory %s", curr_item);
}
} else {
if (o.backend->rmdir(curr_item, o.backend_options) == -1) {
EWARNF("unable to remove directory %s", curr_item);
}
}
}
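/* Remove a single file named "file.<rm_name><itemNum>" below path; for shared files only
rank 0 issues the delete. */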
static void remove_file (const char *path, uint64_t itemNum) {
char curr_item[MAX_PATHLEN];
if ( (itemNum % ITEM_COUNT==0 && (itemNum != 0))) {
VERBOSE(3,5,"remove file: "LLU"\n", itemNum);
}
//remove files
sprintf(curr_item, "%s/file.%s"LLU"", path, o.rm_name, itemNum);
VERBOSE(3,5,"create_remove_items_helper (non-dirs remove): curr_item is '%s'", curr_item);
if (!(o.shared_file && rank != 0)) {
o.backend->delete (curr_item, o.backend_options);
}
}
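/* Create a single file named "file.<mk_name><itemNum>" below path, either via mknod, a
collective open, or a regular create, then optionally write and verify o.write_bytes. */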
static void create_file (const char *path, uint64_t itemNum) {
char curr_item[MAX_PATHLEN];
aiori_fd_t *aiori_fh = NULL;
if ( (itemNum % ITEM_COUNT==0 && (itemNum != 0))) {
VERBOSE(3,5,"create file: "LLU"", itemNum);
}
//create files
sprintf(curr_item, "%s/file.%s"LLU"", path, o.mk_name, itemNum);
VERBOSE(3,5,"create_remove_items_helper (non-dirs create): curr_item is '%s'", curr_item);
if (o.make_node) {
int ret;
VERBOSE(3,5,"create_remove_items_helper : mknod..." );
ret = o.backend->mknod (curr_item);
if (ret != 0)
EWARNF("unable to mknode file %s", curr_item);
return;
} else if (o.collective_creates) {
VERBOSE(3,5,"create_remove_items_helper (collective): open..." );
aiori_fh = o.backend->open (curr_item, IOR_WRONLY | IOR_CREAT, o.backend_options);
if (NULL == aiori_fh){
EWARNF("unable to open file %s", curr_item);
return;
}
/*
* !collective_creates
*/
} else {
o.hints.filePerProc = ! o.shared_file;
VERBOSE(3,5,"create_remove_items_helper (non-collective, shared): open..." );
aiori_fh = o.backend->create (curr_item, IOR_WRONLY | IOR_CREAT, o.backend_options);
if (NULL == aiori_fh){
EWARNF("unable to create file %s", curr_item);
return;
}
}
if (o.write_bytes > 0) {
VERBOSE(3,5,"create_remove_items_helper: write..." );
o.hints.fsyncPerWrite = o.sync_file;
update_write_memory_pattern(itemNum, o.write_buffer, o.write_bytes, o.random_buffer_offset, rank, o.dataPacketType);
if ( o.write_bytes != (size_t) o.backend->xfer(WRITE, aiori_fh, (IOR_size_t *) o.write_buffer, o.write_bytes, 0, o.backend_options)) {
EWARNF("unable to write file %s", curr_item);
}
if (o.verify_write) {
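/* overwrite the first byte so the read-back below cannot pass verification with stale data */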
o.write_buffer[0] = 42;
if (o.write_bytes != (size_t) o.backend->xfer(READ, aiori_fh, (IOR_size_t *) o.write_buffer, o.write_bytes, 0, o.backend_options)) {
EWARNF("unable to verify write (read/back) file %s", curr_item);
}
int error = verify_memory_pattern(itemNum, o.write_buffer, o.write_bytes, o.random_buffer_offset, rank, o.dataPacketType);
o.verification_error += error;
if(error){
VERBOSE(1,1,"verification error in file: %s", curr_item);
}
}
}
VERBOSE(3,5,"create_remove_items_helper: close..." );
o.backend->close (aiori_fh, o.backend_options);
}
/* helper for creating/removing items */
void create_remove_items_helper(const int dirs, const int create, const char *path,
uint64_t itemNum, rank_progress_t * progress) {
VERBOSE(1,-1,"Entering create_remove_items_helper on %s", path );
for (uint64_t i = progress->items_start; i < progress->items_per_dir ; ++i) {
if (!dirs) {
if (create) {
create_file (path, itemNum + i);
} else {
remove_file (path, itemNum + i);
}
} else {
create_remove_dirs (path, create, itemNum + i);
}
if(CHECK_STONE_WALL(progress)){
if(progress->items_done == 0){
progress->items_done = i + 1;
}
return;
}
}
progress->items_done = progress->items_per_dir;
}
/* helper function to do collective operations */
void collective_helper(const int dirs, const int create, const char* path, uint64_t itemNum, rank_progress_t * progress) {
char curr_item[MAX_PATHLEN];
VERBOSE(1,-1,"Entering collective_helper on %s", path );
for (uint64_t i = progress->items_start ; i < progress->items_per_dir ; ++i) {
if (dirs) {
create_remove_dirs (path, create, itemNum + i);
continue;
}
sprintf(curr_item, "%s/file.%s"LLU"", path, create ? o.mk_name : o.rm_name, itemNum+i);
VERBOSE(3,5,"create file: %s", curr_item);
if (create) {
aiori_fd_t *aiori_fh;
//create files
aiori_fh = o.backend->create (curr_item, IOR_WRONLY | IOR_CREAT, o.backend_options);
if (NULL == aiori_fh) {
EWARNF("unable to create file %s", curr_item);
}else{
o.backend->close (aiori_fh, o.backend_options);
}
} else if (!(o.shared_file && rank != 0)) {
//remove files
o.backend->delete (curr_item, o.backend_options);
}
if(CHECK_STONE_WALL(progress)){
progress->items_done = i + 1;
return;
}
}
progress->items_done = progress->items_per_dir;
}
/* recursive function to create and remove files/directories from the
directory tree */
void create_remove_items(int currDepth, const int dirs, const int create, const int collective, const char *path, uint64_t dirNum, rank_progress_t * progress) {
unsigned i;
char dir[MAX_PATHLEN];
char temp_path[MAX_PATHLEN];
unsigned long long currDir = dirNum;
VERBOSE(1,-1,"Entering create_remove_items on %s, currDepth = %d...", path, currDepth );
memset(dir, 0, MAX_PATHLEN);
strcpy(temp_path, path);
VERBOSE(3,5,"create_remove_items (start): temp_path is '%s'", temp_path );
if (currDepth == 0) {
/* create items at this depth */
if (! o.leaf_only || (o.depth == 0 && o.leaf_only)) {
if (collective) {
collective_helper(dirs, create, temp_path, 0, progress);
} else {
create_remove_items_helper(dirs, create, temp_path, 0, progress);
}
}
if (o.depth > 0) {
create_remove_items(++currDepth, dirs, create,
collective, temp_path, ++dirNum, progress);
}
} else if (currDepth <= o.depth) {
/* iterate through the branches */
for (i=0; i< o.branch_factor; i++) {
/* determine the current branch and append it to the path */
sprintf(dir, "%s.%llu/", o.base_tree_name, currDir);
strcat(temp_path, "/");
strcat(temp_path, dir);
VERBOSE(3,5,"create_remove_items (for loop): temp_path is '%s'", temp_path );
/* create the items in this branch */
if (! o.leaf_only || (o.leaf_only && currDepth == o.depth)) {
if (collective) {
collective_helper(dirs, create, temp_path, currDir* o.items_per_dir, progress);
} else {
create_remove_items_helper(dirs, create, temp_path, currDir*o.items_per_dir, progress);
}
}
/* make the recursive call for the next level below this branch */
create_remove_items(
++currDepth,
dirs,
create,
collective,
temp_path,
( currDir * ( unsigned long long ) o.branch_factor ) + 1,
progress
);
currDepth--;
/* reset the path */
strcpy(temp_path, path);
currDir++;
}
}
}
/* stats all of the items created as specified by the input parameters */
void mdtest_stat(const int random, const int dirs, const long dir_iter, const char *path, rank_progress_t * progress) {
struct stat buf;
uint64_t parent_dir, item_num = 0;
char item[MAX_PATHLEN], temp[MAX_PATHLEN];
VERBOSE(1,-1,"Entering mdtest_stat on %s", path );
uint64_t stop_items = o.items;
if( o.directory_loops != 1 ){
stop_items = o.items_per_dir;
}
/* iterate over all of the item IDs */
for (uint64_t i = 0 ; i < stop_items ; ++i) {
/*
* It doesn't make sense to pass the address of the array because that would
* be like passing char **. Tested it on a Cray and it seems to work either
* way, but it seems that it is correct without the "&".
*
memset(&item, 0, MAX_PATHLEN);
*/
memset(item, 0, MAX_PATHLEN);
memset(temp, 0, MAX_PATHLEN);
/* determine the item number to stat */
if (random) {
item_num = o.rand_array[i];
} else {
item_num = i;
}
/* make adjustments if in leaf only mode*/
if (o.leaf_only) {
item_num += o.items_per_dir *
(o.num_dirs_in_tree - (uint64_t) pow( o.branch_factor, o.depth ));
}
/* create name of file/dir to stat */
if (dirs) {
if ( (i % ITEM_COUNT == 0) && (i != 0)) {
VERBOSE(3,5,"stat dir: "LLU"", i);
}
sprintf(item, "dir.%s"LLU"", o.stat_name, item_num);
} else {
if ( (i % ITEM_COUNT == 0) && (i != 0)) {
VERBOSE(3,5,"stat file: "LLU"", i);
}
sprintf(item, "file.%s"LLU"", o.stat_name, item_num);
}
/* determine the path to the file/dir to be stat'ed */
parent_dir = item_num / o.items_per_dir;
if (parent_dir > 0) { //item is not in tree's root directory
/* prepend parent directory to item's path */
sprintf(temp, "%s."LLU"/%s", o.base_tree_name, parent_dir, item);
strcpy(item, temp);
//still not at the tree's root dir
while (parent_dir > o.branch_factor) {
parent_dir = (uint64_t) ((parent_dir-1) / o.branch_factor);
sprintf(temp, "%s."LLU"/%s", o.base_tree_name, parent_dir, item);
strcpy(item, temp);
}
}
/* Now get item to have the full path */
sprintf( temp, "%s/%s", path, item );
strcpy( item, temp );
/* below temp used to be hiername */
VERBOSE(3,5,"mdtest_stat %4s: %s", (dirs ? "dir" : "file"), item);
if (-1 == o.backend->stat (item, &buf, o.backend_options)) {
EWARNF("unable to stat %s %s", dirs ? "directory" : "file", item);
}
}
}
/* reads all of the items created as specified by the input parameters */
void mdtest_read(int random, int dirs, const long dir_iter, char *path) {
uint64_t parent_dir, item_num = 0;
char item[MAX_PATHLEN], temp[MAX_PATHLEN];
aiori_fd_t *aiori_fh;
VERBOSE(1,-1,"Entering mdtest_read on %s", path );
char *read_buffer;
/* allocate read buffer */
if (o.read_bytes > 0) {
read_buffer = aligned_buffer_alloc(o.read_bytes, o.gpu_memory_flags);
memset(read_buffer, -1, o.read_bytes);
}
uint64_t stop_items = o.items;
if( o.directory_loops != 1 ){
stop_items = o.items_per_dir;
}
/* iterate over all of the item IDs */
for (uint64_t i = 0 ; i < stop_items ; ++i) {
/*
* It doesn't make sense to pass the address of the array because that would
* be like passing char **. Tested it on a Cray and it seems to work either
* way, but it seems that it is correct without the "&".
*
* NTH: Both are technically correct in C.
*
* memset(&item, 0, MAX_PATHLEN);
*/
memset(item, 0, MAX_PATHLEN);
memset(temp, 0, MAX_PATHLEN);
/* determine the item number to read */
if (random) {
item_num = o.rand_array[i];
} else {
item_num = i;
}
/* make adjustments if in leaf only mode*/
if (o.leaf_only) {
item_num += o.items_per_dir *
(o.num_dirs_in_tree - (uint64_t) pow (o.branch_factor, o.depth));
}
/* create name of file to read */
if (!dirs) {
if ((i%ITEM_COUNT == 0) && (i != 0)) {
VERBOSE(3,5,"read file: "LLU"", i);
}
sprintf(item, "file.%s"LLU"", o.read_name, item_num);
}
/* determine the path to the file/dir to be read */
parent_dir = item_num / o.items_per_dir;
if (parent_dir > 0) { //item is not in tree's root directory
/* prepend parent directory to item's path */
sprintf(temp, "%s."LLU"/%s", o.base_tree_name, parent_dir, item);
strcpy(item, temp);
/* still not at the tree's root dir */
while (parent_dir > o.branch_factor) {
parent_dir = (unsigned long long) ((parent_dir-1) / o.branch_factor);
sprintf(temp, "%s."LLU"/%s", o.base_tree_name, parent_dir, item);
strcpy(item, temp);
}
}
/* Now get item to have the full path */
sprintf( temp, "%s/%s", path, item );
strcpy( item, temp );
/* below temp used to be hiername */
VERBOSE(3,5,"mdtest_read file: %s", item);
o.hints.filePerProc = ! o.shared_file;
/* open file for reading */
aiori_fh = o.backend->open (item, O_RDONLY, o.backend_options);
if (NULL == aiori_fh) {
EWARNF("unable to open file %s", item);
continue;
}
/* read file */
if (o.read_bytes > 0) {
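/* overwrite the first byte so verification cannot pass on stale buffer contents */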
read_buffer[0] = 42;
if (o.read_bytes != (size_t) o.backend->xfer(READ, aiori_fh, (IOR_size_t *) read_buffer, o.read_bytes, 0, o.backend_options)) {
EWARNF("unable to read file %s", item);
continue;
}
int pretend_rank = (2 * o.nstride + rank) % o.size;
if(o.verify_read){
if (o.shared_file) {
pretend_rank = rank;
}
int error = verify_memory_pattern(item_num, read_buffer, o.read_bytes, o.random_buffer_offset, pretend_rank, o.dataPacketType);
o.verification_error += error;
if(error){
VERBOSE(1,1,"verification error in file: %s", item);
}
}
}
/* close file */
o.backend->close (aiori_fh, o.backend_options);
}
if(o.read_bytes){
aligned_buffer_free(read_buffer, o.gpu_memory_flags);
}
}
/* This method should be called by rank 0. It subsequently does all of
the creates and removes for the other ranks */
void collective_create_remove(const int create, const int dirs, const int ntasks, const char *path, rank_progress_t * progress) {
char temp[MAX_PATHLEN];
VERBOSE(1,-1,"Entering collective_create_remove on %s", path );
/* rank 0 does all of the creates and removes for all of the ranks */
for (int i = 0 ; i < ntasks ; ++i) {
memset(temp, 0, MAX_PATHLEN);
strcpy(temp, o.testdir);
strcat(temp, "/");
/* set the base tree name appropriately */
if (o.unique_dir_per_task) {
sprintf(o.base_tree_name, "mdtest_tree.%d", i);
} else {
sprintf(o.base_tree_name, "mdtest_tree");
}
/* Setup to do I/O to the appropriate test dir */
strcat(temp, o.base_tree_name);
strcat(temp, ".0");
/* set all item names appropriately */
if (! o.shared_file) {
sprintf(o.mk_name, "mdtest.%d.", (i+(0*o.nstride))%ntasks);
sprintf(o.stat_name, "mdtest.%d.", (i+(1*o.nstride))%ntasks);
sprintf(o.read_name, "mdtest.%d.", (i+(2*o.nstride))%ntasks);
sprintf(o.rm_name, "mdtest.%d.", (i+(3*o.nstride))%ntasks);
}
if (o.unique_dir_per_task) {
VERBOSE(3,5,"i %d nstride %d ntasks %d", i, o.nstride, ntasks);
sprintf(o.unique_mk_dir, "%s/mdtest_tree.%d.0", o.testdir,
(i+(0*o.nstride))%ntasks);
sprintf(o.unique_chdir_dir, "%s/mdtest_tree.%d.0", o.testdir,
(i+(1*o.nstride))%ntasks);
sprintf(o.unique_stat_dir, "%s/mdtest_tree.%d.0", o.testdir,
(i+(2*o.nstride))%ntasks);
sprintf(o.unique_read_dir, "%s/mdtest_tree.%d.0", o.testdir,
(i+(3*o.nstride))%ntasks);
sprintf(o.unique_rm_dir, "%s/mdtest_tree.%d.0", o.testdir,
(i+(4*o.nstride))%ntasks);
sprintf(o.unique_rm_uni_dir, "%s", o.testdir);
}
/* Now that everything is set up as it should be, do the create or remove */
VERBOSE(3,5,"collective_create_remove (create_remove_items): temp is '%s'", temp);
create_remove_items(0, dirs, create, 1, temp, 0, progress);
}
/* reset all of the item names */
if (o.unique_dir_per_task) {
sprintf(o.base_tree_name, "mdtest_tree.0");
} else {
sprintf(o.base_tree_name, "mdtest_tree");
}
if (! o.shared_file) {
sprintf(o.mk_name, "mdtest.%d.", (0+(0*o.nstride))%ntasks);
sprintf(o.stat_name, "mdtest.%d.", (0+(1*o.nstride))%ntasks);
sprintf(o.read_name, "mdtest.%d.", (0+(2*o.nstride))%ntasks);
sprintf(o.rm_name, "mdtest.%d.", (0+(3*o.nstride))%ntasks);
}
if (o.unique_dir_per_task) {
sprintf(o.unique_mk_dir, "%s/mdtest_tree.%d.0", o.testdir,
(0+(0*o.nstride))%ntasks);
sprintf(o.unique_chdir_dir, "%s/mdtest_tree.%d.0", o.testdir,
(0+(1*o.nstride))%ntasks);
sprintf(o.unique_stat_dir, "%s/mdtest_tree.%d.0", o.testdir,
(0+(2*o.nstride))%ntasks);
sprintf(o.unique_read_dir, "%s/mdtest_tree.%d.0", o.testdir,
(0+(3*o.nstride))%ntasks);
sprintf(o.unique_rm_dir, "%s/mdtest_tree.%d.0", o.testdir,
(0+(4*o.nstride))%ntasks);
sprintf(o.unique_rm_uni_dir, "%s", o.testdir);
}
}
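/* Exercise rename(): each item is renamed onto the name of the previously processed item,
with the first item temporarily moved to a "-XX" name that is reused for the last rename. */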
void rename_dir_test(const int dirs, const long dir_iter, const char *path, rank_progress_t * progress) {
uint64_t parent_dir, item_num = 0;
char item[MAX_PATHLEN], temp[MAX_PATHLEN];
char item_last[MAX_PATHLEN];
if(o.backend->rename == NULL){
WARN("Backend doesn't support rename\n");
return;
}
VERBOSE(1,-1,"Entering mdtest_rename on %s", path );
uint64_t stop_items = o.items;
if( o.directory_loops != 1 ){
stop_items = o.items_per_dir;
}
if(stop_items == 1) return;
/* iterate over all of the item IDs */
char first_item_name[MAX_PATHLEN];
for (uint64_t i = 0 ; i < stop_items; ++i) {
item_num = i;
/* make adjustments if in leaf only mode*/
if (o.leaf_only) {
item_num += o.items_per_dir * (o.num_dirs_in_tree - (uint64_t) pow( o.branch_factor, o.depth ));
}
/* create name of file/dir to stat */
if (dirs) {
sprintf(item, "dir.%s"LLU"", o.stat_name, item_num);
} else {
sprintf(item, "file.%s"LLU"", o.stat_name, item_num);
}
/* determine the path to the file/dir to be stat'ed */
parent_dir = item_num / o.items_per_dir;
if (parent_dir > 0) { //item is not in tree's root directory
/* prepend parent directory to item's path */
sprintf(temp, "%s."LLU"/%s", o.base_tree_name, parent_dir, item);
strcpy(item, temp);
//still not at the tree's root dir
while (parent_dir > o.branch_factor) {
parent_dir = (uint64_t) ((parent_dir-1) / o.branch_factor);
sprintf(temp, "%s."LLU"/%s", o.base_tree_name, parent_dir, item);
strcpy(item, temp);
}
}
/* Now get item to have the full path */
sprintf( temp, "%s/%s", path, item );
strcpy( item, temp );
VERBOSE(3,5,"mdtest_rename %4s: %s", (dirs ? "dir" : "file"), item);
if(i == 0){
sprintf(first_item_name, "%s-XX", item);
strcpy(item_last, first_item_name);
}else if(i == stop_items - 1){
strcpy(item, first_item_name);
}
if (-1 == o.backend->rename(item, item_last, o.backend_options)) {
EWARNF("unable to rename %s %s", dirs ? "directory" : "file", item);
}
strcpy(item_last, item);
}
}
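/* Record elapsed time, time before the closing barrier, rate, and item count for one test
phase in the per-iteration result table. */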
static void updateResult(mdtest_results_t * res, mdtest_test_num_t test, uint64_t item_count, double t_start, double t_end, double t_end_before_barrier){
res->time[test] = t_end - t_start;
if(isfinite(t_end_before_barrier)){
res->time_before_barrier[test] = t_end_before_barrier - t_start;
}else{
res->time_before_barrier[test] = res->time[test];
}
if(item_count == 0){
res->rate[test] = 0.0;
res->rate_before_barrier[test] = 0.0;
}else{
res->rate[test] = item_count/res->time[test];
res->rate_before_barrier[test] = item_count/res->time_before_barrier[test];
}
res->items[test] = item_count;
res->stonewall_last_item[test] = o.items;
}
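/* fsync the directory "<testdir>/<path>" so pending metadata updates are flushed before the
phase timing ends; failures are reported but not fatal. */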
static void sync_dir(const char *testdir, const char *path) {
char temp_path[MAX_PATHLEN];
sprintf(temp_path, "%s/%s", testdir, path);
int fd = open(temp_path, O_RDONLY);
if (fd >= 0) {
fsync(fd);
close(fd);
} else {
printf("failed to open %s for fsync: %s\n", temp_path, strerror(errno));
}
}
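/* Run the directory test phases (create, stat, read [currently a no-op], rename, remove)
for one iteration and record the per-phase results. */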
void directory_test(const int iteration, const int ntasks, const char *path, rank_progress_t * progress) {
int size;
double t_start, t_end, t_end_before_barrier;
char temp_path[MAX_PATHLEN];
mdtest_results_t * res = & o.summary_table[iteration];
MPI_Comm_size(testComm, &size);
VERBOSE(1,-1,"Entering directory_test on %s", path );
MPI_Barrier(testComm);
/* create phase */
if(o.create_only) {
phase_prepare();
t_start = GetTimeStamp();
progress->stone_wall_timer_seconds = o.stone_wall_timer_seconds;
progress->items_done = 0;
progress->start_time = GetTimeStamp();
for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){
prep_testdir(iteration, dir_iter);
if (o.unique_dir_per_task) {
unique_dir_access(MK_UNI_DIR, temp_path);
if (! o.time_unique_dir_overhead) {
t_start = GetTimeStamp();
}
} else {
sprintf( temp_path, "%s/%s", o.testdir, path );
}
VERBOSE(3,-1,"directory_test: create path is '%s'", temp_path );
/* "touch" the files */
if (o.collective_creates) {
if (rank == 0) {
collective_create_remove(1, 1, ntasks, temp_path, progress);
}
} else {
/* create directories */
create_remove_items(0, 1, 1, 0, temp_path, 0, progress);
}
}
sync_dir(o.testdir, path);
progress->stone_wall_timer_seconds = 0;
t_end_before_barrier = GetTimeStamp();
phase_end();
t_end = GetTimeStamp();
updateResult(res, MDTEST_DIR_CREATE_NUM, o.items, t_start, t_end, t_end_before_barrier);
}
/* stat phase */
if (o.stat_only) {
phase_prepare();
t_start = GetTimeStamp();
for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){
prep_testdir(iteration, dir_iter);
if (o.unique_dir_per_task) {
unique_dir_access(STAT_SUB_DIR, temp_path);
if (! o.time_unique_dir_overhead) {
t_start = GetTimeStamp();
}
} else {
sprintf( temp_path, "%s/%s", o.testdir, path );
}
VERBOSE(3,5,"stat path is '%s'", temp_path );
/* stat directories */
if (o.random_seed > 0) {
mdtest_stat(1, 1, dir_iter, temp_path, progress);
} else {
mdtest_stat(0, 1, dir_iter, temp_path, progress);
}
}
sync_dir(o.testdir, path);
t_end_before_barrier = GetTimeStamp();
phase_end();
t_end = GetTimeStamp();
updateResult(res, MDTEST_DIR_STAT_NUM, o.items, t_start, t_end, t_end_before_barrier);
}
/* read phase */
if (o.read_only) {
phase_prepare();
t_start = GetTimeStamp();
for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){
prep_testdir(iteration, dir_iter);
if (o.unique_dir_per_task) {
unique_dir_access(READ_SUB_DIR, temp_path);
if (! o.time_unique_dir_overhead) {
t_start = GetTimeStamp();
}
} else {
sprintf( temp_path, "%s/%s", o.testdir, path );
}
VERBOSE(3,5,"directory_test: read path is '%s'", temp_path );
/* read directories */
if (o.random_seed > 0) {
; /* N/A */
} else {
; /* N/A */
}
}
sync_dir(o.testdir, path);
t_end_before_barrier = GetTimeStamp();
phase_end();
t_end = GetTimeStamp();
updateResult(res, MDTEST_DIR_READ_NUM, o.items, t_start, t_end, t_end_before_barrier);
}
/* rename phase */
if(o.rename_dirs && o.items > 1){
phase_prepare();
t_start = GetTimeStamp();
for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){
prep_testdir(iteration, dir_iter);
if (o.unique_dir_per_task) {
unique_dir_access(STAT_SUB_DIR, temp_path);
if (! o.time_unique_dir_overhead) {
t_start = GetTimeStamp();
}
} else {
sprintf( temp_path, "%s/%s", o.testdir, path );
}
VERBOSE(3,5,"rename path is '%s'", temp_path );
rename_dir_test(1, dir_iter, temp_path, progress);
}
sync_dir(o.testdir, path);
t_end_before_barrier = GetTimeStamp();
phase_end();
t_end = GetTimeStamp();
updateResult(res, MDTEST_DIR_RENAME_NUM, o.items, t_start, t_end, t_end_before_barrier);
}
/* remove phase */
if (o.remove_only) {
phase_prepare();
t_start = GetTimeStamp();
for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){
prep_testdir(iteration, dir_iter);
if (o.unique_dir_per_task) {
unique_dir_access(RM_SUB_DIR, temp_path);
if (!o.time_unique_dir_overhead) {
t_start = GetTimeStamp();
}
} else {
sprintf( temp_path, "%s/%s", o.testdir, path );
}
VERBOSE(3,5,"directory_test: remove directories path is '%s'", temp_path );
/* remove directories */
if (o.collective_creates) {
if (rank == 0) {
collective_create_remove(0, 1, ntasks, temp_path, progress);
}
} else {
create_remove_items(0, 1, 0, 0, temp_path, 0, progress);
}
}
sync_dir(o.testdir, path);
t_end_before_barrier = GetTimeStamp();
phase_end();
t_end = GetTimeStamp();
updateResult(res, MDTEST_DIR_REMOVE_NUM, o.items, t_start, t_end, t_end_before_barrier);
}
if (o.remove_only) {
if (o.unique_dir_per_task) {
unique_dir_access(RM_UNI_DIR, temp_path);
} else {
sprintf( temp_path, "%s/%s", o.testdir, path );
}
VERBOSE(3,5,"directory_test: remove unique directories path is '%s'\n", temp_path );
}
VERBOSE(1,-1," Directory creation: %14.3f sec, %14.3f ops/sec", res->time[MDTEST_DIR_CREATE_NUM], o.summary_table[iteration].rate[MDTEST_DIR_CREATE_NUM]);
VERBOSE(1,-1," Directory stat : %14.3f sec, %14.3f ops/sec", res->time[MDTEST_DIR_STAT_NUM], o.summary_table[iteration].rate[MDTEST_DIR_STAT_NUM]);
VERBOSE(1,-1," Directory rename : %14.3f sec, %14.3f ops/sec", res->time[MDTEST_DIR_RENAME_NUM], o.summary_table[iteration].rate[MDTEST_DIR_RENAME_NUM]);
VERBOSE(1,-1," Directory removal : %14.3f sec, %14.3f ops/sec", res->time[MDTEST_DIR_REMOVE_NUM], o.summary_table[iteration].rate[MDTEST_DIR_REMOVE_NUM]);
}
/* Returns 1 if the stonewall was hit, 0 otherwise */
int updateStoneWallIterations(int iteration, uint64_t items_done, double tstart, uint64_t * out_max_iter){
int hit = 0;
long long unsigned max_iter = 0;
VERBOSE(1,1,"stonewall hit with %lld items", (long long) items_done );
MPI_Allreduce(& items_done, & max_iter, 1, MPI_LONG_LONG_INT, MPI_MAX, testComm);
o.summary_table[iteration].stonewall_time[MDTEST_FILE_CREATE_NUM] = GetTimeStamp() - tstart;
o.summary_table[iteration].stonewall_last_item[MDTEST_FILE_CREATE_NUM] = items_done;
*out_max_iter = max_iter;
// continue to the maximum...
long long min_accessed = 0;
MPI_Reduce(& items_done, & min_accessed, 1, MPI_LONG_LONG_INT, MPI_MIN, 0, testComm);
long long sum_accessed = 0;
MPI_Reduce(& items_done, & sum_accessed, 1, MPI_LONG_LONG_INT, MPI_SUM, 0, testComm);
o.summary_table[iteration].stonewall_item_sum[MDTEST_FILE_CREATE_NUM] = sum_accessed;
o.summary_table[iteration].stonewall_item_min[MDTEST_FILE_CREATE_NUM] = min_accessed * o.size;
if(o.items != (sum_accessed / o.size)){
VERBOSE(0,-1, "Continue stonewall hit min: %lld max: %lld avg: %.1f \n", min_accessed, max_iter, ((double) sum_accessed) / o.size);
hit = 1;
}
return hit;
}
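/* Create-phase helper for file_test: creates the files in each directory loop and, when a
stonewall timer is set, synchronizes the item counts across ranks so all ranks finish at
the same maximum count. */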
void file_test_create(const int iteration, const int ntasks, const char *path, rank_progress_t * progress, double *t_start){
char temp_path[MAX_PATHLEN];
for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){
prep_testdir(iteration, dir_iter);
if (o.unique_dir_per_task) {
unique_dir_access(MK_UNI_DIR, temp_path);
VERBOSE(5,5,"operating on %s", temp_path);
if (! o.time_unique_dir_overhead) {
*t_start = GetTimeStamp();
}
} else {
sprintf( temp_path, "%s/%s", o.testdir, path );
}
VERBOSE(3,-1,"file_test: create path is '%s'", temp_path );
/* "touch" the files */
if (o.collective_creates) {
if (rank == 0) {
collective_create_remove(1, 0, ntasks, temp_path, progress);
}
MPI_Barrier(testComm);
}
/* create files */
create_remove_items(0, 0, 1, 0, temp_path, 0, progress);
if(o.stone_wall_timer_seconds){
// hit the stonewall
uint64_t max_iter = 0;
uint64_t items_done = progress->items_done + dir_iter * o.items_per_dir;
int hit = updateStoneWallIterations(iteration, items_done, *t_start, & max_iter);
progress->items_start = items_done;
progress->items_per_dir = max_iter;
if (hit){
progress->stone_wall_timer_seconds = 0;
VERBOSE(1,1,"stonewall: %lld of %lld", (long long) progress->items_start, (long long) progress->items_per_dir);
create_remove_items(0, 0, 1, 0, temp_path, 0, progress);
// now reset the values
progress->stone_wall_timer_seconds = o.stone_wall_timer_seconds;
o.items = progress->items_done;
}
if (o.stoneWallingStatusFile){
StoreStoneWallingIterations(o.stoneWallingStatusFile, max_iter);
}
// reset stone wall timer to allow proper cleanup
progress->stone_wall_timer_seconds = 0;
// at the moment, stonewall can be done only with one directory_loop, so we can return here safely
break;
}
}
}
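/* Run the file test phases (create, stat, read, remove) for one iteration and record the
per-phase results. */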
void file_test(const int iteration, const int ntasks, const char *path, rank_progress_t * progress) {
int size;
double t_start, t_end, t_end_before_barrier;
char temp_path[MAX_PATHLEN];
mdtest_results_t * res = & o.summary_table[iteration];
MPI_Comm_size(testComm, &size);
VERBOSE(3,5,"Entering file_test on %s", path);
MPI_Barrier(testComm);
/* create phase */
if (o.create_only ) {
phase_prepare();
t_start = GetTimeStamp();
progress->stone_wall_timer_seconds = o.stone_wall_timer_seconds;
progress->items_done = 0;
progress->start_time = GetTimeStamp();
file_test_create(iteration, ntasks, path, progress, &t_start);
sync_dir(o.testdir, path);
t_end_before_barrier = GetTimeStamp();
phase_end();
t_end = GetTimeStamp();
updateResult(res, MDTEST_FILE_CREATE_NUM, o.items, t_start, t_end, t_end_before_barrier);
}else{
if (o.stoneWallingStatusFile){
int64_t expected_items;
/* The number of items depends on the stonewalling file */
expected_items = ReadStoneWallingIterations(o.stoneWallingStatusFile, testComm);
if(expected_items >= 0){
if(o.directory_loops > 1){
o.directory_loops = expected_items / o.items_per_dir;
o.items = o.items_per_dir;
}else{
o.items = expected_items;
progress->items_per_dir = o.items;
}
}
if (rank == 0) {
if(expected_items == -1){
WARN("Could not read stonewall status file");
}else {
VERBOSE(1,1, "Read stonewall status; items: "LLU"\n", o.items);
}
}
}
}
/* stat phase */
if (o.stat_only ) {
phase_prepare();
t_start = GetTimeStamp();
for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){
prep_testdir(iteration, dir_iter);
if (o.unique_dir_per_task) {
unique_dir_access(STAT_SUB_DIR, temp_path);
if (!o.time_unique_dir_overhead) {
t_start = GetTimeStamp();
}
} else {
sprintf( temp_path, "%s/%s", o.testdir, path );
}
VERBOSE(3,5,"file_test: stat path is '%s'", temp_path );
/* stat files */
mdtest_stat((o.random_seed > 0 ? 1 : 0), 0, dir_iter, temp_path, progress);
}
sync_dir(o.testdir, path);
t_end_before_barrier = GetTimeStamp();
phase_end();
t_end = GetTimeStamp();
updateResult(res, MDTEST_FILE_STAT_NUM, o.items, t_start, t_end, t_end_before_barrier);
}
/* read phase */
if (o.read_only ) {
phase_prepare();
t_start = GetTimeStamp();
for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){
prep_testdir(iteration, dir_iter);
if (o.unique_dir_per_task) {
unique_dir_access(READ_SUB_DIR, temp_path);
if (! o.time_unique_dir_overhead) {
t_start = GetTimeStamp();
}
} else {
sprintf( temp_path, "%s/%s", o.testdir, path );
}
VERBOSE(3,5,"file_test: read path is '%s'", temp_path );
/* read files */
if (o.random_seed > 0) {
mdtest_read(1,0, dir_iter, temp_path);
} else {
mdtest_read(0,0, dir_iter, temp_path);
}
}
sync_dir(o.testdir, path);
t_end_before_barrier = GetTimeStamp();
phase_end();
t_end = GetTimeStamp();
updateResult(res, MDTEST_FILE_READ_NUM, o.items, t_start, t_end, t_end_before_barrier);
}
/* remove phase */
if (o.remove_only) {
phase_prepare();
t_start = GetTimeStamp();
progress->items_start = 0;
for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){
prep_testdir(iteration, dir_iter);
if (o.unique_dir_per_task) {
unique_dir_access(RM_SUB_DIR, temp_path);
if (! o.time_unique_dir_overhead) {
t_start = GetTimeStamp();
}
} else {
sprintf( temp_path, "%s/%s", o.testdir, path );
}
VERBOSE(3,5,"file_test: rm directories path is '%s'", temp_path );
if (o.collective_creates) {
if (rank == 0) {
collective_create_remove(0, 0, ntasks, temp_path, progress);
}
} else {
VERBOSE(3,5,"gonna create %s", temp_path);
create_remove_items(0, 0, 0, 0, temp_path, 0, progress);
}
}
sync_dir(o.testdir, path);
t_end_before_barrier = GetTimeStamp();
phase_end();
t_end = GetTimeStamp();
updateResult(res, MDTEST_FILE_REMOVE_NUM, o.items, t_start, t_end, t_end_before_barrier);
}
if (o.remove_only) {
if (o.unique_dir_per_task) {
unique_dir_access(RM_UNI_DIR, temp_path);
} else {
strcpy( temp_path, path );
}
VERBOSE(3,5,"file_test: rm unique directories path is '%s'", temp_path );
}
if(o.num_dirs_in_tree_calc){ /* this is a temporary fix needed when using -n and -i together */
o.items *= o.num_dirs_in_tree_calc;
}
VERBOSE(1,-1," File creation : %14.3f sec, %14.3f ops/sec", res->time[MDTEST_FILE_CREATE_NUM], o.summary_table[iteration].rate[MDTEST_FILE_CREATE_NUM]);
if(o.summary_table[iteration].stonewall_time[MDTEST_FILE_CREATE_NUM]){
VERBOSE(1,-1," File creation (stonewall): %14.3f sec, %14.3f ops/sec", o.summary_table[iteration].stonewall_time[MDTEST_FILE_CREATE_NUM], o.summary_table[iteration].stonewall_item_sum[MDTEST_FILE_CREATE_NUM]);
}
VERBOSE(1,-1," File stat : %14.3f sec, %14.3f ops/sec", res->time[MDTEST_FILE_STAT_NUM], o.summary_table[iteration].rate[MDTEST_FILE_STAT_NUM]);
VERBOSE(1,-1," File read : %14.3f sec, %14.3f ops/sec", res->time[MDTEST_FILE_READ_NUM], o.summary_table[iteration].rate[MDTEST_FILE_READ_NUM]);
VERBOSE(1,-1," File removal : %14.3f sec, %14.3f ops/sec", res->time[MDTEST_FILE_REMOVE_NUM], o.summary_table[iteration].rate[MDTEST_FILE_REMOVE_NUM]);
}
char const * mdtest_test_name(int i){
switch (i) {
case MDTEST_DIR_CREATE_NUM: return "Directory creation";
case MDTEST_DIR_STAT_NUM: return "Directory stat";
case MDTEST_DIR_READ_NUM: return "Directory read";
case MDTEST_DIR_REMOVE_NUM: return "Directory removal";
case MDTEST_DIR_RENAME_NUM: return "Directory rename";
case MDTEST_FILE_CREATE_NUM: return "File creation";
case MDTEST_FILE_STAT_NUM: return "File stat";
case MDTEST_FILE_READ_NUM: return "File read";
case MDTEST_FILE_REMOVE_NUM: return "File removal";
case MDTEST_TREE_CREATE_NUM: return "Tree creation";
case MDTEST_TREE_REMOVE_NUM: return "Tree removal";
default: return "ERR INVALID TESTNAME :";
}
return NULL;
}
/*
* Store the results of each process in a file
*/
static void StoreRankInformation(int iterations, mdtest_results_t * agg){
const size_t size = sizeof(mdtest_results_t) * iterations;
if(rank == 0){
FILE* fd = fopen(o.saveRankDetailsCSV, "a");
if (fd == NULL){
FAIL("Cannot open saveRankPerformanceDetails file for writes!");
}
mdtest_results_t * results = safeMalloc(size * o.size);
MPI_Gather(o.summary_table, size / sizeof(double), MPI_DOUBLE, results, size / sizeof(double), MPI_DOUBLE, 0, testComm);
char buff[4096];
char * cpos = buff;
cpos += sprintf(cpos, "all,%llu", (long long unsigned) o.items);
for(int e = 0; e < MDTEST_LAST_NUM; e++){
if(agg->items[e] == 0){
cpos += sprintf(cpos, ",,");
}else{
cpos += sprintf(cpos, ",%.10e,%.10e", agg->items[e] / agg->time[e], agg->time[e]);
}
}
cpos += sprintf(cpos, "\n");
int ret = fwrite(buff, cpos - buff, 1, fd);
for(int iter = 0; iter < iterations; iter++){
for(int i=0; i < o.size; i++){
mdtest_results_t * cur = & results[i * iterations + iter];
cpos = buff;
cpos += sprintf(cpos, "%d,", i);
for(int e = 0; e < MDTEST_TREE_CREATE_NUM; e++){
if(cur->items[e] == 0){
cpos += sprintf(cpos, ",,");
}else{
cpos += sprintf(cpos, ",%.10e,%.10e", cur->items[e] / cur->time_before_barrier[e], cur->time_before_barrier[e]);
}
}
cpos += sprintf(cpos, "\n");
ret = fwrite(buff, cpos - buff, 1, fd);
if(ret != 1){
WARN("Couln't append to saveRankPerformanceDetailsCSV file\n");
break;
}
}
}
fclose(fd);
free(results);
}else{
/* this is a hack for now assuming all datatypes in the structure are double */
MPI_Gather(o.summary_table, size / sizeof(double), MPI_DOUBLE, NULL, size / sizeof(double), MPI_DOUBLE, 0, testComm);
}
}
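/* Return the result entry for process 'proc' and iteration 'iter' from the gathered
results array (laid out with one block of iterations per process). */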
static mdtest_results_t* get_result_index(mdtest_results_t* all_results, int proc, int iter, int iteration_count){
return & all_results[proc * iteration_count + iter];
}
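/* Print the per-process results (if requested) and the summary statistics (max/min/mean/
std dev) for each operation, either as rates or as times depending on print_time. */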
static void summarize_results_rank0(int iterations, mdtest_results_t * all_results, int print_time) {
int start, stop;
double min, max, mean, sd, sum, var, curr = 0;
double imin, imax, imean, isum, icur; // calculation per iteration
char const * access;
/* if files-only access, skip the directory test entries */
if (o.files_only && ! o.dirs_only) {
start = MDTEST_FILE_CREATE_NUM;
} else {
start = 0;
}
/* if directories-only access, skip the file test entries */
if (o.dirs_only && !o.files_only) {
stop = MDTEST_FILE_CREATE_NUM;
} else {
stop = MDTEST_TREE_CREATE_NUM;
}
/* special case: if no directory or file tests, skip all */
if (!o.dirs_only && !o.files_only) {
start = stop = 0;
}
if(o.print_all_proc){
fprintf(out_logfile, "\nPer process result (%s):\n", print_time ? "time" : "rate");
for (int j = 0; j < iterations; j++) {
fprintf(out_logfile, "iteration: %d\n", j);
for (int i = start; i < MDTEST_LAST_NUM; i++) {
access = mdtest_test_name(i);
if(access == NULL){
continue;
}
fprintf(out_logfile, "Test %s", access);
for (int k=0; k < o.size; k++) {
mdtest_results_t * cur = get_result_index(all_results, k, j, iterations);
if(print_time){
curr = cur->time_before_barrier[i];
}else{
curr = cur->rate_before_barrier[i];
}
fprintf(out_logfile, "%c%e", (k==0 ? ' ': ','), curr);
}
fprintf(out_logfile, "\n");
}
}
}
VERBOSE(0, -1, "\nSUMMARY %s: (of %d iterations)", print_time ? "time" : "rate", iterations);
PRINT(" Operation ");
if(o.show_perrank_statistics){
PRINT("per Rank: Max Min Mean per Iteration:");
}else{
PRINT(" ");
}
PRINT(" Max Min Mean Std Dev\n");
PRINT(" --------- ");
if(o.show_perrank_statistics){
PRINT(" --- --- ---- ");
}
PRINT(" --- --- ---- -------\n");
for (int i = start; i < stop; i++) {
min = 1e308;
max = 0;
sum = var = 0;
imin = 1e308;
isum = imax = 0;
double iter_result[iterations];
for (int j = 0; j < iterations; j++) {
icur = print_time ? 0 : 1e308;
for (int k = 0; k < o.size; k++) {
mdtest_results_t * cur = get_result_index(all_results, k, j, iterations);
if(print_time){
curr = cur->time_before_barrier[i];
}else{
curr = cur->rate_before_barrier[i];
}
if (min > curr) {
min = curr;
}
if (max < curr) {
max = curr;
}
sum += curr;
if (print_time) {
curr = cur->time[i];
if (icur < curr) {
icur = curr;
}
} else {
curr = cur->rate[i];
if (icur > curr) {
icur = curr;
}
}
}
if (icur > imax) {
imax = icur;
}
if (icur < imin) {
imin = icur;
}
isum += icur;
if(print_time){
iter_result[j] = icur;
}else{
iter_result[j] = icur * o.size;
}
}
mean = sum / iterations / o.size;
imean = isum / iterations;
if(! print_time){
imax *= o.size;
imin *= o.size;
isum *= o.size;
imean *= o.size;
}
for (int j = 0; j < iterations; j++) {
var += (imean - iter_result[j]) * (imean - iter_result[j]);
}
var = var / (iterations - 1);
sd = sqrt(var);
access = mdtest_test_name(i);
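/* entry 2 corresponds to the directory read test, which is left as N/A, so skip its summary row */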
if (i != 2) {
fprintf(out_logfile, " %-18s ", access);
if(o.show_perrank_statistics){
fprintf(out_logfile, "%14.3f ", max);
fprintf(out_logfile, "%14.3f ", min);
fprintf(out_logfile, "%14.3f ", mean);
fprintf(out_logfile, " ");
}
fprintf(out_logfile, " ");
fprintf(out_logfile, "%14.3f ", imax);
fprintf(out_logfile, "%14.3f ", imin);
fprintf(out_logfile, "%14.3f ", imean);
fprintf(out_logfile, "%14.3f\n", iterations == 1 ? 0 : sd);
fflush(out_logfile);
}
}
/* calculate tree create/remove rates, applies only to Rank 0 */
for (int i = MDTEST_TREE_CREATE_NUM; i < MDTEST_LAST_NUM; i++) {
min = imin = 1e308;
max = imax = 0;
sum = var = 0;
for (int j = 0; j < iterations; j++) {
if(print_time){
curr = o.summary_table[j].time[i];
}else{
curr = o.summary_table[j].rate[i];
}
if (min > curr) {
min = curr;
}
if (max < curr) {
max = curr;
}
sum += curr;
if(curr > imax){
imax = curr;
}
if(curr < imin){
imin = curr;
}
}
mean = sum / (iterations);
for (int j = 0; j < iterations; j++) {
if(print_time){
curr = o.summary_table[j].time[i];
}else{
curr = o.summary_table[j].rate[i];
}
var += (mean - curr)*(mean - curr);
}
var = var / (iterations - 1);
sd = sqrt(var);
access = mdtest_test_name(i);
fprintf(out_logfile, " %-22s ", access);
if(o.show_perrank_statistics){
fprintf(out_logfile, "%14.3f ", max);
fprintf(out_logfile, "%14.3f ", min);
fprintf(out_logfile, "%14.3f ", mean);
fprintf(out_logfile, " ");
}
fprintf(out_logfile, "%14.3f ", imax);
fprintf(out_logfile, "%14.3f ", imin);
fprintf(out_logfile, "%14.3f ", sum / iterations);
fprintf(out_logfile, "%14.3f\n", iterations == 1 ? 0 : sd);
fflush(out_logfile);
}
}
/*
Output the results: gather the per-rank values, aggregate them on rank 0, print the summary
tables there, and broadcast the aggregated results so every rank can return them via the API
*/
void summarize_results(int iterations, mdtest_results_t * results) {
const size_t size = sizeof(mdtest_results_t) * iterations;
mdtest_results_t * all_results = NULL;
if(rank == 0){
all_results = safeMalloc(size * o.size);
memset(all_results, 0, size * o.size);
MPI_Gather(o.summary_table, size / sizeof(double), MPI_DOUBLE, all_results, size / sizeof(double), MPI_DOUBLE, 0, testComm);
// calculate the aggregated values for all processes
for(int j=0; j < iterations; j++){
for(int i=0; i < MDTEST_LAST_NUM; i++){
//double sum_rate = 0;
double max_time = 0;
double max_stonewall_time = 0;
uint64_t sum_items = 0;
// reduce over the processes
for(int p=0; p < o.size; p++){
mdtest_results_t * cur = get_result_index(all_results, p, j, iterations);
//sum_rate += all_results[p + j*p]->rate[i];
double t = cur->time[i];
max_time = max_time < t ? t : max_time;
sum_items += cur->items[i];
t = cur->stonewall_time[i];
max_stonewall_time = max_stonewall_time < t ? t : max_stonewall_time;
}
results[j].items[i] = sum_items;
results[j].time[i] = max_time;
results[j].stonewall_time[i] = max_stonewall_time;
if(sum_items == 0){
results[j].rate[i] = 0.0;
}else{
results[j].rate[i] = sum_items / max_time;
}
/* These results have already been reduced to Rank 0 */
results[j].stonewall_item_sum[i] = o.summary_table[j].stonewall_item_sum[i];
results[j].stonewall_item_min[i] = o.summary_table[j].stonewall_item_min[i];
results[j].stonewall_time[i] = o.summary_table[j].stonewall_time[i];
}
}
}else{
MPI_Gather(o.summary_table, size / sizeof(double), MPI_DOUBLE, NULL, size / sizeof(double), MPI_DOUBLE, 0, testComm);
}
/* share global results across processes as these are returned by the API */
MPI_Bcast(results, size / sizeof(double), MPI_DOUBLE, 0, testComm);
/* update relevant result values with local values as these are returned by the API */
for(int j=0; j < iterations; j++){
for(int i=0; i < MDTEST_LAST_NUM; i++){
results[j].time_before_barrier[i] = o.summary_table[j].time_before_barrier[i];
results[j].stonewall_last_item[i] = o.summary_table[j].stonewall_last_item[i];
}
}
if(rank != 0){
return;
}
if (o.print_rate_and_time){
summarize_results_rank0(iterations, all_results, 0);
summarize_results_rank0(iterations, all_results, 1);
}else{
summarize_results_rank0(iterations, all_results, o.print_time);
}
free(all_results);
}
/* Checks to see if the test setup is valid. If it isn't, fail. */
void md_validate_tests() {
if (o.stone_wall_timer_seconds > 0 && (o.branch_factor > 1 || ! o.barriers)) {
FAIL( "Error, the stonewall timer only works with a branch factor <= 1 (current is %d) and with barriers enabled\n", o.branch_factor);
}
if (!o.create_only && ! o.stat_only && ! o.read_only && !o.remove_only && !o.rename_dirs) {
o.create_only = o.stat_only = o.read_only = o.remove_only = o.rename_dirs = 1;
VERBOSE(1,-1,"main: Setting create/stat/read/remove_only to True" );
}
VERBOSE(1,-1,"Entering md_validate_tests..." );
/* if dirs_only and files_only were both left unset, set both now */
if (!o.dirs_only && !o.files_only) {
o.dirs_only = o.files_only = 1;
}
/* if shared file 'S' access, no directory tests */
if (o.shared_file) {
o.dirs_only = 0;
}
/* check for no barriers with shifting processes for different phases.
that is, one may not specify both -B and -N as it will introduce
race conditions that may cause errors stat'ing or deleting after
creates.
*/
if (( o.barriers == 0 ) && ( o.nstride != 0 ) && ( rank == 0 )) {
FAIL( "Possible race conditions will occur: -B not compatible with -N");
}
/* check for collective_creates incompatibilities */
if (o.shared_file && o.collective_creates && rank == 0) {
FAIL("-c not compatible with -S");
}
if (o.path_count > 1 && o.collective_creates && rank == 0) {
FAIL("-c not compatible with multiple test directories");
}
if (o.collective_creates && !o.barriers) {
FAIL("-c not compatible with -B");
}
/* check for shared file incompatibilities */
if (o.unique_dir_per_task && o.shared_file && rank == 0) {
FAIL("-u not compatible with -S");
}
/* check multiple directory paths and strided option */
if (o.path_count > 1 && o.nstride > 0) {
FAIL("cannot have multiple directory paths with -N strides between neighbor tasks");
}
/* check for shared directory and multiple directories incompatibility */
if (o.path_count > 1 && o.unique_dir_per_task != 1) {
FAIL("shared directory mode is not compatible with multiple directory paths");
}
/* check if more directory paths than ranks */
if (o.path_count > o.size) {
FAIL("cannot have more directory paths than MPI tasks");
}
/* check depth */
if (o.depth < 0) {
FAIL("depth must be greater than or equal to zero");
}
/* check branch_factor */
if (o.branch_factor < 1 && o.depth > 0) {
FAIL("branch factor must be greater than or equal to zero");
}
/* check for valid number of items */
if ((o.items > 0) && (o.items_per_dir > 0)) {
if(o.unique_dir_per_task){
FAIL("only specify the number of items or the number of items per directory");
}else if( o.items % o.items_per_dir != 0){
FAIL("items must be a multiple of items per directory");
}
}
/* check for using mknod */
if (o.write_bytes > 0 && o.make_node) {
FAIL("-k not compatible with -w");
}
if(o.verify_read && ! o.read_only)
FAIL("Verify read requires that the read test is used");
if(o.verify_read && o.read_bytes <= 0)
FAIL("Verify read requires that read bytes is > 0");
if(o.read_only && o.read_bytes <= 0)
WARN("Read bytes is 0, thus, a read test will actually just open/close");
if(o.create_only && o.read_only && o.read_bytes > o.write_bytes)
FAIL("When writing and reading files, read bytes must be smaller than write bytes");
if (rank == 0 && o.saveRankDetailsCSV){
// check that the file is writeable, truncate it and add header
FILE* fd = fopen(o.saveRankDetailsCSV, "w");
if (fd == NULL){
FAIL("Cannot open saveRankPerformanceDetails file for write!");
}
char * head = "rank,items";
int ret = fwrite(head, strlen(head), 1, fd);
for(int e = 0; e < MDTEST_LAST_NUM; e++){
char buf[1024];
const char * str = mdtest_test_name(e);
sprintf(buf, ",rate-%s,time-%s", str, str);
ret = fwrite(buf, strlen(buf), 1, fd);
if(ret != 1){
FAIL("Cannot write header to saveRankPerformanceDetails file");
}
}
fwrite("\n", 1, 1, fd);
fclose(fd);
}
}
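/* Print the size and inode usage of the file system that holds the test directory. */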
void show_file_system_size(char *file_system) {
char real_path[MAX_PATHLEN];
char file_system_unit_str[MAX_PATHLEN] = "GiB";
char inode_unit_str[MAX_PATHLEN] = "Mi";
int64_t file_system_unit_val = 1024 * 1024 * 1024;
int64_t inode_unit_val = 1024 * 1024;
int64_t total_file_system_size,
free_file_system_size,
total_inodes,
free_inodes;
double total_file_system_size_hr,
used_file_system_percentage,
used_inode_percentage;
ior_aiori_statfs_t stat_buf;
int ret;
VERBOSE(1,-1,"Entering show_file_system_size on %s", file_system );
ret = o.backend->statfs (file_system, &stat_buf, o.backend_options);
if (0 != ret) {
FAIL("unable to stat file system %s", file_system);
}
total_file_system_size = stat_buf.f_blocks * stat_buf.f_bsize;
free_file_system_size = stat_buf.f_bfree * stat_buf.f_bsize;
used_file_system_percentage = (1 - ((double)free_file_system_size
/ (double)total_file_system_size)) * 100;
total_file_system_size_hr = (double)total_file_system_size
/ (double)file_system_unit_val;
if (total_file_system_size_hr > 1024) {
total_file_system_size_hr = total_file_system_size_hr / 1024;
strcpy(file_system_unit_str, "TiB");
}
/* inodes */
total_inodes = stat_buf.f_files;
free_inodes = stat_buf.f_ffree;
used_inode_percentage = (1 - ((double)free_inodes/(double)total_inodes))
* 100;
if (realpath(file_system, real_path) == NULL) {
WARN("unable to use realpath() on file system");
}
/* show results */
VERBOSE(0,-1,"Path: %s", real_path);
VERBOSE(0,-1,"FS: %.1f %s Used FS: %2.1f%% Inodes: %.1f %s Used Inodes: %2.1f%%\n",
total_file_system_size_hr, file_system_unit_str, used_file_system_percentage,
(double)total_inodes / (double)inode_unit_val, inode_unit_str, used_inode_percentage);
return;
}
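/* Recursively create (or remove) the directory tree of depth o.depth and branching factor
o.branch_factor rooted below path. */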
void create_remove_directory_tree(int create,
int currDepth, char* path, int dirNum, rank_progress_t * progress) {
unsigned i;
char dir[MAX_PATHLEN];
VERBOSE(1,5,"Entering create_remove_directory_tree on %s, currDepth = %d...", path, currDepth );
if (currDepth == 0) {
sprintf(dir, "%s/%s.%d/", path, o.base_tree_name, dirNum);
if (create) {
VERBOSE(2,5,"Making directory '%s'", dir);
if (-1 == o.backend->mkdir (dir, DIRMODE, o.backend_options)) {
EWARNF("unable to create tree directory '%s'", dir);
}
#ifdef HAVE_LUSTRE_LUSTREAPI
/* internal node for branching, can be non-striped for children */
if (o.global_dir_layout && \
llapi_dir_set_default_lmv_stripe(dir, -1, 0,
LMV_HASH_TYPE_FNV_1A_64,
NULL) == -1) {
FAIL("Unable to reset to global default directory layout");
}
#endif /* HAVE_LUSTRE_LUSTREAPI */
}
create_remove_directory_tree(create, ++currDepth, dir, ++dirNum, progress);
if (!create) {
VERBOSE(2,5,"Remove directory '%s'", dir);
if (-1 == o.backend->rmdir(dir, o.backend_options)) {
EWARNF("Unable to remove directory %s", dir);
}
}
} else if (currDepth <= o.depth) {
char temp_path[MAX_PATHLEN];
strcpy(temp_path, path);
int currDir = dirNum;
for (i=0; i < o.branch_factor; i++) {
sprintf(dir, "%s.%d/", o.base_tree_name, currDir);
strcat(temp_path, dir);
if (create) {
VERBOSE(2,5,"Making directory '%s'", temp_path);
if (-1 == o.backend->mkdir(temp_path, DIRMODE, o.backend_options)) {
EWARNF("Unable to create directory %s", temp_path);
}
}
create_remove_directory_tree(create, ++currDepth,
temp_path, (o.branch_factor*currDir)+1, progress);
currDepth--;
if (!create) {
VERBOSE(2,5,"Remove directory '%s'", temp_path);
if (-1 == o.backend->rmdir(temp_path, o.backend_options)) {
EWARNF("Unable to remove directory %s", temp_path);
}
}
strcpy(temp_path, path);
currDir++;
}
}
}
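/*
 * Run one complete test iteration: optionally build the directory tree
 * (create phase), run the directory and/or file tests for the current task
 * count, and optionally tear the tree down again (remove phase).  Tree
 * create/remove times and rates are recorded in the per-iteration
 * summary_table entry.
 */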
static void mdtest_iteration(int i, int j, mdtest_results_t * summary_table){
rank_progress_t progress_o;
memset(& progress_o, 0 , sizeof(progress_o));
progress_o.stone_wall_timer_seconds = 0;
progress_o.items_per_dir = o.items_per_dir;
rank_progress_t * progress = & progress_o;
/* start and end times of directory tree create/remove */
double startCreate, endCreate;
int k;
VERBOSE(1,-1,"main: * iteration %d *", j+1);
if(o.create_only){
for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){
if (rank >= o.path_count) {
continue;
}
prep_testdir(j, dir_iter);
VERBOSE(2,5,"main (for j loop): making o.testdir, '%s'", o.testdir );
if (o.backend->access(o.testdir, F_OK, o.backend_options) != 0) {
if (o.backend->mkdir(o.testdir, DIRMODE, o.backend_options) != 0) {
EWARNF("Unable to create test directory %s", o.testdir);
}
#ifdef HAVE_LUSTRE_LUSTREAPI
/* internal node for branching, can be non-striped for children */
if (o.global_dir_layout && o.unique_dir_per_task && llapi_dir_set_default_lmv_stripe(o.testdir, -1, 0, LMV_HASH_TYPE_FNV_1A_64, NULL) == -1) {
EWARNF("Unable to reset to global default directory layout");
}
#endif /* HAVE_LUSTRE_LUSTREAPI */
}
}
/* create hierarchical directory structure */
MPI_Barrier(testComm);
startCreate = GetTimeStamp();
for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){
prep_testdir(j, dir_iter);
if (o.unique_dir_per_task) {
if (o.collective_creates && (rank == 0)) {
/*
* This is inside two loops, one of which already uses "i" and the other uses "j".
* I don't know how this ever worked. I'm changing this loop to use "k".
*/
for (k=0; k < o.size; k++) {
sprintf(o.base_tree_name, "mdtest_tree.%d", k);
VERBOSE(3,5,"main (create hierarchical directory loop-collective): Calling create_remove_directory_tree with '%s'", o.testdir );
/*
* Let's pass in the path to the directory we most recently made so that we can use
* full paths in the other calls.
*/
create_remove_directory_tree(1, 0, o.testdir, 0, progress);
if(CHECK_STONE_WALL(progress)){
o.size = k;
break;
}
}
} else if (! o.collective_creates) {
VERBOSE(3,5,"main (create hierarchical directory loop-!collective_creates): Calling create_remove_directory_tree with '%s'", o.testdir );
/*
* Let's pass in the path to the directory we most recently made so that we can use
* full paths in the other calls.
*/
create_remove_directory_tree(1, 0, o.testdir, 0, progress);
}
} else {
if (rank == 0) {
VERBOSE(3,5,"main (create hierarchical directory loop-!unque_dir_per_task): Calling create_remove_directory_tree with '%s'", o.testdir );
/*
* Let's pass in the path to the directory we most recently made so that we can use
* full paths in the other calls.
*/
create_remove_directory_tree(1, 0 , o.testdir, 0, progress);
}
}
}
MPI_Barrier(testComm);
endCreate = GetTimeStamp();
summary_table->rate[MDTEST_TREE_CREATE_NUM] = o.num_dirs_in_tree / (endCreate - startCreate);
summary_table->time[MDTEST_TREE_CREATE_NUM] = (endCreate - startCreate);
summary_table->items[MDTEST_TREE_CREATE_NUM] = o.num_dirs_in_tree;
summary_table->stonewall_last_item[MDTEST_TREE_CREATE_NUM] = o.num_dirs_in_tree;
VERBOSE(1,-1,"V-1: main: Tree creation : %14.3f sec, %14.3f ops/sec", (endCreate - startCreate), summary_table->rate[MDTEST_TREE_CREATE_NUM]);
}
sprintf(o.unique_mk_dir, "%s.0", o.base_tree_name);
sprintf(o.unique_chdir_dir, "%s.0", o.base_tree_name);
sprintf(o.unique_stat_dir, "%s.0", o.base_tree_name);
sprintf(o.unique_read_dir, "%s.0", o.base_tree_name);
sprintf(o.unique_rm_dir, "%s.0", o.base_tree_name);
o.unique_rm_uni_dir[0] = 0;
if (! o.unique_dir_per_task) {
VERBOSE(3,-1,"V-3: main: Using unique_mk_dir, '%s'", o.unique_mk_dir );
}
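/*
 * Only the first 'i' ranks take part in this round.  The per-phase name
 * prefixes are shifted by nstride so that, for example, the rank that stats
 * a file is not the rank that created it, defeating client-side caching.
 */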
if (rank < i) {
if (! o.shared_file) {
sprintf(o.mk_name, "mdtest.%d.", (rank+(0*o.nstride))%i);
sprintf(o.stat_name, "mdtest.%d.", (rank+(1*o.nstride))%i);
sprintf(o.read_name, "mdtest.%d.", (rank+(2*o.nstride))%i);
sprintf(o.rm_name, "mdtest.%d.", (rank+(3*o.nstride))%i);
}
if (o.unique_dir_per_task) {
VERBOSE(3,5,"i %d nstride %d", i, o.nstride);
sprintf(o.unique_mk_dir, "mdtest_tree.%d.0", (rank+(0*o.nstride))%i);
sprintf(o.unique_chdir_dir, "mdtest_tree.%d.0", (rank+(1*o.nstride))%i);
sprintf(o.unique_stat_dir, "mdtest_tree.%d.0", (rank+(2*o.nstride))%i);
sprintf(o.unique_read_dir, "mdtest_tree.%d.0", (rank+(3*o.nstride))%i);
sprintf(o.unique_rm_dir, "mdtest_tree.%d.0", (rank+(4*o.nstride))%i);
o.unique_rm_uni_dir[0] = 0;
VERBOSE(5,5,"mk_dir %s chdir %s stat_dir %s read_dir %s rm_dir %s\n", o.unique_mk_dir, o.unique_chdir_dir, o.unique_stat_dir, o.unique_read_dir, o.unique_rm_dir);
}
VERBOSE(3,-1,"V-3: main: Copied unique_mk_dir, '%s', to topdir", o.unique_mk_dir );
if (o.dirs_only && ! o.shared_file) {
if (o.pre_delay) {
DelaySecs(o.pre_delay);
}
directory_test(j, i, o.unique_mk_dir, progress);
}
if (o.files_only) {
if (o.pre_delay) {
DelaySecs(o.pre_delay);
}
VERBOSE(3,5,"will file_test on %s", o.unique_mk_dir);
file_test(j, i, o.unique_mk_dir, progress);
}
}
/* remove directory structure */
if (! o.unique_dir_per_task) {
VERBOSE(3,-1,"main: Using o.testdir, '%s'", o.testdir );
}
MPI_Barrier(testComm);
if (o.remove_only) {
progress->items_start = 0;
startCreate = GetTimeStamp();
for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){
prep_testdir(j, dir_iter);
if (o.unique_dir_per_task) {
if (o.collective_creates && (rank == 0)) {
/*
* This is inside two loops, one of which already uses "i" and the other uses "j".
* I don't know how this ever worked. I'm changing this loop to use "k".
*/
for (k=0; k < o.size; k++) {
sprintf(o.base_tree_name, "mdtest_tree.%d", k);
VERBOSE(3,-1,"main (remove hierarchical directory loop-collective): Calling create_remove_directory_tree with '%s'", o.testdir );
/*
* Let's pass in the path to the directory we most recently made so that we can use
* full paths in the other calls.
*/
create_remove_directory_tree(0, 0, o.testdir, 0, progress);
if(CHECK_STONE_WALL(progress)){
o.size = k;
break;
}
}
} else if (! o.collective_creates) {
VERBOSE(3,-1,"main (remove hierarchical directory loop-!collective): Calling create_remove_directory_tree with '%s'", o.testdir );
/*
* Let's pass in the path to the directory we most recently made so that we can use
* full paths in the other calls.
*/
create_remove_directory_tree(0, 0, o.testdir, 0, progress);
}
} else {
if (rank == 0) {
VERBOSE(3,-1,"V-3: main (remove hierarchical directory loop-!unique_dir_per_task): Calling create_remove_directory_tree with '%s'", o.testdir );
/*
* Let's pass in the path to the directory we most recently made so that we can use
* full paths in the other calls.
*/
create_remove_directory_tree(0, 0 , o.testdir, 0, progress);
}
}
}
MPI_Barrier(testComm);
endCreate = GetTimeStamp();
summary_table->rate[MDTEST_TREE_REMOVE_NUM] = o.num_dirs_in_tree / (endCreate - startCreate);
summary_table->time[MDTEST_TREE_REMOVE_NUM] = endCreate - startCreate;
summary_table->items[MDTEST_TREE_REMOVE_NUM] = o.num_dirs_in_tree;
summary_table->stonewall_last_item[MDTEST_TREE_REMOVE_NUM] = o.num_dirs_in_tree;
VERBOSE(1,-1,"main Tree removal : %14.3f sec, %14.3f ops/sec", (endCreate - startCreate), summary_table->rate[MDTEST_TREE_REMOVE_NUM]);
VERBOSE(2,-1,"main (at end of for j loop): Removing o.testdir of '%s'\n", o.testdir );
for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){
prep_testdir(j, dir_iter);
if ((rank < o.path_count) && o.backend->access(o.testdir, F_OK, o.backend_options) == 0) {
//if (( rank == 0 ) && access(o.testdir, F_OK) == 0) {
if (o.backend->rmdir(o.testdir, o.backend_options) == -1) {
EWARNF("unable to remove directory %s", o.testdir);
}
}
}
} else {
summary_table->rate[MDTEST_TREE_REMOVE_NUM] = 0;
}
}
void mdtest_init_args(){
o = (mdtest_options_t) {
.barriers = 1,
.branch_factor = 1,
.random_buffer_offset = -1,
.prologue = "",
.epilogue = "",
};
}
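/*
 * Top-level mdtest entry point: parse the command line, select and initialize
 * the I/O backend, prepare the test directory, run the requested sweep of
 * task counts, and return the aggregated per-iteration results.
 */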
mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE * world_out) {
testComm = world_com;
out_logfile = world_out;
out_resultfile = world_out;
init_clock(world_com);
mdtest_init_args();
int i, j;
int numNodes;
int numTasksOnNode0 = 0;
MPI_Group worldgroup;
struct {
int first;
int last;
int stride;
} range = {0, 0, 1};
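/* 'range' describes the block of ranks 0..(i-1) handed to MPI_Group_range_incl
 * when a sub-communicator is built for each task count in the test loop below. */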
int first = 1;
int last = 0;
int stride = 1;
int iterations = 1;
int created_root_dir = 0; // was the root directory existing or newly created
verbose = 0;
int no_barriers = 0;
char * path = "./out";
int randomize = 0;
char APIs[1024];
char APIs_legacy[1024];
aiori_supported_apis(APIs, APIs_legacy, MDTEST);
char apiStr[1024];
sprintf(apiStr, "API for I/O [%s]", APIs);
memset(& o.hints, 0, sizeof(o.hints));
char * packetType = "t";
option_help options [] = {
{'a', NULL, apiStr, OPTION_OPTIONAL_ARGUMENT, 's', & o.api},
{'b', NULL, "branching factor of hierarchical directory structure", OPTION_OPTIONAL_ARGUMENT, 'd', & o.branch_factor},
{'d', NULL, "the directory in which the tests will run", OPTION_OPTIONAL_ARGUMENT, 's', & path},
{'B', NULL, "no barriers between phases", OPTION_OPTIONAL_ARGUMENT, 'd', & no_barriers},
{'C', NULL, "only create files/dirs", OPTION_FLAG, 'd', & o.create_only},
{'T', NULL, "only stat files/dirs", OPTION_FLAG, 'd', & o.stat_only},
{'E', NULL, "only read files/dir", OPTION_FLAG, 'd', & o.read_only},
{'r', NULL, "only remove files or directories left behind by previous runs", OPTION_FLAG, 'd', & o.remove_only},
{'D', NULL, "perform test on directories only (no files)", OPTION_FLAG, 'd', & o.dirs_only},
{'e', NULL, "bytes to read from each file", OPTION_OPTIONAL_ARGUMENT, 'l', & o.read_bytes},
{'f', NULL, "first number of tasks on which the test will run", OPTION_OPTIONAL_ARGUMENT, 'd', & first},
{'F', NULL, "perform test on files only (no directories)", OPTION_FLAG, 'd', & o.files_only},
#ifdef HAVE_LUSTRE_LUSTREAPI
{'g', NULL, "global default directory layout for test subdirectories (deletes inherited striping layout)", OPTION_FLAG, 'd', & o.global_dir_layout},
#endif /* HAVE_LUSTRE_LUSTREAPI */
{'G', NULL, "Offset for the data in the read/write buffer, if not set, a random value is used", OPTION_OPTIONAL_ARGUMENT, 'd', & o.random_buffer_offset},
{'i', NULL, "number of iterations the test will run", OPTION_OPTIONAL_ARGUMENT, 'd', & iterations},
{'I', NULL, "number of items per directory in tree", OPTION_OPTIONAL_ARGUMENT, 'l', & o.items_per_dir},
{'k', NULL, "use mknod to create file", OPTION_FLAG, 'd', & o.make_node},
{'l', NULL, "last number of tasks on which the test will run", OPTION_OPTIONAL_ARGUMENT, 'd', & last},
{'L', NULL, "files only at leaf level of tree", OPTION_FLAG, 'd', & o.leaf_only},
{'n', NULL, "every process will creat/stat/read/remove # directories and files", OPTION_OPTIONAL_ARGUMENT, 'l', & o.items},
{'N', NULL, "stride # between tasks for file/dir operation (local=0; set to 1 to avoid client cache)", OPTION_OPTIONAL_ARGUMENT, 'd', & o.nstride},
{'p', NULL, "pre-iteration delay (in seconds)", OPTION_OPTIONAL_ARGUMENT, 'd', & o.pre_delay},
{'P', NULL, "print rate AND time", OPTION_FLAG, 'd', & o.print_rate_and_time},
{0, "print-all-procs", "all processes print an excerpt of their results", OPTION_FLAG, 'd', & o.print_all_proc},
{'R', NULL, "random access to files (only for stat)", OPTION_FLAG, 'd', & randomize},
{0, "random-seed", "random seed for -R", OPTION_OPTIONAL_ARGUMENT, 'd', & o.random_seed},
{'s', NULL, "stride between the number of tasks for each test", OPTION_OPTIONAL_ARGUMENT, 'd', & stride},
{'S', NULL, "shared file access (file only, no directories)", OPTION_FLAG, 'd', & o.shared_file},
{'c', NULL, "collective creates: task 0 does all creates", OPTION_FLAG, 'd', & o.collective_creates},
{'t', NULL, "time unique working directory overhead", OPTION_FLAG, 'd', & o.time_unique_dir_overhead},
{'u', NULL, "unique working directory for each task", OPTION_FLAG, 'd', & o.unique_dir_per_task},
{'v', NULL, "verbosity (each instance of option increments by one)", OPTION_FLAG, 'd', & verbose},
{'V', NULL, "verbosity value", OPTION_OPTIONAL_ARGUMENT, 'd', & verbose},
{'w', NULL, "bytes to write to each file after it is created", OPTION_OPTIONAL_ARGUMENT, 'l', & o.write_bytes},
{'W', NULL, "number in seconds; stonewall timer, write as many seconds and ensure all processes did the same number of operations (currently only stops during create phase and files)", OPTION_OPTIONAL_ARGUMENT, 'd', & o.stone_wall_timer_seconds},
{'x', NULL, "StoneWallingStatusFile; contains the number of iterations of the creation phase, can be used to split phases across runs", OPTION_OPTIONAL_ARGUMENT, 's', & o.stoneWallingStatusFile},
{'X', "verify-read", "Verify the data read", OPTION_FLAG, 'd', & o.verify_read},
{0, "verify-write", "Verify the data after a write by reading it back immediately", OPTION_FLAG, 'd', & o.verify_write},
{'y', NULL, "sync file after writing", OPTION_FLAG, 'd', & o.sync_file},
{'Y', NULL, "call the sync command after each phase (included in the timing; note it causes all IO to be flushed from your node)", OPTION_FLAG, 'd', & o.call_sync},
{'z', NULL, "depth of hierarchical directory structure", OPTION_OPTIONAL_ARGUMENT, 'd', & o.depth},
{'Z', NULL, "print time instead of rate", OPTION_FLAG, 'd', & o.print_time},
{0, "run-cmd-before-phase", "call this external command before each phase (excluded from the timing)", OPTION_OPTIONAL_ARGUMENT, 's', & o.prologue},
{0, "run-cmd-after-phase", "call this external command after each phase (included in the timing)", OPTION_OPTIONAL_ARGUMENT, 's', & o.epilogue},
{0, "dataPacketType", "type of packet that will be created [offset|incompressible|timestamp|o|i|t]", OPTION_OPTIONAL_ARGUMENT, 's', & packetType},
{0, "allocateBufferOnGPU", "Allocate the buffer on the GPU.", OPTION_FLAG, 'd', & o.gpu_memory_flags},
{0, "warningAsErrors", "Any warning should lead to an error.", OPTION_FLAG, 'd', & aiori_warning_as_errors},
{0, "saveRankPerformanceDetails", "Save the individual rank information into this CSV file.", OPTION_OPTIONAL_ARGUMENT, 's', & o.saveRankDetailsCSV},
{0, "showRankStatistics", "Include statistics per rank", OPTION_FLAG, 'd', & o.show_perrank_statistics},
LAST_OPTION
};
options_all_t * global_options = airoi_create_all_module_options(options);
option_parse(argc, argv, global_options);
o.backend = aiori_select(o.api);
if (o.backend == NULL)
ERR("Unrecognized I/O API");
if (! o.backend->enable_mdtest)
ERR("Backend doesn't support MDTest");
o.backend_options = airoi_update_module_options(o.backend, global_options);
free(global_options->modules);
free(global_options);
o.dataPacketType = parsePacketType(packetType[0]);
MPI_Comm_rank(testComm, &rank);
MPI_Comm_size(testComm, &o.size);
if(o.backend->xfer_hints){
o.backend->xfer_hints(& o.hints);
}
if(o.backend->check_params){
o.backend->check_params(o.backend_options);
}
if (o.backend->initialize){
o.backend->initialize(o.backend_options);
}
o.pid = getpid();
o.uid = getuid();
numNodes = GetNumNodes(testComm);
numTasksOnNode0 = GetNumTasksOnNode0(testComm);
char cmd_buffer[4096];
strncpy(cmd_buffer, argv[0], 4096);
cmd_buffer[4095] = '\0'; /* strncpy does not NUL-terminate on truncation */
for (i = 1; i < argc; i++) {
snprintf(&cmd_buffer[strlen(cmd_buffer)], 4096-strlen(cmd_buffer), " '%s'", argv[i]);
}
VERBOSE(0,-1,"-- started at %s --\n", PrintTimestamp());
VERBOSE(0,-1,"mdtest-%s was launched with %d total task(s) on %d node(s)", RELEASE_VERS, o.size, numNodes);
VERBOSE(0,-1,"Command line used: %s", cmd_buffer);
/* adjust special variables */
o.barriers = ! no_barriers;
if (path != NULL){
parse_dirpath(path);
}
if( randomize > 0 ){
if (o.random_seed == 0) {
/* Ensure all procs have the same random number */
o.random_seed = time(NULL);
MPI_Barrier(testComm);
MPI_Bcast(& o.random_seed, 1, MPI_INT, 0, testComm);
}
o.random_seed += rank;
}
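/* No explicit -G offset: derive one from the clock and broadcast rank 0's
 * value so every rank patterns its buffers with the same offset. */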
if( o.random_buffer_offset == -1 ){
o.random_buffer_offset = time(NULL);
MPI_Bcast(& o.random_buffer_offset, 1, MPI_INT, 0, testComm);
}
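/* With both -n and -I set (and no unique directory per task), the run is split
 * into items/items_per_dir separate test directories, one per directory loop. */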
if ((o.items > 0) && (o.items_per_dir > 0) && (! o.unique_dir_per_task)) {
o.directory_loops = o.items / o.items_per_dir;
}else{
o.directory_loops = 1;
}
md_validate_tests();
// option_print_current(options);
VERBOSE(1,-1, "api : %s", o.api);
VERBOSE(1,-1, "barriers : %s", ( o.barriers ? "True" : "False" ));
VERBOSE(1,-1, "collective_creates : %s", ( o.collective_creates ? "True" : "False" ));
VERBOSE(1,-1, "create_only : %s", ( o.create_only ? "True" : "False" ));
VERBOSE(1,-1, "dirpath(s):" );
for ( i = 0; i < o.path_count; i++ ) {
VERBOSE(1,-1, "\t%s", o.filenames[i] );
}
VERBOSE(1,-1, "dirs_only : %s", ( o.dirs_only ? "True" : "False" ));
VERBOSE(1,-1, "read_bytes : "LLU"", o.read_bytes );
VERBOSE(1,-1, "read_only : %s", ( o.read_only ? "True" : "False" ));
VERBOSE(1,-1, "first : %d", first );
VERBOSE(1,-1, "files_only : %s", ( o.files_only ? "True" : "False" ));
#ifdef HAVE_LUSTRE_LUSTREAPI
VERBOSE(1,-1, "global_dir_layout : %s", ( o.global_dir_layout ? "True" : "False" ));
#endif /* HAVE_LUSTRE_LUSTREAPI */
VERBOSE(1,-1, "iterations : %d", iterations );
VERBOSE(1,-1, "items_per_dir : "LLU"", o.items_per_dir );
VERBOSE(1,-1, "last : %d", last );
VERBOSE(1,-1, "leaf_only : %s", ( o.leaf_only ? "True" : "False" ));
VERBOSE(1,-1, "items : "LLU"", o.items );
VERBOSE(1,-1, "nstride : %d", o.nstride );
VERBOSE(1,-1, "pre_delay : %d", o.pre_delay );
VERBOSE(1,-1, "remove_only : %s", ( o.leaf_only ? "True" : "False" ));
VERBOSE(1,-1, "random_seed : %d", o.random_seed );
VERBOSE(1,-1, "stride : %d", stride );
VERBOSE(1,-1, "shared_file : %s", ( o.shared_file ? "True" : "False" ));
VERBOSE(1,-1, "time_unique_dir_overhead: %s", ( o.time_unique_dir_overhead ? "True" : "False" ));
VERBOSE(1,-1, "stone_wall_timer_seconds: %d", o.stone_wall_timer_seconds);
VERBOSE(1,-1, "stat_only : %s", ( o.stat_only ? "True" : "False" ));
VERBOSE(1,-1, "unique_dir_per_task : %s", ( o.unique_dir_per_task ? "True" : "False" ));
VERBOSE(1,-1, "write_bytes : "LLU"", o.write_bytes );
VERBOSE(1,-1, "sync_file : %s", ( o.sync_file ? "True" : "False" ));
VERBOSE(1,-1, "call_sync : %s", ( o.call_sync ? "True" : "False" ));
VERBOSE(1,-1, "depth : %d", o.depth );
VERBOSE(1,-1, "make_node : %d", o.make_node );
/* setup total number of items and number of items per dir */
if (o.depth <= 0) {
o.num_dirs_in_tree = 1;
} else {
if (o.branch_factor < 1) {
o.num_dirs_in_tree = 1;
} else if (o.branch_factor == 1) {
o.num_dirs_in_tree = o.depth + 1;
} else {
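/*
 * Geometric series: a tree of depth d with branch factor b > 1 has
 * 1 + b + b^2 + ... + b^d = (b^(d+1) - 1) / (b - 1) directories.
 * E.g. b = 2, d = 3 gives (2^4 - 1) / 1 = 15.
 */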
o.num_dirs_in_tree = (pow(o.branch_factor, o.depth+1) - 1) / (o.branch_factor - 1);
}
}
if (o.items_per_dir > 0) {
if(o.items == 0){
if (o.leaf_only) {
o.items = o.items_per_dir * (uint64_t) pow(o.branch_factor, o.depth);
} else {
o.items = o.items_per_dir * o.num_dirs_in_tree;
}
}else{
o.num_dirs_in_tree_calc = o.num_dirs_in_tree;
}
} else {
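/*
 * Only -n was given: derive items_per_dir.  With -L (leaf_only) the items
 * live only in the pow(branch_factor, depth) leaf directories; otherwise
 * they are spread over every directory in the tree.  Integer division may
 * round the total item count down to a multiple of items_per_dir.
 */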
if (o.leaf_only) {
if (o.branch_factor <= 1) {
o.items_per_dir = o.items;
} else {
o.items_per_dir = (uint64_t) (o.items / pow(o.branch_factor, o.depth));
o.items = o.items_per_dir * (uint64_t) pow(o.branch_factor, o.depth);
}
} else {
o.items_per_dir = o.items / o.num_dirs_in_tree;
o.items = o.items_per_dir * o.num_dirs_in_tree;
}
}
/* initialize rand_array */
if (o.random_seed > 0) {
srand(o.random_seed);
uint64_t s;
o.rand_array = (uint64_t *) safeMalloc( o.items * sizeof(*o.rand_array));
for (s=0; s < o.items; s++) {
o.rand_array[s] = s;
}
/* shuffle list randomly */
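/* Fisher-Yates: walk backwards through the array, swapping element n with a
 * randomly chosen element in the range 0..n. */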
uint64_t n = o.items;
while (n>1) {
n--;
/*
* Generate a random number in the range 0 .. n
*
* rand() returns a number from 0 .. RAND_MAX. Divide that
* by RAND_MAX and you get a floating point number in the
* range 0 .. 1. Multiply that by n and you get a number in
* the range 0 .. n.
*/
uint64_t k = ( uint64_t ) ((( double )rand() / ( double )RAND_MAX ) * ( double )n );
/*
* Now move the nth element to the kth (randomly chosen)
* element, and the kth element to the nth element.
*/
uint64_t tmp = o.rand_array[k];
o.rand_array[k] = o.rand_array[n];
o.rand_array[n] = tmp;
}
}
/* allocate and initialize write buffer with # */
if (o.write_bytes > 0) {
o.write_buffer = aligned_buffer_alloc(o.write_bytes, o.gpu_memory_flags);
generate_memory_pattern(o.write_buffer, o.write_bytes, o.random_buffer_offset, rank, o.dataPacketType);
}
/* setup directory path to work in */
if (o.path_count == 0) { /* special case where no directory path provided with '-d' option */
char *ret = getcwd(o.testdirpath, MAX_PATHLEN);
if (ret == NULL) {
FAIL("Unable to get current working directory on %s", o.testdirpath);
}
o.path_count = 1;
} else {
strcpy(o.testdirpath, o.filenames[rank % o.path_count]);
}
/* if directory does not exist, create it */
if ((rank < o.path_count) && o.backend->access(o.testdirpath, F_OK, o.backend_options) != 0) {
if (o.backend->mkdir(o.testdirpath, DIRMODE, o.backend_options) != 0) {
EWARNF("Unable to create test directory path %s", o.testdirpath);
}
created_root_dir = 1;
}
/* display disk usage */
VERBOSE(3,-1,"main (before display_freespace): o.testdirpath is '%s'", o.testdirpath );
if (rank == 0) ShowFileSystemSize(o.testdirpath, o.backend, o.backend_options);
int tasksBlockMapping = QueryNodeMapping(testComm, true);
/* set the shift to mimic IOR and shift by procs per node */
if (o.nstride > 0) {
if ( numNodes > 1 && tasksBlockMapping ) {
/* the user set the stride presumably to get the consumer tasks on a different node than the producer tasks
however, if the mpirun scheduler placed the tasks by-slot (in a contiguous block) then we need to adjust the shift by ppn */
o.nstride *= numTasksOnNode0;
}
VERBOSE(0,5,"Shifting ranks by %d for each phase.", o.nstride);
}
VERBOSE(3,-1,"main (after display_freespace): o.testdirpath is '%s'", o.testdirpath );
if (rank == 0) {
if (o.random_seed > 0) {
VERBOSE(0,-1,"random seed: %d", o.random_seed);
}
}
if (gethostname(o.hostname, MAX_PATHLEN) == -1) {
perror("gethostname");
MPI_Abort(testComm, 2);
}
if (last == 0) {
first = o.size;
last = o.size;
}
if(first > last){
FAIL("process number: first > last doesn't make sense");
}
if(last > o.size){
FAIL("process number: last > number of processes doesn't make sense");
}
/* setup summary table for recording results */
o.summary_table = (mdtest_results_t *) safeMalloc(iterations * sizeof(mdtest_results_t));
memset(o.summary_table, 0, iterations * sizeof(mdtest_results_t));
if (o.unique_dir_per_task) {
sprintf(o.base_tree_name, "mdtest_tree.%d", rank);
} else {
sprintf(o.base_tree_name, "mdtest_tree");
}
mdtest_results_t * aggregated_results = safeMalloc(iterations * sizeof(mdtest_results_t));
/* default use shared directory */
strcpy(o.mk_name, "mdtest.shared.");
strcpy(o.stat_name, "mdtest.shared.");
strcpy(o.read_name, "mdtest.shared.");
strcpy(o.rm_name, "mdtest.shared.");
MPI_Comm_group(testComm, &worldgroup);
last = o.size < last ? o.size : last;
/* Run the tests */
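/*
 * For each task count i = first, first+stride, ..., last, carve out a
 * communicator containing ranks 0..i-1 (ranks outside it skip the round),
 * run 'iterations' iterations, and aggregate the per-iteration results.
 */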
for (i = first; i <= last; i += stride) {
sleep(1);
if(i < last){
MPI_Group testgroup;
range.last = i - 1;
MPI_Group_range_incl(worldgroup, 1, (void *)&range, &testgroup);
MPI_Comm_create(world_com, testgroup, &testComm);
MPI_Group_free(&testgroup);
if(testComm == MPI_COMM_NULL){
continue;
}
}else{
MPI_Comm_dup(world_com, & testComm);
}
MPI_Comm_size(testComm, &o.size);
if (rank == 0) {
uint64_t items_all = i * o.items;
if(o.num_dirs_in_tree_calc){
items_all *= o.num_dirs_in_tree_calc;
}
if (o.files_only && o.dirs_only) {
VERBOSE(0,-1,"%d tasks, "LLU" files/directories", i, items_all);
} else if (o.files_only) {
if (! o.shared_file) {
VERBOSE(0,-1,"%d tasks, "LLU" files", i, items_all);
}
else {
VERBOSE(0,-1,"%d tasks, 1 file", i);
}
} else if (o.dirs_only) {
VERBOSE(0,-1,"%d tasks, "LLU" directories", i, items_all);
}
}
VERBOSE(1,-1,"");
VERBOSE(1,-1," Operation Duration Rate");
VERBOSE(1,-1," --------- -------- ----");
for (j = 0; j < iterations; j++) {
// keep track of the current status for stonewalling
mdtest_iteration(i, j, & o.summary_table[j]);
}
summarize_results(iterations, aggregated_results);
if(o.saveRankDetailsCSV){
StoreRankInformation(iterations, aggregated_results);
}
int total_errors = 0;
MPI_Reduce(& o.verification_error, & total_errors, 1, MPI_INT, MPI_SUM, 0, testComm);
if(rank == 0 && total_errors){
VERBOSE(0, -1, "\nERROR: verifying the data on read (%lld errors)! Take the performance values with care!\n", total_errors);
}
MPI_Comm_free(&testComm);
}
MPI_Group_free(&worldgroup);
testComm = world_com;
if (created_root_dir && o.remove_only && o.backend->rmdir(o.testdirpath, o.backend_options) != 0) {
FAIL("Unable to remove test directory path %s", o.testdirpath);
}
VERBOSE(0,-1,"-- finished at %s --\n", PrintTimestamp());
if (o.random_seed > 0) {
free(o.rand_array);
}
if (o.backend->finalize){
o.backend->finalize(o.backend_options);
}
if (o.write_bytes > 0) {
aligned_buffer_free(o.write_buffer, o.gpu_memory_flags);
}
free(o.summary_table);
return aggregated_results;
}