/* * Copyright (C) 2003, The Regents of the University of California. * Produced at the Lawrence Livermore National Laboratory. * Written by Christopher J. Morrone , * Bill Loewe , Tyce McLarty , * and Ryan Kroiss . * All rights reserved. * UCRL-CODE-155800 * * Please read the COPYRIGHT file. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License (as published by * the Free Software Foundation) version 2, dated June 1991. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the IMPLIED WARRANTY OF * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * terms and conditions of the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * CVS info: * $RCSfile: mdtest.c,v $ * $Revision: 1.4 $ * $Date: 2013/11/27 17:05:31 $ * $Author: brettkettering $ */ #include #include #include #include #include #include #include #include #include #include "option.h" #include "utilities.h" #if HAVE_SYS_PARAM_H #include #endif #if HAVE_SYS_MOUNT_H #include #endif #if HAVE_SYS_STATFS_H #include #endif #if HAVE_SYS_STATVFS_H #include #endif #include #include #if HAVE_STRINGS_H #include #endif #include #include #include #include #include #include "aiori.h" #include "ior.h" #include "mdtest.h" #include #pragma GCC diagnostic ignored "-Wformat-overflow" #ifdef HAVE_LUSTRE_LUSTREAPI #include #endif /* HAVE_LUSTRE_LUSTREAPI */ #define FILEMODE S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH #define DIRMODE S_IRUSR|S_IWUSR|S_IXUSR|S_IRGRP|S_IWGRP|S_IXGRP|S_IROTH|S_IXOTH #define RELEASE_VERS META_VERSION #define TEST_DIR "test-dir" #define ITEM_COUNT 25000 #define LLU "%lu" typedef struct { int size; uint64_t *rand_array; char testdir[MAX_PATHLEN]; char testdirpath[MAX_PATHLEN]; char base_tree_name[MAX_PATHLEN]; char **filenames; char hostname[MAX_PATHLEN]; char mk_name[MAX_PATHLEN]; char stat_name[MAX_PATHLEN]; char read_name[MAX_PATHLEN]; char rm_name[MAX_PATHLEN]; char unique_mk_dir[MAX_PATHLEN]; char unique_chdir_dir[MAX_PATHLEN]; char unique_stat_dir[MAX_PATHLEN]; char unique_read_dir[MAX_PATHLEN]; char unique_rm_dir[MAX_PATHLEN]; char unique_rm_uni_dir[MAX_PATHLEN]; char *write_buffer; char *stoneWallingStatusFile; int gpu_memory_flags; int barriers; int create_only; int stat_only; int read_only; int verify_read; int verify_write; int verification_error; int remove_only; int rename_dirs; int leaf_only; unsigned branch_factor; int depth; int random_buffer_offset; /* user settable value, otherwise random */ /* * This is likely a small value, but it's sometimes computed by * branch_factor^(depth+1), so we'll make it a larger variable, * just in case. */ uint64_t num_dirs_in_tree; /* * As we start moving towards Exascale, we could have billions * of files in a directory. Make room for that possibility with * a larger variable. */ uint64_t items; uint64_t items_per_dir; uint64_t num_dirs_in_tree_calc; /* this is a workaround until the overal code is refactored */ int directory_loops; int print_time; int print_rate_and_time; int print_all_proc; int show_perrank_statistics; ior_dataPacketType_e dataPacketType; int random_seed; int shared_file; int files_only; int dirs_only; int pre_delay; int unique_dir_per_task; int time_unique_dir_overhead; int collective_creates; size_t write_bytes; int stone_wall_timer_seconds; size_t read_bytes; int sync_file; int call_sync; int path_count; int nstride; /* neighbor stride */ int make_node; #ifdef HAVE_LUSTRE_LUSTREAPI int global_dir_layout; #endif /* HAVE_LUSTRE_LUSTREAPI */ char * saveRankDetailsCSV; /* save the details about the performance to a file */ const char *prologue; const char *epilogue; mdtest_results_t * summary_table; pid_t pid; uid_t uid; /* Use the POSIX backend by default */ const ior_aiori_t *backend; void * backend_options; aiori_xfer_hint_t hints; char * api; } mdtest_options_t; static mdtest_options_t o; /* This structure describes the processing status for stonewalling */ typedef struct{ double start_time; int stone_wall_timer_seconds; uint64_t items_start; uint64_t items_done; uint64_t items_per_dir; } rank_progress_t; #define CHECK_STONE_WALL(p) (((p)->stone_wall_timer_seconds != 0) && ((GetTimeStamp() - (p)->start_time) > (p)->stone_wall_timer_seconds)) /* for making/removing unique directory && stating/deleting subdirectory */ enum {MK_UNI_DIR, STAT_SUB_DIR, READ_SUB_DIR, RM_SUB_DIR, RM_UNI_DIR}; #define PRINT(...) fprintf(out_logfile, __VA_ARGS__); /* a helper function for passing debug and verbose messages. use the MACRO as it will insert __LINE__ for you. Pass the verbose level for root to print, then the verbose level for anyone to print. Pass -1 to suppress the print for anyone. Then do the standard printf stuff. This function adds the newline for you. */ #define VERBOSE(root,any,...) VerboseMessage(root,any,__LINE__,__VA_ARGS__) void VerboseMessage (int root_level, int any_level, int line, char * format, ...) { if ((rank==0 && verbose >= root_level) || (any_level > 0 && verbose >= any_level)) { char buffer[1024]; va_list args; va_start (args, format); vsnprintf (buffer, 1024, format, args); va_end (args); if (root_level == 0 && any_level == -1) { /* No header when it is just the standard output */ fprintf( out_logfile, "%s\n", buffer ); } else { /* add a header when the verbose is greater than 0 */ fprintf( out_logfile, "V-%d: Rank %3d Line %5d %s\n", root_level, rank, line, buffer ); } fflush(out_logfile); } } void parse_dirpath(char *dirpath_arg) { char * tmp, * token; char delimiter_string[3] = { '@', '\n', '\0' }; int i = 0; VERBOSE(1,-1, "Entering parse_dirpath on %s...", dirpath_arg ); tmp = dirpath_arg; if (* tmp != '\0') o.path_count++; while (* tmp != '\0') { if (* tmp == '@') { o.path_count++; } tmp++; } // prevent changes to the original dirpath_arg dirpath_arg = strdup(dirpath_arg); o.filenames = (char **) safeMalloc(o.path_count * sizeof(char **)); token = strtok(dirpath_arg, delimiter_string); while (token != NULL) { o.filenames[i] = token; token = strtok(NULL, delimiter_string); i++; } } static void prep_testdir(int j, int dir_iter){ int pos = sprintf(o.testdir, "%s", o.testdirpath); if ( o.testdir[strlen( o.testdir ) - 1] != '/' ) { pos += sprintf(& o.testdir[pos], "/"); } pos += sprintf(& o.testdir[pos], "%s", TEST_DIR); pos += sprintf(& o.testdir[pos], ".%d-%d", j, dir_iter); } static void phase_prepare(){ if (*o.prologue){ VERBOSE(0,5,"calling prologue: \"%s\"", o.prologue); system(o.prologue); if (o.barriers) { MPI_Barrier(testComm); } } } static void phase_end(){ if (o.call_sync){ if(! o.backend->sync){ FAIL("Error, backend does not provide the sync method, but you requested to use sync.\n"); } o.backend->sync(o.backend_options); } if (*o.epilogue){ VERBOSE(0,5,"calling epilogue: \"%s\"", o.epilogue); system(o.epilogue); } if (o.barriers) { MPI_Barrier(testComm); } } /* * This function copies the unique directory name for a given option to * the "to" parameter. Some memory must be allocated to the "to" parameter. */ void unique_dir_access(int opt, char *to) { if (opt == MK_UNI_DIR) { MPI_Barrier(testComm); sprintf( to, "%s/%s", o.testdir, o.unique_chdir_dir ); } else if (opt == STAT_SUB_DIR) { sprintf( to, "%s/%s", o.testdir, o.unique_stat_dir ); } else if (opt == READ_SUB_DIR) { sprintf( to, "%s/%s", o.testdir, o.unique_read_dir ); } else if (opt == RM_SUB_DIR) { sprintf( to, "%s/%s", o.testdir, o.unique_rm_dir ); } else if (opt == RM_UNI_DIR) { sprintf( to, "%s/%s", o.testdir, o.unique_rm_uni_dir ); } VERBOSE(1,-1,"Entering unique_dir_access, set it to %s", to ); } static void create_remove_dirs (const char *path, bool create, uint64_t itemNum) { char curr_item[MAX_PATHLEN]; const char *operation = create ? "create" : "remove"; if ( (itemNum % ITEM_COUNT==0 && (itemNum != 0))) { VERBOSE(3,5,"dir: "LLU"", operation, itemNum); } //create dirs sprintf(curr_item, "%s/dir.%s%" PRIu64, path, create ? o.mk_name : o.rm_name, itemNum); VERBOSE(3,5,"create_remove_items_helper (dirs %s): curr_item is '%s'", operation, curr_item); if (create) { if (o.backend->mkdir(curr_item, DIRMODE, o.backend_options) == -1) { EWARNF("unable to create directory %s", curr_item); } } else { if (o.backend->rmdir(curr_item, o.backend_options) == -1) { EWARNF("unable to remove directory %s", curr_item); } } } static void remove_file (const char *path, uint64_t itemNum) { char curr_item[MAX_PATHLEN]; if ( (itemNum % ITEM_COUNT==0 && (itemNum != 0))) { VERBOSE(3,5,"remove file: "LLU"\n", itemNum); } //remove files sprintf(curr_item, "%s/file.%s"LLU"", path, o.rm_name, itemNum); VERBOSE(3,5,"create_remove_items_helper (non-dirs remove): curr_item is '%s'", curr_item); if (!(o.shared_file && rank != 0)) { o.backend->delete (curr_item, o.backend_options); } } static void create_file (const char *path, uint64_t itemNum) { char curr_item[MAX_PATHLEN]; aiori_fd_t *aiori_fh = NULL; if ( (itemNum % ITEM_COUNT==0 && (itemNum != 0))) { VERBOSE(3,5,"create file: "LLU"", itemNum); } //create files sprintf(curr_item, "%s/file.%s"LLU"", path, o.mk_name, itemNum); VERBOSE(3,5,"create_remove_items_helper (non-dirs create): curr_item is '%s'", curr_item); if (o.make_node) { int ret; VERBOSE(3,5,"create_remove_items_helper : mknod..." ); ret = o.backend->mknod (curr_item); if (ret != 0) EWARNF("unable to mknode file %s", curr_item); return; } else if (o.collective_creates) { VERBOSE(3,5,"create_remove_items_helper (collective): open..." ); aiori_fh = o.backend->open (curr_item, IOR_WRONLY | IOR_CREAT, o.backend_options); if (NULL == aiori_fh){ EWARNF("unable to open file %s", curr_item); return; } /* * !collective_creates */ } else { o.hints.filePerProc = ! o.shared_file; VERBOSE(3,5,"create_remove_items_helper (non-collective, shared): open..." ); aiori_fh = o.backend->create (curr_item, IOR_WRONLY | IOR_CREAT, o.backend_options); if (NULL == aiori_fh){ EWARNF("unable to create file %s", curr_item); return; } } if (o.write_bytes > 0) { VERBOSE(3,5,"create_remove_items_helper: write..." ); o.hints.fsyncPerWrite = o.sync_file; update_write_memory_pattern(itemNum, o.write_buffer, o.write_bytes, o.random_buffer_offset, rank, o.dataPacketType); if ( o.write_bytes != (size_t) o.backend->xfer(WRITE, aiori_fh, (IOR_size_t *) o.write_buffer, o.write_bytes, 0, o.backend_options)) { EWARNF("unable to write file %s", curr_item); } if (o.verify_write) { o.write_buffer[0] = 42; if (o.write_bytes != (size_t) o.backend->xfer(READ, aiori_fh, (IOR_size_t *) o.write_buffer, o.write_bytes, 0, o.backend_options)) { EWARNF("unable to verify write (read/back) file %s", curr_item); } int error = verify_memory_pattern(itemNum, o.write_buffer, o.write_bytes, o.random_buffer_offset, rank, o.dataPacketType); o.verification_error += error; if(error){ VERBOSE(1,1,"verification error in file: %s", curr_item); } } } VERBOSE(3,5,"create_remove_items_helper: close..." ); o.backend->close (aiori_fh, o.backend_options); } /* helper for creating/removing items */ void create_remove_items_helper(const int dirs, const int create, const char *path, uint64_t itemNum, rank_progress_t * progress) { VERBOSE(1,-1,"Entering create_remove_items_helper on %s", path ); for (uint64_t i = progress->items_start; i < progress->items_per_dir ; ++i) { if (!dirs) { if (create) { create_file (path, itemNum + i); } else { remove_file (path, itemNum + i); } } else { create_remove_dirs (path, create, itemNum + i); } if(CHECK_STONE_WALL(progress)){ if(progress->items_done == 0){ progress->items_done = i + 1; } return; } } progress->items_done = progress->items_per_dir; } /* helper function to do collective operations */ void collective_helper(const int dirs, const int create, const char* path, uint64_t itemNum, rank_progress_t * progress) { char curr_item[MAX_PATHLEN]; VERBOSE(1,-1,"Entering collective_helper on %s", path ); for (uint64_t i = progress->items_start ; i < progress->items_per_dir ; ++i) { if (dirs) { create_remove_dirs (path, create, itemNum + i); continue; } sprintf(curr_item, "%s/file.%s"LLU"", path, create ? o.mk_name : o.rm_name, itemNum+i); VERBOSE(3,5,"create file: %s", curr_item); if (create) { aiori_fd_t *aiori_fh; //create files aiori_fh = o.backend->create (curr_item, IOR_WRONLY | IOR_CREAT, o.backend_options); if (NULL == aiori_fh) { EWARNF("unable to create file %s", curr_item); }else{ o.backend->close (aiori_fh, o.backend_options); } } else if (!(o.shared_file && rank != 0)) { //remove files o.backend->delete (curr_item, o.backend_options); } if(CHECK_STONE_WALL(progress)){ progress->items_done = i + 1; return; } } progress->items_done = progress->items_per_dir; } /* recursive function to create and remove files/directories from the directory tree */ void create_remove_items(int currDepth, const int dirs, const int create, const int collective, const char *path, uint64_t dirNum, rank_progress_t * progress) { unsigned i; char dir[MAX_PATHLEN]; char temp_path[MAX_PATHLEN]; unsigned long long currDir = dirNum; VERBOSE(1,-1,"Entering create_remove_items on %s, currDepth = %d...", path, currDepth ); memset(dir, 0, MAX_PATHLEN); strcpy(temp_path, path); VERBOSE(3,5,"create_remove_items (start): temp_path is '%s'", temp_path ); if (currDepth == 0) { /* create items at this depth */ if (! o.leaf_only || (o.depth == 0 && o.leaf_only)) { if (collective) { collective_helper(dirs, create, temp_path, 0, progress); } else { create_remove_items_helper(dirs, create, temp_path, 0, progress); } } if (o.depth > 0) { create_remove_items(++currDepth, dirs, create, collective, temp_path, ++dirNum, progress); } } else if (currDepth <= o.depth) { /* iterate through the branches */ for (i=0; i< o.branch_factor; i++) { /* determine the current branch and append it to the path */ sprintf(dir, "%s.%llu/", o.base_tree_name, currDir); strcat(temp_path, "/"); strcat(temp_path, dir); VERBOSE(3,5,"create_remove_items (for loop): temp_path is '%s'", temp_path ); /* create the items in this branch */ if (! o.leaf_only || (o.leaf_only && currDepth == o.depth)) { if (collective) { collective_helper(dirs, create, temp_path, currDir* o.items_per_dir, progress); } else { create_remove_items_helper(dirs, create, temp_path, currDir*o.items_per_dir, progress); } } /* make the recursive call for the next level below this branch */ create_remove_items( ++currDepth, dirs, create, collective, temp_path, ( currDir * ( unsigned long long ) o.branch_factor ) + 1, progress ); currDepth--; /* reset the path */ strcpy(temp_path, path); currDir++; } } } /* stats all of the items created as specified by the input parameters */ void mdtest_stat(const int random, const int dirs, const long dir_iter, const char *path, rank_progress_t * progress) { struct stat buf; uint64_t parent_dir, item_num = 0; char item[MAX_PATHLEN], temp[MAX_PATHLEN]; VERBOSE(1,-1,"Entering mdtest_stat on %s", path ); uint64_t stop_items = o.items; if( o.directory_loops != 1 ){ stop_items = o.items_per_dir; } /* iterate over all of the item IDs */ for (uint64_t i = 0 ; i < stop_items ; ++i) { /* * It doesn't make sense to pass the address of the array because that would * be like passing char **. Tested it on a Cray and it seems to work either * way, but it seems that it is correct without the "&". * memset(&item, 0, MAX_PATHLEN); */ memset(item, 0, MAX_PATHLEN); memset(temp, 0, MAX_PATHLEN); /* determine the item number to stat */ if (random) { item_num = o.rand_array[i]; } else { item_num = i; } /* make adjustments if in leaf only mode*/ if (o.leaf_only) { item_num += o.items_per_dir * (o.num_dirs_in_tree - (uint64_t) pow( o.branch_factor, o.depth )); } /* create name of file/dir to stat */ if (dirs) { if ( (i % ITEM_COUNT == 0) && (i != 0)) { VERBOSE(3,5,"stat dir: "LLU"", i); } sprintf(item, "dir.%s"LLU"", o.stat_name, item_num); } else { if ( (i % ITEM_COUNT == 0) && (i != 0)) { VERBOSE(3,5,"stat file: "LLU"", i); } sprintf(item, "file.%s"LLU"", o.stat_name, item_num); } /* determine the path to the file/dir to be stat'ed */ parent_dir = item_num / o.items_per_dir; if (parent_dir > 0) { //item is not in tree's root directory /* prepend parent directory to item's path */ sprintf(temp, "%s."LLU"/%s", o.base_tree_name, parent_dir, item); strcpy(item, temp); //still not at the tree's root dir while (parent_dir > o.branch_factor) { parent_dir = (uint64_t) ((parent_dir-1) / o.branch_factor); sprintf(temp, "%s."LLU"/%s", o.base_tree_name, parent_dir, item); strcpy(item, temp); } } /* Now get item to have the full path */ sprintf( temp, "%s/%s", path, item ); strcpy( item, temp ); /* below temp used to be hiername */ VERBOSE(3,5,"mdtest_stat %4s: %s", (dirs ? "dir" : "file"), item); if (-1 == o.backend->stat (item, &buf, o.backend_options)) { EWARNF("unable to stat %s %s", dirs ? "directory" : "file", item); } } } /* reads all of the items created as specified by the input parameters */ void mdtest_read(int random, int dirs, const long dir_iter, char *path) { uint64_t parent_dir, item_num = 0; char item[MAX_PATHLEN], temp[MAX_PATHLEN]; aiori_fd_t *aiori_fh; VERBOSE(1,-1,"Entering mdtest_read on %s", path ); char *read_buffer; /* allocate read buffer */ if (o.read_bytes > 0) { read_buffer = aligned_buffer_alloc(o.read_bytes, o.gpu_memory_flags); memset(read_buffer, -1, o.read_bytes); } uint64_t stop_items = o.items; if( o.directory_loops != 1 ){ stop_items = o.items_per_dir; } /* iterate over all of the item IDs */ for (uint64_t i = 0 ; i < stop_items ; ++i) { /* * It doesn't make sense to pass the address of the array because that would * be like passing char **. Tested it on a Cray and it seems to work either * way, but it seems that it is correct without the "&". * * NTH: Both are technically correct in C. * * memset(&item, 0, MAX_PATHLEN); */ memset(item, 0, MAX_PATHLEN); memset(temp, 0, MAX_PATHLEN); /* determine the item number to read */ if (random) { item_num = o.rand_array[i]; } else { item_num = i; } /* make adjustments if in leaf only mode*/ if (o.leaf_only) { item_num += o.items_per_dir * (o.num_dirs_in_tree - (uint64_t) pow (o.branch_factor, o.depth)); } /* create name of file to read */ if (!dirs) { if ((i%ITEM_COUNT == 0) && (i != 0)) { VERBOSE(3,5,"read file: "LLU"", i); } sprintf(item, "file.%s"LLU"", o.read_name, item_num); } /* determine the path to the file/dir to be read'ed */ parent_dir = item_num / o.items_per_dir; if (parent_dir > 0) { //item is not in tree's root directory /* prepend parent directory to item's path */ sprintf(temp, "%s."LLU"/%s", o.base_tree_name, parent_dir, item); strcpy(item, temp); /* still not at the tree's root dir */ while (parent_dir > o.branch_factor) { parent_dir = (unsigned long long) ((parent_dir-1) / o.branch_factor); sprintf(temp, "%s."LLU"/%s", o.base_tree_name, parent_dir, item); strcpy(item, temp); } } /* Now get item to have the full path */ sprintf( temp, "%s/%s", path, item ); strcpy( item, temp ); /* below temp used to be hiername */ VERBOSE(3,5,"mdtest_read file: %s", item); o.hints.filePerProc = ! o.shared_file; /* open file for reading */ aiori_fh = o.backend->open (item, O_RDONLY, o.backend_options); if (NULL == aiori_fh) { EWARNF("unable to open file %s", item); continue; } /* read file */ if (o.read_bytes > 0) { read_buffer[0] = 42; if (o.read_bytes != (size_t) o.backend->xfer(READ, aiori_fh, (IOR_size_t *) read_buffer, o.read_bytes, 0, o.backend_options)) { EWARNF("unable to read file %s", item); continue; } int pretend_rank = (2 * o.nstride + rank) % o.size; if(o.verify_read){ if (o.shared_file) { pretend_rank = rank; } int error = verify_memory_pattern(item_num, read_buffer, o.read_bytes, o.random_buffer_offset, pretend_rank, o.dataPacketType); o.verification_error += error; if(error){ VERBOSE(1,1,"verification error in file: %s", item); } } } /* close file */ o.backend->close (aiori_fh, o.backend_options); } if(o.read_bytes){ aligned_buffer_free(read_buffer, o.gpu_memory_flags); } } /* This method should be called by rank 0. It subsequently does all of the creates and removes for the other ranks */ void collective_create_remove(const int create, const int dirs, const int ntasks, const char *path, rank_progress_t * progress) { char temp[MAX_PATHLEN]; VERBOSE(1,-1,"Entering collective_create_remove on %s", path ); /* rank 0 does all of the creates and removes for all of the ranks */ for (int i = 0 ; i < ntasks ; ++i) { memset(temp, 0, MAX_PATHLEN); strcpy(temp, o.testdir); strcat(temp, "/"); /* set the base tree name appropriately */ if (o.unique_dir_per_task) { sprintf(o.base_tree_name, "mdtest_tree.%d", i); } else { sprintf(o.base_tree_name, "mdtest_tree"); } /* Setup to do I/O to the appropriate test dir */ strcat(temp, o.base_tree_name); strcat(temp, ".0"); /* set all item names appropriately */ if (! o.shared_file) { sprintf(o.mk_name, "mdtest.%d.", (i+(0*o.nstride))%ntasks); sprintf(o.stat_name, "mdtest.%d.", (i+(1*o.nstride))%ntasks); sprintf(o.read_name, "mdtest.%d.", (i+(2*o.nstride))%ntasks); sprintf(o.rm_name, "mdtest.%d.", (i+(3*o.nstride))%ntasks); } if (o.unique_dir_per_task) { VERBOSE(3,5,"i %d nstride %d ntasks %d", i, o.nstride, ntasks); sprintf(o.unique_mk_dir, "%s/mdtest_tree.%d.0", o.testdir, (i+(0*o.nstride))%ntasks); sprintf(o.unique_chdir_dir, "%s/mdtest_tree.%d.0", o.testdir, (i+(1*o.nstride))%ntasks); sprintf(o.unique_stat_dir, "%s/mdtest_tree.%d.0", o.testdir, (i+(2*o.nstride))%ntasks); sprintf(o.unique_read_dir, "%s/mdtest_tree.%d.0", o.testdir, (i+(3*o.nstride))%ntasks); sprintf(o.unique_rm_dir, "%s/mdtest_tree.%d.0", o.testdir, (i+(4*o.nstride))%ntasks); sprintf(o.unique_rm_uni_dir, "%s", o.testdir); } /* Now that everything is set up as it should be, do the create or remove */ VERBOSE(3,5,"collective_create_remove (create_remove_items): temp is '%s'", temp); create_remove_items(0, dirs, create, 1, temp, 0, progress); } /* reset all of the item names */ if (o.unique_dir_per_task) { sprintf(o.base_tree_name, "mdtest_tree.0"); } else { sprintf(o.base_tree_name, "mdtest_tree"); } if (! o.shared_file) { sprintf(o.mk_name, "mdtest.%d.", (0+(0*o.nstride))%ntasks); sprintf(o.stat_name, "mdtest.%d.", (0+(1*o.nstride))%ntasks); sprintf(o.read_name, "mdtest.%d.", (0+(2*o.nstride))%ntasks); sprintf(o.rm_name, "mdtest.%d.", (0+(3*o.nstride))%ntasks); } if (o.unique_dir_per_task) { sprintf(o.unique_mk_dir, "%s/mdtest_tree.%d.0", o.testdir, (0+(0*o.nstride))%ntasks); sprintf(o.unique_chdir_dir, "%s/mdtest_tree.%d.0", o.testdir, (0+(1*o.nstride))%ntasks); sprintf(o.unique_stat_dir, "%s/mdtest_tree.%d.0", o.testdir, (0+(2*o.nstride))%ntasks); sprintf(o.unique_read_dir, "%s/mdtest_tree.%d.0", o.testdir, (0+(3*o.nstride))%ntasks); sprintf(o.unique_rm_dir, "%s/mdtest_tree.%d.0", o.testdir, (0+(4*o.nstride))%ntasks); sprintf(o.unique_rm_uni_dir, "%s", o.testdir); } } void rename_dir_test(const int dirs, const long dir_iter, const char *path, rank_progress_t * progress) { uint64_t parent_dir, item_num = 0; char item[MAX_PATHLEN], temp[MAX_PATHLEN]; char item_last[MAX_PATHLEN]; if(o.backend->rename == NULL){ WARN("Backend doesn't support rename\n"); return; } VERBOSE(1,-1,"Entering mdtest_rename on %s", path ); uint64_t stop_items = o.items; if( o.directory_loops != 1 ){ stop_items = o.items_per_dir; } if(stop_items == 1) return; /* iterate over all of the item IDs */ char first_item_name[MAX_PATHLEN]; for (uint64_t i = 0 ; i < stop_items; ++i) { item_num = i; /* make adjustments if in leaf only mode*/ if (o.leaf_only) { item_num += o.items_per_dir * (o.num_dirs_in_tree - (uint64_t) pow( o.branch_factor, o.depth )); } /* create name of file/dir to stat */ if (dirs) { sprintf(item, "dir.%s"LLU"", o.stat_name, item_num); } else { sprintf(item, "file.%s"LLU"", o.stat_name, item_num); } /* determine the path to the file/dir to be stat'ed */ parent_dir = item_num / o.items_per_dir; if (parent_dir > 0) { //item is not in tree's root directory /* prepend parent directory to item's path */ sprintf(temp, "%s."LLU"/%s", o.base_tree_name, parent_dir, item); strcpy(item, temp); //still not at the tree's root dir while (parent_dir > o.branch_factor) { parent_dir = (uint64_t) ((parent_dir-1) / o.branch_factor); sprintf(temp, "%s."LLU"/%s", o.base_tree_name, parent_dir, item); strcpy(item, temp); } } /* Now get item to have the full path */ sprintf( temp, "%s/%s", path, item ); strcpy( item, temp ); VERBOSE(3,5,"mdtest_rename %4s: %s", (dirs ? "dir" : "file"), item); if(i == 0){ sprintf(first_item_name, "%s-XX", item); strcpy(item_last, first_item_name); }else if(i == stop_items - 1){ strcpy(item, first_item_name); } if (-1 == o.backend->rename(item, item_last, o.backend_options)) { EWARNF("unable to rename %s %s", dirs ? "directory" : "file", item); } strcpy(item_last, item); } } static void updateResult(mdtest_results_t * res, mdtest_test_num_t test, uint64_t item_count, double t_start, double t_end, double t_end_before_barrier){ res->time[test] = t_end - t_start; if(isfinite(t_end_before_barrier)){ res->time_before_barrier[test] = t_end_before_barrier - t_start; }else{ res->time_before_barrier[test] = res->time[test]; } if(item_count == 0){ res->rate[test] = 0.0; res->rate_before_barrier[test] = 0.0; }else{ res->rate[test] = item_count/res->time[test]; res->rate_before_barrier[test] = item_count/res->time_before_barrier[test]; } res->items[test] = item_count; res->stonewall_last_item[test] = o.items; } static void sync_dir(const char *testdir, const char *path) { char temp_path[MAX_PATHLEN]; sprintf(temp_path, "%s/%s", testdir, path); int fd = open(temp_path, O_RDONLY); if (fd >= 0) { fsync(fd); close(fd); } else { printf("failed to open %s for fsync: %s\n", temp_path, strerror(errno)); } } void directory_test(const int iteration, const int ntasks, const char *path, rank_progress_t * progress) { int size; double t_start, t_end, t_end_before_barrier; char temp_path[MAX_PATHLEN]; mdtest_results_t * res = & o.summary_table[iteration]; MPI_Comm_size(testComm, &size); VERBOSE(1,-1,"Entering directory_test on %s", path ); MPI_Barrier(testComm); /* create phase */ if(o.create_only) { phase_prepare(); t_start = GetTimeStamp(); progress->stone_wall_timer_seconds = o.stone_wall_timer_seconds; progress->items_done = 0; progress->start_time = GetTimeStamp(); for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){ prep_testdir(iteration, dir_iter); if (o.unique_dir_per_task) { unique_dir_access(MK_UNI_DIR, temp_path); if (! o.time_unique_dir_overhead) { t_start = GetTimeStamp(); } } else { sprintf( temp_path, "%s/%s", o.testdir, path ); } VERBOSE(3,-1,"directory_test: create path is '%s'", temp_path ); /* "touch" the files */ if (o.collective_creates) { if (rank == 0) { collective_create_remove(1, 1, ntasks, temp_path, progress); } } else { /* create directories */ create_remove_items(0, 1, 1, 0, temp_path, 0, progress); } } sync_dir(o.testdir, path); progress->stone_wall_timer_seconds = 0; t_end_before_barrier = GetTimeStamp(); phase_end(); t_end = GetTimeStamp(); updateResult(res, MDTEST_DIR_CREATE_NUM, o.items, t_start, t_end, t_end_before_barrier); } /* stat phase */ if (o.stat_only) { phase_prepare(); t_start = GetTimeStamp(); for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){ prep_testdir(iteration, dir_iter); if (o.unique_dir_per_task) { unique_dir_access(STAT_SUB_DIR, temp_path); if (! o.time_unique_dir_overhead) { t_start = GetTimeStamp(); } } else { sprintf( temp_path, "%s/%s", o.testdir, path ); } VERBOSE(3,5,"stat path is '%s'", temp_path ); /* stat directories */ if (o.random_seed > 0) { mdtest_stat(1, 1, dir_iter, temp_path, progress); } else { mdtest_stat(0, 1, dir_iter, temp_path, progress); } } sync_dir(o.testdir, path); t_end_before_barrier = GetTimeStamp(); phase_end(); t_end = GetTimeStamp(); updateResult(res, MDTEST_DIR_STAT_NUM, o.items, t_start, t_end, t_end_before_barrier); } /* read phase */ if (o.read_only) { phase_prepare(); t_start = GetTimeStamp(); for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){ prep_testdir(iteration, dir_iter); if (o.unique_dir_per_task) { unique_dir_access(READ_SUB_DIR, temp_path); if (! o.time_unique_dir_overhead) { t_start = GetTimeStamp(); } } else { sprintf( temp_path, "%s/%s", o.testdir, path ); } VERBOSE(3,5,"directory_test: read path is '%s'", temp_path ); /* read directories */ if (o.random_seed > 0) { ; /* N/A */ } else { ; /* N/A */ } } sync_dir(o.testdir, path); t_end_before_barrier = GetTimeStamp(); phase_end(); t_end = GetTimeStamp(); updateResult(res, MDTEST_DIR_READ_NUM, o.items, t_start, t_end, t_end_before_barrier); } /* rename phase */ if(o.rename_dirs && o.items > 1){ phase_prepare(); t_start = GetTimeStamp(); for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){ prep_testdir(iteration, dir_iter); if (o.unique_dir_per_task) { unique_dir_access(STAT_SUB_DIR, temp_path); if (! o.time_unique_dir_overhead) { t_start = GetTimeStamp(); } } else { sprintf( temp_path, "%s/%s", o.testdir, path ); } VERBOSE(3,5,"rename path is '%s'", temp_path ); rename_dir_test(1, dir_iter, temp_path, progress); } sync_dir(o.testdir, path); t_end_before_barrier = GetTimeStamp(); phase_end(); t_end = GetTimeStamp(); updateResult(res, MDTEST_DIR_RENAME_NUM, o.items, t_start, t_end, t_end_before_barrier); } /* remove phase */ if (o.remove_only) { phase_prepare(); t_start = GetTimeStamp(); for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){ prep_testdir(iteration, dir_iter); if (o.unique_dir_per_task) { unique_dir_access(RM_SUB_DIR, temp_path); if (!o.time_unique_dir_overhead) { t_start = GetTimeStamp(); } } else { sprintf( temp_path, "%s/%s", o.testdir, path ); } VERBOSE(3,5,"directory_test: remove directories path is '%s'", temp_path ); /* remove directories */ if (o.collective_creates) { if (rank == 0) { collective_create_remove(0, 1, ntasks, temp_path, progress); } } else { create_remove_items(0, 1, 0, 0, temp_path, 0, progress); } } sync_dir(o.testdir, path); t_end_before_barrier = GetTimeStamp(); phase_end(); t_end = GetTimeStamp(); updateResult(res, MDTEST_DIR_REMOVE_NUM, o.items, t_start, t_end, t_end_before_barrier); } if (o.remove_only) { if (o.unique_dir_per_task) { unique_dir_access(RM_UNI_DIR, temp_path); } else { sprintf( temp_path, "%s/%s", o.testdir, path ); } VERBOSE(3,5,"directory_test: remove unique directories path is '%s'\n", temp_path ); } VERBOSE(1,-1," Directory creation: %14.3f sec, %14.3f ops/sec", res->time[MDTEST_DIR_CREATE_NUM], o.summary_table[iteration].rate[MDTEST_DIR_CREATE_NUM]); VERBOSE(1,-1," Directory stat : %14.3f sec, %14.3f ops/sec", res->time[MDTEST_DIR_STAT_NUM], o.summary_table[iteration].rate[MDTEST_DIR_STAT_NUM]); VERBOSE(1,-1," Directory rename : %14.3f sec, %14.3f ops/sec", res->time[MDTEST_DIR_RENAME_NUM], o.summary_table[iteration].rate[MDTEST_DIR_RENAME_NUM]); VERBOSE(1,-1," Directory removal : %14.3f sec, %14.3f ops/sec", res->time[MDTEST_DIR_REMOVE_NUM], o.summary_table[iteration].rate[MDTEST_DIR_REMOVE_NUM]); } /* Returns if the stonewall was hit */ int updateStoneWallIterations(int iteration, uint64_t items_done, double tstart, uint64_t * out_max_iter){ int hit = 0; long long unsigned max_iter = 0; VERBOSE(1,1,"stonewall hit with %lld items", (long long) items_done ); MPI_Allreduce(& items_done, & max_iter, 1, MPI_LONG_LONG_INT, MPI_MAX, testComm); o.summary_table[iteration].stonewall_time[MDTEST_FILE_CREATE_NUM] = GetTimeStamp() - tstart; o.summary_table[iteration].stonewall_last_item[MDTEST_FILE_CREATE_NUM] = items_done; *out_max_iter = max_iter; // continue to the maximum... long long min_accessed = 0; MPI_Reduce(& items_done, & min_accessed, 1, MPI_LONG_LONG_INT, MPI_MIN, 0, testComm); long long sum_accessed = 0; MPI_Reduce(& items_done, & sum_accessed, 1, MPI_LONG_LONG_INT, MPI_SUM, 0, testComm); o.summary_table[iteration].stonewall_item_sum[MDTEST_FILE_CREATE_NUM] = sum_accessed; o.summary_table[iteration].stonewall_item_min[MDTEST_FILE_CREATE_NUM] = min_accessed * o.size; if(o.items != (sum_accessed / o.size)){ VERBOSE(0,-1, "Continue stonewall hit min: %lld max: %lld avg: %.1f \n", min_accessed, max_iter, ((double) sum_accessed) / o.size); hit = 1; } return hit; } void file_test_create(const int iteration, const int ntasks, const char *path, rank_progress_t * progress, double *t_start){ char temp_path[MAX_PATHLEN]; for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){ prep_testdir(iteration, dir_iter); if (o.unique_dir_per_task) { unique_dir_access(MK_UNI_DIR, temp_path); VERBOSE(5,5,"operating on %s", temp_path); if (! o.time_unique_dir_overhead) { *t_start = GetTimeStamp(); } } else { sprintf( temp_path, "%s/%s", o.testdir, path ); } VERBOSE(3,-1,"file_test: create path is '%s'", temp_path ); /* "touch" the files */ if (o.collective_creates) { if (rank == 0) { collective_create_remove(1, 0, ntasks, temp_path, progress); } MPI_Barrier(testComm); } /* create files */ create_remove_items(0, 0, 1, 0, temp_path, 0, progress); if(o.stone_wall_timer_seconds){ // hit the stonewall uint64_t max_iter = 0; uint64_t items_done = progress->items_done + dir_iter * o.items_per_dir; int hit = updateStoneWallIterations(iteration, items_done, *t_start, & max_iter); progress->items_start = items_done; progress->items_per_dir = max_iter; if (hit){ progress->stone_wall_timer_seconds = 0; VERBOSE(1,1,"stonewall: %lld of %lld", (long long) progress->items_start, (long long) progress->items_per_dir); create_remove_items(0, 0, 1, 0, temp_path, 0, progress); // now reset the values progress->stone_wall_timer_seconds = o.stone_wall_timer_seconds; o.items = progress->items_done; } if (o.stoneWallingStatusFile){ StoreStoneWallingIterations(o.stoneWallingStatusFile, max_iter); } // reset stone wall timer to allow proper cleanup progress->stone_wall_timer_seconds = 0; // at the moment, stonewall can be done only with one directory_loop, so we can return here safely break; } } } void file_test(const int iteration, const int ntasks, const char *path, rank_progress_t * progress) { int size; double t_start, t_end, t_end_before_barrier; char temp_path[MAX_PATHLEN]; mdtest_results_t * res = & o.summary_table[iteration]; MPI_Comm_size(testComm, &size); VERBOSE(3,5,"Entering file_test on %s", path); MPI_Barrier(testComm); /* create phase */ if (o.create_only ) { phase_prepare(); t_start = GetTimeStamp(); progress->stone_wall_timer_seconds = o.stone_wall_timer_seconds; progress->items_done = 0; progress->start_time = GetTimeStamp(); file_test_create(iteration, ntasks, path, progress, &t_start); sync_dir(o.testdir, path); t_end_before_barrier = GetTimeStamp(); phase_end(); t_end = GetTimeStamp(); updateResult(res, MDTEST_FILE_CREATE_NUM, o.items, t_start, t_end, t_end_before_barrier); }else{ if (o.stoneWallingStatusFile){ int64_t expected_items; /* The number of items depends on the stonewalling file */ expected_items = ReadStoneWallingIterations(o.stoneWallingStatusFile, testComm); if(expected_items >= 0){ if(o.directory_loops > 1){ o.directory_loops = expected_items / o.items_per_dir; o.items = o.items_per_dir; }else{ o.items = expected_items; progress->items_per_dir = o.items; } } if (rank == 0) { if(expected_items == -1){ WARN("Could not read stonewall status file"); }else { VERBOSE(1,1, "Read stonewall status; items: "LLU"\n", o.items); } } } } /* stat phase */ if (o.stat_only ) { phase_prepare(); t_start = GetTimeStamp(); for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){ prep_testdir(iteration, dir_iter); if (o.unique_dir_per_task) { unique_dir_access(STAT_SUB_DIR, temp_path); if (!o.time_unique_dir_overhead) { t_start = GetTimeStamp(); } } else { sprintf( temp_path, "%s/%s", o.testdir, path ); } VERBOSE(3,5,"file_test: stat path is '%s'", temp_path ); /* stat files */ mdtest_stat((o.random_seed > 0 ? 1 : 0), 0, dir_iter, temp_path, progress); } sync_dir(o.testdir, path); t_end_before_barrier = GetTimeStamp(); phase_end(); t_end = GetTimeStamp(); updateResult(res, MDTEST_FILE_STAT_NUM, o.items, t_start, t_end, t_end_before_barrier); } /* read phase */ if (o.read_only ) { phase_prepare(); t_start = GetTimeStamp(); for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){ prep_testdir(iteration, dir_iter); if (o.unique_dir_per_task) { unique_dir_access(READ_SUB_DIR, temp_path); if (! o.time_unique_dir_overhead) { t_start = GetTimeStamp(); } } else { sprintf( temp_path, "%s/%s", o.testdir, path ); } VERBOSE(3,5,"file_test: read path is '%s'", temp_path ); /* read files */ if (o.random_seed > 0) { mdtest_read(1,0, dir_iter, temp_path); } else { mdtest_read(0,0, dir_iter, temp_path); } } sync_dir(o.testdir, path); t_end_before_barrier = GetTimeStamp(); phase_end(); t_end = GetTimeStamp(); updateResult(res, MDTEST_FILE_READ_NUM, o.items, t_start, t_end, t_end_before_barrier); } /* remove phase */ if (o.remove_only) { phase_prepare(); t_start = GetTimeStamp(); progress->items_start = 0; for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){ prep_testdir(iteration, dir_iter); if (o.unique_dir_per_task) { unique_dir_access(RM_SUB_DIR, temp_path); if (! o.time_unique_dir_overhead) { t_start = GetTimeStamp(); } } else { sprintf( temp_path, "%s/%s", o.testdir, path ); } VERBOSE(3,5,"file_test: rm directories path is '%s'", temp_path ); if (o.collective_creates) { if (rank == 0) { collective_create_remove(0, 0, ntasks, temp_path, progress); } } else { VERBOSE(3,5,"gonna create %s", temp_path); create_remove_items(0, 0, 0, 0, temp_path, 0, progress); } } sync_dir(o.testdir, path); t_end_before_barrier = GetTimeStamp(); phase_end(); t_end = GetTimeStamp(); updateResult(res, MDTEST_FILE_REMOVE_NUM, o.items, t_start, t_end, t_end_before_barrier); } if (o.remove_only) { if (o.unique_dir_per_task) { unique_dir_access(RM_UNI_DIR, temp_path); } else { strcpy( temp_path, path ); } VERBOSE(3,5,"file_test: rm unique directories path is '%s'", temp_path ); } if(o.num_dirs_in_tree_calc){ /* this is temporary fix needed when using -n and -i together */ o.items *= o.num_dirs_in_tree_calc; } VERBOSE(1,-1," File creation : %14.3f sec, %14.3f ops/sec", res->time[MDTEST_FILE_CREATE_NUM], o.summary_table[iteration].rate[MDTEST_FILE_CREATE_NUM]); if(o.summary_table[iteration].stonewall_time[MDTEST_FILE_CREATE_NUM]){ VERBOSE(1,-1," File creation (stonewall): %14.3f sec, %14.3f ops/sec", o.summary_table[iteration].stonewall_time[MDTEST_FILE_CREATE_NUM], o.summary_table[iteration].stonewall_item_sum[MDTEST_FILE_CREATE_NUM]); } VERBOSE(1,-1," File stat : %14.3f sec, %14.3f ops/sec", res->time[MDTEST_FILE_STAT_NUM], o.summary_table[iteration].rate[MDTEST_FILE_STAT_NUM]); VERBOSE(1,-1," File read : %14.3f sec, %14.3f ops/sec", res->time[MDTEST_FILE_READ_NUM], o.summary_table[iteration].rate[MDTEST_FILE_READ_NUM]); VERBOSE(1,-1," File removal : %14.3f sec, %14.3f ops/sec", res->time[MDTEST_FILE_REMOVE_NUM], o.summary_table[iteration].rate[MDTEST_FILE_REMOVE_NUM]); } char const * mdtest_test_name(int i){ switch (i) { case MDTEST_DIR_CREATE_NUM: return "Directory creation"; case MDTEST_DIR_STAT_NUM: return "Directory stat"; case MDTEST_DIR_READ_NUM: return "Directory read"; case MDTEST_DIR_REMOVE_NUM: return "Directory removal"; case MDTEST_DIR_RENAME_NUM: return "Directory rename"; case MDTEST_FILE_CREATE_NUM: return "File creation"; case MDTEST_FILE_STAT_NUM: return "File stat"; case MDTEST_FILE_READ_NUM: return "File read"; case MDTEST_FILE_REMOVE_NUM: return "File removal"; case MDTEST_TREE_CREATE_NUM: return "Tree creation"; case MDTEST_TREE_REMOVE_NUM: return "Tree removal"; default: return "ERR INVALID TESTNAME :"; } return NULL; } /* * Store the results of each process in a file */ static void StoreRankInformation(int iterations, mdtest_results_t * agg){ const size_t size = sizeof(mdtest_results_t) * iterations; if(rank == 0){ FILE* fd = fopen(o.saveRankDetailsCSV, "a"); if (fd == NULL){ FAIL("Cannot open saveRankPerformanceDetails file for writes!"); } mdtest_results_t * results = safeMalloc(size * o.size); MPI_Gather(o.summary_table, size / sizeof(double), MPI_DOUBLE, results, size / sizeof(double), MPI_DOUBLE, 0, testComm); char buff[4096]; char * cpos = buff; cpos += sprintf(cpos, "all,%llu", (long long unsigned) o.items); for(int e = 0; e < MDTEST_LAST_NUM; e++){ if(agg->items[e] == 0){ cpos += sprintf(cpos, ",,"); }else{ cpos += sprintf(cpos, ",%.10e,%.10e", agg->items[e] / agg->time[e], agg->time[e]); } } cpos += sprintf(cpos, "\n"); int ret = fwrite(buff, cpos - buff, 1, fd); for(int iter = 0; iter < iterations; iter++){ for(int i=0; i < o.size; i++){ mdtest_results_t * cur = & results[i * iterations + iter]; cpos = buff; cpos += sprintf(cpos, "%d,", i); for(int e = 0; e < MDTEST_TREE_CREATE_NUM; e++){ if(cur->items[e] == 0){ cpos += sprintf(cpos, ",,"); }else{ cpos += sprintf(cpos, ",%.10e,%.10e", cur->items[e] / cur->time_before_barrier[e], cur->time_before_barrier[e]); } } cpos += sprintf(cpos, "\n"); ret = fwrite(buff, cpos - buff, 1, fd); if(ret != 1){ WARN("Couln't append to saveRankPerformanceDetailsCSV file\n"); break; } } } fclose(fd); free(results); }else{ /* this is a hack for now assuming all datatypes in the structure are double */ MPI_Gather(o.summary_table, size / sizeof(double), MPI_DOUBLE, NULL, size / sizeof(double), MPI_DOUBLE, 0, testComm); } } static mdtest_results_t* get_result_index(mdtest_results_t* all_results, int proc, int iter, int interation_count){ return & all_results[proc * interation_count + iter]; } static void summarize_results_rank0(int iterations, mdtest_results_t * all_results, int print_time) { int start, stop; double min, max, mean, sd, sum, var, curr = 0; double imin, imax, imean, isum, icur; // calculation per iteration char const * access; /* if files only access, skip entries 0-3 (the dir tests) */ if (o.files_only && ! o.dirs_only) { start = MDTEST_FILE_CREATE_NUM; } else { start = 0; } /* if directories only access, skip entries 4-7 (the file tests) */ if (o.dirs_only && !o.files_only) { stop = MDTEST_FILE_CREATE_NUM; } else { stop = MDTEST_TREE_CREATE_NUM; } /* special case: if no directory or file tests, skip all */ if (!o.dirs_only && !o.files_only) { start = stop = 0; } if(o.print_all_proc){ fprintf(out_logfile, "\nPer process result (%s):\n", print_time ? "time" : "rate"); for (int j = 0; j < iterations; j++) { fprintf(out_logfile, "iteration: %d\n", j); for (int i = start; i < MDTEST_LAST_NUM; i++) { access = mdtest_test_name(i); if(access == NULL){ continue; } fprintf(out_logfile, "Test %s", access); for (int k=0; k < o.size; k++) { mdtest_results_t * cur = get_result_index(all_results, k, j, iterations); if(print_time){ curr = cur->time_before_barrier[i]; }else{ curr = cur->rate_before_barrier[i]; } fprintf(out_logfile, "%c%e", (k==0 ? ' ': ','), curr); } fprintf(out_logfile, "\n"); } } } VERBOSE(0, -1, "\nSUMMARY %s: (of %d iterations)", print_time ? "time" : "rate", iterations); PRINT(" Operation "); if(o.show_perrank_statistics){ PRINT("per Rank: Max Min Mean per Iteration:"); }else{ PRINT(" "); } PRINT(" Max Min Mean Std Dev\n"); PRINT(" --------- "); if(o.show_perrank_statistics){ PRINT(" --- --- ---- "); } PRINT(" --- --- ---- -------\n"); for (int i = start; i < stop; i++) { min = 1e308; max = 0; sum = var = 0; imin = 1e308; isum = imax = 0; double iter_result[iterations]; for (int j = 0; j < iterations; j++) { icur = print_time ? 0 : 1e308; for (int k = 0; k < o.size; k++) { mdtest_results_t * cur = get_result_index(all_results, k, j, iterations); if(print_time){ curr = cur->time_before_barrier[i]; }else{ curr = cur->rate_before_barrier[i]; } if (min > curr) { min = curr; } if (max < curr) { max = curr; } sum += curr; if (print_time) { curr = cur->time[i]; if (icur < curr) { icur = curr; } } else { curr = cur->rate[i]; if (icur > curr) { icur = curr; } } } if (icur > imax) { imax = icur; } if (icur < imin) { imin = icur; } isum += icur; if(print_time){ iter_result[j] = icur; }else{ iter_result[j] = icur * o.size; } } mean = sum / iterations / o.size; imean = isum / iterations; if(! print_time){ imax *= o.size; imin *= o.size; isum *= o.size; imean *= o.size; } for (int j = 0; j < iterations; j++) { var += (imean - iter_result[j]) * (imean - iter_result[j]); } var = var / (iterations - 1); sd = sqrt(var); access = mdtest_test_name(i); if (i != 2) { fprintf(out_logfile, " %-18s ", access); if(o.show_perrank_statistics){ fprintf(out_logfile, "%14.3f ", max); fprintf(out_logfile, "%14.3f ", min); fprintf(out_logfile, "%14.3f ", mean); fprintf(out_logfile, " "); } fprintf(out_logfile, " "); fprintf(out_logfile, "%14.3f ", imax); fprintf(out_logfile, "%14.3f ", imin); fprintf(out_logfile, "%14.3f ", imean); fprintf(out_logfile, "%14.3f\n", iterations == 1 ? 0 : sd); fflush(out_logfile); } } /* calculate tree create/remove rates, applies only to Rank 0 */ for (int i = MDTEST_TREE_CREATE_NUM; i < MDTEST_LAST_NUM; i++) { min = imin = 1e308; max = imax = 0; sum = var = 0; for (int j = 0; j < iterations; j++) { if(print_time){ curr = o.summary_table[j].time[i]; }else{ curr = o.summary_table[j].rate[i]; } if (min > curr) { min = curr; } if (max < curr) { max = curr; } sum += curr; if(curr > imax){ imax = curr; } if(curr < imin){ imin = curr; } } mean = sum / (iterations); for (int j = 0; j < iterations; j++) { if(print_time){ curr = o.summary_table[j].time[i]; }else{ curr = o.summary_table[j].rate[i]; } var += (mean - curr)*(mean - curr); } var = var / (iterations - 1); sd = sqrt(var); access = mdtest_test_name(i); fprintf(out_logfile, " %-22s ", access); if(o.show_perrank_statistics){ fprintf(out_logfile, "%14.3f ", max); fprintf(out_logfile, "%14.3f ", min); fprintf(out_logfile, "%14.3f ", mean); fprintf(out_logfile, " "); } fprintf(out_logfile, "%14.3f ", imax); fprintf(out_logfile, "%14.3f ", imin); fprintf(out_logfile, "%14.3f ", sum / iterations); fprintf(out_logfile, "%14.3f\n", iterations == 1 ? 0 : sd); fflush(out_logfile); } } /* Output the results and summarize them into rank 0's o.summary_table */ void summarize_results(int iterations, mdtest_results_t * results) { const size_t size = sizeof(mdtest_results_t) * iterations; mdtest_results_t * all_results = NULL; if(rank == 0){ all_results = safeMalloc(size * o.size); memset(all_results, 0, size * o.size); MPI_Gather(o.summary_table, size / sizeof(double), MPI_DOUBLE, all_results, size / sizeof(double), MPI_DOUBLE, 0, testComm); // calculate the aggregated values for all processes for(int j=0; j < iterations; j++){ for(int i=0; i < MDTEST_LAST_NUM; i++){ //double sum_rate = 0; double max_time = 0; double max_stonewall_time = 0; uint64_t sum_items = 0; // reduce over the processes for(int p=0; p < o.size; p++){ mdtest_results_t * cur = get_result_index(all_results, p, j, iterations); //sum_rate += all_results[p + j*p]->rate[i]; double t = cur->time[i]; max_time = max_time < t ? t : max_time; sum_items += cur->items[i]; t = cur->stonewall_time[i]; max_stonewall_time = max_stonewall_time < t ? t : max_stonewall_time; } results[j].items[i] = sum_items; results[j].time[i] = max_time; results[j].stonewall_time[i] = max_stonewall_time; if(sum_items == 0){ results[j].rate[i] = 0.0; }else{ results[j].rate[i] = sum_items / max_time; } /* These results have already been reduced to Rank 0 */ results[j].stonewall_item_sum[i] = o.summary_table[j].stonewall_item_sum[i]; results[j].stonewall_item_min[i] = o.summary_table[j].stonewall_item_min[i]; results[j].stonewall_time[i] = o.summary_table[j].stonewall_time[i]; } } }else{ MPI_Gather(o.summary_table, size / sizeof(double), MPI_DOUBLE, NULL, size / sizeof(double), MPI_DOUBLE, 0, testComm); } /* share global results across processes as these are returned by the API */ MPI_Bcast(results, size / sizeof(double), MPI_DOUBLE, 0, testComm); /* update relevant result values with local values as these are returned by the API */ for(int j=0; j < iterations; j++){ for(int i=0; i < MDTEST_LAST_NUM; i++){ results[j].time_before_barrier[i] = o.summary_table[j].time_before_barrier[i]; results[j].stonewall_last_item[i] = o.summary_table[j].stonewall_last_item[i]; } } if(rank != 0){ return; } if (o.print_rate_and_time){ summarize_results_rank0(iterations, all_results, 0); summarize_results_rank0(iterations, all_results, 1); }else{ summarize_results_rank0(iterations, all_results, o.print_time); } free(all_results); } /* Checks to see if the test setup is valid. If it isn't, fail. */ void md_validate_tests() { if (((o.stone_wall_timer_seconds > 0) && (o.branch_factor > 1)) || ! o.barriers) { FAIL( "Error, stone wall timer does only work with a branch factor <= 1 (current is %d) and with barriers\n", o.branch_factor); } if (!o.create_only && ! o.stat_only && ! o.read_only && !o.remove_only && !o.rename_dirs) { o.create_only = o.stat_only = o.read_only = o.remove_only = o.rename_dirs = 1; VERBOSE(1,-1,"main: Setting create/stat/read/remove_only to True" ); } VERBOSE(1,-1,"Entering md_validate_tests..." ); /* if dirs_only and files_only were both left unset, set both now */ if (!o.dirs_only && !o.files_only) { o.dirs_only = o.files_only = 1; } /* if shared file 'S' access, no directory tests */ if (o.shared_file) { o.dirs_only = 0; } /* check for no barriers with shifting processes for different phases. that is, one may not specify both -B and -N as it will introduce race conditions that may cause errors stat'ing or deleting after creates. */ if (( o.barriers == 0 ) && ( o.nstride != 0 ) && ( rank == 0 )) { FAIL( "Possible race conditions will occur: -B not compatible with -N"); } /* check for collective_creates incompatibilities */ if (o.shared_file && o.collective_creates && rank == 0) { FAIL("-c not compatible with -S"); } if (o.path_count > 1 && o.collective_creates && rank == 0) { FAIL("-c not compatible with multiple test directories"); } if (o.collective_creates && !o.barriers) { FAIL("-c not compatible with -B"); } /* check for shared file incompatibilities */ if (o.unique_dir_per_task && o.shared_file && rank == 0) { FAIL("-u not compatible with -S"); } /* check multiple directory paths and strided option */ if (o.path_count > 1 && o.nstride > 0) { FAIL("cannot have multiple directory paths with -N strides between neighbor tasks"); } /* check for shared directory and multiple directories incompatibility */ if (o.path_count > 1 && o.unique_dir_per_task != 1) { FAIL("shared directory mode is not compatible with multiple directory paths"); } /* check if more directory paths than ranks */ if (o.path_count > o.size) { FAIL("cannot have more directory paths than MPI tasks"); } /* check depth */ if (o.depth < 0) { FAIL("depth must be greater than or equal to zero"); } /* check branch_factor */ if (o.branch_factor < 1 && o.depth > 0) { FAIL("branch factor must be greater than or equal to zero"); } /* check for valid number of items */ if ((o.items > 0) && (o.items_per_dir > 0)) { if(o.unique_dir_per_task){ FAIL("only specify the number of items or the number of items per directory"); }else if( o.items % o.items_per_dir != 0){ FAIL("items must be a multiple of items per directory"); } } /* check for using mknod */ if (o.write_bytes > 0 && o.make_node) { FAIL("-k not compatible with -w"); } if(o.verify_read && ! o.read_only) FAIL("Verify read requires that the read test is used"); if(o.verify_read && o.read_bytes <= 0) FAIL("Verify read requires that read bytes is > 0"); if(o.read_only && o.read_bytes <= 0) WARN("Read bytes is 0, thus, a read test will actually just open/close"); if(o.create_only && o.read_only && o.read_bytes > o.write_bytes) FAIL("When writing and reading files, read bytes must be smaller than write bytes"); if (rank == 0 && o.saveRankDetailsCSV){ // check that the file is writeable, truncate it and add header FILE* fd = fopen(o.saveRankDetailsCSV, "w"); if (fd == NULL){ FAIL("Cannot open saveRankPerformanceDetails file for write!"); } char * head = "rank,items"; int ret = fwrite(head, strlen(head), 1, fd); for(int e = 0; e < MDTEST_LAST_NUM; e++){ char buf[1024]; const char * str = mdtest_test_name(e); sprintf(buf, ",rate-%s,time-%s", str, str); ret = fwrite(buf, strlen(buf), 1, fd); if(ret != 1){ FAIL("Cannot write header to saveRankPerformanceDetails file"); } } fwrite("\n", 1, 1, fd); fclose(fd); } } void show_file_system_size(char *file_system) { char real_path[MAX_PATHLEN]; char file_system_unit_str[MAX_PATHLEN] = "GiB"; char inode_unit_str[MAX_PATHLEN] = "Mi"; int64_t file_system_unit_val = 1024 * 1024 * 1024; int64_t inode_unit_val = 1024 * 1024; int64_t total_file_system_size, free_file_system_size, total_inodes, free_inodes; double total_file_system_size_hr, used_file_system_percentage, used_inode_percentage; ior_aiori_statfs_t stat_buf; int ret; VERBOSE(1,-1,"Entering show_file_system_size on %s", file_system ); ret = o.backend->statfs (file_system, &stat_buf, o.backend_options); if (0 != ret) { FAIL("unable to stat file system %s", file_system); } total_file_system_size = stat_buf.f_blocks * stat_buf.f_bsize; free_file_system_size = stat_buf.f_bfree * stat_buf.f_bsize; used_file_system_percentage = (1 - ((double)free_file_system_size / (double)total_file_system_size)) * 100; total_file_system_size_hr = (double)total_file_system_size / (double)file_system_unit_val; if (total_file_system_size_hr > 1024) { total_file_system_size_hr = total_file_system_size_hr / 1024; strcpy(file_system_unit_str, "TiB"); } /* inodes */ total_inodes = stat_buf.f_files; free_inodes = stat_buf.f_ffree; used_inode_percentage = (1 - ((double)free_inodes/(double)total_inodes)) * 100; if (realpath(file_system, real_path) == NULL) { WARN("unable to use realpath() on file system"); } /* show results */ VERBOSE(0,-1,"Path: %s", real_path); VERBOSE(0,-1,"FS: %.1f %s Used FS: %2.1f%% Inodes: %.1f %s Used Inodes: %2.1f%%\n", total_file_system_size_hr, file_system_unit_str, used_file_system_percentage, (double)total_inodes / (double)inode_unit_val, inode_unit_str, used_inode_percentage); return; } void create_remove_directory_tree(int create, int currDepth, char* path, int dirNum, rank_progress_t * progress) { unsigned i; char dir[MAX_PATHLEN]; VERBOSE(1,5,"Entering create_remove_directory_tree on %s, currDepth = %d...", path, currDepth ); if (currDepth == 0) { sprintf(dir, "%s/%s.%d/", path, o.base_tree_name, dirNum); if (create) { VERBOSE(2,5,"Making directory '%s'", dir); if (-1 == o.backend->mkdir (dir, DIRMODE, o.backend_options)) { EWARNF("unable to create tree directory '%s'", dir); } #ifdef HAVE_LUSTRE_LUSTREAPI /* internal node for branching, can be non-striped for children */ if (o.global_dir_layout && \ llapi_dir_set_default_lmv_stripe(dir, -1, 0, LMV_HASH_TYPE_FNV_1A_64, NULL) == -1) { FAIL("Unable to reset to global default directory layout"); } #endif /* HAVE_LUSTRE_LUSTREAPI */ } create_remove_directory_tree(create, ++currDepth, dir, ++dirNum, progress); if (!create) { VERBOSE(2,5,"Remove directory '%s'", dir); if (-1 == o.backend->rmdir(dir, o.backend_options)) { EWARNF("Unable to remove directory %s", dir); } } } else if (currDepth <= o.depth) { char temp_path[MAX_PATHLEN]; strcpy(temp_path, path); int currDir = dirNum; for (i=0; i < o.branch_factor; i++) { sprintf(dir, "%s.%d/", o.base_tree_name, currDir); strcat(temp_path, dir); if (create) { VERBOSE(2,5,"Making directory '%s'", temp_path); if (-1 == o.backend->mkdir(temp_path, DIRMODE, o.backend_options)) { EWARNF("Unable to create directory %s", temp_path); } } create_remove_directory_tree(create, ++currDepth, temp_path, (o.branch_factor*currDir)+1, progress); currDepth--; if (!create) { VERBOSE(2,5,"Remove directory '%s'", temp_path); if (-1 == o.backend->rmdir(temp_path, o.backend_options)) { EWARNF("Unable to remove directory %s", temp_path); } } strcpy(temp_path, path); currDir++; } } } static void mdtest_iteration(int i, int j, mdtest_results_t * summary_table){ rank_progress_t progress_o; memset(& progress_o, 0 , sizeof(progress_o)); progress_o.stone_wall_timer_seconds = 0; progress_o.items_per_dir = o.items_per_dir; rank_progress_t * progress = & progress_o; /* start and end times of directory tree create/remove */ double startCreate, endCreate; int k; VERBOSE(1,-1,"main: * iteration %d *", j+1); if(o.create_only){ for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){ if (rank >= o.path_count) { continue; } prep_testdir(j, dir_iter); VERBOSE(2,5,"main (for j loop): making o.testdir, '%s'", o.testdir ); if (o.backend->access(o.testdir, F_OK, o.backend_options) != 0) { if (o.backend->mkdir(o.testdir, DIRMODE, o.backend_options) != 0) { EWARNF("Unable to create test directory %s", o.testdir); } #ifdef HAVE_LUSTRE_LUSTREAPI /* internal node for branching, can be non-striped for children */ if (o.global_dir_layout && o.unique_dir_per_task && llapi_dir_set_default_lmv_stripe(o.testdir, -1, 0, LMV_HASH_TYPE_FNV_1A_64, NULL) == -1) { EWARNF("Unable to reset to global default directory layout"); } #endif /* HAVE_LUSTRE_LUSTREAPI */ } } /* create hierarchical directory structure */ MPI_Barrier(testComm); startCreate = GetTimeStamp(); for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){ prep_testdir(j, dir_iter); if (o.unique_dir_per_task) { if (o.collective_creates && (rank == 0)) { /* * This is inside two loops, one of which already uses "i" and the other uses "j". * I don't know how this ever worked. I'm changing this loop to use "k". */ for (k=0; k < o.size; k++) { sprintf(o.base_tree_name, "mdtest_tree.%d", k); VERBOSE(3,5,"main (create hierarchical directory loop-collective): Calling create_remove_directory_tree with '%s'", o.testdir ); /* * Let's pass in the path to the directory we most recently made so that we can use * full paths in the other calls. */ create_remove_directory_tree(1, 0, o.testdir, 0, progress); if(CHECK_STONE_WALL(progress)){ o.size = k; break; } } } else if (! o.collective_creates) { VERBOSE(3,5,"main (create hierarchical directory loop-!collective_creates): Calling create_remove_directory_tree with '%s'", o.testdir ); /* * Let's pass in the path to the directory we most recently made so that we can use * full paths in the other calls. */ create_remove_directory_tree(1, 0, o.testdir, 0, progress); } } else { if (rank == 0) { VERBOSE(3,5,"main (create hierarchical directory loop-!unque_dir_per_task): Calling create_remove_directory_tree with '%s'", o.testdir ); /* * Let's pass in the path to the directory we most recently made so that we can use * full paths in the other calls. */ create_remove_directory_tree(1, 0 , o.testdir, 0, progress); } } } MPI_Barrier(testComm); endCreate = GetTimeStamp(); summary_table->rate[MDTEST_TREE_CREATE_NUM] = o.num_dirs_in_tree / (endCreate - startCreate); summary_table->time[MDTEST_TREE_CREATE_NUM] = (endCreate - startCreate); summary_table->items[MDTEST_TREE_CREATE_NUM] = o.num_dirs_in_tree; summary_table->stonewall_last_item[MDTEST_TREE_CREATE_NUM] = o.num_dirs_in_tree; VERBOSE(1,-1,"V-1: main: Tree creation : %14.3f sec, %14.3f ops/sec", (endCreate - startCreate), summary_table->rate[MDTEST_TREE_CREATE_NUM]); } sprintf(o.unique_mk_dir, "%s.0", o.base_tree_name); sprintf(o.unique_chdir_dir, "%s.0", o.base_tree_name); sprintf(o.unique_stat_dir, "%s.0", o.base_tree_name); sprintf(o.unique_read_dir, "%s.0", o.base_tree_name); sprintf(o.unique_rm_dir, "%s.0", o.base_tree_name); o.unique_rm_uni_dir[0] = 0; if (! o.unique_dir_per_task) { VERBOSE(3,-1,"V-3: main: Using unique_mk_dir, '%s'", o.unique_mk_dir ); } if (rank < i) { if (! o.shared_file) { sprintf(o.mk_name, "mdtest.%d.", (rank+(0*o.nstride))%i); sprintf(o.stat_name, "mdtest.%d.", (rank+(1*o.nstride))%i); sprintf(o.read_name, "mdtest.%d.", (rank+(2*o.nstride))%i); sprintf(o.rm_name, "mdtest.%d.", (rank+(3*o.nstride))%i); } if (o.unique_dir_per_task) { VERBOSE(3,5,"i %d nstride %d", i, o.nstride); sprintf(o.unique_mk_dir, "mdtest_tree.%d.0", (rank+(0*o.nstride))%i); sprintf(o.unique_chdir_dir, "mdtest_tree.%d.0", (rank+(1*o.nstride))%i); sprintf(o.unique_stat_dir, "mdtest_tree.%d.0", (rank+(2*o.nstride))%i); sprintf(o.unique_read_dir, "mdtest_tree.%d.0", (rank+(3*o.nstride))%i); sprintf(o.unique_rm_dir, "mdtest_tree.%d.0", (rank+(4*o.nstride))%i); o.unique_rm_uni_dir[0] = 0; VERBOSE(5,5,"mk_dir %s chdir %s stat_dir %s read_dir %s rm_dir %s\n", o.unique_mk_dir, o.unique_chdir_dir, o.unique_stat_dir, o.unique_read_dir, o.unique_rm_dir); } VERBOSE(3,-1,"V-3: main: Copied unique_mk_dir, '%s', to topdir", o.unique_mk_dir ); if (o.dirs_only && ! o.shared_file) { if (o.pre_delay) { DelaySecs(o.pre_delay); } directory_test(j, i, o.unique_mk_dir, progress); } if (o.files_only) { if (o.pre_delay) { DelaySecs(o.pre_delay); } VERBOSE(3,5,"will file_test on %s", o.unique_mk_dir); file_test(j, i, o.unique_mk_dir, progress); } } /* remove directory structure */ if (! o.unique_dir_per_task) { VERBOSE(3,-1,"main: Using o.testdir, '%s'", o.testdir ); } MPI_Barrier(testComm); if (o.remove_only) { progress->items_start = 0; startCreate = GetTimeStamp(); for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){ prep_testdir(j, dir_iter); if (o.unique_dir_per_task) { if (o.collective_creates && (rank == 0)) { /* * This is inside two loops, one of which already uses "i" and the other uses "j". * I don't know how this ever worked. I'm changing this loop to use "k". */ for (k=0; k < o.size; k++) { sprintf(o.base_tree_name, "mdtest_tree.%d", k); VERBOSE(3,-1,"main (remove hierarchical directory loop-collective): Calling create_remove_directory_tree with '%s'", o.testdir ); /* * Let's pass in the path to the directory we most recently made so that we can use * full paths in the other calls. */ create_remove_directory_tree(0, 0, o.testdir, 0, progress); if(CHECK_STONE_WALL(progress)){ o.size = k; break; } } } else if (! o.collective_creates) { VERBOSE(3,-1,"main (remove hierarchical directory loop-!collective): Calling create_remove_directory_tree with '%s'", o.testdir ); /* * Let's pass in the path to the directory we most recently made so that we can use * full paths in the other calls. */ create_remove_directory_tree(0, 0, o.testdir, 0, progress); } } else { if (rank == 0) { VERBOSE(3,-1,"V-3: main (remove hierarchical directory loop-!unique_dir_per_task): Calling create_remove_directory_tree with '%s'", o.testdir ); /* * Let's pass in the path to the directory we most recently made so that we can use * full paths in the other calls. */ create_remove_directory_tree(0, 0 , o.testdir, 0, progress); } } } MPI_Barrier(testComm); endCreate = GetTimeStamp(); summary_table->rate[MDTEST_TREE_REMOVE_NUM] = o.num_dirs_in_tree / (endCreate - startCreate); summary_table->time[MDTEST_TREE_REMOVE_NUM] = endCreate - startCreate; summary_table->items[MDTEST_TREE_REMOVE_NUM] = o.num_dirs_in_tree; summary_table->stonewall_last_item[MDTEST_TREE_REMOVE_NUM] = o.num_dirs_in_tree; VERBOSE(1,-1,"main Tree removal : %14.3f sec, %14.3f ops/sec", (endCreate - startCreate), summary_table->rate[MDTEST_TREE_REMOVE_NUM]); VERBOSE(2,-1,"main (at end of for j loop): Removing o.testdir of '%s'\n", o.testdir ); for (int dir_iter = 0; dir_iter < o.directory_loops; dir_iter ++){ prep_testdir(j, dir_iter); if ((rank < o.path_count) && o.backend->access(o.testdir, F_OK, o.backend_options) == 0) { //if (( rank == 0 ) && access(o.testdir, F_OK) == 0) { if (o.backend->rmdir(o.testdir, o.backend_options) == -1) { EWARNF("unable to remove directory %s", o.testdir); } } } } else { summary_table->rate[MDTEST_TREE_REMOVE_NUM] = 0; } } void mdtest_init_args(){ o = (mdtest_options_t) { .barriers = 1, .branch_factor = 1, .random_buffer_offset = -1, .prologue = "", .epilogue = "", }; } mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE * world_out) { testComm = world_com; out_logfile = world_out; out_resultfile = world_out; init_clock(world_com); mdtest_init_args(); int i, j; int numNodes; int numTasksOnNode0 = 0; MPI_Group worldgroup; struct { int first; int last; int stride; } range = {0, 0, 1}; int first = 1; int last = 0; int stride = 1; int iterations = 1; int created_root_dir = 0; // was the root directory existing or newly created verbose = 0; int no_barriers = 0; char * path = "./out"; int randomize = 0; char APIs[1024]; char APIs_legacy[1024]; aiori_supported_apis(APIs, APIs_legacy, MDTEST); char apiStr[1024]; sprintf(apiStr, "API for I/O [%s]", APIs); memset(& o.hints, 0, sizeof(o.hints)); char * packetType = "t"; option_help options [] = { {'a', NULL, apiStr, OPTION_OPTIONAL_ARGUMENT, 's', & o.api}, {'b', NULL, "branching factor of hierarchical directory structure", OPTION_OPTIONAL_ARGUMENT, 'd', & o.branch_factor}, {'d', NULL, "the directory in which the tests will run", OPTION_OPTIONAL_ARGUMENT, 's', & path}, {'B', NULL, "no barriers between phases", OPTION_OPTIONAL_ARGUMENT, 'd', & no_barriers}, {'C', NULL, "only create files/dirs", OPTION_FLAG, 'd', & o.create_only}, {'T', NULL, "only stat files/dirs", OPTION_FLAG, 'd', & o.stat_only}, {'E', NULL, "only read files/dir", OPTION_FLAG, 'd', & o.read_only}, {'r', NULL, "only remove files or directories left behind by previous runs", OPTION_FLAG, 'd', & o.remove_only}, {'D', NULL, "perform test on directories only (no files)", OPTION_FLAG, 'd', & o.dirs_only}, {'e', NULL, "bytes to read from each file", OPTION_OPTIONAL_ARGUMENT, 'l', & o.read_bytes}, {'f', NULL, "first number of tasks on which the test will run", OPTION_OPTIONAL_ARGUMENT, 'd', & first}, {'F', NULL, "perform test on files only (no directories)", OPTION_FLAG, 'd', & o.files_only}, #ifdef HAVE_LUSTRE_LUSTREAPI {'g', NULL, "global default directory layout for test subdirectories (deletes inherited striping layout)", OPTION_FLAG, 'd', & o.global_dir_layout}, #endif /* HAVE_LUSTRE_LUSTREAPI */ {'G', NULL, "Offset for the data in the read/write buffer, if not set, a random value is used", OPTION_OPTIONAL_ARGUMENT, 'd', & o.random_buffer_offset}, {'i', NULL, "number of iterations the test will run", OPTION_OPTIONAL_ARGUMENT, 'd', & iterations}, {'I', NULL, "number of items per directory in tree", OPTION_OPTIONAL_ARGUMENT, 'l', & o.items_per_dir}, {'k', NULL, "use mknod to create file", OPTION_FLAG, 'd', & o.make_node}, {'l', NULL, "last number of tasks on which the test will run", OPTION_OPTIONAL_ARGUMENT, 'd', & last}, {'L', NULL, "files only at leaf level of tree", OPTION_FLAG, 'd', & o.leaf_only}, {'n', NULL, "every process will creat/stat/read/remove # directories and files", OPTION_OPTIONAL_ARGUMENT, 'l', & o.items}, {'N', NULL, "stride # between tasks for file/dir operation (local=0; set to 1 to avoid client cache)", OPTION_OPTIONAL_ARGUMENT, 'd', & o.nstride}, {'p', NULL, "pre-iteration delay (in seconds)", OPTION_OPTIONAL_ARGUMENT, 'd', & o.pre_delay}, {'P', NULL, "print rate AND time", OPTION_FLAG, 'd', & o.print_rate_and_time}, {0, "print-all-procs", "all processes print an excerpt of their results", OPTION_FLAG, 'd', & o.print_all_proc}, {'R', NULL, "random access to files (only for stat)", OPTION_FLAG, 'd', & randomize}, {0, "random-seed", "random seed for -R", OPTION_OPTIONAL_ARGUMENT, 'd', & o.random_seed}, {'s', NULL, "stride between the number of tasks for each test", OPTION_OPTIONAL_ARGUMENT, 'd', & stride}, {'S', NULL, "shared file access (file only, no directories)", OPTION_FLAG, 'd', & o.shared_file}, {'c', NULL, "collective creates: task 0 does all creates", OPTION_FLAG, 'd', & o.collective_creates}, {'t', NULL, "time unique working directory overhead", OPTION_FLAG, 'd', & o.time_unique_dir_overhead}, {'u', NULL, "unique working directory for each task", OPTION_FLAG, 'd', & o.unique_dir_per_task}, {'v', NULL, "verbosity (each instance of option increments by one)", OPTION_FLAG, 'd', & verbose}, {'V', NULL, "verbosity value", OPTION_OPTIONAL_ARGUMENT, 'd', & verbose}, {'w', NULL, "bytes to write to each file after it is created", OPTION_OPTIONAL_ARGUMENT, 'l', & o.write_bytes}, {'W', NULL, "number in seconds; stonewall timer, write as many seconds and ensure all processes did the same number of operations (currently only stops during create phase and files)", OPTION_OPTIONAL_ARGUMENT, 'd', & o.stone_wall_timer_seconds}, {'x', NULL, "StoneWallingStatusFile; contains the number of iterations of the creation phase, can be used to split phases across runs", OPTION_OPTIONAL_ARGUMENT, 's', & o.stoneWallingStatusFile}, {'X', "verify-read", "Verify the data read", OPTION_FLAG, 'd', & o.verify_read}, {0, "verify-write", "Verify the data after a write by reading it back immediately", OPTION_FLAG, 'd', & o.verify_write}, {'y', NULL, "sync file after writing", OPTION_FLAG, 'd', & o.sync_file}, {'Y', NULL, "call the sync command after each phase (included in the timing; note it causes all IO to be flushed from your node)", OPTION_FLAG, 'd', & o.call_sync}, {'z', NULL, "depth of hierarchical directory structure", OPTION_OPTIONAL_ARGUMENT, 'd', & o.depth}, {'Z', NULL, "print time instead of rate", OPTION_FLAG, 'd', & o.print_time}, {0, "run-cmd-before-phase", "call this external command before each phase (excluded from the timing)", OPTION_OPTIONAL_ARGUMENT, 's', & o.prologue}, {0, "run-cmd-after-phase", "call this external command after each phase (included in the timing)", OPTION_OPTIONAL_ARGUMENT, 's', & o.epilogue}, {0, "dataPacketType", "type of packet that will be created [offset|incompressible|timestamp|o|i|t]", OPTION_OPTIONAL_ARGUMENT, 's', & packetType}, {0, "allocateBufferOnGPU", "Allocate the buffer on the GPU.", OPTION_FLAG, 'd', & o.gpu_memory_flags}, {0, "warningAsErrors", "Any warning should lead to an error.", OPTION_FLAG, 'd', & aiori_warning_as_errors}, {0, "saveRankPerformanceDetails", "Save the individual rank information into this CSV file.", OPTION_OPTIONAL_ARGUMENT, 's', & o.saveRankDetailsCSV}, {0, "showRankStatistics", "Include statistics per rank", OPTION_FLAG, 'd', & o.show_perrank_statistics}, LAST_OPTION }; options_all_t * global_options = airoi_create_all_module_options(options); option_parse(argc, argv, global_options); o.backend = aiori_select(o.api); if (o.backend == NULL) ERR("Unrecognized I/O API"); if (! o.backend->enable_mdtest) ERR("Backend doesn't support MDTest"); o.backend_options = airoi_update_module_options(o.backend, global_options); free(global_options->modules); free(global_options); o.dataPacketType = parsePacketType(packetType[0]); MPI_Comm_rank(testComm, &rank); MPI_Comm_size(testComm, &o.size); if(o.backend->xfer_hints){ o.backend->xfer_hints(& o.hints); } if(o.backend->check_params){ o.backend->check_params(o.backend_options); } if (o.backend->initialize){ o.backend->initialize(o.backend_options); } o.pid = getpid(); o.uid = getuid(); numNodes = GetNumNodes(testComm); numTasksOnNode0 = GetNumTasksOnNode0(testComm); char cmd_buffer[4096]; strncpy(cmd_buffer, argv[0], 4096); for (i = 1; i < argc; i++) { snprintf(&cmd_buffer[strlen(cmd_buffer)], 4096-strlen(cmd_buffer), " '%s'", argv[i]); } VERBOSE(0,-1,"-- started at %s --\n", PrintTimestamp()); VERBOSE(0,-1,"mdtest-%s was launched with %d total task(s) on %d node(s)", RELEASE_VERS, o.size, numNodes); VERBOSE(0,-1,"Command line used: %s", cmd_buffer); /* adjust special variables */ o.barriers = ! no_barriers; if (path != NULL){ parse_dirpath(path); } if( randomize > 0 ){ if (o.random_seed == 0) { /* Ensure all procs have the same random number */ o.random_seed = time(NULL); MPI_Barrier(testComm); MPI_Bcast(& o.random_seed, 1, MPI_INT, 0, testComm); } o.random_seed += rank; } if( o.random_buffer_offset == -1 ){ o.random_buffer_offset = time(NULL); MPI_Bcast(& o.random_buffer_offset, 1, MPI_INT, 0, testComm); } if ((o.items > 0) && (o.items_per_dir > 0) && (! o.unique_dir_per_task)) { o.directory_loops = o.items / o.items_per_dir; }else{ o.directory_loops = 1; } md_validate_tests(); // option_print_current(options); VERBOSE(1,-1, "api : %s", o.api); VERBOSE(1,-1, "barriers : %s", ( o.barriers ? "True" : "False" )); VERBOSE(1,-1, "collective_creates : %s", ( o.collective_creates ? "True" : "False" )); VERBOSE(1,-1, "create_only : %s", ( o.create_only ? "True" : "False" )); VERBOSE(1,-1, "dirpath(s):" ); for ( i = 0; i < o.path_count; i++ ) { VERBOSE(1,-1, "\t%s", o.filenames[i] ); } VERBOSE(1,-1, "dirs_only : %s", ( o.dirs_only ? "True" : "False" )); VERBOSE(1,-1, "read_bytes : "LLU"", o.read_bytes ); VERBOSE(1,-1, "read_only : %s", ( o.read_only ? "True" : "False" )); VERBOSE(1,-1, "first : %d", first ); VERBOSE(1,-1, "files_only : %s", ( o.files_only ? "True" : "False" )); #ifdef HAVE_LUSTRE_LUSTREAPI VERBOSE(1,-1, "global_dir_layout : %s", ( o.global_dir_layout ? "True" : "False" )); #endif /* HAVE_LUSTRE_LUSTREAPI */ VERBOSE(1,-1, "iterations : %d", iterations ); VERBOSE(1,-1, "items_per_dir : "LLU"", o.items_per_dir ); VERBOSE(1,-1, "last : %d", last ); VERBOSE(1,-1, "leaf_only : %s", ( o.leaf_only ? "True" : "False" )); VERBOSE(1,-1, "items : "LLU"", o.items ); VERBOSE(1,-1, "nstride : %d", o.nstride ); VERBOSE(1,-1, "pre_delay : %d", o.pre_delay ); VERBOSE(1,-1, "remove_only : %s", ( o.leaf_only ? "True" : "False" )); VERBOSE(1,-1, "random_seed : %d", o.random_seed ); VERBOSE(1,-1, "stride : %d", stride ); VERBOSE(1,-1, "shared_file : %s", ( o.shared_file ? "True" : "False" )); VERBOSE(1,-1, "time_unique_dir_overhead: %s", ( o.time_unique_dir_overhead ? "True" : "False" )); VERBOSE(1,-1, "stone_wall_timer_seconds: %d", o.stone_wall_timer_seconds); VERBOSE(1,-1, "stat_only : %s", ( o.stat_only ? "True" : "False" )); VERBOSE(1,-1, "unique_dir_per_task : %s", ( o.unique_dir_per_task ? "True" : "False" )); VERBOSE(1,-1, "write_bytes : "LLU"", o.write_bytes ); VERBOSE(1,-1, "sync_file : %s", ( o.sync_file ? "True" : "False" )); VERBOSE(1,-1, "call_sync : %s", ( o.call_sync ? "True" : "False" )); VERBOSE(1,-1, "depth : %d", o.depth ); VERBOSE(1,-1, "make_node : %d", o.make_node ); /* setup total number of items and number of items per dir */ if (o.depth <= 0) { o.num_dirs_in_tree = 1; } else { if (o.branch_factor < 1) { o.num_dirs_in_tree = 1; } else if (o.branch_factor == 1) { o.num_dirs_in_tree = o.depth + 1; } else { o.num_dirs_in_tree = (pow(o.branch_factor, o.depth+1) - 1) / (o.branch_factor - 1); } } if (o.items_per_dir > 0) { if(o.items == 0){ if (o.leaf_only) { o.items = o.items_per_dir * (uint64_t) pow(o.branch_factor, o.depth); } else { o.items = o.items_per_dir * o.num_dirs_in_tree; } }else{ o.num_dirs_in_tree_calc = o.num_dirs_in_tree; } } else { if (o.leaf_only) { if (o.branch_factor <= 1) { o.items_per_dir = o.items; } else { o.items_per_dir = (uint64_t) (o.items / pow(o.branch_factor, o.depth)); o.items = o.items_per_dir * (uint64_t) pow(o.branch_factor, o.depth); } } else { o.items_per_dir = o.items / o.num_dirs_in_tree; o.items = o.items_per_dir * o.num_dirs_in_tree; } } /* initialize rand_array */ if (o.random_seed > 0) { srand(o.random_seed); uint64_t s; o.rand_array = (uint64_t *) safeMalloc( o.items * sizeof(*o.rand_array)); for (s=0; s < o.items; s++) { o.rand_array[s] = s; } /* shuffle list randomly */ uint64_t n = o.items; while (n>1) { n--; /* * Generate a random number in the range 0 .. n * * rand() returns a number from 0 .. RAND_MAX. Divide that * by RAND_MAX and you get a floating point number in the * range 0 .. 1. Multiply that by n and you get a number in * the range 0 .. n. */ uint64_t k = ( uint64_t ) ((( double )rand() / ( double )RAND_MAX ) * ( double )n ); /* * Now move the nth element to the kth (randomly chosen) * element, and the kth element to the nth element. */ uint64_t tmp = o.rand_array[k]; o.rand_array[k] = o.rand_array[n]; o.rand_array[n] = tmp; } } /* allocate and initialize write buffer with # */ if (o.write_bytes > 0) { o.write_buffer = aligned_buffer_alloc(o.write_bytes, o.gpu_memory_flags); generate_memory_pattern(o.write_buffer, o.write_bytes, o.random_buffer_offset, rank, o.dataPacketType); } /* setup directory path to work in */ if (o.path_count == 0) { /* special case where no directory path provided with '-d' option */ char *ret = getcwd(o.testdirpath, MAX_PATHLEN); if (ret == NULL) { FAIL("Unable to get current working directory on %s", o.testdirpath); } o.path_count = 1; } else { strcpy(o.testdirpath, o.filenames[rank % o.path_count]); } /* if directory does not exist, create it */ if ((rank < o.path_count) && o.backend->access(o.testdirpath, F_OK, o.backend_options) != 0) { if (o.backend->mkdir(o.testdirpath, DIRMODE, o.backend_options) != 0) { EWARNF("Unable to create test directory path %s", o.testdirpath); } created_root_dir = 1; } /* display disk usage */ VERBOSE(3,-1,"main (before display_freespace): o.testdirpath is '%s'", o.testdirpath ); if (rank == 0) ShowFileSystemSize(o.testdirpath, o.backend, o.backend_options); int tasksBlockMapping = QueryNodeMapping(testComm, true); /* set the shift to mimic IOR and shift by procs per node */ if (o.nstride > 0) { if ( numNodes > 1 && tasksBlockMapping ) { /* the user set the stride presumably to get the consumer tasks on a different node than the producer tasks however, if the mpirun scheduler placed the tasks by-slot (in a contiguous block) then we need to adjust the shift by ppn */ o.nstride *= numTasksOnNode0; } VERBOSE(0,5,"Shifting ranks by %d for each phase.", o.nstride); } VERBOSE(3,-1,"main (after display_freespace): o.testdirpath is '%s'", o.testdirpath ); if (rank == 0) { if (o.random_seed > 0) { VERBOSE(0,-1,"random seed: %d", o.random_seed); } } if (gethostname(o.hostname, MAX_PATHLEN) == -1) { perror("gethostname"); MPI_Abort(testComm, 2); } if (last == 0) { first = o.size; last = o.size; } if(first > last){ FAIL("process number: first > last doesn't make sense"); } if(last > o.size){ FAIL("process number: last > number of processes doesn't make sense"); } /* setup summary table for recording results */ o.summary_table = (mdtest_results_t *) safeMalloc(iterations * sizeof(mdtest_results_t)); memset(o.summary_table, 0, iterations * sizeof(mdtest_results_t)); if (o.unique_dir_per_task) { sprintf(o.base_tree_name, "mdtest_tree.%d", rank); } else { sprintf(o.base_tree_name, "mdtest_tree"); } mdtest_results_t * aggregated_results = safeMalloc(iterations * sizeof(mdtest_results_t)); /* default use shared directory */ strcpy(o.mk_name, "mdtest.shared."); strcpy(o.stat_name, "mdtest.shared."); strcpy(o.read_name, "mdtest.shared."); strcpy(o.rm_name, "mdtest.shared."); MPI_Comm_group(testComm, &worldgroup); last = o.size < last ? o.size : last; /* Run the tests */ for (i = first; i <= last; i += stride) { sleep(1); if(i < last){ MPI_Group testgroup; range.last = i - 1; MPI_Group_range_incl(worldgroup, 1, (void *)&range, &testgroup); MPI_Comm_create(world_com, testgroup, &testComm); MPI_Group_free(&testgroup); if(testComm == MPI_COMM_NULL){ continue; } }else{ MPI_Comm_dup(world_com, & testComm); } MPI_Comm_size(testComm, &o.size); if (rank == 0) { uint64_t items_all = i * o.items; if(o.num_dirs_in_tree_calc){ items_all *= o.num_dirs_in_tree_calc; } if (o.files_only && o.dirs_only) { VERBOSE(0,-1,"%d tasks, "LLU" files/directories", i, items_all); } else if (o.files_only) { if (! o.shared_file) { VERBOSE(0,-1,"%d tasks, "LLU" files", i, items_all); } else { VERBOSE(0,-1,"%d tasks, 1 file", i); } } else if (o.dirs_only) { VERBOSE(0,-1,"%d tasks, "LLU" directories", i, items_all); } } VERBOSE(1,-1,""); VERBOSE(1,-1," Operation Duration Rate"); VERBOSE(1,-1," --------- -------- ----"); for (j = 0; j < iterations; j++) { // keep track of the current status for stonewalling mdtest_iteration(i, j, & o.summary_table[j]); } summarize_results(iterations, aggregated_results); if(o.saveRankDetailsCSV){ StoreRankInformation(iterations, aggregated_results); } int total_errors = 0; MPI_Reduce(& o.verification_error, & total_errors, 1, MPI_INT, MPI_SUM, 0, testComm); if(rank == 0 && total_errors){ VERBOSE(0, -1, "\nERROR: verifying the data on read (%lld errors)! Take the performance values with care!\n", total_errors); } MPI_Comm_free(&testComm); } MPI_Group_free(&worldgroup); testComm = world_com; if (created_root_dir && o.remove_only && o.backend->rmdir(o.testdirpath, o.backend_options) != 0) { FAIL("Unable to remove test directory path %s", o.testdirpath); } VERBOSE(0,-1,"-- finished at %s --\n", PrintTimestamp()); if (o.random_seed > 0) { free(o.rand_array); } if (o.backend->finalize){ o.backend->finalize(o.backend_options); } if (o.write_bytes > 0) { aligned_buffer_free(o.write_buffer, o.gpu_memory_flags); } free(o.summary_table); return aggregated_results; }