diff --git a/src/mdtest.c b/src/mdtest.c index ac9bdc4..cfd221a 100644 --- a/src/mdtest.c +++ b/src/mdtest.c @@ -163,6 +163,7 @@ typedef struct { #ifdef HAVE_LUSTRE_LUSTREAPI int global_dir_layout; #endif /* HAVE_LUSTRE_LUSTREAPI */ + char * saveRankDetailsCSV; /* save the details about the performance to a file */ mdtest_results_t * summary_table; pid_t pid; @@ -897,9 +898,20 @@ void rename_dir_test(const int dirs, const long dir_iter, const char *path, rank } } +static void updateResult(mdtest_results_t * res, mdtest_test_num_t test, uint64_t item_count, int t, double * times, double * tBefore){ + res->rate[test] = item_count/(times[t] - times[t-1]); + res->time[test] = times[t] - times[t-1]; + if(tBefore){ + res->time_before_barrier[test] = tBefore[t] - times[t-1]; + } + res->items[test] = item_count; + res->stonewall_last_item[test] = o.items; +} + void directory_test(const int iteration, const int ntasks, const char *path, rank_progress_t * progress) { int size; double t[6] = {0}; + double tBefore[6] = {0}; char temp_path[MAX_PATHLEN]; mdtest_results_t * res = & o.summary_table[iteration]; @@ -907,6 +919,7 @@ void directory_test(const int iteration, const int ntasks, const char *path, ran VERBOSE(1,-1,"Entering directory_test on %s", path ); + tBefore[0] = GetTimeStamp(); MPI_Barrier(testComm); t[0] = GetTimeStamp(); @@ -941,6 +954,7 @@ void directory_test(const int iteration, const int ntasks, const char *path, ran progress->stone_wall_timer_seconds = 0; } + tBefore[1] = GetTimeStamp(); phase_end(); t[1] = GetTimeStamp(); @@ -967,6 +981,7 @@ void directory_test(const int iteration, const int ntasks, const char *path, ran } } } + tBefore[2] = GetTimeStamp(); phase_end(); t[2] = GetTimeStamp(); @@ -993,6 +1008,7 @@ void directory_test(const int iteration, const int ntasks, const char *path, ran } } } + tBefore[3] = GetTimeStamp(); phase_end(); t[3] = GetTimeStamp(); @@ -1013,6 +1029,7 @@ void directory_test(const int iteration, const int ntasks, const char *path, ran rename_dir_test(1, dir_iter, temp_path, progress); } } + tBefore[4] = GetTimeStamp(); phase_end(); t[4] = GetTimeStamp(); @@ -1048,6 +1065,7 @@ void directory_test(const int iteration, const int ntasks, const char *path, ran } } + tBefore[5] = GetTimeStamp(); phase_end(); t[5] = GetTimeStamp(); @@ -1067,28 +1085,16 @@ void directory_test(const int iteration, const int ntasks, const char *path, ran /* calculate times */ if (o.create_only) { - res->rate[MDTEST_DIR_CREATE_NUM] = o.items*size/(t[1] - t[0]); - res->time[MDTEST_DIR_CREATE_NUM] = t[1] - t[0]; - res->items[MDTEST_DIR_CREATE_NUM] = o.items*size; - res->stonewall_last_item[MDTEST_DIR_CREATE_NUM] = o.items; + updateResult(res, MDTEST_DIR_CREATE_NUM, o.items*size, 1, t, tBefore); } if (o.stat_only) { - res->rate[MDTEST_DIR_STAT_NUM] = o.items*size/(t[2] - t[1]); - res->time[MDTEST_DIR_STAT_NUM] = t[2] - t[1]; - res->items[MDTEST_DIR_STAT_NUM] = o.items*size; - res->stonewall_last_item[MDTEST_DIR_STAT_NUM] = o.items; + updateResult(res, MDTEST_DIR_STAT_NUM, o.items*size, 2, t, tBefore); } if (o.read_only) { - res->rate[MDTEST_DIR_READ_NUM] = o.items*size/(t[3] - t[2]); - res->time[MDTEST_DIR_READ_NUM] = t[3] - t[2]; - res->items[MDTEST_DIR_READ_NUM] = o.items*size; - res->stonewall_last_item[MDTEST_DIR_READ_NUM] = o.items; + updateResult(res, MDTEST_DIR_READ_NUM, o.items*size, 3, t, tBefore); } if (o.remove_only) { - res->rate[MDTEST_DIR_REMOVE_NUM] = o.items*size/(t[5] - t[4]); - res->time[MDTEST_DIR_REMOVE_NUM] = t[5] - t[4]; - res->items[MDTEST_DIR_REMOVE_NUM] = o.items*size; - res->stonewall_last_item[MDTEST_DIR_REMOVE_NUM] = o.items; + updateResult(res, MDTEST_DIR_REMOVE_NUM, o.items*size, 5, t, tBefore); } VERBOSE(1,-1," Directory creation: %14.3f sec, %14.3f ops/sec", t[1] - t[0], o.summary_table[iteration].rate[0]); VERBOSE(1,-1," Directory stat : %14.3f sec, %14.3f ops/sec", t[2] - t[1], o.summary_table[iteration].rate[1]); @@ -1177,11 +1183,13 @@ void file_test_create(const int iteration, const int ntasks, const char *path, r void file_test(const int iteration, const int ntasks, const char *path, rank_progress_t * progress) { int size; double t[5] = {0}; + double tBefore[5] = {0}; char temp_path[MAX_PATHLEN]; MPI_Comm_size(testComm, &size); VERBOSE(3,5,"Entering file_test on %s", path); + tBefore[0] = GetTimeStamp(); MPI_Barrier(testComm); t[0] = GetTimeStamp(); @@ -1215,6 +1223,7 @@ void file_test(const int iteration, const int ntasks, const char *path, rank_pro } } + tBefore[1] = GetTimeStamp(); phase_end(); t[1] = GetTimeStamp(); @@ -1238,6 +1247,7 @@ void file_test(const int iteration, const int ntasks, const char *path, rank_pro } } + tBefore[2] = GetTimeStamp(); phase_end(); t[2] = GetTimeStamp(); @@ -1265,6 +1275,7 @@ void file_test(const int iteration, const int ntasks, const char *path, rank_pro } } + tBefore[3] = GetTimeStamp(); phase_end(); t[3] = GetTimeStamp(); @@ -1295,6 +1306,7 @@ void file_test(const int iteration, const int ntasks, const char *path, rank_pro } } + tBefore[4] = GetTimeStamp(); phase_end(); t[4] = GetTimeStamp(); if (o.remove_only) { @@ -1318,28 +1330,16 @@ void file_test(const int iteration, const int ntasks, const char *path, rank_pro mdtest_results_t * res = & o.summary_table[iteration]; /* calculate times */ if (o.create_only) { - res->rate[MDTEST_FILE_CREATE_NUM] = o.items*size/(t[1] - t[0]); - res->time[MDTEST_FILE_CREATE_NUM] = t[1] - t[0]; - res->items[MDTEST_FILE_CREATE_NUM] = o.items*o.size; - res->stonewall_last_item[MDTEST_FILE_CREATE_NUM] = o.items; + updateResult(res, MDTEST_FILE_CREATE_NUM, o.items*size, 1, t, tBefore); } if (o.stat_only) { - res->rate[MDTEST_FILE_STAT_NUM] = o.items*size/(t[2] - t[1]); - res->time[MDTEST_FILE_STAT_NUM] = t[2] - t[1]; - res->items[MDTEST_FILE_STAT_NUM] = o.items*o.size; - res->stonewall_last_item[MDTEST_FILE_STAT_NUM] = o.items; + updateResult(res, MDTEST_FILE_STAT_NUM, o.items*size, 2, t, tBefore); } if (o.read_only) { - res->rate[MDTEST_FILE_READ_NUM] = o.items*o.size/(t[3] - t[2]); - res->time[MDTEST_FILE_READ_NUM] = t[3] - t[2]; - res->items[MDTEST_FILE_READ_NUM] = o.items*o.size; - res->stonewall_last_item[MDTEST_FILE_READ_NUM] = o.items; + updateResult(res, MDTEST_FILE_READ_NUM, o.items*size, 3, t, tBefore); } if (o.remove_only) { - res->rate[MDTEST_FILE_REMOVE_NUM] = o.items*o.size/(t[4] - t[3]); - res->time[MDTEST_FILE_REMOVE_NUM] = t[4] - t[3]; - res->items[MDTEST_FILE_REMOVE_NUM] = o.items*o.size; - res->stonewall_last_item[MDTEST_FILE_REMOVE_NUM] = o.items; + updateResult(res, MDTEST_FILE_REMOVE_NUM, o.items*size, 4, t, tBefore); } VERBOSE(1,-1," File creation : %14.3f sec, %14.3f ops/sec", t[1] - t[0], o.summary_table[iteration].rate[4]); @@ -1353,17 +1353,17 @@ void file_test(const int iteration, const int ntasks, const char *path, rank_pro char const * mdtest_test_name(int i){ switch (i) { - case MDTEST_DIR_CREATE_NUM: return "Directory creation :"; - case MDTEST_DIR_STAT_NUM: return "Directory stat :"; - case MDTEST_DIR_READ_NUM: return NULL; - case MDTEST_DIR_REMOVE_NUM: return "Directory removal :"; - case MDTEST_DIR_RENAME_NUM: return "Directory rename :"; - case MDTEST_FILE_CREATE_NUM: return "File creation :"; - case MDTEST_FILE_STAT_NUM: return "File stat :"; - case MDTEST_FILE_READ_NUM: return "File read :"; - case MDTEST_FILE_REMOVE_NUM: return "File removal :"; - case MDTEST_TREE_CREATE_NUM: return "Tree creation :"; - case MDTEST_TREE_REMOVE_NUM: return "Tree removal :"; + case MDTEST_DIR_CREATE_NUM: return "Directory creation"; + case MDTEST_DIR_STAT_NUM: return "Directory stat"; + case MDTEST_DIR_READ_NUM: return "Directory read"; + case MDTEST_DIR_REMOVE_NUM: return "Directory removal"; + case MDTEST_DIR_RENAME_NUM: return "Directory rename"; + case MDTEST_FILE_CREATE_NUM: return "File creation"; + case MDTEST_FILE_STAT_NUM: return "File stat"; + case MDTEST_FILE_READ_NUM: return "File read"; + case MDTEST_FILE_REMOVE_NUM: return "File removal"; + case MDTEST_TREE_CREATE_NUM: return "Tree creation"; + case MDTEST_TREE_REMOVE_NUM: return "Tree removal"; default: return "ERR INVALID TESTNAME :"; } return NULL; @@ -1374,6 +1374,48 @@ int calc_allreduce_index(int iter, int rank, int op){ return iter * tableSize * o.size + rank * tableSize + op; } +/* + * Store the results of each process in a file + */ +static void StoreRankInformation(int iterations){ + const size_t size = sizeof(mdtest_results_t) * iterations; + if(rank == 0){ + FILE* fd = fopen(o.saveRankDetailsCSV, "a"); + if (fd == NULL){ + FAIL("Cannot open saveRankPerformanceDetails file for writes!"); + } + + mdtest_results_t * results = malloc(size * o.size); + MPI_Gather(o.summary_table, size / sizeof(double), MPI_DOUBLE, results, size / sizeof(double), MPI_DOUBLE, 0, testComm); + + for(int iter = 0; iter < iterations; iter++){ + for(int i=0; i < o.size; i++){ + mdtest_results_t * cur = & results[i * iterations + iter]; + char buff[4096]; + char * cpos = buff; + cpos += sprintf(cpos, "%d,%llu", i, (long long unsigned) o.items); + for(int e = 0; (e < MDTEST_TREE_CREATE_NUM) || (i == 0 && e < MDTEST_LAST_NUM); e++){ + if(cur->items[e] == 0){ + cpos += sprintf(cpos, ",,"); + }else{ + cpos += sprintf(cpos, ",%.10e,%.10e", cur->items[e] / cur->time_before_barrier[e], cur->time_before_barrier[e]); + } + } + cpos += sprintf(cpos, "\n"); + int ret = fwrite(buff, cpos - buff, 1, fd); + if(ret != 1){ + WARN("Couln't append to saveRankPerformanceDetailsCSV file\n"); + break; + } + } + } + fclose(fd); + }else{ + /* this is a hack for now assuming all datatypes in the structure are double */ + MPI_Gather(o.summary_table, size / sizeof(double), MPI_DOUBLE, NULL, size / sizeof(double), MPI_DOUBLE, 0, testComm); + } +} + void summarize_results(int iterations, int print_time) { char const * access; int i, j, k; @@ -1404,7 +1446,7 @@ void summarize_results(int iterations, int print_time) { continue; } curr = o.summary_table[j].rate[i]; - fprintf(out_logfile, "Rank %d Iter %d Test %s Rate: %e\n", rank, j, access, curr); + fprintf(out_logfile, "Rank %d Iter %d Test %-20s Rate: %e\n", rank, j, access, curr); } } } @@ -1452,8 +1494,8 @@ void summarize_results(int iterations, int print_time) { } VERBOSE(0,-1,"\nSUMMARY %s: (of %d iterations)", print_time ? "time": "rate", iterations); - VERBOSE(0,-1," Operation per Rank: Max Min Mean Std Dev per Iteration: Max Min Mean"); - VERBOSE(0,-1," --------- --- --- ---- ------- --- --- ----"); + VERBOSE(0,-1," Operation per Rank: Max Min Mean Std Dev per Iteration: Max Min Mean"); + VERBOSE(0,-1," --------- --- --- ---- ------- --- --- ----"); for (i = start; i < stop; i++) { min = max = all[i]; @@ -1500,7 +1542,7 @@ void summarize_results(int iterations, int print_time) { sd = sqrt(var); access = mdtest_test_name(i); if (i != 2) { - fprintf(out_logfile, " %s ", access); + fprintf(out_logfile, " %-22s ", access); fprintf(out_logfile, "%14.3f ", max); fprintf(out_logfile, "%14.3f ", min); fprintf(out_logfile, "%14.3f ", mean); @@ -1522,8 +1564,8 @@ void summarize_results(int iterations, int print_time) { } } if(stonewall_items != 0){ - fprintf(out_logfile, " File create (stonewall) : "); - fprintf(out_logfile, "%14s %14s %14.3f %14s\n", "NA", "NA", print_time ? stonewall_time : stonewall_items / stonewall_time, "NA"); + fprintf(out_logfile, " File create (stonewall) "); + fprintf(out_logfile, "%13s %14s %14.3f %14s\n", "NA", "NA", print_time ? stonewall_time : stonewall_items / stonewall_time, "NA"); } /* calculate tree create/remove rates, applies only to Rank 0 */ @@ -1565,7 +1607,7 @@ void summarize_results(int iterations, int print_time) { var = var / (iterations); sd = sqrt(var); access = mdtest_test_name(i); - fprintf(out_logfile, " %s ", access); + fprintf(out_logfile, " %-22s ", access); fprintf(out_logfile, "%14.3f ", max); fprintf(out_logfile, "%14.3f ", min); fprintf(out_logfile, "%14.3f ", mean); @@ -1673,6 +1715,28 @@ void md_validate_tests() { if(o.create_only && o.read_only && o.read_bytes > o.write_bytes) FAIL("When writing and reading files, read bytes must be smaller than write bytes"); + + if (rank == 0 && o.saveRankDetailsCSV){ + // check that the file is writeable, truncate it and add header + FILE* fd = fopen(o.saveRankDetailsCSV, "w"); + if (fd == NULL){ + FAIL("Cannot open saveRankPerformanceDetails file for write!"); + } + char * head = "rank,items"; + int ret = fwrite(head, strlen(head), 1, fd); + for(int e = 0; e < MDTEST_LAST_NUM; e++){ + char buf[1024]; + const char * str = mdtest_test_name(e); + + sprintf(buf, ",rate-%s,time-%s", str, str); + ret = fwrite(buf, strlen(buf), 1, fd); + if(ret != 1){ + FAIL("Cannot write header to saveRankPerformanceDetails file"); + } + } + fwrite("\n", 1, 1, fd); + fclose(fd); + } } void show_file_system_size(char *file_system) { @@ -2103,6 +2167,7 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE * {'z', NULL, "depth of hierarchical directory structure", OPTION_OPTIONAL_ARGUMENT, 'd', & o.depth}, {'Z', NULL, "print time instead of rate", OPTION_FLAG, 'd', & o.print_time}, {0, "warningAsErrors", "Any warning should lead to an error.", OPTION_FLAG, 'd', & aiori_warning_as_errors}, + {0, "saveRankPerformanceDetails", "Save the individual rank information into this CSV file.", OPTION_OPTIONAL_ARGUMENT, 's', & o.saveRankDetailsCSV}, LAST_OPTION }; options_all_t * global_options = airoi_create_all_module_options(options); @@ -2406,6 +2471,9 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE * }else{ summarize_results(iterations, o.print_time); } + if(o.saveRankDetailsCSV){ + StoreRankInformation(iterations); + } if (i == 1 && stride > 1) { i = 0; } diff --git a/src/mdtest.h b/src/mdtest.h index 32d37fe..dbafccb 100644 --- a/src/mdtest.h +++ b/src/mdtest.h @@ -24,6 +24,7 @@ typedef struct { double rate[MDTEST_LAST_NUM]; /* Calculated throughput */ double time[MDTEST_LAST_NUM]; /* Time */ + double time_before_barrier[MDTEST_TREE_CREATE_NUM]; /* individual time before executing the barrier */ uint64_t items[MDTEST_LAST_NUM]; /* Number of operations done */ /* Statistics when hitting the stonewall */