MDTest allow storing information per rank for later analysis (#335)

* MDTest allow storing information per rank for later analysis when using the --saveRankPerformanceDetails=<FILE> option
* MDTest: refactored calculation of results, added time_before_executing a barrier.
master
Julian Kunkel 2021-02-11 09:57:22 +00:00 committed by GitHub
parent b5963380ae
commit 106eebecb3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 119 additions and 50 deletions

View File

@ -163,6 +163,7 @@ typedef struct {
#ifdef HAVE_LUSTRE_LUSTREAPI
int global_dir_layout;
#endif /* HAVE_LUSTRE_LUSTREAPI */
char * saveRankDetailsCSV; /* save the details about the performance to a file */
mdtest_results_t * summary_table;
pid_t pid;
@ -897,9 +898,20 @@ void rename_dir_test(const int dirs, const long dir_iter, const char *path, rank
}
}
static void updateResult(mdtest_results_t * res, mdtest_test_num_t test, uint64_t item_count, int t, double * times, double * tBefore){
res->rate[test] = item_count/(times[t] - times[t-1]);
res->time[test] = times[t] - times[t-1];
if(tBefore){
res->time_before_barrier[test] = tBefore[t] - times[t-1];
}
res->items[test] = item_count;
res->stonewall_last_item[test] = o.items;
}
void directory_test(const int iteration, const int ntasks, const char *path, rank_progress_t * progress) {
int size;
double t[6] = {0};
double tBefore[6] = {0};
char temp_path[MAX_PATHLEN];
mdtest_results_t * res = & o.summary_table[iteration];
@ -907,6 +919,7 @@ void directory_test(const int iteration, const int ntasks, const char *path, ran
VERBOSE(1,-1,"Entering directory_test on %s", path );
tBefore[0] = GetTimeStamp();
MPI_Barrier(testComm);
t[0] = GetTimeStamp();
@ -941,6 +954,7 @@ void directory_test(const int iteration, const int ntasks, const char *path, ran
progress->stone_wall_timer_seconds = 0;
}
tBefore[1] = GetTimeStamp();
phase_end();
t[1] = GetTimeStamp();
@ -967,6 +981,7 @@ void directory_test(const int iteration, const int ntasks, const char *path, ran
}
}
}
tBefore[2] = GetTimeStamp();
phase_end();
t[2] = GetTimeStamp();
@ -993,6 +1008,7 @@ void directory_test(const int iteration, const int ntasks, const char *path, ran
}
}
}
tBefore[3] = GetTimeStamp();
phase_end();
t[3] = GetTimeStamp();
@ -1013,6 +1029,7 @@ void directory_test(const int iteration, const int ntasks, const char *path, ran
rename_dir_test(1, dir_iter, temp_path, progress);
}
}
tBefore[4] = GetTimeStamp();
phase_end();
t[4] = GetTimeStamp();
@ -1048,6 +1065,7 @@ void directory_test(const int iteration, const int ntasks, const char *path, ran
}
}
tBefore[5] = GetTimeStamp();
phase_end();
t[5] = GetTimeStamp();
@ -1067,28 +1085,16 @@ void directory_test(const int iteration, const int ntasks, const char *path, ran
/* calculate times */
if (o.create_only) {
res->rate[MDTEST_DIR_CREATE_NUM] = o.items*size/(t[1] - t[0]);
res->time[MDTEST_DIR_CREATE_NUM] = t[1] - t[0];
res->items[MDTEST_DIR_CREATE_NUM] = o.items*size;
res->stonewall_last_item[MDTEST_DIR_CREATE_NUM] = o.items;
updateResult(res, MDTEST_DIR_CREATE_NUM, o.items*size, 1, t, tBefore);
}
if (o.stat_only) {
res->rate[MDTEST_DIR_STAT_NUM] = o.items*size/(t[2] - t[1]);
res->time[MDTEST_DIR_STAT_NUM] = t[2] - t[1];
res->items[MDTEST_DIR_STAT_NUM] = o.items*size;
res->stonewall_last_item[MDTEST_DIR_STAT_NUM] = o.items;
updateResult(res, MDTEST_DIR_STAT_NUM, o.items*size, 2, t, tBefore);
}
if (o.read_only) {
res->rate[MDTEST_DIR_READ_NUM] = o.items*size/(t[3] - t[2]);
res->time[MDTEST_DIR_READ_NUM] = t[3] - t[2];
res->items[MDTEST_DIR_READ_NUM] = o.items*size;
res->stonewall_last_item[MDTEST_DIR_READ_NUM] = o.items;
updateResult(res, MDTEST_DIR_READ_NUM, o.items*size, 3, t, tBefore);
}
if (o.remove_only) {
res->rate[MDTEST_DIR_REMOVE_NUM] = o.items*size/(t[5] - t[4]);
res->time[MDTEST_DIR_REMOVE_NUM] = t[5] - t[4];
res->items[MDTEST_DIR_REMOVE_NUM] = o.items*size;
res->stonewall_last_item[MDTEST_DIR_REMOVE_NUM] = o.items;
updateResult(res, MDTEST_DIR_REMOVE_NUM, o.items*size, 5, t, tBefore);
}
VERBOSE(1,-1," Directory creation: %14.3f sec, %14.3f ops/sec", t[1] - t[0], o.summary_table[iteration].rate[0]);
VERBOSE(1,-1," Directory stat : %14.3f sec, %14.3f ops/sec", t[2] - t[1], o.summary_table[iteration].rate[1]);
@ -1177,11 +1183,13 @@ void file_test_create(const int iteration, const int ntasks, const char *path, r
void file_test(const int iteration, const int ntasks, const char *path, rank_progress_t * progress) {
int size;
double t[5] = {0};
double tBefore[5] = {0};
char temp_path[MAX_PATHLEN];
MPI_Comm_size(testComm, &size);
VERBOSE(3,5,"Entering file_test on %s", path);
tBefore[0] = GetTimeStamp();
MPI_Barrier(testComm);
t[0] = GetTimeStamp();
@ -1215,6 +1223,7 @@ void file_test(const int iteration, const int ntasks, const char *path, rank_pro
}
}
tBefore[1] = GetTimeStamp();
phase_end();
t[1] = GetTimeStamp();
@ -1238,6 +1247,7 @@ void file_test(const int iteration, const int ntasks, const char *path, rank_pro
}
}
tBefore[2] = GetTimeStamp();
phase_end();
t[2] = GetTimeStamp();
@ -1265,6 +1275,7 @@ void file_test(const int iteration, const int ntasks, const char *path, rank_pro
}
}
tBefore[3] = GetTimeStamp();
phase_end();
t[3] = GetTimeStamp();
@ -1295,6 +1306,7 @@ void file_test(const int iteration, const int ntasks, const char *path, rank_pro
}
}
tBefore[4] = GetTimeStamp();
phase_end();
t[4] = GetTimeStamp();
if (o.remove_only) {
@ -1318,28 +1330,16 @@ void file_test(const int iteration, const int ntasks, const char *path, rank_pro
mdtest_results_t * res = & o.summary_table[iteration];
/* calculate times */
if (o.create_only) {
res->rate[MDTEST_FILE_CREATE_NUM] = o.items*size/(t[1] - t[0]);
res->time[MDTEST_FILE_CREATE_NUM] = t[1] - t[0];
res->items[MDTEST_FILE_CREATE_NUM] = o.items*o.size;
res->stonewall_last_item[MDTEST_FILE_CREATE_NUM] = o.items;
updateResult(res, MDTEST_FILE_CREATE_NUM, o.items*size, 1, t, tBefore);
}
if (o.stat_only) {
res->rate[MDTEST_FILE_STAT_NUM] = o.items*size/(t[2] - t[1]);
res->time[MDTEST_FILE_STAT_NUM] = t[2] - t[1];
res->items[MDTEST_FILE_STAT_NUM] = o.items*o.size;
res->stonewall_last_item[MDTEST_FILE_STAT_NUM] = o.items;
updateResult(res, MDTEST_FILE_STAT_NUM, o.items*size, 2, t, tBefore);
}
if (o.read_only) {
res->rate[MDTEST_FILE_READ_NUM] = o.items*o.size/(t[3] - t[2]);
res->time[MDTEST_FILE_READ_NUM] = t[3] - t[2];
res->items[MDTEST_FILE_READ_NUM] = o.items*o.size;
res->stonewall_last_item[MDTEST_FILE_READ_NUM] = o.items;
updateResult(res, MDTEST_FILE_READ_NUM, o.items*size, 3, t, tBefore);
}
if (o.remove_only) {
res->rate[MDTEST_FILE_REMOVE_NUM] = o.items*o.size/(t[4] - t[3]);
res->time[MDTEST_FILE_REMOVE_NUM] = t[4] - t[3];
res->items[MDTEST_FILE_REMOVE_NUM] = o.items*o.size;
res->stonewall_last_item[MDTEST_FILE_REMOVE_NUM] = o.items;
updateResult(res, MDTEST_FILE_REMOVE_NUM, o.items*size, 4, t, tBefore);
}
VERBOSE(1,-1," File creation : %14.3f sec, %14.3f ops/sec", t[1] - t[0], o.summary_table[iteration].rate[4]);
@ -1353,17 +1353,17 @@ void file_test(const int iteration, const int ntasks, const char *path, rank_pro
char const * mdtest_test_name(int i){
switch (i) {
case MDTEST_DIR_CREATE_NUM: return "Directory creation :";
case MDTEST_DIR_STAT_NUM: return "Directory stat :";
case MDTEST_DIR_READ_NUM: return NULL;
case MDTEST_DIR_REMOVE_NUM: return "Directory removal :";
case MDTEST_DIR_RENAME_NUM: return "Directory rename :";
case MDTEST_FILE_CREATE_NUM: return "File creation :";
case MDTEST_FILE_STAT_NUM: return "File stat :";
case MDTEST_FILE_READ_NUM: return "File read :";
case MDTEST_FILE_REMOVE_NUM: return "File removal :";
case MDTEST_TREE_CREATE_NUM: return "Tree creation :";
case MDTEST_TREE_REMOVE_NUM: return "Tree removal :";
case MDTEST_DIR_CREATE_NUM: return "Directory creation";
case MDTEST_DIR_STAT_NUM: return "Directory stat";
case MDTEST_DIR_READ_NUM: return "Directory read";
case MDTEST_DIR_REMOVE_NUM: return "Directory removal";
case MDTEST_DIR_RENAME_NUM: return "Directory rename";
case MDTEST_FILE_CREATE_NUM: return "File creation";
case MDTEST_FILE_STAT_NUM: return "File stat";
case MDTEST_FILE_READ_NUM: return "File read";
case MDTEST_FILE_REMOVE_NUM: return "File removal";
case MDTEST_TREE_CREATE_NUM: return "Tree creation";
case MDTEST_TREE_REMOVE_NUM: return "Tree removal";
default: return "ERR INVALID TESTNAME :";
}
return NULL;
@ -1374,6 +1374,48 @@ int calc_allreduce_index(int iter, int rank, int op){
return iter * tableSize * o.size + rank * tableSize + op;
}
/*
* Store the results of each process in a file
*/
static void StoreRankInformation(int iterations){
const size_t size = sizeof(mdtest_results_t) * iterations;
if(rank == 0){
FILE* fd = fopen(o.saveRankDetailsCSV, "a");
if (fd == NULL){
FAIL("Cannot open saveRankPerformanceDetails file for writes!");
}
mdtest_results_t * results = malloc(size * o.size);
MPI_Gather(o.summary_table, size / sizeof(double), MPI_DOUBLE, results, size / sizeof(double), MPI_DOUBLE, 0, testComm);
for(int iter = 0; iter < iterations; iter++){
for(int i=0; i < o.size; i++){
mdtest_results_t * cur = & results[i * iterations + iter];
char buff[4096];
char * cpos = buff;
cpos += sprintf(cpos, "%d,%llu", i, (long long unsigned) o.items);
for(int e = 0; (e < MDTEST_TREE_CREATE_NUM) || (i == 0 && e < MDTEST_LAST_NUM); e++){
if(cur->items[e] == 0){
cpos += sprintf(cpos, ",,");
}else{
cpos += sprintf(cpos, ",%.10e,%.10e", cur->items[e] / cur->time_before_barrier[e], cur->time_before_barrier[e]);
}
}
cpos += sprintf(cpos, "\n");
int ret = fwrite(buff, cpos - buff, 1, fd);
if(ret != 1){
WARN("Couln't append to saveRankPerformanceDetailsCSV file\n");
break;
}
}
}
fclose(fd);
}else{
/* this is a hack for now assuming all datatypes in the structure are double */
MPI_Gather(o.summary_table, size / sizeof(double), MPI_DOUBLE, NULL, size / sizeof(double), MPI_DOUBLE, 0, testComm);
}
}
void summarize_results(int iterations, int print_time) {
char const * access;
int i, j, k;
@ -1404,7 +1446,7 @@ void summarize_results(int iterations, int print_time) {
continue;
}
curr = o.summary_table[j].rate[i];
fprintf(out_logfile, "Rank %d Iter %d Test %s Rate: %e\n", rank, j, access, curr);
fprintf(out_logfile, "Rank %d Iter %d Test %-20s Rate: %e\n", rank, j, access, curr);
}
}
}
@ -1452,8 +1494,8 @@ void summarize_results(int iterations, int print_time) {
}
VERBOSE(0,-1,"\nSUMMARY %s: (of %d iterations)", print_time ? "time": "rate", iterations);
VERBOSE(0,-1," Operation per Rank: Max Min Mean Std Dev per Iteration: Max Min Mean");
VERBOSE(0,-1," --------- --- --- ---- ------- --- --- ----");
VERBOSE(0,-1," Operation per Rank: Max Min Mean Std Dev per Iteration: Max Min Mean");
VERBOSE(0,-1," --------- --- --- ---- ------- --- --- ----");
for (i = start; i < stop; i++) {
min = max = all[i];
@ -1500,7 +1542,7 @@ void summarize_results(int iterations, int print_time) {
sd = sqrt(var);
access = mdtest_test_name(i);
if (i != 2) {
fprintf(out_logfile, " %s ", access);
fprintf(out_logfile, " %-22s ", access);
fprintf(out_logfile, "%14.3f ", max);
fprintf(out_logfile, "%14.3f ", min);
fprintf(out_logfile, "%14.3f ", mean);
@ -1522,8 +1564,8 @@ void summarize_results(int iterations, int print_time) {
}
}
if(stonewall_items != 0){
fprintf(out_logfile, " File create (stonewall) : ");
fprintf(out_logfile, "%14s %14s %14.3f %14s\n", "NA", "NA", print_time ? stonewall_time : stonewall_items / stonewall_time, "NA");
fprintf(out_logfile, " File create (stonewall) ");
fprintf(out_logfile, "%13s %14s %14.3f %14s\n", "NA", "NA", print_time ? stonewall_time : stonewall_items / stonewall_time, "NA");
}
/* calculate tree create/remove rates, applies only to Rank 0 */
@ -1565,7 +1607,7 @@ void summarize_results(int iterations, int print_time) {
var = var / (iterations);
sd = sqrt(var);
access = mdtest_test_name(i);
fprintf(out_logfile, " %s ", access);
fprintf(out_logfile, " %-22s ", access);
fprintf(out_logfile, "%14.3f ", max);
fprintf(out_logfile, "%14.3f ", min);
fprintf(out_logfile, "%14.3f ", mean);
@ -1673,6 +1715,28 @@ void md_validate_tests() {
if(o.create_only && o.read_only && o.read_bytes > o.write_bytes)
FAIL("When writing and reading files, read bytes must be smaller than write bytes");
if (rank == 0 && o.saveRankDetailsCSV){
// check that the file is writeable, truncate it and add header
FILE* fd = fopen(o.saveRankDetailsCSV, "w");
if (fd == NULL){
FAIL("Cannot open saveRankPerformanceDetails file for write!");
}
char * head = "rank,items";
int ret = fwrite(head, strlen(head), 1, fd);
for(int e = 0; e < MDTEST_LAST_NUM; e++){
char buf[1024];
const char * str = mdtest_test_name(e);
sprintf(buf, ",rate-%s,time-%s", str, str);
ret = fwrite(buf, strlen(buf), 1, fd);
if(ret != 1){
FAIL("Cannot write header to saveRankPerformanceDetails file");
}
}
fwrite("\n", 1, 1, fd);
fclose(fd);
}
}
void show_file_system_size(char *file_system) {
@ -2103,6 +2167,7 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE *
{'z', NULL, "depth of hierarchical directory structure", OPTION_OPTIONAL_ARGUMENT, 'd', & o.depth},
{'Z', NULL, "print time instead of rate", OPTION_FLAG, 'd', & o.print_time},
{0, "warningAsErrors", "Any warning should lead to an error.", OPTION_FLAG, 'd', & aiori_warning_as_errors},
{0, "saveRankPerformanceDetails", "Save the individual rank information into this CSV file.", OPTION_OPTIONAL_ARGUMENT, 's', & o.saveRankDetailsCSV},
LAST_OPTION
};
options_all_t * global_options = airoi_create_all_module_options(options);
@ -2406,6 +2471,9 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE *
}else{
summarize_results(iterations, o.print_time);
}
if(o.saveRankDetailsCSV){
StoreRankInformation(iterations);
}
if (i == 1 && stride > 1) {
i = 0;
}

View File

@ -24,6 +24,7 @@ typedef struct
{
double rate[MDTEST_LAST_NUM]; /* Calculated throughput */
double time[MDTEST_LAST_NUM]; /* Time */
double time_before_barrier[MDTEST_TREE_CREATE_NUM]; /* individual time before executing the barrier */
uint64_t items[MDTEST_LAST_NUM]; /* Number of operations done */
/* Statistics when hitting the stonewall */