MDTest allow storing information per rank for later analysis (#335)
* MDTest allow storing information per rank for later analysis when using the --saveRankPerformanceDetails=<FILE> option * MDTest: refactored calculation of results, added time_before_executing a barrier.master
parent
b5963380ae
commit
106eebecb3
164
src/mdtest.c
164
src/mdtest.c
|
@ -163,6 +163,7 @@ typedef struct {
|
|||
#ifdef HAVE_LUSTRE_LUSTREAPI
|
||||
int global_dir_layout;
|
||||
#endif /* HAVE_LUSTRE_LUSTREAPI */
|
||||
char * saveRankDetailsCSV; /* save the details about the performance to a file */
|
||||
|
||||
mdtest_results_t * summary_table;
|
||||
pid_t pid;
|
||||
|
@ -897,9 +898,20 @@ void rename_dir_test(const int dirs, const long dir_iter, const char *path, rank
|
|||
}
|
||||
}
|
||||
|
||||
static void updateResult(mdtest_results_t * res, mdtest_test_num_t test, uint64_t item_count, int t, double * times, double * tBefore){
|
||||
res->rate[test] = item_count/(times[t] - times[t-1]);
|
||||
res->time[test] = times[t] - times[t-1];
|
||||
if(tBefore){
|
||||
res->time_before_barrier[test] = tBefore[t] - times[t-1];
|
||||
}
|
||||
res->items[test] = item_count;
|
||||
res->stonewall_last_item[test] = o.items;
|
||||
}
|
||||
|
||||
void directory_test(const int iteration, const int ntasks, const char *path, rank_progress_t * progress) {
|
||||
int size;
|
||||
double t[6] = {0};
|
||||
double tBefore[6] = {0};
|
||||
char temp_path[MAX_PATHLEN];
|
||||
mdtest_results_t * res = & o.summary_table[iteration];
|
||||
|
||||
|
@ -907,6 +919,7 @@ void directory_test(const int iteration, const int ntasks, const char *path, ran
|
|||
|
||||
VERBOSE(1,-1,"Entering directory_test on %s", path );
|
||||
|
||||
tBefore[0] = GetTimeStamp();
|
||||
MPI_Barrier(testComm);
|
||||
t[0] = GetTimeStamp();
|
||||
|
||||
|
@ -941,6 +954,7 @@ void directory_test(const int iteration, const int ntasks, const char *path, ran
|
|||
progress->stone_wall_timer_seconds = 0;
|
||||
}
|
||||
|
||||
tBefore[1] = GetTimeStamp();
|
||||
phase_end();
|
||||
t[1] = GetTimeStamp();
|
||||
|
||||
|
@ -967,6 +981,7 @@ void directory_test(const int iteration, const int ntasks, const char *path, ran
|
|||
}
|
||||
}
|
||||
}
|
||||
tBefore[2] = GetTimeStamp();
|
||||
phase_end();
|
||||
t[2] = GetTimeStamp();
|
||||
|
||||
|
@ -993,6 +1008,7 @@ void directory_test(const int iteration, const int ntasks, const char *path, ran
|
|||
}
|
||||
}
|
||||
}
|
||||
tBefore[3] = GetTimeStamp();
|
||||
phase_end();
|
||||
|
||||
t[3] = GetTimeStamp();
|
||||
|
@ -1013,6 +1029,7 @@ void directory_test(const int iteration, const int ntasks, const char *path, ran
|
|||
rename_dir_test(1, dir_iter, temp_path, progress);
|
||||
}
|
||||
}
|
||||
tBefore[4] = GetTimeStamp();
|
||||
phase_end();
|
||||
|
||||
t[4] = GetTimeStamp();
|
||||
|
@ -1048,6 +1065,7 @@ void directory_test(const int iteration, const int ntasks, const char *path, ran
|
|||
}
|
||||
}
|
||||
|
||||
tBefore[5] = GetTimeStamp();
|
||||
phase_end();
|
||||
t[5] = GetTimeStamp();
|
||||
|
||||
|
@ -1067,28 +1085,16 @@ void directory_test(const int iteration, const int ntasks, const char *path, ran
|
|||
|
||||
/* calculate times */
|
||||
if (o.create_only) {
|
||||
res->rate[MDTEST_DIR_CREATE_NUM] = o.items*size/(t[1] - t[0]);
|
||||
res->time[MDTEST_DIR_CREATE_NUM] = t[1] - t[0];
|
||||
res->items[MDTEST_DIR_CREATE_NUM] = o.items*size;
|
||||
res->stonewall_last_item[MDTEST_DIR_CREATE_NUM] = o.items;
|
||||
updateResult(res, MDTEST_DIR_CREATE_NUM, o.items*size, 1, t, tBefore);
|
||||
}
|
||||
if (o.stat_only) {
|
||||
res->rate[MDTEST_DIR_STAT_NUM] = o.items*size/(t[2] - t[1]);
|
||||
res->time[MDTEST_DIR_STAT_NUM] = t[2] - t[1];
|
||||
res->items[MDTEST_DIR_STAT_NUM] = o.items*size;
|
||||
res->stonewall_last_item[MDTEST_DIR_STAT_NUM] = o.items;
|
||||
updateResult(res, MDTEST_DIR_STAT_NUM, o.items*size, 2, t, tBefore);
|
||||
}
|
||||
if (o.read_only) {
|
||||
res->rate[MDTEST_DIR_READ_NUM] = o.items*size/(t[3] - t[2]);
|
||||
res->time[MDTEST_DIR_READ_NUM] = t[3] - t[2];
|
||||
res->items[MDTEST_DIR_READ_NUM] = o.items*size;
|
||||
res->stonewall_last_item[MDTEST_DIR_READ_NUM] = o.items;
|
||||
updateResult(res, MDTEST_DIR_READ_NUM, o.items*size, 3, t, tBefore);
|
||||
}
|
||||
if (o.remove_only) {
|
||||
res->rate[MDTEST_DIR_REMOVE_NUM] = o.items*size/(t[5] - t[4]);
|
||||
res->time[MDTEST_DIR_REMOVE_NUM] = t[5] - t[4];
|
||||
res->items[MDTEST_DIR_REMOVE_NUM] = o.items*size;
|
||||
res->stonewall_last_item[MDTEST_DIR_REMOVE_NUM] = o.items;
|
||||
updateResult(res, MDTEST_DIR_REMOVE_NUM, o.items*size, 5, t, tBefore);
|
||||
}
|
||||
VERBOSE(1,-1," Directory creation: %14.3f sec, %14.3f ops/sec", t[1] - t[0], o.summary_table[iteration].rate[0]);
|
||||
VERBOSE(1,-1," Directory stat : %14.3f sec, %14.3f ops/sec", t[2] - t[1], o.summary_table[iteration].rate[1]);
|
||||
|
@ -1177,11 +1183,13 @@ void file_test_create(const int iteration, const int ntasks, const char *path, r
|
|||
void file_test(const int iteration, const int ntasks, const char *path, rank_progress_t * progress) {
|
||||
int size;
|
||||
double t[5] = {0};
|
||||
double tBefore[5] = {0};
|
||||
char temp_path[MAX_PATHLEN];
|
||||
MPI_Comm_size(testComm, &size);
|
||||
|
||||
VERBOSE(3,5,"Entering file_test on %s", path);
|
||||
|
||||
tBefore[0] = GetTimeStamp();
|
||||
MPI_Barrier(testComm);
|
||||
t[0] = GetTimeStamp();
|
||||
|
||||
|
@ -1215,6 +1223,7 @@ void file_test(const int iteration, const int ntasks, const char *path, rank_pro
|
|||
}
|
||||
}
|
||||
|
||||
tBefore[1] = GetTimeStamp();
|
||||
phase_end();
|
||||
t[1] = GetTimeStamp();
|
||||
|
||||
|
@ -1238,6 +1247,7 @@ void file_test(const int iteration, const int ntasks, const char *path, rank_pro
|
|||
}
|
||||
}
|
||||
|
||||
tBefore[2] = GetTimeStamp();
|
||||
phase_end();
|
||||
t[2] = GetTimeStamp();
|
||||
|
||||
|
@ -1265,6 +1275,7 @@ void file_test(const int iteration, const int ntasks, const char *path, rank_pro
|
|||
}
|
||||
}
|
||||
|
||||
tBefore[3] = GetTimeStamp();
|
||||
phase_end();
|
||||
t[3] = GetTimeStamp();
|
||||
|
||||
|
@ -1295,6 +1306,7 @@ void file_test(const int iteration, const int ntasks, const char *path, rank_pro
|
|||
}
|
||||
}
|
||||
|
||||
tBefore[4] = GetTimeStamp();
|
||||
phase_end();
|
||||
t[4] = GetTimeStamp();
|
||||
if (o.remove_only) {
|
||||
|
@ -1318,28 +1330,16 @@ void file_test(const int iteration, const int ntasks, const char *path, rank_pro
|
|||
mdtest_results_t * res = & o.summary_table[iteration];
|
||||
/* calculate times */
|
||||
if (o.create_only) {
|
||||
res->rate[MDTEST_FILE_CREATE_NUM] = o.items*size/(t[1] - t[0]);
|
||||
res->time[MDTEST_FILE_CREATE_NUM] = t[1] - t[0];
|
||||
res->items[MDTEST_FILE_CREATE_NUM] = o.items*o.size;
|
||||
res->stonewall_last_item[MDTEST_FILE_CREATE_NUM] = o.items;
|
||||
updateResult(res, MDTEST_FILE_CREATE_NUM, o.items*size, 1, t, tBefore);
|
||||
}
|
||||
if (o.stat_only) {
|
||||
res->rate[MDTEST_FILE_STAT_NUM] = o.items*size/(t[2] - t[1]);
|
||||
res->time[MDTEST_FILE_STAT_NUM] = t[2] - t[1];
|
||||
res->items[MDTEST_FILE_STAT_NUM] = o.items*o.size;
|
||||
res->stonewall_last_item[MDTEST_FILE_STAT_NUM] = o.items;
|
||||
updateResult(res, MDTEST_FILE_STAT_NUM, o.items*size, 2, t, tBefore);
|
||||
}
|
||||
if (o.read_only) {
|
||||
res->rate[MDTEST_FILE_READ_NUM] = o.items*o.size/(t[3] - t[2]);
|
||||
res->time[MDTEST_FILE_READ_NUM] = t[3] - t[2];
|
||||
res->items[MDTEST_FILE_READ_NUM] = o.items*o.size;
|
||||
res->stonewall_last_item[MDTEST_FILE_READ_NUM] = o.items;
|
||||
updateResult(res, MDTEST_FILE_READ_NUM, o.items*size, 3, t, tBefore);
|
||||
}
|
||||
if (o.remove_only) {
|
||||
res->rate[MDTEST_FILE_REMOVE_NUM] = o.items*o.size/(t[4] - t[3]);
|
||||
res->time[MDTEST_FILE_REMOVE_NUM] = t[4] - t[3];
|
||||
res->items[MDTEST_FILE_REMOVE_NUM] = o.items*o.size;
|
||||
res->stonewall_last_item[MDTEST_FILE_REMOVE_NUM] = o.items;
|
||||
updateResult(res, MDTEST_FILE_REMOVE_NUM, o.items*size, 4, t, tBefore);
|
||||
}
|
||||
|
||||
VERBOSE(1,-1," File creation : %14.3f sec, %14.3f ops/sec", t[1] - t[0], o.summary_table[iteration].rate[4]);
|
||||
|
@ -1353,17 +1353,17 @@ void file_test(const int iteration, const int ntasks, const char *path, rank_pro
|
|||
|
||||
char const * mdtest_test_name(int i){
|
||||
switch (i) {
|
||||
case MDTEST_DIR_CREATE_NUM: return "Directory creation :";
|
||||
case MDTEST_DIR_STAT_NUM: return "Directory stat :";
|
||||
case MDTEST_DIR_READ_NUM: return NULL;
|
||||
case MDTEST_DIR_REMOVE_NUM: return "Directory removal :";
|
||||
case MDTEST_DIR_RENAME_NUM: return "Directory rename :";
|
||||
case MDTEST_FILE_CREATE_NUM: return "File creation :";
|
||||
case MDTEST_FILE_STAT_NUM: return "File stat :";
|
||||
case MDTEST_FILE_READ_NUM: return "File read :";
|
||||
case MDTEST_FILE_REMOVE_NUM: return "File removal :";
|
||||
case MDTEST_TREE_CREATE_NUM: return "Tree creation :";
|
||||
case MDTEST_TREE_REMOVE_NUM: return "Tree removal :";
|
||||
case MDTEST_DIR_CREATE_NUM: return "Directory creation";
|
||||
case MDTEST_DIR_STAT_NUM: return "Directory stat";
|
||||
case MDTEST_DIR_READ_NUM: return "Directory read";
|
||||
case MDTEST_DIR_REMOVE_NUM: return "Directory removal";
|
||||
case MDTEST_DIR_RENAME_NUM: return "Directory rename";
|
||||
case MDTEST_FILE_CREATE_NUM: return "File creation";
|
||||
case MDTEST_FILE_STAT_NUM: return "File stat";
|
||||
case MDTEST_FILE_READ_NUM: return "File read";
|
||||
case MDTEST_FILE_REMOVE_NUM: return "File removal";
|
||||
case MDTEST_TREE_CREATE_NUM: return "Tree creation";
|
||||
case MDTEST_TREE_REMOVE_NUM: return "Tree removal";
|
||||
default: return "ERR INVALID TESTNAME :";
|
||||
}
|
||||
return NULL;
|
||||
|
@ -1374,6 +1374,48 @@ int calc_allreduce_index(int iter, int rank, int op){
|
|||
return iter * tableSize * o.size + rank * tableSize + op;
|
||||
}
|
||||
|
||||
/*
|
||||
* Store the results of each process in a file
|
||||
*/
|
||||
static void StoreRankInformation(int iterations){
|
||||
const size_t size = sizeof(mdtest_results_t) * iterations;
|
||||
if(rank == 0){
|
||||
FILE* fd = fopen(o.saveRankDetailsCSV, "a");
|
||||
if (fd == NULL){
|
||||
FAIL("Cannot open saveRankPerformanceDetails file for writes!");
|
||||
}
|
||||
|
||||
mdtest_results_t * results = malloc(size * o.size);
|
||||
MPI_Gather(o.summary_table, size / sizeof(double), MPI_DOUBLE, results, size / sizeof(double), MPI_DOUBLE, 0, testComm);
|
||||
|
||||
for(int iter = 0; iter < iterations; iter++){
|
||||
for(int i=0; i < o.size; i++){
|
||||
mdtest_results_t * cur = & results[i * iterations + iter];
|
||||
char buff[4096];
|
||||
char * cpos = buff;
|
||||
cpos += sprintf(cpos, "%d,%llu", i, (long long unsigned) o.items);
|
||||
for(int e = 0; (e < MDTEST_TREE_CREATE_NUM) || (i == 0 && e < MDTEST_LAST_NUM); e++){
|
||||
if(cur->items[e] == 0){
|
||||
cpos += sprintf(cpos, ",,");
|
||||
}else{
|
||||
cpos += sprintf(cpos, ",%.10e,%.10e", cur->items[e] / cur->time_before_barrier[e], cur->time_before_barrier[e]);
|
||||
}
|
||||
}
|
||||
cpos += sprintf(cpos, "\n");
|
||||
int ret = fwrite(buff, cpos - buff, 1, fd);
|
||||
if(ret != 1){
|
||||
WARN("Couln't append to saveRankPerformanceDetailsCSV file\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
fclose(fd);
|
||||
}else{
|
||||
/* this is a hack for now assuming all datatypes in the structure are double */
|
||||
MPI_Gather(o.summary_table, size / sizeof(double), MPI_DOUBLE, NULL, size / sizeof(double), MPI_DOUBLE, 0, testComm);
|
||||
}
|
||||
}
|
||||
|
||||
void summarize_results(int iterations, int print_time) {
|
||||
char const * access;
|
||||
int i, j, k;
|
||||
|
@ -1404,7 +1446,7 @@ void summarize_results(int iterations, int print_time) {
|
|||
continue;
|
||||
}
|
||||
curr = o.summary_table[j].rate[i];
|
||||
fprintf(out_logfile, "Rank %d Iter %d Test %s Rate: %e\n", rank, j, access, curr);
|
||||
fprintf(out_logfile, "Rank %d Iter %d Test %-20s Rate: %e\n", rank, j, access, curr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1500,7 +1542,7 @@ void summarize_results(int iterations, int print_time) {
|
|||
sd = sqrt(var);
|
||||
access = mdtest_test_name(i);
|
||||
if (i != 2) {
|
||||
fprintf(out_logfile, " %s ", access);
|
||||
fprintf(out_logfile, " %-22s ", access);
|
||||
fprintf(out_logfile, "%14.3f ", max);
|
||||
fprintf(out_logfile, "%14.3f ", min);
|
||||
fprintf(out_logfile, "%14.3f ", mean);
|
||||
|
@ -1522,8 +1564,8 @@ void summarize_results(int iterations, int print_time) {
|
|||
}
|
||||
}
|
||||
if(stonewall_items != 0){
|
||||
fprintf(out_logfile, " File create (stonewall) : ");
|
||||
fprintf(out_logfile, "%14s %14s %14.3f %14s\n", "NA", "NA", print_time ? stonewall_time : stonewall_items / stonewall_time, "NA");
|
||||
fprintf(out_logfile, " File create (stonewall) ");
|
||||
fprintf(out_logfile, "%13s %14s %14.3f %14s\n", "NA", "NA", print_time ? stonewall_time : stonewall_items / stonewall_time, "NA");
|
||||
}
|
||||
|
||||
/* calculate tree create/remove rates, applies only to Rank 0 */
|
||||
|
@ -1565,7 +1607,7 @@ void summarize_results(int iterations, int print_time) {
|
|||
var = var / (iterations);
|
||||
sd = sqrt(var);
|
||||
access = mdtest_test_name(i);
|
||||
fprintf(out_logfile, " %s ", access);
|
||||
fprintf(out_logfile, " %-22s ", access);
|
||||
fprintf(out_logfile, "%14.3f ", max);
|
||||
fprintf(out_logfile, "%14.3f ", min);
|
||||
fprintf(out_logfile, "%14.3f ", mean);
|
||||
|
@ -1673,6 +1715,28 @@ void md_validate_tests() {
|
|||
|
||||
if(o.create_only && o.read_only && o.read_bytes > o.write_bytes)
|
||||
FAIL("When writing and reading files, read bytes must be smaller than write bytes");
|
||||
|
||||
if (rank == 0 && o.saveRankDetailsCSV){
|
||||
// check that the file is writeable, truncate it and add header
|
||||
FILE* fd = fopen(o.saveRankDetailsCSV, "w");
|
||||
if (fd == NULL){
|
||||
FAIL("Cannot open saveRankPerformanceDetails file for write!");
|
||||
}
|
||||
char * head = "rank,items";
|
||||
int ret = fwrite(head, strlen(head), 1, fd);
|
||||
for(int e = 0; e < MDTEST_LAST_NUM; e++){
|
||||
char buf[1024];
|
||||
const char * str = mdtest_test_name(e);
|
||||
|
||||
sprintf(buf, ",rate-%s,time-%s", str, str);
|
||||
ret = fwrite(buf, strlen(buf), 1, fd);
|
||||
if(ret != 1){
|
||||
FAIL("Cannot write header to saveRankPerformanceDetails file");
|
||||
}
|
||||
}
|
||||
fwrite("\n", 1, 1, fd);
|
||||
fclose(fd);
|
||||
}
|
||||
}
|
||||
|
||||
void show_file_system_size(char *file_system) {
|
||||
|
@ -2103,6 +2167,7 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE *
|
|||
{'z', NULL, "depth of hierarchical directory structure", OPTION_OPTIONAL_ARGUMENT, 'd', & o.depth},
|
||||
{'Z', NULL, "print time instead of rate", OPTION_FLAG, 'd', & o.print_time},
|
||||
{0, "warningAsErrors", "Any warning should lead to an error.", OPTION_FLAG, 'd', & aiori_warning_as_errors},
|
||||
{0, "saveRankPerformanceDetails", "Save the individual rank information into this CSV file.", OPTION_OPTIONAL_ARGUMENT, 's', & o.saveRankDetailsCSV},
|
||||
LAST_OPTION
|
||||
};
|
||||
options_all_t * global_options = airoi_create_all_module_options(options);
|
||||
|
@ -2406,6 +2471,9 @@ mdtest_results_t * mdtest_run(int argc, char **argv, MPI_Comm world_com, FILE *
|
|||
}else{
|
||||
summarize_results(iterations, o.print_time);
|
||||
}
|
||||
if(o.saveRankDetailsCSV){
|
||||
StoreRankInformation(iterations);
|
||||
}
|
||||
if (i == 1 && stride > 1) {
|
||||
i = 0;
|
||||
}
|
||||
|
|
|
@ -24,6 +24,7 @@ typedef struct
|
|||
{
|
||||
double rate[MDTEST_LAST_NUM]; /* Calculated throughput */
|
||||
double time[MDTEST_LAST_NUM]; /* Time */
|
||||
double time_before_barrier[MDTEST_TREE_CREATE_NUM]; /* individual time before executing the barrier */
|
||||
uint64_t items[MDTEST_LAST_NUM]; /* Number of operations done */
|
||||
|
||||
/* Statistics when hitting the stonewall */
|
||||
|
|
Loading…
Reference in New Issue