Merge pull request #281 from hpc/fix-mdtest-iter

Bugfix: the MDTest index calculation for multiple iterations was incorrect.

Fix the bug reported by Rick to increase clarity. The previous offset calculation when using multiple iterations was:
for (i = start; i < stop; i++) // i = table position == test number
for (k=0; k < size; k++)
for (j = 0; j < iterations; j++)
value = all[(k * tableSize * iterations) + (j*tableSize) + i];

Note that the mean and min/max were then computed over these values.
But since the values are stored in memory in the order iteration, rank, table,
the correct term is: value = all[j * tableSize * size + k * tableSize + i];

Assuming iterations = 2 and size = 3, the value for test i=0 was computed from:
all[0 * 2 *tbl + 0 * tbl] = 0tbl
all[0 * 2 *tbl + 1 * tbl] = 1tbl
all[1 * 2 *tbl + 0 * tbl] = 2tbl
all[1 * 2 *tbl + 1 * tbl] = 3tbl
all[2 * 2 *tbl + 0 * tbl] = 4tbl
all[2 * 2 *tbl + 1 * tbl] = 5tbl

A clearer traversal would have been:
all[0 * 3 *tbl + 0 * tbl] = 0tbl
all[0 * 3 *tbl + 1 * tbl] = 1tbl
all[0 * 3 *tbl + 2 * tbl] = 2tbl
all[1 * 3 *tbl + 0 * tbl] = 3tbl
all[1 * 3 *tbl + 1 * tbl] = 4tbl
all[1 * 3 *tbl + 2 * tbl] = 5tbl

In that sense, it wasn't a functional bug, but it decreased readability; now that we want to print the performance of the individual ranks, it is useful to fix this.
master
Julian Kunkel 2020-11-30 14:17:42 +00:00 committed by GitHub
commit 4a3e4806bd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 22 additions and 6 deletions

View File

@ -1284,6 +1284,11 @@ char const * mdtest_test_name(int i){
return NULL;
}
/*
 * Translate an (iteration, rank, operation) triple into an offset in the
 * flat result array. Entries are laid out with the iteration as the
 * slowest-varying index, then the rank, then the operation inside one
 * table of MDTEST_LAST_NUM entries.
 *
 * iter: iteration number
 * rank: process rank
 * op:   position inside the per-rank table (test number)
 *
 * Returns iter * MDTEST_LAST_NUM * o.size + rank * MDTEST_LAST_NUM + op.
 */
int calc_allreduce_index(int iter, int rank, int op){
    const int entries_per_table = MDTEST_LAST_NUM;
    /* Horner form of iter*table*size + rank*table + op */
    return (iter * o.size + rank) * entries_per_table + op;
}
void summarize_results(int iterations, int print_time) {
char const * access;
int i, j, k;
@ -1304,6 +1309,20 @@ void summarize_results(int iterations, int print_time) {
}
}
if(o.print_all_proc && 0){
// This code prints the result table for debugging
for (i = 0; i < tableSize; i++) {
for (j = 0; j < iterations; j++) {
access = mdtest_test_name(i);
if(access == NULL){
continue;
}
curr = o.summary_table[j].rate[i];
fprintf(out_logfile, "Rank %d Iter %d Test %s Rate: %e\n", rank, j, access, curr);
}
}
}
if (rank != 0) {
return;
}
@ -1327,7 +1346,6 @@ void summarize_results(int iterations, int print_time) {
start = stop = 0;
}
if(o.print_all_proc){
fprintf(out_logfile, "\nPer process result (%s):\n", print_time ? "time" : "rate");
for (j = 0; j < iterations; j++) {
@ -1339,7 +1357,7 @@ void summarize_results(int iterations, int print_time) {
}
fprintf(out_logfile, "Test %s", access);
for (k=0; k < o.size; k++) {
curr = all[(k*tableSize*iterations) + (j*tableSize) + i];
curr = all[calc_allreduce_index(j, k, i)];
fprintf(out_logfile, "%c%e", (k==0 ? ' ': ','), curr);
}
fprintf(out_logfile, "\n");
@ -1355,8 +1373,7 @@ void summarize_results(int iterations, int print_time) {
min = max = all[i];
for (k=0; k < o.size; k++) {
for (j = 0; j < iterations; j++) {
curr = all[(k*tableSize*iterations)
+ (j*tableSize) + i];
curr = all[calc_allreduce_index(j, k, i)];
if (min > curr) {
min = curr;
}
@ -1385,7 +1402,6 @@ void summarize_results(int iterations, int print_time) {
fflush(out_logfile);
}
sum = var = 0;
}
// TODO generalize once more stonewall timers are supported
@ -1402,7 +1418,7 @@ void summarize_results(int iterations, int print_time) {
fprintf(out_logfile, "%14s %14s %14.3f %14s\n", "NA", "NA", print_time ? stonewall_time : stonewall_items / stonewall_time, "NA");
}
/* calculate tree create/remove rates */
/* calculate tree create/remove rates, applies only to Rank 0 */
for (i = 8; i < tableSize; i++) {
min = max = all[i];
for (j = 0; j < iterations; j++) {