From c28ed6dc728e9a14831d0705fb5ddd60d415ba08 Mon Sep 17 00:00:00 2001 From: "Julian M. Kunkel" Date: Mon, 2 Nov 2020 17:34:11 +0000 Subject: [PATCH 01/15] Partial conversion of existing md-workbench to IOR APIs. --- src/md-workbench.c | 1031 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1031 insertions(+) create mode 100644 src/md-workbench.c diff --git a/src/md-workbench.c b/src/md-workbench.c new file mode 100644 index 0000000..6e664a3 --- /dev/null +++ b/src/md-workbench.c @@ -0,0 +1,1031 @@ +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "aiori.h" +#include "utilities.h" +#include "parse_options.h" + +/* +This is the modified version md-workbench-fs that can utilize AIORI. +It follows the hierarchical file system semantics in contrast to the md-workbench (without -fs) which has dataset and object semantics. + */ + +// successfull, errors +typedef struct { + int suc; + int err; +} op_stat_t; + +// A runtime for an operation and when the operation was started +typedef struct{ + float time_since_app_start; + float runtime; +} time_result_t; + +typedef struct{ + float min; + float q1; + float median; + float q3; + float q90; + float q99; + float max; +} time_statistics_t; + +// statistics for running a single phase +typedef struct{ // NOTE: if this type is changed, adjust end_phase() !!! + double t; // maximum time + double * t_all; + + op_stat_t dset_name; + op_stat_t dset_create; + op_stat_t dset_delete; + + op_stat_t obj_name; + op_stat_t obj_create; + op_stat_t obj_read; + op_stat_t obj_stat; + op_stat_t obj_delete; + + // time measurements individual runs + uint64_t repeats; + time_result_t * time_create; + time_result_t * time_read; + time_result_t * time_stat; + time_result_t * time_delete; + + time_statistics_t stats_create; + time_statistics_t stats_read; + time_statistics_t stats_stat; + time_statistics_t stats_delete; + + // the maximum time for any single operation + double max_op_time; + double phase_start_timer; + int stonewall_iterations; +} phase_stat_t; + +#define CHECK_MPI_RET(ret) if (ret != MPI_SUCCESS){ printf("Unexpected error in MPI on Line %d\n", __LINE__);} +#define LLU (long long unsigned) +#define min(a,b) (a < b ? a : b) + +struct benchmark_options{ + ior_aiori_t const * backend; + void * backend_options; + + char * interface; + int num; + int precreate; + int dset_count; + + int offset; + int iterations; + int file_size; + int read_only; + int stonewall_timer; + int stonewall_timer_wear_out; + + char * latency_file_prefix; + int latency_keep_all; + + int phase_cleanup; + int phase_precreate; + int phase_benchmark; + + //int limit_memory; + //int limit_memory_between_phases; + + int verbosity; + int process_report; + + int print_detailed_stats; + int quiet_output; + + char * run_info_file; + + int ignore_precreate_errors; + int rank; + int size; + + float relative_waiting_factor; + int adaptive_waiting_mode; + + uint64_t start_item_number; +}; + +static int global_iteration = 0; + +struct benchmark_options o; + +void init_options(){ + memset(& o, 0, sizeof(o)); + o.interface = "POSIX"; + o.num = 1000; + o.precreate = 3000; + o.dset_count = 10; + o.offset = 1; + o.iterations = 3; + o.file_size = 3901; + o.run_info_file = "mdtest.status"; +} + +static void wait(double runtime){ + double waittime = runtime * o.relative_waiting_factor; + //printf("waittime: %e\n", waittime); + if(waittime < 0.01){ + double start; + start = GetTimeStamp(); + double cur = GetTimeStamp(); + double end = cur + waittime; + while (cur < end){ + cur = GetTimeStamp(); + } + }else{ + struct timespec w; + w.tv_sec = (time_t) (waittime); + w.tv_nsec = (long) ((waittime - w.tv_sec) * 1000 * 1000 * 1000); + nanosleep(& w, NULL); + } +} + +static void init_stats(phase_stat_t * p, size_t repeats){ + memset(p, 0, sizeof(phase_stat_t)); + p->repeats = repeats; + size_t timer_size = repeats * sizeof(time_result_t); + p->time_create = (time_result_t *) malloc(timer_size); + p->time_read = (time_result_t *) malloc(timer_size); + p->time_stat = (time_result_t *) malloc(timer_size); + p->time_delete = (time_result_t *) malloc(timer_size); +} + +static float add_timed_result(double start, double phase_start_timer, time_result_t * results, size_t pos, double * max_time, double * out_op_time){ + float curtime = start - phase_start_timer; + double op_time = GetTimeStamp(); + results[pos].runtime = (float) op_time; + results[pos].time_since_app_start = curtime; + if (op_time > *max_time){ + *max_time = op_time; + } + *out_op_time = op_time; + return curtime; +} + +static void print_detailed_stat_header(){ + printf("phase\t\td name\tcreate\tdelete\tob nam\tcreate\tread\tstat\tdelete\tt_inc_b\tt_no_bar\tthp\tmax_t\n"); +} + +static int sum_err(phase_stat_t * p){ + return p->dset_name.err + p->dset_create.err + p->dset_delete.err + p->obj_name.err + p->obj_create.err + p->obj_read.err + p->obj_stat.err + p->obj_delete.err; +} + +static double statistics_mean(int count, double * arr){ + double sum = 0; + for(int i=0; i < o.size; i++){ + sum += arr[i]; + } + return sum / o.size; +} + +static double statistics_std_dev(int count, double * arr){ + double mean = statistics_mean(count, arr); + double sum = 0; + for(int i=0; i < o.size; i++){ + sum += (mean - arr[i])*(mean - arr[i]); + } + return sqrt(sum / (o.size-1)); +} + +static void statistics_minmax(int count, double * arr, double * out_min, double * out_max){ + double min = 1e308; + double max = 0; + for(int i=0; i < o.size; i++){ + min = (arr[i] < min) ? arr[i] : min; + max = (arr[i] > max) ? arr[i] : max; + } + *out_min = min; + *out_max = max; +} + +static void print_p_stat(char * buff, const char * name, phase_stat_t * p, double t, int print_global){ + const double tp = (double)(p->obj_create.suc + p->obj_read.suc) * o.file_size / t / 1024 / 1024; + + const int errs = sum_err(p); + double r_min = 0; + double r_max = 0; + double r_mean = 0; + double r_std = 0; + + if(p->t_all){ + // we can compute several derived values that provide insight about quality of service, latency distribution and load balancing + statistics_minmax(o.size, p->t_all, & r_min, & r_max); + r_mean = statistics_mean(o.size, p->t_all); + r_std = statistics_std_dev(o.size, p->t_all); + } + + if (o.print_detailed_stats){ + sprintf(buff, "%s \t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%.3fs\t%.3fs\t%.2f MiB/s %.4e", name, p->dset_name.suc, p->dset_create.suc, p->dset_delete.suc, p->obj_name.suc, p->obj_create.suc, p->obj_read.suc, p->obj_stat.suc, p->obj_delete.suc, p->t, t, tp, p->max_op_time); + + if (errs > 0){ + sprintf(buff, "%s err\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d", name, p->dset_name.err, p->dset_create.err, p->dset_delete.err, p->obj_name.err, p->obj_create.err, p->obj_read.err, p->obj_stat.err, p->obj_delete.err); + } + }else{ + int pos = 0; + // single line + pos += sprintf(buff, "%s process max:%.2fs ", name, t); + if(print_global){ + pos += sprintf(buff + pos, "min:%.1fs mean: %.1fs balance:%.1f stddev:%.1f ", r_min, r_mean, r_min/r_max * 100.0, r_std); + } + int ioops_per_iter = 4; + if(o.read_only){ + ioops_per_iter = 2; + } + + switch(name[0]){ + case('b'): + pos += sprintf(buff + pos, "rate:%.1f iops/s objects:%d rate:%.1f obj/s tp:%.1f MiB/s op-max:%.4es", + p->obj_read.suc * ioops_per_iter / t, // write, stat, read, delete + p->obj_read.suc, + p->obj_read.suc / t, + tp, + p->max_op_time); + + if(o.relative_waiting_factor > 1e-9){ + pos += sprintf(buff + pos, " waiting_factor:%.2f", o.relative_waiting_factor); + } + break; + case('p'): + pos += sprintf(buff + pos, "rate:%.1f iops/s dsets: %d objects:%d rate:%.3f dset/s rate:%.1f obj/s tp:%.1f MiB/s op-max:%.4es", + (p->dset_create.suc + p->obj_create.suc) / t, + p->dset_create.suc, + p->obj_create.suc, + p->dset_create.suc / t, + p->obj_create.suc / t, + tp, + p->max_op_time); + break; + case('c'): + pos += sprintf(buff + pos, "rate:%.1f iops/s objects:%d dsets: %d rate:%.1f obj/s rate:%.3f dset/s op-max:%.4es", + (p->obj_delete.suc + p->dset_delete.suc) / t, + p->obj_delete.suc, + p->dset_delete.suc, + p->obj_delete.suc / t, + p->dset_delete.suc / t, + p->max_op_time); + break; + default: + pos = sprintf(buff, "%s: unknown phase", name); + break; + } + + if(! o.quiet_output || errs > 0){ + pos += sprintf(buff + pos, " (%d errs", errs); + if(errs > 0){ + pos += sprintf(buff + pos, "!!!)" ); + }else{ + pos += sprintf(buff + pos, ")" ); + } + } + if(! o.quiet_output && p->stonewall_iterations){ + pos += sprintf(buff + pos, " stonewall-iter:%d", p->stonewall_iterations); + } + + if(p->stats_read.max > 1e-9){ + time_statistics_t stat = p->stats_read; + pos += sprintf(buff + pos, " read(%.4es, %.4es, %.4es, %.4es, %.4es, %.4es, %.4es)", stat.min, stat.q1, stat.median, stat.q3, stat.q90, stat.q99, stat.max); + } + if(p->stats_stat.max > 1e-9){ + time_statistics_t stat = p->stats_stat; + pos += sprintf(buff + pos, " stat(%.4es, %.4es, %.4es, %.4es, %.4es, %.4es, %.4es)", stat.min, stat.q1, stat.median, stat.q3, stat.q90, stat.q99, stat.max); + } + if(p->stats_create.max > 1e-9){ + time_statistics_t stat = p->stats_create; + pos += sprintf(buff + pos, " create(%.4es, %.4es, %.4es, %.4es, %.4es, %.4es, %.4es)", stat.min, stat.q1, stat.median, stat.q3, stat.q90, stat.q99, stat.max); + } + if(p->stats_delete.max > 1e-9){ + time_statistics_t stat = p->stats_delete; + pos += sprintf(buff + pos, " delete(%.4es, %.4es, %.4es, %.4es, %.4es, %.4es, %.4es)", stat.min, stat.q1, stat.median, stat.q3, stat.q90, stat.q99, stat.max); + } + } +} + +static int compare_floats(time_result_t * x, time_result_t * y){ + return x->runtime < y->runtime ? -1 : (x->runtime > y->runtime ? +1 : 0); +} + +static double runtime_quantile(int repeats, time_result_t * times, float quantile){ + int pos = round(quantile * repeats + 0.49); + return times[pos].runtime; +} + +static uint64_t aggregate_timers(int repeats, int max_repeats, time_result_t * times, time_result_t * global_times){ + uint64_t count = 0; + int ret; + // due to stonewall, the number of repeats may be different per process + if(o.rank == 0){ + MPI_Status status; + memcpy(global_times, times, repeats * 2 * sizeof(float)); + count += repeats; + for(int i=1; i < o.size; i++){ + int cnt; + ret = MPI_Recv(& global_times[count], max_repeats*2, MPI_FLOAT, i, 888, MPI_COMM_WORLD, & status); + CHECK_MPI_RET(ret) + MPI_Get_count(& status, MPI_FLOAT, & cnt); + count += cnt / 2; + } + }else{ + ret = MPI_Send(times, repeats * 2, MPI_FLOAT, 0, 888, MPI_COMM_WORLD); + CHECK_MPI_RET(ret) + } + + return count; +} + +static void compute_histogram(const char * name, time_result_t * times, time_statistics_t * stats, size_t repeats, int writeLatencyFile){ + if(writeLatencyFile && o.latency_file_prefix ){ + char file[1024]; + sprintf(file, "%s-%.2f-%d-%s.csv", o.latency_file_prefix, o.relative_waiting_factor, global_iteration, name); + FILE * f = fopen(file, "w+"); + if(f == NULL){ + printf("%d: Error writing to latency file: %s\n", o.rank, file); + return; + } + fprintf(f, "time,runtime\n"); + for(size_t i = 0; i < repeats; i++){ + fprintf(f, "%.7f,%.4e\n", times[i].time_since_app_start, times[i].runtime); + } + fclose(f); + } + // now sort the times and pick the quantiles + qsort(times, repeats, sizeof(time_result_t), (int (*)(const void *, const void *)) compare_floats); + stats->min = times[0].runtime; + stats->q1 = runtime_quantile(repeats, times, 0.25); + if(repeats % 2 == 0){ + stats->median = (times[repeats/2].runtime + times[repeats/2 - 1].runtime)/2.0; + }else{ + stats->median = times[repeats/2].runtime; + } + stats->q3 = runtime_quantile(repeats, times, 0.75); + stats->q90 = runtime_quantile(repeats, times, 0.90); + stats->q99 = runtime_quantile(repeats, times, 0.99); + stats->max = times[repeats - 1].runtime; +} + +static void end_phase(const char * name, phase_stat_t * p){ + int ret; + char buff[4096]; + + //char * limit_memory_P = NULL; + MPI_Barrier(MPI_COMM_WORLD); + + int max_repeats = o.precreate * o.dset_count; + if(strcmp(name,"benchmark") == 0){ + max_repeats = o.num * o.dset_count; + } + + // prepare the summarized report + phase_stat_t g_stat; + init_stats(& g_stat, (o.rank == 0 ? 1 : 0) * ((size_t) max_repeats) * o.size); + // reduce timers + ret = MPI_Reduce(& p->t, & g_stat.t, 2, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + CHECK_MPI_RET(ret) + if(o.rank == 0) { + g_stat.t_all = (double*) malloc(sizeof(double) * o.size); + } + ret = MPI_Gather(& p->t, 1, MPI_DOUBLE, g_stat.t_all, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + CHECK_MPI_RET(ret) + ret = MPI_Reduce(& p->dset_name, & g_stat.dset_name, 2*(3+5), MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); + CHECK_MPI_RET(ret) + ret = MPI_Reduce(& p->max_op_time, & g_stat.max_op_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + CHECK_MPI_RET(ret) + if( p->stonewall_iterations ){ + ret = MPI_Reduce(& p->repeats, & g_stat.repeats, 1, MPI_UINT64_T, MPI_MIN, 0, MPI_COMM_WORLD); + CHECK_MPI_RET(ret) + g_stat.stonewall_iterations = p->stonewall_iterations; + } + int write_rank0_latency_file = (o.rank == 0) && ! o.latency_keep_all; + + if(strcmp(name,"precreate") == 0){ + uint64_t repeats = aggregate_timers(p->repeats, max_repeats, p->time_create, g_stat.time_create); + if(o.rank == 0){ + compute_histogram("precreate-all", g_stat.time_create, & g_stat.stats_create, repeats, o.latency_keep_all); + } + compute_histogram("precreate", p->time_create, & p->stats_create, p->repeats, write_rank0_latency_file); + }else if(strcmp(name,"cleanup") == 0){ + uint64_t repeats = aggregate_timers(p->repeats, max_repeats, p->time_delete, g_stat.time_delete); + if(o.rank == 0) { + compute_histogram("cleanup-all", g_stat.time_delete, & g_stat.stats_delete, repeats, o.latency_keep_all); + } + compute_histogram("cleanup", p->time_delete, & p->stats_delete, p->repeats, write_rank0_latency_file); + }else if(strcmp(name,"benchmark") == 0){ + uint64_t repeats = aggregate_timers(p->repeats, max_repeats, p->time_read, g_stat.time_read); + if(o.rank == 0) { + compute_histogram("read-all", g_stat.time_read, & g_stat.stats_read, repeats, o.latency_keep_all); + } + compute_histogram("read", p->time_read, & p->stats_read, p->repeats, write_rank0_latency_file); + + repeats = aggregate_timers(p->repeats, max_repeats, p->time_stat, g_stat.time_stat); + if(o.rank == 0) { + compute_histogram("stat-all", g_stat.time_stat, & g_stat.stats_stat, repeats, o.latency_keep_all); + } + compute_histogram("stat", p->time_stat, & p->stats_stat, p->repeats, write_rank0_latency_file); + + if(! o.read_only){ + repeats = aggregate_timers(p->repeats, max_repeats, p->time_create, g_stat.time_create); + if(o.rank == 0) { + compute_histogram("create-all", g_stat.time_create, & g_stat.stats_create, repeats, o.latency_keep_all); + } + compute_histogram("create", p->time_create, & p->stats_create, p->repeats, write_rank0_latency_file); + + repeats = aggregate_timers(p->repeats, max_repeats, p->time_delete, g_stat.time_delete); + if(o.rank == 0) { + compute_histogram("delete-all", g_stat.time_delete, & g_stat.stats_delete, repeats, o.latency_keep_all); + } + compute_histogram("delete", p->time_delete, & p->stats_delete, p->repeats, write_rank0_latency_file); + } + } + + if (o.rank == 0){ + //print the stats: + print_p_stat(buff, name, & g_stat, g_stat.t, 1); + printf("%s\n", buff); + } + + if(o.process_report){ + if(o.rank == 0){ + print_p_stat(buff, name, p, p->t, 0); + printf("0: %s\n", buff); + for(int i=1; i < o.size; i++){ + MPI_Recv(buff, 4096, MPI_CHAR, i, 4711, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + printf("%d: %s\n", i, buff); + } + }else{ + print_p_stat(buff, name, p, p->t, 0); + MPI_Send(buff, 4096, MPI_CHAR, 0, 4711, MPI_COMM_WORLD); + } + } + + if(g_stat.t_all){ + free(g_stat.t_all); + } + if(p->time_create){ + free(p->time_create); + free(p->time_read); + free(p->time_stat); + free(p->time_delete); + } + if(g_stat.time_create){ + free(g_stat.time_create); + free(g_stat.time_read); + free(g_stat.time_stat); + free(g_stat.time_delete); + } + + // allocate memory if necessary + // ret = mem_preallocate(& limit_memory_P, o.limit_memory_between_phases, o.verbosity >= 3); + // if( ret != 0){ + // printf("%d: Error allocating memory!\n", o.rank); + // } + // mem_free_preallocated(& limit_memory_P); +} + +void run_precreate(phase_stat_t * s, int current_index){ + char dset[4096]; + char obj_name[4096]; + int ret; + + for(int i=0; i < o.dset_count; i++){ + ret = o.plugin->def_dset_name(dset, o.rank, i); + if (ret != 0){ + if (! o.ignore_precreate_errors){ + printf("Error defining the dataset name\n"); + MPI_Abort(MPI_COMM_WORLD, 1); + } + s->dset_name.err++; + continue; + } + s->dset_name.suc++; + ret = o.plugin->create_dset(dset); + if (ret == MD_NOOP){ + // do not increment any counter + }else if (ret == 0){ + s->dset_create.suc++; + }else{ + s->dset_create.err++; + if (! o.ignore_precreate_errors){ + printf("%d: Error while creating the dset: %s\n", o.rank, dset); + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + } + + char * buf = malloc(o.file_size); + memset(buf, o.rank % 256, o.file_size); + double op_timer; // timer for individual operations + size_t pos = -1; // position inside the individual measurement array + double op_time; + + // create the obj + for(int f=current_index; f < o.precreate; f++){ + for(int d=0; d < o.dset_count; d++){ + ret = o.plugin->def_dset_name(dset, o.rank, d); + pos++; + ret = o.plugin->def_obj_name(obj_name, o.rank, d, f); + if (ret != 0){ + s->dset_name.err++; + if (! o.ignore_precreate_errors){ + printf("%d: Error while creating the obj name\n", o.rank); + fflush(stdout); + MPI_Abort(MPI_COMM_WORLD, 1); + } + s->obj_name.err++; + continue; + } + + op_timer = GetTimeStamp(); + ret = o.plugin->write_obj(dset, obj_name, buf, o.file_size); + add_timed_result(op_timer, s->phase_start_timer, s->time_create, pos, & s->max_op_time, & op_time); + + if (o.verbosity >= 2){ + printf("%d: write %s:%s (%d)\n", o.rank, dset, obj_name, ret); + } + + if (ret == MD_NOOP){ + // do not increment any counter + }else if (ret == 0){ + s->obj_create.suc++; + }else{ + s->obj_create.err++; + if (! o.ignore_precreate_errors){ + printf("%d: Error while creating the obj: %s\n", o.rank, obj_name); + fflush(stdout); + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + } + } + free(buf); +} + +/* FIFO: create a new file, write to it. Then read from the first created file, delete it... */ +void run_benchmark(phase_stat_t * s, int * current_index_p){ + char dset[4096]; + char obj_name[4096]; + int ret; + char * buf = malloc(o.file_size); + memset(buf, o.rank % 256, o.file_size); + double op_timer; // timer for individual operations + size_t pos = -1; // position inside the individual measurement array + int start_index = *current_index_p; + int total_num = o.num; + int armed_stone_wall = (o.stonewall_timer > 0); + int f; + double phase_allreduce_time = 0; + + for(f=0; f < total_num; f++){ + float bench_runtime = 0; // the time since start + for(int d=0; d < o.dset_count; d++){ + double op_time; + const int prevFile = f + start_index; + pos++; + + int readRank = (o.rank - o.offset * (d+1)) % o.size; + readRank = readRank < 0 ? readRank + o.size : readRank; + ret = o.plugin->def_obj_name(obj_name, readRank, d, prevFile); + if (ret != 0){ + s->obj_name.err++; + continue; + } + ret = o.plugin->def_dset_name(dset, readRank, d); + + op_timer = GetTimeStamp(); + ret = o.plugin->stat_obj(dset, obj_name, o.file_size); + bench_runtime = add_timed_result(op_timer, s->phase_start_timer, s->time_stat, pos, & s->max_op_time, & op_time); + if(o.relative_waiting_factor > 1e-9) { + wait(op_time); + } + + if (o.verbosity >= 2){ + printf("%d: stat %s:%s (%d)\n", o.rank, dset, obj_name, ret); + } + + if(ret != 0 && ret != MD_NOOP){ + if (o.verbosity) + printf("%d: Error while stating the obj: %s\n", o.rank, dset); + s->obj_stat.err++; + continue; + } + s->obj_stat.suc++; + + if (o.verbosity >= 2){ + printf("%d: read %s:%s \n", o.rank, dset, obj_name); + } + + op_timer = GetTimeStamp(); + ret = o.plugin->read_obj(dset, obj_name, buf, o.file_size); + bench_runtime = add_timed_result(op_timer, s->phase_start_timer, s->time_read, pos, & s->max_op_time, & op_time); + if(o.relative_waiting_factor > 1e-9) { + wait(op_time); + } + + if (ret == 0){ + s->obj_read.suc++; + }else if (ret == MD_NOOP){ + // nothing to do + }else if (ret == MD_ERROR_FIND){ + printf("%d: Error while accessing the file %s (%s)\n", o.rank, dset, strerror(errno)); + s->obj_read.err++; + }else{ + printf("%d: Error while reading the file %s (%s)\n", o.rank, dset, strerror(errno)); + s->obj_read.err++; + } + + if(o.read_only){ + continue; + } + + op_timer = GetTimeStamp(); + ret = o.plugin->delete_obj(dset, obj_name); + bench_runtime = add_timed_result(op_timer, s->phase_start_timer, s->time_delete, pos, & s->max_op_time, & op_time); + if(o.relative_waiting_factor > 1e-9) { + wait(op_time); + } + + if (o.verbosity >= 2){ + printf("%d: delete %s:%s (%d)\n", o.rank, dset, obj_name, ret); + } + + if (ret == 0){ + s->obj_delete.suc++; + }else if (ret == MD_NOOP){ + // nothing to do + }else{ + printf("%d: Error while deleting the object %s:%s\n", o.rank, dset, obj_name); + s->obj_delete.err++; + } + + int writeRank = (o.rank + o.offset * (d+1)) % o.size; + ret = o.plugin->def_obj_name(obj_name, writeRank, d, o.precreate + prevFile); + if (ret != 0){ + s->obj_name.err++; + continue; + } + ret = o.plugin->def_dset_name(dset, writeRank, d); + + op_timer = GetTimeStamp(); + ret = o.plugin->write_obj(dset, obj_name, buf, o.file_size); + bench_runtime = add_timed_result(op_timer, s->phase_start_timer, s->time_create, pos, & s->max_op_time, & op_time); + if(o.relative_waiting_factor > 1e-9) { + wait(op_time); + } + + if (o.verbosity >= 2){ + printf("%d: write %s:%s (%d)\n", o.rank, dset, obj_name, ret); + } + + if (ret == 0){ + s->obj_create.suc++; + }else if (ret == MD_ERROR_CREATE){ + if (o.verbosity) + printf("%d: Error while creating the obj: %s\n",o.rank, dset); + s->obj_create.err++; + }else if (ret == MD_NOOP){ + // do not increment any counter + }else{ + if (o.verbosity) + printf("%d: Error while writing the obj: %s\n", o.rank, dset); + s->obj_create.err++; + } + } // end loop + + if(armed_stone_wall && bench_runtime >= o.stonewall_timer){ + if(o.verbosity){ + printf("%d: stonewall runtime %fs (%ds)\n", o.rank, bench_runtime, o.stonewall_timer); + } + if(! o.stonewall_timer_wear_out){ + s->stonewall_iterations = f; + break; + } + armed_stone_wall = 0; + // wear out mode, now reduce the maximum + int cur_pos = f + 1; + phase_allreduce_time = GetTimeStamp() - s->phase_start_timer; + int ret = MPI_Allreduce(& cur_pos, & total_num, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); + CHECK_MPI_RET(ret) + s->phase_start_timer = GetTimeStamp(); + s->stonewall_iterations = total_num; + if(o.rank == 0){ + printf("stonewall wear out %fs (%d iter)\n", bench_runtime, total_num); + } + if(f == total_num){ + break; + } + } + } + s->t = GetTimeStamp() - s->phase_start_timer + phase_allreduce_time; + if(armed_stone_wall && o.stonewall_timer_wear_out){ + int f = total_num; + int ret = MPI_Allreduce(& f, & total_num, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); + CHECK_MPI_RET(ret) + s->stonewall_iterations = total_num; + } + if(o.stonewall_timer && ! o.stonewall_timer_wear_out){ + // TODO FIXME + int sh = s->stonewall_iterations; + int ret = MPI_Allreduce(& sh, & s->stonewall_iterations, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); + CHECK_MPI_RET(ret) + } + + if(! o.read_only) { + *current_index_p += f; + } + s->repeats = pos + 1; + free(buf); +} + +void run_cleanup(phase_stat_t * s, int start_index){ + char dset[4096]; + char obj_name[4096]; + int ret; + double op_timer; // timer for individual operations + size_t pos = -1; // position inside the individual measurement array + + for(int d=0; d < o.dset_count; d++){ + ret = o.plugin->def_dset_name(dset, o.rank, d); + + for(int f=0; f < o.precreate; f++){ + double op_time; + pos++; + ret = o.plugin->def_obj_name(obj_name, o.rank, d, f + start_index); + + op_timer = GetTimeStamp(); + ret = o.plugin->delete_obj(dset, obj_name); + add_timed_result(op_timer, s->phase_start_timer, s->time_delete, pos, & s->max_op_time, & op_time); + + if (o.verbosity >= 2){ + printf("%d: delete %s:%s (%d)\n", o.rank, dset, obj_name, ret); + } + + if (ret == MD_NOOP){ + // nothing to do + }else if (ret == 0){ + s->obj_delete.suc++; + }else if(ret != MD_NOOP){ + s->obj_delete.err++; + } + } + + ret = o.plugin->rm_dset(dset); + + if (o.verbosity >= 2){ + printf("%d: delete dset %s (%d)\n", o.rank, dset, ret); + } + + if (ret == 0){ + s->dset_delete.suc++; + }else if (ret != MD_NOOP){ + s->dset_delete.err++; + } + } +} + + +static option_help options [] = { + {'O', "offset", "Offset in o.ranks between writers and readers. Writers and readers should be located on different nodes.", OPTION_OPTIONAL_ARGUMENT, 'd', & o.offset}, + {'a', "api", "The API (plugin) to use for the benchmark, use list to show all compiled plugins.", OPTION_OPTIONAL_ARGUMENT, 's', & o.interface}, + {'I', "obj-per-proc", "Number of I/O operations per data set.", OPTION_OPTIONAL_ARGUMENT, 'd', & o.num}, + {'L', "latency", "Measure the latency for individual operations, prefix the result files with the provided filename.", OPTION_OPTIONAL_ARGUMENT, 's', & o.latency_file_prefix}, + {0, "latency-all", "Keep the latency files from all ranks.", OPTION_FLAG, 'd', & o.latency_keep_all}, + {'P', "precreate-per-set", "Number of object to precreate per data set.", OPTION_OPTIONAL_ARGUMENT, 'd', & o.precreate}, + {'D', "data-sets", "Number of data sets covered per process and iteration.", OPTION_OPTIONAL_ARGUMENT, 'd', & o.dset_count}, + {'q', "quiet", "Avoid irrelevant printing.", OPTION_FLAG, 'd', & o.quiet_output}, + //{'m', "lim-free-mem", "Allocate memory until this limit (in MiB) is reached.", OPTION_OPTIONAL_ARGUMENT, 'd', & o.limit_memory}, + // {'M', "lim-free-mem-phase", "Allocate memory until this limit (in MiB) is reached between the phases, but free it before starting the next phase; the time is NOT included for the phase.", OPTION_OPTIONAL_ARGUMENT, 'd', & o.limit_memory_between_phases}, + {'S', "object-size", "Size for the created objects.", OPTION_OPTIONAL_ARGUMENT, 'd', & o.file_size}, + {'R', "iterations", "Number of times to rerun the main phase", OPTION_OPTIONAL_ARGUMENT, 'd', & o.iterations}, + {'t', "waiting-time", "Waiting time relative to runtime (1.0 is 100%%)", OPTION_OPTIONAL_ARGUMENT, 'f', & o.relative_waiting_factor}, + {'T', "adaptive-waiting", "Compute an adaptive waiting time", OPTION_FLAG, 'd', & o.adaptive_waiting_mode}, + {'1', "run-precreate", "Run precreate phase", OPTION_FLAG, 'd', & o.phase_precreate}, + {'2', "run-benchmark", "Run benchmark phase", OPTION_FLAG, 'd', & o.phase_benchmark}, + {'3', "run-cleanup", "Run cleanup phase (only run explicit phases)", OPTION_FLAG, 'd', & o.phase_cleanup}, + {'w', "stonewall-timer", "Stop each benchmark iteration after the specified seconds (if not used with -W this leads to process-specific progress!)", OPTION_OPTIONAL_ARGUMENT, 'd', & o.stonewall_timer}, + {'W', "stonewall-wear-out", "Stop with stonewall after specified time and use a soft wear-out phase -- all processes perform the same number of iterations", OPTION_FLAG, 'd', & o.stonewall_timer_wear_out}, + {0, "start-item", "The iteration number of the item to start with, allowing to offset the operations", OPTION_OPTIONAL_ARGUMENT, 'l', & o.start_item_number}, + {0, "print-detailed-stats", "Print detailed machine parsable statistics.", OPTION_FLAG, 'd', & o.print_detailed_stats}, + {0, "read-only", "Run read-only during benchmarking phase (no deletes/writes), probably use with -2", OPTION_FLAG, 'd', & o.read_only}, + {0, "ignore-precreate-errors", "Ignore errors occuring during the pre-creation phase", OPTION_FLAG, 'd', & o.ignore_precreate_errors}, + {0, "process-reports", "Independent report per process/rank", OPTION_FLAG, 'd', & o.process_report}, + {'v', "verbose", "Increase the verbosity level", OPTION_FLAG, 'd', & o.verbosity}, + {0, "run-info-file", "The log file for resuming a previous run", OPTION_OPTIONAL_ARGUMENT, 's', & o.run_info_file}, + LAST_OPTION + }; + +static void printTime(){ + char buff[100]; + time_t now = time(0); + strftime (buff, 100, "%Y-%m-%d %H:%M:%S", localtime (&now)); + printf("%s\n", buff); +} + +static int return_position(){ + int position, ret; + if( o.rank == 0){ + FILE * f = fopen(o.run_info_file, "r"); + if(! f){ + printf("[ERROR] Could not open %s for restart\n", o.run_info_file); + exit(1); + } + ret = fscanf(f, "pos: %d", & position); + if (ret != 1){ + printf("Could not read from %s for restart\n", o.run_info_file); + exit(1); + } + fclose(f); + } + ret = MPI_Bcast( & position, 1, MPI_INT, 0, MPI_COMM_WORLD ); + return position; +} + +static void store_position(int position){ + if (o.rank != 0){ + return; + } + FILE * f = fopen(o.run_info_file, "w"); + if(! f){ + printf("[ERROR] Could not open %s for saving data\n", o.run_info_file); + exit(1); + } + fprintf(f, "pos: %d\n", position); + fclose(f); +} + +int main(int argc, char ** argv){ + int ret; + int printhelp = 0; + char * limit_memory_P = NULL; + + init_options(); + + MPI_Init(& argc, & argv); + MPI_Comm_rank(MPI_COMM_WORLD, & o.rank); + MPI_Comm_size(MPI_COMM_WORLD, & o.size); + + if (o.rank == 0 && ! o.quiet_output){ + printf("Args: %s", argv[0]); + for(int i=1; i < argc; i++){ + printf(" \"%s\"", argv[i]); + } + printf("\n"); + } + + options_all_t * global_options = airoi_create_all_module_options(options); + int parsed = option_parse(argc, argv, global_options); + o.backend = aiori_select(o.interface); + if (o.backend == NULL) + ERR("Unrecognized I/O API"); + if (! o.backend->enable_mdtest) + ERR("Backend doesn't support MDWorbench"); + o.backend_options = airoi_update_module_options(o.backend, global_options); + + if (!(o.phase_cleanup || o.phase_precreate || o.phase_benchmark)){ + // enable all phases + o.phase_cleanup = o.phase_precreate = o.phase_benchmark = 1; + } + if (! o.phase_precreate && o.phase_benchmark && o.stonewall_timer && ! o.stonewall_timer_wear_out){ + if(o.rank == 0) + printf("Invalid options, if running only the benchmark phase using -2 with stonewall option then use stonewall wear-out\n"); + exit(1); + } + + ret = o.plugin->initialize(); + if (ret != 0){ + printf("%d: Error initializing module\n", o.rank); + MPI_Abort(MPI_COMM_WORLD, 1); + } + + int current_index = 0; + + if ( (o.phase_cleanup || o.phase_benchmark) && ! o.phase_precreate ){ + current_index = return_position(); + } + + if(o.start_item_number){ + printf("Using start position %lld\n", (long long) o.start_item_number); + current_index = o.start_item_number; + } + + size_t total_obj_count = o.dset_count * (size_t) (o.num * o.iterations + o.precreate) * o.size; + if (o.rank == 0 && ! o.quiet_output){ + printf("MD-Workbench total objects: %zu workingset size: %.3f MiB (version: %s) time: ", total_obj_count, ((double) o.size) * o.dset_count * o.precreate * o.file_size / 1024.0 / 1024.0, PACKAGE_VERSION); + printTime(); + if(o.num > o.precreate){ + printf("WARNING: num > precreate, this may cause the situation that no objects are available to read\n"); + } + } + + if ( o.rank == 0 && ! o.quiet_output ){ + // print the set output options + option_print_current(options); + printf("\n"); + } + + // preallocate memory if necessary + //ret = mem_preallocate(& limit_memory_P, o.limit_memory, o.verbosity >= 3); + //if(ret != 0){ + // printf("%d: Error allocating memory\n", o.rank); + // MPI_Abort(MPI_COMM_WORLD, 1); + //} + + double bench_start; + bench_start = GetTimeStamp(); + phase_stat_t phase_stats; + + if(o.rank == 0 && o.print_detailed_stats && ! o.quiet_output){ + print_detailed_stat_header(); + } + + if (o.phase_precreate){ + if (o.rank == 0){ + ret = o.plugin->prepare_global(); + if ( ret != 0 && ret != MD_NOOP ){ + if ( ! (ret == MD_EXISTS && o.ignore_precreate_errors)){ + printf("Rank 0 could not prepare the run, aborting\n"); + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + } + init_stats(& phase_stats, o.precreate * o.dset_count); + MPI_Barrier(MPI_COMM_WORLD); + + // pre-creation phase + phase_stats.phase_start_timer = GetTimeStamp(); + run_precreate(& phase_stats, current_index); + phase_stats.t = GetTimeStamp() - phase_stats.phase_start_timer; + end_phase("precreate", & phase_stats); + } + + if (o.phase_benchmark){ + // benchmark phase + for(global_iteration = 0; global_iteration < o.iterations; global_iteration++){ + if(o.adaptive_waiting_mode){ + o.relative_waiting_factor = 0; + } + init_stats(& phase_stats, o.num * o.dset_count); + MPI_Barrier(MPI_COMM_WORLD); + phase_stats.phase_start_timer = GetTimeStamp(); + run_benchmark(& phase_stats, & current_index); + end_phase("benchmark", & phase_stats); + + if(o.adaptive_waiting_mode){ + o.relative_waiting_factor = 0.0625; + for(int r=0; r <= 6; r++){ + init_stats(& phase_stats, o.num * o.dset_count); + MPI_Barrier(MPI_COMM_WORLD); + phase_stats.phase_start_timer = GetTimeStamp(); + run_benchmark(& phase_stats, & current_index); + end_phase("benchmark", & phase_stats); + o.relative_waiting_factor *= 2; + } + } + } + } + + // cleanup phase + if (o.phase_cleanup){ + init_stats(& phase_stats, o.precreate * o.dset_count); + phase_stats.phase_start_timer = GetTimeStamp(); + run_cleanup(& phase_stats, current_index); + phase_stats.t = GetTimeStamp() - phase_stats.phase_start_timer; + end_phase("cleanup", & phase_stats); + + if (o.rank == 0){ + ret = o.plugin->purge_global(); + if (ret != 0 && ret != MD_NOOP){ + printf("Rank 0: Error purging the global environment\n"); + } + } + }else{ + store_position(current_index); + } + + double t_all = GetTimeStamp(); + ret = o.plugin->finalize(); + if (ret != 0){ + printf("Error while finalization of module\n"); + } + if (o.rank == 0 && ! o.quiet_output){ + printf("Total runtime: %.0fs time: ", t_all); + printTime(); + } + + //mem_free_preallocated(& limit_memory_P); + + MPI_Finalize(); + return 0; +} From 9ace15cce35069062d2b067c4afd8cb10c0736ec Mon Sep 17 00:00:00 2001 From: "Julian M. Kunkel" Date: Mon, 2 Nov 2020 17:56:28 +0000 Subject: [PATCH 02/15] Workbench: further conversation. --- src/Makefile.am | 13 +++- src/md-workbench-main.c | 10 +++ src/md-workbench.c | 159 +++++++++++----------------------------- src/md-workbench.h | 63 ++++++++++++++++ 4 files changed, 124 insertions(+), 121 deletions(-) create mode 100644 src/md-workbench-main.c create mode 100644 src/md-workbench.h diff --git a/src/Makefile.am b/src/Makefile.am index 0adbf32..47ede87 100755 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,20 +1,25 @@ SUBDIRS = . test -bin_PROGRAMS = ior mdtest +bin_PROGRAMS = ior mdtest md-workbench if USE_CAPS -bin_PROGRAMS += IOR MDTEST +bin_PROGRAMS += IOR MDTEST MD-WORKBENCH endif -noinst_HEADERS = ior.h utilities.h parse_options.h aiori.h iordef.h ior-internal.h option.h mdtest.h aiori-debug.h aiori-POSIX.h +noinst_HEADERS = ior.h utilities.h parse_options.h aiori.h iordef.h ior-internal.h option.h mdtest.h aiori-debug.h aiori-POSIX.h md-workbench.h lib_LIBRARIES = libaiori.a -libaiori_a_SOURCES = ior.c mdtest.c utilities.c parse_options.c ior-output.c option.c +libaiori_a_SOURCES = ior.c mdtest.c utilities.c parse_options.c ior-output.c option.c md-workbench.c extraSOURCES = aiori.c aiori-DUMMY.c extraLDADD = extraLDFLAGS = extraCPPFLAGS = +md_workbench_SOURCES = md-workbench.c md-workbench-main.c +md_workbench_LDFLAGS = +md_workbench_LDADD = libaiori.a +md_workbench_CPPFLAGS = + ior_SOURCES = ior-main.c ior_LDFLAGS = ior_LDADD = libaiori.a diff --git a/src/md-workbench-main.c b/src/md-workbench-main.c new file mode 100644 index 0000000..bdd12f2 --- /dev/null +++ b/src/md-workbench-main.c @@ -0,0 +1,10 @@ +#include + +#include "md-workbench.h" + +int main(int argc, char ** argv){ + MPI_Init(& argc, & argv); + int ret = md_workbench(argc, argv); + MPI_Finalize(); + return ret; +} diff --git a/src/md-workbench.c b/src/md-workbench.c index 6e664a3..ddf250d 100644 --- a/src/md-workbench.c +++ b/src/md-workbench.c @@ -1,13 +1,14 @@ #include #include -#include #include #include #include #include #include +#include "md-workbench.h" +#include "config.h" #include "aiori.h" #include "utilities.h" #include "parse_options.h" @@ -17,60 +18,6 @@ This is the modified version md-workbench-fs that can utilize AIORI. It follows the hierarchical file system semantics in contrast to the md-workbench (without -fs) which has dataset and object semantics. */ -// successfull, errors -typedef struct { - int suc; - int err; -} op_stat_t; - -// A runtime for an operation and when the operation was started -typedef struct{ - float time_since_app_start; - float runtime; -} time_result_t; - -typedef struct{ - float min; - float q1; - float median; - float q3; - float q90; - float q99; - float max; -} time_statistics_t; - -// statistics for running a single phase -typedef struct{ // NOTE: if this type is changed, adjust end_phase() !!! - double t; // maximum time - double * t_all; - - op_stat_t dset_name; - op_stat_t dset_create; - op_stat_t dset_delete; - - op_stat_t obj_name; - op_stat_t obj_create; - op_stat_t obj_read; - op_stat_t obj_stat; - op_stat_t obj_delete; - - // time measurements individual runs - uint64_t repeats; - time_result_t * time_create; - time_result_t * time_read; - time_result_t * time_stat; - time_result_t * time_delete; - - time_statistics_t stats_create; - time_statistics_t stats_read; - time_statistics_t stats_stat; - time_statistics_t stats_delete; - - // the maximum time for any single operation - double max_op_time; - double phase_start_timer; - int stonewall_iterations; -} phase_stat_t; #define CHECK_MPI_RET(ret) if (ret != MPI_SUCCESS){ printf("Unexpected error in MPI on Line %d\n", __LINE__);} #define LLU (long long unsigned) @@ -79,6 +26,7 @@ typedef struct{ // NOTE: if this type is changed, adjust end_phase() !!! struct benchmark_options{ ior_aiori_t const * backend; void * backend_options; + aiori_xfer_hint_t hints; char * interface; int num; @@ -501,7 +449,7 @@ void run_precreate(phase_stat_t * s, int current_index){ int ret; for(int i=0; i < o.dset_count; i++){ - ret = o.plugin->def_dset_name(dset, o.rank, i); + ret = o.backend->def_dset_name(dset, o.rank, i); if (ret != 0){ if (! o.ignore_precreate_errors){ printf("Error defining the dataset name\n"); @@ -511,10 +459,8 @@ void run_precreate(phase_stat_t * s, int current_index){ continue; } s->dset_name.suc++; - ret = o.plugin->create_dset(dset); - if (ret == MD_NOOP){ - // do not increment any counter - }else if (ret == 0){ + ret = o.backend->create_dset(dset); + if (ret == 0){ s->dset_create.suc++; }else{ s->dset_create.err++; @@ -534,9 +480,9 @@ void run_precreate(phase_stat_t * s, int current_index){ // create the obj for(int f=current_index; f < o.precreate; f++){ for(int d=0; d < o.dset_count; d++){ - ret = o.plugin->def_dset_name(dset, o.rank, d); + ret = o.backend->def_dset_name(dset, o.rank, d); pos++; - ret = o.plugin->def_obj_name(obj_name, o.rank, d, f); + ret = o.backend->def_obj_name(obj_name, o.rank, d, f); if (ret != 0){ s->dset_name.err++; if (! o.ignore_precreate_errors){ @@ -549,16 +495,14 @@ void run_precreate(phase_stat_t * s, int current_index){ } op_timer = GetTimeStamp(); - ret = o.plugin->write_obj(dset, obj_name, buf, o.file_size); + ret = o.backend->write_obj(dset, obj_name, buf, o.file_size); add_timed_result(op_timer, s->phase_start_timer, s->time_create, pos, & s->max_op_time, & op_time); if (o.verbosity >= 2){ printf("%d: write %s:%s (%d)\n", o.rank, dset, obj_name, ret); } - if (ret == MD_NOOP){ - // do not increment any counter - }else if (ret == 0){ + if (ret == 0){ s->obj_create.suc++; }else{ s->obj_create.err++; @@ -597,15 +541,15 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ int readRank = (o.rank - o.offset * (d+1)) % o.size; readRank = readRank < 0 ? readRank + o.size : readRank; - ret = o.plugin->def_obj_name(obj_name, readRank, d, prevFile); + ret = o.backend->def_obj_name(obj_name, readRank, d, prevFile); if (ret != 0){ s->obj_name.err++; continue; } - ret = o.plugin->def_dset_name(dset, readRank, d); + ret = o.backend->def_dset_name(dset, readRank, d); op_timer = GetTimeStamp(); - ret = o.plugin->stat_obj(dset, obj_name, o.file_size); + ret = o.backend->stat_obj(dset, obj_name, o.file_size); bench_runtime = add_timed_result(op_timer, s->phase_start_timer, s->time_stat, pos, & s->max_op_time, & op_time); if(o.relative_waiting_factor > 1e-9) { wait(op_time); @@ -615,7 +559,7 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ printf("%d: stat %s:%s (%d)\n", o.rank, dset, obj_name, ret); } - if(ret != 0 && ret != MD_NOOP){ + if(ret != 0){ if (o.verbosity) printf("%d: Error while stating the obj: %s\n", o.rank, dset); s->obj_stat.err++; @@ -628,7 +572,7 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ } op_timer = GetTimeStamp(); - ret = o.plugin->read_obj(dset, obj_name, buf, o.file_size); + ret = o.backend->read_obj(dset, obj_name, buf, o.file_size); bench_runtime = add_timed_result(op_timer, s->phase_start_timer, s->time_read, pos, & s->max_op_time, & op_time); if(o.relative_waiting_factor > 1e-9) { wait(op_time); @@ -636,11 +580,6 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ if (ret == 0){ s->obj_read.suc++; - }else if (ret == MD_NOOP){ - // nothing to do - }else if (ret == MD_ERROR_FIND){ - printf("%d: Error while accessing the file %s (%s)\n", o.rank, dset, strerror(errno)); - s->obj_read.err++; }else{ printf("%d: Error while reading the file %s (%s)\n", o.rank, dset, strerror(errno)); s->obj_read.err++; @@ -651,7 +590,7 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ } op_timer = GetTimeStamp(); - ret = o.plugin->delete_obj(dset, obj_name); + ret = o.backend->delete_obj(dset, obj_name); bench_runtime = add_timed_result(op_timer, s->phase_start_timer, s->time_delete, pos, & s->max_op_time, & op_time); if(o.relative_waiting_factor > 1e-9) { wait(op_time); @@ -663,23 +602,21 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ if (ret == 0){ s->obj_delete.suc++; - }else if (ret == MD_NOOP){ - // nothing to do }else{ printf("%d: Error while deleting the object %s:%s\n", o.rank, dset, obj_name); s->obj_delete.err++; } int writeRank = (o.rank + o.offset * (d+1)) % o.size; - ret = o.plugin->def_obj_name(obj_name, writeRank, d, o.precreate + prevFile); + ret = o.backend->def_obj_name(obj_name, writeRank, d, o.precreate + prevFile); if (ret != 0){ s->obj_name.err++; continue; } - ret = o.plugin->def_dset_name(dset, writeRank, d); + ret = o.backend->def_dset_name(dset, writeRank, d); op_timer = GetTimeStamp(); - ret = o.plugin->write_obj(dset, obj_name, buf, o.file_size); + ret = o.backend->write_obj(dset, obj_name, buf, o.file_size); bench_runtime = add_timed_result(op_timer, s->phase_start_timer, s->time_create, pos, & s->max_op_time, & op_time); if(o.relative_waiting_factor > 1e-9) { wait(op_time); @@ -691,12 +628,6 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ if (ret == 0){ s->obj_create.suc++; - }else if (ret == MD_ERROR_CREATE){ - if (o.verbosity) - printf("%d: Error while creating the obj: %s\n",o.rank, dset); - s->obj_create.err++; - }else if (ret == MD_NOOP){ - // do not increment any counter }else{ if (o.verbosity) printf("%d: Error while writing the obj: %s\n", o.rank, dset); @@ -757,31 +688,29 @@ void run_cleanup(phase_stat_t * s, int start_index){ size_t pos = -1; // position inside the individual measurement array for(int d=0; d < o.dset_count; d++){ - ret = o.plugin->def_dset_name(dset, o.rank, d); + ret = o.backend->def_dset_name(dset, o.rank, d); for(int f=0; f < o.precreate; f++){ double op_time; pos++; - ret = o.plugin->def_obj_name(obj_name, o.rank, d, f + start_index); + ret = o.backend->def_obj_name(obj_name, o.rank, d, f + start_index); op_timer = GetTimeStamp(); - ret = o.plugin->delete_obj(dset, obj_name); + ret = o.backend->delete_obj(dset, obj_name); add_timed_result(op_timer, s->phase_start_timer, s->time_delete, pos, & s->max_op_time, & op_time); if (o.verbosity >= 2){ printf("%d: delete %s:%s (%d)\n", o.rank, dset, obj_name, ret); } - if (ret == MD_NOOP){ - // nothing to do - }else if (ret == 0){ + if (ret == 0){ s->obj_delete.suc++; - }else if(ret != MD_NOOP){ + }else{ s->obj_delete.err++; } } - ret = o.plugin->rm_dset(dset); + ret = o.backend->rm_dset(dset); if (o.verbosity >= 2){ printf("%d: delete dset %s (%d)\n", o.rank, dset, ret); @@ -789,7 +718,7 @@ void run_cleanup(phase_stat_t * s, int start_index){ if (ret == 0){ s->dset_delete.suc++; - }else if (ret != MD_NOOP){ + }else{ s->dset_delete.err++; } } @@ -865,14 +794,13 @@ static void store_position(int position){ fclose(f); } -int main(int argc, char ** argv){ +int md_workbench(int argc, char ** argv){ int ret; int printhelp = 0; char * limit_memory_P = NULL; init_options(); - MPI_Init(& argc, & argv); MPI_Comm_rank(MPI_COMM_WORLD, & o.rank); MPI_Comm_size(MPI_COMM_WORLD, & o.size); @@ -884,6 +812,7 @@ int main(int argc, char ** argv){ printf("\n"); } + memset(& o.hints, 0, sizeof(o.hints)); options_all_t * global_options = airoi_create_all_module_options(options); int parsed = option_parse(argc, argv, global_options); o.backend = aiori_select(o.interface); @@ -903,11 +832,14 @@ int main(int argc, char ** argv){ exit(1); } - ret = o.plugin->initialize(); - if (ret != 0){ - printf("%d: Error initializing module\n", o.rank); - MPI_Abort(MPI_COMM_WORLD, 1); + o.backend->initialize(o.backend_options); + if(o.backend->xfer_hints){ + o.backend->xfer_hints(& o.hints); } + if(o.backend->check_params){ + o.backend->check_params(o.backend_options); + } + int current_index = 0; @@ -952,12 +884,10 @@ int main(int argc, char ** argv){ if (o.phase_precreate){ if (o.rank == 0){ - ret = o.plugin->prepare_global(); - if ( ret != 0 && ret != MD_NOOP ){ - if ( ! (ret == MD_EXISTS && o.ignore_precreate_errors)){ - printf("Rank 0 could not prepare the run, aborting\n"); - MPI_Abort(MPI_COMM_WORLD, 1); - } + ret = o.backend->prepare_global(); + if ( ret != 0 ){ + printf("Rank 0 could not prepare the run, aborting\n"); + MPI_Abort(MPI_COMM_WORLD, 1); } } init_stats(& phase_stats, o.precreate * o.dset_count); @@ -1005,8 +935,8 @@ int main(int argc, char ** argv){ end_phase("cleanup", & phase_stats); if (o.rank == 0){ - ret = o.plugin->purge_global(); - if (ret != 0 && ret != MD_NOOP){ + ret = o.backend->purge_global(); + if (ret != 0){ printf("Rank 0: Error purging the global environment\n"); } } @@ -1015,17 +945,12 @@ int main(int argc, char ** argv){ } double t_all = GetTimeStamp(); - ret = o.plugin->finalize(); - if (ret != 0){ - printf("Error while finalization of module\n"); - } + o.backend->finalize(o.backend_options); if (o.rank == 0 && ! o.quiet_output){ printf("Total runtime: %.0fs time: ", t_all); printTime(); } //mem_free_preallocated(& limit_memory_P); - - MPI_Finalize(); return 0; } diff --git a/src/md-workbench.h b/src/md-workbench.h new file mode 100644 index 0000000..0be70b1 --- /dev/null +++ b/src/md-workbench.h @@ -0,0 +1,63 @@ +#ifndef IOR_MD_WORKBENCH_H +#define IOR_MD_WORKBENCH_H + +#include + +// successfull, errors +typedef struct { + int suc; + int err; +} op_stat_t; + +// A runtime for an operation and when the operation was started +typedef struct{ + float time_since_app_start; + float runtime; +} time_result_t; + +typedef struct{ + float min; + float q1; + float median; + float q3; + float q90; + float q99; + float max; +} time_statistics_t; + +// statistics for running a single phase +typedef struct{ // NOTE: if this type is changed, adjust end_phase() !!! + double t; // maximum time + double * t_all; + + op_stat_t dset_name; + op_stat_t dset_create; + op_stat_t dset_delete; + + op_stat_t obj_name; + op_stat_t obj_create; + op_stat_t obj_read; + op_stat_t obj_stat; + op_stat_t obj_delete; + + // time measurements individual runs + uint64_t repeats; + time_result_t * time_create; + time_result_t * time_read; + time_result_t * time_stat; + time_result_t * time_delete; + + time_statistics_t stats_create; + time_statistics_t stats_read; + time_statistics_t stats_stat; + time_statistics_t stats_delete; + + // the maximum time for any single operation + double max_op_time; + double phase_start_timer; + int stonewall_iterations; +} phase_stat_t; + +int md_workbench(int argc, char ** argv); + +#endif From 348754c87a99222bb7ab20921cc6eb465dd2ae8c Mon Sep 17 00:00:00 2001 From: "Julian M. Kunkel" Date: Mon, 2 Nov 2020 18:35:01 +0000 Subject: [PATCH 03/15] md-workbench code ported. --- src/md-workbench.c | 229 ++++++++++++++++++++++----------------------- src/md-workbench.h | 2 - 2 files changed, 112 insertions(+), 119 deletions(-) diff --git a/src/md-workbench.c b/src/md-workbench.c index ddf250d..5b39c45 100644 --- a/src/md-workbench.c +++ b/src/md-workbench.c @@ -18,6 +18,8 @@ This is the modified version md-workbench-fs that can utilize AIORI. It follows the hierarchical file system semantics in contrast to the md-workbench (without -fs) which has dataset and object semantics. */ +#define FILEMODE S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH +#define DIRMODE S_IRUSR|S_IWUSR|S_IXUSR|S_IRGRP|S_IWGRP|S_IXGRP|S_IROTH|S_IXOTH #define CHECK_MPI_RET(ret) if (ret != MPI_SUCCESS){ printf("Unexpected error in MPI on Line %d\n", __LINE__);} #define LLU (long long unsigned) @@ -57,6 +59,7 @@ struct benchmark_options{ int quiet_output; char * run_info_file; + char * prefix; // directory to work on int ignore_precreate_errors; int rank; @@ -68,20 +71,29 @@ struct benchmark_options{ uint64_t start_item_number; }; -static int global_iteration = 0; +static int global_iteration; struct benchmark_options o; +static void def_dset_name(char * out_name, int n, int d){ + sprintf(out_name, "%s/%d_%d", o.prefix, n, d); +} + +static void def_obj_name(char * out_name, char * dset, int n, int d, int i){ + sprintf(out_name, "%s/%d_%d/file-%d", dset, n, d, i); +} + void init_options(){ memset(& o, 0, sizeof(o)); o.interface = "POSIX"; + o.prefix = "./out"; o.num = 1000; o.precreate = 3000; o.dset_count = 10; o.offset = 1; o.iterations = 3; o.file_size = 3901; - o.run_info_file = "mdtest.status"; + o.run_info_file = "md-workbench.status"; } static void wait(double runtime){ @@ -130,7 +142,7 @@ static void print_detailed_stat_header(){ } static int sum_err(phase_stat_t * p){ - return p->dset_name.err + p->dset_create.err + p->dset_delete.err + p->obj_name.err + p->obj_create.err + p->obj_read.err + p->obj_stat.err + p->obj_delete.err; + return p->dset_create.err + p->dset_delete.err + p->obj_create.err + p->obj_read.err + p->obj_stat.err + p->obj_delete.err; } static double statistics_mean(int count, double * arr){ @@ -178,10 +190,10 @@ static void print_p_stat(char * buff, const char * name, phase_stat_t * p, doubl } if (o.print_detailed_stats){ - sprintf(buff, "%s \t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%.3fs\t%.3fs\t%.2f MiB/s %.4e", name, p->dset_name.suc, p->dset_create.suc, p->dset_delete.suc, p->obj_name.suc, p->obj_create.suc, p->obj_read.suc, p->obj_stat.suc, p->obj_delete.suc, p->t, t, tp, p->max_op_time); + sprintf(buff, "%s \t%d\t%d\t%d\t%d\t%d\t%d\t%.3fs\t%.3fs\t%.2f MiB/s %.4e", name, p->dset_create.suc, p->dset_delete.suc, p->obj_create.suc, p->obj_read.suc, p->obj_stat.suc, p->obj_delete.suc, p->t, t, tp, p->max_op_time); if (errs > 0){ - sprintf(buff, "%s err\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d", name, p->dset_name.err, p->dset_create.err, p->dset_delete.err, p->obj_name.err, p->obj_create.err, p->obj_read.err, p->obj_stat.err, p->obj_delete.err); + sprintf(buff, "%s err\t%d\t%d\t%d\t%d\t%d\t%d", name, p->dset_create.err, p->dset_delete.err, p->obj_create.err, p->obj_read.err, p->obj_stat.err, p->obj_delete.err); } }else{ int pos = 0; @@ -327,7 +339,7 @@ static void compute_histogram(const char * name, time_result_t * times, time_sta static void end_phase(const char * name, phase_stat_t * p){ int ret; - char buff[4096]; + char buff[MAX_PATHLEN]; //char * limit_memory_P = NULL; MPI_Barrier(MPI_COMM_WORLD); @@ -348,7 +360,7 @@ static void end_phase(const char * name, phase_stat_t * p){ } ret = MPI_Gather(& p->t, 1, MPI_DOUBLE, g_stat.t_all, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); CHECK_MPI_RET(ret) - ret = MPI_Reduce(& p->dset_name, & g_stat.dset_name, 2*(3+5), MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); + ret = MPI_Reduce(& p->dset_create, & g_stat.dset_create, 2*(2+4), MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); CHECK_MPI_RET(ret) ret = MPI_Reduce(& p->max_op_time, & g_stat.max_op_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); CHECK_MPI_RET(ret) @@ -410,12 +422,12 @@ static void end_phase(const char * name, phase_stat_t * p){ print_p_stat(buff, name, p, p->t, 0); printf("0: %s\n", buff); for(int i=1; i < o.size; i++){ - MPI_Recv(buff, 4096, MPI_CHAR, i, 4711, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Recv(buff, MAX_PATHLEN, MPI_CHAR, i, 4711, MPI_COMM_WORLD, MPI_STATUS_IGNORE); printf("%d: %s\n", i, buff); } }else{ print_p_stat(buff, name, p, p->t, 0); - MPI_Send(buff, 4096, MPI_CHAR, 0, 4711, MPI_COMM_WORLD); + MPI_Send(buff, MAX_PATHLEN, MPI_CHAR, 0, 4711, MPI_COMM_WORLD); } } @@ -444,22 +456,14 @@ static void end_phase(const char * name, phase_stat_t * p){ } void run_precreate(phase_stat_t * s, int current_index){ - char dset[4096]; - char obj_name[4096]; + char dset[MAX_PATHLEN]; + char obj_name[MAX_PATHLEN]; int ret; for(int i=0; i < o.dset_count; i++){ - ret = o.backend->def_dset_name(dset, o.rank, i); - if (ret != 0){ - if (! o.ignore_precreate_errors){ - printf("Error defining the dataset name\n"); - MPI_Abort(MPI_COMM_WORLD, 1); - } - s->dset_name.err++; - continue; - } - s->dset_name.suc++; - ret = o.backend->create_dset(dset); + def_dset_name(dset, o.rank, i); + + ret = o.backend->mkdir(dset, DIRMODE, o.backend_options); if (ret == 0){ s->dset_create.suc++; }else{ @@ -480,38 +484,32 @@ void run_precreate(phase_stat_t * s, int current_index){ // create the obj for(int f=current_index; f < o.precreate; f++){ for(int d=0; d < o.dset_count; d++){ - ret = o.backend->def_dset_name(dset, o.rank, d); + def_dset_name(dset, o.rank, d); pos++; - ret = o.backend->def_obj_name(obj_name, o.rank, d, f); - if (ret != 0){ - s->dset_name.err++; - if (! o.ignore_precreate_errors){ - printf("%d: Error while creating the obj name\n", o.rank); - fflush(stdout); - MPI_Abort(MPI_COMM_WORLD, 1); - } - s->obj_name.err++; - continue; - } + def_obj_name(obj_name, dset, o.rank, d, f); op_timer = GetTimeStamp(); - ret = o.backend->write_obj(dset, obj_name, buf, o.file_size); - add_timed_result(op_timer, s->phase_start_timer, s->time_create, pos, & s->max_op_time, & op_time); - - if (o.verbosity >= 2){ - printf("%d: write %s:%s (%d)\n", o.rank, dset, obj_name, ret); + aiori_fd_t * aiori_fh = o.backend->open(obj_name, IOR_WRONLY | IOR_CREAT, o.backend_options); + if (NULL == aiori_fh){ + FAIL("unable to open file %s", obj_name); } - - if (ret == 0){ + if ( o.file_size == (int) o.backend->xfer(WRITE, aiori_fh, (IOR_size_t *) buf, o.file_size, 0, o.backend_options)) { s->obj_create.suc++; }else{ s->obj_create.err++; if (! o.ignore_precreate_errors){ - printf("%d: Error while creating the obj: %s\n", o.rank, obj_name); - fflush(stdout); - MPI_Abort(MPI_COMM_WORLD, 1); + printf("%d: Error while creating the obj: %s\n", o.rank, obj_name); + fflush(stdout); + MPI_Abort(MPI_COMM_WORLD, 1); } } + o.backend->close(aiori_fh, o.backend_options); + + add_timed_result(op_timer, s->phase_start_timer, s->time_create, pos, & s->max_op_time, & op_time); + + if (o.verbosity >= 2){ + printf("%d: write %s:%s (%d)\n", o.rank, dset, obj_name, ret); + } } } free(buf); @@ -519,8 +517,8 @@ void run_precreate(phase_stat_t * s, int current_index){ /* FIFO: create a new file, write to it. Then read from the first created file, delete it... */ void run_benchmark(phase_stat_t * s, int * current_index_p){ - char dset[4096]; - char obj_name[4096]; + char dset[MAX_PATHLEN]; + char obj_name[MAX_PATHLEN]; int ret; char * buf = malloc(o.file_size); memset(buf, o.rank % 256, o.file_size); @@ -531,25 +529,26 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ int armed_stone_wall = (o.stonewall_timer > 0); int f; double phase_allreduce_time = 0; + aiori_fd_t * aiori_fh; for(f=0; f < total_num; f++){ float bench_runtime = 0; // the time since start for(int d=0; d < o.dset_count; d++){ double op_time; + struct stat stat_buf; const int prevFile = f + start_index; pos++; int readRank = (o.rank - o.offset * (d+1)) % o.size; readRank = readRank < 0 ? readRank + o.size : readRank; - ret = o.backend->def_obj_name(obj_name, readRank, d, prevFile); - if (ret != 0){ - s->obj_name.err++; - continue; - } - ret = o.backend->def_dset_name(dset, readRank, d); + def_dset_name(dset, readRank, d); + def_obj_name(obj_name, dset, readRank, d, prevFile); op_timer = GetTimeStamp(); - ret = o.backend->stat_obj(dset, obj_name, o.file_size); + + ret = o.backend->stat(obj_name, & stat_buf, o.backend_options); + // TODO potentially check return value must be identical to o.file_size + bench_runtime = add_timed_result(op_timer, s->phase_start_timer, s->time_stat, pos, & s->max_op_time, & op_time); if(o.relative_waiting_factor > 1e-9) { wait(op_time); @@ -572,51 +571,61 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ } op_timer = GetTimeStamp(); - ret = o.backend->read_obj(dset, obj_name, buf, o.file_size); + aiori_fh = o.backend->open(obj_name, IOR_WRONLY | IOR_CREAT, o.backend_options); + if (NULL == aiori_fh){ + FAIL("unable to open file %s", obj_name); + } + if ( o.file_size == (int) o.backend->xfer(READ, aiori_fh, (IOR_size_t *) buf, o.file_size, 0, o.backend_options)) { + s->obj_read.suc++; + }else{ + s->obj_read.err++; + printf("%d: Error while reading the obj: %s\n", o.rank, obj_name); + fflush(stdout); + MPI_Abort(MPI_COMM_WORLD, 1); + } + o.backend->close(aiori_fh, o.backend_options); + bench_runtime = add_timed_result(op_timer, s->phase_start_timer, s->time_read, pos, & s->max_op_time, & op_time); if(o.relative_waiting_factor > 1e-9) { wait(op_time); } - - if (ret == 0){ - s->obj_read.suc++; - }else{ - printf("%d: Error while reading the file %s (%s)\n", o.rank, dset, strerror(errno)); - s->obj_read.err++; - } - if(o.read_only){ continue; } op_timer = GetTimeStamp(); - ret = o.backend->delete_obj(dset, obj_name); + o.backend->delete(obj_name, o.backend_options); bench_runtime = add_timed_result(op_timer, s->phase_start_timer, s->time_delete, pos, & s->max_op_time, & op_time); if(o.relative_waiting_factor > 1e-9) { wait(op_time); } if (o.verbosity >= 2){ - printf("%d: delete %s:%s (%d)\n", o.rank, dset, obj_name, ret); - } - - if (ret == 0){ - s->obj_delete.suc++; - }else{ - printf("%d: Error while deleting the object %s:%s\n", o.rank, dset, obj_name); - s->obj_delete.err++; + printf("%d: delete %s:%s\n", o.rank, dset, obj_name); } + s->obj_delete.suc++; int writeRank = (o.rank + o.offset * (d+1)) % o.size; - ret = o.backend->def_obj_name(obj_name, writeRank, d, o.precreate + prevFile); - if (ret != 0){ - s->obj_name.err++; - continue; - } - ret = o.backend->def_dset_name(dset, writeRank, d); + def_dset_name(dset, writeRank, d); + def_obj_name(obj_name, dset, writeRank, d, o.precreate + prevFile); op_timer = GetTimeStamp(); - ret = o.backend->write_obj(dset, obj_name, buf, o.file_size); + aiori_fh = o.backend->open(obj_name, IOR_WRONLY | IOR_CREAT, o.backend_options); + if (NULL == aiori_fh){ + FAIL("unable to open file %s", obj_name); + } + if ( o.file_size == (int) o.backend->xfer(WRITE, aiori_fh, (IOR_size_t *) buf, o.file_size, 0, o.backend_options)) { + s->obj_create.suc++; + }else{ + s->obj_create.err++; + if (! o.ignore_precreate_errors){ + printf("%d: Error while creating the obj: %s\n", o.rank, obj_name); + fflush(stdout); + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + o.backend->close(aiori_fh, o.backend_options); + bench_runtime = add_timed_result(op_timer, s->phase_start_timer, s->time_create, pos, & s->max_op_time, & op_time); if(o.relative_waiting_factor > 1e-9) { wait(op_time); @@ -625,14 +634,6 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ if (o.verbosity >= 2){ printf("%d: write %s:%s (%d)\n", o.rank, dset, obj_name, ret); } - - if (ret == 0){ - s->obj_create.suc++; - }else{ - if (o.verbosity) - printf("%d: Error while writing the obj: %s\n", o.rank, dset); - s->obj_create.err++; - } } // end loop if(armed_stone_wall && bench_runtime >= o.stonewall_timer){ @@ -681,45 +682,36 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ } void run_cleanup(phase_stat_t * s, int start_index){ - char dset[4096]; - char obj_name[4096]; - int ret; + char dset[MAX_PATHLEN]; + char obj_name[MAX_PATHLEN]; double op_timer; // timer for individual operations size_t pos = -1; // position inside the individual measurement array for(int d=0; d < o.dset_count; d++){ - ret = o.backend->def_dset_name(dset, o.rank, d); + def_dset_name(dset, o.rank, d); for(int f=0; f < o.precreate; f++){ double op_time; pos++; - ret = o.backend->def_obj_name(obj_name, o.rank, d, f + start_index); + def_obj_name(obj_name, dset, o.rank, d, f + start_index); op_timer = GetTimeStamp(); - ret = o.backend->delete_obj(dset, obj_name); + o.backend->delete(obj_name, o.backend_options); add_timed_result(op_timer, s->phase_start_timer, s->time_delete, pos, & s->max_op_time, & op_time); if (o.verbosity >= 2){ - printf("%d: delete %s:%s (%d)\n", o.rank, dset, obj_name, ret); - } - - if (ret == 0){ - s->obj_delete.suc++; - }else{ - s->obj_delete.err++; + printf("%d: delete %s\n", o.rank, obj_name); } + s->obj_delete.suc++; } - ret = o.backend->rm_dset(dset); - - if (o.verbosity >= 2){ - printf("%d: delete dset %s (%d)\n", o.rank, dset, ret); - } - - if (ret == 0){ + if (o.backend->rmdir(dset, o.backend_options)) { s->dset_delete.suc++; }else{ - s->dset_delete.err++; + printf("unable to remove directory %s", dset); + } + if (o.verbosity >= 2){ + printf("%d: delete dset %s\n", o.rank, dset); } } } @@ -733,6 +725,7 @@ static option_help options [] = { {0, "latency-all", "Keep the latency files from all ranks.", OPTION_FLAG, 'd', & o.latency_keep_all}, {'P', "precreate-per-set", "Number of object to precreate per data set.", OPTION_OPTIONAL_ARGUMENT, 'd', & o.precreate}, {'D', "data-sets", "Number of data sets covered per process and iteration.", OPTION_OPTIONAL_ARGUMENT, 'd', & o.dset_count}, + {'o', NULL, "Output directory", OPTION_OPTIONAL_ARGUMENT, 's', & o.prefix}, {'q', "quiet", "Avoid irrelevant printing.", OPTION_FLAG, 'd', & o.quiet_output}, //{'m', "lim-free-mem", "Allocate memory until this limit (in MiB) is reached.", OPTION_OPTIONAL_ARGUMENT, 'd', & o.limit_memory}, // {'M', "lim-free-mem-phase", "Allocate memory until this limit (in MiB) is reached between the phases, but free it before starting the next phase; the time is NOT included for the phase.", OPTION_OPTIONAL_ARGUMENT, 'd', & o.limit_memory_between_phases}, @@ -799,6 +792,7 @@ int md_workbench(int argc, char ** argv){ int printhelp = 0; char * limit_memory_P = NULL; + global_iteration = 0; init_options(); MPI_Comm_rank(MPI_COMM_WORLD, & o.rank); @@ -816,10 +810,12 @@ int md_workbench(int argc, char ** argv){ options_all_t * global_options = airoi_create_all_module_options(options); int parsed = option_parse(argc, argv, global_options); o.backend = aiori_select(o.interface); - if (o.backend == NULL) + if (o.backend == NULL){ ERR("Unrecognized I/O API"); - if (! o.backend->enable_mdtest) + } + if (! o.backend->enable_mdtest){ ERR("Backend doesn't support MDWorbench"); + } o.backend_options = airoi_update_module_options(o.backend, global_options); if (!(o.phase_cleanup || o.phase_precreate || o.phase_benchmark)){ @@ -832,7 +828,9 @@ int md_workbench(int argc, char ** argv){ exit(1); } - o.backend->initialize(o.backend_options); + if (o.backend->initialize){ + o.backend->initialize(o.backend_options); + } if(o.backend->xfer_hints){ o.backend->xfer_hints(& o.hints); } @@ -884,10 +882,8 @@ int md_workbench(int argc, char ** argv){ if (o.phase_precreate){ if (o.rank == 0){ - ret = o.backend->prepare_global(); - if ( ret != 0 ){ - printf("Rank 0 could not prepare the run, aborting\n"); - MPI_Abort(MPI_COMM_WORLD, 1); + if (o.backend->mkdir(o.prefix, DIRMODE, o.backend_options) != 0) { + EWARNF("Unable to create test directory %s", o.prefix); } } init_stats(& phase_stats, o.precreate * o.dset_count); @@ -935,9 +931,8 @@ int md_workbench(int argc, char ** argv){ end_phase("cleanup", & phase_stats); if (o.rank == 0){ - ret = o.backend->purge_global(); - if (ret != 0){ - printf("Rank 0: Error purging the global environment\n"); + if (! o.backend->rmdir(o.prefix, o.backend_options)) { + FAIL("unable to remove directory %s", o.prefix); } } }else{ diff --git a/src/md-workbench.h b/src/md-workbench.h index 0be70b1..c556af8 100644 --- a/src/md-workbench.h +++ b/src/md-workbench.h @@ -30,11 +30,9 @@ typedef struct{ // NOTE: if this type is changed, adjust end_phase() !!! double t; // maximum time double * t_all; - op_stat_t dset_name; op_stat_t dset_create; op_stat_t dset_delete; - op_stat_t obj_name; op_stat_t obj_create; op_stat_t obj_read; op_stat_t obj_stat; From d39ae556f0d8c8fc6e85da87762942c74dc2c64c Mon Sep 17 00:00:00 2001 From: "Julian M. Kunkel" Date: Mon, 2 Nov 2020 19:04:27 +0000 Subject: [PATCH 04/15] Bugfixing md-workbench errors. --- src/md-workbench.c | 54 +++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/src/md-workbench.c b/src/md-workbench.c index 5b39c45..a1a916d 100644 --- a/src/md-workbench.c +++ b/src/md-workbench.c @@ -18,7 +18,6 @@ This is the modified version md-workbench-fs that can utilize AIORI. It follows the hierarchical file system semantics in contrast to the md-workbench (without -fs) which has dataset and object semantics. */ -#define FILEMODE S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH #define DIRMODE S_IRUSR|S_IWUSR|S_IXUSR|S_IRGRP|S_IWGRP|S_IXGRP|S_IROTH|S_IXOTH #define CHECK_MPI_RET(ret) if (ret != MPI_SUCCESS){ printf("Unexpected error in MPI on Line %d\n", __LINE__);} @@ -79,8 +78,8 @@ static void def_dset_name(char * out_name, int n, int d){ sprintf(out_name, "%s/%d_%d", o.prefix, n, d); } -static void def_obj_name(char * out_name, char * dset, int n, int d, int i){ - sprintf(out_name, "%s/%d_%d/file-%d", dset, n, d, i); +static void def_obj_name(char * out_name, int n, int d, int i){ + sprintf(out_name, "%s/%d_%d/file-%d", o.prefix, n, d, i); } void init_options(){ @@ -484,14 +483,13 @@ void run_precreate(phase_stat_t * s, int current_index){ // create the obj for(int f=current_index; f < o.precreate; f++){ for(int d=0; d < o.dset_count; d++){ - def_dset_name(dset, o.rank, d); pos++; - def_obj_name(obj_name, dset, o.rank, d, f); + def_obj_name(obj_name, o.rank, d, f); op_timer = GetTimeStamp(); - aiori_fd_t * aiori_fh = o.backend->open(obj_name, IOR_WRONLY | IOR_CREAT, o.backend_options); + aiori_fd_t * aiori_fh = o.backend->create(obj_name, IOR_WRONLY | IOR_CREAT, o.backend_options); if (NULL == aiori_fh){ - FAIL("unable to open file %s", obj_name); + FAIL("Unable to open file %s", obj_name); } if ( o.file_size == (int) o.backend->xfer(WRITE, aiori_fh, (IOR_size_t *) buf, o.file_size, 0, o.backend_options)) { s->obj_create.suc++; @@ -517,7 +515,6 @@ void run_precreate(phase_stat_t * s, int current_index){ /* FIFO: create a new file, write to it. Then read from the first created file, delete it... */ void run_benchmark(phase_stat_t * s, int * current_index_p){ - char dset[MAX_PATHLEN]; char obj_name[MAX_PATHLEN]; int ret; char * buf = malloc(o.file_size); @@ -541,8 +538,7 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ int readRank = (o.rank - o.offset * (d+1)) % o.size; readRank = readRank < 0 ? readRank + o.size : readRank; - def_dset_name(dset, readRank, d); - def_obj_name(obj_name, dset, readRank, d, prevFile); + def_obj_name(obj_name, readRank, d, prevFile); op_timer = GetTimeStamp(); @@ -555,25 +551,25 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ } if (o.verbosity >= 2){ - printf("%d: stat %s:%s (%d)\n", o.rank, dset, obj_name, ret); + printf("%d: stat %s (%d)\n", o.rank, obj_name, ret); } if(ret != 0){ if (o.verbosity) - printf("%d: Error while stating the obj: %s\n", o.rank, dset); + printf("%d: Error while stating the obj: %s\n", o.rank, obj_name); s->obj_stat.err++; continue; } s->obj_stat.suc++; if (o.verbosity >= 2){ - printf("%d: read %s:%s \n", o.rank, dset, obj_name); + printf("%d: read %s \n", o.rank, obj_name); } op_timer = GetTimeStamp(); - aiori_fh = o.backend->open(obj_name, IOR_WRONLY | IOR_CREAT, o.backend_options); + aiori_fh = o.backend->open(obj_name, IOR_RDONLY, o.backend_options); if (NULL == aiori_fh){ - FAIL("unable to open file %s", obj_name); + FAIL("Unable to open file %s", obj_name); } if ( o.file_size == (int) o.backend->xfer(READ, aiori_fh, (IOR_size_t *) buf, o.file_size, 0, o.backend_options)) { s->obj_read.suc++; @@ -601,18 +597,17 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ } if (o.verbosity >= 2){ - printf("%d: delete %s:%s\n", o.rank, dset, obj_name); + printf("%d: delete %s\n", o.rank, obj_name); } s->obj_delete.suc++; int writeRank = (o.rank + o.offset * (d+1)) % o.size; - def_dset_name(dset, writeRank, d); - def_obj_name(obj_name, dset, writeRank, d, o.precreate + prevFile); + def_obj_name(obj_name, writeRank, d, o.precreate + prevFile); op_timer = GetTimeStamp(); - aiori_fh = o.backend->open(obj_name, IOR_WRONLY | IOR_CREAT, o.backend_options); + aiori_fh = o.backend->create(obj_name, IOR_WRONLY | IOR_CREAT, o.backend_options); if (NULL == aiori_fh){ - FAIL("unable to open file %s", obj_name); + FAIL("Unable to open file %s", obj_name); } if ( o.file_size == (int) o.backend->xfer(WRITE, aiori_fh, (IOR_size_t *) buf, o.file_size, 0, o.backend_options)) { s->obj_create.suc++; @@ -632,7 +627,7 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ } if (o.verbosity >= 2){ - printf("%d: write %s:%s (%d)\n", o.rank, dset, obj_name, ret); + printf("%d: write %s (%d)\n", o.rank, obj_name, ret); } } // end loop @@ -688,12 +683,10 @@ void run_cleanup(phase_stat_t * s, int start_index){ size_t pos = -1; // position inside the individual measurement array for(int d=0; d < o.dset_count; d++){ - def_dset_name(dset, o.rank, d); - for(int f=0; f < o.precreate; f++){ double op_time; pos++; - def_obj_name(obj_name, dset, o.rank, d, f + start_index); + def_obj_name(obj_name, o.rank, d, f + start_index); op_timer = GetTimeStamp(); o.backend->delete(obj_name, o.backend_options); @@ -705,10 +698,11 @@ void run_cleanup(phase_stat_t * s, int start_index){ s->obj_delete.suc++; } - if (o.backend->rmdir(dset, o.backend_options)) { + def_dset_name(dset, o.rank, d); + if (o.backend->rmdir(dset, o.backend_options) == 0) { s->dset_delete.suc++; }else{ - printf("unable to remove directory %s", dset); + printf("Unable to remove directory %s\n", dset); } if (o.verbosity >= 2){ printf("%d: delete dset %s\n", o.rank, dset); @@ -931,8 +925,8 @@ int md_workbench(int argc, char ** argv){ end_phase("cleanup", & phase_stats); if (o.rank == 0){ - if (! o.backend->rmdir(o.prefix, o.backend_options)) { - FAIL("unable to remove directory %s", o.prefix); + if (o.backend->rmdir(o.prefix, o.backend_options) != 0) { + printf("Unable to remove directory %s\n", o.prefix); } } }else{ @@ -940,7 +934,9 @@ int md_workbench(int argc, char ** argv){ } double t_all = GetTimeStamp(); - o.backend->finalize(o.backend_options); + if(o.backend->finalize){ + o.backend->finalize(o.backend_options); + } if (o.rank == 0 && ! o.quiet_output){ printf("Total runtime: %.0fs time: ", t_all); printTime(); From dc89a593712604a8ff0d0150bcfe4403e9683bc1 Mon Sep 17 00:00:00 2001 From: "Julian M. Kunkel" Date: Mon, 2 Nov 2020 19:23:13 +0000 Subject: [PATCH 05/15] Workbench, adjusting the interface to IOR tool interfaces. --- src/aiori-POSIX.c | 2 +- src/md-workbench-main.c | 2 +- src/md-workbench.c | 63 +++++++++++++++++++++-------------------- src/md-workbench.h | 4 ++- 4 files changed, 38 insertions(+), 33 deletions(-) diff --git a/src/aiori-POSIX.c b/src/aiori-POSIX.c index b099903..e8933b7 100755 --- a/src/aiori-POSIX.c +++ b/src/aiori-POSIX.c @@ -497,7 +497,7 @@ aiori_fd_t *POSIX_Open(char *testFileName, int flags, aiori_mod_opt_t * param) *fd = open64(testFileName, fd_oflag); if (*fd < 0) - ERRF("open64(\"%s\", %d) failed", testFileName, fd_oflag); + ERRF("open64(\"%s\", %d) failed: %s", testFileName, fd_oflag, strerror(errno)); #ifdef HAVE_LUSTRE_USER if (o->lustre_ignore_locks) { diff --git a/src/md-workbench-main.c b/src/md-workbench-main.c index bdd12f2..0165824 100644 --- a/src/md-workbench-main.c +++ b/src/md-workbench-main.c @@ -4,7 +4,7 @@ int main(int argc, char ** argv){ MPI_Init(& argc, & argv); - int ret = md_workbench(argc, argv); + int ret = md_workbench_run(argc, argv, MPI_COMM_WORLD, stdout); MPI_Finalize(); return ret; } diff --git a/src/md-workbench.c b/src/md-workbench.c index a1a916d..d0b37e3 100644 --- a/src/md-workbench.c +++ b/src/md-workbench.c @@ -28,6 +28,8 @@ struct benchmark_options{ ior_aiori_t const * backend; void * backend_options; aiori_xfer_hint_t hints; + MPI_Comm com; + FILE * logfile; char * interface; int num; @@ -36,6 +38,7 @@ struct benchmark_options{ int offset; int iterations; + int global_iteration; int file_size; int read_only; int stonewall_timer; @@ -70,8 +73,6 @@ struct benchmark_options{ uint64_t start_item_number; }; -static int global_iteration; - struct benchmark_options o; static void def_dset_name(char * out_name, int n, int d){ @@ -293,13 +294,13 @@ static uint64_t aggregate_timers(int repeats, int max_repeats, time_result_t * t count += repeats; for(int i=1; i < o.size; i++){ int cnt; - ret = MPI_Recv(& global_times[count], max_repeats*2, MPI_FLOAT, i, 888, MPI_COMM_WORLD, & status); + ret = MPI_Recv(& global_times[count], max_repeats*2, MPI_FLOAT, i, 888, o.com, & status); CHECK_MPI_RET(ret) MPI_Get_count(& status, MPI_FLOAT, & cnt); count += cnt / 2; } }else{ - ret = MPI_Send(times, repeats * 2, MPI_FLOAT, 0, 888, MPI_COMM_WORLD); + ret = MPI_Send(times, repeats * 2, MPI_FLOAT, 0, 888, o.com); CHECK_MPI_RET(ret) } @@ -309,7 +310,7 @@ static uint64_t aggregate_timers(int repeats, int max_repeats, time_result_t * t static void compute_histogram(const char * name, time_result_t * times, time_statistics_t * stats, size_t repeats, int writeLatencyFile){ if(writeLatencyFile && o.latency_file_prefix ){ char file[1024]; - sprintf(file, "%s-%.2f-%d-%s.csv", o.latency_file_prefix, o.relative_waiting_factor, global_iteration, name); + sprintf(file, "%s-%.2f-%d-%s.csv", o.latency_file_prefix, o.relative_waiting_factor, o.global_iteration, name); FILE * f = fopen(file, "w+"); if(f == NULL){ printf("%d: Error writing to latency file: %s\n", o.rank, file); @@ -341,7 +342,7 @@ static void end_phase(const char * name, phase_stat_t * p){ char buff[MAX_PATHLEN]; //char * limit_memory_P = NULL; - MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(o.com); int max_repeats = o.precreate * o.dset_count; if(strcmp(name,"benchmark") == 0){ @@ -352,19 +353,19 @@ static void end_phase(const char * name, phase_stat_t * p){ phase_stat_t g_stat; init_stats(& g_stat, (o.rank == 0 ? 1 : 0) * ((size_t) max_repeats) * o.size); // reduce timers - ret = MPI_Reduce(& p->t, & g_stat.t, 2, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + ret = MPI_Reduce(& p->t, & g_stat.t, 2, MPI_DOUBLE, MPI_MAX, 0, o.com); CHECK_MPI_RET(ret) if(o.rank == 0) { g_stat.t_all = (double*) malloc(sizeof(double) * o.size); } - ret = MPI_Gather(& p->t, 1, MPI_DOUBLE, g_stat.t_all, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + ret = MPI_Gather(& p->t, 1, MPI_DOUBLE, g_stat.t_all, 1, MPI_DOUBLE, 0, o.com); CHECK_MPI_RET(ret) - ret = MPI_Reduce(& p->dset_create, & g_stat.dset_create, 2*(2+4), MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); + ret = MPI_Reduce(& p->dset_create, & g_stat.dset_create, 2*(2+4), MPI_INT, MPI_SUM, 0, o.com); CHECK_MPI_RET(ret) - ret = MPI_Reduce(& p->max_op_time, & g_stat.max_op_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + ret = MPI_Reduce(& p->max_op_time, & g_stat.max_op_time, 1, MPI_DOUBLE, MPI_MAX, 0, o.com); CHECK_MPI_RET(ret) if( p->stonewall_iterations ){ - ret = MPI_Reduce(& p->repeats, & g_stat.repeats, 1, MPI_UINT64_T, MPI_MIN, 0, MPI_COMM_WORLD); + ret = MPI_Reduce(& p->repeats, & g_stat.repeats, 1, MPI_UINT64_T, MPI_MIN, 0, o.com); CHECK_MPI_RET(ret) g_stat.stonewall_iterations = p->stonewall_iterations; } @@ -421,12 +422,12 @@ static void end_phase(const char * name, phase_stat_t * p){ print_p_stat(buff, name, p, p->t, 0); printf("0: %s\n", buff); for(int i=1; i < o.size; i++){ - MPI_Recv(buff, MAX_PATHLEN, MPI_CHAR, i, 4711, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Recv(buff, MAX_PATHLEN, MPI_CHAR, i, 4711, o.com, MPI_STATUS_IGNORE); printf("%d: %s\n", i, buff); } }else{ print_p_stat(buff, name, p, p->t, 0); - MPI_Send(buff, MAX_PATHLEN, MPI_CHAR, 0, 4711, MPI_COMM_WORLD); + MPI_Send(buff, MAX_PATHLEN, MPI_CHAR, 0, 4711, o.com); } } @@ -469,7 +470,7 @@ void run_precreate(phase_stat_t * s, int current_index){ s->dset_create.err++; if (! o.ignore_precreate_errors){ printf("%d: Error while creating the dset: %s\n", o.rank, dset); - MPI_Abort(MPI_COMM_WORLD, 1); + MPI_Abort(o.com, 1); } } } @@ -498,7 +499,7 @@ void run_precreate(phase_stat_t * s, int current_index){ if (! o.ignore_precreate_errors){ printf("%d: Error while creating the obj: %s\n", o.rank, obj_name); fflush(stdout); - MPI_Abort(MPI_COMM_WORLD, 1); + MPI_Abort(o.com, 1); } } o.backend->close(aiori_fh, o.backend_options); @@ -577,7 +578,7 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ s->obj_read.err++; printf("%d: Error while reading the obj: %s\n", o.rank, obj_name); fflush(stdout); - MPI_Abort(MPI_COMM_WORLD, 1); + MPI_Abort(o.com, 1); } o.backend->close(aiori_fh, o.backend_options); @@ -616,7 +617,7 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ if (! o.ignore_precreate_errors){ printf("%d: Error while creating the obj: %s\n", o.rank, obj_name); fflush(stdout); - MPI_Abort(MPI_COMM_WORLD, 1); + MPI_Abort(o.com, 1); } } o.backend->close(aiori_fh, o.backend_options); @@ -643,7 +644,7 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ // wear out mode, now reduce the maximum int cur_pos = f + 1; phase_allreduce_time = GetTimeStamp() - s->phase_start_timer; - int ret = MPI_Allreduce(& cur_pos, & total_num, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); + int ret = MPI_Allreduce(& cur_pos, & total_num, 1, MPI_INT, MPI_MAX, o.com); CHECK_MPI_RET(ret) s->phase_start_timer = GetTimeStamp(); s->stonewall_iterations = total_num; @@ -658,14 +659,14 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ s->t = GetTimeStamp() - s->phase_start_timer + phase_allreduce_time; if(armed_stone_wall && o.stonewall_timer_wear_out){ int f = total_num; - int ret = MPI_Allreduce(& f, & total_num, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); + int ret = MPI_Allreduce(& f, & total_num, 1, MPI_INT, MPI_MAX, o.com); CHECK_MPI_RET(ret) s->stonewall_iterations = total_num; } if(o.stonewall_timer && ! o.stonewall_timer_wear_out){ // TODO FIXME int sh = s->stonewall_iterations; - int ret = MPI_Allreduce(& sh, & s->stonewall_iterations, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); + int ret = MPI_Allreduce(& sh, & s->stonewall_iterations, 1, MPI_INT, MPI_MAX, o.com); CHECK_MPI_RET(ret) } @@ -764,7 +765,7 @@ static int return_position(){ } fclose(f); } - ret = MPI_Bcast( & position, 1, MPI_INT, 0, MPI_COMM_WORLD ); + ret = MPI_Bcast( & position, 1, MPI_INT, 0, o.com ); return position; } @@ -781,16 +782,18 @@ static void store_position(int position){ fclose(f); } -int md_workbench(int argc, char ** argv){ +int md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE * out_logfile){ int ret; int printhelp = 0; char * limit_memory_P = NULL; - global_iteration = 0; init_options(); - MPI_Comm_rank(MPI_COMM_WORLD, & o.rank); - MPI_Comm_size(MPI_COMM_WORLD, & o.size); + o.com = world_com; + o.logfile = out_logfile; + + MPI_Comm_rank(o.com, & o.rank); + MPI_Comm_size(o.com, & o.size); if (o.rank == 0 && ! o.quiet_output){ printf("Args: %s", argv[0]); @@ -863,7 +866,7 @@ int md_workbench(int argc, char ** argv){ //ret = mem_preallocate(& limit_memory_P, o.limit_memory, o.verbosity >= 3); //if(ret != 0){ // printf("%d: Error allocating memory\n", o.rank); - // MPI_Abort(MPI_COMM_WORLD, 1); + // MPI_Abort(o.com, 1); //} double bench_start; @@ -881,7 +884,7 @@ int md_workbench(int argc, char ** argv){ } } init_stats(& phase_stats, o.precreate * o.dset_count); - MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(o.com); // pre-creation phase phase_stats.phase_start_timer = GetTimeStamp(); @@ -892,12 +895,12 @@ int md_workbench(int argc, char ** argv){ if (o.phase_benchmark){ // benchmark phase - for(global_iteration = 0; global_iteration < o.iterations; global_iteration++){ + for(o.global_iteration = 0; o.global_iteration < o.iterations; o.global_iteration++){ if(o.adaptive_waiting_mode){ o.relative_waiting_factor = 0; } init_stats(& phase_stats, o.num * o.dset_count); - MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(o.com); phase_stats.phase_start_timer = GetTimeStamp(); run_benchmark(& phase_stats, & current_index); end_phase("benchmark", & phase_stats); @@ -906,7 +909,7 @@ int md_workbench(int argc, char ** argv){ o.relative_waiting_factor = 0.0625; for(int r=0; r <= 6; r++){ init_stats(& phase_stats, o.num * o.dset_count); - MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(o.com); phase_stats.phase_start_timer = GetTimeStamp(); run_benchmark(& phase_stats, & current_index); end_phase("benchmark", & phase_stats); diff --git a/src/md-workbench.h b/src/md-workbench.h index c556af8..2bfbddc 100644 --- a/src/md-workbench.h +++ b/src/md-workbench.h @@ -2,6 +2,8 @@ #define IOR_MD_WORKBENCH_H #include +#include +#include // successfull, errors typedef struct { @@ -56,6 +58,6 @@ typedef struct{ // NOTE: if this type is changed, adjust end_phase() !!! int stonewall_iterations; } phase_stat_t; -int md_workbench(int argc, char ** argv); +int md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE * out_logfile); #endif From 654b797788d923a939578c10d5565badf11e5eff Mon Sep 17 00:00:00 2001 From: "Julian M. Kunkel" Date: Mon, 2 Nov 2020 19:34:15 +0000 Subject: [PATCH 06/15] Converted output to IOR output. --- src/md-workbench.c | 77 ++++++++++++++++++++++------------------------ 1 file changed, 36 insertions(+), 41 deletions(-) diff --git a/src/md-workbench.c b/src/md-workbench.c index d0b37e3..2e0c47b 100644 --- a/src/md-workbench.c +++ b/src/md-workbench.c @@ -24,6 +24,8 @@ It follows the hierarchical file system semantics in contrast to the md-workbenc #define LLU (long long unsigned) #define min(a,b) (a < b ? a : b) +#define oprintf(...) do { fprintf(o.logfile, __VA_ARGS__); fflush(o.logfile); } while(0); + struct benchmark_options{ ior_aiori_t const * backend; void * backend_options; @@ -309,11 +311,11 @@ static uint64_t aggregate_timers(int repeats, int max_repeats, time_result_t * t static void compute_histogram(const char * name, time_result_t * times, time_statistics_t * stats, size_t repeats, int writeLatencyFile){ if(writeLatencyFile && o.latency_file_prefix ){ - char file[1024]; + char file[MAX_PATHLEN]; sprintf(file, "%s-%.2f-%d-%s.csv", o.latency_file_prefix, o.relative_waiting_factor, o.global_iteration, name); FILE * f = fopen(file, "w+"); if(f == NULL){ - printf("%d: Error writing to latency file: %s\n", o.rank, file); + ERRF("%d: Error writing to latency file: %s\n", o.rank, file); return; } fprintf(f, "time,runtime\n"); @@ -414,16 +416,16 @@ static void end_phase(const char * name, phase_stat_t * p){ if (o.rank == 0){ //print the stats: print_p_stat(buff, name, & g_stat, g_stat.t, 1); - printf("%s\n", buff); + oprintf("%s\n", buff); } if(o.process_report){ if(o.rank == 0){ print_p_stat(buff, name, p, p->t, 0); - printf("0: %s\n", buff); + oprintf("0: %s\n", buff); for(int i=1; i < o.size; i++){ MPI_Recv(buff, MAX_PATHLEN, MPI_CHAR, i, 4711, o.com, MPI_STATUS_IGNORE); - printf("%d: %s\n", i, buff); + oprintf("%d: %s\n", i, buff); } }else{ print_p_stat(buff, name, p, p->t, 0); @@ -469,8 +471,7 @@ void run_precreate(phase_stat_t * s, int current_index){ }else{ s->dset_create.err++; if (! o.ignore_precreate_errors){ - printf("%d: Error while creating the dset: %s\n", o.rank, dset); - MPI_Abort(o.com, 1); + ERRF("%d: Error while creating the dset: %s\n", o.rank, dset); } } } @@ -497,9 +498,7 @@ void run_precreate(phase_stat_t * s, int current_index){ }else{ s->obj_create.err++; if (! o.ignore_precreate_errors){ - printf("%d: Error while creating the obj: %s\n", o.rank, obj_name); - fflush(stdout); - MPI_Abort(o.com, 1); + ERRF("%d: Error while creating the obj: %s\n", o.rank, obj_name); } } o.backend->close(aiori_fh, o.backend_options); @@ -507,7 +506,7 @@ void run_precreate(phase_stat_t * s, int current_index){ add_timed_result(op_timer, s->phase_start_timer, s->time_create, pos, & s->max_op_time, & op_time); if (o.verbosity >= 2){ - printf("%d: write %s:%s (%d)\n", o.rank, dset, obj_name, ret); + oprintf("%d: write %s:%s (%d)\n", o.rank, dset, obj_name, ret); } } } @@ -552,19 +551,19 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ } if (o.verbosity >= 2){ - printf("%d: stat %s (%d)\n", o.rank, obj_name, ret); + oprintf("%d: stat %s (%d)\n", o.rank, obj_name, ret); } if(ret != 0){ if (o.verbosity) - printf("%d: Error while stating the obj: %s\n", o.rank, obj_name); + ERRF("%d: Error while stating the obj: %s\n", o.rank, obj_name); s->obj_stat.err++; continue; } s->obj_stat.suc++; if (o.verbosity >= 2){ - printf("%d: read %s \n", o.rank, obj_name); + oprintf("%d: read %s \n", o.rank, obj_name); } op_timer = GetTimeStamp(); @@ -576,9 +575,7 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ s->obj_read.suc++; }else{ s->obj_read.err++; - printf("%d: Error while reading the obj: %s\n", o.rank, obj_name); - fflush(stdout); - MPI_Abort(o.com, 1); + ERRF("%d: Error while reading the obj: %s\n", o.rank, obj_name); } o.backend->close(aiori_fh, o.backend_options); @@ -598,7 +595,7 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ } if (o.verbosity >= 2){ - printf("%d: delete %s\n", o.rank, obj_name); + oprintf("%d: delete %s\n", o.rank, obj_name); } s->obj_delete.suc++; @@ -615,9 +612,7 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ }else{ s->obj_create.err++; if (! o.ignore_precreate_errors){ - printf("%d: Error while creating the obj: %s\n", o.rank, obj_name); - fflush(stdout); - MPI_Abort(o.com, 1); + ERRF("%d: Error while creating the obj: %s\n", o.rank, obj_name); } } o.backend->close(aiori_fh, o.backend_options); @@ -628,13 +623,13 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ } if (o.verbosity >= 2){ - printf("%d: write %s (%d)\n", o.rank, obj_name, ret); + oprintf("%d: write %s (%d)\n", o.rank, obj_name, ret); } } // end loop if(armed_stone_wall && bench_runtime >= o.stonewall_timer){ if(o.verbosity){ - printf("%d: stonewall runtime %fs (%ds)\n", o.rank, bench_runtime, o.stonewall_timer); + oprintf("%d: stonewall runtime %fs (%ds)\n", o.rank, bench_runtime, o.stonewall_timer); } if(! o.stonewall_timer_wear_out){ s->stonewall_iterations = f; @@ -649,7 +644,7 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ s->phase_start_timer = GetTimeStamp(); s->stonewall_iterations = total_num; if(o.rank == 0){ - printf("stonewall wear out %fs (%d iter)\n", bench_runtime, total_num); + oprintf("stonewall wear out %fs (%d iter)\n", bench_runtime, total_num); } if(f == total_num){ break; @@ -694,7 +689,7 @@ void run_cleanup(phase_stat_t * s, int start_index){ add_timed_result(op_timer, s->phase_start_timer, s->time_delete, pos, & s->max_op_time, & op_time); if (o.verbosity >= 2){ - printf("%d: delete %s\n", o.rank, obj_name); + oprintf("%d: delete %s\n", o.rank, obj_name); } s->obj_delete.suc++; } @@ -703,10 +698,10 @@ void run_cleanup(phase_stat_t * s, int start_index){ if (o.backend->rmdir(dset, o.backend_options) == 0) { s->dset_delete.suc++; }else{ - printf("Unable to remove directory %s\n", dset); + oprintf("Unable to remove directory %s\n", dset); } if (o.verbosity >= 2){ - printf("%d: delete dset %s\n", o.rank, dset); + oprintf("%d: delete dset %s\n", o.rank, dset); } } } @@ -747,7 +742,7 @@ static void printTime(){ char buff[100]; time_t now = time(0); strftime (buff, 100, "%Y-%m-%d %H:%M:%S", localtime (&now)); - printf("%s\n", buff); + oprintf("%s\n", buff); } static int return_position(){ @@ -755,12 +750,12 @@ static int return_position(){ if( o.rank == 0){ FILE * f = fopen(o.run_info_file, "r"); if(! f){ - printf("[ERROR] Could not open %s for restart\n", o.run_info_file); + ERRF("[ERROR] Could not open %s for restart\n", o.run_info_file); exit(1); } ret = fscanf(f, "pos: %d", & position); if (ret != 1){ - printf("Could not read from %s for restart\n", o.run_info_file); + ERRF("Could not read from %s for restart\n", o.run_info_file); exit(1); } fclose(f); @@ -775,7 +770,7 @@ static void store_position(int position){ } FILE * f = fopen(o.run_info_file, "w"); if(! f){ - printf("[ERROR] Could not open %s for saving data\n", o.run_info_file); + ERRF("[ERROR] Could not open %s for saving data\n", o.run_info_file); exit(1); } fprintf(f, "pos: %d\n", position); @@ -796,11 +791,11 @@ int md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE * out_logf MPI_Comm_size(o.com, & o.size); if (o.rank == 0 && ! o.quiet_output){ - printf("Args: %s", argv[0]); + oprintf("Args: %s", argv[0]); for(int i=1; i < argc; i++){ - printf(" \"%s\"", argv[i]); + oprintf(" \"%s\"", argv[i]); } - printf("\n"); + oprintf("\n"); } memset(& o.hints, 0, sizeof(o.hints)); @@ -821,7 +816,7 @@ int md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE * out_logf } if (! o.phase_precreate && o.phase_benchmark && o.stonewall_timer && ! o.stonewall_timer_wear_out){ if(o.rank == 0) - printf("Invalid options, if running only the benchmark phase using -2 with stonewall option then use stonewall wear-out\n"); + ERR("Invalid options, if running only the benchmark phase using -2 with stonewall option then use stonewall wear-out"); exit(1); } @@ -843,23 +838,23 @@ int md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE * out_logf } if(o.start_item_number){ - printf("Using start position %lld\n", (long long) o.start_item_number); + oprintf("Using start position %lld\n", (long long) o.start_item_number); current_index = o.start_item_number; } size_t total_obj_count = o.dset_count * (size_t) (o.num * o.iterations + o.precreate) * o.size; if (o.rank == 0 && ! o.quiet_output){ - printf("MD-Workbench total objects: %zu workingset size: %.3f MiB (version: %s) time: ", total_obj_count, ((double) o.size) * o.dset_count * o.precreate * o.file_size / 1024.0 / 1024.0, PACKAGE_VERSION); + oprintf("MD-Workbench total objects: %zu workingset size: %.3f MiB (version: %s) time: ", total_obj_count, ((double) o.size) * o.dset_count * o.precreate * o.file_size / 1024.0 / 1024.0, PACKAGE_VERSION); printTime(); if(o.num > o.precreate){ - printf("WARNING: num > precreate, this may cause the situation that no objects are available to read\n"); + oprintf("WARNING: num > precreate, this may cause the situation that no objects are available to read\n"); } } if ( o.rank == 0 && ! o.quiet_output ){ // print the set output options option_print_current(options); - printf("\n"); + oprintf("\n"); } // preallocate memory if necessary @@ -929,7 +924,7 @@ int md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE * out_logf if (o.rank == 0){ if (o.backend->rmdir(o.prefix, o.backend_options) != 0) { - printf("Unable to remove directory %s\n", o.prefix); + oprintf("Unable to remove directory %s\n", o.prefix); } } }else{ @@ -941,7 +936,7 @@ int md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE * out_logf o.backend->finalize(o.backend_options); } if (o.rank == 0 && ! o.quiet_output){ - printf("Total runtime: %.0fs time: ", t_all); + oprintf("Total runtime: %.0fs time: ", t_all); printTime(); } From 82d20f27445a1a5b15946a0951b26584776d1cdc Mon Sep 17 00:00:00 2001 From: "Julian M. Kunkel" Date: Mon, 2 Nov 2020 20:12:15 +0000 Subject: [PATCH 07/15] Basic API converted. --- src/aiori-POSIX.c | 2 +- src/md-workbench-main.c | 7 +++++-- src/md-workbench.c | 22 ++++++++++++++-------- src/md-workbench.h | 5 +++-- 4 files changed, 23 insertions(+), 13 deletions(-) diff --git a/src/aiori-POSIX.c b/src/aiori-POSIX.c index e8933b7..2f5bcd7 100755 --- a/src/aiori-POSIX.c +++ b/src/aiori-POSIX.c @@ -100,7 +100,7 @@ option_help * POSIX_options(aiori_mod_opt_t ** init_backend_options, aiori_mod_o #endif #ifdef HAVE_LUSTRE_USER - {0, "posix.lustre.stripecount", "", OPTION_OPTIONAL_ARGUMENT, 'd', & o->lustre_stripe_count}, + {0, "posix.lustre.stribeegfs_chunkSizepecount", "", OPTION_OPTIONAL_ARGUMENT, 'd', & o->lustre_stripe_count}, {0, "posix.lustre.stripesize", "", OPTION_OPTIONAL_ARGUMENT, 'd', & o->lustre_stripe_size}, {0, "posix.lustre.startost", "", OPTION_OPTIONAL_ARGUMENT, 'd', & o->lustre_start_ost}, {0, "posix.lustre.ignorelocks", "", OPTION_FLAG, 'd', & o->lustre_ignore_locks}, diff --git a/src/md-workbench-main.c b/src/md-workbench-main.c index 0165824..bb94126 100644 --- a/src/md-workbench-main.c +++ b/src/md-workbench-main.c @@ -4,7 +4,10 @@ int main(int argc, char ** argv){ MPI_Init(& argc, & argv); - int ret = md_workbench_run(argc, argv, MPI_COMM_WORLD, stdout); + //phase_stat_t* results = + md_workbench_run(argc, argv, MPI_COMM_WORLD, stdout); + // API check, access the results of the first phase which is precrate. + //printf("Max op runtime: %f\n", results->max_op_time); MPI_Finalize(); - return ret; + return 0; } diff --git a/src/md-workbench.c b/src/md-workbench.c index 2e0c47b..b1c4dc8 100644 --- a/src/md-workbench.c +++ b/src/md-workbench.c @@ -38,6 +38,8 @@ struct benchmark_options{ int precreate; int dset_count; + int result_position; // in the global structure + int offset; int iterations; int global_iteration; @@ -339,7 +341,7 @@ static void compute_histogram(const char * name, time_result_t * times, time_sta stats->max = times[repeats - 1].runtime; } -static void end_phase(const char * name, phase_stat_t * p){ +static void end_phase(const char * name, phase_stat_t * p, phase_stat_t * result){ int ret; char buff[MAX_PATHLEN]; @@ -449,6 +451,10 @@ static void end_phase(const char * name, phase_stat_t * p){ free(g_stat.time_delete); } + // copy the result back for the API + memcpy(& result[o.result_position], & g_stat, sizeof(g_stat)); + o.result_position++; + // allocate memory if necessary // ret = mem_preallocate(& limit_memory_P, o.limit_memory_between_phases, o.verbosity >= 3); // if( ret != 0){ @@ -777,7 +783,7 @@ static void store_position(int position){ fclose(f); } -int md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE * out_logfile){ +phase_stat_t* md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE * out_logfile){ int ret; int printhelp = 0; char * limit_memory_P = NULL; @@ -867,6 +873,7 @@ int md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE * out_logf double bench_start; bench_start = GetTimeStamp(); phase_stat_t phase_stats; + phase_stat_t* all_phases_stats = malloc(sizeof(phase_stat_t) * (2 + o.iterations)); if(o.rank == 0 && o.print_detailed_stats && ! o.quiet_output){ print_detailed_stat_header(); @@ -885,7 +892,7 @@ int md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE * out_logf phase_stats.phase_start_timer = GetTimeStamp(); run_precreate(& phase_stats, current_index); phase_stats.t = GetTimeStamp() - phase_stats.phase_start_timer; - end_phase("precreate", & phase_stats); + end_phase("precreate", & phase_stats, all_phases_stats); } if (o.phase_benchmark){ @@ -898,7 +905,7 @@ int md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE * out_logf MPI_Barrier(o.com); phase_stats.phase_start_timer = GetTimeStamp(); run_benchmark(& phase_stats, & current_index); - end_phase("benchmark", & phase_stats); + end_phase("benchmark", & phase_stats, all_phases_stats); if(o.adaptive_waiting_mode){ o.relative_waiting_factor = 0.0625; @@ -907,7 +914,7 @@ int md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE * out_logf MPI_Barrier(o.com); phase_stats.phase_start_timer = GetTimeStamp(); run_benchmark(& phase_stats, & current_index); - end_phase("benchmark", & phase_stats); + end_phase("benchmark", & phase_stats, all_phases_stats); o.relative_waiting_factor *= 2; } } @@ -920,7 +927,7 @@ int md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE * out_logf phase_stats.phase_start_timer = GetTimeStamp(); run_cleanup(& phase_stats, current_index); phase_stats.t = GetTimeStamp() - phase_stats.phase_start_timer; - end_phase("cleanup", & phase_stats); + end_phase("cleanup", & phase_stats, all_phases_stats); if (o.rank == 0){ if (o.backend->rmdir(o.prefix, o.backend_options) != 0) { @@ -939,7 +946,6 @@ int md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE * out_logf oprintf("Total runtime: %.0fs time: ", t_all); printTime(); } - //mem_free_preallocated(& limit_memory_P); - return 0; + return all_phases_stats; } diff --git a/src/md-workbench.h b/src/md-workbench.h index 2bfbddc..e8794f5 100644 --- a/src/md-workbench.h +++ b/src/md-workbench.h @@ -40,7 +40,7 @@ typedef struct{ // NOTE: if this type is changed, adjust end_phase() !!! op_stat_t obj_stat; op_stat_t obj_delete; - // time measurements individual runs + // time measurements of individual runs, these are not returned for now by the API! uint64_t repeats; time_result_t * time_create; time_result_t * time_read; @@ -58,6 +58,7 @@ typedef struct{ // NOTE: if this type is changed, adjust end_phase() !!! int stonewall_iterations; } phase_stat_t; -int md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE * out_logfile); +// @Return The first statistics returned are precreate, then iteration many benchmark runs, the last is cleanup +phase_stat_t* md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE * out_logfile); #endif From e1dd3103cfc1b94088c54900ca8c5791d48f5ffb Mon Sep 17 00:00:00 2001 From: "Julian M. Kunkel" Date: Tue, 3 Nov 2020 10:52:45 +0000 Subject: [PATCH 08/15] Fix make dist for md-workbench. --- src/Makefile.am | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/Makefile.am b/src/Makefile.am index 47ede87..d933aa6 100755 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -15,7 +15,7 @@ extraLDADD = extraLDFLAGS = extraCPPFLAGS = -md_workbench_SOURCES = md-workbench.c md-workbench-main.c +md_workbench_SOURCES = md-workbench-main.c md_workbench_LDFLAGS = md_workbench_LDADD = libaiori.a md_workbench_CPPFLAGS = @@ -132,6 +132,16 @@ mdtest_LDFLAGS += $(extraLDFLAGS) mdtest_LDADD += $(extraLDADD) mdtest_CPPFLAGS += $(extraCPPFLAGS) +md_workbench_SOURCES += $(extraSOURCES) +md_workbench_LDFLAGS += $(extraLDFLAGS) +md_workbench_LDADD += $(extraLDADD) +md_workbench_CPPFLAGS += $(extraCPPFLAGS) + +MD_WORKBENCH_SOURCES = $(md_workbench_SOURCES) +MD_WORKBENCH_LDFLAGS = $(md_workbench_LDFLAGS) +MD_WORKBENCH_LDADD = $(md_workbench_LDADD) +MD_WORKBENCH_CPPFLAGS = $(md_workbench_CPPFLAGS) + IOR_SOURCES = $(ior_SOURCES) IOR_LDFLAGS = $(ior_LDFLAGS) IOR_LDADD = $(ior_LDADD) @@ -145,7 +155,8 @@ MDTEST_CPPFLAGS = $(mdtest_CPPFLAGS) libaiori_a_SOURCES += $(extraSOURCES) libaiori_a_CPPFLAGS = $(extraCPPFLAGS) -# Generate config file with build flags to allow reuse of library +# Generate a config file with the build flags to allow the reuse of library +.PHONY: build.conf all-local: build.conf build.conf: @echo LDFLAGS=$(LDFLAGS) $(extraLDFLAGS) $(extraLDADD) > build.conf From 4e452766b96e748da098fbd9e4339dbc14c655dc Mon Sep 17 00:00:00 2001 From: "Julian M. Kunkel" Date: Wed, 4 Nov 2020 12:48:46 +0000 Subject: [PATCH 09/15] Enable random seed to be stored. --- src/ior.c | 22 +++++++++++----------- src/parse_options.c | 1 + 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/ior.c b/src/ior.c index 010a5ce..dc416e3 100755 --- a/src/ior.c +++ b/src/ior.c @@ -1624,8 +1624,8 @@ static void ValidateTests(IOR_param_t * test) if (test->randomOffset && test->reorderTasks && test->filePerProc == FALSE) ERR("random offset and constant reorder tasks specified with single-shared-file. Choose one and resubmit"); - if (test->randomOffset && test->checkRead) - ERR("random offset not available with read check option (use write check)"); + if (test->randomOffset && test->checkRead && test->randomSeed == -1) + ERR("random offset with read check option requires to set the random seed"); if (test->randomOffset && test->storeFileOffset) ERR("random offset not available with store file offset option)"); if ((strcasecmp(test->api, "HDF5") == 0) && test->randomOffset) @@ -1711,11 +1711,11 @@ IOR_offset_t *GetOffsetArrayRandom(IOR_param_t * test, int pretendRank, int acce IOR_offset_t fileSize; IOR_offset_t *offsetArray; - /* set up seed for random() */ - if (access == WRITE || access == READ) { + /* set up seed, each process can determine which regions to access individually */ + if (test->randomSeed == -1) { test->randomSeed = seed = rand(); } else { - seed = test->randomSeed; + seed = test->randomSeed + pretendRank; } srand(seed); @@ -1725,16 +1725,16 @@ IOR_offset_t *GetOffsetArrayRandom(IOR_param_t * test, int pretendRank, int acce } /* count needed offsets (pass 1) */ - for (i = 0; i < fileSize; i += test->transferSize) { - if (test->filePerProc == FALSE) { + if (test->filePerProc == FALSE) { + for (i = 0; i < fileSize; i += test->transferSize) { // this counts which process get how many transferes in // a shared file if ((rand() % test->numTasks) == pretendRank) { offsets++; } - } else { - offsets++; - } + } + } else { + offsets += fileSize / test->transferSize; } /* setup empty array */ @@ -1751,7 +1751,7 @@ IOR_offset_t *GetOffsetArrayRandom(IOR_param_t * test, int pretendRank, int acce } } else { /* fill with offsets (pass 2) */ - srand(seed); /* need same seed to get same transfers as counted in the beginning*/ + srand(seed); /* need same seedto get same transfers as counted in the beginning*/ for (i = 0; i < fileSize; i += test->transferSize) { if ((rand() % test->numTasks) == pretendRank) { offsetArray[offsetCnt] = i; diff --git a/src/parse_options.c b/src/parse_options.c index c2b5b8c..87e3c91 100755 --- a/src/parse_options.c +++ b/src/parse_options.c @@ -433,6 +433,7 @@ option_help * createGlobalOptions(IOR_param_t * params){ {'y', NULL, "dualMount -- use dual mount points for a filesystem", OPTION_FLAG, 'd', & params->dualMount}, {'Y', NULL, "fsyncPerWrite -- perform sync operation after every write operation", OPTION_FLAG, 'd', & params->fsyncPerWrite}, {'z', NULL, "randomOffset -- access is to random, not sequential, offsets within a file", OPTION_FLAG, 'd', & params->randomOffset}, + {0, "random-offset-seed", "The seed for -z", OPTION_OPTIONAL_ARGUMENT, 'd', & params->randomSeed}, {'Z', NULL, "reorderTasksRandom -- changes task ordering to random ordering for readback", OPTION_FLAG, 'd', & params->reorderTasksRandom}, {0, "warningAsErrors", "Any warning should lead to an error.", OPTION_FLAG, 'd', & params->warningAsErrors}, {.help=" -O summaryFile=FILE -- store result data into this file", .arg = OPTION_OPTIONAL_ARGUMENT}, From b5dfeea82a8e8f7cbaf2b49c60d52f5047cec1e9 Mon Sep 17 00:00:00 2001 From: "Julian M. Kunkel" Date: Wed, 4 Nov 2020 18:04:46 +0000 Subject: [PATCH 10/15] Remove output for the API. --- src/md-workbench.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/md-workbench.c b/src/md-workbench.c index b1c4dc8..2dfc90b 100644 --- a/src/md-workbench.c +++ b/src/md-workbench.c @@ -859,8 +859,8 @@ phase_stat_t* md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE if ( o.rank == 0 && ! o.quiet_output ){ // print the set output options - option_print_current(options); - oprintf("\n"); + // option_print_current(options); + // oprintf("\n"); } // preallocate memory if necessary From d9c74af8f302fd21b5ccb7fde94e57a5f0f647d0 Mon Sep 17 00:00:00 2001 From: "Julian M. Kunkel" Date: Wed, 4 Nov 2020 19:27:39 +0000 Subject: [PATCH 11/15] Fix (accidently rename of option) --- src/aiori-POSIX.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aiori-POSIX.c b/src/aiori-POSIX.c index 2f5bcd7..e8933b7 100755 --- a/src/aiori-POSIX.c +++ b/src/aiori-POSIX.c @@ -100,7 +100,7 @@ option_help * POSIX_options(aiori_mod_opt_t ** init_backend_options, aiori_mod_o #endif #ifdef HAVE_LUSTRE_USER - {0, "posix.lustre.stribeegfs_chunkSizepecount", "", OPTION_OPTIONAL_ARGUMENT, 'd', & o->lustre_stripe_count}, + {0, "posix.lustre.stripecount", "", OPTION_OPTIONAL_ARGUMENT, 'd', & o->lustre_stripe_count}, {0, "posix.lustre.stripesize", "", OPTION_OPTIONAL_ARGUMENT, 'd', & o->lustre_stripe_size}, {0, "posix.lustre.startost", "", OPTION_OPTIONAL_ARGUMENT, 'd', & o->lustre_start_ost}, {0, "posix.lustre.ignorelocks", "", OPTION_FLAG, 'd', & o->lustre_ignore_locks}, From c0ffdf44d0d78cc554561bbddf37c10c56c49494 Mon Sep 17 00:00:00 2001 From: "Julian M. Kunkel" Date: Wed, 4 Nov 2020 20:25:27 +0000 Subject: [PATCH 12/15] Workbench API improved. --- src/md-workbench.c | 95 +++++++++++++++++++++++++++++++++++++++------- src/md-workbench.h | 48 +++++++---------------- 2 files changed, 94 insertions(+), 49 deletions(-) diff --git a/src/md-workbench.c b/src/md-workbench.c index 2dfc90b..d981324 100644 --- a/src/md-workbench.c +++ b/src/md-workbench.c @@ -26,6 +26,50 @@ It follows the hierarchical file system semantics in contrast to the md-workbenc #define oprintf(...) do { fprintf(o.logfile, __VA_ARGS__); fflush(o.logfile); } while(0); +// successfull, errors +typedef struct { + int suc; + int err; +} op_stat_t; + +// A runtime for an operation and when the operation was started +typedef struct{ + float time_since_app_start; + float runtime; +} time_result_t; + + +// statistics for running a single phase +typedef struct{ // NOTE: if this type is changed, adjust end_phase() !!! + double t; // maximum time + double * t_all; + + op_stat_t dset_create; + op_stat_t dset_delete; + + op_stat_t obj_create; + op_stat_t obj_read; + op_stat_t obj_stat; + op_stat_t obj_delete; + + // time measurements of individual runs, these are not returned for now by the API! + uint64_t repeats; + time_result_t * time_create; + time_result_t * time_read; + time_result_t * time_stat; + time_result_t * time_delete; + + time_statistics_t stats_create; + time_statistics_t stats_read; + time_statistics_t stats_stat; + time_statistics_t stats_delete; + + // the maximum time for any single operation + double max_op_time; + double phase_start_timer; + int stonewall_iterations; +} phase_stat_t; + struct benchmark_options{ ior_aiori_t const * backend; void * backend_options; @@ -38,7 +82,7 @@ struct benchmark_options{ int precreate; int dset_count; - int result_position; // in the global structure + mdworkbench_results_t * results; // the results int offset; int iterations; @@ -211,10 +255,13 @@ static void print_p_stat(char * buff, const char * name, phase_stat_t * p, doubl ioops_per_iter = 2; } + double rate; + switch(name[0]){ case('b'): + rate = p->obj_read.suc * ioops_per_iter / t; pos += sprintf(buff + pos, "rate:%.1f iops/s objects:%d rate:%.1f obj/s tp:%.1f MiB/s op-max:%.4es", - p->obj_read.suc * ioops_per_iter / t, // write, stat, read, delete + rate, // write, stat, read, delete p->obj_read.suc, p->obj_read.suc / t, tp, @@ -225,8 +272,9 @@ static void print_p_stat(char * buff, const char * name, phase_stat_t * p, doubl } break; case('p'): + rate = (p->dset_create.suc + p->obj_create.suc) / t; pos += sprintf(buff + pos, "rate:%.1f iops/s dsets: %d objects:%d rate:%.3f dset/s rate:%.1f obj/s tp:%.1f MiB/s op-max:%.4es", - (p->dset_create.suc + p->obj_create.suc) / t, + rate, p->dset_create.suc, p->obj_create.suc, p->dset_create.suc / t, @@ -235,8 +283,9 @@ static void print_p_stat(char * buff, const char * name, phase_stat_t * p, doubl p->max_op_time); break; case('c'): + rate = (p->obj_delete.suc + p->dset_delete.suc) / t; pos += sprintf(buff + pos, "rate:%.1f iops/s objects:%d dsets: %d rate:%.1f obj/s rate:%.3f dset/s op-max:%.4es", - (p->obj_delete.suc + p->dset_delete.suc) / t, + rate, p->obj_delete.suc, p->dset_delete.suc, p->obj_delete.suc / t, @@ -248,6 +297,16 @@ static void print_p_stat(char * buff, const char * name, phase_stat_t * p, doubl break; } + if(print_global){ + mdworkbench_result_t * res = & o.results->result[o.results->count]; + res->errors = errs; + o.results->errors += errs; + res->rate = rate; + res->max_op_time = p->max_op_time; + res->runtime = t; + res->iterations_done = p->repeats; + } + if(! o.quiet_output || errs > 0){ pos += sprintf(buff + pos, " (%d errs", errs); if(errs > 0){ @@ -341,7 +400,7 @@ static void compute_histogram(const char * name, time_result_t * times, time_sta stats->max = times[repeats - 1].runtime; } -static void end_phase(const char * name, phase_stat_t * p, phase_stat_t * result){ +static void end_phase(const char * name, phase_stat_t * p){ int ret; char buff[MAX_PATHLEN]; @@ -452,8 +511,13 @@ static void end_phase(const char * name, phase_stat_t * p, phase_stat_t * result } // copy the result back for the API - memcpy(& result[o.result_position], & g_stat, sizeof(g_stat)); - o.result_position++; + mdworkbench_result_t * res = & o.results->result[o.results->count]; + memcpy(& res->stats_create, & g_stat.stats_create, sizeof(time_statistics_t)); + memcpy(& res->stats_read, & g_stat.stats_read, sizeof(time_statistics_t)); + memcpy(& res->stats_stat, & g_stat.stats_stat, sizeof(time_statistics_t)); + memcpy(& res->stats_delete, & g_stat.stats_delete, sizeof(time_statistics_t)); + + o.results->count++; // allocate memory if necessary // ret = mem_preallocate(& limit_memory_P, o.limit_memory_between_phases, o.verbosity >= 3); @@ -783,7 +847,7 @@ static void store_position(int position){ fclose(f); } -phase_stat_t* md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE * out_logfile){ +mdworkbench_results_t* md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE * out_logfile){ int ret; int printhelp = 0; char * limit_memory_P = NULL; @@ -873,7 +937,10 @@ phase_stat_t* md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE double bench_start; bench_start = GetTimeStamp(); phase_stat_t phase_stats; - phase_stat_t* all_phases_stats = malloc(sizeof(phase_stat_t) * (2 + o.iterations)); + size_t result_count = (2 + o.iterations) * (o.adaptive_waiting_mode ? 7 : 1); + o.results = malloc(sizeof(mdworkbench_results_t) + sizeof(mdworkbench_result_t) * result_count); + memset(o.results, 0, sizeof(mdworkbench_results_t) + sizeof(mdworkbench_result_t) * result_count); + o.results->count = 0; if(o.rank == 0 && o.print_detailed_stats && ! o.quiet_output){ print_detailed_stat_header(); @@ -892,7 +959,7 @@ phase_stat_t* md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE phase_stats.phase_start_timer = GetTimeStamp(); run_precreate(& phase_stats, current_index); phase_stats.t = GetTimeStamp() - phase_stats.phase_start_timer; - end_phase("precreate", & phase_stats, all_phases_stats); + end_phase("precreate", & phase_stats); } if (o.phase_benchmark){ @@ -905,7 +972,7 @@ phase_stat_t* md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE MPI_Barrier(o.com); phase_stats.phase_start_timer = GetTimeStamp(); run_benchmark(& phase_stats, & current_index); - end_phase("benchmark", & phase_stats, all_phases_stats); + end_phase("benchmark", & phase_stats); if(o.adaptive_waiting_mode){ o.relative_waiting_factor = 0.0625; @@ -914,7 +981,7 @@ phase_stat_t* md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE MPI_Barrier(o.com); phase_stats.phase_start_timer = GetTimeStamp(); run_benchmark(& phase_stats, & current_index); - end_phase("benchmark", & phase_stats, all_phases_stats); + end_phase("benchmark", & phase_stats); o.relative_waiting_factor *= 2; } } @@ -927,7 +994,7 @@ phase_stat_t* md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE phase_stats.phase_start_timer = GetTimeStamp(); run_cleanup(& phase_stats, current_index); phase_stats.t = GetTimeStamp() - phase_stats.phase_start_timer; - end_phase("cleanup", & phase_stats, all_phases_stats); + end_phase("cleanup", & phase_stats); if (o.rank == 0){ if (o.backend->rmdir(o.prefix, o.backend_options) != 0) { @@ -947,5 +1014,5 @@ phase_stat_t* md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE printTime(); } //mem_free_preallocated(& limit_memory_P); - return all_phases_stats; + return o.results; } diff --git a/src/md-workbench.h b/src/md-workbench.h index e8794f5..394a43c 100644 --- a/src/md-workbench.h +++ b/src/md-workbench.h @@ -5,18 +5,6 @@ #include #include -// successfull, errors -typedef struct { - int suc; - int err; -} op_stat_t; - -// A runtime for an operation and when the operation was started -typedef struct{ - float time_since_app_start; - float runtime; -} time_result_t; - typedef struct{ float min; float q1; @@ -27,38 +15,28 @@ typedef struct{ float max; } time_statistics_t; + // statistics for running a single phase typedef struct{ // NOTE: if this type is changed, adjust end_phase() !!! - double t; // maximum time - double * t_all; - - op_stat_t dset_create; - op_stat_t dset_delete; - - op_stat_t obj_create; - op_stat_t obj_read; - op_stat_t obj_stat; - op_stat_t obj_delete; - - // time measurements of individual runs, these are not returned for now by the API! - uint64_t repeats; - time_result_t * time_create; - time_result_t * time_read; - time_result_t * time_stat; - time_result_t * time_delete; - time_statistics_t stats_create; time_statistics_t stats_read; time_statistics_t stats_stat; time_statistics_t stats_delete; - // the maximum time for any single operation + int errors; + double rate; double max_op_time; - double phase_start_timer; - int stonewall_iterations; -} phase_stat_t; + double runtime; + uint64_t iterations_done; +} mdworkbench_result_t; + +typedef struct{ + int count; // the number of results + int errors; + mdworkbench_result_t result[]; +} mdworkbench_results_t; // @Return The first statistics returned are precreate, then iteration many benchmark runs, the last is cleanup -phase_stat_t* md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE * out_logfile); +mdworkbench_results_t* md_workbench_run(int argc, char ** argv, MPI_Comm world_com, FILE * out_logfile); #endif From 75c08386a44c625d6d568d2f912ff5986f08b263 Mon Sep 17 00:00:00 2001 From: "Julian M. Kunkel" Date: Wed, 4 Nov 2020 20:47:30 +0000 Subject: [PATCH 13/15] Bugfix porting error for performance stats. --- src/md-workbench.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/md-workbench.c b/src/md-workbench.c index d981324..ed9ec4e 100644 --- a/src/md-workbench.c +++ b/src/md-workbench.c @@ -175,7 +175,7 @@ static void init_stats(phase_stat_t * p, size_t repeats){ static float add_timed_result(double start, double phase_start_timer, time_result_t * results, size_t pos, double * max_time, double * out_op_time){ float curtime = start - phase_start_timer; - double op_time = GetTimeStamp(); + double op_time = GetTimeStamp() - start; results[pos].runtime = (float) op_time; results[pos].time_since_app_start = curtime; if (op_time > *max_time){ @@ -248,7 +248,7 @@ static void print_p_stat(char * buff, const char * name, phase_stat_t * p, doubl // single line pos += sprintf(buff, "%s process max:%.2fs ", name, t); if(print_global){ - pos += sprintf(buff + pos, "min:%.1fs mean: %.1fs balance:%.1f stddev:%.1f ", r_min, r_mean, r_min/r_max * 100.0, r_std); + pos += sprintf(buff + pos, "min:%.2fs mean: %.2fs balance:%.1f stddev:%.1f ", r_min, r_mean, r_min/r_max * 100.0, r_std); } int ioops_per_iter = 4; if(o.read_only){ From 2d79efc0c5fd0cbbb0f4a5dff62a9b41a214068a Mon Sep 17 00:00:00 2001 From: "Julian M. Kunkel" Date: Thu, 5 Nov 2020 19:13:08 +0000 Subject: [PATCH 14/15] Fix wait issue on MacOS --- src/md-workbench.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/md-workbench.c b/src/md-workbench.c index ed9ec4e..34dfa01 100644 --- a/src/md-workbench.c +++ b/src/md-workbench.c @@ -144,7 +144,7 @@ void init_options(){ o.run_info_file = "md-workbench.status"; } -static void wait(double runtime){ +static void mdw_wait(double runtime){ double waittime = runtime * o.relative_waiting_factor; //printf("waittime: %e\n", waittime); if(waittime < 0.01){ @@ -617,7 +617,7 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ bench_runtime = add_timed_result(op_timer, s->phase_start_timer, s->time_stat, pos, & s->max_op_time, & op_time); if(o.relative_waiting_factor > 1e-9) { - wait(op_time); + mdw_wait(op_time); } if (o.verbosity >= 2){ @@ -651,7 +651,7 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ bench_runtime = add_timed_result(op_timer, s->phase_start_timer, s->time_read, pos, & s->max_op_time, & op_time); if(o.relative_waiting_factor > 1e-9) { - wait(op_time); + mdw_wait(op_time); } if(o.read_only){ continue; @@ -661,7 +661,7 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ o.backend->delete(obj_name, o.backend_options); bench_runtime = add_timed_result(op_timer, s->phase_start_timer, s->time_delete, pos, & s->max_op_time, & op_time); if(o.relative_waiting_factor > 1e-9) { - wait(op_time); + mdw_wait(op_time); } if (o.verbosity >= 2){ @@ -689,7 +689,7 @@ void run_benchmark(phase_stat_t * s, int * current_index_p){ bench_runtime = add_timed_result(op_timer, s->phase_start_timer, s->time_create, pos, & s->max_op_time, & op_time); if(o.relative_waiting_factor > 1e-9) { - wait(op_time); + mdw_wait(op_time); } if (o.verbosity >= 2){ From ad985af76363dabbbc66837fd2aaa00182378149 Mon Sep 17 00:00:00 2001 From: "Julian M. Kunkel" Date: Fri, 6 Nov 2020 09:30:59 +0000 Subject: [PATCH 15/15] Location problem on non Linux systems. --- src/utilities.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/utilities.h b/src/utilities.h index 020f27b..56cc8db 100755 --- a/src/utilities.h +++ b/src/utilities.h @@ -30,13 +30,7 @@ extern enum OutputFormat_t outputFormat; /* format of the output */ * Try using the system's PATH_MAX, which is what realpath and such use. */ #define MAX_PATHLEN PATH_MAX - - -#ifdef __linux__ #define ERROR_LOCATION __func__ -#else -#define ERROR_LOCATION __LINE__ -#endif void* safeMalloc(uint64_t size);