diff --git a/mon/mon.js b/mon/mon.js index 66f8b6cd..564fdbac 100644 --- a/mon/mon.js +++ b/mon/mon.js @@ -110,7 +110,15 @@ const etcd_tree = { autosync_interval: 5, autosync_writes: 128, client_queue_depth: 128, // unused - recovery_queue_depth: 4, + recovery_queue_depth: 1, + recovery_sleep_us: 0, + recovery_tune_min_util: 0.1, + recovery_tune_min_client_util: 0, + recovery_tune_max_util: 1.0, + recovery_tune_max_client_util: 0.5, + recovery_tune_interval: 1, + recovery_tune_ewma_rate: 0.5, + recovery_tune_sleep_min_us: 10, // 10 microseconds recovery_pg_switch: 128, recovery_sync_batch: 16, no_recovery: false, diff --git a/src/osd.cpp b/src/osd.cpp index 6513e417..943da8ba 100644 --- a/src/osd.cpp +++ b/src/osd.cpp @@ -68,14 +68,21 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop) } } - print_stats_timer_id = this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id) + if (print_stats_timer_id == -1) { - print_stats(); - }); - slow_log_timer_id = this->tfd->set_timer(slow_log_interval*1000, true, [this](int timer_id) + print_stats_timer_id = this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id) + { + print_stats(); + }); + } + if (slow_log_timer_id == -1) { - print_slow(); - }); + slow_log_timer_id = this->tfd->set_timer(slow_log_interval*1000, true, [this](int timer_id) + { + print_slow(); + }); + } + apply_recovery_tune_interval(); msgr.tfd = this->tfd; msgr.ringloop = this->ringloop; @@ -97,6 +104,11 @@ osd_t::~osd_t() tfd->clear_timer(slow_log_timer_id); slow_log_timer_id = -1; } + if (rtune_timer_id >= 0) + { + tfd->clear_timer(rtune_timer_id); + rtune_timer_id = -1; + } if (print_stats_timer_id >= 0) { tfd->clear_timer(print_stats_timer_id); @@ -196,6 +208,22 @@ void osd_t::parse_config(bool init) recovery_queue_depth = config["recovery_queue_depth"].uint64_value(); if (recovery_queue_depth < 1 || recovery_queue_depth > MAX_RECOVERY_QUEUE) recovery_queue_depth = DEFAULT_RECOVERY_QUEUE; + 
recovery_sleep_us = config["recovery_sleep_us"].uint64_value(); + recovery_tune_min_util = config["recovery_tune_min_util"].is_null() + ? 0.1 : config["recovery_tune_min_util"].number_value(); + recovery_tune_max_util = config["recovery_tune_max_util"].is_null() + ? 1.0 : config["recovery_tune_max_util"].number_value(); + recovery_tune_min_client_util = config["recovery_tune_min_client_util"].is_null() + ? 0 : config["recovery_tune_min_client_util"].number_value(); + recovery_tune_max_client_util = config["recovery_tune_max_client_util"].is_null() + ? 0.5 : config["recovery_tune_max_client_util"].number_value(); + auto old_recovery_tune_interval = recovery_tune_interval; + recovery_tune_interval = config["recovery_tune_interval"].is_null() + ? 1 : config["recovery_tune_interval"].uint64_value(); + recovery_tune_ewma_rate = config["recovery_tune_ewma_rate"].is_null() + ? 0.5 : config["recovery_tune_ewma_rate"].number_value(); + recovery_tune_sleep_min_us = config["recovery_tune_sleep_min_us"].is_null() + ? 
10 : config["recovery_tune_sleep_min_us"].uint64_value(); recovery_pg_switch = config["recovery_pg_switch"].uint64_value(); if (recovery_pg_switch < 1) recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH; @@ -274,6 +302,10 @@ void osd_t::parse_config(bool init) print_slow(); }); } + if (old_recovery_tune_interval != recovery_tune_interval) + { + apply_recovery_tune_interval(); + } } void osd_t::bind_socket() diff --git a/src/osd.h b/src/osd.h index 36a62423..fd60d021 100644 --- a/src/osd.h +++ b/src/osd.h @@ -34,7 +34,7 @@ #define DEFAULT_AUTOSYNC_INTERVAL 5 #define DEFAULT_AUTOSYNC_WRITES 128 #define MAX_RECOVERY_QUEUE 2048 -#define DEFAULT_RECOVERY_QUEUE 4 +#define DEFAULT_RECOVERY_QUEUE 1 #define DEFAULT_RECOVERY_PG_SWITCH 128 #define DEFAULT_RECOVERY_BATCH 16 @@ -116,7 +116,15 @@ class osd_t int immediate_commit = IMMEDIATE_NONE; int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // "emergency" sync every 5 seconds int autosync_writes = DEFAULT_AUTOSYNC_WRITES; - int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE; + uint64_t recovery_queue_depth = 1; + uint64_t recovery_sleep_us = 0; + double recovery_tune_min_util = 0.1; + double recovery_tune_min_client_util = 0; + double recovery_tune_max_util = 1.0; + double recovery_tune_max_client_util = 0.5; + int recovery_tune_interval = 1; + double recovery_tune_ewma_rate = 0.5; + int recovery_tune_sleep_min_us = 10; int recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH; int recovery_sync_batch = DEFAULT_RECOVERY_BATCH; int inode_vanish_time = 60; @@ -197,6 +205,16 @@ class osd_t recovery_stat_t recovery_stat[2]; recovery_stat_t recovery_print_prev[2]; + // recovery auto-tuning + int rtune_timer_id = -1; + uint64_t rtune_avg_lat = 0; + double rtune_avg_count = 0; + double rtune_client_util = 0, rtune_target_util = 1; + osd_op_stats_t rtune_prev_stats; + recovery_stat_t rtune_prev_recovery[2]; + uint64_t recovery_target_queue_depth = 1; + uint64_t recovery_target_sleep_us = 0; + // cluster connection void parse_config(bool 
init); void init_cluster(); @@ -213,6 +231,8 @@ class osd_t void create_osd_state(); void renew_lease(bool reload); void print_stats(); + void tune_recovery(); + void apply_recovery_tune_interval(); void print_slow(); json11::Json get_statistics(); void report_statistics(); @@ -242,6 +262,7 @@ class osd_t bool submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data); bool pick_next_recovery(osd_recovery_op_t &op); void submit_recovery_op(osd_recovery_op_t *op); + void finish_recovery_op(osd_recovery_op_t *op); bool continue_recovery(); pg_osd_set_state_t* change_osd_set(pg_osd_set_state_t *st, pg_t *pg); diff --git a/src/osd_flush.cpp b/src/osd_flush.cpp index a8e5ff48..0bacf3c5 100644 --- a/src/osd_flush.cpp +++ b/src/osd_flush.cpp @@ -325,30 +325,116 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op) { printf("Recovery operation done for %lx:%lx\n", op->oid.inode, op->oid.stripe); } - // CAREFUL! op = &recovery_ops[op->oid]. 
Don't access op->* after recovery_ops.erase() - op->osd_op = NULL; - recovery_ops.erase(op->oid); - delete osd_op; - if (immediate_commit != IMMEDIATE_ALL) + if (recovery_target_sleep_us) { - recovery_done++; - if (recovery_done >= recovery_sync_batch) + this->tfd->set_timer_us(recovery_target_sleep_us, false, [this, op](int timer_id) { - // Force sync every operations - // This is required not to pile up an excessive amount of delete operations - autosync(); - recovery_done = 0; - } + finish_recovery_op(op); + }); + } + else + { + finish_recovery_op(op); } - continue_recovery(); }; exec_op(op->osd_op); } +void osd_t::apply_recovery_tune_interval() +{ + if (rtune_timer_id >= 0) + { + tfd->clear_timer(rtune_timer_id); + rtune_timer_id = -1; + } + if (recovery_tune_interval != 0) + { + rtune_timer_id = this->tfd->set_timer(recovery_tune_interval*1000, true, [this](int timer_id) + { + tune_recovery(); + }); + } + else + { + recovery_target_queue_depth = recovery_queue_depth; + recovery_target_sleep_us = recovery_sleep_us; + } +} + +void osd_t::finish_recovery_op(osd_recovery_op_t *op) +{ + // CAREFUL! op = &recovery_ops[op->oid]. 
Don't access op->* after recovery_ops.erase() + delete op->osd_op; + op->osd_op = NULL; + recovery_ops.erase(op->oid); + if (immediate_commit != IMMEDIATE_ALL) + { + recovery_done++; + if (recovery_done >= recovery_sync_batch) + { + // Force sync every recovery_sync_batch operations + // This is required not to pile up an excessive amount of delete operations + autosync(); + recovery_done = 0; + } + } + continue_recovery(); +} + +void osd_t::tune_recovery() +{ + static int total_client_ops[] = { OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC, OSD_OP_DELETE }; + uint64_t total_client_usec = 0; + for (int i = 0; i < sizeof(total_client_ops)/sizeof(total_client_ops[0]); i++) + { + total_client_usec += (msgr.stats.op_stat_sum[total_client_ops[i]] - rtune_prev_stats.op_stat_sum[total_client_ops[i]]); + rtune_prev_stats.op_stat_sum[total_client_ops[i]] = msgr.stats.op_stat_sum[total_client_ops[i]]; + } + uint64_t total_recovery_usec = 0, recovery_count = 0; + total_recovery_usec += recovery_stat[0].usec-rtune_prev_recovery[0].usec; + total_recovery_usec += recovery_stat[1].usec-rtune_prev_recovery[1].usec; + recovery_count += recovery_stat[0].count-rtune_prev_recovery[0].count; + recovery_count += recovery_stat[1].count-rtune_prev_recovery[1].count; + memcpy(rtune_prev_recovery, recovery_stat, sizeof(recovery_stat)); + if (recovery_count == 0) + { + return; + } + rtune_avg_lat = total_recovery_usec/recovery_count*recovery_tune_ewma_rate + + rtune_avg_lat*(1-recovery_tune_ewma_rate); + rtune_avg_count = recovery_count*recovery_tune_ewma_rate + + rtune_avg_count*(1-recovery_tune_ewma_rate); + // client_util = count/interval * usec/1000000.0/count = usec/1000000.0/interval :-) + double client_util = total_client_usec/1000000.0/recovery_tune_interval; + rtune_client_util = rtune_client_util*(1-recovery_tune_ewma_rate) + client_util*recovery_tune_ewma_rate; + rtune_target_util = (rtune_client_util < recovery_tune_min_client_util + ?
recovery_tune_max_util + : recovery_tune_min_util + (rtune_client_util >= recovery_tune_max_client_util + ? 0 : (recovery_tune_max_util-recovery_tune_min_util)* + (recovery_tune_max_client_util-rtune_client_util)/(recovery_tune_max_client_util-recovery_tune_min_client_util) + ) + ); + // for example: utilisation = 8, target = 1 + // intuitively target latency should be 8x of real + // target_lat = rtune_avg_lat * utilisation / target_util + // = rtune_avg_lat * rtune_avg_lat * rtune_avg_iops / target_util + // = 0.0625 + recovery_target_queue_depth = (int)rtune_target_util + (rtune_target_util < 1 || rtune_target_util-(int)rtune_target_util >= 0.1 ? 1 : 0); + uint64_t target_lat = rtune_avg_lat * rtune_avg_lat/1000000.0*rtune_avg_count/recovery_tune_interval/rtune_target_util; + recovery_target_sleep_us = target_lat > rtune_avg_lat+recovery_tune_sleep_min_us ? target_lat-rtune_avg_lat : 0; + if (log_level > 3) + { + printf( + "recovery tune: client util %.2f (ewma %.2f), target util %.2f -> queue %ld, lat %lu us, real %lu us, pause %lu us\n", + client_util, rtune_client_util, rtune_target_util, recovery_target_queue_depth, target_lat, rtune_avg_lat, recovery_target_sleep_us + ); + } +} + // Just trigger write requests for degraded objects. 
They'll be recovered during writing bool osd_t::continue_recovery() { - while (recovery_ops.size() < recovery_queue_depth) + while (recovery_ops.size() < recovery_target_queue_depth) { osd_recovery_op_t op; if (pick_next_recovery(op)) diff --git a/src/osd_primary_write.cpp b/src/osd_primary_write.cpp index dee2da01..5a92f142 100644 --- a/src/osd_primary_write.cpp +++ b/src/osd_primary_write.cpp @@ -296,6 +296,7 @@ resume_7: if (!recovery_stat[recovery_type].count) // wrapped { memset(&recovery_print_prev[recovery_type], 0, sizeof(recovery_print_prev[recovery_type])); + memset(&rtune_prev_recovery[recovery_type], 0, sizeof(rtune_prev_recovery[recovery_type])); memset(&recovery_stat[recovery_type], 0, sizeof(recovery_stat[recovery_type])); recovery_stat[recovery_type].count++; }