From 1edf86ed264b5fbc610e5a25e9a03d85cc388f6a Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Wed, 20 Dec 2023 02:24:04 +0300 Subject: [PATCH] Aggregate recovery delay using simple mean over last 10 observations (EWMA is shit) --- mon/mon.js | 2 +- src/osd.cpp | 9 +++++---- src/osd.h | 6 ++++-- src/osd_flush.cpp | 34 ++++++++++++++++++++++++---------- 4 files changed, 34 insertions(+), 17 deletions(-) diff --git a/mon/mon.js b/mon/mon.js index 564fdbac..3a2a5fa9 100644 --- a/mon/mon.js +++ b/mon/mon.js @@ -117,7 +117,7 @@ const etcd_tree = { recovery_tune_max_util: 1.0, recovery_tune_max_client_util: 0.5, recovery_tune_interval: 1, - recovery_tune_ewma_rate: 0.5, + recovery_tune_agg_interval: 10, // 10 times recovery_tune_interval recovery_tune_sleep_min_us: 10, // 10 microseconds recovery_pg_switch: 128, recovery_sync_batch: 16, diff --git a/src/osd.cpp b/src/osd.cpp index 943da8ba..3889420a 100644 --- a/src/osd.cpp +++ b/src/osd.cpp @@ -220,8 +220,8 @@ void osd_t::parse_config(bool init) auto old_recovery_tune_interval = recovery_tune_interval; recovery_tune_interval = config["recovery_tune_interval"].is_null() ? 1 : config["recovery_tune_interval"].uint64_value(); - recovery_tune_ewma_rate = config["recovery_tune_ewma_rate"].is_null() - ? 0.5 : config["recovery_tune_ewma_rate"].number_value(); + recovery_tune_agg_interval = config["recovery_tune_agg_interval"].is_null() + ? 10 : config["recovery_tune_agg_interval"].uint64_value(); recovery_tune_sleep_min_us = config["recovery_tune_sleep_min_us"].is_null() ? 10 : config["recovery_tune_sleep_min_us"].uint64_value(); recovery_pg_switch = config["recovery_pg_switch"].uint64_value(); @@ -494,11 +494,12 @@ void osd_t::print_stats() { uint64_t bw = (recovery_stat[i].bytes - recovery_print_prev[i].bytes) / print_stats_interval; printf( - "[OSD %lu] %s recovery: %.1f op/s, B/W: %.2f %s, avg lat %ld us\n", osd_num, recovery_stat_names[i], + "[OSD %lu] %s recovery: %.1f op/s, B/W: %.2f %s, avg latency %ld us, delay %ld us\n", osd_num, recovery_stat_names[i], (recovery_stat[i].count - recovery_print_prev[i].count) * 1.0 / print_stats_interval, (bw > 1024*1024*1024 ? bw/1024.0/1024/1024 : (bw > 1024*1024 ? bw/1024.0/1024 : bw/1024.0)), (bw > 1024*1024*1024 ? "GB/s" : (bw > 1024*1024 ? "MB/s" : "KB/s")), - (recovery_stat[i].usec - recovery_print_prev[i].usec) / (recovery_stat[i].count - recovery_print_prev[i].count) + (recovery_stat[i].usec - recovery_print_prev[i].usec) / (recovery_stat[i].count - recovery_print_prev[i].count), + recovery_target_sleep_us ); } } diff --git a/src/osd.h b/src/osd.h index a1bceb00..dce93d5b 100644 --- a/src/osd.h +++ b/src/osd.h @@ -123,7 +123,7 @@ class osd_t double recovery_tune_max_util = 1.0; double recovery_tune_max_client_util = 0.5; int recovery_tune_interval = 1; - double recovery_tune_ewma_rate = 0.2; + int recovery_tune_agg_interval = 10; int recovery_tune_sleep_min_us = 10; int recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH; int recovery_sync_batch = DEFAULT_RECOVERY_BATCH; @@ -210,8 +210,10 @@ class osd_t uint64_t rtune_avg_lat = 0; double rtune_client_util = 0, rtune_target_util = 1; osd_op_stats_t rtune_prev_stats, rtune_prev_recovery_stats; - uint64_t recovery_target_queue_depth = 1; + std::vector recovery_target_sleep_items; uint64_t recovery_target_sleep_us = 0; + uint64_t recovery_target_sleep_total = 0; + int recovery_target_sleep_cur = 0, recovery_target_sleep_count = 0; // cluster connection void parse_config(bool init); diff --git a/src/osd_flush.cpp b/src/osd_flush.cpp index 95de5f08..bbac2d49 100644 --- a/src/osd_flush.cpp +++ b/src/osd_flush.cpp @@ -346,7 +346,6 @@ void osd_t::apply_recovery_tune_interval() } else { - recovery_target_queue_depth = recovery_queue_depth; recovery_target_sleep_us = recovery_sleep_us; } } @@ -412,8 +411,7 @@ void osd_t::tune_recovery() // = rtune_avg_lat * rtune_avg_lat * rtune_avg_iops / target_util // = 0.0625 // recovery utilisation will be 1 - auto client_util = total_client_usec/1000000.0/recovery_tune_interval; - rtune_client_util = rtune_client_util*(1-recovery_tune_ewma_rate) + client_util*recovery_tune_ewma_rate; + rtune_client_util = total_client_usec/1000000.0/recovery_tune_interval; rtune_target_util = (rtune_client_util < recovery_tune_min_client_util ? recovery_tune_max_util : recovery_tune_min_util + (rtune_client_util >= recovery_tune_max_client_util @@ -421,15 +419,31 @@ void osd_t::tune_recovery() (recovery_tune_max_client_util-rtune_client_util)/(recovery_tune_max_client_util-recovery_tune_min_client_util) ) ); - rtune_avg_lat = total_recovery_usec/recovery_count*recovery_tune_ewma_rate + rtune_avg_lat*(1-recovery_tune_ewma_rate); - recovery_target_queue_depth = (int)rtune_target_util + (rtune_target_util < 1 || rtune_target_util-(int)rtune_target_util >= 0.1 ? 1 : 0); + rtune_avg_lat = total_recovery_usec/recovery_count; uint64_t target_lat = rtune_avg_lat * rtune_avg_lat/1000000.0 * recovery_count/recovery_tune_interval / rtune_target_util; - recovery_target_sleep_us = target_lat > rtune_avg_lat+recovery_tune_sleep_min_us ? target_lat-rtune_avg_lat : 0; - if (log_level > 3) + auto sleep_us = target_lat > rtune_avg_lat+recovery_tune_sleep_min_us ? target_lat-rtune_avg_lat : 0; + if (recovery_target_sleep_items.size() != recovery_tune_agg_interval) + { + recovery_target_sleep_items.resize(recovery_tune_agg_interval); + for (int i = 0; i < recovery_tune_agg_interval; i++) + recovery_target_sleep_items[i] = 0; + recovery_target_sleep_total = 0; + recovery_target_sleep_cur = 0; + recovery_target_sleep_count = 0; + } + recovery_target_sleep_total -= recovery_target_sleep_items[recovery_target_sleep_cur]; + recovery_target_sleep_items[recovery_target_sleep_cur] = sleep_us; + recovery_target_sleep_cur = (recovery_target_sleep_cur+1) % recovery_tune_agg_interval; + recovery_target_sleep_total += sleep_us; + if (recovery_target_sleep_count < recovery_tune_agg_interval) + recovery_target_sleep_count++; + recovery_target_sleep_us = recovery_target_sleep_total / recovery_target_sleep_count; + if (log_level > 4) { printf( - "recovery tune: cli %lu us, recovery %lu us / %lu ops, target util %.2f -> queue %ld, lat %lu us, real %lu us, delay %lu us\n", - total_client_usec, total_recovery_usec, recovery_count, rtune_target_util, recovery_target_queue_depth, target_lat, rtune_avg_lat, recovery_target_sleep_us + "[OSD %lu] auto-tune: client util: %.2f, recovery util: %.2f, lat: %lu us -> target util %.2f, delay %lu us\n", + osd_num, rtune_client_util, total_recovery_usec/1000000.0/recovery_tune_interval, + rtune_avg_lat, rtune_target_util, recovery_target_sleep_us ); } } @@ -437,7 +451,7 @@ void osd_t::tune_recovery() // Just trigger write requests for degraded objects. They'll be recovered during writing bool osd_t::continue_recovery() { - while (recovery_ops.size() < recovery_target_queue_depth) + while (recovery_ops.size() < recovery_queue_depth) { osd_recovery_op_t op; if (pick_next_recovery(op))