Aggregate recovery delay using a simple mean over the last 10 observations instead of EWMA
parent
5ca7cde612
commit
1edf86ed26
|
@ -117,7 +117,7 @@ const etcd_tree = {
|
||||||
recovery_tune_max_util: 1.0,
|
recovery_tune_max_util: 1.0,
|
||||||
recovery_tune_max_client_util: 0.5,
|
recovery_tune_max_client_util: 0.5,
|
||||||
recovery_tune_interval: 1,
|
recovery_tune_interval: 1,
|
||||||
recovery_tune_ewma_rate: 0.5,
|
recovery_tune_agg_interval: 10, // 10 times recovery_tune_interval
|
||||||
recovery_tune_sleep_min_us: 10, // 10 microseconds
|
recovery_tune_sleep_min_us: 10, // 10 microseconds
|
||||||
recovery_pg_switch: 128,
|
recovery_pg_switch: 128,
|
||||||
recovery_sync_batch: 16,
|
recovery_sync_batch: 16,
|
||||||
|
|
|
@ -220,8 +220,8 @@ void osd_t::parse_config(bool init)
|
||||||
auto old_recovery_tune_interval = recovery_tune_interval;
|
auto old_recovery_tune_interval = recovery_tune_interval;
|
||||||
recovery_tune_interval = config["recovery_tune_interval"].is_null()
|
recovery_tune_interval = config["recovery_tune_interval"].is_null()
|
||||||
? 1 : config["recovery_tune_interval"].uint64_value();
|
? 1 : config["recovery_tune_interval"].uint64_value();
|
||||||
recovery_tune_ewma_rate = config["recovery_tune_ewma_rate"].is_null()
|
recovery_tune_agg_interval = config["recovery_tune_agg_interval"].is_null()
|
||||||
? 0.5 : config["recovery_tune_ewma_rate"].number_value();
|
? 10 : config["recovery_tune_agg_interval"].uint64_value();
|
||||||
recovery_tune_sleep_min_us = config["recovery_tune_sleep_min_us"].is_null()
|
recovery_tune_sleep_min_us = config["recovery_tune_sleep_min_us"].is_null()
|
||||||
? 10 : config["recovery_tune_sleep_min_us"].uint64_value();
|
? 10 : config["recovery_tune_sleep_min_us"].uint64_value();
|
||||||
recovery_pg_switch = config["recovery_pg_switch"].uint64_value();
|
recovery_pg_switch = config["recovery_pg_switch"].uint64_value();
|
||||||
|
@ -494,11 +494,12 @@ void osd_t::print_stats()
|
||||||
{
|
{
|
||||||
uint64_t bw = (recovery_stat[i].bytes - recovery_print_prev[i].bytes) / print_stats_interval;
|
uint64_t bw = (recovery_stat[i].bytes - recovery_print_prev[i].bytes) / print_stats_interval;
|
||||||
printf(
|
printf(
|
||||||
"[OSD %lu] %s recovery: %.1f op/s, B/W: %.2f %s, avg lat %ld us\n", osd_num, recovery_stat_names[i],
|
"[OSD %lu] %s recovery: %.1f op/s, B/W: %.2f %s, avg latency %ld us, delay %ld us\n", osd_num, recovery_stat_names[i],
|
||||||
(recovery_stat[i].count - recovery_print_prev[i].count) * 1.0 / print_stats_interval,
|
(recovery_stat[i].count - recovery_print_prev[i].count) * 1.0 / print_stats_interval,
|
||||||
(bw > 1024*1024*1024 ? bw/1024.0/1024/1024 : (bw > 1024*1024 ? bw/1024.0/1024 : bw/1024.0)),
|
(bw > 1024*1024*1024 ? bw/1024.0/1024/1024 : (bw > 1024*1024 ? bw/1024.0/1024 : bw/1024.0)),
|
||||||
(bw > 1024*1024*1024 ? "GB/s" : (bw > 1024*1024 ? "MB/s" : "KB/s")),
|
(bw > 1024*1024*1024 ? "GB/s" : (bw > 1024*1024 ? "MB/s" : "KB/s")),
|
||||||
(recovery_stat[i].usec - recovery_print_prev[i].usec) / (recovery_stat[i].count - recovery_print_prev[i].count)
|
(recovery_stat[i].usec - recovery_print_prev[i].usec) / (recovery_stat[i].count - recovery_print_prev[i].count),
|
||||||
|
recovery_target_sleep_us
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -123,7 +123,7 @@ class osd_t
|
||||||
double recovery_tune_max_util = 1.0;
|
double recovery_tune_max_util = 1.0;
|
||||||
double recovery_tune_max_client_util = 0.5;
|
double recovery_tune_max_client_util = 0.5;
|
||||||
int recovery_tune_interval = 1;
|
int recovery_tune_interval = 1;
|
||||||
double recovery_tune_ewma_rate = 0.2;
|
int recovery_tune_agg_interval = 10;
|
||||||
int recovery_tune_sleep_min_us = 10;
|
int recovery_tune_sleep_min_us = 10;
|
||||||
int recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;
|
int recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;
|
||||||
int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
|
int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
|
||||||
|
@ -210,8 +210,10 @@ class osd_t
|
||||||
uint64_t rtune_avg_lat = 0;
|
uint64_t rtune_avg_lat = 0;
|
||||||
double rtune_client_util = 0, rtune_target_util = 1;
|
double rtune_client_util = 0, rtune_target_util = 1;
|
||||||
osd_op_stats_t rtune_prev_stats, rtune_prev_recovery_stats;
|
osd_op_stats_t rtune_prev_stats, rtune_prev_recovery_stats;
|
||||||
uint64_t recovery_target_queue_depth = 1;
|
std::vector<uint64_t> recovery_target_sleep_items;
|
||||||
uint64_t recovery_target_sleep_us = 0;
|
uint64_t recovery_target_sleep_us = 0;
|
||||||
|
uint64_t recovery_target_sleep_total = 0;
|
||||||
|
int recovery_target_sleep_cur = 0, recovery_target_sleep_count = 0;
|
||||||
|
|
||||||
// cluster connection
|
// cluster connection
|
||||||
void parse_config(bool init);
|
void parse_config(bool init);
|
||||||
|
|
|
@ -346,7 +346,6 @@ void osd_t::apply_recovery_tune_interval()
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
recovery_target_queue_depth = recovery_queue_depth;
|
|
||||||
recovery_target_sleep_us = recovery_sleep_us;
|
recovery_target_sleep_us = recovery_sleep_us;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -412,8 +411,7 @@ void osd_t::tune_recovery()
|
||||||
// = rtune_avg_lat * rtune_avg_lat * rtune_avg_iops / target_util
|
// = rtune_avg_lat * rtune_avg_lat * rtune_avg_iops / target_util
|
||||||
// = 0.0625
|
// = 0.0625
|
||||||
// recovery utilisation will be 1
|
// recovery utilisation will be 1
|
||||||
auto client_util = total_client_usec/1000000.0/recovery_tune_interval;
|
rtune_client_util = total_client_usec/1000000.0/recovery_tune_interval;
|
||||||
rtune_client_util = rtune_client_util*(1-recovery_tune_ewma_rate) + client_util*recovery_tune_ewma_rate;
|
|
||||||
rtune_target_util = (rtune_client_util < recovery_tune_min_client_util
|
rtune_target_util = (rtune_client_util < recovery_tune_min_client_util
|
||||||
? recovery_tune_max_util
|
? recovery_tune_max_util
|
||||||
: recovery_tune_min_util + (rtune_client_util >= recovery_tune_max_client_util
|
: recovery_tune_min_util + (rtune_client_util >= recovery_tune_max_client_util
|
||||||
|
@ -421,15 +419,31 @@ void osd_t::tune_recovery()
|
||||||
(recovery_tune_max_client_util-rtune_client_util)/(recovery_tune_max_client_util-recovery_tune_min_client_util)
|
(recovery_tune_max_client_util-rtune_client_util)/(recovery_tune_max_client_util-recovery_tune_min_client_util)
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
rtune_avg_lat = total_recovery_usec/recovery_count*recovery_tune_ewma_rate + rtune_avg_lat*(1-recovery_tune_ewma_rate);
|
rtune_avg_lat = total_recovery_usec/recovery_count;
|
||||||
recovery_target_queue_depth = (int)rtune_target_util + (rtune_target_util < 1 || rtune_target_util-(int)rtune_target_util >= 0.1 ? 1 : 0);
|
|
||||||
uint64_t target_lat = rtune_avg_lat * rtune_avg_lat/1000000.0 * recovery_count/recovery_tune_interval / rtune_target_util;
|
uint64_t target_lat = rtune_avg_lat * rtune_avg_lat/1000000.0 * recovery_count/recovery_tune_interval / rtune_target_util;
|
||||||
recovery_target_sleep_us = target_lat > rtune_avg_lat+recovery_tune_sleep_min_us ? target_lat-rtune_avg_lat : 0;
|
auto sleep_us = target_lat > rtune_avg_lat+recovery_tune_sleep_min_us ? target_lat-rtune_avg_lat : 0;
|
||||||
if (log_level > 3)
|
if (recovery_target_sleep_items.size() != recovery_tune_agg_interval)
|
||||||
|
{
|
||||||
|
recovery_target_sleep_items.resize(recovery_tune_agg_interval);
|
||||||
|
for (int i = 0; i < recovery_tune_agg_interval; i++)
|
||||||
|
recovery_target_sleep_items[i] = 0;
|
||||||
|
recovery_target_sleep_total = 0;
|
||||||
|
recovery_target_sleep_cur = 0;
|
||||||
|
recovery_target_sleep_count = 0;
|
||||||
|
}
|
||||||
|
recovery_target_sleep_total -= recovery_target_sleep_items[recovery_target_sleep_cur];
|
||||||
|
recovery_target_sleep_items[recovery_target_sleep_cur] = sleep_us;
|
||||||
|
recovery_target_sleep_cur = (recovery_target_sleep_cur+1) % recovery_tune_agg_interval;
|
||||||
|
recovery_target_sleep_total += sleep_us;
|
||||||
|
if (recovery_target_sleep_count < recovery_tune_agg_interval)
|
||||||
|
recovery_target_sleep_count++;
|
||||||
|
recovery_target_sleep_us = recovery_target_sleep_total / recovery_target_sleep_count;
|
||||||
|
if (log_level > 4)
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
"recovery tune: cli %lu us, recovery %lu us / %lu ops, target util %.2f -> queue %ld, lat %lu us, real %lu us, delay %lu us\n",
|
"[OSD %lu] auto-tune: client util: %.2f, recovery util: %.2f, lat: %lu us -> target util %.2f, delay %lu us\n",
|
||||||
total_client_usec, total_recovery_usec, recovery_count, rtune_target_util, recovery_target_queue_depth, target_lat, rtune_avg_lat, recovery_target_sleep_us
|
osd_num, rtune_client_util, total_recovery_usec/1000000.0/recovery_tune_interval,
|
||||||
|
rtune_avg_lat, rtune_target_util, recovery_target_sleep_us
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -437,7 +451,7 @@ void osd_t::tune_recovery()
|
||||||
// Just trigger write requests for degraded objects. They'll be recovered during writing
|
// Just trigger write requests for degraded objects. They'll be recovered during writing
|
||||||
bool osd_t::continue_recovery()
|
bool osd_t::continue_recovery()
|
||||||
{
|
{
|
||||||
while (recovery_ops.size() < recovery_target_queue_depth)
|
while (recovery_ops.size() < recovery_queue_depth)
|
||||||
{
|
{
|
||||||
osd_recovery_op_t op;
|
osd_recovery_op_t op;
|
||||||
if (pick_next_recovery(op))
|
if (pick_next_recovery(op))
|
||||||
|
|
Loading…
Reference in New Issue