From 5ca7cde6123f51d1db2479fc65639c2c6cbd9933 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sun, 17 Dec 2023 00:03:21 +0300 Subject: [PATCH] Experiment/WIP: Try to track "secondary" recovery ops separately --- src/messenger.h | 3 +- src/msgr_op.cpp | 14 +++++++ src/msgr_op.h | 2 + src/msgr_send.cpp | 38 ++++++++++++------- src/osd.h | 8 ++-- src/osd_flush.cpp | 75 ++++++++++++++++++++------------------ src/osd_ops.h | 13 ++++++- src/osd_primary_subops.cpp | 42 ++++++++++++--------- src/osd_primary_write.cpp | 1 - src/osd_secondary.cpp | 16 +++++++- 10 files changed, 137 insertions(+), 75 deletions(-) diff --git a/src/messenger.h b/src/messenger.h index 282586a9..6c44fbfe 100644 --- a/src/messenger.h +++ b/src/messenger.h @@ -149,7 +149,7 @@ public: std::map wanted_peers; std::map osd_peer_fds; // op statistics - osd_op_stats_t stats; + osd_op_stats_t stats, recovery_stats; void init(); void parse_config(const json11::Json & config); @@ -175,6 +175,7 @@ public: bool connect_rdma(int peer_fd, std::string rdma_address, uint64_t client_max_msg); #endif + void inc_op_stats(osd_op_stats_t & stats, uint64_t opcode, timespec & tv_begin, timespec & tv_end, uint64_t len); void measure_exec(osd_op_t *cur_op); protected: diff --git a/src/msgr_op.cpp b/src/msgr_op.cpp index 30dce6c1..84dd0a0f 100644 --- a/src/msgr_op.cpp +++ b/src/msgr_op.cpp @@ -24,3 +24,17 @@ osd_op_t::~osd_op_t() free(buf); } } + +bool osd_op_t::is_recovery_related() +{ + return (req.hdr.opcode == OSD_OP_SEC_READ || + req.hdr.opcode == OSD_OP_SEC_WRITE || + req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE) && + (req.sec_rw.flags & OSD_OP_RECOVERY_RELATED) || + req.hdr.opcode == OSD_OP_SEC_DELETE && + (req.sec_del.flags & OSD_OP_RECOVERY_RELATED) || + req.hdr.opcode == OSD_OP_SEC_STABILIZE && + (req.sec_stab.flags & OSD_OP_RECOVERY_RELATED) || + req.hdr.opcode == OSD_OP_SEC_SYNC && + (req.sec_sync.flags & OSD_OP_RECOVERY_RELATED); +} diff --git a/src/msgr_op.h b/src/msgr_op.h index f06522ce..64c9166e 100644 --- a/src/msgr_op.h +++ b/src/msgr_op.h @@ -173,4 +173,6 @@ struct osd_op_t osd_op_buf_list_t iov; ~osd_op_t(); + + bool is_recovery_related(); }; diff --git a/src/msgr_send.cpp b/src/msgr_send.cpp index 0f702cf8..7374a1e7 100644 --- a/src/msgr_send.cpp +++ b/src/msgr_send.cpp @@ -131,6 +131,23 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op) } } +void osd_messenger_t::inc_op_stats(osd_op_stats_t & stats, uint64_t opcode, timespec & tv_begin, timespec & tv_end, uint64_t len) +{ + uint64_t usecs = ( + (tv_end.tv_sec - tv_begin.tv_sec)*1000000 + + (tv_end.tv_nsec - tv_begin.tv_nsec)/1000 + ); + stats.op_stat_count[opcode]++; + if (!stats.op_stat_count[opcode]) + { + stats.op_stat_count[opcode] = 1; + stats.op_stat_sum[opcode] = 0; + stats.op_stat_bytes[opcode] = 0; + } + stats.op_stat_sum[opcode] += usecs; + stats.op_stat_bytes[opcode] += len; +} + void osd_messenger_t::measure_exec(osd_op_t *cur_op) { // Measure execution latency @@ -142,29 +159,24 @@ void osd_messenger_t::measure_exec(osd_op_t *cur_op) { clock_gettime(CLOCK_REALTIME, &cur_op->tv_end); } - stats.op_stat_count[cur_op->req.hdr.opcode]++; - if (!stats.op_stat_count[cur_op->req.hdr.opcode]) - { - stats.op_stat_count[cur_op->req.hdr.opcode]++; - stats.op_stat_sum[cur_op->req.hdr.opcode] = 0; - stats.op_stat_bytes[cur_op->req.hdr.opcode] = 0; - } - stats.op_stat_sum[cur_op->req.hdr.opcode] += ( - (cur_op->tv_end.tv_sec - cur_op->tv_begin.tv_sec)*1000000 + - (cur_op->tv_end.tv_nsec - cur_op->tv_begin.tv_nsec)/1000 - ); + uint64_t len = 0; if (cur_op->req.hdr.opcode == OSD_OP_READ || cur_op->req.hdr.opcode == OSD_OP_WRITE || cur_op->req.hdr.opcode == OSD_OP_SCRUB) { // req.rw.len is internally set to the full object size for scrubs - stats.op_stat_bytes[cur_op->req.hdr.opcode] += cur_op->req.rw.len; + len = cur_op->req.rw.len; } else if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ || cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE || cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE) { - stats.op_stat_bytes[cur_op->req.hdr.opcode] += cur_op->req.sec_rw.len; + len = cur_op->req.sec_rw.len; + } + inc_op_stats(stats, cur_op->req.hdr.opcode, cur_op->tv_begin, cur_op->tv_end, len); + if (cur_op->is_recovery_related()) + { + inc_op_stats(recovery_stats, cur_op->req.hdr.opcode, cur_op->tv_begin, cur_op->tv_end, len); } } diff --git a/src/osd.h b/src/osd.h index fd60d021..a1bceb00 100644 --- a/src/osd.h +++ b/src/osd.h @@ -123,7 +123,7 @@ class osd_t double recovery_tune_max_util = 1.0; double recovery_tune_max_client_util = 0.5; int recovery_tune_interval = 1; - double recovery_tune_ewma_rate = 0.5; + double recovery_tune_ewma_rate = 0.2; int recovery_tune_sleep_min_us = 10; int recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH; int recovery_sync_batch = DEFAULT_RECOVERY_BATCH; @@ -208,10 +208,8 @@ class osd_t // recovery auto-tuning int rtune_timer_id = -1; uint64_t rtune_avg_lat = 0; - double rtune_avg_count = 0; double rtune_client_util = 0, rtune_target_util = 1; - osd_op_stats_t rtune_prev_stats; - recovery_stat_t rtune_prev_recovery[2]; + osd_op_stats_t rtune_prev_stats, rtune_prev_recovery_stats; uint64_t recovery_target_queue_depth = 1; uint64_t recovery_target_sleep_us = 0; @@ -304,7 +302,7 @@ class osd_t bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state); void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op); void handle_primary_bs_subop(osd_op_t *subop); - void add_bs_subop_stats(osd_op_t *subop); + void add_bs_subop_stats(osd_op_t *subop, bool recovery_related = false); void pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid, int retval); void submit_primary_subops(int submit_type, uint64_t op_version, const uint64_t* osd_set, osd_op_t *cur_op); diff --git a/src/osd_flush.cpp b/src/osd_flush.cpp index 0bacf3c5..95de5f08 100644 --- a/src/osd_flush.cpp +++ b/src/osd_flush.cpp @@ -325,17 +325,7 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op) { printf("Recovery operation done for %lx:%lx\n", op->oid.inode, op->oid.stripe); } - if (recovery_target_sleep_us) - { - this->tfd->set_timer_us(recovery_target_sleep_us, false, [this, op](int timer_id) - { - finish_recovery_op(op); - }); - } - else - { - finish_recovery_op(op); - } + finish_recovery_op(op); }; exec_op(op->osd_op); } @@ -383,29 +373,46 @@ void osd_t::finish_recovery_op(osd_recovery_op_t *op) void osd_t::tune_recovery() { - static int total_client_ops[] = { OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC, OSD_OP_DELETE }; - uint64_t total_client_usec = 0; - for (int i = 0; i < sizeof(total_client_ops)/sizeof(total_client_ops[0]); i++) + static int accounted_ops[] = { + OSD_OP_SEC_READ, OSD_OP_SEC_WRITE, OSD_OP_SEC_WRITE_STABLE, + OSD_OP_SEC_STABILIZE, OSD_OP_SEC_SYNC, OSD_OP_SEC_DELETE + }; + uint64_t total_client_usec = 0, total_recovery_usec = 0, recovery_count = 0; + for (int i = 0; i < sizeof(accounted_ops)/sizeof(accounted_ops[0]); i++) { - total_client_usec += (msgr.stats.op_stat_sum[total_client_ops[i]] - rtune_prev_stats.op_stat_sum[total_client_ops[i]]); - rtune_prev_stats.op_stat_sum[total_client_ops[i]] = msgr.stats.op_stat_sum[total_client_ops[i]]; + total_client_usec += (msgr.stats.op_stat_sum[accounted_ops[i]] + - rtune_prev_stats.op_stat_sum[accounted_ops[i]]); + total_recovery_usec += (msgr.recovery_stats.op_stat_sum[accounted_ops[i]] + - rtune_prev_recovery_stats.op_stat_sum[accounted_ops[i]]); + recovery_count += (msgr.recovery_stats.op_stat_count[accounted_ops[i]] + - rtune_prev_recovery_stats.op_stat_count[accounted_ops[i]]); + rtune_prev_stats.op_stat_sum[accounted_ops[i]] = msgr.stats.op_stat_sum[accounted_ops[i]]; + rtune_prev_recovery_stats.op_stat_sum[accounted_ops[i]] = msgr.recovery_stats.op_stat_sum[accounted_ops[i]]; + rtune_prev_recovery_stats.op_stat_count[accounted_ops[i]] = msgr.recovery_stats.op_stat_count[accounted_ops[i]]; } - uint64_t total_recovery_usec = 0, recovery_count = 0; - total_recovery_usec += recovery_stat[0].usec-rtune_prev_recovery[0].usec; - total_recovery_usec += recovery_stat[1].usec-rtune_prev_recovery[1].usec; - recovery_count += recovery_stat[0].count-rtune_prev_recovery[0].count; - recovery_count += recovery_stat[1].count-rtune_prev_recovery[1].count; - memcpy(rtune_prev_recovery, recovery_stat, sizeof(recovery_stat)); + total_client_usec -= total_recovery_usec; if (recovery_count == 0) { return; } - rtune_avg_lat = total_recovery_usec/recovery_count*recovery_tune_ewma_rate + - rtune_avg_lat*(1-recovery_tune_ewma_rate); - rtune_avg_count = recovery_count*recovery_tune_ewma_rate + - rtune_avg_count*(1-recovery_tune_ewma_rate); - // client_util = count/interval * usec/1000000.0/count = usec/1000000.0/interval :-) - double client_util = total_client_usec/1000000.0/recovery_tune_interval; + // example: + // total 3 GB/s + // recovery queue 1 + // 120 OSDs + // EC 5+3 + // 128kb block_size => 640kb object + // 3000*1024/640/120 = 40 MB/s per OSD = 64 recovered objects per OSD + // = 64*8*2 subops = 1024 recovery subop iops + // 8 recovery subop queue + // => subop avg latency = 0.0078125 sec + // utilisation = 8 + // target util 1 + // intuitively target latency should be 8x of real + // target_lat = rtune_avg_lat * utilisation / target_util + // = rtune_avg_lat * rtune_avg_lat * rtune_avg_iops / target_util + // = 0.0625 + // recovery utilisation will be 1 + auto client_util = total_client_usec/1000000.0/recovery_tune_interval; rtune_client_util = rtune_client_util*(1-recovery_tune_ewma_rate) + client_util*recovery_tune_ewma_rate; rtune_target_util = (rtune_client_util < recovery_tune_min_client_util ? recovery_tune_max_util @@ -414,19 +421,15 @@ void osd_t::tune_recovery() (recovery_tune_max_client_util-rtune_client_util)/(recovery_tune_max_client_util-recovery_tune_min_client_util) ) ); - // for example: utilisation = 8, target = 1 - // intuitively target latency should be 8x of real - // target_lat = rtune_avg_lat * utilisation / target_util - // = rtune_avg_lat * rtune_avg_lat * rtune_avg_iops / target_util - // = 0.0625 + rtune_avg_lat = total_recovery_usec/recovery_count*recovery_tune_ewma_rate + rtune_avg_lat*(1-recovery_tune_ewma_rate); recovery_target_queue_depth = (int)rtune_target_util + (rtune_target_util < 1 || rtune_target_util-(int)rtune_target_util >= 0.1 ? 1 : 0); - uint64_t target_lat = rtune_avg_lat * rtune_avg_lat/1000000.0*rtune_avg_count/recovery_tune_interval/rtune_target_util; + uint64_t target_lat = rtune_avg_lat * rtune_avg_lat/1000000.0 * recovery_count/recovery_tune_interval / rtune_target_util; recovery_target_sleep_us = target_lat > rtune_avg_lat+recovery_tune_sleep_min_us ? target_lat-rtune_avg_lat : 0; if (log_level > 3) { printf( - "recovery tune: client util %.2f (ewma %.2f), target util %.2f -> queue %ld, lat %lu us, real %lu us, pause %lu us\n", - client_util, rtune_client_util, rtune_target_util, recovery_target_queue_depth, target_lat, rtune_avg_lat, recovery_target_sleep_us + "recovery tune: cli %lu us, recovery %lu us / %lu ops, target util %.2f -> queue %ld, lat %lu us, real %lu us, delay %lu us\n", + total_client_usec, total_recovery_usec, recovery_count, rtune_target_util, recovery_target_queue_depth, target_lat, rtune_avg_lat, recovery_target_sleep_us ); } } diff --git a/src/osd_ops.h b/src/osd_ops.h index 217cbb0b..6cadead0 100644 --- a/src/osd_ops.h +++ b/src/osd_ops.h @@ -34,6 +34,7 @@ #define OSD_OP_MAX 18 #define OSD_RW_MAX 64*1024*1024 #define OSD_PROTOCOL_VERSION 1 +#define OSD_OP_RECOVERY_RELATED (uint32_t)1 // Memory alignment for direct I/O (usually 512 bytes) #ifndef DIRECT_IO_ALIGNMENT @@ -88,7 +89,8 @@ struct __attribute__((__packed__)) osd_op_sec_rw_t uint32_t len; // bitmap/attribute length - bitmap comes after header, but before data uint32_t attr_len; - uint32_t pad0; + // the only possible flag is OSD_OP_RECOVERY_RELATED + uint32_t flags; }; struct __attribute__((__packed__)) osd_reply_sec_rw_t @@ -109,6 +111,9 @@ struct __attribute__((__packed__)) osd_op_sec_del_t object_id oid; // delete version (automatic or specific) uint64_t version; + // the only possible flag is OSD_OP_RECOVERY_RELATED + uint32_t flags; + uint32_t pad0; }; struct __attribute__((__packed__)) osd_reply_sec_del_t @@ -121,6 +126,9 @@ struct __attribute__((__packed__)) osd_reply_sec_del_t struct __attribute__((__packed__)) osd_op_sec_sync_t { osd_op_header_t header; + // the only possible flag is OSD_OP_RECOVERY_RELATED + uint32_t flags; + uint32_t pad0; }; struct __attribute__((__packed__)) osd_reply_sec_sync_t @@ -134,6 +142,9 @@ struct __attribute__((__packed__)) osd_op_sec_stab_t osd_op_header_t header; // obj_ver_id array length in bytes uint64_t len; + // the only possible flag is OSD_OP_RECOVERY_RELATED + uint32_t flags; + uint32_t pad0; }; typedef osd_op_sec_stab_t osd_op_sec_rollback_t; diff --git a/src/osd_primary_subops.cpp b/src/osd_primary_subops.cpp index 6cfa2331..db169c9c 100644 --- a/src/osd_primary_subops.cpp +++ b/src/osd_primary_subops.cpp @@ -221,6 +221,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o .offset = wr ? si->write_start : si->read_start, .len = subop_len, .attr_len = wr ? clean_entry_bitmap_size : 0, + .flags = cur_op->peer_fd == SELF_FD && cur_op->req.hdr.opcode != OSD_OP_SCRUB ? OSD_OP_RECOVERY_RELATED : 0, }; #ifdef OSD_DEBUG printf( @@ -300,7 +301,8 @@ void osd_t::handle_primary_bs_subop(osd_op_t *subop) " retval = "+std::to_string(bs_op->retval)+")" ); } - add_bs_subop_stats(subop); + bool recovery_related = cur_op->peer_fd == SELF_FD && cur_op->req.hdr.opcode != OSD_OP_SCRUB; + add_bs_subop_stats(subop, recovery_related); subop->req.hdr.opcode = bs_op_to_osd_op[bs_op->opcode]; subop->reply.hdr.retval = bs_op->retval; if (bs_op->opcode == BS_OP_READ || bs_op->opcode == BS_OP_WRITE || bs_op->opcode == BS_OP_WRITE_STABLE) @@ -312,30 +314,33 @@ void osd_t::handle_primary_bs_subop(osd_op_t *subop) } delete bs_op; subop->bs_op = NULL; - subop->peer_fd = -1; - handle_primary_subop(subop, cur_op); + subop->peer_fd = SELF_FD; + if (recovery_related && recovery_target_sleep_us) + { + tfd->set_timer_us(recovery_target_sleep_us, false, [=](int timer_id) + { + handle_primary_subop(subop, cur_op); + }); + } + else + { + handle_primary_subop(subop, cur_op); + } } -void osd_t::add_bs_subop_stats(osd_op_t *subop) +void osd_t::add_bs_subop_stats(osd_op_t *subop, bool recovery_related) { // Include local blockstore ops in statistics uint64_t opcode = bs_op_to_osd_op[subop->bs_op->opcode]; timespec tv_end; clock_gettime(CLOCK_REALTIME, &tv_end); - msgr.stats.op_stat_count[opcode]++; - if (!msgr.stats.op_stat_count[opcode]) + uint64_t len = (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE) + ? subop->bs_op->len : 0; + msgr.inc_op_stats(msgr.stats, opcode, subop->tv_begin, tv_end, len); + if (recovery_related) { - msgr.stats.op_stat_count[opcode] = 1; - msgr.stats.op_stat_sum[opcode] = 0; - msgr.stats.op_stat_bytes[opcode] = 0; - } - msgr.stats.op_stat_sum[opcode] += ( - (tv_end.tv_sec - subop->tv_begin.tv_sec)*1000000 + - (tv_end.tv_nsec - subop->tv_begin.tv_nsec)/1000 - ); - if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE) - { - msgr.stats.op_stat_bytes[opcode] += subop->bs_op->len; + // It is OSD_OP_RECOVERY_RELATED + msgr.inc_op_stats(msgr.recovery_stats, opcode, subop->tv_begin, tv_end, len); } } @@ -558,6 +563,7 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_ }, .oid = chunk.oid, .version = chunk.version, + .flags = cur_op->peer_fd == SELF_FD && cur_op->req.hdr.opcode != OSD_OP_SCRUB ? OSD_OP_RECOVERY_RELATED : 0, } }; subops[i].callback = [cur_op, this](osd_op_t *subop) { @@ -615,6 +621,7 @@ int osd_t::submit_primary_sync_subops(osd_op_t *cur_op) .id = msgr.next_subop_id++, .opcode = OSD_OP_SEC_SYNC, }, + .flags = cur_op->peer_fd == SELF_FD && cur_op->req.hdr.opcode != OSD_OP_SCRUB ? OSD_OP_RECOVERY_RELATED : 0, } }; subops[i].callback = [cur_op, this](osd_op_t *subop) { @@ -674,6 +681,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op) .opcode = OSD_OP_SEC_STABILIZE, }, .len = (uint64_t)(stab_osd.len * sizeof(obj_ver_id)), + .flags = cur_op->peer_fd == SELF_FD && cur_op->req.hdr.opcode != OSD_OP_SCRUB ? OSD_OP_RECOVERY_RELATED : 0, } }; subops[i].iov.push_back(op_data->unstable_writes + stab_osd.start, stab_osd.len * sizeof(obj_ver_id)); subops[i].callback = [cur_op, this](osd_op_t *subop) diff --git a/src/osd_primary_write.cpp b/src/osd_primary_write.cpp index 5a92f142..dee2da01 100644 --- a/src/osd_primary_write.cpp +++ b/src/osd_primary_write.cpp @@ -296,7 +296,6 @@ resume_7: if (!recovery_stat[recovery_type].count) // wrapped { memset(&recovery_print_prev[recovery_type], 0, sizeof(recovery_print_prev[recovery_type])); - memset(&rtune_prev_recovery[recovery_type], 0, sizeof(rtune_prev_recovery[recovery_type])); memset(&recovery_stat[recovery_type], 0, sizeof(recovery_stat[recovery_type])); recovery_stat[recovery_type].count++; } diff --git a/src/osd_secondary.cpp b/src/osd_secondary.cpp index 91939e04..d8fe2627 100644 --- a/src/osd_secondary.cpp +++ b/src/osd_secondary.cpp @@ -42,7 +42,21 @@ void osd_t::secondary_op_callback(osd_op_t *op) int retval = op->bs_op->retval; delete op->bs_op; op->bs_op = NULL; - finish_op(op, retval); + if (op->is_recovery_related() && recovery_target_sleep_us) + { + if (!op->tv_end.tv_sec) + { + clock_gettime(CLOCK_REALTIME, &op->tv_end); + } + tfd->set_timer_us(recovery_target_sleep_us, false, [this, op, retval](int timer_id) + { + finish_op(op, retval); + }); + } + else + { + finish_op(op, retval); + } } void osd_t::exec_secondary(osd_op_t *cur_op)