Experiment/WIP: Try to track "secondary" recovery ops separately

kv-update
Vitaliy Filippov 2023-12-17 00:03:21 +03:00
parent 751935ddd8
commit 5ca7cde612
10 changed files with 137 additions and 75 deletions
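
In short: secondary (SEC_*) operations generated by recovery/rebalance are tagged with a new OSD_OP_RECOVERY_RELATED flag in their requests, the messenger accumulates them into a second osd_op_stats_t (recovery_stats) in addition to the usual stats, and tune_recovery() then works from that split instead of the primary-side recovery_stat[] counters. A minimal sketch of the accounting idea, with simplified hypothetical types (the real counters are the per-opcode op_stat_count / op_stat_sum / op_stat_bytes arrays of osd_op_stats_t):

#include <cstdint>

// Hypothetical simplified stats - for illustration only
struct simple_stats_t { uint64_t count = 0, usec = 0, bytes = 0; };
simple_stats_t stats;           // all secondary ops (client + recovery)
simple_stats_t recovery_stats;  // only ops tagged with OSD_OP_RECOVERY_RELATED

void account(uint64_t usec, uint64_t len, bool recovery_related)
{
    stats.count++; stats.usec += usec; stats.bytes += len;
    if (recovery_related)
    {
        recovery_stats.count++; recovery_stats.usec += usec; recovery_stats.bytes += len;
    }
}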

View File

@@ -149,7 +149,7 @@ public:
std::map<osd_num_t, osd_wanted_peer_t> wanted_peers;
std::map<uint64_t, int> osd_peer_fds;
// op statistics
osd_op_stats_t stats;
osd_op_stats_t stats, recovery_stats;
void init();
void parse_config(const json11::Json & config);
@@ -175,6 +175,7 @@ public:
bool connect_rdma(int peer_fd, std::string rdma_address, uint64_t client_max_msg);
#endif
void inc_op_stats(osd_op_stats_t & stats, uint64_t opcode, timespec & tv_begin, timespec & tv_end, uint64_t len);
void measure_exec(osd_op_t *cur_op);
protected:

View File

@@ -24,3 +24,17 @@ osd_op_t::~osd_op_t()
free(buf);
}
}
bool osd_op_t::is_recovery_related()
{
    return ((req.hdr.opcode == OSD_OP_SEC_READ ||
        req.hdr.opcode == OSD_OP_SEC_WRITE ||
        req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE) &&
        (req.sec_rw.flags & OSD_OP_RECOVERY_RELATED)) ||
        (req.hdr.opcode == OSD_OP_SEC_DELETE &&
            (req.sec_del.flags & OSD_OP_RECOVERY_RELATED)) ||
        (req.hdr.opcode == OSD_OP_SEC_STABILIZE &&
            (req.sec_stab.flags & OSD_OP_RECOVERY_RELATED)) ||
        (req.hdr.opcode == OSD_OP_SEC_SYNC &&
            (req.sec_sync.flags & OSD_OP_RECOVERY_RELATED));
}

View File

@@ -173,4 +173,6 @@ struct osd_op_t
osd_op_buf_list_t iov;
~osd_op_t();
bool is_recovery_related();
};

View File

@@ -131,6 +131,23 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
}
}
void osd_messenger_t::inc_op_stats(osd_op_stats_t & stats, uint64_t opcode, timespec & tv_begin, timespec & tv_end, uint64_t len)
{
    uint64_t usecs = (
        (tv_end.tv_sec - tv_begin.tv_sec)*1000000 +
        (tv_end.tv_nsec - tv_begin.tv_nsec)/1000
    );
    stats.op_stat_count[opcode]++;
    if (!stats.op_stat_count[opcode])
    {
        // the 64-bit counter wrapped around - restart accumulation for this opcode
        stats.op_stat_count[opcode] = 1;
        stats.op_stat_sum[opcode] = 0;
        stats.op_stat_bytes[opcode] = 0;
    }
    stats.op_stat_sum[opcode] += usecs;
    stats.op_stat_bytes[opcode] += len;
}
void osd_messenger_t::measure_exec(osd_op_t *cur_op)
{
// Measure execution latency
@@ -142,29 +159,24 @@ void osd_messenger_t::measure_exec(osd_op_t *cur_op)
{
clock_gettime(CLOCK_REALTIME, &cur_op->tv_end);
}
stats.op_stat_count[cur_op->req.hdr.opcode]++;
if (!stats.op_stat_count[cur_op->req.hdr.opcode])
{
stats.op_stat_count[cur_op->req.hdr.opcode]++;
stats.op_stat_sum[cur_op->req.hdr.opcode] = 0;
stats.op_stat_bytes[cur_op->req.hdr.opcode] = 0;
}
stats.op_stat_sum[cur_op->req.hdr.opcode] += (
(cur_op->tv_end.tv_sec - cur_op->tv_begin.tv_sec)*1000000 +
(cur_op->tv_end.tv_nsec - cur_op->tv_begin.tv_nsec)/1000
);
uint64_t len = 0;
if (cur_op->req.hdr.opcode == OSD_OP_READ ||
cur_op->req.hdr.opcode == OSD_OP_WRITE ||
cur_op->req.hdr.opcode == OSD_OP_SCRUB)
{
// req.rw.len is internally set to the full object size for scrubs
stats.op_stat_bytes[cur_op->req.hdr.opcode] += cur_op->req.rw.len;
len = cur_op->req.rw.len;
}
else if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ ||
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE)
{
stats.op_stat_bytes[cur_op->req.hdr.opcode] += cur_op->req.sec_rw.len;
len = cur_op->req.sec_rw.len;
}
inc_op_stats(stats, cur_op->req.hdr.opcode, cur_op->tv_begin, cur_op->tv_end, len);
if (cur_op->is_recovery_related())
{
inc_op_stats(recovery_stats, cur_op->req.hdr.opcode, cur_op->tv_begin, cur_op->tv_end, len);
}
}

View File

@@ -123,7 +123,7 @@ class osd_t
double recovery_tune_max_util = 1.0;
double recovery_tune_max_client_util = 0.5;
int recovery_tune_interval = 1;
double recovery_tune_ewma_rate = 0.5;
double recovery_tune_ewma_rate = 0.2;
int recovery_tune_sleep_min_us = 10;
int recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;
int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
@@ -208,10 +208,8 @@ class osd_t
// recovery auto-tuning
int rtune_timer_id = -1;
uint64_t rtune_avg_lat = 0;
double rtune_avg_count = 0;
double rtune_client_util = 0, rtune_target_util = 1;
osd_op_stats_t rtune_prev_stats;
recovery_stat_t rtune_prev_recovery[2];
osd_op_stats_t rtune_prev_stats, rtune_prev_recovery_stats;
uint64_t recovery_target_queue_depth = 1;
uint64_t recovery_target_sleep_us = 0;
@@ -304,7 +302,7 @@ class osd_t
bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);
void handle_primary_bs_subop(osd_op_t *subop);
void add_bs_subop_stats(osd_op_t *subop);
void add_bs_subop_stats(osd_op_t *subop, bool recovery_related = false);
void pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid, int retval);
void submit_primary_subops(int submit_type, uint64_t op_version, const uint64_t* osd_set, osd_op_t *cur_op);
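
The lowered recovery_tune_ewma_rate default (0.5 -> 0.2) is the smoothing factor of the exponential moving averages maintained by tune_recovery() below; a smaller value gives less weight to new samples, so rtune_avg_lat and rtune_client_util react more slowly to per-interval noise. The update rule used in the following hunks is simply:

// EWMA update with alpha = recovery_tune_ewma_rate, as applied to rtune_avg_lat
// and rtune_client_util in tune_recovery() (sketch of the formula only)
static inline double ewma_update(double avg, double sample, double alpha)
{
    return sample*alpha + avg*(1 - alpha);
}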

View File

@@ -325,17 +325,7 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
{
printf("Recovery operation done for %lx:%lx\n", op->oid.inode, op->oid.stripe);
}
if (recovery_target_sleep_us)
{
this->tfd->set_timer_us(recovery_target_sleep_us, false, [this, op](int timer_id)
{
finish_recovery_op(op);
});
}
else
{
finish_recovery_op(op);
}
finish_recovery_op(op);
};
exec_op(op->osd_op);
}
@@ -383,29 +373,46 @@ void osd_t::finish_recovery_op(osd_recovery_op_t *op)
void osd_t::tune_recovery()
{
static int total_client_ops[] = { OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC, OSD_OP_DELETE };
uint64_t total_client_usec = 0;
for (int i = 0; i < sizeof(total_client_ops)/sizeof(total_client_ops[0]); i++)
static int accounted_ops[] = {
OSD_OP_SEC_READ, OSD_OP_SEC_WRITE, OSD_OP_SEC_WRITE_STABLE,
OSD_OP_SEC_STABILIZE, OSD_OP_SEC_SYNC, OSD_OP_SEC_DELETE
};
uint64_t total_client_usec = 0, total_recovery_usec = 0, recovery_count = 0;
for (int i = 0; i < sizeof(accounted_ops)/sizeof(accounted_ops[0]); i++)
{
total_client_usec += (msgr.stats.op_stat_sum[total_client_ops[i]] - rtune_prev_stats.op_stat_sum[total_client_ops[i]]);
rtune_prev_stats.op_stat_sum[total_client_ops[i]] = msgr.stats.op_stat_sum[total_client_ops[i]];
total_client_usec += (msgr.stats.op_stat_sum[accounted_ops[i]]
- rtune_prev_stats.op_stat_sum[accounted_ops[i]]);
total_recovery_usec += (msgr.recovery_stats.op_stat_sum[accounted_ops[i]]
- rtune_prev_recovery_stats.op_stat_sum[accounted_ops[i]]);
recovery_count += (msgr.recovery_stats.op_stat_count[accounted_ops[i]]
- rtune_prev_recovery_stats.op_stat_count[accounted_ops[i]]);
rtune_prev_stats.op_stat_sum[accounted_ops[i]] = msgr.stats.op_stat_sum[accounted_ops[i]];
rtune_prev_recovery_stats.op_stat_sum[accounted_ops[i]] = msgr.recovery_stats.op_stat_sum[accounted_ops[i]];
rtune_prev_recovery_stats.op_stat_count[accounted_ops[i]] = msgr.recovery_stats.op_stat_count[accounted_ops[i]];
}
uint64_t total_recovery_usec = 0, recovery_count = 0;
total_recovery_usec += recovery_stat[0].usec-rtune_prev_recovery[0].usec;
total_recovery_usec += recovery_stat[1].usec-rtune_prev_recovery[1].usec;
recovery_count += recovery_stat[0].count-rtune_prev_recovery[0].count;
recovery_count += recovery_stat[1].count-rtune_prev_recovery[1].count;
memcpy(rtune_prev_recovery, recovery_stat, sizeof(recovery_stat));
total_client_usec -= total_recovery_usec;
if (recovery_count == 0)
{
return;
}
rtune_avg_lat = total_recovery_usec/recovery_count*recovery_tune_ewma_rate +
rtune_avg_lat*(1-recovery_tune_ewma_rate);
rtune_avg_count = recovery_count*recovery_tune_ewma_rate +
rtune_avg_count*(1-recovery_tune_ewma_rate);
// client_util = count/interval * usec/1000000.0/count = usec/1000000.0/interval :-)
double client_util = total_client_usec/1000000.0/recovery_tune_interval;
// example:
// total 3 GB/s
// recovery queue 1
// 120 OSDs
// EC 5+3
// 128kb block_size => 640kb object
// 3000*1024/640/120 = 40 MB/s per OSD = 64 recovered objects per OSD
// = 64*8*2 subops = 1024 recovery subop iops
// 8 recovery subop queue
// => subop avg latency = 0.0078125 sec
// utilisation = 8
// target util 1
// intuitively target latency should be 8x of real
// target_lat = rtune_avg_lat * utilisation / target_util
// = rtune_avg_lat * rtune_avg_lat * rtune_avg_iops / target_util
// = 0.0625
// recovery utilisation will be 1
auto client_util = total_client_usec/1000000.0/recovery_tune_interval;
rtune_client_util = rtune_client_util*(1-recovery_tune_ewma_rate) + client_util*recovery_tune_ewma_rate;
rtune_target_util = (rtune_client_util < recovery_tune_min_client_util
? recovery_tune_max_util
@@ -414,19 +421,15 @@ void osd_t::tune_recovery()
(recovery_tune_max_client_util-rtune_client_util)/(recovery_tune_max_client_util-recovery_tune_min_client_util)
)
);
// for example: utilisation = 8, target = 1
// intuitively target latency should be 8x of real
// target_lat = rtune_avg_lat * utilisation / target_util
// = rtune_avg_lat * rtune_avg_lat * rtune_avg_iops / target_util
// = 0.0625
rtune_avg_lat = total_recovery_usec/recovery_count*recovery_tune_ewma_rate + rtune_avg_lat*(1-recovery_tune_ewma_rate);
recovery_target_queue_depth = (int)rtune_target_util + (rtune_target_util < 1 || rtune_target_util-(int)rtune_target_util >= 0.1 ? 1 : 0);
uint64_t target_lat = rtune_avg_lat * rtune_avg_lat/1000000.0*rtune_avg_count/recovery_tune_interval/rtune_target_util;
uint64_t target_lat = rtune_avg_lat * rtune_avg_lat/1000000.0 * recovery_count/recovery_tune_interval / rtune_target_util;
recovery_target_sleep_us = target_lat > rtune_avg_lat+recovery_tune_sleep_min_us ? target_lat-rtune_avg_lat : 0;
if (log_level > 3)
{
printf(
"recovery tune: client util %.2f (ewma %.2f), target util %.2f -> queue %ld, lat %lu us, real %lu us, pause %lu us\n",
client_util, rtune_client_util, rtune_target_util, recovery_target_queue_depth, target_lat, rtune_avg_lat, recovery_target_sleep_us
"recovery tune: cli %lu us, recovery %lu us / %lu ops, target util %.2f -> queue %ld, lat %lu us, real %lu us, delay %lu us\n",
total_client_usec, total_recovery_usec, recovery_count, rtune_target_util, recovery_target_queue_depth, target_lat, rtune_avg_lat, recovery_target_sleep_us
);
}
}
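
Since recovery-related subops are accounted both in stats and in recovery_stats (see the measure_exec() hunk above), the subtraction total_client_usec -= total_recovery_usec leaves only pure client time. The target latency then follows the example in the comment: utilisation is avg_lat * iops, the per-op latency is stretched by utilisation / target_util, and the OSD sleeps for the difference. A small self-contained sketch of that arithmetic, using the numbers assumed in the example comment:

#include <cstdio>

int main()
{
    double avg_lat_us = 7812.5;  // 0.0078125 s per recovery subop
    double subop_iops = 1024;    // recovery subops per second
    double interval_s = 1;       // recovery_tune_interval
    double target_util = 1;      // rtune_target_util
    // utilisation = avg_lat * iops = 8 "busy seconds" of recovery per wall-clock second
    double util = avg_lat_us/1000000.0 * subop_iops / interval_s;
    // same formula as target_lat in tune_recovery(): lat^2/1e6 * count/interval / target_util
    double target_lat_us = avg_lat_us * avg_lat_us/1000000.0 * subop_iops/interval_s / target_util;
    double sleep_us = target_lat_us - avg_lat_us;
    printf("util=%.2f target_lat=%.0f us sleep=%.1f us\n", util, target_lat_us, sleep_us);
    // prints: util=8.00 target_lat=62500 us sleep=54687.5 us
}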

View File

@@ -34,6 +34,7 @@
#define OSD_OP_MAX 18
#define OSD_RW_MAX 64*1024*1024
#define OSD_PROTOCOL_VERSION 1
#define OSD_OP_RECOVERY_RELATED (uint32_t)1
// Memory alignment for direct I/O (usually 512 bytes)
#ifndef DIRECT_IO_ALIGNMENT
@@ -88,7 +89,8 @@ struct __attribute__((__packed__)) osd_op_sec_rw_t
uint32_t len;
// bitmap/attribute length - bitmap comes after header, but before data
uint32_t attr_len;
uint32_t pad0;
// the only possible flag is OSD_OP_RECOVERY_RELATED
uint32_t flags;
};
struct __attribute__((__packed__)) osd_reply_sec_rw_t
@@ -109,6 +111,9 @@ struct __attribute__((__packed__)) osd_op_sec_del_t
object_id oid;
// delete version (automatic or specific)
uint64_t version;
// the only possible flag is OSD_OP_RECOVERY_RELATED
uint32_t flags;
uint32_t pad0;
};
struct __attribute__((__packed__)) osd_reply_sec_del_t
@@ -121,6 +126,9 @@ struct __attribute__((__packed__)) osd_reply_sec_del_t
struct __attribute__((__packed__)) osd_op_sec_sync_t
{
osd_op_header_t header;
// the only possible flag is OSD_OP_RECOVERY_RELATED
uint32_t flags;
uint32_t pad0;
};
struct __attribute__((__packed__)) osd_reply_sec_sync_t
@@ -134,6 +142,9 @@ struct __attribute__((__packed__)) osd_op_sec_stab_t
osd_op_header_t header;
// obj_ver_id array length in bytes
uint64_t len;
// the only possible flag is OSD_OP_RECOVERY_RELATED
uint32_t flags;
uint32_t pad0;
};
typedef osd_op_sec_stab_t osd_op_sec_rollback_t;
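
In osd_op_sec_rw_t the new flags member reuses the former pad0 slot, so the packed read/write request keeps its layout and size; the delete, sync and stabilize requests instead gain a flags + pad0 pair. A self-contained illustration of the first point, using hypothetical reduced structs rather than the real protocol definitions:

#include <cstdint>
#include <cstdio>

struct __attribute__((__packed__)) sec_rw_before_t { uint32_t len, attr_len, pad0; };
struct __attribute__((__packed__)) sec_rw_after_t  { uint32_t len, attr_len, flags; };

int main()
{
    // replacing pad0 with flags changes neither member offsets nor the struct size
    printf("%zu %zu\n", sizeof(sec_rw_before_t), sizeof(sec_rw_after_t)); // prints: 12 12
}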

View File

@@ -221,6 +221,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
.offset = wr ? si->write_start : si->read_start,
.len = subop_len,
.attr_len = wr ? clean_entry_bitmap_size : 0,
.flags = cur_op->peer_fd == SELF_FD && cur_op->req.hdr.opcode != OSD_OP_SCRUB ? OSD_OP_RECOVERY_RELATED : 0,
};
#ifdef OSD_DEBUG
printf(
@@ -300,7 +301,8 @@ void osd_t::handle_primary_bs_subop(osd_op_t *subop)
" retval = "+std::to_string(bs_op->retval)+")"
);
}
add_bs_subop_stats(subop);
bool recovery_related = cur_op->peer_fd == SELF_FD && cur_op->req.hdr.opcode != OSD_OP_SCRUB;
add_bs_subop_stats(subop, recovery_related);
subop->req.hdr.opcode = bs_op_to_osd_op[bs_op->opcode];
subop->reply.hdr.retval = bs_op->retval;
if (bs_op->opcode == BS_OP_READ || bs_op->opcode == BS_OP_WRITE || bs_op->opcode == BS_OP_WRITE_STABLE)
@@ -312,30 +314,33 @@ void osd_t::handle_primary_bs_subop(osd_op_t *subop)
}
delete bs_op;
subop->bs_op = NULL;
subop->peer_fd = -1;
handle_primary_subop(subop, cur_op);
    subop->peer_fd = SELF_FD;
    if (recovery_related && recovery_target_sleep_us)
    {
        tfd->set_timer_us(recovery_target_sleep_us, false, [=](int timer_id)
        {
            handle_primary_subop(subop, cur_op);
        });
    }
    else
    {
        handle_primary_subop(subop, cur_op);
    }
}
void osd_t::add_bs_subop_stats(osd_op_t *subop)
void osd_t::add_bs_subop_stats(osd_op_t *subop, bool recovery_related)
{
// Include local blockstore ops in statistics
uint64_t opcode = bs_op_to_osd_op[subop->bs_op->opcode];
timespec tv_end;
clock_gettime(CLOCK_REALTIME, &tv_end);
msgr.stats.op_stat_count[opcode]++;
if (!msgr.stats.op_stat_count[opcode])
uint64_t len = (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE)
? subop->bs_op->len : 0;
msgr.inc_op_stats(msgr.stats, opcode, subop->tv_begin, tv_end, len);
if (recovery_related)
{
msgr.stats.op_stat_count[opcode] = 1;
msgr.stats.op_stat_sum[opcode] = 0;
msgr.stats.op_stat_bytes[opcode] = 0;
}
msgr.stats.op_stat_sum[opcode] += (
(tv_end.tv_sec - subop->tv_begin.tv_sec)*1000000 +
(tv_end.tv_nsec - subop->tv_begin.tv_nsec)/1000
);
if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE)
{
msgr.stats.op_stat_bytes[opcode] += subop->bs_op->len;
// The subop is recovery-related - account it in recovery_stats as well
msgr.inc_op_stats(msgr.recovery_stats, opcode, subop->tv_begin, tv_end, len);
}
}
@@ -558,6 +563,7 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
},
.oid = chunk.oid,
.version = chunk.version,
.flags = cur_op->peer_fd == SELF_FD && cur_op->req.hdr.opcode != OSD_OP_SCRUB ? OSD_OP_RECOVERY_RELATED : 0,
} };
subops[i].callback = [cur_op, this](osd_op_t *subop)
{
@@ -615,6 +621,7 @@ int osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
.id = msgr.next_subop_id++,
.opcode = OSD_OP_SEC_SYNC,
},
.flags = cur_op->peer_fd == SELF_FD && cur_op->req.hdr.opcode != OSD_OP_SCRUB ? OSD_OP_RECOVERY_RELATED : 0,
} };
subops[i].callback = [cur_op, this](osd_op_t *subop)
{
@@ -674,6 +681,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
.opcode = OSD_OP_SEC_STABILIZE,
},
.len = (uint64_t)(stab_osd.len * sizeof(obj_ver_id)),
.flags = cur_op->peer_fd == SELF_FD && cur_op->req.hdr.opcode != OSD_OP_SCRUB ? OSD_OP_RECOVERY_RELATED : 0,
} };
subops[i].iov.push_back(op_data->unstable_writes + stab_osd.start, stab_osd.len * sizeof(obj_ver_id));
subops[i].callback = [cur_op, this](osd_op_t *subop)

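The same flag-setting expression appears in all four submission paths above (rw, delete, sync, stabilize): a subop is marked recovery-related when the parent op was generated by the OSD itself (peer_fd == SELF_FD) and is not a scrub, i.e. when it comes from recovery or rebalance. A hypothetical helper (not part of the commit) that just restates that condition:

// Hypothetical helper, not in the commit - equivalent to the four .flags initializers above
static inline uint32_t recovery_related_flag(osd_op_t *cur_op)
{
    return (cur_op->peer_fd == SELF_FD && cur_op->req.hdr.opcode != OSD_OP_SCRUB)
        ? OSD_OP_RECOVERY_RELATED : 0;
}
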
View File

@@ -296,7 +296,6 @@ resume_7:
if (!recovery_stat[recovery_type].count) // wrapped
{
memset(&recovery_print_prev[recovery_type], 0, sizeof(recovery_print_prev[recovery_type]));
memset(&rtune_prev_recovery[recovery_type], 0, sizeof(rtune_prev_recovery[recovery_type]));
memset(&recovery_stat[recovery_type], 0, sizeof(recovery_stat[recovery_type]));
recovery_stat[recovery_type].count++;
}

View File

@@ -42,7 +42,21 @@ void osd_t::secondary_op_callback(osd_op_t *op)
int retval = op->bs_op->retval;
delete op->bs_op;
op->bs_op = NULL;
finish_op(op, retval);
    if (op->is_recovery_related() && recovery_target_sleep_us)
    {
        if (!op->tv_end.tv_sec)
        {
            clock_gettime(CLOCK_REALTIME, &op->tv_end);
        }
        tfd->set_timer_us(recovery_target_sleep_us, false, [this, op, retval](int timer_id)
        {
            finish_op(op, retval);
        });
    }
    else
    {
        finish_op(op, retval);
    }
}
void osd_t::exec_secondary(osd_op_t *cur_op)