forked from vitalif/vitastor
Compare commits
2 Commits: master ... recovery-a

Author | SHA1 | Date
---|---|---
Vitaliy Filippov | d4ebbeaf5c |
Vitaliy Filippov | bf0c29a46c |
mon/mon.js (10 lines changed)

@@ -110,7 +110,15 @@ const etcd_tree = {
     autosync_interval: 5,
     autosync_writes: 128,
     client_queue_depth: 128, // unused
-    recovery_queue_depth: 4,
+    recovery_queue_depth: 1,
+    recovery_sleep_us: 0,
+    recovery_tune_min_util: 0.1,
+    recovery_tune_min_client_util: 0,
+    recovery_tune_max_util: 1.0,
+    recovery_tune_max_client_util: 0.5,
+    recovery_tune_interval: 1,
+    recovery_tune_ewma_rate: 0.5,
+    recovery_tune_sleep_min_us: 10, // 10 microseconds
     recovery_pg_switch: 128,
     recovery_sync_batch: 16,
     no_recovery: false,
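A way to read these defaults (this restates the interpolation that tune_recovery() applies further down in this diff; the symbols are shorthand for the option names, not code identifiers): with smoothed client utilization $c$, $u_{\min}$ = recovery_tune_min_util, $u_{\max}$ = recovery_tune_max_util, $c_{\min}$ = recovery_tune_min_client_util and $c_{\max}$ = recovery_tune_max_client_util, the target recovery utilization is

$$
u_{target} =
\begin{cases}
u_{\max}, & c < c_{\min} \\
u_{\min}, & c \ge c_{\max} \\
u_{\min} + (u_{\max} - u_{\min})\,\dfrac{c_{\max} - c}{c_{\max} - c_{\min}}, & \text{otherwise.}
\end{cases}
$$

With the defaults (0.1, 1.0, 0, 0.5), an idle OSD lets recovery consume up to one full OSD-second per second, and recovery is throttled down to 10% once clients keep the OSD busy for half of each second or more.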
src/osd.cpp (66 lines changed)

@@ -68,14 +68,21 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
     }
     }

-    print_stats_timer_id = this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
-    {
-        print_stats();
-    });
-    slow_log_timer_id = this->tfd->set_timer(slow_log_interval*1000, true, [this](int timer_id)
-    {
-        print_slow();
-    });
+    if (print_stats_timer_id == -1)
+    {
+        print_stats_timer_id = this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
+        {
+            print_stats();
+        });
+    }
+    if (slow_log_timer_id == -1)
+    {
+        slow_log_timer_id = this->tfd->set_timer(slow_log_interval*1000, true, [this](int timer_id)
+        {
+            print_slow();
+        });
+    }
+    apply_recovery_tune_interval();

     msgr.tfd = this->tfd;
     msgr.ringloop = this->ringloop;
@@ -97,6 +104,11 @@ osd_t::~osd_t()
         tfd->clear_timer(slow_log_timer_id);
         slow_log_timer_id = -1;
     }
+    if (rtune_timer_id >= 0)
+    {
+        tfd->clear_timer(rtune_timer_id);
+        rtune_timer_id = -1;
+    }
     if (print_stats_timer_id >= 0)
     {
         tfd->clear_timer(print_stats_timer_id);
@@ -196,6 +208,22 @@ void osd_t::parse_config(bool init)
     recovery_queue_depth = config["recovery_queue_depth"].uint64_value();
     if (recovery_queue_depth < 1 || recovery_queue_depth > MAX_RECOVERY_QUEUE)
         recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
+    recovery_sleep_us = config["recovery_sleep_us"].uint64_value();
+    recovery_tune_min_util = config["recovery_tune_min_util"].is_null()
+        ? 0.1 : config["recovery_tune_min_util"].number_value();
+    recovery_tune_max_util = config["recovery_tune_max_util"].is_null()
+        ? 1.0 : config["recovery_tune_max_util"].number_value();
+    recovery_tune_min_client_util = config["recovery_tune_min_client_util"].is_null()
+        ? 0 : config["recovery_tune_min_client_util"].number_value();
+    recovery_tune_max_client_util = config["recovery_tune_max_client_util"].is_null()
+        ? 0.5 : config["recovery_tune_max_client_util"].number_value();
+    auto old_recovery_tune_interval = recovery_tune_interval;
+    recovery_tune_interval = config["recovery_tune_interval"].is_null()
+        ? 1 : config["recovery_tune_interval"].uint64_value();
+    recovery_tune_ewma_rate = config["recovery_tune_ewma_rate"].is_null()
+        ? 0.5 : config["recovery_tune_ewma_rate"].number_value();
+    recovery_tune_sleep_min_us = config["recovery_tune_sleep_min_us"].is_null()
+        ? 10 : config["recovery_tune_sleep_min_us"].uint64_value();
     recovery_pg_switch = config["recovery_pg_switch"].uint64_value();
     if (recovery_pg_switch < 1)
         recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;
@@ -274,6 +302,10 @@ void osd_t::parse_config(bool init)
             print_slow();
         });
     }
+    if (old_recovery_tune_interval != recovery_tune_interval)
+    {
+        apply_recovery_tune_interval();
+    }
 }

 void osd_t::bind_socket()
@@ -421,14 +453,6 @@ void osd_t::exec_op(osd_op_t *cur_op)
     }
 }

-void osd_t::reset_stats()
-{
-    msgr.stats = {};
-    prev_stats = {};
-    memset(recovery_stat_count, 0, sizeof(recovery_stat_count));
-    memset(recovery_stat_bytes, 0, sizeof(recovery_stat_bytes));
-}
-
 void osd_t::print_stats()
 {
     for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
@@ -466,19 +490,19 @@ void osd_t::print_stats()
     }
     for (int i = 0; i < 2; i++)
     {
-        if (recovery_stat_count[0][i] != recovery_stat_count[1][i])
+        if (recovery_stat[i].count > recovery_print_prev[i].count)
         {
-            uint64_t bw = (recovery_stat_bytes[0][i] - recovery_stat_bytes[1][i]) / print_stats_interval;
+            uint64_t bw = (recovery_stat[i].bytes - recovery_print_prev[i].bytes) / print_stats_interval;
             printf(
-                "[OSD %lu] %s recovery: %.1f op/s, B/W: %.2f %s\n", osd_num, recovery_stat_names[i],
-                (recovery_stat_count[0][i] - recovery_stat_count[1][i]) * 1.0 / print_stats_interval,
+                "[OSD %lu] %s recovery: %.1f op/s, B/W: %.2f %s, avg lat %ld us\n", osd_num, recovery_stat_names[i],
+                (recovery_stat[i].count - recovery_print_prev[i].count) * 1.0 / print_stats_interval,
                 (bw > 1024*1024*1024 ? bw/1024.0/1024/1024 : (bw > 1024*1024 ? bw/1024.0/1024 : bw/1024.0)),
-                (bw > 1024*1024*1024 ? "GB/s" : (bw > 1024*1024 ? "MB/s" : "KB/s"))
+                (bw > 1024*1024*1024 ? "GB/s" : (bw > 1024*1024 ? "MB/s" : "KB/s")),
+                (recovery_stat[i].usec - recovery_print_prev[i].usec) / (recovery_stat[i].count - recovery_print_prev[i].count)
             );
-            recovery_stat_count[1][i] = recovery_stat_count[0][i];
-            recovery_stat_bytes[1][i] = recovery_stat_bytes[0][i];
         }
     }
+    memcpy(recovery_print_prev, recovery_stat, sizeof(recovery_stat));
     if (corrupted_objects > 0)
     {
         printf("[OSD %lu] %lu object(s) corrupted\n", osd_num, corrupted_objects);
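The recovery_stat counters above are cumulative; print_stats() derives per-interval figures by diffing them against the recovery_print_prev snapshot taken at the previous tick and then refreshing the snapshot. A minimal standalone sketch of that pattern, with hypothetical names and sample values (print_interval, the counter contents) chosen purely for illustration:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

struct recovery_stat_t
{
    uint64_t count, usec, bytes;
};

int main()
{
    // Cumulative counter and the snapshot kept from the previous print tick
    recovery_stat_t cur = { 1500, 3000000, 600ull << 20 };
    recovery_stat_t prev = { 1200, 2400000, 480ull << 20 };
    const int print_interval = 3; // seconds between ticks (hypothetical value)

    if (cur.count > prev.count)
    {
        double iops = (cur.count - prev.count) * 1.0 / print_interval;
        uint64_t bw = (cur.bytes - prev.bytes) / print_interval;
        uint64_t avg_lat = (cur.usec - prev.usec) / (cur.count - prev.count);
        printf("recovery: %.1f op/s, %.2f MB/s, avg lat %lu us\n",
            iops, bw / 1024.0 / 1024, (unsigned long)avg_lat);
    }
    // Remember the current totals for the next tick
    memcpy(&prev, &cur, sizeof(cur));
    return 0;
}
```

The average latency per interval is simply the delta of accumulated microseconds divided by the delta of completed operations, which is exactly the "avg lat" term the patch adds to the log line.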
src/osd.h (34 lines changed)

@@ -34,7 +34,7 @@
 #define DEFAULT_AUTOSYNC_INTERVAL 5
 #define DEFAULT_AUTOSYNC_WRITES 128
 #define MAX_RECOVERY_QUEUE 2048
-#define DEFAULT_RECOVERY_QUEUE 4
+#define DEFAULT_RECOVERY_QUEUE 1
 #define DEFAULT_RECOVERY_PG_SWITCH 128
 #define DEFAULT_RECOVERY_BATCH 16
@@ -87,6 +87,11 @@ struct osd_chain_read_t

 struct osd_rmw_stripe_t;

+struct recovery_stat_t
+{
+    uint64_t count, usec, bytes;
+};
+
 class osd_t
 {
     // config
@@ -111,7 +116,15 @@ class osd_t
     int immediate_commit = IMMEDIATE_NONE;
     int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // "emergency" sync every 5 seconds
     int autosync_writes = DEFAULT_AUTOSYNC_WRITES;
-    int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
+    uint64_t recovery_queue_depth = 1;
+    uint64_t recovery_sleep_us = 0;
+    double recovery_tune_min_util = 0.1;
+    double recovery_tune_min_client_util = 0;
+    double recovery_tune_max_util = 1.0;
+    double recovery_tune_max_client_util = 0.5;
+    int recovery_tune_interval = 1;
+    double recovery_tune_ewma_rate = 0.5;
+    int recovery_tune_sleep_min_us = 10;
     int recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;
     int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
     int inode_vanish_time = 60;
@@ -189,8 +202,17 @@ class osd_t
     std::map<uint64_t, inode_stats_t> inode_stats;
     std::map<uint64_t, timespec> vanishing_inodes;
     const char* recovery_stat_names[2] = { "degraded", "misplaced" };
-    uint64_t recovery_stat_count[2][2] = {};
-    uint64_t recovery_stat_bytes[2][2] = {};
+    recovery_stat_t recovery_stat[2];
+    recovery_stat_t recovery_print_prev[2];
+
+    // recovery auto-tuning
+    int rtune_timer_id = -1;
+    uint64_t rtune_avg_lat = 0;
+    double rtune_client_util = 0, rtune_target_util = 1;
+    osd_op_stats_t rtune_prev_stats;
+    recovery_stat_t rtune_prev_recovery[2];
+    uint64_t recovery_target_queue_depth = 1;
+    uint64_t recovery_target_sleep_us = 0;

     // cluster connection
     void parse_config(bool init);
@@ -208,8 +230,9 @@ class osd_t
     void create_osd_state();
     void renew_lease(bool reload);
     void print_stats();
+    void tune_recovery();
+    void apply_recovery_tune_interval();
     void print_slow();
-    void reset_stats();
     json11::Json get_statistics();
     void report_statistics();
     void report_pg_state(pg_t & pg);
@@ -238,6 +261,7 @@ class osd_t
     bool submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data);
     bool pick_next_recovery(osd_recovery_op_t &op);
     void submit_recovery_op(osd_recovery_op_t *op);
+    void finish_recovery_op(osd_recovery_op_t *op);
     bool continue_recovery();
     pg_osd_set_state_t* change_osd_set(pg_osd_set_state_t *st, pg_t *pg);
@@ -213,12 +213,14 @@ json11::Json osd_t::get_statistics()
     st["subop_stats"] = subop_stats;
     st["recovery_stats"] = json11::Json::object {
         { recovery_stat_names[0], json11::Json::object {
-            { "count", recovery_stat_count[0][0] },
-            { "bytes", recovery_stat_bytes[0][0] },
+            { "count", recovery_stat[0].count },
+            { "bytes", recovery_stat[0].bytes },
+            { "usec", recovery_stat[0].usec },
         } },
         { recovery_stat_names[1], json11::Json::object {
-            { "count", recovery_stat_count[0][1] },
-            { "bytes", recovery_stat_bytes[0][1] },
+            { "count", recovery_stat[1].count },
+            { "bytes", recovery_stat[1].bytes },
+            { "usec", recovery_stat[1].usec },
         } },
     };
     return st;
@@ -325,30 +325,113 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
         {
             printf("Recovery operation done for %lx:%lx\n", op->oid.inode, op->oid.stripe);
         }
-        // CAREFUL! op = &recovery_ops[op->oid]. Don't access op->* after recovery_ops.erase()
-        op->osd_op = NULL;
-        recovery_ops.erase(op->oid);
-        delete osd_op;
-        if (immediate_commit != IMMEDIATE_ALL)
-        {
-            recovery_done++;
-            if (recovery_done >= recovery_sync_batch)
-            {
-                // Force sync every <recovery_sync_batch> operations
-                // This is required not to pile up an excessive amount of delete operations
-                autosync();
-                recovery_done = 0;
-            }
-        }
-        continue_recovery();
+        if (recovery_target_sleep_us)
+        {
+            this->tfd->set_timer_us(recovery_target_sleep_us, false, [this, op](int timer_id)
+            {
+                finish_recovery_op(op);
+            });
+        }
+        else
+        {
+            finish_recovery_op(op);
+        }
     };
     exec_op(op->osd_op);
 }

+void osd_t::apply_recovery_tune_interval()
+{
+    if (rtune_timer_id >= 0)
+    {
+        tfd->clear_timer(rtune_timer_id);
+        rtune_timer_id = -1;
+    }
+    if (recovery_tune_interval != 0)
+    {
+        rtune_timer_id = this->tfd->set_timer(recovery_tune_interval*1000, true, [this](int timer_id)
+        {
+            tune_recovery();
+        });
+    }
+    else
+    {
+        recovery_target_queue_depth = recovery_queue_depth;
+        recovery_target_sleep_us = recovery_sleep_us;
+    }
+}
+
+void osd_t::finish_recovery_op(osd_recovery_op_t *op)
+{
+    // CAREFUL! op = &recovery_ops[op->oid]. Don't access op->* after recovery_ops.erase()
+    delete op->osd_op;
+    op->osd_op = NULL;
+    recovery_ops.erase(op->oid);
+    if (immediate_commit != IMMEDIATE_ALL)
+    {
+        recovery_done++;
+        if (recovery_done >= recovery_sync_batch)
+        {
+            // Force sync every <recovery_sync_batch> operations
+            // This is required not to pile up an excessive amount of delete operations
+            autosync();
+            recovery_done = 0;
+        }
+    }
+    continue_recovery();
+}
+
+void osd_t::tune_recovery()
+{
+    static int total_client_ops[] = { OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC, OSD_OP_DELETE };
+    uint64_t total_client_usec = 0;
+    for (int i = 0; i < sizeof(total_client_ops)/sizeof(total_client_ops[0]); i++)
+    {
+        total_client_usec += (msgr.stats.op_stat_sum[total_client_ops[i]] - rtune_prev_stats.op_stat_sum[total_client_ops[i]]);
+        rtune_prev_stats.op_stat_sum[total_client_ops[i]] = msgr.stats.op_stat_sum[total_client_ops[i]];
+    }
+    uint64_t total_recovery_usec = 0, recovery_count = 0;
+    total_recovery_usec += recovery_stat[0].usec-rtune_prev_recovery[0].usec;
+    total_recovery_usec += recovery_stat[1].usec-rtune_prev_recovery[1].usec;
+    recovery_count += recovery_stat[0].count-rtune_prev_recovery[0].count;
+    recovery_count += recovery_stat[1].count-rtune_prev_recovery[1].count;
+    memcpy(rtune_prev_recovery, recovery_stat, sizeof(recovery_stat));
+    if (recovery_count == 0)
+    {
+        return;
+    }
+    rtune_avg_lat = total_recovery_usec/recovery_count*recovery_tune_ewma_rate +
+        rtune_avg_lat*(1-recovery_tune_ewma_rate);
+    // client_util = count/interval * usec/1000000.0/count = usec/1000000.0/interval :-)
+    double client_util = total_client_usec/1000000.0/recovery_tune_interval;
+    rtune_client_util = rtune_client_util*(1-recovery_tune_ewma_rate) + client_util*recovery_tune_ewma_rate;
+    rtune_target_util = (rtune_client_util < recovery_tune_min_client_util
+        ? recovery_tune_max_util
+        : recovery_tune_min_util + (rtune_client_util >= recovery_tune_max_client_util
+            ? 0 : (recovery_tune_max_util-recovery_tune_min_util)*
+                (recovery_tune_max_client_util-rtune_client_util)/(recovery_tune_max_client_util-recovery_tune_min_client_util)
+        )
+    );
+    recovery_target_queue_depth = (int)rtune_target_util + (rtune_target_util < 1 || rtune_target_util-(int)rtune_target_util >= 0.1 ? 1 : 0);
+    // ideal_iops = 1s / real_latency
+    // target_iops = target_util * ideal_iops
+    // => target_lat = target_queue * 1s / target_iops
+    // => target_lat = target_queue / target_util * real_latency
+    uint64_t target_lat = recovery_target_queue_depth/rtune_target_util * rtune_avg_lat;
+    recovery_target_sleep_us = target_lat > rtune_avg_lat+recovery_tune_sleep_min_us ? target_lat-rtune_avg_lat : 0;
+    if (log_level > 3)
+    {
+        printf(
+            "recovery tune: client util %.2f (ewma %.2f), target util %.2f -> queue %ld, lat %lu us, real %lu us, pause %lu us\n",
+            client_util, rtune_client_util, rtune_target_util, recovery_target_queue_depth, target_lat, rtune_avg_lat, recovery_target_sleep_us
+        );
+    }
+}
+
 // Just trigger write requests for degraded objects. They'll be recovered during writing
 bool osd_t::continue_recovery()
 {
-    while (recovery_ops.size() < recovery_queue_depth)
+    while (recovery_ops.size() < recovery_target_queue_depth)
     {
         osd_recovery_op_t op;
         if (pick_next_recovery(op))
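To make the arithmetic in tune_recovery() easier to follow, here is a standalone sketch of a single tuning step: it keeps only the EWMA smoothing, the utilization interpolation and the sleep derivation, drops the OSD plumbing, and feeds in made-up measurements. The names TuneInput and tune_step and the sample numbers are mine, not part of the patch.

```cpp
#include <cstdint>
#include <cstdio>

// Hypothetical container for one tuning interval's measurements.
struct TuneInput
{
    uint64_t client_usec;    // client op time spent in this interval, microseconds
    uint64_t recovery_usec;  // recovery op time spent in this interval, microseconds
    uint64_t recovery_count; // recovery ops finished in this interval
};

// Simplified single step of the auto-tuner (EWMA + interpolation + sleep derivation).
static void tune_step(const TuneInput & in,
    double & ewma_client_util, uint64_t & ewma_lat_us,
    uint64_t & target_queue, uint64_t & target_sleep_us)
{
    // Tunables, mirroring the defaults added in this patch
    const double min_util = 0.1, max_util = 1.0;
    const double min_client_util = 0, max_client_util = 0.5;
    const double ewma_rate = 0.5;
    const int interval_s = 1, sleep_min_us = 10;

    if (!in.recovery_count)
        return;
    // Smooth recovery latency and client utilization with an EWMA
    ewma_lat_us = in.recovery_usec/in.recovery_count*ewma_rate + ewma_lat_us*(1-ewma_rate);
    double client_util = in.client_usec/1000000.0/interval_s;
    ewma_client_util = ewma_client_util*(1-ewma_rate) + client_util*ewma_rate;
    // Interpolate the target recovery utilization between max_util (idle) and min_util (busy)
    double target_util = ewma_client_util < min_client_util
        ? max_util
        : min_util + (ewma_client_util >= max_client_util
            ? 0 : (max_util-min_util)*(max_client_util-ewma_client_util)/(max_client_util-min_client_util));
    // Queue depth is the target utilization rounded up to at least 1
    target_queue = (int)target_util + (target_util < 1 || target_util-(int)target_util >= 0.1 ? 1 : 0);
    // target_lat = target_queue / target_util * real_latency; pause for the difference
    uint64_t target_lat = target_queue/target_util * ewma_lat_us;
    target_sleep_us = target_lat > ewma_lat_us + sleep_min_us ? target_lat - ewma_lat_us : 0;
}

int main()
{
    double ewma_client_util = 0;
    uint64_t ewma_lat_us = 0, target_queue = 1, target_sleep_us = 0;
    // Example interval: clients busy 0.25 s out of 1 s, 200 recovery ops took 0.4 s in total
    TuneInput in = { 250000, 400000, 200 };
    tune_step(in, ewma_client_util, ewma_lat_us, target_queue, target_sleep_us);
    printf("target queue %lu, sleep %lu us (client util ewma %.2f, avg recovery lat %lu us)\n",
        (unsigned long)target_queue, (unsigned long)target_sleep_us,
        ewma_client_util, (unsigned long)ewma_lat_us);
    return 0;
}
```

With these example numbers the smoothed client utilization is 0.125, the interpolated target utilization about 0.78, so the sketch asks for a queue depth of 1 and roughly a 0.3 ms pause after each recovery operation.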
@@ -3,13 +3,15 @@

 #include "osd_primary.h"

+#define SELF_FD -1
+
 void osd_t::autosync()
 {
     if (immediate_commit != IMMEDIATE_ALL && !autosync_op)
     {
         autosync_op = new osd_op_t();
         autosync_op->op_type = OSD_OP_IN;
-        autosync_op->peer_fd = -1;
+        autosync_op->peer_fd = SELF_FD;
         autosync_op->req = (osd_any_op_t){
             .sync = {
                 .header = {
@@ -85,9 +87,13 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
     cur_op->reply.hdr.id = cur_op->req.hdr.id;
     cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
     cur_op->reply.hdr.retval = retval;
-    if (cur_op->peer_fd == -1)
+    if (cur_op->peer_fd == SELF_FD)
     {
-        msgr.measure_exec(cur_op);
+        // Do not include internal primary writes (recovery/rebalance) into client op statistics
+        if (cur_op->req.hdr.opcode != OSD_OP_WRITE)
+        {
+            msgr.measure_exec(cur_op);
+        }
         // Copy lambda to be unaffected by `delete op`
         std::function<void(osd_op_t*)>(cur_op->callback)(cur_op);
     }
@@ -292,16 +292,27 @@ resume_7:
     {
         {
             int recovery_type = op_data->object_state->state & (OBJ_DEGRADED|OBJ_INCOMPLETE) ? 0 : 1;
-            recovery_stat_count[0][recovery_type]++;
-            if (!recovery_stat_count[0][recovery_type])
+            recovery_stat[recovery_type].count++;
+            if (!recovery_stat[recovery_type].count) // wrapped
             {
-                recovery_stat_count[0][recovery_type]++;
-                recovery_stat_bytes[0][recovery_type] = 0;
+                memset(&recovery_print_prev[recovery_type], 0, sizeof(recovery_print_prev[recovery_type]));
+                memset(&rtune_prev_recovery[recovery_type], 0, sizeof(rtune_prev_recovery[recovery_type]));
+                memset(&recovery_stat[recovery_type], 0, sizeof(recovery_stat[recovery_type]));
+                recovery_stat[recovery_type].count++;
             }
             for (int role = 0; role < (op_data->scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size); role++)
             {
-                recovery_stat_bytes[0][recovery_type] += op_data->stripes[role].write_end - op_data->stripes[role].write_start;
+                recovery_stat[recovery_type].bytes += op_data->stripes[role].write_end - op_data->stripes[role].write_start;
             }
+            if (!cur_op->tv_end.tv_sec)
+            {
+                clock_gettime(CLOCK_REALTIME, &cur_op->tv_end);
+            }
+            uint64_t usec = (
+                (cur_op->tv_end.tv_sec - cur_op->tv_begin.tv_sec)*1000000 +
+                (cur_op->tv_end.tv_nsec - cur_op->tv_begin.tv_nsec)/1000
+            );
+            recovery_stat[recovery_type].usec += usec;
         }
         // Any kind of a non-clean object can have extra chunks, because we don't record objects
        // as degraded & misplaced or incomplete & misplaced at the same time. So try to remove extra chunks
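The usec accounting above subtracts two timespec values taken with clock_gettime(). A minimal self-contained sketch of that calculation, assuming the same arithmetic as the patch; the op_duration_us helper and the usleep stand-in are mine, the patch does this inline on cur_op->tv_begin and cur_op->tv_end:

```cpp
#include <cstdint>
#include <cstdio>
#include <ctime>
#include <unistd.h>

// Difference between two CLOCK_REALTIME timestamps, in microseconds.
// The nanosecond term may be negative; the signed arithmetic still yields the right total.
static uint64_t op_duration_us(const timespec & begin, const timespec & end)
{
    return (end.tv_sec - begin.tv_sec)*1000000 + (end.tv_nsec - begin.tv_nsec)/1000;
}

int main()
{
    timespec tv_begin = {}, tv_end = {};
    clock_gettime(CLOCK_REALTIME, &tv_begin);
    usleep(1500); // stand-in for executing a recovery write
    clock_gettime(CLOCK_REALTIME, &tv_end);
    printf("op took %lu us\n", (unsigned long)op_duration_us(tv_begin, tv_end));
    return 0;
}
```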