Store next scrub timestamp instead of last scrub timestamp

test-double-alloc
Vitaliy Filippov 2023-04-18 02:08:43 +03:00
parent c3bd26193d
commit 3c924397e7
11 changed files with 120 additions and 110 deletions

View File

@ -110,6 +110,7 @@ const etcd_tree = {
print_stats_interval: 3,
slow_log_interval: 10,
inode_vanish_time: 60,
auto_scrub: false,
scrub_interval: '30d', // 1s/1m/1h/1d
scrub_queue_depth: 1,
scrub_sleep: 0, // milliseconds
@ -295,7 +296,7 @@ const etcd_tree = {
osd_sets: osd_num_t[][],
all_peers: osd_num_t[],
epoch: uint64_t,
scrub_ts: uint64_t,
next_scrub: uint64_t,
},
}, */
},

View File

@ -415,8 +415,8 @@ struct rm_osd_t
{ "all_peers", pg_cfg.all_peers },
{ "osd_sets", pg_cfg.target_history },
};
if (pg_cfg.scrub_ts)
hist["scrub_ts"] = pg_cfg.scrub_ts;
if (pg_cfg.next_scrub)
hist["next_scrub"] = pg_cfg.next_scrub;
history_updates.push_back(json11::Json::object {
{ "request_put", json11::Json::object {
{ "key", history_key },

View File

@ -923,8 +923,8 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
}
// Read epoch
pg_cfg.epoch = value["epoch"].uint64_value();
// Scrub timestamp
pg_cfg.scrub_ts = parse_time(value["scrub_ts"].string_value());
// Next scrub timestamp (0 or empty = scrub is not needed)
pg_cfg.next_scrub = value["next_scrub"].uint64_value();
if (on_change_pg_history_hook != NULL)
{
on_change_pg_history_hook(pool_id, pg_num);

View File

@ -39,7 +39,7 @@ struct pg_config_t
osd_num_t cur_primary;
int cur_state;
uint64_t epoch;
uint64_t scrub_ts;
uint64_t next_scrub;
};
struct pool_config_t

View File

@ -13,6 +13,7 @@
#include "osd_primary.h"
#include "osd.h"
#include "http_client.h"
#include "str_util.h"
static blockstore_config_t json_to_bs(const json11::Json::object & config)
{
@ -207,7 +208,8 @@ void osd_t::parse_config(bool init)
inode_vanish_time = config["inode_vanish_time"].uint64_value();
if (!inode_vanish_time)
inode_vanish_time = 60;
global_scrub_interval = config["scrub_interval"].uint64_value();
auto_scrub = json_is_true(config["auto_scrub"]);
global_scrub_interval = parse_time(config["scrub_interval"].string_value());
if (!global_scrub_interval)
global_scrub_interval = 30*86400;
scrub_queue_depth = config["scrub_queue_depth"].uint64_value();
@ -330,8 +332,7 @@ void osd_t::exec_op(osd_op_t *cur_op)
cur_op->req.hdr.opcode == OSD_OP_DELETE) &&
(cur_op->req.rw.len > OSD_RW_MAX ||
cur_op->req.rw.len % bs_bitmap_granularity ||
cur_op->req.rw.offset % bs_bitmap_granularity)) ||
cur_op->req.hdr.opcode == OSD_OP_SCRUB && cur_op->peer_fd != -1)
cur_op->req.rw.offset % bs_bitmap_granularity)))
{
// Bad command
finish_op(cur_op, -EINVAL);

View File

@ -114,6 +114,7 @@ class osd_t
int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
int inode_vanish_time = 60;
int log_level = 0;
bool auto_scrub = false;
uint64_t global_scrub_interval = 30*86400;
uint64_t scrub_queue_depth = 1;
uint64_t scrub_sleep_ms = 0;
@ -153,8 +154,8 @@ class osd_t
// Scrubbing
uint64_t scrub_nearest_ts = 0;
int scrub_timer_id = -1;
pool_pg_num_t scrub_last_pg;
osd_op_t *scrub_list_op;
pool_pg_num_t scrub_last_pg = {};
osd_op_t *scrub_list_op = NULL;
pg_list_result_t scrub_cur_list = {};
uint64_t scrub_list_pos = 0;
@ -237,7 +238,7 @@ class osd_t
// scrub
void scrub_list(pool_pg_num_t pg_id, osd_num_t role_osd, object_id min_oid);
bool pick_next_scrub(object_id & next_oid);
int pick_next_scrub(object_id & next_oid);
void submit_scrub_op(object_id oid);
bool continue_scrub();
void schedule_scrub(pg_t & pg);

View File

@ -694,11 +694,10 @@ void osd_t::apply_pg_config()
pg_it->second.all_peers == vec_all_peers)
{
// No change in osd_set and history
if (pg_it->second.scrub_ts != pg_cfg.scrub_ts)
if (pg_it->second.next_scrub != pg_cfg.next_scrub)
{
pg_it->second.scrub_ts = pg_cfg.scrub_ts;
peering_state = peering_state | OSD_SCRUBBING;
ringloop->wakeup();
pg_it->second.next_scrub = pg_cfg.next_scrub;
schedule_scrub(pg_it->second);
}
continue;
}
@ -751,7 +750,7 @@ void osd_t::apply_pg_config()
.reported_epoch = pg_cfg.epoch,
.target_history = pg_cfg.target_history,
.all_peers = vec_all_peers,
.scrub_ts = pg_cfg.scrub_ts,
.next_scrub = pg_cfg.next_scrub,
.target_set = pg_cfg.target_set,
};
if (pg.scheme == POOL_SCHEME_EC)
@ -892,8 +891,8 @@ void osd_t::report_pg_states()
{ "all_peers", pg.all_peers },
{ "osd_sets", pg.target_history },
};
if (pg.scrub_ts)
history_value["scrub_ts"] = pg.scrub_ts;
if (pg.next_scrub)
history_value["next_scrub"] = pg.next_scrub;
checks.push_back(json11::Json::object {
{ "target", "MOD" },
{ "key", history_key },

View File

@ -307,6 +307,7 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
{
printf("Submitting recovery operation for %lx:%lx\n", op->oid.inode, op->oid.stripe);
}
op->osd_op->peer_fd = -1;
op->osd_op->callback = [this, op](osd_op_t *osd_op)
{
if (osd_op->reply.hdr.retval < 0)

View File

@ -95,8 +95,8 @@ struct pg_t
// target history and all potential peers
std::vector<std::vector<osd_num_t>> target_history;
std::vector<osd_num_t> all_peers;
// last scrub time
uint64_t scrub_ts = 0;
// next scrub time
uint64_t next_scrub = 0;
bool history_changed = false;
// peer list from the last peering event
std::vector<osd_num_t> cur_peers;

View File

@ -81,7 +81,11 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
free(cur_op->op_data);
cur_op->op_data = NULL;
}
if (!cur_op->peer_fd)
cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
cur_op->reply.hdr.id = cur_op->req.hdr.id;
cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
cur_op->reply.hdr.retval = retval;
if (cur_op->peer_fd == -1)
{
// Copy lambda to be unaffected by `delete op`
std::function<void(osd_op_t*)>(cur_op->callback)(cur_op);
@ -92,10 +96,6 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
auto cl_it = msgr.clients.find(cur_op->peer_fd);
if (cl_it != msgr.clients.end())
{
cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
cur_op->reply.hdr.id = cur_op->req.hdr.id;
cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
cur_op->reply.hdr.retval = retval;
msgr.outbox_push(cur_op);
}
else
@ -149,22 +149,23 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
}
osd_num_t role_osd_num = osd_set[role];
int stripe_num = rep ? 0 : role;
osd_rmw_stripe_t *si = stripes + (submit_type == SUBMIT_SCRUB_READ ? role : stripe_num);
if (role_osd_num != 0)
{
osd_op_t *subop = op_data->subops + i;
uint32_t subop_len = wr
? stripes[stripe_num].write_end - stripes[stripe_num].write_start
: stripes[stripe_num].read_end - stripes[stripe_num].read_start;
if (!wr && stripes[stripe_num].read_end == UINT32_MAX)
? si->write_end - si->write_start
: si->read_end - si->read_start;
if (!wr && si->read_end == UINT32_MAX)
{
subop_len = 0;
}
stripes[stripe_num].osd_num = role_osd_num;
stripes[stripe_num].read_error = false;
subop->bitmap = stripes[stripe_num].bmp_buf;
si->osd_num = role_osd_num;
si->read_error = false;
subop->bitmap = si->bmp_buf;
subop->bitmap_len = clean_entry_bitmap_size;
// Using rmw_buf to pass pointer to stripes. Dirty but should work
subop->rmw_buf = stripes+stripe_num;
subop->rmw_buf = si;
if (role_osd_num == this->osd_num)
{
clock_gettime(CLOCK_REALTIME, &subop->tv_begin);
@ -181,11 +182,11 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
.stripe = op_data->oid.stripe | stripe_num,
},
.version = op_version,
.offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start,
.offset = wr ? si->write_start : si->read_start,
.len = subop_len,
},
.buf = wr ? stripes[stripe_num].write_buf : stripes[stripe_num].read_buf,
.bitmap = stripes[stripe_num].bmp_buf,
.buf = wr ? si->write_buf : si->read_buf,
.bitmap = si->bmp_buf,
});
#ifdef OSD_DEBUG
printf(
@ -210,7 +211,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
.stripe = op_data->oid.stripe | stripe_num,
},
.version = op_version,
.offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start,
.offset = wr ? si->write_start : si->read_start,
.len = subop_len,
.attr_len = wr ? clean_entry_bitmap_size : 0,
};
@ -223,16 +224,16 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
#endif
if (wr)
{
if (stripes[stripe_num].write_end > stripes[stripe_num].write_start)
if (si->write_end > si->write_start)
{
subop->iov.push_back(stripes[stripe_num].write_buf, stripes[stripe_num].write_end - stripes[stripe_num].write_start);
subop->iov.push_back(si->write_buf, si->write_end - si->write_start);
}
}
else
{
if (subop_len > 0)
{
subop->iov.push_back(stripes[stripe_num].read_buf, subop_len);
subop->iov.push_back(si->read_buf, subop_len);
}
}
subop->callback = [cur_op, this](osd_op_t *subop)
@ -257,7 +258,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
}
else
{
stripes[stripe_num].osd_num = 0;
si->osd_num = 0;
}
}
return i-subop_idx;

View File

@ -23,7 +23,10 @@ void osd_t::scrub_list(pool_pg_num_t pg_id, osd_num_t role_osd, object_id min_oi
if (min_oid.inode != 0 || min_oid.stripe != 0)
op->bs_op->min_oid = min_oid;
else
{
op->bs_op->min_oid.inode = ((uint64_t)pool_id << (64 - POOL_ID_BITS));
op->bs_op->min_oid.stripe = 0;
}
op->bs_op->max_oid.inode = ((uint64_t)(pool_id+1) << (64 - POOL_ID_BITS)) - 1;
op->bs_op->max_oid.stripe = UINT64_MAX;
op->bs_op->list_stable_limit = scrub_list_limit;
@ -100,7 +103,7 @@ void osd_t::scrub_list(pool_pg_num_t pg_id, osd_num_t role_osd, object_id min_oi
}
}
bool osd_t::pick_next_scrub(object_id & next_oid)
int osd_t::pick_next_scrub(object_id & next_oid)
{
if (!pgs.size())
{
@ -110,86 +113,88 @@ bool osd_t::pick_next_scrub(object_id & next_oid)
scrub_cur_list = {};
scrub_last_pg = {};
}
return false;
return 0;
}
timespec tv_now;
clock_gettime(CLOCK_REALTIME, &tv_now);
bool rescan = scrub_last_pg.pool_id != 0 || scrub_last_pg.pg_num != 0;
// Restart scanning from the same PG as the last time
auto pg_it = pgs.lower_bound(scrub_last_pg);
if (pg_it == pgs.end() && rescan)
{
pg_it = pgs.begin();
rescan = false;
}
while (pg_it != pgs.end())
{
if (pg_it->second.state & PG_ACTIVE)
if ((pg_it->second.state & PG_ACTIVE) && pg_it->second.next_scrub && pg_it->second.next_scrub < tv_now.tv_sec)
{
auto & pool_cfg = st_cli.pool_config.at(pg_it->first.pool_id);
auto interval = pool_cfg.scrub_interval ? pool_cfg.scrub_interval : global_scrub_interval;
if (pg_it->second.scrub_ts < tv_now.tv_sec-interval)
// Continue scrubbing from the next object
if (scrub_last_pg == pg_it->first)
{
// Continue scrubbing from the next object
if (scrub_last_pg == pg_it->first)
while (scrub_list_pos < scrub_cur_list.total_count)
{
while (scrub_list_pos < scrub_cur_list.total_count)
auto oid = scrub_cur_list.buf[scrub_list_pos].oid;
oid.stripe &= ~STRIPE_MASK;
scrub_list_pos++;
if (recovery_ops.find(oid) == recovery_ops.end() &&
scrub_ops.find(oid) == scrub_ops.end() &&
pg_it->second.write_queue.find(oid) == pg_it->second.write_queue.end())
{
auto oid = scrub_cur_list.buf[scrub_list_pos].oid;
oid.stripe &= ~STRIPE_MASK;
scrub_list_pos++;
if (recovery_ops.find(oid) == recovery_ops.end() &&
scrub_ops.find(oid) == scrub_ops.end())
next_oid = oid;
if (!(pg_it->second.state & PG_SCRUBBING))
{
next_oid = oid;
if (!(pg_it->second.state & PG_SCRUBBING))
{
// Currently scrubbing this PG
pg_it->second.state = pg_it->second.state | PG_SCRUBBING;
report_pg_state(pg_it->second);
}
return true;
// Currently scrubbing this PG
pg_it->second.state = pg_it->second.state | PG_SCRUBBING;
report_pg_state(pg_it->second);
}
return 2;
}
}
if (scrub_last_pg == pg_it->first &&
scrub_cur_list.total_count && scrub_list_pos >= scrub_cur_list.total_count &&
scrub_cur_list.stable_count < scrub_list_limit)
}
if (scrub_last_pg == pg_it->first &&
scrub_list_pos >= scrub_cur_list.total_count &&
scrub_cur_list.stable_count < scrub_list_limit)
{
// End of the list, mark this PG as scrubbed and go to the next PG
}
else
{
// Continue listing
object_id scrub_last_oid = {};
if (scrub_last_pg == pg_it->first && scrub_cur_list.stable_count > 0)
{
// End of the list, mark this PG as scrubbed and go to the next PG
scrub_last_oid = scrub_cur_list.buf[scrub_cur_list.stable_count-1].oid;
scrub_last_oid.stripe++;
}
else
osd_num_t scrub_osd = 0;
for (osd_num_t pg_osd: pg_it->second.cur_set)
{
// Continue listing
object_id scrub_last_oid;
if (scrub_last_pg != pg_it->first)
scrub_last_oid = (object_id){};
else if (scrub_cur_list.stable_count > 0)
{
scrub_last_oid = scrub_cur_list.buf[scrub_cur_list.stable_count-1].oid;
scrub_last_oid.stripe++;
}
osd_num_t scrub_osd = 0;
for (osd_num_t pg_osd: pg_it->second.cur_set)
{
if (pg_osd == this->osd_num || scrub_osd == 0)
scrub_osd = pg_osd;
}
if (!(pg_it->second.state & PG_SCRUBBING))
{
// Currently scrubbing this PG
pg_it->second.state = pg_it->second.state | PG_SCRUBBING;
report_pg_state(pg_it->second);
}
if (scrub_cur_list.buf)
{
free(scrub_cur_list.buf);
scrub_cur_list = {};
scrub_last_oid = {};
}
scrub_last_pg = pg_it->first;
scrub_list(pg_it->first, scrub_osd, scrub_last_oid);
return true;
if (pg_osd == this->osd_num || scrub_osd == 0)
scrub_osd = pg_osd;
}
if (!(pg_it->second.state & PG_SCRUBBING))
{
// Currently scrubbing this PG
pg_it->second.state = pg_it->second.state | PG_SCRUBBING;
report_pg_state(pg_it->second);
}
if (scrub_cur_list.buf)
{
free(scrub_cur_list.buf);
scrub_cur_list = {};
scrub_list_pos = 0;
}
scrub_last_pg = pg_it->first;
scrub_list(pg_it->first, scrub_osd, scrub_last_oid);
return 1;
}
if (pg_it->second.state & PG_SCRUBBING)
{
pg_it->second.scrub_ts = tv_now.tv_sec;
scrub_last_pg = {};
auto & pool_cfg = st_cli.pool_config.at(pg_it->first.pool_id);
auto interval = pool_cfg.scrub_interval ? pool_cfg.scrub_interval : global_scrub_interval;
pg_it->second.next_scrub = auto_scrub ? tv_now.tv_sec + interval : 0;
pg_it->second.state = pg_it->second.state & ~PG_SCRUBBING;
pg_it->second.history_changed = true;
report_pg_state(pg_it->second);
@ -211,13 +216,14 @@ bool osd_t::pick_next_scrub(object_id & next_oid)
}
}
// Scanned all PGs - no more scrubs to do
return false;
return 0;
}
void osd_t::submit_scrub_op(object_id oid)
{
auto osd_op = new osd_op_t();
osd_op->op_type = OSD_OP_OUT;
osd_op->peer_fd = -1;
osd_op->req = (osd_any_op_t){
.rw = {
.header = {
@ -249,7 +255,7 @@ void osd_t::submit_scrub_op(object_id oid)
}
else if (log_level > 2)
{
printf("Scrubbed %lx:%lx OK\n", oid.inode, oid.stripe);
printf("Scrubbed %lx:%lx\n", oid.inode, oid.stripe);
}
delete osd_op;
if (scrub_sleep_ms)
@ -282,21 +288,20 @@ bool osd_t::continue_scrub()
while (scrub_ops.size() < scrub_queue_depth)
{
object_id oid;
if (pick_next_scrub(oid))
int r = pick_next_scrub(oid);
if (r == 2)
submit_scrub_op(oid);
else
return false;
return r;
}
return true;
}
void osd_t::schedule_scrub(pg_t & pg)
{
auto & pool_cfg = st_cli.pool_config.at(pg.pool_id);
auto interval = pool_cfg.scrub_interval ? pool_cfg.scrub_interval : global_scrub_interval;
if (!scrub_nearest_ts || scrub_nearest_ts > pg.scrub_ts+interval)
if (pg.next_scrub && (!scrub_nearest_ts || scrub_nearest_ts > pg.next_scrub))
{
scrub_nearest_ts = pg.scrub_ts+interval;
scrub_nearest_ts = pg.next_scrub;
timespec tv_now;
clock_gettime(CLOCK_REALTIME, &tv_now);
if (scrub_timer_id >= 0)
@ -344,6 +349,7 @@ void osd_t::continue_primary_scrub(osd_op_t *cur_op)
op_data->degraded = false;
for (int role = 0; role < op_data->pg_size; role++)
{
op_data->stripes[role].write_buf = NULL;
op_data->stripes[role].read_start = 0;
op_data->stripes[role].read_end = bs_block_size;
if (op_data->prev_set[role] != 0)
@ -417,7 +423,7 @@ resume_2:
for (int role = 0; role < op_data->pg_size; role++)
{
eq_to[role] = -1;
if (op_data->stripes[role].read_end != 0 && !op_data->stripes[role].read_error)
if (op_data->stripes[role].read_end != 0 && !op_data->stripes[role].missing)
{
total++;
eq_to[role] = role;
@ -443,10 +449,10 @@ resume_2:
int best = -1;
for (int role = 0; role < op_data->pg_size; role++)
{
if (best < 0 && votes[role] > 0 || votes[role] > votes[best])
if (votes[role] > (best >= 0 ? votes[best] : 0))
best = role;
}
if (best > 0 && votes[best] < total)
if (best >= 0 && votes[best] < total)
{
// FIXME Add a flag to allow to skip such objects and not recover them automatically
bool unknown = false;