diff --git a/mon/mon.js b/mon/mon.js index 567b9e90..6a0ec14e 100644 --- a/mon/mon.js +++ b/mon/mon.js @@ -110,6 +110,7 @@ const etcd_tree = { print_stats_interval: 3, slow_log_interval: 10, inode_vanish_time: 60, + auto_scrub: false, scrub_interval: '30d', // 1s/1m/1h/1d scrub_queue_depth: 1, scrub_sleep: 0, // milliseconds @@ -295,7 +296,7 @@ const etcd_tree = { osd_sets: osd_num_t[][], all_peers: osd_num_t[], epoch: uint64_t, - scrub_ts: uint64_t, + next_scrub: uint64_t, }, }, */ }, diff --git a/src/cli_rm_osd.cpp b/src/cli_rm_osd.cpp index 1536fd21..9a4a5a52 100644 --- a/src/cli_rm_osd.cpp +++ b/src/cli_rm_osd.cpp @@ -415,8 +415,8 @@ struct rm_osd_t { "all_peers", pg_cfg.all_peers }, { "osd_sets", pg_cfg.target_history }, }; - if (pg_cfg.scrub_ts) - hist["scrub_ts"] = pg_cfg.scrub_ts; + if (pg_cfg.next_scrub) + hist["next_scrub"] = pg_cfg.next_scrub; history_updates.push_back(json11::Json::object { { "request_put", json11::Json::object { { "key", history_key }, diff --git a/src/etcd_state_client.cpp b/src/etcd_state_client.cpp index eba680b5..1d76b0c8 100644 --- a/src/etcd_state_client.cpp +++ b/src/etcd_state_client.cpp @@ -923,8 +923,8 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv) } // Read epoch pg_cfg.epoch = value["epoch"].uint64_value(); - // Scrub timestamp - pg_cfg.scrub_ts = parse_time(value["scrub_ts"].string_value()); + // Next scrub timestamp (0 or empty = scrub is not needed) + pg_cfg.next_scrub = value["next_scrub"].uint64_value(); if (on_change_pg_history_hook != NULL) { on_change_pg_history_hook(pool_id, pg_num); diff --git a/src/etcd_state_client.h b/src/etcd_state_client.h index cd495368..e8a5a306 100644 --- a/src/etcd_state_client.h +++ b/src/etcd_state_client.h @@ -39,7 +39,7 @@ struct pg_config_t osd_num_t cur_primary; int cur_state; uint64_t epoch; - uint64_t scrub_ts; + uint64_t next_scrub; }; struct pool_config_t diff --git a/src/osd.cpp b/src/osd.cpp index 09dd2521..21ea86dc 100644 --- a/src/osd.cpp +++ b/src/osd.cpp @@ -13,6 +13,7 @@ #include "osd_primary.h" #include "osd.h" #include "http_client.h" +#include "str_util.h" static blockstore_config_t json_to_bs(const json11::Json::object & config) { @@ -207,7 +208,8 @@ void osd_t::parse_config(bool init) inode_vanish_time = config["inode_vanish_time"].uint64_value(); if (!inode_vanish_time) inode_vanish_time = 60; - global_scrub_interval = config["scrub_interval"].uint64_value(); + auto_scrub = json_is_true(config["auto_scrub"]); + global_scrub_interval = parse_time(config["scrub_interval"].string_value()); if (!global_scrub_interval) global_scrub_interval = 30*86400; scrub_queue_depth = config["scrub_queue_depth"].uint64_value(); @@ -330,8 +332,7 @@ void osd_t::exec_op(osd_op_t *cur_op) cur_op->req.hdr.opcode == OSD_OP_DELETE) && (cur_op->req.rw.len > OSD_RW_MAX || cur_op->req.rw.len % bs_bitmap_granularity || - cur_op->req.rw.offset % bs_bitmap_granularity)) || - cur_op->req.hdr.opcode == OSD_OP_SCRUB && cur_op->peer_fd != -1) + cur_op->req.rw.offset % bs_bitmap_granularity))) { // Bad command finish_op(cur_op, -EINVAL); diff --git a/src/osd.h b/src/osd.h index 0c8b6a7b..403badf0 100644 --- a/src/osd.h +++ b/src/osd.h @@ -114,6 +114,7 @@ class osd_t int recovery_sync_batch = DEFAULT_RECOVERY_BATCH; int inode_vanish_time = 60; int log_level = 0; + bool auto_scrub = false; uint64_t global_scrub_interval = 30*86400; uint64_t scrub_queue_depth = 1; uint64_t scrub_sleep_ms = 0; @@ -153,8 +154,8 @@ class osd_t // Scrubbing uint64_t scrub_nearest_ts = 0; int scrub_timer_id = -1; - pool_pg_num_t scrub_last_pg; - osd_op_t *scrub_list_op; + pool_pg_num_t scrub_last_pg = {}; + osd_op_t *scrub_list_op = NULL; pg_list_result_t scrub_cur_list = {}; uint64_t scrub_list_pos = 0; @@ -237,7 +238,7 @@ class osd_t // scrub void scrub_list(pool_pg_num_t pg_id, osd_num_t role_osd, object_id min_oid); - bool pick_next_scrub(object_id & next_oid); + int pick_next_scrub(object_id & next_oid); void submit_scrub_op(object_id oid); bool continue_scrub(); void schedule_scrub(pg_t & pg); diff --git a/src/osd_cluster.cpp b/src/osd_cluster.cpp index df98de06..259936f9 100644 --- a/src/osd_cluster.cpp +++ b/src/osd_cluster.cpp @@ -694,11 +694,10 @@ void osd_t::apply_pg_config() pg_it->second.all_peers == vec_all_peers) { // No change in osd_set and history - if (pg_it->second.scrub_ts != pg_cfg.scrub_ts) + if (pg_it->second.next_scrub != pg_cfg.next_scrub) { - pg_it->second.scrub_ts = pg_cfg.scrub_ts; - peering_state = peering_state | OSD_SCRUBBING; - ringloop->wakeup(); + pg_it->second.next_scrub = pg_cfg.next_scrub; + schedule_scrub(pg_it->second); } continue; } @@ -751,7 +750,7 @@ void osd_t::apply_pg_config() .reported_epoch = pg_cfg.epoch, .target_history = pg_cfg.target_history, .all_peers = vec_all_peers, - .scrub_ts = pg_cfg.scrub_ts, + .next_scrub = pg_cfg.next_scrub, .target_set = pg_cfg.target_set, }; if (pg.scheme == POOL_SCHEME_EC) @@ -892,8 +891,8 @@ void osd_t::report_pg_states() { "all_peers", pg.all_peers }, { "osd_sets", pg.target_history }, }; - if (pg.scrub_ts) - history_value["scrub_ts"] = pg.scrub_ts; + if (pg.next_scrub) + history_value["next_scrub"] = pg.next_scrub; checks.push_back(json11::Json::object { { "target", "MOD" }, { "key", history_key }, diff --git a/src/osd_flush.cpp b/src/osd_flush.cpp index 50fa3f86..4e586e28 100644 --- a/src/osd_flush.cpp +++ b/src/osd_flush.cpp @@ -307,6 +307,7 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op) { printf("Submitting recovery operation for %lx:%lx\n", op->oid.inode, op->oid.stripe); } + op->osd_op->peer_fd = -1; op->osd_op->callback = [this, op](osd_op_t *osd_op) { if (osd_op->reply.hdr.retval < 0) diff --git a/src/osd_peering_pg.h b/src/osd_peering_pg.h index b8c56870..96f96dc1 100644 --- a/src/osd_peering_pg.h +++ b/src/osd_peering_pg.h @@ -95,8 +95,8 @@ struct pg_t // target history and all potential peers std::vector> target_history; std::vector all_peers; - // last scrub time - uint64_t scrub_ts = 0; + // next scrub time + uint64_t next_scrub = 0; bool history_changed = false; // peer list from the last peering event std::vector cur_peers; diff --git a/src/osd_primary_subops.cpp b/src/osd_primary_subops.cpp index 0a5a3ac7..9c8e9751 100644 --- a/src/osd_primary_subops.cpp +++ b/src/osd_primary_subops.cpp @@ -81,7 +81,11 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval) free(cur_op->op_data); cur_op->op_data = NULL; } - if (!cur_op->peer_fd) + cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC; + cur_op->reply.hdr.id = cur_op->req.hdr.id; + cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode; + cur_op->reply.hdr.retval = retval; + if (cur_op->peer_fd == -1) { // Copy lambda to be unaffected by `delete op` std::function(cur_op->callback)(cur_op); @@ -92,10 +96,6 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval) auto cl_it = msgr.clients.find(cur_op->peer_fd); if (cl_it != msgr.clients.end()) { - cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC; - cur_op->reply.hdr.id = cur_op->req.hdr.id; - cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode; - cur_op->reply.hdr.retval = retval; msgr.outbox_push(cur_op); } else @@ -149,22 +149,23 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o } osd_num_t role_osd_num = osd_set[role]; int stripe_num = rep ? 0 : role; + osd_rmw_stripe_t *si = stripes + (submit_type == SUBMIT_SCRUB_READ ? role : stripe_num); if (role_osd_num != 0) { osd_op_t *subop = op_data->subops + i; uint32_t subop_len = wr - ? stripes[stripe_num].write_end - stripes[stripe_num].write_start - : stripes[stripe_num].read_end - stripes[stripe_num].read_start; - if (!wr && stripes[stripe_num].read_end == UINT32_MAX) + ? si->write_end - si->write_start + : si->read_end - si->read_start; + if (!wr && si->read_end == UINT32_MAX) { subop_len = 0; } - stripes[stripe_num].osd_num = role_osd_num; - stripes[stripe_num].read_error = false; - subop->bitmap = stripes[stripe_num].bmp_buf; + si->osd_num = role_osd_num; + si->read_error = false; + subop->bitmap = si->bmp_buf; subop->bitmap_len = clean_entry_bitmap_size; // Using rmw_buf to pass pointer to stripes. Dirty but should work - subop->rmw_buf = stripes+stripe_num; + subop->rmw_buf = si; if (role_osd_num == this->osd_num) { clock_gettime(CLOCK_REALTIME, &subop->tv_begin); @@ -181,11 +182,11 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o .stripe = op_data->oid.stripe | stripe_num, }, .version = op_version, - .offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start, + .offset = wr ? si->write_start : si->read_start, .len = subop_len, }, - .buf = wr ? stripes[stripe_num].write_buf : stripes[stripe_num].read_buf, - .bitmap = stripes[stripe_num].bmp_buf, + .buf = wr ? si->write_buf : si->read_buf, + .bitmap = si->bmp_buf, }); #ifdef OSD_DEBUG printf( @@ -210,7 +211,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o .stripe = op_data->oid.stripe | stripe_num, }, .version = op_version, - .offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start, + .offset = wr ? si->write_start : si->read_start, .len = subop_len, .attr_len = wr ? clean_entry_bitmap_size : 0, }; @@ -223,16 +224,16 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o #endif if (wr) { - if (stripes[stripe_num].write_end > stripes[stripe_num].write_start) + if (si->write_end > si->write_start) { - subop->iov.push_back(stripes[stripe_num].write_buf, stripes[stripe_num].write_end - stripes[stripe_num].write_start); + subop->iov.push_back(si->write_buf, si->write_end - si->write_start); } } else { if (subop_len > 0) { - subop->iov.push_back(stripes[stripe_num].read_buf, subop_len); + subop->iov.push_back(si->read_buf, subop_len); } } subop->callback = [cur_op, this](osd_op_t *subop) @@ -257,7 +258,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o } else { - stripes[stripe_num].osd_num = 0; + si->osd_num = 0; } } return i-subop_idx; diff --git a/src/osd_scrub.cpp b/src/osd_scrub.cpp index 9b0fe15e..7cd2c3a3 100644 --- a/src/osd_scrub.cpp +++ b/src/osd_scrub.cpp @@ -23,7 +23,10 @@ void osd_t::scrub_list(pool_pg_num_t pg_id, osd_num_t role_osd, object_id min_oi if (min_oid.inode != 0 || min_oid.stripe != 0) op->bs_op->min_oid = min_oid; else + { op->bs_op->min_oid.inode = ((uint64_t)pool_id << (64 - POOL_ID_BITS)); + op->bs_op->min_oid.stripe = 0; + } op->bs_op->max_oid.inode = ((uint64_t)(pool_id+1) << (64 - POOL_ID_BITS)) - 1; op->bs_op->max_oid.stripe = UINT64_MAX; op->bs_op->list_stable_limit = scrub_list_limit; @@ -100,7 +103,7 @@ void osd_t::scrub_list(pool_pg_num_t pg_id, osd_num_t role_osd, object_id min_oi } } -bool osd_t::pick_next_scrub(object_id & next_oid) +int osd_t::pick_next_scrub(object_id & next_oid) { if (!pgs.size()) { @@ -110,86 +113,88 @@ bool osd_t::pick_next_scrub(object_id & next_oid) scrub_cur_list = {}; scrub_last_pg = {}; } - return false; + return 0; } timespec tv_now; clock_gettime(CLOCK_REALTIME, &tv_now); bool rescan = scrub_last_pg.pool_id != 0 || scrub_last_pg.pg_num != 0; // Restart scanning from the same PG as the last time auto pg_it = pgs.lower_bound(scrub_last_pg); + if (pg_it == pgs.end() && rescan) + { + pg_it = pgs.begin(); + rescan = false; + } while (pg_it != pgs.end()) { - if (pg_it->second.state & PG_ACTIVE) + if ((pg_it->second.state & PG_ACTIVE) && pg_it->second.next_scrub && pg_it->second.next_scrub < tv_now.tv_sec) { - auto & pool_cfg = st_cli.pool_config.at(pg_it->first.pool_id); - auto interval = pool_cfg.scrub_interval ? pool_cfg.scrub_interval : global_scrub_interval; - if (pg_it->second.scrub_ts < tv_now.tv_sec-interval) + // Continue scrubbing from the next object + if (scrub_last_pg == pg_it->first) { - // Continue scrubbing from the next object - if (scrub_last_pg == pg_it->first) + while (scrub_list_pos < scrub_cur_list.total_count) { - while (scrub_list_pos < scrub_cur_list.total_count) + auto oid = scrub_cur_list.buf[scrub_list_pos].oid; + oid.stripe &= ~STRIPE_MASK; + scrub_list_pos++; + if (recovery_ops.find(oid) == recovery_ops.end() && + scrub_ops.find(oid) == scrub_ops.end() && + pg_it->second.write_queue.find(oid) == pg_it->second.write_queue.end()) { - auto oid = scrub_cur_list.buf[scrub_list_pos].oid; - oid.stripe &= ~STRIPE_MASK; - scrub_list_pos++; - if (recovery_ops.find(oid) == recovery_ops.end() && - scrub_ops.find(oid) == scrub_ops.end()) + next_oid = oid; + if (!(pg_it->second.state & PG_SCRUBBING)) { - next_oid = oid; - if (!(pg_it->second.state & PG_SCRUBBING)) - { - // Currently scrubbing this PG - pg_it->second.state = pg_it->second.state | PG_SCRUBBING; - report_pg_state(pg_it->second); - } - return true; + // Currently scrubbing this PG + pg_it->second.state = pg_it->second.state | PG_SCRUBBING; + report_pg_state(pg_it->second); } + return 2; } } - if (scrub_last_pg == pg_it->first && - scrub_cur_list.total_count && scrub_list_pos >= scrub_cur_list.total_count && - scrub_cur_list.stable_count < scrub_list_limit) + } + if (scrub_last_pg == pg_it->first && + scrub_list_pos >= scrub_cur_list.total_count && + scrub_cur_list.stable_count < scrub_list_limit) + { + // End of the list, mark this PG as scrubbed and go to the next PG + } + else + { + // Continue listing + object_id scrub_last_oid = {}; + if (scrub_last_pg == pg_it->first && scrub_cur_list.stable_count > 0) { - // End of the list, mark this PG as scrubbed and go to the next PG + scrub_last_oid = scrub_cur_list.buf[scrub_cur_list.stable_count-1].oid; + scrub_last_oid.stripe++; } - else + osd_num_t scrub_osd = 0; + for (osd_num_t pg_osd: pg_it->second.cur_set) { - // Continue listing - object_id scrub_last_oid; - if (scrub_last_pg != pg_it->first) - scrub_last_oid = (object_id){}; - else if (scrub_cur_list.stable_count > 0) - { - scrub_last_oid = scrub_cur_list.buf[scrub_cur_list.stable_count-1].oid; - scrub_last_oid.stripe++; - } - osd_num_t scrub_osd = 0; - for (osd_num_t pg_osd: pg_it->second.cur_set) - { - if (pg_osd == this->osd_num || scrub_osd == 0) - scrub_osd = pg_osd; - } - if (!(pg_it->second.state & PG_SCRUBBING)) - { - // Currently scrubbing this PG - pg_it->second.state = pg_it->second.state | PG_SCRUBBING; - report_pg_state(pg_it->second); - } - if (scrub_cur_list.buf) - { - free(scrub_cur_list.buf); - scrub_cur_list = {}; - scrub_last_oid = {}; - } - scrub_last_pg = pg_it->first; - scrub_list(pg_it->first, scrub_osd, scrub_last_oid); - return true; + if (pg_osd == this->osd_num || scrub_osd == 0) + scrub_osd = pg_osd; } + if (!(pg_it->second.state & PG_SCRUBBING)) + { + // Currently scrubbing this PG + pg_it->second.state = pg_it->second.state | PG_SCRUBBING; + report_pg_state(pg_it->second); + } + if (scrub_cur_list.buf) + { + free(scrub_cur_list.buf); + scrub_cur_list = {}; + scrub_list_pos = 0; + } + scrub_last_pg = pg_it->first; + scrub_list(pg_it->first, scrub_osd, scrub_last_oid); + return 1; } if (pg_it->second.state & PG_SCRUBBING) { - pg_it->second.scrub_ts = tv_now.tv_sec; + scrub_last_pg = {}; + auto & pool_cfg = st_cli.pool_config.at(pg_it->first.pool_id); + auto interval = pool_cfg.scrub_interval ? pool_cfg.scrub_interval : global_scrub_interval; + pg_it->second.next_scrub = auto_scrub ? tv_now.tv_sec + interval : 0; pg_it->second.state = pg_it->second.state & ~PG_SCRUBBING; pg_it->second.history_changed = true; report_pg_state(pg_it->second); @@ -211,13 +216,14 @@ bool osd_t::pick_next_scrub(object_id & next_oid) } } // Scanned all PGs - no more scrubs to do - return false; + return 0; } void osd_t::submit_scrub_op(object_id oid) { auto osd_op = new osd_op_t(); osd_op->op_type = OSD_OP_OUT; + osd_op->peer_fd = -1; osd_op->req = (osd_any_op_t){ .rw = { .header = { @@ -249,7 +255,7 @@ void osd_t::submit_scrub_op(object_id oid) } else if (log_level > 2) { - printf("Scrubbed %lx:%lx OK\n", oid.inode, oid.stripe); + printf("Scrubbed %lx:%lx\n", oid.inode, oid.stripe); } delete osd_op; if (scrub_sleep_ms) @@ -282,21 +288,20 @@ bool osd_t::continue_scrub() while (scrub_ops.size() < scrub_queue_depth) { object_id oid; - if (pick_next_scrub(oid)) + int r = pick_next_scrub(oid); + if (r == 2) submit_scrub_op(oid); else - return false; + return r; } return true; } void osd_t::schedule_scrub(pg_t & pg) { - auto & pool_cfg = st_cli.pool_config.at(pg.pool_id); - auto interval = pool_cfg.scrub_interval ? pool_cfg.scrub_interval : global_scrub_interval; - if (!scrub_nearest_ts || scrub_nearest_ts > pg.scrub_ts+interval) + if (pg.next_scrub && (!scrub_nearest_ts || scrub_nearest_ts > pg.next_scrub)) { - scrub_nearest_ts = pg.scrub_ts+interval; + scrub_nearest_ts = pg.next_scrub; timespec tv_now; clock_gettime(CLOCK_REALTIME, &tv_now); if (scrub_timer_id >= 0) @@ -344,6 +349,7 @@ void osd_t::continue_primary_scrub(osd_op_t *cur_op) op_data->degraded = false; for (int role = 0; role < op_data->pg_size; role++) { + op_data->stripes[role].write_buf = NULL; op_data->stripes[role].read_start = 0; op_data->stripes[role].read_end = bs_block_size; if (op_data->prev_set[role] != 0) @@ -417,7 +423,7 @@ resume_2: for (int role = 0; role < op_data->pg_size; role++) { eq_to[role] = -1; - if (op_data->stripes[role].read_end != 0 && !op_data->stripes[role].read_error) + if (op_data->stripes[role].read_end != 0 && !op_data->stripes[role].missing) { total++; eq_to[role] = role; @@ -443,10 +449,10 @@ resume_2: int best = -1; for (int role = 0; role < op_data->pg_size; role++) { - if (best < 0 && votes[role] > 0 || votes[role] > votes[best]) + if (votes[role] > (best >= 0 ? votes[best] : 0)) best = role; } - if (best > 0 && votes[best] < total) + if (best >= 0 && votes[best] < total) { // FIXME Add a flag to allow to skip such objects and not recover them automatically bool unknown = false;