// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)

#include "osd_primary.h"

// Sentinel peer_fd meaning "this op targets the local blockstore, not a network peer"
#define SELF_FD -1

// Request the next batch of object listings for PG <pg_id> starting from <min_oid>.
// If <role_osd> is this OSD, the listing is served by the local blockstore (BS_OP_LIST);
// otherwise an OSD_OP_SEC_LIST request is sent to the peer. Exactly one listing may be
// in flight at a time (tracked via scrub_list_op); the callback stores the result into
// scrub_cur_list and resumes the scrub loop via continue_scrub().
void osd_t::scrub_list(pool_pg_num_t pg_id, osd_num_t role_osd, object_id min_oid)
{
    pool_id_t pool_id = pg_id.pool_id;
    pg_num_t pg_num = pg_id.pg_num;
    // Only one listing operation may be active at a time
    assert(!scrub_list_op);
    if (role_osd == this->osd_num)
    {
        // Self: list objects directly from the local blockstore
        osd_op_t *op = new osd_op_t();
        op->op_type = 0;
        op->peer_fd = SELF_FD;
        op->tv_begin is; // NOTE: see below — tv_begin is set for add_bs_subop_stats()
        clock_gettime(CLOCK_REALTIME, &op->tv_begin);
        op->bs_op = new blockstore_op_t();
        op->bs_op->opcode = BS_OP_LIST;
        op->bs_op->pg_alignment = st_cli.pool_config[pool_id].pg_stripe_size;
        if (min_oid.inode != 0 || min_oid.stripe != 0)
            op->bs_op->min_oid = min_oid;
        else
        {
            // No explicit starting point - start from the first possible inode of this pool.
            // Pool ID is stored in the upper POOL_ID_BITS of the inode number.
            op->bs_op->min_oid.inode = ((uint64_t)pool_id << (64 - POOL_ID_BITS));
            op->bs_op->min_oid.stripe = 0;
        }
        // End of range = last possible object of this pool
        op->bs_op->max_oid.inode = ((uint64_t)(pool_id+1) << (64 - POOL_ID_BITS)) - 1;
        op->bs_op->max_oid.stripe = UINT64_MAX;
        // Listings are paginated: at most scrub_list_limit stable entries per request
        op->bs_op->list_stable_limit = scrub_list_limit;
        op->bs_op->pg_count = pg_counts[pool_id];
        // Blockstore numbers PGs from 0, cluster-level PG numbers start from 1
        op->bs_op->pg_number = pg_num-1;
        op->bs_op->callback = [this, op](blockstore_op_t *bs_op)
        {
            scrub_list_op = NULL;
            if (op->bs_op->retval < 0)
            {
                // A failed local listing is fatal - stop the OSD
                printf("Local OP_LIST failed: retval=%d\n", op->bs_op->retval);
                force_stop(1);
                return;
            }
            add_bs_subop_stats(op);
            // retval = total number of entries, bs_op->version = number of stable entries
            scrub_cur_list = {
                .buf = (obj_ver_id*)op->bs_op->buf,
                .total_count = (uint64_t)op->bs_op->retval,
                .stable_count = op->bs_op->version,
            };
            delete op->bs_op;
            op->bs_op = NULL;
            delete op;
            continue_scrub();
        };
        scrub_list_op = op;
        bs->enqueue_op(op->bs_op);
    }
    else
    {
        // Peer: request the listing over the network
        osd_op_t *op = new osd_op_t();
        op->op_type = OSD_OP_OUT;
        op->peer_fd = msgr.osd_peer_fds.at(role_osd);
        op->req = (osd_any_op_t){
            .sec_list = {
                .header = {
                    .magic = SECONDARY_OSD_OP_MAGIC,
                    .id = msgr.next_subop_id++,
                    .opcode = OSD_OP_SEC_LIST,
                },
                .list_pg = pg_num,
                .pg_count = pg_counts[pool_id],
                .pg_stripe_size = st_cli.pool_config[pool_id].pg_stripe_size,
                // Same pool-range logic as the local branch above
                .min_inode = min_oid.inode ? min_oid.inode : ((uint64_t)(pool_id) << (64 - POOL_ID_BITS)),
                .max_inode = ((uint64_t)(pool_id+1) << (64 - POOL_ID_BITS)) - 1,
                .min_stripe = min_oid.stripe,
                .stable_limit = scrub_list_limit,
            },
        };
        op->callback = [this, role_osd](osd_op_t *op)
        {
            scrub_list_op = NULL;
            if (op->reply.hdr.retval < 0)
            {
                // Treat a failed listing as a peer failure and disconnect it.
                // The scrub will be retried when the PG repeers.
                printf("Failed to get object list from OSD %ju (retval=%jd), disconnecting peer\n", role_osd, op->reply.hdr.retval);
                int fail_fd = op->peer_fd;
                delete op;
                msgr.stop_client(fail_fd);
                return;
            }
            scrub_cur_list = {
                .buf = (obj_ver_id*)op->buf,
                .total_count = (uint64_t)op->reply.hdr.retval,
                .stable_count = op->reply.sec_list.stable_count,
            };
            // set op->buf to NULL so it doesn't get freed - scrub_cur_list now owns the buffer
            op->buf = NULL;
            delete op;
            continue_scrub();
        };
        scrub_list_op = op;
        msgr.outbox_push(op);
    }
}

// Pick the next object to scrub, scanning PGs in order and resuming from
// scrub_last_pg / scrub_list_pos between calls.
// Returns:
//   0 - no more scrub work at all (all PGs scanned)
//   1 - waiting (a listing was just submitted or is already in flight)
//   2 - next_oid was filled with an object to scrub
int osd_t::pick_next_scrub(object_id & next_oid)
{
    if (!pgs.size())
    {
        // No PGs - drop any leftover listing state
        if (scrub_cur_list.buf)
        {
            free(scrub_cur_list.buf);
            scrub_cur_list = {};
            scrub_last_pg = {};
        }
        return 0;
    }
    if (scrub_list_op)
    {
        // A listing is already in flight - wait for it
        return 1;
    }
    timespec tv_now;
    clock_gettime(CLOCK_REALTIME, &tv_now);
    // If we resume from a remembered PG (non-zero scrub_last_pg), PGs before it are
    // skipped by lower_bound, so one wrap-around rescan is needed to cover them
    bool rescan = scrub_last_pg.pool_id != 0 || scrub_last_pg.pg_num != 0;
    // Restart scanning from the same PG as the last time
    auto pg_it = pgs.lower_bound(scrub_last_pg);
    if (pg_it == pgs.end() && rescan)
    {
        pg_it = pgs.begin();
        rescan = false;
    }
    while (pg_it != pgs.end())
    {
        // Only scrub active PGs whose scheduled scrub time has come
        if ((pg_it->second.state & PG_ACTIVE) && pg_it->second.next_scrub && pg_it->second.next_scrub <= tv_now.tv_sec)
        {
            // Continue scrubbing from the next object
            if (scrub_last_pg == pg_it->first)
            {
                // Walk the current listing batch
                while (scrub_list_pos < scrub_cur_list.total_count)
                {
                    auto oid = scrub_cur_list.buf[scrub_list_pos].oid;
                    oid.stripe &= ~STRIPE_MASK;
                    scrub_list_pos++;
                    // Skip objects that are already being recovered, scrubbed or written to
                    if (recovery_ops.find(oid) == recovery_ops.end() &&
                        scrub_ops.find(oid) == scrub_ops.end() &&
                        pg_it->second.write_queue.find(oid) == pg_it->second.write_queue.end())
                    {
                        next_oid = oid;
                        if (!(pg_it->second.state & PG_SCRUBBING))
                        {
                            // Mark this PG as currently being scrubbed
                            pg_it->second.state = pg_it->second.state | PG_SCRUBBING;
                            report_pg_state(pg_it->second);
                        }
                        return 2;
                    }
                }
            }
            // A batch shorter than scrub_list_limit means the listing is exhausted
            if (scrub_last_pg == pg_it->first &&
                scrub_list_pos >= scrub_cur_list.total_count &&
                scrub_cur_list.stable_count < scrub_list_limit)
            {
                // End of the list, mark this PG as scrubbed and go to the next PG
            }
            else
            {
                // Continue listing from the object following the last stable entry
                object_id scrub_last_oid = {};
                if (scrub_last_pg == pg_it->first && scrub_cur_list.stable_count > 0)
                {
                    scrub_last_oid = scrub_cur_list.buf[scrub_cur_list.stable_count-1].oid;
                    scrub_last_oid.stripe++;
                }
                // Prefer to list from self if this OSD is in the PG's current set,
                // otherwise use the first OSD of the set
                osd_num_t scrub_osd = 0;
                for (osd_num_t pg_osd: pg_it->second.cur_set)
                {
                    if (pg_osd == this->osd_num || scrub_osd == 0)
                        scrub_osd = pg_osd;
                }
                if (!(pg_it->second.state & PG_SCRUBBING))
                {
                    // Mark this PG as currently being scrubbed
                    pg_it->second.state = pg_it->second.state | PG_SCRUBBING;
                    report_pg_state(pg_it->second);
                }
                // Free the previous (fully consumed) batch before requesting the next one
                if (scrub_cur_list.buf)
                {
                    free(scrub_cur_list.buf);
                    scrub_cur_list = {};
                    scrub_list_pos = 0;
                }
                scrub_last_pg = pg_it->first;
                scrub_list(pg_it->first, scrub_osd, scrub_last_oid);
                return 1;
            }
            // PG is fully scrubbed - clear the scrubbing flag and reset its schedule
            if (pg_it->second.state & PG_SCRUBBING)
            {
                scrub_last_pg = {};
                pg_it->second.state = pg_it->second.state & ~PG_SCRUBBING;
                pg_it->second.next_scrub = 0;
                pg_it->second.history_changed = true;
                report_pg_state(pg_it->second);
            }
            // The list is definitely not needed anymore
            if (scrub_cur_list.buf)
            {
                free(scrub_cur_list.buf);
                scrub_cur_list = {};
            }
        }
        pg_it++;
        if (pg_it == pgs.end() && rescan)
        {
            // Scan one more time to guarantee that there are no PGs to scrub
            pg_it = pgs.begin();
            rescan = false;
        }
    }
    // Scanned all PGs - no more scrubs to do
    return 0;
}

// Submit a primary OSD_OP_SCRUB operation for one object.
// The op is registered in scrub_ops; on completion it's removed (optionally after a
// scrub_sleep_ms throttling delay) and the scrub loop is resumed.
void osd_t::submit_scrub_op(object_id oid)
{
    auto osd_op = new osd_op_t();
    osd_op->op_type = OSD_OP_OUT;
    // Self-directed primary op - not tied to a peer connection
    osd_op->peer_fd = -1;
    osd_op->req = (osd_any_op_t){
        .rw = {
            .header = {
                .magic = SECONDARY_OSD_OP_MAGIC,
                .id = 1,
                .opcode = OSD_OP_SCRUB,
            },
            .inode = oid.inode,
            .offset = oid.stripe,
            .len = 0,
        },
    };
    if (log_level > 2)
    {
        printf("Submitting scrub for %jx:%jx\n", oid.inode, oid.stripe);
    }
    osd_op->callback = [this](osd_op_t *osd_op)
    {
        // Recover the object ID from the request (the op owns no other reference to it)
        object_id oid = { .inode = osd_op->req.rw.inode, .stripe = osd_op->req.rw.offset };
        if (osd_op->reply.hdr.retval < 0 && osd_op->reply.hdr.retval != -ENOENT)
        {
            // Scrub error (-ENOENT just means the object disappeared, which is OK)
            printf(
                "Scrub failed with object %jx:%jx (PG %u/%u): error %jd\n",
                oid.inode, oid.stripe, INODE_POOL(oid.inode),
                map_to_pg(oid, st_cli.pool_config.at(INODE_POOL(oid.inode)).pg_stripe_size),
                osd_op->reply.hdr.retval
            );
        }
        else if (log_level > 2)
        {
            printf("Scrubbed %jx:%jx\n", oid.inode, oid.stripe);
        }
        delete osd_op;
        if (scrub_sleep_ms)
        {
            // Throttle: keep the object in scrub_ops for scrub_sleep_ms more milliseconds
            // so the queue-depth check in continue_scrub() delays the next scrub
            this->tfd->set_timer(scrub_sleep_ms, false, [this, oid](int timer_id)
            {
                scrub_ops.erase(oid);
                continue_scrub();
            });
        }
        else
        {
            scrub_ops.erase(oid);
            continue_scrub();
        }
    };
    scrub_ops[oid] = osd_op;
    exec_op(osd_op);
}

// Triggers scrub requests
// Scrub reads data from all replicas and compares it
// To scrub first we need to read objects listings
// Returns false when there is no more scrub work to do, true otherwise.
bool osd_t::continue_scrub()
{
    if (scrub_list_op)
    {
        // Listing in progress - nothing to do until it completes
        return true;
    }
    if (no_scrub)
    {
        // Scrubbing is disabled - drop all scrub state.
        // Return false = no more scrub work to do
        scrub_cur_list = {};
        scrub_last_pg = {};
        scrub_nearest_ts = 0;
        if (scrub_timer_id >= 0)
        {
            tfd->clear_timer(scrub_timer_id);
            scrub_timer_id = -1;
        }
        // Clear the SCRUBBING flag from all PGs
        for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
        {
            if (pg_it->second.state & PG_SCRUBBING)
            {
                pg_it->second.state = pg_it->second.state & ~PG_SCRUBBING;
                report_pg_state(pg_it->second);
            }
        }
        return false;
    }
    // Keep up to scrub_queue_depth scrub operations in flight
    while (scrub_ops.size() < scrub_queue_depth)
    {
        object_id oid;
        int r = pick_next_scrub(oid);
        if (r == 2)
            submit_scrub_op(oid);
        else
            return r; // 0 = done, 1 = waiting for a listing
    }
    return true;
}

// Plan the next scrub time for a PG that doesn't have one yet (when auto_scrub is on):
// now + per-pool scrub_interval (or the global default), persisted via PG history.
void osd_t::plan_scrub(pg_t & pg, bool report_state)
{
    if ((pg.state & PG_ACTIVE) && !pg.next_scrub && auto_scrub)
    {
        timespec tv_now;
        clock_gettime(CLOCK_REALTIME, &tv_now);
        auto & pool_cfg = st_cli.pool_config.at(pg.pool_id);
        auto interval = pool_cfg.scrub_interval ? pool_cfg.scrub_interval : global_scrub_interval;
        if (pg.next_scrub != tv_now.tv_sec + interval)
        {
            pool_cfg.pg_config[pg.pg_num].next_scrub = pg.next_scrub = tv_now.tv_sec + interval;
            pg.history_changed = true;
            if (report_state)
                report_pg_state(pg);
        }
        schedule_scrub(pg);
    }
}

// (Re)arm the single scrub wakeup timer if this PG's next_scrub is earlier than the
// currently scheduled wakeup. If the scrub time is already due, wake the main loop
// immediately by setting OSD_SCRUBBING in peering_state.
void osd_t::schedule_scrub(pg_t & pg)
{
    if (!no_scrub && pg.next_scrub && (!scrub_nearest_ts || scrub_nearest_ts > pg.next_scrub))
    {
        scrub_nearest_ts = pg.next_scrub;
        timespec tv_now;
        clock_gettime(CLOCK_REALTIME, &tv_now);
        // Drop the previously armed timer - it pointed at a later timestamp
        if (scrub_timer_id >= 0)
        {
            tfd->clear_timer(scrub_timer_id);
            scrub_timer_id = -1;
        }
        if (tv_now.tv_sec >= scrub_nearest_ts)
        {
            // Already due - trigger scrubbing right away
            scrub_nearest_ts = 0;
            peering_state = peering_state | OSD_SCRUBBING;
            ringloop->wakeup();
        }
        else
        {
            // Arm a one-shot timer for the remaining time (seconds -> milliseconds)
            scrub_timer_id = tfd->set_timer((scrub_nearest_ts-tv_now.tv_sec)*1000, false, [this](int timer_id)
            {
                scrub_timer_id = -1;
                scrub_nearest_ts = 0;
                peering_state = peering_state | OSD_SCRUBBING;
                ringloop->wakeup();
            });
        }
    }
}

// Primary scrub state machine for one object:
// state 0: read all available chunks/copies from their OSDs (resume_1 = reads in flight);
// state 2 (resume_2): compare them - majority vote for replicas, ec_find_good() for EC/XOR -
// and mark the object corrupted/inconsistent if copies disagree or reads failed.
void osd_t::continue_primary_scrub(osd_op_t *cur_op)
{
    if (!cur_op->op_data && !prepare_primary_rw(cur_op))
        return;
    osd_primary_op_data_t *op_data = cur_op->op_data;
    if (op_data->st == 1)
        goto resume_1;
    else if (op_data->st == 2)
        goto resume_2;
    {
        auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
        // Scrub reads the whole object (all data chunks)
        cur_op->req.rw.len = bs_block_size * pg.pg_data_size;
        // Determine version
        auto vo_it = pg.ver_override.find(op_data->oid);
        op_data->target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
        // PG may have degraded or misplaced objects
        op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
        // Read all available chunks
        int n_copies = 0;
        op_data->degraded = false;
        for (int role = 0; role < op_data->pg_size; role++)
        {
            op_data->stripes[role].write_buf = NULL;
            op_data->stripes[role].read_start = 0;
            op_data->stripes[role].read_end = bs_block_size;
            if (op_data->prev_set[role] != 0)
            {
                n_copies++;
            }
            else
            {
                op_data->stripes[role].missing = true;
                // For EC/XOR, a missing data chunk (role < pg_data_size) means degraded
                if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size)
                {
                    op_data->degraded = true;
                }
            }
        }
        if (n_copies <= op_data->pg_data_size)
        {
            // Nothing to compare, even if we'd like to
            finish_op(cur_op, 0);
            return;
        }
        cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_size, 0);
        // Submit reads
        osd_op_t *subops = new osd_op_t[n_copies];
        op_data->fact_ver = 0;
        op_data->done = op_data->errors = op_data->errcode = 0;
        op_data->n_subops = n_copies;
        op_data->subops = subops;
        int sent = submit_primary_subop_batch(SUBMIT_SCRUB_READ, op_data->oid.inode, op_data->target_ver,
            op_data->stripes, op_data->prev_set, cur_op, 0, -1);
        assert(sent == n_copies);
        op_data->st = 1;
    }
resume_1:
    // Reads in flight - resumed with st == 2 when all subops complete
    return;
resume_2:
    if (op_data->errors > 0)
    {
        if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
        {
            // I/O or checksum error - exclude the failed chunks and recount survivors
            int n_copies = 0;
            for (int role = 0; role < op_data->pg_size; role++)
            {
                if (op_data->stripes[role].read_error)
                {
                    op_data->stripes[role].missing = true;
                    if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size)
                    {
                        op_data->degraded = true;
                    }
                }
                else if (!op_data->stripes[role].missing)
                {
                    n_copies++;
                }
            }
            if (n_copies <= op_data->pg_data_size)
            {
                // Nothing to compare, just mark the object as corrupted
                auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
                // FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
                op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false, false);
                // Operation is treated as unsuccessful only if the object becomes unreadable
                finish_op(cur_op, n_copies < op_data->pg_data_size ? op_data->errcode : 0);
                return;
            }
            // Proceed, we can still compare chunks that were successfully read
        }
        else
        {
            // Any other error is fatal for this scrub op
            finish_op(cur_op, op_data->errcode);
            return;
        }
    }
    bool inconsistent = false;
    if (op_data->scheme == POOL_SCHEME_REPLICATED)
    {
        // Check that all chunks have returned the same data.
        // eq_to[role] = smallest role index with identical content (equivalence classes)
        int total = 0;
        int eq_to[op_data->pg_size];
        for (int role = 0; role < op_data->pg_size; role++)
        {
            eq_to[role] = -1;
            if (op_data->stripes[role].read_end != 0 && !op_data->stripes[role].missing &&
                !op_data->stripes[role].not_exists)
            {
                total++;
                eq_to[role] = role;
                for (int other = 0; other < role; other++)
                {
                    // Only compare with unique chunks (eq_to[other] == other)
                    if (eq_to[other] == other && memcmp(op_data->stripes[role].read_buf, op_data->stripes[other].read_buf, bs_block_size) == 0)
                    {
                        eq_to[role] = eq_to[other];
                        break;
                    }
                }
            }
        }
        // Count votes per equivalence class and find the most popular copy
        int votes[op_data->pg_size];
        for (int role = 0; role < op_data->pg_size; role++)
            votes[role] = 0;
        for (int role = 0; role < op_data->pg_size; role++)
        {
            if (eq_to[role] != -1)
                votes[eq_to[role]]++;
        }
        int best = -1;
        for (int role = 0; role < op_data->pg_size; role++)
        {
            if (votes[role] > (best >= 0 ? votes[best] : 0))
                best = role;
        }
        if (best >= 0 && votes[best] < total)
        {
            // Some copies differ from the winner
            bool unknown = false;
            for (int role = 0; role < op_data->pg_size; role++)
            {
                if (role != best && votes[role] == votes[best])
                {
                    // A tie - no strict majority
                    unknown = true;
                }
                if (votes[role] > 0 && votes[role] < votes[best])
                {
                    printf(
                        "[PG %u/%u] Object %jx:%jx v%ju copy on OSD %ju doesn't match %d other copies%s\n",
                        INODE_POOL(op_data->oid.inode), op_data->pg_num,
                        op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver,
                        op_data->stripes[role].osd_num, votes[best],
                        scrub_find_best ? ", marking it as corrupted" : ""
                    );
                    if (scrub_find_best)
                    {
                        op_data->stripes[role].read_error = true;
                    }
                }
            }
            if (!scrub_find_best)
            {
                // Auto-repair disabled - treat any mismatch as unknown
                unknown = true;
            }
            if (unknown)
            {
                // It's unknown which replica is good. There are multiple versions with no majority
                // Mark all good replicas as ambiguous
                best = -1;
                inconsistent = true;
                printf(
                    "[PG %u/%u] Object %jx:%jx v%ju is inconsistent: copies don't match. Use vitastor-cli fix to fix it\n",
                    INODE_POOL(op_data->oid.inode), op_data->pg_num,
                    op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver
                );
            }
        }
    }
    else
    {
        assert(op_data->scheme == POOL_SCHEME_EC || op_data->scheme == POOL_SCHEME_XOR);
        // Ask the EC/XOR error locator for a consistent subset of chunks
        auto good_subset = ec_find_good(
            op_data->stripes, op_data->pg_size, op_data->pg_data_size, op_data->scheme == POOL_SCHEME_XOR,
            bs_block_size, clean_entry_bitmap_size, scrub_ec_max_bruteforce
        );
        if (!good_subset.size())
        {
            // No consistent subset found at all
            inconsistent = true;
            printf(
                "[PG %u/%u] Object %jx:%jx v%ju is inconsistent: parity chunks don't match data. Use vitastor-cli fix to fix it\n",
                INODE_POOL(op_data->oid.inode), op_data->pg_num,
                op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver
            );
        }
        else
        {
            // Mark every present chunk as bad, then clear the flag on the good subset -
            // what remains flagged is the set of mismatching chunks
            int total = 0;
            for (int role = 0; role < op_data->pg_size; role++)
            {
                if (!op_data->stripes[role].missing)
                {
                    total++;
                    op_data->stripes[role].read_error = true;
                }
            }
            for (int role: good_subset)
            {
                op_data->stripes[role].read_error = false;
            }
            for (int role = 0; role < op_data->pg_size; role++)
            {
                if (!op_data->stripes[role].missing && op_data->stripes[role].read_error)
                {
                    printf(
                        "[PG %u/%u] Object %jx:%jx v%ju chunk %d on OSD %ju doesn't match other chunks%s\n",
                        INODE_POOL(op_data->oid.inode), op_data->pg_num,
                        op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver,
                        role, op_data->stripes[role].osd_num,
                        scrub_find_best ? ", marking it as corrupted" : ""
                    );
                }
            }
            if (!scrub_find_best && good_subset.size() < total)
            {
                // Mismatches found but auto-repair is disabled - mark inconsistent instead
                inconsistent = true;
                printf(
                    "[PG %u/%u] Object %jx:%jx v%ju is marked as inconsistent because scrub_find_best is turned off. Use vitastor-cli fix to fix it\n",
                    INODE_POOL(op_data->oid.inode), op_data->pg_num,
                    op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver
                );
                for (int role = 0; role < op_data->pg_size; role++)
                {
                    if (!op_data->stripes[role].missing && op_data->stripes[role].read_error)
                    {
                        // Undo error locator marking chunk as bad
                        op_data->stripes[role].read_error = false;
                    }
                }
            }
        }
    }
    for (int role = 0; role < op_data->pg_size; role++)
    {
        if (op_data->stripes[role].osd_num != 0 &&
            (op_data->stripes[role].read_error || op_data->stripes[role].not_exists) || inconsistent)
        {
            // Got at least 1 read error or mismatch, mark the object as corrupted
            auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
            // FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
            op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false, inconsistent);
            break;
        }
    }
    finish_op(cur_op, 0);
}