From 6648f6bb6ef6a1c6c1aaf219360b511fb477c1e1 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Mon, 10 Apr 2023 01:05:41 +0300 Subject: [PATCH] Implement ambiguity detection during scrub --- mon/mon.js | 5 +- src/osd.cpp | 3 + src/osd.h | 8 +- src/osd_peering_pg.cpp | 16 ++-- src/osd_peering_pg.h | 5 +- src/osd_primary.cpp | 185 +++++++++++++++++++++++++++++++------ src/osd_primary_chain.cpp | 2 +- src/osd_primary_write.cpp | 2 +- src/osd_scrub.cpp | 109 ++++++++++++---------- src/pg_states.cpp | 32 ++++++- src/pg_states.h | 12 ++- 11 files changed, 282 insertions(+), 97 deletions(-) diff --git a/mon/mon.js b/mon/mon.js index 6a0ec14e..76bbd959 100644 --- a/mon/mon.js +++ b/mon/mon.js @@ -115,6 +115,7 @@ const etcd_tree = { scrub_queue_depth: 1, scrub_sleep: 0, // milliseconds scrub_list_limit: 1000, // objects to list on one scrub iteration + scrub_ec_max_bruteforce: 100, // maximum EC error locator brute-force iterations // blockstore - fixed in superblock block_size, disk_alignment, @@ -273,8 +274,8 @@ const etcd_tree = { : { primary: osd_num_t, state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"| - "degraded"|"has_corrupted"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"| - "has_invalid"|"left_on_dead"|"scrubbing")[], + "degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"| + "has_invalid"|"has_inconsistent"|"has_corrupted"|"left_on_dead"|"scrubbing")[], } }, */ }, diff --git a/src/osd.cpp b/src/osd.cpp index 6db6cb94..1b3dcb0a 100644 --- a/src/osd.cpp +++ b/src/osd.cpp @@ -217,6 +217,9 @@ void osd_t::parse_config(bool init) scrub_queue_depth = config["scrub_queue_depth"].uint64_value(); if (scrub_queue_depth < 1 || scrub_queue_depth > MAX_RECOVERY_QUEUE) scrub_queue_depth = 1; + scrub_ec_max_bruteforce = config["scrub_ec_max_bruteforce"].uint64_value(); + if (scrub_ec_max_bruteforce < 1) + scrub_ec_max_bruteforce = 100; scrub_sleep_ms = config["scrub_sleep"].uint64_value(); scrub_list_limit 
= config["scrub_list_limit"].uint64_value(); if (!scrub_list_limit) diff --git a/src/osd.h b/src/osd.h index fc760c23..3f0b504e 100644 --- a/src/osd.h +++ b/src/osd.h @@ -120,6 +120,7 @@ class osd_t uint64_t scrub_queue_depth = 1; uint64_t scrub_sleep_ms = 0; uint32_t scrub_list_limit = 1000; + uint64_t scrub_ec_max_bruteforce = 100; // cluster state @@ -142,7 +143,7 @@ class osd_t std::set dirty_pgs; std::set dirty_osds; int copies_to_delete_after_sync_count = 0; - uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0, corrupted_objects = 0; + uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0, inconsistent_objects = 0, corrupted_objects = 0; int peering_state = 0; std::map recovery_ops; std::map scrub_ops; @@ -264,8 +265,11 @@ class osd_t void continue_primary_sync(osd_op_t *cur_op); void continue_primary_del(osd_op_t *cur_op); bool check_write_queue(osd_op_t *cur_op, pg_t & pg); + pg_osd_set_state_t* add_object_to_set(pg_t & pg, const object_id oid, const pg_osd_set_t & osd_set, + uint64_t old_pg_state, int log_at_level); void remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t &pg, bool report = true); - pg_osd_set_state_t *mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state, osd_rmw_stripe_t *stripes, bool ref); + pg_osd_set_state_t *mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state, + osd_rmw_stripe_t *stripes, bool ref, bool inconsistent); void deref_object_state(pg_t & pg, pg_osd_set_state_t **object_state, bool deref); bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state); void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op); diff --git a/src/osd_peering_pg.cpp b/src/osd_peering_pg.cpp index 1bdd88c2..7602957b 100644 --- a/src/osd_peering_pg.cpp +++ b/src/osd_peering_pg.cpp @@ -336,7 +336,7 @@ pg_osd_set_state_t* pg_t::add_object_to_state(const object_id oid, 
const uint64_ { for (auto & o: osd_set) { - if (!o.loc_bad) + if (!(o.loc_bad & (LOC_OUTDATED | LOC_CORRUPTED))) { read_target.push_back(o.osd_num); } @@ -356,7 +356,7 @@ pg_osd_set_state_t* pg_t::add_object_to_state(const object_id oid, const uint64_ } for (auto & o: osd_set) { - if (!o.loc_bad) + if (!(o.loc_bad & (LOC_OUTDATED | LOC_CORRUPTED))) { read_target[o.role] = o.osd_num; } @@ -374,7 +374,11 @@ pg_osd_set_state_t* pg_t::add_object_to_state(const object_id oid, const uint64_ { it->second.object_count++; } - if (state & OBJ_INCOMPLETE) + if (state & OBJ_INCONSISTENT) + { + inconsistent_objects[oid] = &it->second; + } + else if (state & OBJ_INCOMPLETE) { incomplete_objects[oid] = &it->second; } @@ -453,7 +457,8 @@ void pg_t::calc_object_states(int log_level) std::to_string(loc.osd_num)+ (st.replicated ? "" : "("+std::to_string(loc.role)+")")+ (loc.loc_bad & LOC_OUTDATED ? "(old)" : "")+ - (loc.loc_bad & LOC_CORRUPTED ? "(corrupted)" : ""); + (loc.loc_bad & LOC_CORRUPTED ? "(corrupted)" : "")+ + (loc.loc_bad & LOC_INCONSISTENT ? "(inconsistent)" : ""); } printf("[PG %u/%u] %lu objects on OSD set %s\n", pool_id, pg_num, stp.second.object_count, osd_set_desc.c_str()); } @@ -463,7 +468,7 @@ void pg_t::calc_object_states(int log_level) void pg_t::print_state() { printf( - "[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num, + "[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num, (state & PG_STARTING) ? "starting" : "", (state & PG_OFFLINE) ? "offline" : "", (state & PG_PEERING) ? "peering" : "", @@ -472,6 +477,7 @@ void pg_t::print_state() (state & PG_REPEERING) ? "repeering" : "", (state & PG_STOPPING) ? "stopping" : "", (state & PG_DEGRADED) ? " + degraded" : "", + (state & PG_HAS_INCONSISTENT) ? " + has_inconsistent" : "", (state & PG_HAS_CORRUPTED) ? " + has_corrupted" : "", (state & PG_HAS_INCOMPLETE) ? " + has_incomplete" : "", (state & PG_HAS_DEGRADED) ? 
" + has_degraded" : "", diff --git a/src/osd_peering_pg.h b/src/osd_peering_pg.h index 96f96dc1..45b17eba 100644 --- a/src/osd_peering_pg.h +++ b/src/osd_peering_pg.h @@ -15,12 +15,13 @@ #define LOC_OUTDATED 1 #define LOC_CORRUPTED 2 +#define LOC_INCONSISTENT 4 struct pg_obj_loc_t { uint64_t role; osd_num_t osd_num; - uint32_t loc_bad; // LOC_OUTDATED / LOC_CORRUPTED + uint32_t loc_bad; // LOC_OUTDATED / LOC_CORRUPTED / LOC_INCONSISTENT }; typedef std::vector pg_osd_set_t; @@ -113,7 +114,7 @@ struct pg_t // which is up to ~192 MB per 1 TB in the worst case scenario std::map state_dict; uint64_t corrupted_count; - btree::btree_map incomplete_objects, misplaced_objects, degraded_objects; + btree::btree_map inconsistent_objects, incomplete_objects, misplaced_objects, degraded_objects; std::map flush_actions; std::vector copies_to_delete_after_sync; btree::btree_map ver_override; diff --git a/src/osd_primary.cpp b/src/osd_primary.cpp index 7a2c0249..6f24a016 100644 --- a/src/osd_primary.cpp +++ b/src/osd_primary.cpp @@ -255,7 +255,7 @@ resume_2: // I/O or checksum error auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num }); // FIXME: ref = true ideally... 
because new_state != state is not necessarily true if it's freed and recreated - op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false); + op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false, false); goto resume_0; } finish_op(cur_op, op_data->errcode); @@ -296,7 +296,8 @@ resume_2: finish_op(cur_op, cur_op->req.rw.len); } -pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state, osd_rmw_stripe_t *stripes, bool ref) +pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state, + osd_rmw_stripe_t *stripes, bool ref, bool inconsistent) { pg_osd_set_state_t *object_state = NULL; get_object_osd_set(pg, oid, &object_state); @@ -327,26 +328,24 @@ pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_os } } // Mark object chunk(s) as corrupted - uint64_t has_roles = 0, n_roles = 0, n_copies = 0, n_corrupted = 0; + int changes = 0; for (auto & chunk: corrupted_set) { bool corrupted = stripes[chunk.role].osd_num == chunk.osd_num && stripes[chunk.role].read_error; - if (corrupted && !(chunk.loc_bad & LOC_CORRUPTED)) - n_corrupted++; - chunk.loc_bad = chunk.loc_bad | (corrupted ? 
LOC_CORRUPTED : 0); - if (!chunk.loc_bad) + if (corrupted) { - if (pg.scheme == POOL_SCHEME_REPLICATED) - n_roles = 1; - else if (!(has_roles & (1 << chunk.role))) - { - n_roles++; - has_roles |= (1 << chunk.role); - } - n_copies++; + if (!(chunk.loc_bad & LOC_CORRUPTED)) + changes++; + chunk.loc_bad |= LOC_CORRUPTED; + } + else if (inconsistent && !(chunk.loc_bad & LOC_OUTDATED)) + { + if (!(chunk.loc_bad & LOC_INCONSISTENT)) + changes++; + chunk.loc_bad |= LOC_INCONSISTENT; } } - if (!n_corrupted) + if (!changes) { // No chunks newly marked as corrupted - object is already marked or moved return object_state; @@ -357,17 +356,82 @@ pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_os remove_object_from_state(oid, &object_state, pg, false); deref_object_state(pg, &object_state, ref); } - // Calculate object state - uint64_t obj_state = OBJ_CORRUPTED; - int pg_state_bits = PG_HAS_CORRUPTED; - this->corrupted_objects++; - pg.corrupted_count++; - if (log_level > 1) + // Insert object into the new state and retry + object_state = add_object_to_set(pg, oid, corrupted_set, old_pg_state, 2); + if (ref) { - printf("Marking object %lx:%lx corrupted: %lu chunks / %lu copies available, %lu corrupted\n", - oid.inode, oid.stripe, n_roles, n_copies, n_corrupted); + object_state->ref_count++; } - if (n_roles < pg.pg_data_size) + return object_state; +} + +pg_osd_set_state_t* osd_t::add_object_to_set(pg_t & pg, const object_id oid, const pg_osd_set_t & osd_set, + uint64_t old_pg_state, int log_at_level) +{ + // Object state will be calculated from osd_set + uint64_t has_roles = 0, n_roles = 0, n_copies = 0, n_invalid = 0, n_outdated = 0, + n_misplaced = 0, n_corrupted = 0, n_inconsistent = 0; + for (auto & chunk: osd_set) + { + if (chunk.role >= (pg.scheme == POOL_SCHEME_REPLICATED ? 
1 : pg.pg_size)) + { + n_invalid++; + } + else if (chunk.loc_bad & LOC_OUTDATED) + { + n_outdated++; + } + else + { + if (chunk.loc_bad & LOC_INCONSISTENT) + { + n_inconsistent++; + } + if (chunk.loc_bad & LOC_CORRUPTED) + { + n_corrupted++; + } + else if (pg.scheme == POOL_SCHEME_REPLICATED) + { + n_roles = 1; + int i; + for (i = 0; i < pg.cur_set.size() && pg.cur_set[i] != chunk.osd_num; i++) {} + if (i == pg.cur_set.size()) + { + n_misplaced++; + } + } + else + { + if (!(has_roles & (1 << chunk.role))) + { + n_roles++; + has_roles |= (1 << chunk.role); + } + if (pg.cur_set[chunk.role] != chunk.osd_num) + { + n_misplaced++; + } + } + n_copies++; + } + } + uint64_t obj_state = 0; + int pg_state_bits = 0; + if (n_corrupted > 0) + { + this->corrupted_objects++; + pg.corrupted_count++; + obj_state |= OBJ_CORRUPTED; + pg_state_bits |= PG_HAS_CORRUPTED; + } + if (n_invalid > 0 || n_inconsistent > 0) + { + this->inconsistent_objects++; + obj_state |= OBJ_INCONSISTENT; + pg_state_bits |= PG_HAS_INCONSISTENT; + } + else if (n_roles < pg.pg_data_size) { this->incomplete_objects++; obj_state |= OBJ_INCOMPLETE; @@ -379,12 +443,52 @@ pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_os obj_state |= OBJ_DEGRADED; pg_state_bits = PG_HAS_DEGRADED; } - else + else if (n_misplaced > 0 || n_outdated > 0) { this->misplaced_objects++; obj_state |= OBJ_MISPLACED; pg_state_bits = PG_HAS_MISPLACED; } + if (this->log_level >= log_at_level) + { + printf("Marking object %lx:%lx ", oid.inode, oid.stripe); + for (int i = 0, j = 0; i < object_state_bit_count; i++) + { + if ((obj_state & object_state_bits[i]) || object_state_bits[i] == 0 && obj_state == 0) + { + printf((j++) ? 
"+%s" : "%s", object_state_names[i]); + } + } + if (pg.scheme == POOL_SCHEME_REPLICATED) + { + printf(": %lu copies available", n_copies); + } + else + { + printf(": %lu parts / %lu copies available", n_roles, n_copies); + } + if (n_invalid > 0) + { + printf(", %lu invalid", n_invalid); + } + if (n_outdated > 0) + { + printf(", %lu outdated", n_outdated); + } + if (n_misplaced > 0) + { + printf(", %lu misplaced", n_misplaced); + } + if (n_corrupted > 0) + { + printf(", %lu corrupted", n_corrupted); + } + if (n_inconsistent > 0) + { + printf(", %lu inconsistent", n_inconsistent); + } + printf("\n"); + } pg.state |= pg_state_bits; if (pg.state != old_pg_state) { @@ -403,11 +507,13 @@ pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_os ringloop->wakeup(); } } + if (!obj_state) + { + // Object is clean + return NULL; + } // Insert object into the new state and retry - object_state = pg.add_object_to_state(oid, obj_state, corrupted_set); - if (ref) - object_state->ref_count++; - return object_state; + return pg.add_object_to_state(oid, obj_state, osd_set); } // Decrement pg_osd_set_state_t's object_count and change PG state accordingly @@ -426,14 +532,29 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **objec *object_state = recheck_state; return; } + bool changed = false; (*object_state)->object_count--; if ((*object_state)->state & OBJ_CORRUPTED) { this->corrupted_objects--; pg.corrupted_count--; + if (!pg.corrupted_count) + { + pg.state = pg.state & ~PG_HAS_CORRUPTED; + changed = true; + } } - bool changed = false; - if ((*object_state)->state & OBJ_INCOMPLETE) + if ((*object_state)->state & OBJ_INCONSISTENT) + { + this->inconsistent_objects--; + pg.inconsistent_objects.erase(oid); + if (!pg.inconsistent_objects.size()) + { + pg.state = pg.state & ~PG_HAS_INCONSISTENT; + changed = true; + } + } + else if ((*object_state)->state & OBJ_INCOMPLETE) { // Successful write means that object is not incomplete anymore 
this->incomplete_objects--; diff --git a/src/osd_primary_chain.cpp b/src/osd_primary_chain.cpp index 5713c0ca..ac381775 100644 --- a/src/osd_primary_chain.cpp +++ b/src/osd_primary_chain.cpp @@ -532,7 +532,7 @@ void osd_t::check_corrupted_chained(pg_t & pg, osd_op_t *cur_op) } if (corrupted) { - mark_object_corrupted(pg, cur_oid, op_data->chain_states[op_data->chain_reads[cri].chain_pos], stripes, false); + mark_object_corrupted(pg, cur_oid, op_data->chain_states[op_data->chain_reads[cri].chain_pos], stripes, false, false); } } } diff --git a/src/osd_primary_write.cpp b/src/osd_primary_write.cpp index 5ef322f4..2ac41fe3 100644 --- a/src/osd_primary_write.cpp +++ b/src/osd_primary_write.cpp @@ -141,7 +141,7 @@ resume_3: if (op_data->errcode == -EIO || op_data->errcode == -EDOM) { // Mark object corrupted and retry - op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, true); + op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, true, false); op_data->prev_set = op_data->object_state ? op_data->object_state->read_target.data() : pg.cur_set.data(); goto retry_1; } diff --git a/src/osd_scrub.cpp b/src/osd_scrub.cpp index 64771258..f673e76b 100644 --- a/src/osd_scrub.cpp +++ b/src/osd_scrub.cpp @@ -377,9 +377,13 @@ void osd_t::continue_primary_scrub(osd_op_t *cur_op) { n_copies++; } - else if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size) + else { - op_data->degraded = true; + op_data->stripes[role].missing = true; + if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size) + { + op_data->degraded = true; + } } } if (n_copies <= op_data->pg_data_size) @@ -388,8 +392,7 @@ void osd_t::continue_primary_scrub(osd_op_t *cur_op) finish_op(cur_op, 0); return; } - cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_size, - op_data->scheme != POOL_SCHEME_REPLICATED ? 
bs_block_size*(op_data->pg_size-op_data->pg_data_size) : 0); + cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_size, 0); // Submit reads osd_op_t *subops = new osd_op_t[n_copies]; op_data->fact_ver = 0; @@ -412,8 +415,15 @@ resume_2: int n_copies = 0; for (int role = 0; role < op_data->pg_size; role++) { - if (op_data->stripes[role].read_end != 0 && - !op_data->stripes[role].read_error) + if (op_data->stripes[role].read_error) + { + op_data->stripes[role].missing = true; + if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size) + { + op_data->degraded = true; + } + } + else if (!op_data->stripes[role].missing) { n_copies++; } @@ -423,7 +433,7 @@ resume_2: // Nothing to compare, just mark the object as corrupted auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num }); // FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated - op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false); + op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false, false); // Operation is treated as unsuccessful only if the object becomes unreadable finish_op(cur_op, n_copies < op_data->pg_data_size ? 
op_data->errcode : 0); return; @@ -436,6 +446,7 @@ resume_2: return; } } + bool inconsistent = false; if (op_data->scheme == POOL_SCHEME_REPLICATED) { // Check that all chunks have returned the same data @@ -475,7 +486,6 @@ resume_2: } if (best >= 0 && votes[best] < total) { - // FIXME Add a flag to allow to skip such objects and not recover them automatically bool unknown = false; for (int role = 0; role < op_data->pg_size; role++) { @@ -484,9 +494,10 @@ resume_2: if (votes[role] > 0 && votes[role] < votes[best]) { printf( - "[PG %u/%u] Object %lx:%lx copy on OSD %lu doesn't match %d other copies, marking it as corrupted\n", + "[PG %u/%u] Object %lx:%lx v%lu copy on OSD %lu doesn't match %d other copies, marking it as corrupted\n", INODE_POOL(op_data->oid.inode), op_data->pg_num, - op_data->oid.inode, op_data->oid.stripe, op_data->stripes[role].osd_num, votes[best] + op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver, + op_data->stripes[role].osd_num, votes[best] ); op_data->stripes[role].read_error = true; } @@ -494,63 +505,67 @@ resume_2: if (unknown) { // It's unknown which replica is good. There are multiple versions with no majority + // Mark all good replicas as ambiguous best = -1; + inconsistent = true; + printf( + "[PG %u/%u] Object %lx:%lx v%lu is inconsistent: copies don't match. 
Use vitastor-cli fix to fix it\n", + INODE_POOL(op_data->oid.inode), op_data->pg_num, + op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver + ); } } } else { assert(op_data->scheme == POOL_SCHEME_EC || op_data->scheme == POOL_SCHEME_XOR); - if (op_data->degraded) + auto good_subset = ec_find_good( + op_data->stripes, op_data->pg_size, op_data->pg_data_size, op_data->scheme == POOL_SCHEME_XOR, + bs_block_size, clean_entry_bitmap_size, scrub_ec_max_bruteforce + ); + if (!good_subset.size()) { - // Reconstruct missing stripes - // XOR shouldn't come here as it only has 1 parity chunk - assert(op_data->scheme == POOL_SCHEME_EC); - reconstruct_stripes_ec(op_data->stripes, op_data->pg_size, op_data->pg_data_size, clean_entry_bitmap_size); + inconsistent = true; + printf( + "[PG %u/%u] Object %lx:%lx v%lu is inconsistent: parity chunks don't match data. Use vitastor-cli fix to fix it\n", + INODE_POOL(op_data->oid.inode), op_data->pg_num, + op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver + ); } - // Generate parity chunks and compare them with actual data - osd_num_t fake_osd_set[op_data->pg_size]; - for (int i = 0; i < op_data->pg_size; i++) + else { - fake_osd_set[i] = 1; - op_data->stripes[i].write_buf = i >= op_data->pg_data_size - ? 
((uint8_t*)cur_op->buf + (i-op_data->pg_data_size)*bs_block_size) - : op_data->stripes[i].read_buf; - } - if (op_data->scheme == POOL_SCHEME_XOR) - { - calc_rmw_parity_xor(op_data->stripes, op_data->pg_size, fake_osd_set, fake_osd_set, bs_block_size, clean_entry_bitmap_size); - } - else if (op_data->scheme == POOL_SCHEME_EC) - { - calc_rmw_parity_ec(op_data->stripes, op_data->pg_size, op_data->pg_data_size, fake_osd_set, fake_osd_set, bs_block_size, clean_entry_bitmap_size); - } - // Now compare that write_buf == read_buf - for (int role = op_data->pg_data_size; role < op_data->pg_size; role++) - { - if (op_data->stripes[role].osd_num != 0 && !op_data->stripes[role].read_error && - memcmp(op_data->stripes[role].read_buf, op_data->stripes[role].write_buf, bs_block_size) != 0) + for (int role = 0; role < op_data->pg_size; role++) { - // Chunks don't match - something's wrong... but we don't know what :D - // FIXME: Try to locate errors (may be possible with >= 2 parity chunks) - printf( - "[PG %u/%u] Object %lx:%lx parity chunk %d on OSD %lu doesn't match data, marking it as corrupted\n", - INODE_POOL(op_data->oid.inode), op_data->pg_num, - op_data->oid.inode, op_data->oid.stripe, - role-op_data->pg_data_size, op_data->stripes[role].osd_num - ); - op_data->stripes[role].read_error = true; + if (!op_data->stripes[role].missing) + op_data->stripes[role].read_error = true; + } + for (int role: good_subset) + { + op_data->stripes[role].read_error = false; + } + for (int role = 0; role < op_data->pg_size; role++) + { + if (!op_data->stripes[role].missing && op_data->stripes[role].read_error) + { + op_data->stripes[role].read_error = true; + printf( + "[PG %u/%u] Object %lx:%lx v%lu chunk %d on OSD %lu doesn't match data, marking it as corrupted\n", + INODE_POOL(op_data->oid.inode), op_data->pg_num, + op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver, + role, op_data->stripes[role].osd_num + ); + } } } } for (int role = 0; role < op_data->pg_size; role++) { - if 
(op_data->stripes[role].osd_num != 0 && !op_data->stripes[role].read_error) + if (op_data->stripes[role].osd_num != 0 && op_data->stripes[role].read_error || inconsistent) { // Got at least 1 read error or mismatch, mark the object as corrupted auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num }); // FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated - op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false); + op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false, inconsistent); break; } } diff --git a/src/pg_states.cpp b/src/pg_states.cpp index 41072e21..69b85417 100644 --- a/src/pg_states.cpp +++ b/src/pg_states.cpp @@ -3,9 +3,9 @@ #include "pg_states.h" -const int pg_state_bit_count = 16; +const int pg_state_bit_count = 17; -const int pg_state_bits[16] = { +const int pg_state_bits[17] = { PG_STARTING, PG_PEERING, PG_INCOMPLETE, @@ -14,6 +14,7 @@ const int pg_state_bits[16] = { PG_STOPPING, PG_OFFLINE, PG_DEGRADED, + PG_HAS_INCONSISTENT, PG_HAS_CORRUPTED, PG_HAS_INCOMPLETE, PG_HAS_DEGRADED, @@ -24,7 +25,7 @@ const int pg_state_bits[16] = { PG_SCRUBBING, }; -const char *pg_state_names[16] = { +const char *pg_state_names[17] = { "starting", "peering", "incomplete", @@ -33,6 +34,7 @@ const char *pg_state_names[16] = { "stopping", "offline", "degraded", + "has_inconsistent", "has_corrupted", "has_incomplete", "has_degraded", @@ -42,3 +44,27 @@ const char *pg_state_names[16] = { "left_on_dead", "scrubbing", }; + +const int object_state_bit_count = 8; + +const int object_state_bits[8] = { + OBJ_DEGRADED, + OBJ_INCOMPLETE, + OBJ_MISPLACED, + OBJ_CORRUPTED, + OBJ_INCONSISTENT, + OBJ_NEEDS_STABLE, + OBJ_NEEDS_ROLLBACK, + 0, +}; + +const char *object_state_names[8] = { + "degraded", + "incomplete", + "misplaced", + "corrupted", + "inconsistent", + "needs_stable", + 
"needs_rollback", + "clean", +}; diff --git a/src/pg_states.h b/src/pg_states.h index 35348959..c2e21c38 100644 --- a/src/pg_states.h +++ b/src/pg_states.h @@ -23,8 +23,9 @@ #define PG_HAS_UNCLEAN (1<<11) #define PG_HAS_INVALID (1<<12) #define PG_HAS_CORRUPTED (1<<13) -#define PG_LEFT_ON_DEAD (1<<14) -#define PG_SCRUBBING (1<<15) +#define PG_HAS_INCONSISTENT (1<<14) +#define PG_LEFT_ON_DEAD (1<<15) +#define PG_SCRUBBING (1<<16) // Lower bits that represent object role (EC 0/1/2... or always 0 with replication) // 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size @@ -36,9 +37,16 @@ #define OBJ_MISPLACED 0x08 // OBJ_CORRUPTED is always set with one of OBJ_INCOMPLETE/OBJ_DEGRADED/OBJ_MISPLACED #define OBJ_CORRUPTED 0x10 +// OBJ_INCONSISTENT is when its replicas don't match, but it's unclear which one is correct +// OBJ_INCONSISTENT may be set with CORRUPTED, but never with other states +#define OBJ_INCONSISTENT 0x20 #define OBJ_NEEDS_STABLE 0x10000 #define OBJ_NEEDS_ROLLBACK 0x20000 extern const int pg_state_bits[]; extern const char *pg_state_names[]; extern const int pg_state_bit_count; + +extern const int object_state_bits[]; +extern const char *object_state_names[]; +extern const int object_state_bit_count;