Implement ambiguity detection during scrub

test-double-alloc
Vitaliy Filippov 2023-04-10 01:05:41 +03:00
parent 281be547eb
commit 6648f6bb6e
11 changed files with 282 additions and 97 deletions

View File

@ -115,6 +115,7 @@ const etcd_tree = {
scrub_queue_depth: 1,
scrub_sleep: 0, // milliseconds
scrub_list_limit: 1000, // objects to list on one scrub iteration
scrub_ec_max_bruteforce: 100, // maximum EC error locator brute-force iterations
// blockstore - fixed in superblock
block_size,
disk_alignment,
@ -273,8 +274,8 @@ const etcd_tree = {
<pg_id>: {
primary: osd_num_t,
state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
"degraded"|"has_corrupted"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
"has_invalid"|"left_on_dead"|"scrubbing")[],
"degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
"has_invalid"|"has_inconsistent"|"has_corrupted"|"left_on_dead"|"scrubbing")[],
}
}, */
},

View File

@ -217,6 +217,9 @@ void osd_t::parse_config(bool init)
scrub_queue_depth = config["scrub_queue_depth"].uint64_value();
if (scrub_queue_depth < 1 || scrub_queue_depth > MAX_RECOVERY_QUEUE)
scrub_queue_depth = 1;
scrub_ec_max_bruteforce = config["scrub_ec_max_bruteforce"].uint64_value();
if (scrub_ec_max_bruteforce < 1)
scrub_ec_max_bruteforce = 100;
scrub_sleep_ms = config["scrub_sleep"].uint64_value();
scrub_list_limit = config["scrub_list_limit"].uint64_value();
if (!scrub_list_limit)

View File

@ -120,6 +120,7 @@ class osd_t
uint64_t scrub_queue_depth = 1;
uint64_t scrub_sleep_ms = 0;
uint32_t scrub_list_limit = 1000;
uint64_t scrub_ec_max_bruteforce = 100;
// cluster state
@ -142,7 +143,7 @@ class osd_t
std::set<pool_pg_num_t> dirty_pgs;
std::set<osd_num_t> dirty_osds;
int copies_to_delete_after_sync_count = 0;
uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0, corrupted_objects = 0;
uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0, inconsistent_objects = 0, corrupted_objects = 0;
int peering_state = 0;
std::map<object_id, osd_recovery_op_t> recovery_ops;
std::map<object_id, osd_op_t*> scrub_ops;
@ -264,8 +265,11 @@ class osd_t
void continue_primary_sync(osd_op_t *cur_op);
void continue_primary_del(osd_op_t *cur_op);
bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
pg_osd_set_state_t* add_object_to_set(pg_t & pg, const object_id oid, const pg_osd_set_t & osd_set,
uint64_t old_pg_state, int log_at_level);
void remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t &pg, bool report = true);
pg_osd_set_state_t *mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state, osd_rmw_stripe_t *stripes, bool ref);
pg_osd_set_state_t *mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state,
osd_rmw_stripe_t *stripes, bool ref, bool inconsistent);
void deref_object_state(pg_t & pg, pg_osd_set_state_t **object_state, bool deref);
bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);

View File

@ -336,7 +336,7 @@ pg_osd_set_state_t* pg_t::add_object_to_state(const object_id oid, const uint64_
{
for (auto & o: osd_set)
{
if (!o.loc_bad)
if (!(o.loc_bad & (LOC_OUTDATED | LOC_CORRUPTED)))
{
read_target.push_back(o.osd_num);
}
@ -356,7 +356,7 @@ pg_osd_set_state_t* pg_t::add_object_to_state(const object_id oid, const uint64_
}
for (auto & o: osd_set)
{
if (!o.loc_bad)
if (!(o.loc_bad & (LOC_OUTDATED | LOC_CORRUPTED)))
{
read_target[o.role] = o.osd_num;
}
@ -374,7 +374,11 @@ pg_osd_set_state_t* pg_t::add_object_to_state(const object_id oid, const uint64_
{
it->second.object_count++;
}
if (state & OBJ_INCOMPLETE)
if (state & OBJ_INCONSISTENT)
{
inconsistent_objects[oid] = &it->second;
}
else if (state & OBJ_INCOMPLETE)
{
incomplete_objects[oid] = &it->second;
}
@ -453,7 +457,8 @@ void pg_t::calc_object_states(int log_level)
std::to_string(loc.osd_num)+
(st.replicated ? "" : "("+std::to_string(loc.role)+")")+
(loc.loc_bad & LOC_OUTDATED ? "(old)" : "")+
(loc.loc_bad & LOC_CORRUPTED ? "(corrupted)" : "");
(loc.loc_bad & LOC_CORRUPTED ? "(corrupted)" : "")+
(loc.loc_bad & LOC_INCONSISTENT ? "(inconsistent)" : "");
}
printf("[PG %u/%u] %lu objects on OSD set %s\n", pool_id, pg_num, stp.second.object_count, osd_set_desc.c_str());
}
@ -463,7 +468,7 @@ void pg_t::calc_object_states(int log_level)
void pg_t::print_state()
{
printf(
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
(state & PG_STARTING) ? "starting" : "",
(state & PG_OFFLINE) ? "offline" : "",
(state & PG_PEERING) ? "peering" : "",
@ -472,6 +477,7 @@ void pg_t::print_state()
(state & PG_REPEERING) ? "repeering" : "",
(state & PG_STOPPING) ? "stopping" : "",
(state & PG_DEGRADED) ? " + degraded" : "",
(state & PG_HAS_INCONSISTENT) ? " + has_inconsistent" : "",
(state & PG_HAS_CORRUPTED) ? " + has_corrupted" : "",
(state & PG_HAS_INCOMPLETE) ? " + has_incomplete" : "",
(state & PG_HAS_DEGRADED) ? " + has_degraded" : "",

View File

@ -15,12 +15,13 @@
#define LOC_OUTDATED 1
#define LOC_CORRUPTED 2
#define LOC_INCONSISTENT 4
struct pg_obj_loc_t
{
uint64_t role;
osd_num_t osd_num;
uint32_t loc_bad; // LOC_OUTDATED / LOC_CORRUPTED
uint32_t loc_bad; // LOC_OUTDATED / LOC_CORRUPTED / LOC_INCONSISTENT
};
typedef std::vector<pg_obj_loc_t> pg_osd_set_t;
@ -113,7 +114,7 @@ struct pg_t
// which is up to ~192 MB per 1 TB in the worst case scenario
std::map<pg_osd_set_t, pg_osd_set_state_t> state_dict;
uint64_t corrupted_count;
btree::btree_map<object_id, pg_osd_set_state_t*> incomplete_objects, misplaced_objects, degraded_objects;
btree::btree_map<object_id, pg_osd_set_state_t*> inconsistent_objects, incomplete_objects, misplaced_objects, degraded_objects;
std::map<obj_piece_id_t, flush_action_t> flush_actions;
std::vector<obj_ver_osd_t> copies_to_delete_after_sync;
btree::btree_map<object_id, uint64_t> ver_override;

View File

@ -255,7 +255,7 @@ resume_2:
// I/O or checksum error
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
// FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false);
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false, false);
goto resume_0;
}
finish_op(cur_op, op_data->errcode);
@ -296,7 +296,8 @@ resume_2:
finish_op(cur_op, cur_op->req.rw.len);
}
pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state, osd_rmw_stripe_t *stripes, bool ref)
pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state,
osd_rmw_stripe_t *stripes, bool ref, bool inconsistent)
{
pg_osd_set_state_t *object_state = NULL;
get_object_osd_set(pg, oid, &object_state);
@ -327,26 +328,24 @@ pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_os
}
}
// Mark object chunk(s) as corrupted
uint64_t has_roles = 0, n_roles = 0, n_copies = 0, n_corrupted = 0;
int changes = 0;
for (auto & chunk: corrupted_set)
{
bool corrupted = stripes[chunk.role].osd_num == chunk.osd_num && stripes[chunk.role].read_error;
if (corrupted && !(chunk.loc_bad & LOC_CORRUPTED))
n_corrupted++;
chunk.loc_bad = chunk.loc_bad | (corrupted ? LOC_CORRUPTED : 0);
if (!chunk.loc_bad)
if (corrupted)
{
if (pg.scheme == POOL_SCHEME_REPLICATED)
n_roles = 1;
else if (!(has_roles & (1 << chunk.role)))
{
n_roles++;
has_roles |= (1 << chunk.role);
}
n_copies++;
if (!(chunk.loc_bad & LOC_CORRUPTED))
changes++;
chunk.loc_bad |= LOC_CORRUPTED;
}
else if (inconsistent && !(chunk.loc_bad & LOC_OUTDATED))
{
if (!(chunk.loc_bad & LOC_INCONSISTENT))
changes++;
chunk.loc_bad |= LOC_INCONSISTENT;
}
}
if (!n_corrupted)
if (!changes)
{
// No chunks newly marked as corrupted or inconsistent - object is already marked or moved
return object_state;
@ -357,17 +356,82 @@ pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_os
remove_object_from_state(oid, &object_state, pg, false);
deref_object_state(pg, &object_state, ref);
}
// Calculate object state
uint64_t obj_state = OBJ_CORRUPTED;
int pg_state_bits = PG_HAS_CORRUPTED;
this->corrupted_objects++;
pg.corrupted_count++;
if (log_level > 1)
// Insert object into the new state and retry
object_state = add_object_to_set(pg, oid, corrupted_set, old_pg_state, 2);
if (ref)
{
printf("Marking object %lx:%lx corrupted: %lu chunks / %lu copies available, %lu corrupted\n",
oid.inode, oid.stripe, n_roles, n_copies, n_corrupted);
object_state->ref_count++;
}
if (n_roles < pg.pg_data_size)
return object_state;
}
pg_osd_set_state_t* osd_t::add_object_to_set(pg_t & pg, const object_id oid, const pg_osd_set_t & osd_set,
uint64_t old_pg_state, int log_at_level)
{
// Object state will be calculated from <osd_set>
uint64_t has_roles = 0, n_roles = 0, n_copies = 0, n_invalid = 0, n_outdated = 0,
n_misplaced = 0, n_corrupted = 0, n_inconsistent = 0;
for (auto & chunk: osd_set)
{
if (chunk.role >= (pg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size))
{
n_invalid++;
}
else if (chunk.loc_bad & LOC_OUTDATED)
{
n_outdated++;
}
else
{
if (chunk.loc_bad & LOC_INCONSISTENT)
{
n_inconsistent++;
}
if (chunk.loc_bad & LOC_CORRUPTED)
{
n_corrupted++;
}
else if (pg.scheme == POOL_SCHEME_REPLICATED)
{
n_roles = 1;
int i;
for (i = 0; i < pg.cur_set.size() && pg.cur_set[i] != chunk.osd_num; i++) {}
if (i == pg.cur_set.size())
{
n_misplaced++;
}
}
else
{
if (!(has_roles & (1 << chunk.role)))
{
n_roles++;
has_roles |= (1 << chunk.role);
}
if (pg.cur_set[chunk.role] != chunk.osd_num)
{
n_misplaced++;
}
}
n_copies++;
}
}
uint64_t obj_state = 0;
int pg_state_bits = 0;
if (n_corrupted > 0)
{
this->corrupted_objects++;
pg.corrupted_count++;
obj_state |= OBJ_CORRUPTED;
pg_state_bits |= PG_HAS_CORRUPTED;
}
if (n_invalid > 0 || n_inconsistent > 0)
{
this->inconsistent_objects++;
obj_state |= OBJ_INCONSISTENT;
pg_state_bits |= PG_HAS_INCONSISTENT;
}
else if (n_roles < pg.pg_data_size)
{
this->incomplete_objects++;
obj_state |= OBJ_INCOMPLETE;
@ -379,12 +443,52 @@ pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_os
obj_state |= OBJ_DEGRADED;
pg_state_bits = PG_HAS_DEGRADED;
}
else
else if (n_misplaced > 0 || n_outdated > 0)
{
this->misplaced_objects++;
obj_state |= OBJ_MISPLACED;
pg_state_bits = PG_HAS_MISPLACED;
}
if (this->log_level >= log_at_level)
{
printf("Marking object %lx:%lx ", oid.inode, oid.stripe);
for (int i = 0, j = 0; i < object_state_bit_count; i++)
{
if ((obj_state & object_state_bits[i]) || object_state_bits[i] == 0 && obj_state == 0)
{
printf((j++) ? "+%s" : "%s", object_state_names[i]);
}
}
if (pg.scheme == POOL_SCHEME_REPLICATED)
{
printf(": %lu copies available", n_copies);
}
else
{
printf(": %lu parts / %lu copies available", n_roles, n_copies);
}
if (n_invalid > 0)
{
printf(", %lu invalid", n_invalid);
}
if (n_outdated > 0)
{
printf(", %lu outdated", n_outdated);
}
if (n_misplaced > 0)
{
printf(", %lu misplaced", n_misplaced);
}
if (n_corrupted > 0)
{
printf(", %lu corrupted", n_corrupted);
}
if (n_inconsistent > 0)
{
printf(", %lu inconsistent", n_inconsistent);
}
printf("\n");
}
pg.state |= pg_state_bits;
if (pg.state != old_pg_state)
{
@ -403,11 +507,13 @@ pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_os
ringloop->wakeup();
}
}
if (!obj_state)
{
// Object is clean
return NULL;
}
// Insert object into the new state and retry
object_state = pg.add_object_to_state(oid, obj_state, corrupted_set);
if (ref)
object_state->ref_count++;
return object_state;
return pg.add_object_to_state(oid, obj_state, osd_set);
}
// Decrement pg_osd_set_state_t's object_count and change PG state accordingly
@ -426,14 +532,29 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **objec
*object_state = recheck_state;
return;
}
bool changed = false;
(*object_state)->object_count--;
if ((*object_state)->state & OBJ_CORRUPTED)
{
this->corrupted_objects--;
pg.corrupted_count--;
if (!pg.corrupted_count)
{
pg.state = pg.state & ~PG_HAS_CORRUPTED;
changed = true;
}
}
bool changed = false;
if ((*object_state)->state & OBJ_INCOMPLETE)
if ((*object_state)->state & OBJ_INCONSISTENT)
{
this->inconsistent_objects--;
pg.inconsistent_objects.erase(oid);
if (!pg.inconsistent_objects.size())
{
pg.state = pg.state & ~PG_HAS_INCONSISTENT;
changed = true;
}
}
else if ((*object_state)->state & OBJ_INCOMPLETE)
{
// Successful write means that object is not incomplete anymore
this->incomplete_objects--;

View File

@ -532,7 +532,7 @@ void osd_t::check_corrupted_chained(pg_t & pg, osd_op_t *cur_op)
}
if (corrupted)
{
mark_object_corrupted(pg, cur_oid, op_data->chain_states[op_data->chain_reads[cri].chain_pos], stripes, false);
mark_object_corrupted(pg, cur_oid, op_data->chain_states[op_data->chain_reads[cri].chain_pos], stripes, false, false);
}
}
}

View File

@ -141,7 +141,7 @@ resume_3:
if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
{
// Mark object corrupted and retry
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, true);
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, true, false);
op_data->prev_set = op_data->object_state ? op_data->object_state->read_target.data() : pg.cur_set.data();
goto retry_1;
}

View File

@ -377,9 +377,13 @@ void osd_t::continue_primary_scrub(osd_op_t *cur_op)
{
n_copies++;
}
else if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size)
else
{
op_data->degraded = true;
op_data->stripes[role].missing = true;
if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size)
{
op_data->degraded = true;
}
}
}
if (n_copies <= op_data->pg_data_size)
@ -388,8 +392,7 @@ void osd_t::continue_primary_scrub(osd_op_t *cur_op)
finish_op(cur_op, 0);
return;
}
cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_size,
op_data->scheme != POOL_SCHEME_REPLICATED ? bs_block_size*(op_data->pg_size-op_data->pg_data_size) : 0);
cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_size, 0);
// Submit reads
osd_op_t *subops = new osd_op_t[n_copies];
op_data->fact_ver = 0;
@ -412,8 +415,15 @@ resume_2:
int n_copies = 0;
for (int role = 0; role < op_data->pg_size; role++)
{
if (op_data->stripes[role].read_end != 0 &&
!op_data->stripes[role].read_error)
if (op_data->stripes[role].read_error)
{
op_data->stripes[role].missing = true;
if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size)
{
op_data->degraded = true;
}
}
else if (!op_data->stripes[role].missing)
{
n_copies++;
}
@ -423,7 +433,7 @@ resume_2:
// Nothing to compare, just mark the object as corrupted
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
// FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false);
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false, false);
// Operation is treated as unsuccessful only if the object becomes unreadable
finish_op(cur_op, n_copies < op_data->pg_data_size ? op_data->errcode : 0);
return;
@ -436,6 +446,7 @@ resume_2:
return;
}
}
bool inconsistent = false;
if (op_data->scheme == POOL_SCHEME_REPLICATED)
{
// Check that all chunks have returned the same data
@ -475,7 +486,6 @@ resume_2:
}
if (best >= 0 && votes[best] < total)
{
// FIXME Add a flag to allow to skip such objects and not recover them automatically
bool unknown = false;
for (int role = 0; role < op_data->pg_size; role++)
{
@ -484,9 +494,10 @@ resume_2:
if (votes[role] > 0 && votes[role] < votes[best])
{
printf(
"[PG %u/%u] Object %lx:%lx copy on OSD %lu doesn't match %d other copies, marking it as corrupted\n",
"[PG %u/%u] Object %lx:%lx v%lu copy on OSD %lu doesn't match %d other copies, marking it as corrupted\n",
INODE_POOL(op_data->oid.inode), op_data->pg_num,
op_data->oid.inode, op_data->oid.stripe, op_data->stripes[role].osd_num, votes[best]
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver,
op_data->stripes[role].osd_num, votes[best]
);
op_data->stripes[role].read_error = true;
}
@ -494,63 +505,67 @@ resume_2:
if (unknown)
{
// It's unknown which replica is good. There are multiple versions with no majority
// Mark all good replicas as ambiguous
best = -1;
inconsistent = true;
printf(
"[PG %u/%u] Object %lx:%lx v%lu is inconsistent: copies don't match. Use vitastor-cli fix to fix it\n",
INODE_POOL(op_data->oid.inode), op_data->pg_num,
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver
);
}
}
}
else
{
assert(op_data->scheme == POOL_SCHEME_EC || op_data->scheme == POOL_SCHEME_XOR);
if (op_data->degraded)
auto good_subset = ec_find_good(
op_data->stripes, op_data->pg_size, op_data->pg_data_size, op_data->scheme == POOL_SCHEME_XOR,
bs_block_size, clean_entry_bitmap_size, scrub_ec_max_bruteforce
);
if (!good_subset.size())
{
// Reconstruct missing stripes
// XOR shouldn't come here as it only has 1 parity chunk
assert(op_data->scheme == POOL_SCHEME_EC);
reconstruct_stripes_ec(op_data->stripes, op_data->pg_size, op_data->pg_data_size, clean_entry_bitmap_size);
inconsistent = true;
printf(
"[PG %u/%u] Object %lx:%lx v%lu is inconsistent: parity chunks don't match data. Use vitastor-cli fix to fix it\n",
INODE_POOL(op_data->oid.inode), op_data->pg_num,
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver
);
}
// Generate parity chunks and compare them with actual data
osd_num_t fake_osd_set[op_data->pg_size];
for (int i = 0; i < op_data->pg_size; i++)
else
{
fake_osd_set[i] = 1;
op_data->stripes[i].write_buf = i >= op_data->pg_data_size
? ((uint8_t*)cur_op->buf + (i-op_data->pg_data_size)*bs_block_size)
: op_data->stripes[i].read_buf;
}
if (op_data->scheme == POOL_SCHEME_XOR)
{
calc_rmw_parity_xor(op_data->stripes, op_data->pg_size, fake_osd_set, fake_osd_set, bs_block_size, clean_entry_bitmap_size);
}
else if (op_data->scheme == POOL_SCHEME_EC)
{
calc_rmw_parity_ec(op_data->stripes, op_data->pg_size, op_data->pg_data_size, fake_osd_set, fake_osd_set, bs_block_size, clean_entry_bitmap_size);
}
// Now compare that write_buf == read_buf
for (int role = op_data->pg_data_size; role < op_data->pg_size; role++)
{
if (op_data->stripes[role].osd_num != 0 && !op_data->stripes[role].read_error &&
memcmp(op_data->stripes[role].read_buf, op_data->stripes[role].write_buf, bs_block_size) != 0)
for (int role = 0; role < op_data->pg_size; role++)
{
// Chunks don't match - something's wrong... but we don't know what :D
// FIXME: Try to locate errors (may be possible with >= 2 parity chunks)
printf(
"[PG %u/%u] Object %lx:%lx parity chunk %d on OSD %lu doesn't match data, marking it as corrupted\n",
INODE_POOL(op_data->oid.inode), op_data->pg_num,
op_data->oid.inode, op_data->oid.stripe,
role-op_data->pg_data_size, op_data->stripes[role].osd_num
);
op_data->stripes[role].read_error = true;
if (!op_data->stripes[role].missing)
op_data->stripes[role].read_error = true;
}
for (int role: good_subset)
{
op_data->stripes[role].read_error = false;
}
for (int role = 0; role < op_data->pg_size; role++)
{
if (!op_data->stripes[role].missing && op_data->stripes[role].read_error)
{
op_data->stripes[role].read_error = true;
printf(
"[PG %u/%u] Object %lx:%lx v%lu chunk %d on OSD %lu doesn't match data, marking it as corrupted\n",
INODE_POOL(op_data->oid.inode), op_data->pg_num,
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver,
role, op_data->stripes[role].osd_num
);
}
}
}
}
for (int role = 0; role < op_data->pg_size; role++)
{
if (op_data->stripes[role].osd_num != 0 && !op_data->stripes[role].read_error)
if (op_data->stripes[role].osd_num != 0 && op_data->stripes[role].read_error || inconsistent)
{
// Got at least 1 read error or mismatch, mark the object as corrupted
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
// FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false);
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false, inconsistent);
break;
}
}

View File

@ -3,9 +3,9 @@
#include "pg_states.h"
const int pg_state_bit_count = 16;
const int pg_state_bit_count = 17;
const int pg_state_bits[16] = {
const int pg_state_bits[17] = {
PG_STARTING,
PG_PEERING,
PG_INCOMPLETE,
@ -14,6 +14,7 @@ const int pg_state_bits[16] = {
PG_STOPPING,
PG_OFFLINE,
PG_DEGRADED,
PG_HAS_INCONSISTENT,
PG_HAS_CORRUPTED,
PG_HAS_INCOMPLETE,
PG_HAS_DEGRADED,
@ -24,7 +25,7 @@ const int pg_state_bits[16] = {
PG_SCRUBBING,
};
const char *pg_state_names[16] = {
const char *pg_state_names[17] = {
"starting",
"peering",
"incomplete",
@ -33,6 +34,7 @@ const char *pg_state_names[16] = {
"stopping",
"offline",
"degraded",
"has_inconsistent",
"has_corrupted",
"has_incomplete",
"has_degraded",
@ -42,3 +44,27 @@ const char *pg_state_names[16] = {
"left_on_dead",
"scrubbing",
};
// Number of entries in object_state_bits[] and object_state_names[]
const int object_state_bit_count = 8;
// Object state flags. The entry at index i is described by object_state_names[i],
// so both arrays must be kept in the same order.
// The trailing 0 entry is not a flag: it pairs with the "clean" name and is meant
// to be matched when no state bits are set at all.
const int object_state_bits[8] = {
OBJ_DEGRADED,
OBJ_INCOMPLETE,
OBJ_MISPLACED,
OBJ_CORRUPTED,
OBJ_INCONSISTENT,
OBJ_NEEDS_STABLE,
OBJ_NEEDS_ROLLBACK,
0,
};
// Human-readable names of object states, in the same order as object_state_bits[]
const char *object_state_names[8] = {
"degraded",
"incomplete",
"misplaced",
"corrupted",
"inconsistent",
"needs_stable",
"needs_rollback",
"clean",
};

View File

@ -23,8 +23,9 @@
#define PG_HAS_UNCLEAN (1<<11)
#define PG_HAS_INVALID (1<<12)
#define PG_HAS_CORRUPTED (1<<13)
#define PG_LEFT_ON_DEAD (1<<14)
#define PG_SCRUBBING (1<<15)
#define PG_HAS_INCONSISTENT (1<<14)
#define PG_LEFT_ON_DEAD (1<<15)
#define PG_SCRUBBING (1<<16)
// Lower bits that represent object role (EC 0/1/2... or always 0 with replication)
// 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size
@ -36,9 +37,16 @@
#define OBJ_MISPLACED 0x08
// OBJ_CORRUPTED is set together with one of OBJ_INCOMPLETE/OBJ_DEGRADED/OBJ_MISPLACED, or with OBJ_INCONSISTENT
#define OBJ_CORRUPTED 0x10
// OBJ_INCONSISTENT means that the object's copies (replicas or EC chunks) don't match each other
// and it's unclear which of them is correct.
// OBJ_INCONSISTENT may be set together with OBJ_CORRUPTED, but never with other states
#define OBJ_INCONSISTENT 0x20
#define OBJ_NEEDS_STABLE 0x10000
#define OBJ_NEEDS_ROLLBACK 0x20000
extern const int pg_state_bits[];
extern const char *pg_state_names[];
extern const int pg_state_bit_count;
extern const int object_state_bits[];
extern const char *object_state_names[];
extern const int object_state_bit_count;