Implement ambiguity detection during scrub

test-double-alloc
Vitaliy Filippov 2023-04-10 01:05:41 +03:00
parent 281be547eb
commit 6648f6bb6e
11 changed files with 282 additions and 97 deletions

View File

@ -115,6 +115,7 @@ const etcd_tree = {
scrub_queue_depth: 1, scrub_queue_depth: 1,
scrub_sleep: 0, // milliseconds scrub_sleep: 0, // milliseconds
scrub_list_limit: 1000, // objects to list on one scrub iteration scrub_list_limit: 1000, // objects to list on one scrub iteration
scrub_ec_max_bruteforce: 100, // maximum EC error locator brute-force iterations
// blockstore - fixed in superblock // blockstore - fixed in superblock
block_size, block_size,
disk_alignment, disk_alignment,
@ -273,8 +274,8 @@ const etcd_tree = {
<pg_id>: { <pg_id>: {
primary: osd_num_t, primary: osd_num_t,
state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"| state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
"degraded"|"has_corrupted"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"| "degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
"has_invalid"|"left_on_dead"|"scrubbing")[], "has_invalid"|"has_inconsistent"|"has_corrupted"|"left_on_dead"|"scrubbing")[],
} }
}, */ }, */
}, },

View File

@ -217,6 +217,9 @@ void osd_t::parse_config(bool init)
scrub_queue_depth = config["scrub_queue_depth"].uint64_value(); scrub_queue_depth = config["scrub_queue_depth"].uint64_value();
if (scrub_queue_depth < 1 || scrub_queue_depth > MAX_RECOVERY_QUEUE) if (scrub_queue_depth < 1 || scrub_queue_depth > MAX_RECOVERY_QUEUE)
scrub_queue_depth = 1; scrub_queue_depth = 1;
scrub_ec_max_bruteforce = config["scrub_ec_max_bruteforce"].uint64_value();
if (scrub_ec_max_bruteforce < 1)
scrub_ec_max_bruteforce = 100;
scrub_sleep_ms = config["scrub_sleep"].uint64_value(); scrub_sleep_ms = config["scrub_sleep"].uint64_value();
scrub_list_limit = config["scrub_list_limit"].uint64_value(); scrub_list_limit = config["scrub_list_limit"].uint64_value();
if (!scrub_list_limit) if (!scrub_list_limit)

View File

@ -120,6 +120,7 @@ class osd_t
uint64_t scrub_queue_depth = 1; uint64_t scrub_queue_depth = 1;
uint64_t scrub_sleep_ms = 0; uint64_t scrub_sleep_ms = 0;
uint32_t scrub_list_limit = 1000; uint32_t scrub_list_limit = 1000;
uint64_t scrub_ec_max_bruteforce = 100;
// cluster state // cluster state
@ -142,7 +143,7 @@ class osd_t
std::set<pool_pg_num_t> dirty_pgs; std::set<pool_pg_num_t> dirty_pgs;
std::set<osd_num_t> dirty_osds; std::set<osd_num_t> dirty_osds;
int copies_to_delete_after_sync_count = 0; int copies_to_delete_after_sync_count = 0;
uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0, corrupted_objects = 0; uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0, inconsistent_objects = 0, corrupted_objects = 0;
int peering_state = 0; int peering_state = 0;
std::map<object_id, osd_recovery_op_t> recovery_ops; std::map<object_id, osd_recovery_op_t> recovery_ops;
std::map<object_id, osd_op_t*> scrub_ops; std::map<object_id, osd_op_t*> scrub_ops;
@ -264,8 +265,11 @@ class osd_t
void continue_primary_sync(osd_op_t *cur_op); void continue_primary_sync(osd_op_t *cur_op);
void continue_primary_del(osd_op_t *cur_op); void continue_primary_del(osd_op_t *cur_op);
bool check_write_queue(osd_op_t *cur_op, pg_t & pg); bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
pg_osd_set_state_t* add_object_to_set(pg_t & pg, const object_id oid, const pg_osd_set_t & osd_set,
uint64_t old_pg_state, int log_at_level);
void remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t &pg, bool report = true); void remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t &pg, bool report = true);
pg_osd_set_state_t *mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state, osd_rmw_stripe_t *stripes, bool ref); pg_osd_set_state_t *mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state,
osd_rmw_stripe_t *stripes, bool ref, bool inconsistent);
void deref_object_state(pg_t & pg, pg_osd_set_state_t **object_state, bool deref); void deref_object_state(pg_t & pg, pg_osd_set_state_t **object_state, bool deref);
bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state); bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op); void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);

View File

@ -336,7 +336,7 @@ pg_osd_set_state_t* pg_t::add_object_to_state(const object_id oid, const uint64_
{ {
for (auto & o: osd_set) for (auto & o: osd_set)
{ {
if (!o.loc_bad) if (!(o.loc_bad & (LOC_OUTDATED | LOC_CORRUPTED)))
{ {
read_target.push_back(o.osd_num); read_target.push_back(o.osd_num);
} }
@ -356,7 +356,7 @@ pg_osd_set_state_t* pg_t::add_object_to_state(const object_id oid, const uint64_
} }
for (auto & o: osd_set) for (auto & o: osd_set)
{ {
if (!o.loc_bad) if (!(o.loc_bad & (LOC_OUTDATED | LOC_CORRUPTED)))
{ {
read_target[o.role] = o.osd_num; read_target[o.role] = o.osd_num;
} }
@ -374,7 +374,11 @@ pg_osd_set_state_t* pg_t::add_object_to_state(const object_id oid, const uint64_
{ {
it->second.object_count++; it->second.object_count++;
} }
if (state & OBJ_INCOMPLETE) if (state & OBJ_INCONSISTENT)
{
inconsistent_objects[oid] = &it->second;
}
else if (state & OBJ_INCOMPLETE)
{ {
incomplete_objects[oid] = &it->second; incomplete_objects[oid] = &it->second;
} }
@ -453,7 +457,8 @@ void pg_t::calc_object_states(int log_level)
std::to_string(loc.osd_num)+ std::to_string(loc.osd_num)+
(st.replicated ? "" : "("+std::to_string(loc.role)+")")+ (st.replicated ? "" : "("+std::to_string(loc.role)+")")+
(loc.loc_bad & LOC_OUTDATED ? "(old)" : "")+ (loc.loc_bad & LOC_OUTDATED ? "(old)" : "")+
(loc.loc_bad & LOC_CORRUPTED ? "(corrupted)" : ""); (loc.loc_bad & LOC_CORRUPTED ? "(corrupted)" : "")+
(loc.loc_bad & LOC_INCONSISTENT ? "(inconsistent)" : "");
} }
printf("[PG %u/%u] %lu objects on OSD set %s\n", pool_id, pg_num, stp.second.object_count, osd_set_desc.c_str()); printf("[PG %u/%u] %lu objects on OSD set %s\n", pool_id, pg_num, stp.second.object_count, osd_set_desc.c_str());
} }
@ -463,7 +468,7 @@ void pg_t::calc_object_states(int log_level)
void pg_t::print_state() void pg_t::print_state()
{ {
printf( printf(
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num, "[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
(state & PG_STARTING) ? "starting" : "", (state & PG_STARTING) ? "starting" : "",
(state & PG_OFFLINE) ? "offline" : "", (state & PG_OFFLINE) ? "offline" : "",
(state & PG_PEERING) ? "peering" : "", (state & PG_PEERING) ? "peering" : "",
@ -472,6 +477,7 @@ void pg_t::print_state()
(state & PG_REPEERING) ? "repeering" : "", (state & PG_REPEERING) ? "repeering" : "",
(state & PG_STOPPING) ? "stopping" : "", (state & PG_STOPPING) ? "stopping" : "",
(state & PG_DEGRADED) ? " + degraded" : "", (state & PG_DEGRADED) ? " + degraded" : "",
(state & PG_HAS_INCONSISTENT) ? " + has_inconsistent" : "",
(state & PG_HAS_CORRUPTED) ? " + has_corrupted" : "", (state & PG_HAS_CORRUPTED) ? " + has_corrupted" : "",
(state & PG_HAS_INCOMPLETE) ? " + has_incomplete" : "", (state & PG_HAS_INCOMPLETE) ? " + has_incomplete" : "",
(state & PG_HAS_DEGRADED) ? " + has_degraded" : "", (state & PG_HAS_DEGRADED) ? " + has_degraded" : "",

View File

@ -15,12 +15,13 @@
#define LOC_OUTDATED 1 #define LOC_OUTDATED 1
#define LOC_CORRUPTED 2 #define LOC_CORRUPTED 2
#define LOC_INCONSISTENT 4
struct pg_obj_loc_t struct pg_obj_loc_t
{ {
uint64_t role; uint64_t role;
osd_num_t osd_num; osd_num_t osd_num;
uint32_t loc_bad; // LOC_OUTDATED / LOC_CORRUPTED uint32_t loc_bad; // LOC_OUTDATED / LOC_CORRUPTED / LOC_INCONSISTENT
}; };
typedef std::vector<pg_obj_loc_t> pg_osd_set_t; typedef std::vector<pg_obj_loc_t> pg_osd_set_t;
@ -113,7 +114,7 @@ struct pg_t
// which is up to ~192 MB per 1 TB in the worst case scenario // which is up to ~192 MB per 1 TB in the worst case scenario
std::map<pg_osd_set_t, pg_osd_set_state_t> state_dict; std::map<pg_osd_set_t, pg_osd_set_state_t> state_dict;
uint64_t corrupted_count; uint64_t corrupted_count;
btree::btree_map<object_id, pg_osd_set_state_t*> incomplete_objects, misplaced_objects, degraded_objects; btree::btree_map<object_id, pg_osd_set_state_t*> inconsistent_objects, incomplete_objects, misplaced_objects, degraded_objects;
std::map<obj_piece_id_t, flush_action_t> flush_actions; std::map<obj_piece_id_t, flush_action_t> flush_actions;
std::vector<obj_ver_osd_t> copies_to_delete_after_sync; std::vector<obj_ver_osd_t> copies_to_delete_after_sync;
btree::btree_map<object_id, uint64_t> ver_override; btree::btree_map<object_id, uint64_t> ver_override;

View File

@ -255,7 +255,7 @@ resume_2:
// I/O or checksum error // I/O or checksum error
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num }); auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
// FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated // FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false); op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false, false);
goto resume_0; goto resume_0;
} }
finish_op(cur_op, op_data->errcode); finish_op(cur_op, op_data->errcode);
@ -296,7 +296,8 @@ resume_2:
finish_op(cur_op, cur_op->req.rw.len); finish_op(cur_op, cur_op->req.rw.len);
} }
pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state, osd_rmw_stripe_t *stripes, bool ref) pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state,
osd_rmw_stripe_t *stripes, bool ref, bool inconsistent)
{ {
pg_osd_set_state_t *object_state = NULL; pg_osd_set_state_t *object_state = NULL;
get_object_osd_set(pg, oid, &object_state); get_object_osd_set(pg, oid, &object_state);
@ -327,26 +328,24 @@ pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_os
} }
} }
// Mark object chunk(s) as corrupted // Mark object chunk(s) as corrupted
uint64_t has_roles = 0, n_roles = 0, n_copies = 0, n_corrupted = 0; int changes = 0;
for (auto & chunk: corrupted_set) for (auto & chunk: corrupted_set)
{ {
bool corrupted = stripes[chunk.role].osd_num == chunk.osd_num && stripes[chunk.role].read_error; bool corrupted = stripes[chunk.role].osd_num == chunk.osd_num && stripes[chunk.role].read_error;
if (corrupted && !(chunk.loc_bad & LOC_CORRUPTED)) if (corrupted)
n_corrupted++;
chunk.loc_bad = chunk.loc_bad | (corrupted ? LOC_CORRUPTED : 0);
if (!chunk.loc_bad)
{ {
if (pg.scheme == POOL_SCHEME_REPLICATED) if (!(chunk.loc_bad & LOC_CORRUPTED))
n_roles = 1; changes++;
else if (!(has_roles & (1 << chunk.role))) chunk.loc_bad |= LOC_CORRUPTED;
{ }
n_roles++; else if (inconsistent && !(chunk.loc_bad & LOC_OUTDATED))
has_roles |= (1 << chunk.role); {
} if (!(chunk.loc_bad & LOC_INCONSISTENT))
n_copies++; changes++;
chunk.loc_bad |= LOC_INCONSISTENT;
} }
} }
if (!n_corrupted) if (!changes)
{ {
// No chunks newly marked as corrupted - object is already marked or moved // No chunks newly marked as corrupted - object is already marked or moved
return object_state; return object_state;
@ -357,17 +356,82 @@ pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_os
remove_object_from_state(oid, &object_state, pg, false); remove_object_from_state(oid, &object_state, pg, false);
deref_object_state(pg, &object_state, ref); deref_object_state(pg, &object_state, ref);
} }
// Calculate object state // Insert object into the new state and retry
uint64_t obj_state = OBJ_CORRUPTED; object_state = add_object_to_set(pg, oid, corrupted_set, old_pg_state, 2);
int pg_state_bits = PG_HAS_CORRUPTED; if (ref)
this->corrupted_objects++;
pg.corrupted_count++;
if (log_level > 1)
{ {
printf("Marking object %lx:%lx corrupted: %lu chunks / %lu copies available, %lu corrupted\n", object_state->ref_count++;
oid.inode, oid.stripe, n_roles, n_copies, n_corrupted);
} }
if (n_roles < pg.pg_data_size) return object_state;
}
pg_osd_set_state_t* osd_t::add_object_to_set(pg_t & pg, const object_id oid, const pg_osd_set_t & osd_set,
uint64_t old_pg_state, int log_at_level)
{
// Object state will be calculated from <osd_set>
uint64_t has_roles = 0, n_roles = 0, n_copies = 0, n_invalid = 0, n_outdated = 0,
n_misplaced = 0, n_corrupted = 0, n_inconsistent = 0;
for (auto & chunk: osd_set)
{
if (chunk.role >= (pg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size))
{
n_invalid++;
}
else if (chunk.loc_bad & LOC_OUTDATED)
{
n_outdated++;
}
else
{
if (chunk.loc_bad & LOC_INCONSISTENT)
{
n_inconsistent++;
}
if (chunk.loc_bad & LOC_CORRUPTED)
{
n_corrupted++;
}
else if (pg.scheme == POOL_SCHEME_REPLICATED)
{
n_roles = 1;
int i;
for (i = 0; i < pg.cur_set.size() && pg.cur_set[i] != chunk.osd_num; i++) {}
if (i == pg.cur_set.size())
{
n_misplaced++;
}
}
else
{
if (!(has_roles & (1 << chunk.role)))
{
n_roles++;
has_roles |= (1 << chunk.role);
}
if (pg.cur_set[chunk.role] != chunk.osd_num)
{
n_misplaced++;
}
}
n_copies++;
}
}
uint64_t obj_state = 0;
int pg_state_bits = 0;
if (n_corrupted > 0)
{
this->corrupted_objects++;
pg.corrupted_count++;
obj_state |= OBJ_CORRUPTED;
pg_state_bits |= PG_HAS_CORRUPTED;
}
if (n_invalid > 0 || n_inconsistent > 0)
{
this->inconsistent_objects++;
obj_state |= OBJ_INCONSISTENT;
pg_state_bits |= PG_HAS_INCONSISTENT;
}
else if (n_roles < pg.pg_data_size)
{ {
this->incomplete_objects++; this->incomplete_objects++;
obj_state |= OBJ_INCOMPLETE; obj_state |= OBJ_INCOMPLETE;
@ -379,12 +443,52 @@ pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_os
obj_state |= OBJ_DEGRADED; obj_state |= OBJ_DEGRADED;
pg_state_bits = PG_HAS_DEGRADED; pg_state_bits = PG_HAS_DEGRADED;
} }
else else if (n_misplaced > 0 || n_outdated > 0)
{ {
this->misplaced_objects++; this->misplaced_objects++;
obj_state |= OBJ_MISPLACED; obj_state |= OBJ_MISPLACED;
pg_state_bits = PG_HAS_MISPLACED; pg_state_bits = PG_HAS_MISPLACED;
} }
if (this->log_level >= log_at_level)
{
printf("Marking object %lx:%lx ", oid.inode, oid.stripe);
for (int i = 0, j = 0; i < object_state_bit_count; i++)
{
if ((obj_state & object_state_bits[i]) || object_state_bits[i] == 0 && obj_state == 0)
{
printf((j++) ? "+%s" : "%s", object_state_names[i]);
}
}
if (pg.scheme == POOL_SCHEME_REPLICATED)
{
printf(": %lu copies available", n_copies);
}
else
{
printf(": %lu parts / %lu copies available", n_roles, n_copies);
}
if (n_invalid > 0)
{
printf(", %lu invalid", n_invalid);
}
if (n_outdated > 0)
{
printf(", %lu outdated", n_outdated);
}
if (n_misplaced > 0)
{
printf(", %lu misplaced", n_misplaced);
}
if (n_corrupted > 0)
{
printf(", %lu corrupted", n_corrupted);
}
if (n_inconsistent > 0)
{
printf(", %lu inconsistent", n_inconsistent);
}
printf("\n");
}
pg.state |= pg_state_bits; pg.state |= pg_state_bits;
if (pg.state != old_pg_state) if (pg.state != old_pg_state)
{ {
@ -403,11 +507,13 @@ pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_os
ringloop->wakeup(); ringloop->wakeup();
} }
} }
if (!obj_state)
{
// Object is clean
return NULL;
}
// Insert object into the new state and retry // Insert object into the new state and retry
object_state = pg.add_object_to_state(oid, obj_state, corrupted_set); return pg.add_object_to_state(oid, obj_state, osd_set);
if (ref)
object_state->ref_count++;
return object_state;
} }
// Decrement pg_osd_set_state_t's object_count and change PG state accordingly // Decrement pg_osd_set_state_t's object_count and change PG state accordingly
@ -426,14 +532,29 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **objec
*object_state = recheck_state; *object_state = recheck_state;
return; return;
} }
bool changed = false;
(*object_state)->object_count--; (*object_state)->object_count--;
if ((*object_state)->state & OBJ_CORRUPTED) if ((*object_state)->state & OBJ_CORRUPTED)
{ {
this->corrupted_objects--; this->corrupted_objects--;
pg.corrupted_count--; pg.corrupted_count--;
if (!pg.corrupted_count)
{
pg.state = pg.state & ~PG_HAS_CORRUPTED;
changed = true;
}
} }
bool changed = false; if ((*object_state)->state & OBJ_INCONSISTENT)
if ((*object_state)->state & OBJ_INCOMPLETE) {
this->inconsistent_objects--;
pg.inconsistent_objects.erase(oid);
if (!pg.inconsistent_objects.size())
{
pg.state = pg.state & ~PG_HAS_INCONSISTENT;
changed = true;
}
}
else if ((*object_state)->state & OBJ_INCOMPLETE)
{ {
// Successful write means that object is not incomplete anymore // Successful write means that object is not incomplete anymore
this->incomplete_objects--; this->incomplete_objects--;

View File

@ -532,7 +532,7 @@ void osd_t::check_corrupted_chained(pg_t & pg, osd_op_t *cur_op)
} }
if (corrupted) if (corrupted)
{ {
mark_object_corrupted(pg, cur_oid, op_data->chain_states[op_data->chain_reads[cri].chain_pos], stripes, false); mark_object_corrupted(pg, cur_oid, op_data->chain_states[op_data->chain_reads[cri].chain_pos], stripes, false, false);
} }
} }
} }

View File

@ -141,7 +141,7 @@ resume_3:
if (op_data->errcode == -EIO || op_data->errcode == -EDOM) if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
{ {
// Mark object corrupted and retry // Mark object corrupted and retry
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, true); op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, true, false);
op_data->prev_set = op_data->object_state ? op_data->object_state->read_target.data() : pg.cur_set.data(); op_data->prev_set = op_data->object_state ? op_data->object_state->read_target.data() : pg.cur_set.data();
goto retry_1; goto retry_1;
} }

View File

@ -377,9 +377,13 @@ void osd_t::continue_primary_scrub(osd_op_t *cur_op)
{ {
n_copies++; n_copies++;
} }
else if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size) else
{ {
op_data->degraded = true; op_data->stripes[role].missing = true;
if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size)
{
op_data->degraded = true;
}
} }
} }
if (n_copies <= op_data->pg_data_size) if (n_copies <= op_data->pg_data_size)
@ -388,8 +392,7 @@ void osd_t::continue_primary_scrub(osd_op_t *cur_op)
finish_op(cur_op, 0); finish_op(cur_op, 0);
return; return;
} }
cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_size, cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_size, 0);
op_data->scheme != POOL_SCHEME_REPLICATED ? bs_block_size*(op_data->pg_size-op_data->pg_data_size) : 0);
// Submit reads // Submit reads
osd_op_t *subops = new osd_op_t[n_copies]; osd_op_t *subops = new osd_op_t[n_copies];
op_data->fact_ver = 0; op_data->fact_ver = 0;
@ -412,8 +415,15 @@ resume_2:
int n_copies = 0; int n_copies = 0;
for (int role = 0; role < op_data->pg_size; role++) for (int role = 0; role < op_data->pg_size; role++)
{ {
if (op_data->stripes[role].read_end != 0 && if (op_data->stripes[role].read_error)
!op_data->stripes[role].read_error) {
op_data->stripes[role].missing = true;
if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size)
{
op_data->degraded = true;
}
}
else if (!op_data->stripes[role].missing)
{ {
n_copies++; n_copies++;
} }
@ -423,7 +433,7 @@ resume_2:
// Nothing to compare, just mark the object as corrupted // Nothing to compare, just mark the object as corrupted
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num }); auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
// FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated // FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false); op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false, false);
// Operation is treated as unsuccessful only if the object becomes unreadable // Operation is treated as unsuccessful only if the object becomes unreadable
finish_op(cur_op, n_copies < op_data->pg_data_size ? op_data->errcode : 0); finish_op(cur_op, n_copies < op_data->pg_data_size ? op_data->errcode : 0);
return; return;
@ -436,6 +446,7 @@ resume_2:
return; return;
} }
} }
bool inconsistent = false;
if (op_data->scheme == POOL_SCHEME_REPLICATED) if (op_data->scheme == POOL_SCHEME_REPLICATED)
{ {
// Check that all chunks have returned the same data // Check that all chunks have returned the same data
@ -475,7 +486,6 @@ resume_2:
} }
if (best >= 0 && votes[best] < total) if (best >= 0 && votes[best] < total)
{ {
// FIXME Add a flag to allow to skip such objects and not recover them automatically
bool unknown = false; bool unknown = false;
for (int role = 0; role < op_data->pg_size; role++) for (int role = 0; role < op_data->pg_size; role++)
{ {
@ -484,9 +494,10 @@ resume_2:
if (votes[role] > 0 && votes[role] < votes[best]) if (votes[role] > 0 && votes[role] < votes[best])
{ {
printf( printf(
"[PG %u/%u] Object %lx:%lx copy on OSD %lu doesn't match %d other copies, marking it as corrupted\n", "[PG %u/%u] Object %lx:%lx v%lu copy on OSD %lu doesn't match %d other copies, marking it as corrupted\n",
INODE_POOL(op_data->oid.inode), op_data->pg_num, INODE_POOL(op_data->oid.inode), op_data->pg_num,
op_data->oid.inode, op_data->oid.stripe, op_data->stripes[role].osd_num, votes[best] op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver,
op_data->stripes[role].osd_num, votes[best]
); );
op_data->stripes[role].read_error = true; op_data->stripes[role].read_error = true;
} }
@ -494,63 +505,67 @@ resume_2:
if (unknown) if (unknown)
{ {
// It's unknown which replica is good. There are multiple versions with no majority // It's unknown which replica is good. There are multiple versions with no majority
// Mark all good replicas as ambiguous
best = -1; best = -1;
inconsistent = true;
printf(
"[PG %u/%u] Object %lx:%lx v%lu is inconsistent: copies don't match. Use vitastor-cli fix to fix it\n",
INODE_POOL(op_data->oid.inode), op_data->pg_num,
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver
);
} }
} }
} }
else else
{ {
assert(op_data->scheme == POOL_SCHEME_EC || op_data->scheme == POOL_SCHEME_XOR); assert(op_data->scheme == POOL_SCHEME_EC || op_data->scheme == POOL_SCHEME_XOR);
if (op_data->degraded) auto good_subset = ec_find_good(
op_data->stripes, op_data->pg_size, op_data->pg_data_size, op_data->scheme == POOL_SCHEME_XOR,
bs_block_size, clean_entry_bitmap_size, scrub_ec_max_bruteforce
);
if (!good_subset.size())
{ {
// Reconstruct missing stripes inconsistent = true;
// XOR shouldn't come here as it only has 1 parity chunk printf(
assert(op_data->scheme == POOL_SCHEME_EC); "[PG %u/%u] Object %lx:%lx v%lu is inconsistent: parity chunks don't match data. Use vitastor-cli fix to fix it\n",
reconstruct_stripes_ec(op_data->stripes, op_data->pg_size, op_data->pg_data_size, clean_entry_bitmap_size); INODE_POOL(op_data->oid.inode), op_data->pg_num,
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver
);
} }
// Generate parity chunks and compare them with actual data else
osd_num_t fake_osd_set[op_data->pg_size];
for (int i = 0; i < op_data->pg_size; i++)
{ {
fake_osd_set[i] = 1; for (int role = 0; role < op_data->pg_size; role++)
op_data->stripes[i].write_buf = i >= op_data->pg_data_size
? ((uint8_t*)cur_op->buf + (i-op_data->pg_data_size)*bs_block_size)
: op_data->stripes[i].read_buf;
}
if (op_data->scheme == POOL_SCHEME_XOR)
{
calc_rmw_parity_xor(op_data->stripes, op_data->pg_size, fake_osd_set, fake_osd_set, bs_block_size, clean_entry_bitmap_size);
}
else if (op_data->scheme == POOL_SCHEME_EC)
{
calc_rmw_parity_ec(op_data->stripes, op_data->pg_size, op_data->pg_data_size, fake_osd_set, fake_osd_set, bs_block_size, clean_entry_bitmap_size);
}
// Now compare that write_buf == read_buf
for (int role = op_data->pg_data_size; role < op_data->pg_size; role++)
{
if (op_data->stripes[role].osd_num != 0 && !op_data->stripes[role].read_error &&
memcmp(op_data->stripes[role].read_buf, op_data->stripes[role].write_buf, bs_block_size) != 0)
{ {
// Chunks don't match - something's wrong... but we don't know what :D if (!op_data->stripes[role].missing)
// FIXME: Try to locate errors (may be possible with >= 2 parity chunks) op_data->stripes[role].read_error = true;
printf( }
"[PG %u/%u] Object %lx:%lx parity chunk %d on OSD %lu doesn't match data, marking it as corrupted\n", for (int role: good_subset)
INODE_POOL(op_data->oid.inode), op_data->pg_num, {
op_data->oid.inode, op_data->oid.stripe, op_data->stripes[role].read_error = false;
role-op_data->pg_data_size, op_data->stripes[role].osd_num }
); for (int role = 0; role < op_data->pg_size; role++)
op_data->stripes[role].read_error = true; {
if (!op_data->stripes[role].missing && op_data->stripes[role].read_error)
{
op_data->stripes[role].read_error = true;
printf(
"[PG %u/%u] Object %lx:%lx v%lu chunk %d on OSD %lu doesn't match data, marking it as corrupted\n",
INODE_POOL(op_data->oid.inode), op_data->pg_num,
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver,
role, op_data->stripes[role].osd_num
);
}
} }
} }
} }
for (int role = 0; role < op_data->pg_size; role++) for (int role = 0; role < op_data->pg_size; role++)
{ {
if (op_data->stripes[role].osd_num != 0 && !op_data->stripes[role].read_error) if (op_data->stripes[role].osd_num != 0 && op_data->stripes[role].read_error || inconsistent)
{ {
// Got at least 1 read error or mismatch, mark the object as corrupted // Got at least 1 read error or mismatch, mark the object as corrupted
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num }); auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
// FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated // FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false); op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false, inconsistent);
break; break;
} }
} }

View File

@ -3,9 +3,9 @@
#include "pg_states.h" #include "pg_states.h"
const int pg_state_bit_count = 16; const int pg_state_bit_count = 17;
const int pg_state_bits[16] = { const int pg_state_bits[17] = {
PG_STARTING, PG_STARTING,
PG_PEERING, PG_PEERING,
PG_INCOMPLETE, PG_INCOMPLETE,
@ -14,6 +14,7 @@ const int pg_state_bits[16] = {
PG_STOPPING, PG_STOPPING,
PG_OFFLINE, PG_OFFLINE,
PG_DEGRADED, PG_DEGRADED,
PG_HAS_INCONSISTENT,
PG_HAS_CORRUPTED, PG_HAS_CORRUPTED,
PG_HAS_INCOMPLETE, PG_HAS_INCOMPLETE,
PG_HAS_DEGRADED, PG_HAS_DEGRADED,
@ -24,7 +25,7 @@ const int pg_state_bits[16] = {
PG_SCRUBBING, PG_SCRUBBING,
}; };
const char *pg_state_names[16] = { const char *pg_state_names[17] = {
"starting", "starting",
"peering", "peering",
"incomplete", "incomplete",
@ -33,6 +34,7 @@ const char *pg_state_names[16] = {
"stopping", "stopping",
"offline", "offline",
"degraded", "degraded",
"has_inconsistent",
"has_corrupted", "has_corrupted",
"has_incomplete", "has_incomplete",
"has_degraded", "has_degraded",
@ -42,3 +44,27 @@ const char *pg_state_names[16] = {
"left_on_dead", "left_on_dead",
"scrubbing", "scrubbing",
}; };
const int object_state_bit_count = 8;
const int object_state_bits[8] = {
OBJ_DEGRADED,
OBJ_INCOMPLETE,
OBJ_MISPLACED,
OBJ_CORRUPTED,
OBJ_INCONSISTENT,
OBJ_NEEDS_STABLE,
OBJ_NEEDS_ROLLBACK,
0,
};
const char *object_state_names[8] = {
"degraded",
"incomplete",
"misplaced",
"corrupted",
"inconsistent",
"needs_stable",
"needs_rollback",
"clean",
};

View File

@ -23,8 +23,9 @@
#define PG_HAS_UNCLEAN (1<<11) #define PG_HAS_UNCLEAN (1<<11)
#define PG_HAS_INVALID (1<<12) #define PG_HAS_INVALID (1<<12)
#define PG_HAS_CORRUPTED (1<<13) #define PG_HAS_CORRUPTED (1<<13)
#define PG_LEFT_ON_DEAD (1<<14) #define PG_HAS_INCONSISTENT (1<<14)
#define PG_SCRUBBING (1<<15) #define PG_LEFT_ON_DEAD (1<<15)
#define PG_SCRUBBING (1<<16)
// Lower bits that represent object role (EC 0/1/2... or always 0 with replication) // Lower bits that represent object role (EC 0/1/2... or always 0 with replication)
// 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size // 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size
@ -36,9 +37,16 @@
#define OBJ_MISPLACED 0x08 #define OBJ_MISPLACED 0x08
// OBJ_CORRUPTED is always set with one of OBJ_INCOMPLETE/OBJ_DEGRADED/OBJ_MISPLACED // OBJ_CORRUPTED is always set with one of OBJ_INCOMPLETE/OBJ_DEGRADED/OBJ_MISPLACED
#define OBJ_CORRUPTED 0x10 #define OBJ_CORRUPTED 0x10
// OBJ_INCONSISTENT means that the copies of the object (replicas, or EC data and parity chunks)
// don't match each other, but it's unclear which of them is correct
// OBJ_INCONSISTENT may be set together with OBJ_CORRUPTED, but never with the other states
#define OBJ_INCONSISTENT 0x20
#define OBJ_NEEDS_STABLE 0x10000 #define OBJ_NEEDS_STABLE 0x10000
#define OBJ_NEEDS_ROLLBACK 0x20000 #define OBJ_NEEDS_ROLLBACK 0x20000
extern const int pg_state_bits[]; extern const int pg_state_bits[];
extern const char *pg_state_names[]; extern const char *pg_state_names[];
extern const int pg_state_bit_count; extern const int pg_state_bit_count;
extern const int object_state_bits[];
extern const char *object_state_names[];
extern const int object_state_bit_count;