Implement ambiguity detection during scrub
parent
281be547eb
commit
6648f6bb6e
|
@ -115,6 +115,7 @@ const etcd_tree = {
|
|||
scrub_queue_depth: 1,
|
||||
scrub_sleep: 0, // milliseconds
|
||||
scrub_list_limit: 1000, // objects to list on one scrub iteration
|
||||
scrub_ec_max_bruteforce: 100, // maximum EC error locator brute-force iterations
|
||||
// blockstore - fixed in superblock
|
||||
block_size,
|
||||
disk_alignment,
|
||||
|
@ -273,8 +274,8 @@ const etcd_tree = {
|
|||
<pg_id>: {
|
||||
primary: osd_num_t,
|
||||
state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
|
||||
"degraded"|"has_corrupted"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
|
||||
"has_invalid"|"left_on_dead"|"scrubbing")[],
|
||||
"degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
|
||||
"has_invalid"|"has_inconsistent"|"has_corrupted"|"left_on_dead"|"scrubbing")[],
|
||||
}
|
||||
}, */
|
||||
},
|
||||
|
|
|
@ -217,6 +217,9 @@ void osd_t::parse_config(bool init)
|
|||
scrub_queue_depth = config["scrub_queue_depth"].uint64_value();
|
||||
if (scrub_queue_depth < 1 || scrub_queue_depth > MAX_RECOVERY_QUEUE)
|
||||
scrub_queue_depth = 1;
|
||||
scrub_ec_max_bruteforce = config["scrub_ec_max_bruteforce"].uint64_value();
|
||||
if (scrub_ec_max_bruteforce < 1)
|
||||
scrub_ec_max_bruteforce = 100;
|
||||
scrub_sleep_ms = config["scrub_sleep"].uint64_value();
|
||||
scrub_list_limit = config["scrub_list_limit"].uint64_value();
|
||||
if (!scrub_list_limit)
|
||||
|
|
|
@ -120,6 +120,7 @@ class osd_t
|
|||
uint64_t scrub_queue_depth = 1;
|
||||
uint64_t scrub_sleep_ms = 0;
|
||||
uint32_t scrub_list_limit = 1000;
|
||||
uint64_t scrub_ec_max_bruteforce = 100;
|
||||
|
||||
// cluster state
|
||||
|
||||
|
@ -142,7 +143,7 @@ class osd_t
|
|||
std::set<pool_pg_num_t> dirty_pgs;
|
||||
std::set<osd_num_t> dirty_osds;
|
||||
int copies_to_delete_after_sync_count = 0;
|
||||
uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0, corrupted_objects = 0;
|
||||
uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0, inconsistent_objects = 0, corrupted_objects = 0;
|
||||
int peering_state = 0;
|
||||
std::map<object_id, osd_recovery_op_t> recovery_ops;
|
||||
std::map<object_id, osd_op_t*> scrub_ops;
|
||||
|
@ -264,8 +265,11 @@ class osd_t
|
|||
void continue_primary_sync(osd_op_t *cur_op);
|
||||
void continue_primary_del(osd_op_t *cur_op);
|
||||
bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
|
||||
pg_osd_set_state_t* add_object_to_set(pg_t & pg, const object_id oid, const pg_osd_set_t & osd_set,
|
||||
uint64_t old_pg_state, int log_at_level);
|
||||
void remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t &pg, bool report = true);
|
||||
pg_osd_set_state_t *mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state, osd_rmw_stripe_t *stripes, bool ref);
|
||||
pg_osd_set_state_t *mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state,
|
||||
osd_rmw_stripe_t *stripes, bool ref, bool inconsistent);
|
||||
void deref_object_state(pg_t & pg, pg_osd_set_state_t **object_state, bool deref);
|
||||
bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
|
||||
void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);
|
||||
|
|
|
@ -336,7 +336,7 @@ pg_osd_set_state_t* pg_t::add_object_to_state(const object_id oid, const uint64_
|
|||
{
|
||||
for (auto & o: osd_set)
|
||||
{
|
||||
if (!o.loc_bad)
|
||||
if (!(o.loc_bad & (LOC_OUTDATED | LOC_CORRUPTED)))
|
||||
{
|
||||
read_target.push_back(o.osd_num);
|
||||
}
|
||||
|
@ -356,7 +356,7 @@ pg_osd_set_state_t* pg_t::add_object_to_state(const object_id oid, const uint64_
|
|||
}
|
||||
for (auto & o: osd_set)
|
||||
{
|
||||
if (!o.loc_bad)
|
||||
if (!(o.loc_bad & (LOC_OUTDATED | LOC_CORRUPTED)))
|
||||
{
|
||||
read_target[o.role] = o.osd_num;
|
||||
}
|
||||
|
@ -374,7 +374,11 @@ pg_osd_set_state_t* pg_t::add_object_to_state(const object_id oid, const uint64_
|
|||
{
|
||||
it->second.object_count++;
|
||||
}
|
||||
if (state & OBJ_INCOMPLETE)
|
||||
if (state & OBJ_INCONSISTENT)
|
||||
{
|
||||
inconsistent_objects[oid] = &it->second;
|
||||
}
|
||||
else if (state & OBJ_INCOMPLETE)
|
||||
{
|
||||
incomplete_objects[oid] = &it->second;
|
||||
}
|
||||
|
@ -453,7 +457,8 @@ void pg_t::calc_object_states(int log_level)
|
|||
std::to_string(loc.osd_num)+
|
||||
(st.replicated ? "" : "("+std::to_string(loc.role)+")")+
|
||||
(loc.loc_bad & LOC_OUTDATED ? "(old)" : "")+
|
||||
(loc.loc_bad & LOC_CORRUPTED ? "(corrupted)" : "");
|
||||
(loc.loc_bad & LOC_CORRUPTED ? "(corrupted)" : "")+
|
||||
(loc.loc_bad & LOC_INCONSISTENT ? "(inconsistent)" : "");
|
||||
}
|
||||
printf("[PG %u/%u] %lu objects on OSD set %s\n", pool_id, pg_num, stp.second.object_count, osd_set_desc.c_str());
|
||||
}
|
||||
|
@ -463,7 +468,7 @@ void pg_t::calc_object_states(int log_level)
|
|||
void pg_t::print_state()
|
||||
{
|
||||
printf(
|
||||
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
|
||||
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
|
||||
(state & PG_STARTING) ? "starting" : "",
|
||||
(state & PG_OFFLINE) ? "offline" : "",
|
||||
(state & PG_PEERING) ? "peering" : "",
|
||||
|
@ -472,6 +477,7 @@ void pg_t::print_state()
|
|||
(state & PG_REPEERING) ? "repeering" : "",
|
||||
(state & PG_STOPPING) ? "stopping" : "",
|
||||
(state & PG_DEGRADED) ? " + degraded" : "",
|
||||
(state & PG_HAS_INCONSISTENT) ? " + has_inconsistent" : "",
|
||||
(state & PG_HAS_CORRUPTED) ? " + has_corrupted" : "",
|
||||
(state & PG_HAS_INCOMPLETE) ? " + has_incomplete" : "",
|
||||
(state & PG_HAS_DEGRADED) ? " + has_degraded" : "",
|
||||
|
|
|
@ -15,12 +15,13 @@
|
|||
|
||||
#define LOC_OUTDATED 1
|
||||
#define LOC_CORRUPTED 2
|
||||
#define LOC_INCONSISTENT 4
|
||||
|
||||
struct pg_obj_loc_t
|
||||
{
|
||||
uint64_t role;
|
||||
osd_num_t osd_num;
|
||||
uint32_t loc_bad; // LOC_OUTDATED / LOC_CORRUPTED
|
||||
uint32_t loc_bad; // LOC_OUTDATED / LOC_CORRUPTED / LOC_INCONSISTENT
|
||||
};
|
||||
|
||||
typedef std::vector<pg_obj_loc_t> pg_osd_set_t;
|
||||
|
@ -113,7 +114,7 @@ struct pg_t
|
|||
// which is up to ~192 MB per 1 TB in the worst case scenario
|
||||
std::map<pg_osd_set_t, pg_osd_set_state_t> state_dict;
|
||||
uint64_t corrupted_count;
|
||||
btree::btree_map<object_id, pg_osd_set_state_t*> incomplete_objects, misplaced_objects, degraded_objects;
|
||||
btree::btree_map<object_id, pg_osd_set_state_t*> inconsistent_objects, incomplete_objects, misplaced_objects, degraded_objects;
|
||||
std::map<obj_piece_id_t, flush_action_t> flush_actions;
|
||||
std::vector<obj_ver_osd_t> copies_to_delete_after_sync;
|
||||
btree::btree_map<object_id, uint64_t> ver_override;
|
||||
|
|
|
@ -255,7 +255,7 @@ resume_2:
|
|||
// I/O or checksum error
|
||||
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
|
||||
// FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
|
||||
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false);
|
||||
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false, false);
|
||||
goto resume_0;
|
||||
}
|
||||
finish_op(cur_op, op_data->errcode);
|
||||
|
@ -296,7 +296,8 @@ resume_2:
|
|||
finish_op(cur_op, cur_op->req.rw.len);
|
||||
}
|
||||
|
||||
pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state, osd_rmw_stripe_t *stripes, bool ref)
|
||||
pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state,
|
||||
osd_rmw_stripe_t *stripes, bool ref, bool inconsistent)
|
||||
{
|
||||
pg_osd_set_state_t *object_state = NULL;
|
||||
get_object_osd_set(pg, oid, &object_state);
|
||||
|
@ -327,26 +328,24 @@ pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_os
|
|||
}
|
||||
}
|
||||
// Mark object chunk(s) as corrupted
|
||||
uint64_t has_roles = 0, n_roles = 0, n_copies = 0, n_corrupted = 0;
|
||||
int changes = 0;
|
||||
for (auto & chunk: corrupted_set)
|
||||
{
|
||||
bool corrupted = stripes[chunk.role].osd_num == chunk.osd_num && stripes[chunk.role].read_error;
|
||||
if (corrupted && !(chunk.loc_bad & LOC_CORRUPTED))
|
||||
n_corrupted++;
|
||||
chunk.loc_bad = chunk.loc_bad | (corrupted ? LOC_CORRUPTED : 0);
|
||||
if (!chunk.loc_bad)
|
||||
if (corrupted)
|
||||
{
|
||||
if (pg.scheme == POOL_SCHEME_REPLICATED)
|
||||
n_roles = 1;
|
||||
else if (!(has_roles & (1 << chunk.role)))
|
||||
{
|
||||
n_roles++;
|
||||
has_roles |= (1 << chunk.role);
|
||||
}
|
||||
n_copies++;
|
||||
if (!(chunk.loc_bad & LOC_CORRUPTED))
|
||||
changes++;
|
||||
chunk.loc_bad |= LOC_CORRUPTED;
|
||||
}
|
||||
else if (inconsistent && !(chunk.loc_bad & LOC_OUTDATED))
|
||||
{
|
||||
if (!(chunk.loc_bad & LOC_INCONSISTENT))
|
||||
changes++;
|
||||
chunk.loc_bad |= LOC_INCONSISTENT;
|
||||
}
|
||||
}
|
||||
if (!n_corrupted)
|
||||
if (!changes)
|
||||
{
|
||||
// No chunks newly marked as corrupted or inconsistent - object is already marked or moved
|
||||
return object_state;
|
||||
|
@ -357,17 +356,82 @@ pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_os
|
|||
remove_object_from_state(oid, &object_state, pg, false);
|
||||
deref_object_state(pg, &object_state, ref);
|
||||
}
|
||||
// Calculate object state
|
||||
uint64_t obj_state = OBJ_CORRUPTED;
|
||||
int pg_state_bits = PG_HAS_CORRUPTED;
|
||||
this->corrupted_objects++;
|
||||
pg.corrupted_count++;
|
||||
if (log_level > 1)
|
||||
// Insert object into the new state and retry
|
||||
object_state = add_object_to_set(pg, oid, corrupted_set, old_pg_state, 2);
|
||||
if (ref)
|
||||
{
|
||||
printf("Marking object %lx:%lx corrupted: %lu chunks / %lu copies available, %lu corrupted\n",
|
||||
oid.inode, oid.stripe, n_roles, n_copies, n_corrupted);
|
||||
object_state->ref_count++;
|
||||
}
|
||||
if (n_roles < pg.pg_data_size)
|
||||
return object_state;
|
||||
}
|
||||
|
||||
pg_osd_set_state_t* osd_t::add_object_to_set(pg_t & pg, const object_id oid, const pg_osd_set_t & osd_set,
|
||||
uint64_t old_pg_state, int log_at_level)
|
||||
{
|
||||
// Object state will be calculated from <osd_set>
|
||||
uint64_t has_roles = 0, n_roles = 0, n_copies = 0, n_invalid = 0, n_outdated = 0,
|
||||
n_misplaced = 0, n_corrupted = 0, n_inconsistent = 0;
|
||||
for (auto & chunk: osd_set)
|
||||
{
|
||||
if (chunk.role >= (pg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size))
|
||||
{
|
||||
n_invalid++;
|
||||
}
|
||||
else if (chunk.loc_bad & LOC_OUTDATED)
|
||||
{
|
||||
n_outdated++;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (chunk.loc_bad & LOC_INCONSISTENT)
|
||||
{
|
||||
n_inconsistent++;
|
||||
}
|
||||
if (chunk.loc_bad & LOC_CORRUPTED)
|
||||
{
|
||||
n_corrupted++;
|
||||
}
|
||||
else if (pg.scheme == POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
n_roles = 1;
|
||||
int i;
|
||||
for (i = 0; i < pg.cur_set.size() && pg.cur_set[i] != chunk.osd_num; i++) {}
|
||||
if (i == pg.cur_set.size())
|
||||
{
|
||||
n_misplaced++;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (!(has_roles & (1 << chunk.role)))
|
||||
{
|
||||
n_roles++;
|
||||
has_roles |= (1 << chunk.role);
|
||||
}
|
||||
if (pg.cur_set[chunk.role] != chunk.osd_num)
|
||||
{
|
||||
n_misplaced++;
|
||||
}
|
||||
}
|
||||
n_copies++;
|
||||
}
|
||||
}
|
||||
uint64_t obj_state = 0;
|
||||
int pg_state_bits = 0;
|
||||
if (n_corrupted > 0)
|
||||
{
|
||||
this->corrupted_objects++;
|
||||
pg.corrupted_count++;
|
||||
obj_state |= OBJ_CORRUPTED;
|
||||
pg_state_bits |= PG_HAS_CORRUPTED;
|
||||
}
|
||||
if (n_invalid > 0 || n_inconsistent > 0)
|
||||
{
|
||||
this->inconsistent_objects++;
|
||||
obj_state |= OBJ_INCONSISTENT;
|
||||
pg_state_bits |= PG_HAS_INCONSISTENT;
|
||||
}
|
||||
else if (n_roles < pg.pg_data_size)
|
||||
{
|
||||
this->incomplete_objects++;
|
||||
obj_state |= OBJ_INCOMPLETE;
|
||||
|
@ -379,12 +443,52 @@ pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_os
|
|||
obj_state |= OBJ_DEGRADED;
|
||||
pg_state_bits = PG_HAS_DEGRADED;
|
||||
}
|
||||
else
|
||||
else if (n_misplaced > 0 || n_outdated > 0)
|
||||
{
|
||||
this->misplaced_objects++;
|
||||
obj_state |= OBJ_MISPLACED;
|
||||
pg_state_bits = PG_HAS_MISPLACED;
|
||||
}
|
||||
if (this->log_level >= log_at_level)
|
||||
{
|
||||
printf("Marking object %lx:%lx ", oid.inode, oid.stripe);
|
||||
for (int i = 0, j = 0; i < object_state_bit_count; i++)
|
||||
{
|
||||
if ((obj_state & object_state_bits[i]) || object_state_bits[i] == 0 && obj_state == 0)
|
||||
{
|
||||
printf((j++) ? "+%s" : "%s", object_state_names[i]);
|
||||
}
|
||||
}
|
||||
if (pg.scheme == POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
printf(": %lu copies available", n_copies);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf(": %lu parts / %lu copies available", n_roles, n_copies);
|
||||
}
|
||||
if (n_invalid > 0)
|
||||
{
|
||||
printf(", %lu invalid", n_invalid);
|
||||
}
|
||||
if (n_outdated > 0)
|
||||
{
|
||||
printf(", %lu outdated", n_outdated);
|
||||
}
|
||||
if (n_misplaced > 0)
|
||||
{
|
||||
printf(", %lu misplaced", n_misplaced);
|
||||
}
|
||||
if (n_corrupted > 0)
|
||||
{
|
||||
printf(", %lu corrupted", n_corrupted);
|
||||
}
|
||||
if (n_inconsistent > 0)
|
||||
{
|
||||
printf(", %lu inconsistent", n_inconsistent);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
pg.state |= pg_state_bits;
|
||||
if (pg.state != old_pg_state)
|
||||
{
|
||||
|
@ -403,11 +507,13 @@ pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_os
|
|||
ringloop->wakeup();
|
||||
}
|
||||
}
|
||||
if (!obj_state)
|
||||
{
|
||||
// Object is clean
|
||||
return NULL;
|
||||
}
|
||||
// Insert object into the new state and retry
|
||||
object_state = pg.add_object_to_state(oid, obj_state, corrupted_set);
|
||||
if (ref)
|
||||
object_state->ref_count++;
|
||||
return object_state;
|
||||
return pg.add_object_to_state(oid, obj_state, osd_set);
|
||||
}
|
||||
|
||||
// Decrement pg_osd_set_state_t's object_count and change PG state accordingly
|
||||
|
@ -426,14 +532,29 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **objec
|
|||
*object_state = recheck_state;
|
||||
return;
|
||||
}
|
||||
bool changed = false;
|
||||
(*object_state)->object_count--;
|
||||
if ((*object_state)->state & OBJ_CORRUPTED)
|
||||
{
|
||||
this->corrupted_objects--;
|
||||
pg.corrupted_count--;
|
||||
if (!pg.corrupted_count)
|
||||
{
|
||||
pg.state = pg.state & ~PG_HAS_CORRUPTED;
|
||||
changed = true;
|
||||
}
|
||||
}
|
||||
bool changed = false;
|
||||
if ((*object_state)->state & OBJ_INCOMPLETE)
|
||||
if ((*object_state)->state & OBJ_INCONSISTENT)
|
||||
{
|
||||
this->inconsistent_objects--;
|
||||
pg.inconsistent_objects.erase(oid);
|
||||
if (!pg.inconsistent_objects.size())
|
||||
{
|
||||
pg.state = pg.state & ~PG_HAS_INCONSISTENT;
|
||||
changed = true;
|
||||
}
|
||||
}
|
||||
else if ((*object_state)->state & OBJ_INCOMPLETE)
|
||||
{
|
||||
// Successful write means that object is not incomplete anymore
|
||||
this->incomplete_objects--;
|
||||
|
|
|
@ -532,7 +532,7 @@ void osd_t::check_corrupted_chained(pg_t & pg, osd_op_t *cur_op)
|
|||
}
|
||||
if (corrupted)
|
||||
{
|
||||
mark_object_corrupted(pg, cur_oid, op_data->chain_states[op_data->chain_reads[cri].chain_pos], stripes, false);
|
||||
mark_object_corrupted(pg, cur_oid, op_data->chain_states[op_data->chain_reads[cri].chain_pos], stripes, false, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -141,7 +141,7 @@ resume_3:
|
|||
if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
|
||||
{
|
||||
// Mark object corrupted and retry
|
||||
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, true);
|
||||
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, true, false);
|
||||
op_data->prev_set = op_data->object_state ? op_data->object_state->read_target.data() : pg.cur_set.data();
|
||||
goto retry_1;
|
||||
}
|
||||
|
|
|
@ -377,9 +377,13 @@ void osd_t::continue_primary_scrub(osd_op_t *cur_op)
|
|||
{
|
||||
n_copies++;
|
||||
}
|
||||
else if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size)
|
||||
else
|
||||
{
|
||||
op_data->degraded = true;
|
||||
op_data->stripes[role].missing = true;
|
||||
if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size)
|
||||
{
|
||||
op_data->degraded = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (n_copies <= op_data->pg_data_size)
|
||||
|
@ -388,8 +392,7 @@ void osd_t::continue_primary_scrub(osd_op_t *cur_op)
|
|||
finish_op(cur_op, 0);
|
||||
return;
|
||||
}
|
||||
cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_size,
|
||||
op_data->scheme != POOL_SCHEME_REPLICATED ? bs_block_size*(op_data->pg_size-op_data->pg_data_size) : 0);
|
||||
cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_size, 0);
|
||||
// Submit reads
|
||||
osd_op_t *subops = new osd_op_t[n_copies];
|
||||
op_data->fact_ver = 0;
|
||||
|
@ -412,8 +415,15 @@ resume_2:
|
|||
int n_copies = 0;
|
||||
for (int role = 0; role < op_data->pg_size; role++)
|
||||
{
|
||||
if (op_data->stripes[role].read_end != 0 &&
|
||||
!op_data->stripes[role].read_error)
|
||||
if (op_data->stripes[role].read_error)
|
||||
{
|
||||
op_data->stripes[role].missing = true;
|
||||
if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size)
|
||||
{
|
||||
op_data->degraded = true;
|
||||
}
|
||||
}
|
||||
else if (!op_data->stripes[role].missing)
|
||||
{
|
||||
n_copies++;
|
||||
}
|
||||
|
@ -423,7 +433,7 @@ resume_2:
|
|||
// Nothing to compare, just mark the object as corrupted
|
||||
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
|
||||
// FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
|
||||
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false);
|
||||
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false, false);
|
||||
// Operation is treated as unsuccessful only if the object becomes unreadable
|
||||
finish_op(cur_op, n_copies < op_data->pg_data_size ? op_data->errcode : 0);
|
||||
return;
|
||||
|
@ -436,6 +446,7 @@ resume_2:
|
|||
return;
|
||||
}
|
||||
}
|
||||
bool inconsistent = false;
|
||||
if (op_data->scheme == POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
// Check that all chunks have returned the same data
|
||||
|
@ -475,7 +486,6 @@ resume_2:
|
|||
}
|
||||
if (best >= 0 && votes[best] < total)
|
||||
{
|
||||
// FIXME Add a flag to allow to skip such objects and not recover them automatically
|
||||
bool unknown = false;
|
||||
for (int role = 0; role < op_data->pg_size; role++)
|
||||
{
|
||||
|
@ -484,9 +494,10 @@ resume_2:
|
|||
if (votes[role] > 0 && votes[role] < votes[best])
|
||||
{
|
||||
printf(
|
||||
"[PG %u/%u] Object %lx:%lx copy on OSD %lu doesn't match %d other copies, marking it as corrupted\n",
|
||||
"[PG %u/%u] Object %lx:%lx v%lu copy on OSD %lu doesn't match %d other copies, marking it as corrupted\n",
|
||||
INODE_POOL(op_data->oid.inode), op_data->pg_num,
|
||||
op_data->oid.inode, op_data->oid.stripe, op_data->stripes[role].osd_num, votes[best]
|
||||
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver,
|
||||
op_data->stripes[role].osd_num, votes[best]
|
||||
);
|
||||
op_data->stripes[role].read_error = true;
|
||||
}
|
||||
|
@ -494,63 +505,67 @@ resume_2:
|
|||
if (unknown)
|
||||
{
|
||||
// It's unknown which replica is good. There are multiple versions with no majority
|
||||
// Mark all good replicas as ambiguous
|
||||
best = -1;
|
||||
inconsistent = true;
|
||||
printf(
|
||||
"[PG %u/%u] Object %lx:%lx v%lu is inconsistent: copies don't match. Use vitastor-cli fix to fix it\n",
|
||||
INODE_POOL(op_data->oid.inode), op_data->pg_num,
|
||||
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(op_data->scheme == POOL_SCHEME_EC || op_data->scheme == POOL_SCHEME_XOR);
|
||||
if (op_data->degraded)
|
||||
auto good_subset = ec_find_good(
|
||||
op_data->stripes, op_data->pg_size, op_data->pg_data_size, op_data->scheme == POOL_SCHEME_XOR,
|
||||
bs_block_size, clean_entry_bitmap_size, scrub_ec_max_bruteforce
|
||||
);
|
||||
if (!good_subset.size())
|
||||
{
|
||||
// Reconstruct missing stripes
|
||||
// XOR shouldn't come here as it only has 1 parity chunk
|
||||
assert(op_data->scheme == POOL_SCHEME_EC);
|
||||
reconstruct_stripes_ec(op_data->stripes, op_data->pg_size, op_data->pg_data_size, clean_entry_bitmap_size);
|
||||
inconsistent = true;
|
||||
printf(
|
||||
"[PG %u/%u] Object %lx:%lx v%lu is inconsistent: parity chunks don't match data. Use vitastor-cli fix to fix it\n",
|
||||
INODE_POOL(op_data->oid.inode), op_data->pg_num,
|
||||
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver
|
||||
);
|
||||
}
|
||||
// Generate parity chunks and compare them with actual data
|
||||
osd_num_t fake_osd_set[op_data->pg_size];
|
||||
for (int i = 0; i < op_data->pg_size; i++)
|
||||
else
|
||||
{
|
||||
fake_osd_set[i] = 1;
|
||||
op_data->stripes[i].write_buf = i >= op_data->pg_data_size
|
||||
? ((uint8_t*)cur_op->buf + (i-op_data->pg_data_size)*bs_block_size)
|
||||
: op_data->stripes[i].read_buf;
|
||||
}
|
||||
if (op_data->scheme == POOL_SCHEME_XOR)
|
||||
{
|
||||
calc_rmw_parity_xor(op_data->stripes, op_data->pg_size, fake_osd_set, fake_osd_set, bs_block_size, clean_entry_bitmap_size);
|
||||
}
|
||||
else if (op_data->scheme == POOL_SCHEME_EC)
|
||||
{
|
||||
calc_rmw_parity_ec(op_data->stripes, op_data->pg_size, op_data->pg_data_size, fake_osd_set, fake_osd_set, bs_block_size, clean_entry_bitmap_size);
|
||||
}
|
||||
// Now compare that write_buf == read_buf
|
||||
for (int role = op_data->pg_data_size; role < op_data->pg_size; role++)
|
||||
{
|
||||
if (op_data->stripes[role].osd_num != 0 && !op_data->stripes[role].read_error &&
|
||||
memcmp(op_data->stripes[role].read_buf, op_data->stripes[role].write_buf, bs_block_size) != 0)
|
||||
for (int role = 0; role < op_data->pg_size; role++)
|
||||
{
|
||||
// Chunks don't match - something's wrong... but we don't know what :D
|
||||
// FIXME: Try to locate errors (may be possible with >= 2 parity chunks)
|
||||
printf(
|
||||
"[PG %u/%u] Object %lx:%lx parity chunk %d on OSD %lu doesn't match data, marking it as corrupted\n",
|
||||
INODE_POOL(op_data->oid.inode), op_data->pg_num,
|
||||
op_data->oid.inode, op_data->oid.stripe,
|
||||
role-op_data->pg_data_size, op_data->stripes[role].osd_num
|
||||
);
|
||||
op_data->stripes[role].read_error = true;
|
||||
if (!op_data->stripes[role].missing)
|
||||
op_data->stripes[role].read_error = true;
|
||||
}
|
||||
for (int role: good_subset)
|
||||
{
|
||||
op_data->stripes[role].read_error = false;
|
||||
}
|
||||
for (int role = 0; role < op_data->pg_size; role++)
|
||||
{
|
||||
if (!op_data->stripes[role].missing && op_data->stripes[role].read_error)
|
||||
{
|
||||
op_data->stripes[role].read_error = true;
|
||||
printf(
|
||||
"[PG %u/%u] Object %lx:%lx v%lu chunk %d on OSD %lu doesn't match data, marking it as corrupted\n",
|
||||
INODE_POOL(op_data->oid.inode), op_data->pg_num,
|
||||
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver,
|
||||
role, op_data->stripes[role].osd_num
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int role = 0; role < op_data->pg_size; role++)
|
||||
{
|
||||
if (op_data->stripes[role].osd_num != 0 && !op_data->stripes[role].read_error)
|
||||
if (op_data->stripes[role].osd_num != 0 && op_data->stripes[role].read_error || inconsistent)
|
||||
{
|
||||
// Got at least 1 read error or mismatch, mark the object as corrupted
|
||||
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
|
||||
// FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
|
||||
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false);
|
||||
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false, inconsistent);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -3,9 +3,9 @@
|
|||
|
||||
#include "pg_states.h"
|
||||
|
||||
const int pg_state_bit_count = 16;
|
||||
const int pg_state_bit_count = 17;
|
||||
|
||||
const int pg_state_bits[16] = {
|
||||
const int pg_state_bits[17] = {
|
||||
PG_STARTING,
|
||||
PG_PEERING,
|
||||
PG_INCOMPLETE,
|
||||
|
@ -14,6 +14,7 @@ const int pg_state_bits[16] = {
|
|||
PG_STOPPING,
|
||||
PG_OFFLINE,
|
||||
PG_DEGRADED,
|
||||
PG_HAS_INCONSISTENT,
|
||||
PG_HAS_CORRUPTED,
|
||||
PG_HAS_INCOMPLETE,
|
||||
PG_HAS_DEGRADED,
|
||||
|
@ -24,7 +25,7 @@ const int pg_state_bits[16] = {
|
|||
PG_SCRUBBING,
|
||||
};
|
||||
|
||||
const char *pg_state_names[16] = {
|
||||
const char *pg_state_names[17] = {
|
||||
"starting",
|
||||
"peering",
|
||||
"incomplete",
|
||||
|
@ -33,6 +34,7 @@ const char *pg_state_names[16] = {
|
|||
"stopping",
|
||||
"offline",
|
||||
"degraded",
|
||||
"has_inconsistent",
|
||||
"has_corrupted",
|
||||
"has_incomplete",
|
||||
"has_degraded",
|
||||
|
@ -42,3 +44,27 @@ const char *pg_state_names[16] = {
|
|||
"left_on_dead",
|
||||
"scrubbing",
|
||||
};
|
||||
|
||||
// Number of entries in object_state_bits[] and object_state_names[]
const int object_state_bit_count = 8;
|
||||
|
||||
// Object state flags, index-aligned with object_state_names[]: a state bitmask is
// turned into text by walking both arrays with the same index
const int object_state_bits[8] = {
|
||||
OBJ_DEGRADED,
|
||||
OBJ_INCOMPLETE,
|
||||
OBJ_MISPLACED,
|
||||
OBJ_CORRUPTED,
|
||||
OBJ_INCONSISTENT,
|
||||
OBJ_NEEDS_STABLE,
|
||||
OBJ_NEEDS_ROLLBACK,
|
||||
// Zero entry pairs with "clean"; the state printer matches it only when the whole state is 0
0,
|
||||
};
|
||||
|
||||
// Human-readable names of object states; entry i names the flag object_state_bits[i],
// so both arrays must keep the same order and length (object_state_bit_count)
const char *object_state_names[8] = {
|
||||
"degraded",
|
||||
"incomplete",
|
||||
"misplaced",
|
||||
"corrupted",
|
||||
"inconsistent",
|
||||
"needs_stable",
|
||||
"needs_rollback",
|
||||
// Name for the zero entry of object_state_bits[], i.e. an object with no state flags set
"clean",
|
||||
};
|
||||
|
|
|
@ -23,8 +23,9 @@
|
|||
#define PG_HAS_UNCLEAN (1<<11)
|
||||
#define PG_HAS_INVALID (1<<12)
|
||||
#define PG_HAS_CORRUPTED (1<<13)
|
||||
#define PG_LEFT_ON_DEAD (1<<14)
|
||||
#define PG_SCRUBBING (1<<15)
|
||||
#define PG_HAS_INCONSISTENT (1<<14)
|
||||
#define PG_LEFT_ON_DEAD (1<<15)
|
||||
#define PG_SCRUBBING (1<<16)
|
||||
|
||||
// Lower bits that represent object role (EC 0/1/2... or always 0 with replication)
|
||||
// 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size
|
||||
|
@ -36,9 +37,16 @@
|
|||
#define OBJ_MISPLACED 0x08
|
||||
// OBJ_CORRUPTED is always set with one of OBJ_INCOMPLETE/OBJ_DEGRADED/OBJ_MISPLACED/OBJ_INCONSISTENT
|
||||
#define OBJ_CORRUPTED 0x10
|
||||
// OBJ_INCONSISTENT means the object's copies (or EC data and parity chunks) don't match and it's unclear which one is correct
|
||||
// OBJ_INCONSISTENT may be set with CORRUPTED, but never with other states
|
||||
#define OBJ_INCONSISTENT 0x20
|
||||
#define OBJ_NEEDS_STABLE 0x10000
|
||||
#define OBJ_NEEDS_ROLLBACK 0x20000
|
||||
|
||||
extern const int pg_state_bits[];
|
||||
extern const char *pg_state_names[];
|
||||
extern const int pg_state_bit_count;
|
||||
|
||||
extern const int object_state_bits[];
|
||||
extern const char *object_state_names[];
|
||||
extern const int object_state_bit_count;
|
||||
|
|
Loading…
Reference in New Issue