Slightly reorganize object state check code

trace-sqes
Vitaliy Filippov 2020-03-23 00:39:53 +03:00
parent a08e0bfacd
commit fd8e1a8418
2 changed files with 197 additions and 186 deletions

View File

@ -1,114 +1,251 @@
#include "osd_peering_pg.h"
void pg_t::remember_object(pg_obj_state_check_t &st, std::vector<obj_ver_role> &all)
struct obj_ver_role
{
auto & pg = *this;
object_id oid;
uint64_t version;
uint64_t osd_num;
bool is_stable;
};
// Sort order for the merged object listing:
//   inode ASC, stripe with the STRIPE_MASK bits cleared ASC, version DESC, osd_num ASC.
// The STRIPE_MASK bits of the stripe and the is_stable flag do not take part
// in the comparison.
inline bool operator < (const obj_ver_role & a, const obj_ver_role & b)
{
    if (a.oid.inode != b.oid.inode)
    {
        return a.oid.inode < b.oid.inode;
    }
    const auto a_base = a.oid.stripe & ~STRIPE_MASK;
    const auto b_base = b.oid.stripe & ~STRIPE_MASK;
    if (a_base != b_base)
    {
        return a_base < b_base;
    }
    if (a.version != b.version)
    {
        // Higher versions come first
        return a.version > b.version;
    }
    return a.osd_num < b.osd_num;
}
// Per-piece version summary. Both fields start at 0, and the code that fills
// them in treats 0 as "not recorded yet".
struct obj_piece_ver_t
{
    // First version recorded for the piece
    uint64_t max_ver{0};
    // First version recorded for the piece that was flagged stable
    uint64_t stable_ver{0};
};
// Working state for one pass over a PG's merged object listing.
// walk() drives the pass; start_object(), handle_version() and finish_object()
// are its per-object / per-entry steps and all operate on list[i].
struct pg_obj_state_check_t
{
    // PG whose counters, state flags and dictionaries are updated by the pass
    pg_t *pg;
    // Index of the entry of `list` currently being examined
    int i;
    // Listing entries to examine; the caller is expected to sort it before walk()
    std::vector<obj_ver_role> list;

    // [obj_start, obj_end): entries of `list` that belong to the current object
    int obj_start = 0;
    int obj_end = 0;
    // [ver_start, ver_end): entries of the version currently counted / chosen
    int ver_start = 0;
    int ver_end = 0;

    // Current object (stripe has the STRIPE_MASK bits cleared).
    // inode == 0 also serves as "no object started yet".
    object_id oid = { 0 };
    // Version of the first entry of the current object
    uint64_t max_ver = 0;
    // Version whose entries are currently being counted
    uint64_t last_ver = 0;
    // Version picked for the object; 0 while none has been picked
    uint64_t target_ver = 0;

    // Per-version counters, cleared whenever a new version or object begins
    uint64_t n_copies = 0;
    // Bitmap of role indexes already seen for the version
    uint64_t has_roles = 0;
    uint64_t n_roles = 0;
    uint64_t n_stable = 0;
    // Entries whose OSD differs from pg->cur_set[role]
    uint64_t n_mismatched = 0;

    // Set when an entry's role index is >= pg->pg_size
    bool is_buggy = false;
    // Set when a non-stable entry is seen after target_ver has been picked
    bool has_old_unstable = false;

    // Scratch set of locations, rebuilt for every non-clean object
    pg_osd_set_t osd_set;

    void walk();
    void start_object();
    void handle_version();
    void finish_object();
};
void pg_obj_state_check_t::walk()
{
pg->clean_count = 0;
pg->total_count = 0;
pg->state = 0;
for (i = 0; i < list.size(); i++)
{
if (oid.inode != list[i].oid.inode ||
oid.stripe != (list[i].oid.stripe & ~STRIPE_MASK))
{
if (oid.inode != 0)
{
finish_object();
}
start_object();
}
handle_version();
}
if (oid.inode != 0)
{
finish_object();
}
if (pg->pg_cursize < pg->pg_size)
{
pg->state = pg->state | PG_DEGRADED;
}
pg->state = pg->state | PG_ACTIVE;
}
void pg_obj_state_check_t::start_object()
{
obj_start = i;
oid = { .inode = list[i].oid.inode, .stripe = list[i].oid.stripe & ~STRIPE_MASK };
last_ver = max_ver = list[i].version;
target_ver = 0;
ver_start = i;
has_roles = n_copies = n_roles = n_stable = n_mismatched = 0;
is_buggy = false;
}
// Processes list[i] for the current object.
// While no target version has been picked, entries are counted per version
// (copies, stable copies, distinct roles, copies on an OSD other than
// pg->cur_set[role]). When the version changes and the version just counted had
// a stable copy or at least pg_minsize distinct roles, it becomes target_ver.
// After that, remaining entries are only checked for being unstable.
void pg_obj_state_check_t::handle_version()
{
    if (!target_ver && last_ver != list[i].version && (n_stable > 0 || n_roles >= pg->pg_minsize))
    {
        // Version is either stable or recoverable
        target_ver = last_ver;
        ver_end = i;
    }
    if (!target_ver)
    {
        if (last_ver != list[i].version)
        {
            // The previous version was rejected; restart the counters for this one
            ver_start = i;
            has_roles = n_copies = n_roles = n_stable = n_mismatched = 0;
            last_ver = list[i].version;
        }
        int replica = (list[i].oid.stripe & STRIPE_MASK);
        n_copies++;
        if (replica >= pg->pg_size)
        {
            // FIXME In the future, check it against the PG epoch number to handle replication factor/scheme changes
            is_buggy = true;
        }
        else
        {
            if (list[i].is_stable)
            {
                n_stable++;
            }
            if (pg->cur_set[replica] != list[i].osd_num)
            {
                n_mismatched++;
            }
            // has_roles is a 64-bit bitmap, so the bit must be built in 64 bits:
            // a plain `1 << replica` is an int shift and breaks for replica >= 31
            const uint64_t role_bit = static_cast<uint64_t>(1) << replica;
            if (!(has_roles & role_bit))
            {
                has_roles = has_roles | role_bit;
                n_roles++;
            }
        }
    }
    else if (!list[i].is_stable)
    {
        // An unstable entry that comes after the chosen version in the list
        has_old_unstable = true;
    }
}
void pg_obj_state_check_t::finish_object()
{
if (!target_ver && (n_stable > 0 || n_roles >= pg->pg_minsize))
{
// Version is either stable or recoverable
target_ver = last_ver;
ver_end = i;
}
obj_end = i;
// Remember the decision
uint64_t state = OBJ_CLEAN;
if (st.target_ver > 0)
if (target_ver > 0)
{
if (st.n_roles < pg.pg_minsize)
if (n_roles < pg->pg_minsize)
{
printf("Object is incomplete: inode=%lu stripe=%lu version=%lu/%lu\n", st.oid.inode, st.oid.stripe, st.target_ver, st.max_ver);
for (int i = st.ver_start; i < st.ver_end; i++)
printf("Object is incomplete: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
for (int i = ver_start; i < ver_end; i++)
{
printf("Present on: osd %lu, role %ld%s\n", all[i].osd_num, (all[i].oid.stripe & STRIPE_MASK), all[i].is_stable ? " (stable)" : "");
printf("Present on: osd %lu, role %ld%s\n", list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
}
state = OBJ_INCOMPLETE;
pg.state = pg.state | PG_HAS_INCOMPLETE;
pg->state = pg->state | PG_HAS_INCOMPLETE;
}
else if (st.n_roles < pg.pg_cursize)
else if (n_roles < pg->pg_cursize)
{
printf("Object is degraded: inode=%lu stripe=%lu version=%lu/%lu\n", st.oid.inode, st.oid.stripe, st.target_ver, st.max_ver);
for (int i = st.ver_start; i < st.ver_end; i++)
printf("Object is degraded: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
for (int i = ver_start; i < ver_end; i++)
{
printf("Present on: osd %lu, role %ld%s\n", all[i].osd_num, (all[i].oid.stripe & STRIPE_MASK), all[i].is_stable ? " (stable)" : "");
printf("Present on: osd %lu, role %ld%s\n", list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
}
state = OBJ_DEGRADED;
pg.state = pg.state | PG_HAS_DEGRADED;
pg->state = pg->state | PG_HAS_DEGRADED;
}
if (st.n_mismatched > 0)
if (n_mismatched > 0)
{
state |= OBJ_MISPLACED;
pg.state = pg.state | PG_HAS_MISPLACED;
pg->state = pg->state | PG_HAS_MISPLACED;
}
if (st.n_stable < st.n_copies)
if (n_stable < n_copies)
{
state |= OBJ_NEEDS_STABLE;
pg.state = pg.state | PG_HAS_UNCLEAN;
pg->state = pg->state | PG_HAS_UNCLEAN;
}
}
if (st.target_ver < st.max_ver || st.has_old_unstable)
if (target_ver < max_ver || has_old_unstable)
{
state |= OBJ_NEEDS_ROLLBACK;
pg.state = pg.state | PG_HAS_UNCLEAN;
pg->state = pg->state | PG_HAS_UNCLEAN;
}
if (st.is_buggy)
if (is_buggy)
{
state |= OBJ_BUGGY;
// FIXME: bring pg offline
throw std::runtime_error("buggy object state");
}
pg.total_count++;
pg->total_count++;
if (state == OBJ_CLEAN)
{
pg.clean_count++;
pg->clean_count++;
}
else
{
st.osd_set.clear();
for (int i = st.ver_start; i < st.ver_end; i++)
osd_set.clear();
for (int i = ver_start; i < ver_end; i++)
{
st.osd_set.push_back((pg_obj_loc_t){
.role = (all[i].oid.stripe & STRIPE_MASK),
.osd_num = all[i].osd_num,
.stable = all[i].is_stable,
osd_set.push_back((pg_obj_loc_t){
.role = (list[i].oid.stripe & STRIPE_MASK),
.osd_num = list[i].osd_num,
.stable = list[i].is_stable,
});
}
std::sort(st.osd_set.begin(), st.osd_set.end());
auto it = pg.state_dict.find(st.osd_set);
if (it == pg.state_dict.end())
std::sort(osd_set.begin(), osd_set.end());
auto it = pg->state_dict.find(osd_set);
if (it == pg->state_dict.end())
{
std::vector<uint64_t> read_target;
read_target.resize(pg.pg_size);
for (int i = 0; i < pg.pg_size; i++)
read_target.resize(pg->pg_size);
for (int i = 0; i < pg->pg_size; i++)
{
read_target[i] = 0;
}
for (auto & o: st.osd_set)
for (auto & o: osd_set)
{
read_target[o.role] = o.osd_num;
}
pg.state_dict[st.osd_set] = {
pg->state_dict[osd_set] = {
.read_target = read_target,
.osd_set = st.osd_set,
.osd_set = osd_set,
.state = state,
.object_count = 1,
};
it = pg.state_dict.find(st.osd_set);
it = pg->state_dict.find(osd_set);
}
else
{
it->second.object_count++;
}
pg.obj_states[st.oid] = &it->second;
if (st.target_ver < st.max_ver)
pg->obj_states[oid] = &it->second;
if (target_ver < max_ver)
{
pg.ver_override[st.oid] = st.target_ver;
pg->ver_override[oid] = target_ver;
}
if (state & (OBJ_NEEDS_ROLLBACK | OBJ_NEEDS_STABLE))
{
std::unordered_map<obj_piece_id_t, obj_piece_ver_t> pieces;
for (int i = st.obj_start; i < st.obj_end; i++)
for (int i = obj_start; i < obj_end; i++)
{
auto & pcs = pieces[(obj_piece_id_t){ .oid = all[i].oid, .osd_num = all[i].osd_num }];
auto & pcs = pieces[(obj_piece_id_t){ .oid = list[i].oid, .osd_num = list[i].osd_num }];
if (!pcs.max_ver)
{
pcs.max_ver = all[i].version;
pcs.max_ver = list[i].version;
}
if (all[i].is_stable && !pcs.stable_ver)
if (list[i].is_stable && !pcs.stable_ver)
{
pcs.stable_ver = all[i].version;
pcs.stable_ver = list[i].version;
}
}
for (auto pp: pieces)
@ -116,21 +253,21 @@ void pg_t::remember_object(pg_obj_state_check_t &st, std::vector<obj_ver_role> &
auto & pcs = pp.second;
if (pcs.stable_ver < pcs.max_ver)
{
auto & act = flush_actions[pp.first];
if (pcs.max_ver > st.target_ver)
auto & act = pg->flush_actions[pp.first];
if (pcs.max_ver > target_ver)
{
act.rollback = true;
act.rollback_to = st.target_ver;
act.rollback_to = target_ver;
}
else if (pcs.max_ver < st.target_ver && pcs.stable_ver < pcs.max_ver)
else if (pcs.max_ver < target_ver && pcs.stable_ver < pcs.max_ver)
{
act.rollback = true;
act.rollback_to = pcs.stable_ver;
}
if (pcs.max_ver >= st.target_ver && pcs.stable_ver < st.target_ver)
if (pcs.max_ver >= target_ver && pcs.stable_ver < target_ver)
{
act.make_stable = true;
act.stable_to = st.target_ver;
act.stable_to = target_ver;
}
}
}
@ -141,21 +278,21 @@ void pg_t::remember_object(pg_obj_state_check_t &st, std::vector<obj_ver_role> &
// FIXME: Write at least some tests for this function
void pg_t::calc_object_states()
{
auto & pg = *this;
// Copy all object lists into one array
std::vector<obj_ver_role> all;
auto ps = pg.peering_state;
pg_obj_state_check_t st;
st.pg = this;
auto ps = peering_state;
for (auto it: ps->list_results)
{
auto nstab = it.second.stable_count;
auto n = it.second.total_count;
auto osd_num = it.first;
uint64_t start = all.size();
all.resize(start + n);
uint64_t start = st.list.size();
st.list.resize(start + n);
obj_ver_id *ov = it.second.buf;
for (uint64_t i = 0; i < n; i++, ov++)
{
all[start+i] = {
st.list[start+i] = {
.oid = ov->oid,
.version = ov->version,
.osd_num = osd_num,
@ -167,98 +304,10 @@ void pg_t::calc_object_states()
}
ps->list_results.clear();
// Sort
std::sort(all.begin(), all.end());
std::sort(st.list.begin(), st.list.end());
// Walk over it and check object states
pg.clean_count = 0;
pg.total_count = 0;
pg.state = 0;
int replica = 0;
pg_obj_state_check_t st;
for (int i = 0; i < all.size(); i++)
{
if (st.oid.inode != all[i].oid.inode ||
st.oid.stripe != (all[i].oid.stripe & ~STRIPE_MASK))
{
if (st.oid.inode != 0)
{
// Remember object state
if (!st.target_ver && (st.n_stable > 0 || st.n_roles >= pg.pg_minsize))
{
// Version is either stable or recoverable
st.target_ver = st.last_ver;
st.ver_end = i;
}
st.obj_end = i;
remember_object(st, all);
}
st.obj_start = i;
st.oid = { .inode = all[i].oid.inode, .stripe = all[i].oid.stripe & ~STRIPE_MASK };
st.last_ver = st.max_ver = all[i].version;
st.target_ver = 0;
st.ver_start = i;
st.has_roles = st.n_copies = st.n_roles = st.n_stable = st.n_mismatched = 0;
}
if (!st.target_ver && st.last_ver != all[i].version && (st.n_stable > 0 || st.n_roles >= pg.pg_minsize))
{
// Version is either stable or recoverable
st.target_ver = st.last_ver;
st.ver_end = i;
}
if (!st.target_ver)
{
if (st.last_ver != all[i].version)
{
st.ver_start = i;
st.has_roles = st.n_copies = st.n_roles = st.n_stable = st.n_mismatched = 0;
st.last_ver = all[i].version;
}
replica = (all[i].oid.stripe & STRIPE_MASK);
st.n_copies++;
if (replica >= pg.pg_size)
{
// FIXME In the future, check it against the PG epoch number to handle replication factor/scheme changes
st.is_buggy = true;
}
else
{
if (all[i].is_stable)
{
st.n_stable++;
}
if (pg.cur_set[replica] != all[i].osd_num)
{
st.n_mismatched++;
}
if (!(st.has_roles & (1 << replica)))
{
st.has_roles = st.has_roles | (1 << replica);
st.n_roles++;
}
}
}
else if (!all[i].is_stable)
{
st.has_old_unstable = true;
}
}
if (st.oid.inode != 0)
{
// Remember object state
if (!st.target_ver && (st.n_stable > 0 || st.n_roles >= pg.pg_minsize))
{
// Version is either stable or recoverable
st.target_ver = st.last_ver;
st.ver_end = all.size();
}
st.obj_end = all.size();
remember_object(st, all);
}
if (pg.pg_cursize < pg.pg_size)
{
pg.state = pg.state | PG_DEGRADED;
}
pg.state = pg.state | PG_ACTIVE;
pg.print_state();
st.walk();
print_state();
}
void pg_t::print_state()

View File

@ -69,38 +69,12 @@ struct pg_peering_state_t
int list_done = 0;
};
// Working state kept while the object listing of a PG is examined object by object.
struct pg_obj_state_check_t
{
    // [obj_start, obj_end): listing entries that belong to the current object
    int obj_start = 0;
    int obj_end = 0;
    // [ver_start, ver_end): listing entries of the version currently counted / chosen
    int ver_start = 0;
    int ver_end = 0;

    // Current object; inode == 0 also serves as "no object started yet"
    object_id oid = { 0 };
    // Version of the first entry of the current object
    uint64_t max_ver = 0;
    // Version whose entries are currently being counted
    uint64_t last_ver = 0;
    // Version picked for the object; 0 while none has been picked
    uint64_t target_ver = 0;

    // Per-version counters
    uint64_t n_copies = 0;
    // Bitmap of role indexes already seen for the version
    uint64_t has_roles = 0;
    uint64_t n_roles = 0;
    uint64_t n_stable = 0;
    uint64_t n_mismatched = 0;

    // Set when an entry's role index is out of range for the PG
    bool is_buggy = false;
    // Set when a non-stable entry is seen after target_ver has been picked
    bool has_old_unstable = false;

    // Scratch set of locations, rebuilt for every non-clean object
    pg_osd_set_t osd_set;
};
// One entry of the merged object listing: a single version of an object piece
// as reported by a single OSD.
struct obj_ver_role
{
// Object id as listed; the STRIPE_MASK bits of oid.stripe are read as the role index
object_id oid;
// Version of the piece
uint64_t version;
// OSD whose listing this entry came from
uint64_t osd_num;
// Stability flag of this version on that OSD (printed as "(stable)" in diagnostics)
bool is_stable;
};
// Identifies one object piece on one OSD; used as the key of the per-piece
// version map and of the PG's flush_actions.
struct obj_piece_id_t
{
// Object id of the piece exactly as listed (role bits of the stripe included)
object_id oid;
// OSD that holds the piece
uint64_t osd_num;
};
// Per-piece version summary. Both fields start at 0, and the code that fills
// them in treats 0 as "not recorded yet".
struct obj_piece_ver_t
{
    // First version recorded for the piece
    uint64_t max_ver{0};
    // First version recorded for the piece that was flagged stable
    uint64_t stable_ver{0};
};
struct flush_action_t
{
bool rollback = false, make_stable = false;
@ -142,7 +116,6 @@ struct pg_t
std::multimap<object_id, osd_op_t*> write_queue;
void calc_object_states();
void remember_object(pg_obj_state_check_t &st, std::vector<obj_ver_role> &all);
void print_state();
};
@ -152,17 +125,6 @@ inline bool operator < (const pg_obj_loc_t &a, const pg_obj_loc_t &b)
a.role == b.role && a.osd_num == b.osd_num && a.stable < b.stable;
}
// Sort order for the merged object listing:
//   inode ASC, stripe with the STRIPE_MASK bits cleared ASC, version DESC, osd_num ASC.
// The STRIPE_MASK bits of the stripe and the is_stable flag do not take part
// in the comparison.
inline bool operator < (const obj_ver_role & a, const obj_ver_role & b)
{
    if (a.oid.inode != b.oid.inode)
    {
        return a.oid.inode < b.oid.inode;
    }
    const auto a_base = a.oid.stripe & ~STRIPE_MASK;
    const auto b_base = b.oid.stripe & ~STRIPE_MASK;
    if (a_base != b_base)
    {
        return a_base < b_base;
    }
    if (a.version != b.version)
    {
        // Higher versions come first
        return a.version > b.version;
    }
    return a.osd_num < b.osd_num;
}
inline bool operator == (const obj_piece_id_t & a, const obj_piece_id_t & b)
{
return a.oid == b.oid && a.osd_num == b.osd_num;