From fd8e1a8418d66ac354d90cac62b50a38a7518074 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Mon, 23 Mar 2020 00:39:53 +0300 Subject: [PATCH] Slightly reorganize object state check code --- osd_peering_pg.cpp | 345 ++++++++++++++++++++++++++------------------- osd_peering_pg.h | 38 ----- 2 files changed, 197 insertions(+), 186 deletions(-) diff --git a/osd_peering_pg.cpp b/osd_peering_pg.cpp index ca925620..4f81a5f2 100644 --- a/osd_peering_pg.cpp +++ b/osd_peering_pg.cpp @@ -1,114 +1,251 @@ #include "osd_peering_pg.h" -void pg_t::remember_object(pg_obj_state_check_t &st, std::vector &all) +struct obj_ver_role { - auto & pg = *this; + object_id oid; + uint64_t version; + uint64_t osd_num; + bool is_stable; +}; + +inline bool operator < (const obj_ver_role & a, const obj_ver_role & b) +{ + // ORDER BY inode ASC, stripe & ~STRIPE_MASK ASC, version DESC, osd_num ASC + return a.oid.inode < b.oid.inode || a.oid.inode == b.oid.inode && ( + (a.oid.stripe & ~STRIPE_MASK) < (b.oid.stripe & ~STRIPE_MASK) || + (a.oid.stripe & ~STRIPE_MASK) == (b.oid.stripe & ~STRIPE_MASK) && ( + a.version > b.version || a.version == b.version && a.osd_num < b.osd_num + ) + ); +} + +struct obj_piece_ver_t +{ + uint64_t max_ver = 0; + uint64_t stable_ver = 0; +}; + +struct pg_obj_state_check_t +{ + pg_t *pg; + int i; + std::vector list; + int obj_start = 0, obj_end = 0, ver_start = 0, ver_end = 0; + object_id oid = { 0 }; + uint64_t max_ver = 0; + uint64_t last_ver = 0; + uint64_t target_ver = 0; + uint64_t n_copies = 0, has_roles = 0, n_roles = 0, n_stable = 0, n_mismatched = 0; + bool is_buggy = false, has_old_unstable = false; + pg_osd_set_t osd_set; + + void walk(); + void start_object(); + void handle_version(); + void finish_object(); +}; + +void pg_obj_state_check_t::walk() +{ + pg->clean_count = 0; + pg->total_count = 0; + pg->state = 0; + for (i = 0; i < list.size(); i++) + { + if (oid.inode != list[i].oid.inode || + oid.stripe != (list[i].oid.stripe & ~STRIPE_MASK)) + { + if (oid.inode != 0) + { + finish_object(); + } + start_object(); + } + handle_version(); + } + if (oid.inode != 0) + { + finish_object(); + } + if (pg->pg_cursize < pg->pg_size) + { + pg->state = pg->state | PG_DEGRADED; + } + pg->state = pg->state | PG_ACTIVE; +} + +void pg_obj_state_check_t::start_object() +{ + obj_start = i; + oid = { .inode = list[i].oid.inode, .stripe = list[i].oid.stripe & ~STRIPE_MASK }; + last_ver = max_ver = list[i].version; + target_ver = 0; + ver_start = i; + has_roles = n_copies = n_roles = n_stable = n_mismatched = 0; + is_buggy = false; +} + +void pg_obj_state_check_t::handle_version() +{ + if (!target_ver && last_ver != list[i].version && (n_stable > 0 || n_roles >= pg->pg_minsize)) + { + // Version is either stable or recoverable + target_ver = last_ver; + ver_end = i; + } + if (!target_ver) + { + if (last_ver != list[i].version) + { + ver_start = i; + has_roles = n_copies = n_roles = n_stable = n_mismatched = 0; + last_ver = list[i].version; + } + int replica = (list[i].oid.stripe & STRIPE_MASK); + n_copies++; + if (replica >= pg->pg_size) + { + // FIXME In the future, check it against the PG epoch number to handle replication factor/scheme changes + is_buggy = true; + } + else + { + if (list[i].is_stable) + { + n_stable++; + } + if (pg->cur_set[replica] != list[i].osd_num) + { + n_mismatched++; + } + if (!(has_roles & (1 << replica))) + { + has_roles = has_roles | (1 << replica); + n_roles++; + } + } + } + else if (!list[i].is_stable) + { + has_old_unstable = true; + } +} + +void pg_obj_state_check_t::finish_object() +{ + if (!target_ver && (n_stable > 0 || n_roles >= pg->pg_minsize)) + { + // Version is either stable or recoverable + target_ver = last_ver; + ver_end = i; + } + obj_end = i; // Remember the decision uint64_t state = OBJ_CLEAN; - if (st.target_ver > 0) + if (target_ver > 0) { - if (st.n_roles < pg.pg_minsize) + if (n_roles < pg->pg_minsize) { - printf("Object is incomplete: inode=%lu stripe=%lu version=%lu/%lu\n", st.oid.inode, st.oid.stripe, st.target_ver, st.max_ver); - for (int i = st.ver_start; i < st.ver_end; i++) + printf("Object is incomplete: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver); + for (int i = ver_start; i < ver_end; i++) { - printf("Present on: osd %lu, role %ld%s\n", all[i].osd_num, (all[i].oid.stripe & STRIPE_MASK), all[i].is_stable ? " (stable)" : ""); + printf("Present on: osd %lu, role %ld%s\n", list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : ""); } state = OBJ_INCOMPLETE; - pg.state = pg.state | PG_HAS_INCOMPLETE; + pg->state = pg->state | PG_HAS_INCOMPLETE; } - else if (st.n_roles < pg.pg_cursize) + else if (n_roles < pg->pg_cursize) { - printf("Object is degraded: inode=%lu stripe=%lu version=%lu/%lu\n", st.oid.inode, st.oid.stripe, st.target_ver, st.max_ver); - for (int i = st.ver_start; i < st.ver_end; i++) + printf("Object is degraded: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver); + for (int i = ver_start; i < ver_end; i++) { - printf("Present on: osd %lu, role %ld%s\n", all[i].osd_num, (all[i].oid.stripe & STRIPE_MASK), all[i].is_stable ? " (stable)" : ""); + printf("Present on: osd %lu, role %ld%s\n", list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : ""); } state = OBJ_DEGRADED; - pg.state = pg.state | PG_HAS_DEGRADED; + pg->state = pg->state | PG_HAS_DEGRADED; } - if (st.n_mismatched > 0) + if (n_mismatched > 0) { state |= OBJ_MISPLACED; - pg.state = pg.state | PG_HAS_MISPLACED; + pg->state = pg->state | PG_HAS_MISPLACED; } - if (st.n_stable < st.n_copies) + if (n_stable < n_copies) { state |= OBJ_NEEDS_STABLE; - pg.state = pg.state | PG_HAS_UNCLEAN; + pg->state = pg->state | PG_HAS_UNCLEAN; } } - if (st.target_ver < st.max_ver || st.has_old_unstable) + if (target_ver < max_ver || has_old_unstable) { state |= OBJ_NEEDS_ROLLBACK; - pg.state = pg.state | PG_HAS_UNCLEAN; + pg->state = pg->state | PG_HAS_UNCLEAN; } - if (st.is_buggy) + if (is_buggy) { state |= OBJ_BUGGY; // FIXME: bring pg offline throw std::runtime_error("buggy object state"); } - pg.total_count++; + pg->total_count++; if (state == OBJ_CLEAN) { - pg.clean_count++; + pg->clean_count++; } else { - st.osd_set.clear(); - for (int i = st.ver_start; i < st.ver_end; i++) + osd_set.clear(); + for (int i = ver_start; i < ver_end; i++) { - st.osd_set.push_back((pg_obj_loc_t){ - .role = (all[i].oid.stripe & STRIPE_MASK), - .osd_num = all[i].osd_num, - .stable = all[i].is_stable, + osd_set.push_back((pg_obj_loc_t){ + .role = (list[i].oid.stripe & STRIPE_MASK), + .osd_num = list[i].osd_num, + .stable = list[i].is_stable, }); } - std::sort(st.osd_set.begin(), st.osd_set.end()); - auto it = pg.state_dict.find(st.osd_set); - if (it == pg.state_dict.end()) + std::sort(osd_set.begin(), osd_set.end()); + auto it = pg->state_dict.find(osd_set); + if (it == pg->state_dict.end()) { std::vector read_target; - read_target.resize(pg.pg_size); - for (int i = 0; i < pg.pg_size; i++) + read_target.resize(pg->pg_size); + for (int i = 0; i < pg->pg_size; i++) { read_target[i] = 0; } - for (auto & o: st.osd_set) + for (auto & o: osd_set) { read_target[o.role] = o.osd_num; } - pg.state_dict[st.osd_set] = { + pg->state_dict[osd_set] = { .read_target = read_target, - .osd_set = st.osd_set, + .osd_set = osd_set, .state = state, .object_count = 1, }; - it = pg.state_dict.find(st.osd_set); + it = pg->state_dict.find(osd_set); } else { it->second.object_count++; } - pg.obj_states[st.oid] = &it->second; - if (st.target_ver < st.max_ver) + pg->obj_states[oid] = &it->second; + if (target_ver < max_ver) { - pg.ver_override[st.oid] = st.target_ver; + pg->ver_override[oid] = target_ver; } if (state & (OBJ_NEEDS_ROLLBACK | OBJ_NEEDS_STABLE)) { std::unordered_map pieces; - for (int i = st.obj_start; i < st.obj_end; i++) + for (int i = obj_start; i < obj_end; i++) { - auto & pcs = pieces[(obj_piece_id_t){ .oid = all[i].oid, .osd_num = all[i].osd_num }]; + auto & pcs = pieces[(obj_piece_id_t){ .oid = list[i].oid, .osd_num = list[i].osd_num }]; if (!pcs.max_ver) { - pcs.max_ver = all[i].version; + pcs.max_ver = list[i].version; } - if (all[i].is_stable && !pcs.stable_ver) + if (list[i].is_stable && !pcs.stable_ver) { - pcs.stable_ver = all[i].version; + pcs.stable_ver = list[i].version; } } for (auto pp: pieces) @@ -116,21 +253,21 @@ void pg_t::remember_object(pg_obj_state_check_t &st, std::vector & auto & pcs = pp.second; if (pcs.stable_ver < pcs.max_ver) { - auto & act = flush_actions[pp.first]; - if (pcs.max_ver > st.target_ver) + auto & act = pg->flush_actions[pp.first]; + if (pcs.max_ver > target_ver) { act.rollback = true; - act.rollback_to = st.target_ver; + act.rollback_to = target_ver; } - else if (pcs.max_ver < st.target_ver && pcs.stable_ver < pcs.max_ver) + else if (pcs.max_ver < target_ver && pcs.stable_ver < pcs.max_ver) { act.rollback = true; act.rollback_to = pcs.stable_ver; } - if (pcs.max_ver >= st.target_ver && pcs.stable_ver < st.target_ver) + if (pcs.max_ver >= target_ver && pcs.stable_ver < target_ver) { act.make_stable = true; - act.stable_to = st.target_ver; + act.stable_to = target_ver; } } } @@ -141,21 +278,21 @@ void pg_t::remember_object(pg_obj_state_check_t &st, std::vector & // FIXME: Write at least some tests for this function void pg_t::calc_object_states() { - auto & pg = *this; // Copy all object lists into one array - std::vector all; - auto ps = pg.peering_state; + pg_obj_state_check_t st; + st.pg = this; + auto ps = peering_state; for (auto it: ps->list_results) { auto nstab = it.second.stable_count; auto n = it.second.total_count; auto osd_num = it.first; - uint64_t start = all.size(); - all.resize(start + n); + uint64_t start = st.list.size(); + st.list.resize(start + n); obj_ver_id *ov = it.second.buf; for (uint64_t i = 0; i < n; i++, ov++) { - all[start+i] = { + st.list[start+i] = { .oid = ov->oid, .version = ov->version, .osd_num = osd_num, @@ -167,98 +304,10 @@ void pg_t::calc_object_states() } ps->list_results.clear(); // Sort - std::sort(all.begin(), all.end()); + std::sort(st.list.begin(), st.list.end()); // Walk over it and check object states - pg.clean_count = 0; - pg.total_count = 0; - pg.state = 0; - int replica = 0; - pg_obj_state_check_t st; - for (int i = 0; i < all.size(); i++) - { - if (st.oid.inode != all[i].oid.inode || - st.oid.stripe != (all[i].oid.stripe & ~STRIPE_MASK)) - { - if (st.oid.inode != 0) - { - // Remember object state - if (!st.target_ver && (st.n_stable > 0 || st.n_roles >= pg.pg_minsize)) - { - // Version is either stable or recoverable - st.target_ver = st.last_ver; - st.ver_end = i; - } - st.obj_end = i; - remember_object(st, all); - } - st.obj_start = i; - st.oid = { .inode = all[i].oid.inode, .stripe = all[i].oid.stripe & ~STRIPE_MASK }; - st.last_ver = st.max_ver = all[i].version; - st.target_ver = 0; - st.ver_start = i; - st.has_roles = st.n_copies = st.n_roles = st.n_stable = st.n_mismatched = 0; - } - if (!st.target_ver && st.last_ver != all[i].version && (st.n_stable > 0 || st.n_roles >= pg.pg_minsize)) - { - // Version is either stable or recoverable - st.target_ver = st.last_ver; - st.ver_end = i; - } - if (!st.target_ver) - { - if (st.last_ver != all[i].version) - { - st.ver_start = i; - st.has_roles = st.n_copies = st.n_roles = st.n_stable = st.n_mismatched = 0; - st.last_ver = all[i].version; - } - replica = (all[i].oid.stripe & STRIPE_MASK); - st.n_copies++; - if (replica >= pg.pg_size) - { - // FIXME In the future, check it against the PG epoch number to handle replication factor/scheme changes - st.is_buggy = true; - } - else - { - if (all[i].is_stable) - { - st.n_stable++; - } - if (pg.cur_set[replica] != all[i].osd_num) - { - st.n_mismatched++; - } - if (!(st.has_roles & (1 << replica))) - { - st.has_roles = st.has_roles | (1 << replica); - st.n_roles++; - } - } - } - else if (!all[i].is_stable) - { - st.has_old_unstable = true; - } - } - if (st.oid.inode != 0) - { - // Remember object state - if (!st.target_ver && (st.n_stable > 0 || st.n_roles >= pg.pg_minsize)) - { - // Version is either stable or recoverable - st.target_ver = st.last_ver; - st.ver_end = all.size(); - } - st.obj_end = all.size(); - remember_object(st, all); - } - if (pg.pg_cursize < pg.pg_size) - { - pg.state = pg.state | PG_DEGRADED; - } - pg.state = pg.state | PG_ACTIVE; - pg.print_state(); + st.walk(); + print_state(); } void pg_t::print_state() diff --git a/osd_peering_pg.h b/osd_peering_pg.h index c37de105..ff226e1f 100644 --- a/osd_peering_pg.h +++ b/osd_peering_pg.h @@ -69,38 +69,12 @@ struct pg_peering_state_t int list_done = 0; }; -struct pg_obj_state_check_t -{ - int obj_start = 0, obj_end = 0, ver_start = 0, ver_end = 0; - object_id oid = { 0 }; - uint64_t max_ver = 0; - uint64_t last_ver = 0; - uint64_t target_ver = 0; - uint64_t n_copies = 0, has_roles = 0, n_roles = 0, n_stable = 0, n_mismatched = 0; - bool is_buggy = false, has_old_unstable = false; - pg_osd_set_t osd_set; -}; - -struct obj_ver_role -{ - object_id oid; - uint64_t version; - uint64_t osd_num; - bool is_stable; -}; - struct obj_piece_id_t { object_id oid; uint64_t osd_num; }; -struct obj_piece_ver_t -{ - uint64_t max_ver = 0; - uint64_t stable_ver = 0; -}; - struct flush_action_t { bool rollback = false, make_stable = false; @@ -142,7 +116,6 @@ struct pg_t std::multimap write_queue; void calc_object_states(); - void remember_object(pg_obj_state_check_t &st, std::vector &all); void print_state(); }; @@ -152,17 +125,6 @@ inline bool operator < (const pg_obj_loc_t &a, const pg_obj_loc_t &b) a.role == b.role && a.osd_num == b.osd_num && a.stable < b.stable; } -inline bool operator < (const obj_ver_role & a, const obj_ver_role & b) -{ - // ORDER BY inode ASC, stripe & ~STRIPE_MASK ASC, version DESC, osd_num ASC - return a.oid.inode < b.oid.inode || a.oid.inode == b.oid.inode && ( - (a.oid.stripe & ~STRIPE_MASK) < (b.oid.stripe & ~STRIPE_MASK) || - (a.oid.stripe & ~STRIPE_MASK) == (b.oid.stripe & ~STRIPE_MASK) && ( - a.version > b.version || a.version == b.version && a.osd_num < b.osd_num - ) - ); -} - inline bool operator == (const obj_piece_id_t & a, const obj_piece_id_t & b) { return a.oid == b.oid && a.osd_num == b.osd_num;