From 036f4c5bf394307a6c190983129031ca17c7b1c6 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Mon, 23 Mar 2020 15:44:29 +0300 Subject: [PATCH] Fix unstable flushing, include extra OSDs with old object versions in osd_set --- osd_flush.cpp | 4 + osd_peering_pg.cpp | 202 +++++++++++++++++++++++++-------------------- osd_peering_pg.h | 8 +- 3 files changed, 121 insertions(+), 93 deletions(-) diff --git a/osd_flush.cpp b/osd_flush.cpp index bb4a06ae..241c47c4 100644 --- a/osd_flush.cpp +++ b/osd_flush.cpp @@ -90,6 +90,10 @@ void osd_t::handle_flush_op(pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t osd it->first.oid.inode != prev_it->first.oid.inode || (it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK)) { + pg.ver_override.erase((object_id){ + .inode = prev_it->first.oid.inode, + .stripe = (prev_it->first.oid.stripe & ~STRIPE_MASK), + }); auto wr_it = pg.write_queue.find((object_id){ .inode = prev_it->first.oid.inode, .stripe = (prev_it->first.oid.stripe & ~STRIPE_MASK), diff --git a/osd_peering_pg.cpp b/osd_peering_pg.cpp index 4f81a5f2..48ccfd66 100644 --- a/osd_peering_pg.cpp +++ b/osd_peering_pg.cpp @@ -23,20 +23,21 @@ struct obj_piece_ver_t { uint64_t max_ver = 0; uint64_t stable_ver = 0; + uint64_t max_target = 0; }; struct pg_obj_state_check_t { pg_t *pg; - int i; std::vector list; + int list_pos; int obj_start = 0, obj_end = 0, ver_start = 0, ver_end = 0; object_id oid = { 0 }; uint64_t max_ver = 0; uint64_t last_ver = 0; uint64_t target_ver = 0; uint64_t n_copies = 0, has_roles = 0, n_roles = 0, n_stable = 0, n_mismatched = 0; - bool is_buggy = false, has_old_unstable = false; + uint64_t n_unstable = 0, n_buggy = 0; pg_osd_set_t osd_set; void walk(); @@ -50,10 +51,10 @@ void pg_obj_state_check_t::walk() pg->clean_count = 0; pg->total_count = 0; pg->state = 0; - for (i = 0; i < list.size(); i++) + for (list_pos = 0; list_pos < list.size(); list_pos++) { - if (oid.inode != list[i].oid.inode || - oid.stripe != (list[i].oid.stripe & ~STRIPE_MASK)) + if (oid.inode != list[list_pos].oid.inode || + oid.stripe != (list[list_pos].oid.stripe & ~STRIPE_MASK)) { if (oid.inode != 0) { @@ -76,45 +77,45 @@ void pg_obj_state_check_t::walk() void pg_obj_state_check_t::start_object() { - obj_start = i; - oid = { .inode = list[i].oid.inode, .stripe = list[i].oid.stripe & ~STRIPE_MASK }; - last_ver = max_ver = list[i].version; + obj_start = list_pos; + oid = { .inode = list[list_pos].oid.inode, .stripe = list[list_pos].oid.stripe & ~STRIPE_MASK }; + last_ver = max_ver = list[list_pos].version; target_ver = 0; - ver_start = i; + ver_start = list_pos; has_roles = n_copies = n_roles = n_stable = n_mismatched = 0; - is_buggy = false; + n_unstable = n_buggy = 0; } void pg_obj_state_check_t::handle_version() { - if (!target_ver && last_ver != list[i].version && (n_stable > 0 || n_roles >= pg->pg_minsize)) + if (!target_ver && last_ver != list[list_pos].version && (n_stable > 0 || n_roles >= pg->pg_minsize)) { // Version is either stable or recoverable target_ver = last_ver; - ver_end = i; + ver_end = list_pos; } if (!target_ver) { - if (last_ver != list[i].version) + if (last_ver != list[list_pos].version) { - ver_start = i; + ver_start = list_pos; has_roles = n_copies = n_roles = n_stable = n_mismatched = 0; - last_ver = list[i].version; + last_ver = list[list_pos].version; } - int replica = (list[i].oid.stripe & STRIPE_MASK); + int replica = (list[list_pos].oid.stripe & STRIPE_MASK); n_copies++; if (replica >= pg->pg_size) { // FIXME In the future, check it against the PG epoch number to handle replication factor/scheme changes - is_buggy = true; + n_buggy++; } else { - if (list[i].is_stable) + if (list[list_pos].is_stable) { n_stable++; } - if (pg->cur_set[replica] != list[i].osd_num) + if (pg->cur_set[replica] != list[list_pos].osd_num) { n_mismatched++; } @@ -125,9 +126,9 @@ void pg_obj_state_check_t::handle_version() } } } - else if (!list[i].is_stable) + if (!list[list_pos].is_stable) { - has_old_unstable = true; + n_unstable++; } } @@ -137,11 +138,17 @@ void pg_obj_state_check_t::finish_object() { // Version is either stable or recoverable target_ver = last_ver; - ver_end = i; + ver_end = list_pos; } - obj_end = i; + obj_end = list_pos; // Remember the decision uint64_t state = OBJ_CLEAN; + if (n_buggy > 0) + { + state = OBJ_BUGGY; + // FIXME: bring pg offline + throw std::runtime_error("buggy object state"); + } if (target_ver > 0) { if (n_roles < pg->pg_minsize) @@ -169,29 +176,51 @@ void pg_obj_state_check_t::finish_object() state |= OBJ_MISPLACED; pg->state = pg->state | PG_HAS_MISPLACED; } - if (n_stable < n_copies) + pg->total_count++; + } + if (n_unstable > 0) + { + pg->state |= PG_HAS_UNCLEAN; + std::unordered_map pieces; + for (int i = obj_start; i < obj_end; i++) { - state |= OBJ_NEEDS_STABLE; - pg->state = pg->state | PG_HAS_UNCLEAN; + auto & pcs = pieces[(obj_piece_id_t){ .oid = list[i].oid, .osd_num = list[i].osd_num }]; + if (!pcs.max_ver) + { + pcs.max_ver = list[i].version; + } + if (list[i].is_stable && !pcs.stable_ver) + { + pcs.stable_ver = list[i].version; + } + if (list[i].version <= target_ver) + { + pcs.max_target = list[i].version; + } + } + for (auto pp: pieces) + { + auto & pcs = pp.second; + if (pcs.stable_ver < pcs.max_ver) + { + auto & act = pg->flush_actions[pp.first]; + // osd_set doesn't include rollback/stable states, so don't include them in the state code either + if (pcs.max_ver > target_ver) + { + //state |= OBJ_NEEDS_ROLLBACK; + act.rollback = true; + act.rollback_to = pcs.max_target; + } + if (pcs.stable_ver < (pcs.max_ver > target_ver ? pcs.max_target : pcs.max_ver)) + { + //state |= OBJ_NEEDS_STABLE; + act.make_stable = true; + act.stable_to = pcs.max_ver > target_ver ? pcs.max_target : pcs.max_ver; + } + } } } - if (target_ver < max_ver || has_old_unstable) - { - state |= OBJ_NEEDS_ROLLBACK; - pg->state = pg->state | PG_HAS_UNCLEAN; - } - if (is_buggy) - { - state |= OBJ_BUGGY; - // FIXME: bring pg offline - throw std::runtime_error("buggy object state"); - } - pg->total_count++; - if (state == OBJ_CLEAN) - { - pg->clean_count++; - } - else + if (state != OBJ_CLEAN || ver_end < obj_end) { osd_set.clear(); for (int i = ver_start; i < ver_end; i++) @@ -199,10 +228,45 @@ void pg_obj_state_check_t::finish_object() osd_set.push_back((pg_obj_loc_t){ .role = (list[i].oid.stripe & STRIPE_MASK), .osd_num = list[i].osd_num, - .stable = list[i].is_stable, + .outdated = false, }); } - std::sort(osd_set.begin(), osd_set.end()); + } + if (ver_end < obj_end) + { + // Check for outdated versions not present in the current target OSD set + for (int i = ver_end; i < obj_end; i++) + { + int j; + for (j = 0; j < osd_set.size(); j++) + { + if (osd_set[j].osd_num == list[i].osd_num) + { + break; + } + } + if (j >= osd_set.size() && pg->cur_set[list[i].oid.stripe & STRIPE_MASK] != list[i].osd_num) + { + osd_set.push_back((pg_obj_loc_t){ + .role = (list[i].oid.stripe & STRIPE_MASK), + .osd_num = list[i].osd_num, + .outdated = true, + }); + state |= OBJ_MISPLACED; + pg->state = pg->state | PG_HAS_MISPLACED; + } + } + } + if (target_ver < max_ver) + { + pg->ver_override[oid] = target_ver; + } + if (state == OBJ_CLEAN) + { + pg->clean_count++; + } + else + { auto it = pg->state_dict.find(osd_set); if (it == pg->state_dict.end()) { @@ -214,7 +278,10 @@ void pg_obj_state_check_t::finish_object() } for (auto & o: osd_set) { - read_target[o.role] = o.osd_num; + if (!o.outdated) + { + read_target[o.role] = o.osd_num; + } } pg->state_dict[osd_set] = { .read_target = read_target, @@ -229,49 +296,6 @@ void pg_obj_state_check_t::finish_object() it->second.object_count++; } pg->obj_states[oid] = &it->second; - if (target_ver < max_ver) - { - pg->ver_override[oid] = target_ver; - } - if (state & (OBJ_NEEDS_ROLLBACK | OBJ_NEEDS_STABLE)) - { - std::unordered_map pieces; - for (int i = obj_start; i < obj_end; i++) - { - auto & pcs = pieces[(obj_piece_id_t){ .oid = list[i].oid, .osd_num = list[i].osd_num }]; - if (!pcs.max_ver) - { - pcs.max_ver = list[i].version; - } - if (list[i].is_stable && !pcs.stable_ver) - { - pcs.stable_ver = list[i].version; - } - } - for (auto pp: pieces) - { - auto & pcs = pp.second; - if (pcs.stable_ver < pcs.max_ver) - { - auto & act = pg->flush_actions[pp.first]; - if (pcs.max_ver > target_ver) - { - act.rollback = true; - act.rollback_to = target_ver; - } - else if (pcs.max_ver < target_ver && pcs.stable_ver < pcs.max_ver) - { - act.rollback = true; - act.rollback_to = pcs.stable_ver; - } - if (pcs.max_ver >= target_ver && pcs.stable_ver < target_ver) - { - act.make_stable = true; - act.stable_to = target_ver; - } - } - } - } } } diff --git a/osd_peering_pg.h b/osd_peering_pg.h index ff226e1f..b713966d 100644 --- a/osd_peering_pg.h +++ b/osd_peering_pg.h @@ -37,7 +37,7 @@ struct pg_obj_loc_t { uint64_t role; osd_num_t osd_num; - bool stable; + bool outdated; }; typedef std::vector pg_osd_set_t; @@ -121,8 +121,9 @@ struct pg_t inline bool operator < (const pg_obj_loc_t &a, const pg_obj_loc_t &b) { - return a.role < b.role || a.role == b.role && a.osd_num < b.osd_num || - a.role == b.role && a.osd_num == b.osd_num && a.stable < b.stable; + return a.role < b.role || + a.role == b.role && a.outdated < b.outdated || + a.role == b.role && a.outdated == b.outdated && a.osd_num < b.osd_num; } inline bool operator == (const obj_piece_id_t & a, const obj_piece_id_t & b) @@ -147,7 +148,6 @@ namespace std // Copy-pasted from spp::hash_combine() seed ^= (e.role + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2)); seed ^= (e.osd_num + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2)); - seed ^= ((e.stable ? 1 : 0) + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2)); } return seed; }