From 4c9bf6727b3f9c81906569764336c0b4d83d8766 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sun, 10 Apr 2022 13:57:46 +0300 Subject: [PATCH] Experimental: Handle degraded deletions by comparing object versions with epochs CAUTION! This version is not fool proof yet. If you purge data of an OSD by overwriting the disk with zeroes and restart it then the same data will also be removed from other replicas :-). I plan to add protection from this situation before merging it into master. The idea is to make each OSD store a random "cookie" on disk and remove itself from history automatically if the cookie doesn't match. --- mon/PGUtil.js | 15 +++- mon/mon.js | 19 ++--- src/cli_rm_osd.cpp | 15 +++- src/cluster_client_list.cpp | 15 ++-- src/etcd_state_client.cpp | 29 ++++++- src/etcd_state_client.h | 2 +- src/osd.h | 1 + src/osd_cluster.cpp | 35 +++++++- src/osd_flush.cpp | 21 ++++- src/osd_id.h | 24 ++++++ src/osd_peering.cpp | 130 +++++++++++++++++++++--------- src/osd_peering_pg.cpp | 146 ++++++++++++++++++++++++++++++++-- src/osd_peering_pg.h | 4 +- src/osd_primary.cpp | 38 ++++++--- src/osd_primary_chain.cpp | 6 ++ src/osd_primary_subops.cpp | 12 +-- src/osd_primary_write.cpp | 12 +-- src/pg_states.h | 1 + tests/run_tests.sh | 2 + tests/test_degraded_delete.sh | 131 ++++++++++++++++++++++++++++++ 20 files changed, 553 insertions(+), 105 deletions(-) create mode 100755 tests/test_degraded_delete.sh diff --git a/mon/PGUtil.js b/mon/PGUtil.js index 99bc91c2..c5a0a184 100644 --- a/mon/PGUtil.js +++ b/mon/PGUtil.js @@ -10,18 +10,25 @@ function add_pg_history(new_pg_history, new_pg, prev_pgs, prev_pg_history, old_p if (!new_pg_history[new_pg]) { new_pg_history[new_pg] = { - osd_sets: {}, + osd_set_epochs: {}, all_peers: {}, epoch: 0, }; } const nh = new_pg_history[new_pg], oh = prev_pg_history[old_pg]; - nh.osd_sets[prev_pgs[old_pg].join(' ')] = prev_pgs[old_pg]; + nh.osd_set_epochs[prev_pgs[old_pg].join(' ')] = { osd_set: prev_pgs[old_pg] }; if (oh && oh.osd_sets && oh.osd_sets.length) { for (const pg of oh.osd_sets) { - nh.osd_sets[pg.join(' ')] = pg.map(osd_num => Number(osd_num)); + nh.osd_set_epochs[pg.join(' ')] = { osd_set: pg.map(osd_num => Number(osd_num)) }; + } + } + if (oh && oh.osd_set_epochs && oh.osd_set_epochs.length) + { + for (const pg of oh.osd_set_epochs) + { + nh.osd_set_epochs[pg.osd_set.join(' ')] = { osd_set: pg.osd_set.map(osd_num => Number(osd_num)) }; } } if (oh && oh.all_peers && oh.all_peers.length) @@ -39,7 +46,7 @@ function add_pg_history(new_pg_history, new_pg, prev_pgs, prev_pg_history, old_p function finish_pg_history(merged_history) { - merged_history.osd_sets = Object.values(merged_history.osd_sets); + merged_history.osd_set_epochs = Object.values(merged_history.osd_set_epochs); merged_history.all_peers = Object.values(merged_history.all_peers); } diff --git a/mon/mon.js b/mon/mon.js index 01ef02f8..ba8ed70f 100644 --- a/mon/mon.js +++ b/mon/mon.js @@ -286,7 +286,12 @@ const etcd_tree = { history: { /* : { : { - osd_sets: osd_num_t[][], + osd_set_epochs: { + osd_set: osd_num_t[], + min_epoch: uint64_t, + max_epoch: uint64_t, + }[], + osd_sets: osd_num_t[][], // outdated all_peers: osd_num_t[], epoch: uint64_t, }, @@ -968,18 +973,6 @@ class Mon osd_set, primary: this.pick_primary(pool_id, osd_set, up_osds, aff_osds), }; - if (prev_pgs[i] && prev_pgs[i].join(' ') != osd_set.join(' ') && - prev_pgs[i].filter(osd_num => osd_num).length > 0) - { - pg_history[i] = pg_history[i] || {}; - pg_history[i].osd_sets = pg_history[i].osd_sets || []; - 
pg_history[i].osd_sets.push(prev_pgs[i]); - } - if (pg_history[i] && pg_history[i].osd_sets) - { - pg_history[i].osd_sets = Object.values(pg_history[i].osd_sets - .reduce((a, c) => { a[c.join(' ')] = c; return a; }, {})); - } }); for (let i = 0; i < new_pgs.length || i < prev_pgs.length; i++) { diff --git a/src/cli_rm_osd.cpp b/src/cli_rm_osd.cpp index fedb6ccd..d1830391 100644 --- a/src/cli_rm_osd.cpp +++ b/src/cli_rm_osd.cpp @@ -88,7 +88,7 @@ struct rm_osd_t for (auto & hist_item: pg_cfg.target_history) { int hist_size = 0, hist_rm = 0; - for (auto & old_osd: hist_item) + for (auto & old_osd: hist_item.osd_set) { if (old_osd != 0) { @@ -382,7 +382,7 @@ struct rm_osd_t for (int i = 0; i < pg_cfg.target_history.size(); i++) { int hist_size = 0, hist_rm = 0; - for (auto & old_osd: pg_cfg.target_history[i]) + for (auto & old_osd: pg_cfg.target_history[i].osd_set) { if (old_osd != 0) { @@ -406,6 +406,15 @@ struct rm_osd_t } if (update_pg_history) { + json11::Json::array target_history; + for (auto & pgh: pg_cfg.target_history) + { + target_history.push_back(json11::Json::object { + { "osd_set", pgh.osd_set }, + { "min_epoch", pgh.min_epoch }, + { "max_epoch", pgh.max_epoch }, + }); + } std::string history_key = base64_encode( parent->cli->st_cli.etcd_prefix+"/pg/history/"+ std::to_string(pool_cfg.id)+"/"+std::to_string(pg_num) @@ -416,7 +425,7 @@ struct rm_osd_t { "value", base64_encode(json11::Json(json11::Json::object { { "epoch", pg_cfg.epoch }, { "all_peers", pg_cfg.all_peers }, - { "osd_sets", pg_cfg.target_history }, + { "osd_set_epochs", target_history }, }).dump()) }, } }, }); diff --git a/src/cluster_client_list.cpp b/src/cluster_client_list.cpp index 30f57b49..058f5d1e 100644 --- a/src/cluster_client_list.cpp +++ b/src/cluster_client_list.cpp @@ -96,7 +96,7 @@ inode_list_t* cluster_client_t::list_inode_start(inode_t inode, } for (auto & hist_item: pg.target_history) { - for (auto pg_osd: hist_item) + for (auto pg_osd: hist_item.osd_set) { if (pg_osd != 0) { @@ -106,11 +106,14 @@ inode_list_t* cluster_client_t::list_inode_start(inode_t inode, } for (osd_num_t peer_osd: all_peers) { - r->list_osds.push_back((inode_list_osd_t){ - .pg = r, - .osd_num = peer_osd, - .sent = false, - }); + if (st_cli.peer_states.find(peer_osd) != st_cli.peer_states.end()) + { + r->list_osds.push_back((inode_list_osd_t){ + .pg = r, + .osd_num = peer_osd, + .sent = false, + }); + } } } else diff --git a/src/etcd_state_client.cpp b/src/etcd_state_client.cpp index af5f973c..ee17e7a7 100644 --- a/src/etcd_state_client.cpp +++ b/src/etcd_state_client.cpp @@ -902,9 +902,32 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv) history_set.insert(it, pg_osd_num); } } - auto it = std::lower_bound(pg_cfg.target_history.begin(), pg_cfg.target_history.end(), history_set); - if (it == pg_cfg.target_history.end() || *it != history_set) - pg_cfg.target_history.insert(it, history_set); + pg_history_set_t epoch_set = { .osd_set = history_set }; + auto it = std::lower_bound(pg_cfg.target_history.begin(), pg_cfg.target_history.end(), epoch_set); + if (it == pg_cfg.target_history.end() || *it != epoch_set) + pg_cfg.target_history.insert(it, epoch_set); + } + // Newer format with epochs + for (auto hist_item: value["osd_set_epochs"].array_items()) + { + pg_history_set_t history_set; + history_set.min_epoch = hist_item["min_epoch"].uint64_value(); + history_set.max_epoch = hist_item["max_epoch"].uint64_value(); + if (history_set.max_epoch < history_set.min_epoch) + { + history_set.max_epoch = 0; + history_set.min_epoch 
= 0; + } + for (auto pg_osd: hist_item["osd_set"].array_items()) + { + history_set.osd_set.push_back(pg_osd.uint64_value()); + } + if (history_set.max_epoch || history_set.osd_set.size()) + { + auto it = std::lower_bound(pg_cfg.target_history.begin(), pg_cfg.target_history.end(), history_set); + if (it == pg_cfg.target_history.end() || *it != history_set) + pg_cfg.target_history.insert(it, history_set); + } } // Include these additional OSDs when peering the PG for (auto pg_osd: value["all_peers"].array_items()) diff --git a/src/etcd_state_client.h b/src/etcd_state_client.h index fc2d9de2..ce38c674 100644 --- a/src/etcd_state_client.h +++ b/src/etcd_state_client.h @@ -33,7 +33,7 @@ struct pg_config_t bool exists; osd_num_t primary; std::vector target_set; - std::vector> target_history; + std::vector target_history; std::vector all_peers; bool pause; osd_num_t cur_primary; diff --git a/src/osd.h b/src/osd.h index 461fef38..8deb81d3 100644 --- a/src/osd.h +++ b/src/osd.h @@ -192,6 +192,7 @@ class osd_t void reset_stats(); json11::Json get_statistics(); void report_statistics(); + void add_pg_history(pg_t & pg); void report_pg_state(pg_t & pg); void report_pg_states(); void apply_pg_count(); diff --git a/src/osd_cluster.cpp b/src/osd_cluster.cpp index 7415e3f1..6d9522fb 100644 --- a/src/osd_cluster.cpp +++ b/src/osd_cluster.cpp @@ -674,7 +674,7 @@ void osd_t::apply_pg_config() } for (auto & hist_item: pg_cfg.target_history) { - for (auto pg_osd: hist_item) + for (auto pg_osd: hist_item.osd_set) { if (pg_osd != 0) { @@ -868,11 +868,40 @@ void osd_t::report_pg_states() // Prevent race conditions (for the case when the monitor is updating this key at the same time) pg.history_changed = false; std::string history_key = base64_encode(st_cli.etcd_prefix+"/pg/history/"+std::to_string(pg.pool_id)+"/"+std::to_string(pg.pg_num)); + json11::Json::array target_history; + for (auto & pgh: pg.target_history) + { + target_history.push_back(json11::Json::object { + { "osd_set", pgh.osd_set }, + { "min_epoch", pgh.min_epoch }, + { "max_epoch", pgh.max_epoch }, + }); + } + std::vector all_peers; + for (auto peer_osd: pg.all_peers) + { + bool found = false; + for (auto target_peer: pg.target_set) + { + if (target_peer == peer_osd) + { + found = true; + break; + } + } + if (!found) + { + all_peers.push_back(peer_osd); + } + } json11::Json::object history_value = { { "epoch", pg.epoch }, - { "all_peers", pg.all_peers }, - { "osd_sets", pg.target_history }, + { "osd_set_epochs", target_history }, }; + if (all_peers.size()) + { + history_value["all_peers"] = all_peers; + } checks.push_back(json11::Json::object { { "target", "MOD" }, { "key", history_key }, diff --git a/src/osd_flush.cpp b/src/osd_flush.cpp index d4317760..18380ae9 100644 --- a/src/osd_flush.cpp +++ b/src/osd_flush.cpp @@ -287,6 +287,25 @@ bool osd_t::pick_next_recovery(osd_recovery_op_t &op) void osd_t::submit_recovery_op(osd_recovery_op_t *op) { + // Check if the object is deleted + bool is_deleted = false; + pool_id_t pool_id = INODE_POOL(op->oid.inode); + auto pool_cfg_it = st_cli.pool_config.find(pool_id); + if (pool_cfg_it != st_cli.pool_config.end()) + { + pg_num_t pg_num = (op->oid.stripe/pool_cfg_it->second.pg_stripe_size) % pg_counts[pool_id] + 1; // like map_to_pg() + auto pg_it = pgs.find({ .pool_id = pool_id, .pg_num = pg_num }); + if (pg_it != pgs.end()) + { + pg_osd_set_state_t *object_state; + get_object_osd_set(pg_it->second, op->oid, pg_it->second.cur_set.data(), &object_state); + if (object_state && (object_state->state & 
OBJ_DELETED)) + { + // Object is deleted, but not from all OSDs - delete remaining copies + is_deleted = true; + } + } + } op->osd_op = new osd_op_t(); op->osd_op->op_type = OSD_OP_OUT; op->osd_op->req = (osd_any_op_t){ @@ -294,7 +313,7 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op) .header = { .magic = SECONDARY_OSD_OP_MAGIC, .id = 1, - .opcode = OSD_OP_WRITE, + .opcode = (uint64_t)(is_deleted ? OSD_OP_DELETE : OSD_OP_WRITE), }, .inode = op->oid.inode, .offset = op->oid.stripe, diff --git a/src/osd_id.h b/src/osd_id.h index 2ebeb817..aabe1c07 100644 --- a/src/osd_id.h +++ b/src/osd_id.h @@ -3,6 +3,8 @@ #pragma once +#include + #define POOL_SCHEME_REPLICATED 1 #define POOL_SCHEME_XOR 2 #define POOL_SCHEME_EC 3 @@ -38,3 +40,25 @@ inline bool operator != (const pool_pg_num_t & a, const pool_pg_num_t & b) { return a.pool_id != b.pool_id || a.pg_num != b.pg_num; } + +struct pg_history_set_t +{ + std::vector osd_set; + uint64_t min_epoch, max_epoch; +}; + +inline bool operator == (const pg_history_set_t & a, const pg_history_set_t & b) +{ + return a.min_epoch == b.min_epoch && a.max_epoch == b.max_epoch && a.osd_set == b.osd_set; +} + +inline bool operator != (const pg_history_set_t & a, const pg_history_set_t & b) +{ + return a.min_epoch != b.min_epoch || a.max_epoch != b.max_epoch || a.osd_set != b.osd_set; +} + +inline bool operator < (const pg_history_set_t & a, const pg_history_set_t & b) +{ + return a.min_epoch < b.min_epoch || a.min_epoch == b.min_epoch && + (a.max_epoch < b.max_epoch || a.max_epoch == b.max_epoch && a.osd_set < b.osd_set); +} diff --git a/src/osd_peering.cpp b/src/osd_peering.cpp index 6a021e32..2f184f39 100644 --- a/src/osd_peering.cpp +++ b/src/osd_peering.cpp @@ -231,7 +231,7 @@ void osd_t::start_pg_peering(pg_t & pg) for (auto & history_set: pg.target_history) { bool found = true; - for (auto history_osd: history_set) + for (auto history_osd: history_set.osd_set) { if (history_osd != 0) { @@ -471,61 +471,71 @@ void osd_t::finish_stop_pg(pg_t & pg) report_pg_state(pg); } +static int count_nonzero_osds(const std::vector & v) +{ + int n = 0; + for (auto & osd_num: v) + { + if (osd_num != 0) + { + n++; + } + } + return n; +} + void osd_t::report_pg_state(pg_t & pg) { pg.print_state(); this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }); - if (pg.state == PG_ACTIVE && (pg.target_history.size() > 0 || pg.all_peers.size() > pg.target_set.size())) + if ((pg.state == PG_ACTIVE || pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD)) && + (pg.target_history.size() != 1 || + pg.target_history[0].osd_set != pg.target_set || + pg.target_history[0].min_epoch != 0 || + pg.target_history[0].max_epoch != pg.epoch || + pg.all_peers.size() > count_nonzero_osds(pg.target_set))) { // Clear history of active+clean PGs pg.history_changed = true; pg.target_history.clear(); - pg.all_peers = pg.target_set; - std::sort(pg.all_peers.begin(), pg.all_peers.end()); - pg.cur_peers = pg.target_set; - // Change pg_config at the same time, otherwise our PG reconciling loop may try to apply the old metadata - auto & pg_cfg = st_cli.pool_config[pg.pool_id].pg_config[pg.pg_num]; - pg_cfg.target_history = pg.target_history; - pg_cfg.all_peers = pg.all_peers; - } - else if (pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD)) - { - // Clear history of active+left_on_dead PGs, but leave dead OSDs in all_peers - if (pg.target_history.size()) + pg.target_history.push_back((pg_history_set_t){ + .osd_set = pg.cur_set, + .min_epoch = 0, + .max_epoch = pg.epoch, + }); + if (pg.state == PG_ACTIVE) { - 
pg.history_changed = true; - pg.target_history.clear(); - } - std::set dead_peers; - for (auto pg_osd: pg.all_peers) - { - dead_peers.insert(pg_osd); - } - for (auto pg_osd: pg.cur_peers) - { - dead_peers.erase(pg_osd); - } - for (auto pg_osd: pg.target_set) - { - if (pg_osd) + pg.all_peers.clear(); + for (auto pg_osd: pg.target_set) { - dead_peers.insert(pg_osd); + if (pg_osd) + pg.all_peers.push_back(pg_osd); } } - auto new_all_peers = std::vector(dead_peers.begin(), dead_peers.end()); - if (pg.all_peers != new_all_peers) + else { - pg.history_changed = true; - pg.all_peers = new_all_peers; + // Clear history of active+left_on_dead PGs, but leave dead OSDs in all_peers + std::set dead_peers(pg.all_peers.begin(), pg.all_peers.end()); + for (auto pg_osd: pg.cur_peers) + { + dead_peers.erase(pg_osd); + } + for (auto pg_osd: pg.target_set) + { + if (pg_osd) + dead_peers.insert(pg_osd); + } + pg.all_peers.clear(); + pg.all_peers.insert(pg.all_peers.begin(), dead_peers.begin(), dead_peers.end()); } + std::sort(pg.all_peers.begin(), pg.all_peers.end()); pg.cur_peers.clear(); for (auto pg_osd: pg.target_set) { if (pg_osd) - { pg.cur_peers.push_back(pg_osd); - } } + // Change pg_config at the same time, otherwise our PG reconciling loop may try to apply the old metadata auto & pg_cfg = st_cli.pool_config[pg.pool_id].pg_config[pg.pg_num]; pg_cfg.target_history = pg.target_history; pg_cfg.all_peers = pg.all_peers; @@ -536,3 +546,51 @@ void osd_t::report_pg_state(pg_t & pg) } report_pg_states(); } + +void osd_t::add_pg_history(pg_t & pg) +{ + bool epoch_already_reported = false; + int max_epoch_pos = -1; + for (int i = pg.target_history.size()-1; i >= 0; i--) + { + if (pg.target_history[i].min_epoch > pg.epoch) + { + printf("[PG %u/%u] Invalid PG history: there is an entry with min_epoch (%lu) > current epoch (%lu)\n", + pg.pool_id, pg.pg_num, pg.target_history[i].min_epoch, pg.epoch); + force_stop(1); + return; + } + if (max_epoch_pos < 0 || pg.target_history[i].max_epoch > pg.target_history[max_epoch_pos].max_epoch) + { + max_epoch_pos = i; + } + if (pg.target_history[i].min_epoch <= pg.epoch && + pg.target_history[i].max_epoch >= pg.epoch) + { + if (pg.target_history[i].osd_set != pg.cur_set) + { + printf("[PG %u/%u] Invalid target_history: epoch %lu has another OSD set already registered\n", pg.pool_id, pg.pg_num, pg.epoch); + force_stop(1); + return; + } + // Already reported + epoch_already_reported = true; + break; + } + } + if (!epoch_already_reported) + { + if (max_epoch_pos >= 0 && pg.target_history[max_epoch_pos].osd_set == pg.cur_set) + { + pg.target_history[max_epoch_pos].max_epoch = pg.epoch; + } + else + { + pg.target_history.push_back((pg_history_set_t){ + .osd_set = pg.cur_set, + .min_epoch = pg.epoch, + .max_epoch = pg.epoch, + }); + } + } +} diff --git a/src/osd_peering_pg.cpp b/src/osd_peering_pg.cpp index 249f268b..8bd53abf 100644 --- a/src/osd_peering_pg.cpp +++ b/src/osd_peering_pg.cpp @@ -52,6 +52,7 @@ struct pg_obj_state_check_t void walk(); void start_object(); + void recheck_version_osd_set(); void handle_version(); void finish_object(); }; @@ -84,6 +85,7 @@ void pg_obj_state_check_t::walk() pg->state = PG_INCOMPLETE | PG_HAS_INVALID; return; } + // Activate PG if (pg->pg_cursize < pg->pg_size) { // Activate as degraded @@ -108,13 +110,85 @@ void pg_obj_state_check_t::start_object() n_unstable = n_invalid = 0; } +// FIXME: Put this under a feature flag +// FIXME: Implement OSD 'cookies' to be fool-proof so that if an OSD is wiped and +// recreated it doesn't also wipe all 
other data +void pg_obj_state_check_t::recheck_version_osd_set() +{ + uint64_t epoch = (last_ver >> (64-PG_EPOCH_BITS)); + if (!pg->epoch_sizes_differ && n_copies >= pg->pg_size) + { + // Enough copies + return; + } + auto epoch_it = pg->target_by_epoch.lower_bound(epoch); + if (epoch_it == pg->target_by_epoch.end() || epoch_it->second.min_epoch > epoch) + { + // Epoch info not found + return; + } + if (pg->epoch_sizes_differ && n_copies >= epoch_it->second.osd_set.size()) + { + // For the (unlikely) case of PG size change - enough copies + return; + } + // Recheck version against the OSD set corresponding to epoch if it's known + if (epoch_it != pg->target_by_epoch.end() && epoch_it->second.min_epoch <= epoch) + { + for (int j = 0; j < epoch_it->second.osd_set.size(); j++) + { + osd_num_t cur_osd = epoch_it->second.osd_set[j]; + bool found = false; + for (int i = ver_start; i < ver_end; i++) + { + if (cur_osd == list[i].osd_num) + { + found = true; + break; + } + } + if (!found) + { + // Check if a newer version is present on the same OSD and masks the older one + // It happens for overwritten replicas in the following case: + // Version 1 is present on OSD 1,2,3 + // Client tries to write Version 2 + // OSD 3 succeeds to write Version 2, others don't. OSD 3 crashes, then starts again + // OSD 1 sees: version 1 on OSD 1,2 and version 2 on OSD 3 + // (version 1 on OSD 3 is already masked/removed) + // Version 1 is not present on a full set, but it must not be removed + if (replicated) + { + for (int i = obj_start; i < ver_start; i++) + { + if (cur_osd == list[i].osd_num) + { + found = true; + break; + } + } + } + if (!found) + { + // Object is missing from one of the OSDs of that set. + // This means it's deleted or moved and we can safely drop this version. + target_ver = 0; + break; + } + } + } + } +} + void pg_obj_state_check_t::handle_version() { if (!target_ver && last_ver != list[list_pos].version && (n_stable > 0 || n_roles >= pg->pg_data_size)) { // Version is either stable or recoverable - target_ver = last_ver; ver_end = list_pos; + target_ver = last_ver; + // Skip versions that are not present on any of OSDs for the corresponding PG epoch + recheck_version_osd_set(); } if (!target_ver) { @@ -178,6 +252,8 @@ void pg_obj_state_check_t::finish_object() // Version is either stable or recoverable target_ver = last_ver; ver_end = list_pos; + // Skip versions that are not present on any of OSDs for the corresponding PG epoch + recheck_version_osd_set(); } obj_end = list_pos; // Remember the decision @@ -231,11 +307,23 @@ void pg_obj_state_check_t::finish_object() } } } - if (!target_ver) + if (!target_ver && (n_unstable >= obj_end-obj_start)) { return; } - if (!replicated && n_roles < pg->pg_data_size) + if (!target_ver) + { + // Object is present, but should not be :) i.e. 
it's a deleted object that reappeared + if (log_level > 1) + { + printf("Object is deleted: %lx:%lx version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver); + } + state = OBJ_DELETED; + pg->state = pg->state | PG_HAS_MISPLACED; + // To record all versions as outdated: + ver_end = obj_start; + } + else if (!replicated && n_roles < pg->pg_data_size) { if (log_level > 1) { @@ -263,7 +351,7 @@ void pg_obj_state_check_t::finish_object() pg->state = pg->state | PG_HAS_MISPLACED; } if (log_level > 1 && (state & (OBJ_INCOMPLETE | OBJ_DEGRADED)) || - log_level > 2 && (state & OBJ_MISPLACED)) + log_level > 2 && (state & (OBJ_MISPLACED | OBJ_DELETED))) { for (int i = obj_start; i < obj_end; i++) { @@ -272,9 +360,9 @@ void pg_obj_state_check_t::finish_object() } } pg->total_count++; - if (state != 0 || ver_end < obj_end) + osd_set.clear(); + if (target_ver != 0 && (state != 0 || ver_end < obj_end)) { - osd_set.clear(); for (int i = ver_start; i < ver_end; i++) { osd_set.push_back((pg_obj_loc_t){ @@ -297,7 +385,8 @@ void pg_obj_state_check_t::finish_object() break; } } - if (j >= osd_set.size() && pg->cur_set[list[i].oid.stripe & STRIPE_MASK] != list[i].osd_num) + if (j >= osd_set.size() && ((state & OBJ_DELETED) || + pg->cur_set[list[i].oid.stripe & STRIPE_MASK] != list[i].osd_num)) { osd_set.push_back((pg_obj_loc_t){ .role = (list[i].oid.stripe & STRIPE_MASK), @@ -312,7 +401,11 @@ void pg_obj_state_check_t::finish_object() } } } - if (target_ver < max_ver) + if (state & OBJ_DELETED) + { + pg->ver_override[oid] = max_ver; + } + else if (target_ver < max_ver) { pg->ver_override[oid] = target_ver; } @@ -366,6 +459,7 @@ void pg_obj_state_check_t::finish_object() } else { + assert(it->second.state == state); it->second.object_count++; } if (state & OBJ_INCOMPLETE) @@ -386,6 +480,34 @@ void pg_obj_state_check_t::finish_object() // FIXME: Write at least some tests for this function void pg_t::calc_object_states(int log_level) { + // Calculate intersections of target_history with cur_peers + for (auto & history_item: target_history) + { + if (history_item.max_epoch) + { + pg_history_set_t & set_copy = target_by_epoch[history_item.max_epoch]; + set_copy.min_epoch = history_item.min_epoch; + set_copy.max_epoch = history_item.max_epoch; + for (int i = 0; i < history_item.osd_set.size(); i++) + { + if (history_item.osd_set[i] != 0) + { + for (int j = 0; j < cur_set.size(); j++) + { + if (cur_set[j] == history_item.osd_set[i]) + { + set_copy.osd_set.push_back(history_item.osd_set[i]); + break; + } + } + } + } + if (set_copy.osd_set.size() != pg_size) + { + epoch_sizes_differ = true; + } + } + } // Copy all object lists into one array pg_obj_state_check_t st; st.log_level = log_level; @@ -422,10 +544,18 @@ void pg_t::calc_object_states(int log_level) std::sort(st.list.begin(), st.list.end()); // Walk over it and check object states st.walk(); + target_by_epoch.clear(); // needed only in this function if (this->state != PG_ACTIVE) { assert(epoch != (((uint64_t)1 << PG_EPOCH_BITS)-1)); epoch++; + for (auto & pgh: target_history) + { + if (epoch <= pgh.max_epoch) + { + epoch = pgh.max_epoch+1; + } + } } if (log_level > 0) { diff --git a/src/osd_peering_pg.h b/src/osd_peering_pg.h index d0e729ee..57c451ab 100644 --- a/src/osd_peering_pg.h +++ b/src/osd_peering_pg.h @@ -89,7 +89,9 @@ struct pg_t // epoch number - should increase with each non-clean activation of the PG uint64_t epoch = 0, reported_epoch = 0; // target history and all potential peers - std::vector> target_history; + std::vector target_history; + 
std::map target_by_epoch; + bool epoch_sizes_differ = false; std::vector all_peers; bool history_changed = false; // peer list from the last peering event diff --git a/src/osd_primary.cpp b/src/osd_primary.cpp index 95d947c3..bc18674a 100644 --- a/src/osd_primary.cpp +++ b/src/osd_primary.cpp @@ -199,6 +199,21 @@ void osd_t::continue_primary_read(osd_op_t *cur_op) { // PG may be degraded or have misplaced objects op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state); + if (op_data->object_state && (op_data->object_state->state & OBJ_DELETED)) + { + // Object is deleted, just return zeroes + cur_op->reply.rw.version = 0; + cur_op->reply.rw.bitmap_len = op_data->pg_data_size * clean_entry_bitmap_size; + uint64_t zero_len = cur_op->reply.rw.bitmap_len + cur_op->req.rw.len; + while (zero_len >= 0) + { + uint64_t cur_zero_len = zero_buffer_size > zero_len ? zero_len : zero_buffer_size; + cur_op->iov.push_back(zero_buffer, cur_zero_len); + zero_len -= cur_zero_len; + } + finish_op(cur_op, cur_op->req.rw.len); + return; + } } if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED) { @@ -290,7 +305,7 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object report_pg_state(pg); } } - else if (object_state->state & OBJ_MISPLACED) + else if (object_state->state & (OBJ_MISPLACED | OBJ_DELETED)) { this->misplaced_objects--; pg.misplaced_objects.erase(oid); @@ -329,12 +344,6 @@ void osd_t::continue_primary_del(osd_op_t *cur_op) else if (op_data->st == 4) goto resume_4; else if (op_data->st == 5) goto resume_5; assert(op_data->st == 0); - // Delete is forbidden even in active PGs if they're also degraded or have previous dead OSDs - if (pg.state & (PG_DEGRADED | PG_LEFT_ON_DEAD)) - { - finish_op(cur_op, -EBUSY); - return; - } if (!check_write_queue(cur_op, pg)) { return; @@ -342,11 +351,18 @@ void osd_t::continue_primary_del(osd_op_t *cur_op) resume_1: // Determine which OSDs contain this object and delete it op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state); - // Submit 1 read to determine the actual version number - submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op); + if (op_data->object_state && (op_data->object_state->state & OBJ_DELETED)) + { + op_data->fact_ver = pg.ver_override[op_data->oid]; + } + else + { + // Submit 1 read to determine the actual version number + submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op); resume_2: - op_data->st = 2; - return; + op_data->st = 2; + return; + } resume_3: if (op_data->errors > 0) { diff --git a/src/osd_primary_chain.cpp b/src/osd_primary_chain.cpp index 01fc39ed..90f4f9ad 100644 --- a/src/osd_primary_chain.cpp +++ b/src/osd_primary_chain.cpp @@ -133,6 +133,12 @@ int osd_t::collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vectorsecond : UINT64_MAX; pg_osd_set_state_t *object_state; uint64_t* cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state); + if (object_state && (object_state->state & OBJ_DELETED)) + { + // Object is deleted, zero out the bitmap + memset((uint8_t*)op_data->snapshot_bitmaps + chain_num*clean_entry_bitmap_size, 0, clean_entry_bitmap_size); + continue; + } if (pg.scheme == POOL_SCHEME_REPLICATED) { osd_num_t read_target = 0; diff --git a/src/osd_primary_subops.cpp b/src/osd_primary_subops.cpp index 5b6ab48d..8aa65ce8 100644 --- a/src/osd_primary_subops.cpp +++ b/src/osd_primary_subops.cpp @@ -119,17 +119,19 @@ void 
osd_t::submit_primary_subops(int submit_type, uint64_t op_version, const ui if (osd_set[role] != 0 && (wr || !rep && stripes[role].read_end != 0)) n_subops++; } - if (!n_subops && (submit_type == SUBMIT_RMW_READ || rep)) + if (!n_subops && (submit_type == SUBMIT_RMW_READ || rep) && zero_read >= 0) n_subops = 1; else zero_read = -1; - osd_op_t *subops = new osd_op_t[n_subops]; op_data->fact_ver = 0; op_data->done = op_data->errors = op_data->errcode = 0; op_data->n_subops = n_subops; - op_data->subops = subops; - int sent = submit_primary_subop_batch(submit_type, op_data->oid.inode, op_version, op_data->stripes, osd_set, cur_op, 0, zero_read); - assert(sent == n_subops); + if (n_subops > 0) + { + op_data->subops = new osd_op_t[n_subops]; + int sent = submit_primary_subop_batch(submit_type, op_data->oid.inode, op_version, op_data->stripes, osd_set, cur_op, 0, zero_read); + assert(sent == n_subops); + } } int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t op_version, diff --git a/src/osd_primary_write.cpp b/src/osd_primary_write.cpp index dd6e6652..ef92a618 100644 --- a/src/osd_primary_write.cpp +++ b/src/osd_primary_write.cpp @@ -156,21 +156,13 @@ resume_3: { // Report newer epoch before writing // FIXME: We don't have to report all changed PG states here - this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }); if (pg.state != PG_ACTIVE) { // Check that current OSD set is in history and/or add it there - std::vector history_set; - for (auto peer_osd: pg.cur_set) - if (peer_osd != 0) - history_set.push_back(peer_osd); - std::sort(history_set.begin(), history_set.end()); - auto it = std::lower_bound(pg.target_history.begin(), pg.target_history.end(), history_set); - if (it == pg.target_history.end() || *it != history_set) - pg.target_history.insert(it, history_set); + add_pg_history(pg); } pg.history_changed = true; - report_pg_states(); + report_pg_state(pg); resume_10: if (pg.epoch > pg.reported_epoch) { diff --git a/src/pg_states.h b/src/pg_states.h index 605c817a..df81ed67 100644 --- a/src/pg_states.h +++ b/src/pg_states.h @@ -32,6 +32,7 @@ #define OBJ_DEGRADED 0x02 #define OBJ_INCOMPLETE 0x04 #define OBJ_MISPLACED 0x08 +#define OBJ_DELETED 0x10 #define OBJ_NEEDS_STABLE 0x10000 #define OBJ_NEEDS_ROLLBACK 0x20000 diff --git a/tests/run_tests.sh b/tests/run_tests.sh index 82fb5d98..122d03e0 100755 --- a/tests/run_tests.sh +++ b/tests/run_tests.sh @@ -14,6 +14,8 @@ SCHEME=ec ./test_change_pg_count.sh ./test_create_nomaxid.sh +./test_degraded_delete.sh + ./test_etcd_fail.sh ./test_failure_domain.sh diff --git a/tests/test_degraded_delete.sh b/tests/test_degraded_delete.sh new file mode 100755 index 00000000..389717bd --- /dev/null +++ b/tests/test_degraded_delete.sh @@ -0,0 +1,131 @@ +#!/bin/bash -ex + +# Run 3 OSDs + +. 
`dirname $0`/run_3osds.sh + +# Write inodes 1 and 2 + +LD_PRELOAD="build/src/libfio_vitastor.so" \ + fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=1M -direct=1 -iodepth=4 \ + -rw=write -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -runtime=10 + +LD_PRELOAD="build/src/libfio_vitastor.so" \ + fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=1M -direct=1 -iodepth=4 \ + -rw=write -etcd=$ETCD_URL -pool=1 -inode=2 -size=128M -runtime=10 + +LD_PRELOAD="build/src/libfio_vitastor.so" \ + fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4k -direct=1 -iodepth=16 \ + -rw=randwrite -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -runtime=10 &>/dev/null & + +sleep 5 + +# Stop OSD 1 + +kill -INT $OSD1_PID +sleep 2 + +# Remove inode 2 + +build/src/vitastor-cli rm-data --etcd_address $ETCD_URL --pool 1 --inode 2 + +# Run 3 more OSDs and move PG to 4,5,6 + +for i in $(seq 4 6); do + dd if=/dev/zero of=./testdata/test_osd$i.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1)) + build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-cli simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log & + eval OSD${i}_PID=$! +done + +$ETCDCTL put /vitastor/config/osd/1 '{"reweight":0}' +$ETCDCTL put /vitastor/config/osd/2 '{"reweight":0}' +$ETCDCTL put /vitastor/config/osd/3 '{"reweight":0}' + +# Wait for rebalance to finish + +wait_finish_rebalance() +{ + local sec=$1 + local st=$2 + local i=0 + while [[ $i -lt $sec ]]; do + if $ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e \ + '([ .[] | select(.state == ['$st'] and (.peers | contains([1]) | not) and (.peers | contains([2,3]) | not)) ] | length) == '$PG_COUNT; then + break + fi + sleep 1 + i=$((i+1)) + if [ $i -eq $sec ]; then + format_error "Rebalance couldn't finish in $sec seconds" + fi + done +} + +wait_finish_rebalance 60 '"active","left_on_dead"' + +# Stop OSD 2,3 + +kill -INT $OSD2_PID +kill -INT $OSD3_PID +sleep 2 + +# Verify that PGs are still active + +if ! ($ETCDCTL get /vitastor/pg/state/1/ --prefix --print-value-only | jq -s -e '[ .[] | select(.state == ["active","left_on_dead"]) ] | length == '$PG_COUNT); then + format_error "FAILED: $PG_COUNT PG(s) NOT UP" +fi + +# Start OSD 1 + +build/src/vitastor-osd --osd_num 1 --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-cli simple-offsets --format options ./testdata/test_osd1.bin 2>/dev/null) &>./testdata/osd1.log & +OSD1_PID=$! + +# Verify that inode 2 is removed and inode 1 is in place + +wait_repeer_1() +{ + local sec=$1 + local i=0 + while [[ $i -lt $sec ]]; do + if grep -q 'Repeer because of OSD 1' testdata/osd4.log testdata/osd5.log testdata/osd6.log; then + break + fi + sleep 1 + i=$((i+1)) + if [ $i -eq $sec ]; then + format_error "OSD 4/5/6 do not peer with older OSD 1" + fi + done +} + +wait_repeer_1 15 + +wait_finish_rebalance 15 '"active"' + +if [ "$SCHEME" = "replicated" ]; then + NOBJ=1024 +else + NOBJ=$((1024/(PG_SIZE-1))) +fi + +if ! 
($ETCDCTL get /vitastor/pg/stats/1/1 --print-value-only | jq -s -e '.[0].object_count == '$NOBJ); then + format_error "FAILED: PG SHOULD CONTAIN EXACTLY 128 MB OF DATA, BUT IT DOESN'T" +fi + +qemu-img convert -S 4096 -p \ + -f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=1:size=4096" \ + -O raw ./testdata/inode1.bin + +qemu-img convert -S 4096 -p \ + -f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=2:size="$((128*1024*1024)) \ + -O raw ./testdata/inode2.bin + +if (dd if=/dev/zero bs=4096 count=1 | diff - ./testdata/inode1.bin); then + format_error "FAILED: INODE 1 SEEMS LOST" +fi + +if ! (dd if=/dev/zero bs=1M count=128 | diff - ./testdata/inode2.bin); then + format_error "FAILED: INODE 2 SEEMS RESTORED" +fi + +format_green OK
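
The core of the change is the epoch check added in pg_obj_state_check_t::recheck_version_osd_set(): each object version carries the PG epoch in its upper PG_EPOCH_BITS, so during peering the primary can look up the OSD set that served the PG when that version was written (target_by_epoch, built from the new osd_set_epochs history) and, if the version is missing from any member of that historical set, treat it as deleted or moved rather than degraded. Below is a minimal standalone sketch of that lookup, assuming PG_EPOCH_BITS = 48 and simplified type/function names of my own; it illustrates the idea only and is not the actual Vitastor code above, which additionally handles replicas masked by newer versions and PG size changes.

#include <cstdint>
#include <cstdio>
#include <algorithm>
#include <map>
#include <vector>

#define PG_EPOCH_BITS 48 // assumed epoch width in the version's upper bits (illustration only)

struct history_set_t
{
    std::vector<uint64_t> osd_set; // OSDs that served the PG during [min_epoch, max_epoch]
    uint64_t min_epoch, max_epoch;
};

// Keyed by max_epoch, as in the patch, so lower_bound() returns the first
// history entry whose epoch range may cover the requested epoch.
static bool version_can_be_dropped(const std::map<uint64_t, history_set_t> & target_by_epoch,
    uint64_t version, const std::vector<uint64_t> & osds_holding_version)
{
    uint64_t epoch = version >> (64 - PG_EPOCH_BITS);
    auto it = target_by_epoch.lower_bound(epoch);
    if (it == target_by_epoch.end() || it->second.min_epoch > epoch)
        return false; // no history for this epoch - keep the version
    for (uint64_t osd_num: it->second.osd_set)
    {
        bool found = std::find(osds_holding_version.begin(), osds_holding_version.end(), osd_num)
            != osds_holding_version.end();
        if (!found)
            return true; // missing from one OSD of that epoch's set => deleted or moved meanwhile
    }
    return false; // present on the full historical set - must be preserved
}

int main()
{
    std::map<uint64_t, history_set_t> target_by_epoch;
    target_by_epoch[5] = { { 1, 2, 3 }, 4, 5 }; // epochs 4..5 were served by OSDs 1, 2, 3
    uint64_t ver = (4ull << (64 - PG_EPOCH_BITS)) | 100; // version 100, written in epoch 4
    // Only OSD 1 (down during the deletion) still reports the version => drop it:
    printf("only on OSD 1: drop=%d\n", (int)version_can_be_dropped(target_by_epoch, ver, { 1 }));
    // All OSDs of epoch 4 report it => it is a live object, keep it:
    printf("on OSDs 1,2,3: drop=%d\n", (int)version_can_be_dropped(target_by_epoch, ver, { 1, 2, 3 }));
    return 0;
}

Compiled on its own (e.g. g++ -std=c++11), the first check prints drop=1 because OSDs 2 and 3 of epoch 4's set no longer hold the version, and the second prints drop=0 because the full historical set still has it.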