diff --git a/mon/PGUtil.js b/mon/PGUtil.js index 99bc91c2..c5a0a184 100644 --- a/mon/PGUtil.js +++ b/mon/PGUtil.js @@ -10,18 +10,25 @@ function add_pg_history(new_pg_history, new_pg, prev_pgs, prev_pg_history, old_p if (!new_pg_history[new_pg]) { new_pg_history[new_pg] = { - osd_sets: {}, + osd_set_epochs: {}, all_peers: {}, epoch: 0, }; } const nh = new_pg_history[new_pg], oh = prev_pg_history[old_pg]; - nh.osd_sets[prev_pgs[old_pg].join(' ')] = prev_pgs[old_pg]; + nh.osd_set_epochs[prev_pgs[old_pg].join(' ')] = { osd_set: prev_pgs[old_pg] }; if (oh && oh.osd_sets && oh.osd_sets.length) { for (const pg of oh.osd_sets) { - nh.osd_sets[pg.join(' ')] = pg.map(osd_num => Number(osd_num)); + nh.osd_set_epochs[pg.join(' ')] = { osd_set: pg.map(osd_num => Number(osd_num)) }; + } + } + if (oh && oh.osd_set_epochs && oh.osd_set_epochs.length) + { + for (const pg of oh.osd_set_epochs) + { + nh.osd_set_epochs[pg.osd_set.join(' ')] = { osd_set: pg.osd_set.map(osd_num => Number(osd_num)) }; } } if (oh && oh.all_peers && oh.all_peers.length) @@ -39,7 +46,7 @@ function add_pg_history(new_pg_history, new_pg, prev_pgs, prev_pg_history, old_p function finish_pg_history(merged_history) { - merged_history.osd_sets = Object.values(merged_history.osd_sets); + merged_history.osd_set_epochs = Object.values(merged_history.osd_set_epochs); merged_history.all_peers = Object.values(merged_history.all_peers); } diff --git a/mon/mon.js b/mon/mon.js index 01ef02f8..ba8ed70f 100644 --- a/mon/mon.js +++ b/mon/mon.js @@ -286,7 +286,12 @@ const etcd_tree = { history: { /* : { : { - osd_sets: osd_num_t[][], + osd_set_epochs: { + osd_set: osd_num_t[], + min_epoch: uint64_t, + max_epoch: uint64_t, + }[], + osd_sets: osd_num_t[][], // outdated all_peers: osd_num_t[], epoch: uint64_t, }, @@ -968,18 +973,6 @@ class Mon osd_set, primary: this.pick_primary(pool_id, osd_set, up_osds, aff_osds), }; - if (prev_pgs[i] && prev_pgs[i].join(' ') != osd_set.join(' ') && - prev_pgs[i].filter(osd_num => osd_num).length > 0) - { - pg_history[i] = pg_history[i] || {}; - pg_history[i].osd_sets = pg_history[i].osd_sets || []; - pg_history[i].osd_sets.push(prev_pgs[i]); - } - if (pg_history[i] && pg_history[i].osd_sets) - { - pg_history[i].osd_sets = Object.values(pg_history[i].osd_sets - .reduce((a, c) => { a[c.join(' ')] = c; return a; }, {})); - } }); for (let i = 0; i < new_pgs.length || i < prev_pgs.length; i++) { diff --git a/src/cli_rm_osd.cpp b/src/cli_rm_osd.cpp index fedb6ccd..d1830391 100644 --- a/src/cli_rm_osd.cpp +++ b/src/cli_rm_osd.cpp @@ -88,7 +88,7 @@ struct rm_osd_t for (auto & hist_item: pg_cfg.target_history) { int hist_size = 0, hist_rm = 0; - for (auto & old_osd: hist_item) + for (auto & old_osd: hist_item.osd_set) { if (old_osd != 0) { @@ -382,7 +382,7 @@ struct rm_osd_t for (int i = 0; i < pg_cfg.target_history.size(); i++) { int hist_size = 0, hist_rm = 0; - for (auto & old_osd: pg_cfg.target_history[i]) + for (auto & old_osd: pg_cfg.target_history[i].osd_set) { if (old_osd != 0) { @@ -406,6 +406,15 @@ struct rm_osd_t } if (update_pg_history) { + json11::Json::array target_history; + for (auto & pgh: pg_cfg.target_history) + { + target_history.push_back(json11::Json::object { + { "osd_set", pgh.osd_set }, + { "min_epoch", pgh.min_epoch }, + { "max_epoch", pgh.max_epoch }, + }); + } std::string history_key = base64_encode( parent->cli->st_cli.etcd_prefix+"/pg/history/"+ std::to_string(pool_cfg.id)+"/"+std::to_string(pg_num) @@ -416,7 +425,7 @@ struct rm_osd_t { "value", base64_encode(json11::Json(json11::Json::object { { "epoch", pg_cfg.epoch }, { "all_peers", pg_cfg.all_peers }, - { "osd_sets", pg_cfg.target_history }, + { "osd_set_epochs", target_history }, }).dump()) }, } }, }); diff --git a/src/cluster_client_list.cpp b/src/cluster_client_list.cpp index 30f57b49..058f5d1e 100644 --- a/src/cluster_client_list.cpp +++ b/src/cluster_client_list.cpp @@ -96,7 +96,7 @@ inode_list_t* cluster_client_t::list_inode_start(inode_t inode, } for (auto & hist_item: pg.target_history) { - for (auto pg_osd: hist_item) + for (auto pg_osd: hist_item.osd_set) { if (pg_osd != 0) { @@ -106,11 +106,14 @@ inode_list_t* cluster_client_t::list_inode_start(inode_t inode, } for (osd_num_t peer_osd: all_peers) { - r->list_osds.push_back((inode_list_osd_t){ - .pg = r, - .osd_num = peer_osd, - .sent = false, - }); + if (st_cli.peer_states.find(peer_osd) != st_cli.peer_states.end()) + { + r->list_osds.push_back((inode_list_osd_t){ + .pg = r, + .osd_num = peer_osd, + .sent = false, + }); + } } } else diff --git a/src/etcd_state_client.cpp b/src/etcd_state_client.cpp index af5f973c..ee17e7a7 100644 --- a/src/etcd_state_client.cpp +++ b/src/etcd_state_client.cpp @@ -902,9 +902,32 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv) history_set.insert(it, pg_osd_num); } } - auto it = std::lower_bound(pg_cfg.target_history.begin(), pg_cfg.target_history.end(), history_set); - if (it == pg_cfg.target_history.end() || *it != history_set) - pg_cfg.target_history.insert(it, history_set); + pg_history_set_t epoch_set = { .osd_set = history_set }; + auto it = std::lower_bound(pg_cfg.target_history.begin(), pg_cfg.target_history.end(), epoch_set); + if (it == pg_cfg.target_history.end() || *it != epoch_set) + pg_cfg.target_history.insert(it, epoch_set); + } + // Newer format with epochs + for (auto hist_item: value["osd_set_epochs"].array_items()) + { + pg_history_set_t history_set; + history_set.min_epoch = hist_item["min_epoch"].uint64_value(); + history_set.max_epoch = hist_item["max_epoch"].uint64_value(); + if (history_set.max_epoch < history_set.min_epoch) + { + history_set.max_epoch = 0; + history_set.min_epoch = 0; + } + for (auto pg_osd: hist_item["osd_set"].array_items()) + { + history_set.osd_set.push_back(pg_osd.uint64_value()); + } + if (history_set.max_epoch || history_set.osd_set.size()) + { + auto it = std::lower_bound(pg_cfg.target_history.begin(), pg_cfg.target_history.end(), history_set); + if (it == pg_cfg.target_history.end() || *it != history_set) + pg_cfg.target_history.insert(it, history_set); + } } // Include these additional OSDs when peering the PG for (auto pg_osd: value["all_peers"].array_items()) diff --git a/src/etcd_state_client.h b/src/etcd_state_client.h index fc2d9de2..ce38c674 100644 --- a/src/etcd_state_client.h +++ b/src/etcd_state_client.h @@ -33,7 +33,7 @@ struct pg_config_t bool exists; osd_num_t primary; std::vector target_set; - std::vector> target_history; + std::vector target_history; std::vector all_peers; bool pause; osd_num_t cur_primary; diff --git a/src/osd.h b/src/osd.h index 461fef38..8deb81d3 100644 --- a/src/osd.h +++ b/src/osd.h @@ -192,6 +192,7 @@ class osd_t void reset_stats(); json11::Json get_statistics(); void report_statistics(); + void add_pg_history(pg_t & pg); void report_pg_state(pg_t & pg); void report_pg_states(); void apply_pg_count(); diff --git a/src/osd_cluster.cpp b/src/osd_cluster.cpp index 7415e3f1..6d9522fb 100644 --- a/src/osd_cluster.cpp +++ b/src/osd_cluster.cpp @@ -674,7 +674,7 @@ void osd_t::apply_pg_config() } for (auto & hist_item: pg_cfg.target_history) { - for (auto pg_osd: hist_item) + for (auto pg_osd: hist_item.osd_set) { if (pg_osd != 0) { @@ -868,11 +868,40 @@ void osd_t::report_pg_states() // Prevent race conditions (for the case when the monitor is updating this key at the same time) pg.history_changed = false; std::string history_key = base64_encode(st_cli.etcd_prefix+"/pg/history/"+std::to_string(pg.pool_id)+"/"+std::to_string(pg.pg_num)); + json11::Json::array target_history; + for (auto & pgh: pg.target_history) + { + target_history.push_back(json11::Json::object { + { "osd_set", pgh.osd_set }, + { "min_epoch", pgh.min_epoch }, + { "max_epoch", pgh.max_epoch }, + }); + } + std::vector all_peers; + for (auto peer_osd: pg.all_peers) + { + bool found = false; + for (auto target_peer: pg.target_set) + { + if (target_peer == peer_osd) + { + found = true; + break; + } + } + if (!found) + { + all_peers.push_back(peer_osd); + } + } json11::Json::object history_value = { { "epoch", pg.epoch }, - { "all_peers", pg.all_peers }, - { "osd_sets", pg.target_history }, + { "osd_set_epochs", target_history }, }; + if (all_peers.size()) + { + history_value["all_peers"] = all_peers; + } checks.push_back(json11::Json::object { { "target", "MOD" }, { "key", history_key }, diff --git a/src/osd_flush.cpp b/src/osd_flush.cpp index d4317760..18380ae9 100644 --- a/src/osd_flush.cpp +++ b/src/osd_flush.cpp @@ -287,6 +287,25 @@ bool osd_t::pick_next_recovery(osd_recovery_op_t &op) void osd_t::submit_recovery_op(osd_recovery_op_t *op) { + // Check if the object is deleted + bool is_deleted = false; + pool_id_t pool_id = INODE_POOL(op->oid.inode); + auto pool_cfg_it = st_cli.pool_config.find(pool_id); + if (pool_cfg_it != st_cli.pool_config.end()) + { + pg_num_t pg_num = (op->oid.stripe/pool_cfg_it->second.pg_stripe_size) % pg_counts[pool_id] + 1; // like map_to_pg() + auto pg_it = pgs.find({ .pool_id = pool_id, .pg_num = pg_num }); + if (pg_it != pgs.end()) + { + pg_osd_set_state_t *object_state; + get_object_osd_set(pg_it->second, op->oid, pg_it->second.cur_set.data(), &object_state); + if (object_state && (object_state->state & OBJ_DELETED)) + { + // Object is deleted, but not from all OSDs - delete remaining copies + is_deleted = true; + } + } + } op->osd_op = new osd_op_t(); op->osd_op->op_type = OSD_OP_OUT; op->osd_op->req = (osd_any_op_t){ @@ -294,7 +313,7 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op) .header = { .magic = SECONDARY_OSD_OP_MAGIC, .id = 1, - .opcode = OSD_OP_WRITE, + .opcode = (uint64_t)(is_deleted ? OSD_OP_DELETE : OSD_OP_WRITE), }, .inode = op->oid.inode, .offset = op->oid.stripe, diff --git a/src/osd_id.h b/src/osd_id.h index 2ebeb817..aabe1c07 100644 --- a/src/osd_id.h +++ b/src/osd_id.h @@ -3,6 +3,8 @@ #pragma once +#include + #define POOL_SCHEME_REPLICATED 1 #define POOL_SCHEME_XOR 2 #define POOL_SCHEME_EC 3 @@ -38,3 +40,25 @@ inline bool operator != (const pool_pg_num_t & a, const pool_pg_num_t & b) { return a.pool_id != b.pool_id || a.pg_num != b.pg_num; } + +struct pg_history_set_t +{ + std::vector osd_set; + uint64_t min_epoch, max_epoch; +}; + +inline bool operator == (const pg_history_set_t & a, const pg_history_set_t & b) +{ + return a.min_epoch == b.min_epoch && a.max_epoch == b.max_epoch && a.osd_set == b.osd_set; +} + +inline bool operator != (const pg_history_set_t & a, const pg_history_set_t & b) +{ + return a.min_epoch != b.min_epoch || a.max_epoch != b.max_epoch || a.osd_set != b.osd_set; +} + +inline bool operator < (const pg_history_set_t & a, const pg_history_set_t & b) +{ + return a.min_epoch < b.min_epoch || a.min_epoch == b.min_epoch && + (a.max_epoch < b.max_epoch || a.max_epoch == b.max_epoch && a.osd_set < b.osd_set); +} diff --git a/src/osd_peering.cpp b/src/osd_peering.cpp index 6a021e32..2f184f39 100644 --- a/src/osd_peering.cpp +++ b/src/osd_peering.cpp @@ -231,7 +231,7 @@ void osd_t::start_pg_peering(pg_t & pg) for (auto & history_set: pg.target_history) { bool found = true; - for (auto history_osd: history_set) + for (auto history_osd: history_set.osd_set) { if (history_osd != 0) { @@ -471,61 +471,71 @@ void osd_t::finish_stop_pg(pg_t & pg) report_pg_state(pg); } +static int count_nonzero_osds(const std::vector & v) +{ + int n = 0; + for (auto & osd_num: v) + { + if (osd_num != 0) + { + n++; + } + } + return n; +} + void osd_t::report_pg_state(pg_t & pg) { pg.print_state(); this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }); - if (pg.state == PG_ACTIVE && (pg.target_history.size() > 0 || pg.all_peers.size() > pg.target_set.size())) + if ((pg.state == PG_ACTIVE || pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD)) && + (pg.target_history.size() != 1 || + pg.target_history[0].osd_set != pg.target_set || + pg.target_history[0].min_epoch != 0 || + pg.target_history[0].max_epoch != pg.epoch || + pg.all_peers.size() > count_nonzero_osds(pg.target_set))) { // Clear history of active+clean PGs pg.history_changed = true; pg.target_history.clear(); - pg.all_peers = pg.target_set; - std::sort(pg.all_peers.begin(), pg.all_peers.end()); - pg.cur_peers = pg.target_set; - // Change pg_config at the same time, otherwise our PG reconciling loop may try to apply the old metadata - auto & pg_cfg = st_cli.pool_config[pg.pool_id].pg_config[pg.pg_num]; - pg_cfg.target_history = pg.target_history; - pg_cfg.all_peers = pg.all_peers; - } - else if (pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD)) - { - // Clear history of active+left_on_dead PGs, but leave dead OSDs in all_peers - if (pg.target_history.size()) + pg.target_history.push_back((pg_history_set_t){ + .osd_set = pg.cur_set, + .min_epoch = 0, + .max_epoch = pg.epoch, + }); + if (pg.state == PG_ACTIVE) { - pg.history_changed = true; - pg.target_history.clear(); - } - std::set dead_peers; - for (auto pg_osd: pg.all_peers) - { - dead_peers.insert(pg_osd); - } - for (auto pg_osd: pg.cur_peers) - { - dead_peers.erase(pg_osd); - } - for (auto pg_osd: pg.target_set) - { - if (pg_osd) + pg.all_peers.clear(); + for (auto pg_osd: pg.target_set) { - dead_peers.insert(pg_osd); + if (pg_osd) + pg.all_peers.push_back(pg_osd); } } - auto new_all_peers = std::vector(dead_peers.begin(), dead_peers.end()); - if (pg.all_peers != new_all_peers) + else { - pg.history_changed = true; - pg.all_peers = new_all_peers; + // Clear history of active+left_on_dead PGs, but leave dead OSDs in all_peers + std::set dead_peers(pg.all_peers.begin(), pg.all_peers.end()); + for (auto pg_osd: pg.cur_peers) + { + dead_peers.erase(pg_osd); + } + for (auto pg_osd: pg.target_set) + { + if (pg_osd) + dead_peers.insert(pg_osd); + } + pg.all_peers.clear(); + pg.all_peers.insert(pg.all_peers.begin(), dead_peers.begin(), dead_peers.end()); } + std::sort(pg.all_peers.begin(), pg.all_peers.end()); pg.cur_peers.clear(); for (auto pg_osd: pg.target_set) { if (pg_osd) - { pg.cur_peers.push_back(pg_osd); - } } + // Change pg_config at the same time, otherwise our PG reconciling loop may try to apply the old metadata auto & pg_cfg = st_cli.pool_config[pg.pool_id].pg_config[pg.pg_num]; pg_cfg.target_history = pg.target_history; pg_cfg.all_peers = pg.all_peers; @@ -536,3 +546,51 @@ void osd_t::report_pg_state(pg_t & pg) } report_pg_states(); } + +void osd_t::add_pg_history(pg_t & pg) +{ + bool epoch_already_reported = false; + int max_epoch_pos = -1; + for (int i = pg.target_history.size()-1; i >= 0; i--) + { + if (pg.target_history[i].min_epoch > pg.epoch) + { + printf("[PG %u/%u] Invalid PG history: there is an entry with min_epoch (%lu) > current epoch (%lu)\n", + pg.pool_id, pg.pg_num, pg.target_history[i].min_epoch, pg.epoch); + force_stop(1); + return; + } + if (max_epoch_pos < 0 || pg.target_history[i].max_epoch > pg.target_history[max_epoch_pos].max_epoch) + { + max_epoch_pos = i; + } + if (pg.target_history[i].min_epoch <= pg.epoch && + pg.target_history[i].max_epoch >= pg.epoch) + { + if (pg.target_history[i].osd_set != pg.cur_set) + { + printf("[PG %u/%u] Invalid target_history: epoch %lu has another OSD set already registered\n", pg.pool_id, pg.pg_num, pg.epoch); + force_stop(1); + return; + } + // Already reported + epoch_already_reported = true; + break; + } + } + if (!epoch_already_reported) + { + if (max_epoch_pos >= 0 && pg.target_history[max_epoch_pos].osd_set == pg.cur_set) + { + pg.target_history[max_epoch_pos].max_epoch = pg.epoch; + } + else + { + pg.target_history.push_back((pg_history_set_t){ + .osd_set = pg.cur_set, + .min_epoch = pg.epoch, + .max_epoch = pg.epoch, + }); + } + } +} diff --git a/src/osd_peering_pg.cpp b/src/osd_peering_pg.cpp index 249f268b..8bd53abf 100644 --- a/src/osd_peering_pg.cpp +++ b/src/osd_peering_pg.cpp @@ -52,6 +52,7 @@ struct pg_obj_state_check_t void walk(); void start_object(); + void recheck_version_osd_set(); void handle_version(); void finish_object(); }; @@ -84,6 +85,7 @@ void pg_obj_state_check_t::walk() pg->state = PG_INCOMPLETE | PG_HAS_INVALID; return; } + // Activate PG if (pg->pg_cursize < pg->pg_size) { // Activate as degraded @@ -108,13 +110,85 @@ void pg_obj_state_check_t::start_object() n_unstable = n_invalid = 0; } +// FIXME: Put this under a feature flag +// FIXME: Implement OSD 'cookies' to be fool-proof so that if an OSD is wiped and +// recreated it doesn't also wipe all other data +void pg_obj_state_check_t::recheck_version_osd_set() +{ + uint64_t epoch = (last_ver >> (64-PG_EPOCH_BITS)); + if (!pg->epoch_sizes_differ && n_copies >= pg->pg_size) + { + // Enough copies + return; + } + auto epoch_it = pg->target_by_epoch.lower_bound(epoch); + if (epoch_it == pg->target_by_epoch.end() || epoch_it->second.min_epoch > epoch) + { + // Epoch info not found + return; + } + if (pg->epoch_sizes_differ && n_copies >= epoch_it->second.osd_set.size()) + { + // For the (unlikely) case of PG size change - enough copies + return; + } + // Recheck version against the OSD set corresponding to epoch if it's known + if (epoch_it != pg->target_by_epoch.end() && epoch_it->second.min_epoch <= epoch) + { + for (int j = 0; j < epoch_it->second.osd_set.size(); j++) + { + osd_num_t cur_osd = epoch_it->second.osd_set[j]; + bool found = false; + for (int i = ver_start; i < ver_end; i++) + { + if (cur_osd == list[i].osd_num) + { + found = true; + break; + } + } + if (!found) + { + // Check if a newer version is present on the same OSD and masks the older one + // It happens for overwritten replicas in the following case: + // Version 1 is present on OSD 1,2,3 + // Client tries to write Version 2 + // OSD 3 succeeds to write Version 2, others don't. OSD 3 crashes, then starts again + // OSD 1 sees: version 1 on OSD 1,2 and version 2 on OSD 3 + // (version 1 on OSD 3 is already masked/removed) + // Version 1 is not present on a full set, but it must not be removed + if (replicated) + { + for (int i = obj_start; i < ver_start; i++) + { + if (cur_osd == list[i].osd_num) + { + found = true; + break; + } + } + } + if (!found) + { + // Object is missing from one of the OSDs of that set. + // This means it's deleted or moved and we can safely drop this version. + target_ver = 0; + break; + } + } + } + } +} + void pg_obj_state_check_t::handle_version() { if (!target_ver && last_ver != list[list_pos].version && (n_stable > 0 || n_roles >= pg->pg_data_size)) { // Version is either stable or recoverable - target_ver = last_ver; ver_end = list_pos; + target_ver = last_ver; + // Skip versions that are not present on any of OSDs for the corresponding PG epoch + recheck_version_osd_set(); } if (!target_ver) { @@ -178,6 +252,8 @@ void pg_obj_state_check_t::finish_object() // Version is either stable or recoverable target_ver = last_ver; ver_end = list_pos; + // Skip versions that are not present on any of OSDs for the corresponding PG epoch + recheck_version_osd_set(); } obj_end = list_pos; // Remember the decision @@ -231,11 +307,23 @@ void pg_obj_state_check_t::finish_object() } } } - if (!target_ver) + if (!target_ver && (n_unstable >= obj_end-obj_start)) { return; } - if (!replicated && n_roles < pg->pg_data_size) + if (!target_ver) + { + // Object is present, but should not be :) i.e. it's a deleted object that reappeared + if (log_level > 1) + { + printf("Object is deleted: %lx:%lx version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver); + } + state = OBJ_DELETED; + pg->state = pg->state | PG_HAS_MISPLACED; + // To record all versions as outdated: + ver_end = obj_start; + } + else if (!replicated && n_roles < pg->pg_data_size) { if (log_level > 1) { @@ -263,7 +351,7 @@ void pg_obj_state_check_t::finish_object() pg->state = pg->state | PG_HAS_MISPLACED; } if (log_level > 1 && (state & (OBJ_INCOMPLETE | OBJ_DEGRADED)) || - log_level > 2 && (state & OBJ_MISPLACED)) + log_level > 2 && (state & (OBJ_MISPLACED | OBJ_DELETED))) { for (int i = obj_start; i < obj_end; i++) { @@ -272,9 +360,9 @@ void pg_obj_state_check_t::finish_object() } } pg->total_count++; - if (state != 0 || ver_end < obj_end) + osd_set.clear(); + if (target_ver != 0 && (state != 0 || ver_end < obj_end)) { - osd_set.clear(); for (int i = ver_start; i < ver_end; i++) { osd_set.push_back((pg_obj_loc_t){ @@ -297,7 +385,8 @@ void pg_obj_state_check_t::finish_object() break; } } - if (j >= osd_set.size() && pg->cur_set[list[i].oid.stripe & STRIPE_MASK] != list[i].osd_num) + if (j >= osd_set.size() && ((state & OBJ_DELETED) || + pg->cur_set[list[i].oid.stripe & STRIPE_MASK] != list[i].osd_num)) { osd_set.push_back((pg_obj_loc_t){ .role = (list[i].oid.stripe & STRIPE_MASK), @@ -312,7 +401,11 @@ void pg_obj_state_check_t::finish_object() } } } - if (target_ver < max_ver) + if (state & OBJ_DELETED) + { + pg->ver_override[oid] = max_ver; + } + else if (target_ver < max_ver) { pg->ver_override[oid] = target_ver; } @@ -366,6 +459,7 @@ void pg_obj_state_check_t::finish_object() } else { + assert(it->second.state == state); it->second.object_count++; } if (state & OBJ_INCOMPLETE) @@ -386,6 +480,34 @@ void pg_obj_state_check_t::finish_object() // FIXME: Write at least some tests for this function void pg_t::calc_object_states(int log_level) { + // Calculate intersections of target_history with cur_peers + for (auto & history_item: target_history) + { + if (history_item.max_epoch) + { + pg_history_set_t & set_copy = target_by_epoch[history_item.max_epoch]; + set_copy.min_epoch = history_item.min_epoch; + set_copy.max_epoch = history_item.max_epoch; + for (int i = 0; i < history_item.osd_set.size(); i++) + { + if (history_item.osd_set[i] != 0) + { + for (int j = 0; j < cur_set.size(); j++) + { + if (cur_set[j] == history_item.osd_set[i]) + { + set_copy.osd_set.push_back(history_item.osd_set[i]); + break; + } + } + } + } + if (set_copy.osd_set.size() != pg_size) + { + epoch_sizes_differ = true; + } + } + } // Copy all object lists into one array pg_obj_state_check_t st; st.log_level = log_level; @@ -422,10 +544,18 @@ void pg_t::calc_object_states(int log_level) std::sort(st.list.begin(), st.list.end()); // Walk over it and check object states st.walk(); + target_by_epoch.clear(); // needed only in this function if (this->state != PG_ACTIVE) { assert(epoch != (((uint64_t)1 << PG_EPOCH_BITS)-1)); epoch++; + for (auto & pgh: target_history) + { + if (epoch <= pgh.max_epoch) + { + epoch = pgh.max_epoch+1; + } + } } if (log_level > 0) { diff --git a/src/osd_peering_pg.h b/src/osd_peering_pg.h index d0e729ee..57c451ab 100644 --- a/src/osd_peering_pg.h +++ b/src/osd_peering_pg.h @@ -89,7 +89,9 @@ struct pg_t // epoch number - should increase with each non-clean activation of the PG uint64_t epoch = 0, reported_epoch = 0; // target history and all potential peers - std::vector> target_history; + std::vector target_history; + std::map target_by_epoch; + bool epoch_sizes_differ = false; std::vector all_peers; bool history_changed = false; // peer list from the last peering event diff --git a/src/osd_primary.cpp b/src/osd_primary.cpp index 95d947c3..bc18674a 100644 --- a/src/osd_primary.cpp +++ b/src/osd_primary.cpp @@ -199,6 +199,21 @@ void osd_t::continue_primary_read(osd_op_t *cur_op) { // PG may be degraded or have misplaced objects op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state); + if (op_data->object_state && (op_data->object_state->state & OBJ_DELETED)) + { + // Object is deleted, just return zeroes + cur_op->reply.rw.version = 0; + cur_op->reply.rw.bitmap_len = op_data->pg_data_size * clean_entry_bitmap_size; + uint64_t zero_len = cur_op->reply.rw.bitmap_len + cur_op->req.rw.len; + while (zero_len >= 0) + { + uint64_t cur_zero_len = zero_buffer_size > zero_len ? zero_len : zero_buffer_size; + cur_op->iov.push_back(zero_buffer, cur_zero_len); + zero_len -= cur_zero_len; + } + finish_op(cur_op, cur_op->req.rw.len); + return; + } } if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED) { @@ -290,7 +305,7 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object report_pg_state(pg); } } - else if (object_state->state & OBJ_MISPLACED) + else if (object_state->state & (OBJ_MISPLACED | OBJ_DELETED)) { this->misplaced_objects--; pg.misplaced_objects.erase(oid); @@ -329,12 +344,6 @@ void osd_t::continue_primary_del(osd_op_t *cur_op) else if (op_data->st == 4) goto resume_4; else if (op_data->st == 5) goto resume_5; assert(op_data->st == 0); - // Delete is forbidden even in active PGs if they're also degraded or have previous dead OSDs - if (pg.state & (PG_DEGRADED | PG_LEFT_ON_DEAD)) - { - finish_op(cur_op, -EBUSY); - return; - } if (!check_write_queue(cur_op, pg)) { return; @@ -342,11 +351,18 @@ void osd_t::continue_primary_del(osd_op_t *cur_op) resume_1: // Determine which OSDs contain this object and delete it op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state); - // Submit 1 read to determine the actual version number - submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op); + if (op_data->object_state && (op_data->object_state->state & OBJ_DELETED)) + { + op_data->fact_ver = pg.ver_override[op_data->oid]; + } + else + { + // Submit 1 read to determine the actual version number + submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op); resume_2: - op_data->st = 2; - return; + op_data->st = 2; + return; + } resume_3: if (op_data->errors > 0) { diff --git a/src/osd_primary_chain.cpp b/src/osd_primary_chain.cpp index 01fc39ed..90f4f9ad 100644 --- a/src/osd_primary_chain.cpp +++ b/src/osd_primary_chain.cpp @@ -133,6 +133,12 @@ int osd_t::collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vectorsecond : UINT64_MAX; pg_osd_set_state_t *object_state; uint64_t* cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state); + if (object_state && (object_state->state & OBJ_DELETED)) + { + // Object is deleted, zero out the bitmap + memset((uint8_t*)op_data->snapshot_bitmaps + chain_num*clean_entry_bitmap_size, 0, clean_entry_bitmap_size); + continue; + } if (pg.scheme == POOL_SCHEME_REPLICATED) { osd_num_t read_target = 0; diff --git a/src/osd_primary_subops.cpp b/src/osd_primary_subops.cpp index 5b6ab48d..8aa65ce8 100644 --- a/src/osd_primary_subops.cpp +++ b/src/osd_primary_subops.cpp @@ -119,17 +119,19 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, const ui if (osd_set[role] != 0 && (wr || !rep && stripes[role].read_end != 0)) n_subops++; } - if (!n_subops && (submit_type == SUBMIT_RMW_READ || rep)) + if (!n_subops && (submit_type == SUBMIT_RMW_READ || rep) && zero_read >= 0) n_subops = 1; else zero_read = -1; - osd_op_t *subops = new osd_op_t[n_subops]; op_data->fact_ver = 0; op_data->done = op_data->errors = op_data->errcode = 0; op_data->n_subops = n_subops; - op_data->subops = subops; - int sent = submit_primary_subop_batch(submit_type, op_data->oid.inode, op_version, op_data->stripes, osd_set, cur_op, 0, zero_read); - assert(sent == n_subops); + if (n_subops > 0) + { + op_data->subops = new osd_op_t[n_subops]; + int sent = submit_primary_subop_batch(submit_type, op_data->oid.inode, op_version, op_data->stripes, osd_set, cur_op, 0, zero_read); + assert(sent == n_subops); + } } int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t op_version, diff --git a/src/osd_primary_write.cpp b/src/osd_primary_write.cpp index dd6e6652..ef92a618 100644 --- a/src/osd_primary_write.cpp +++ b/src/osd_primary_write.cpp @@ -156,21 +156,13 @@ resume_3: { // Report newer epoch before writing // FIXME: We don't have to report all changed PG states here - this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }); if (pg.state != PG_ACTIVE) { // Check that current OSD set is in history and/or add it there - std::vector history_set; - for (auto peer_osd: pg.cur_set) - if (peer_osd != 0) - history_set.push_back(peer_osd); - std::sort(history_set.begin(), history_set.end()); - auto it = std::lower_bound(pg.target_history.begin(), pg.target_history.end(), history_set); - if (it == pg.target_history.end() || *it != history_set) - pg.target_history.insert(it, history_set); + add_pg_history(pg); } pg.history_changed = true; - report_pg_states(); + report_pg_state(pg); resume_10: if (pg.epoch > pg.reported_epoch) { diff --git a/src/pg_states.h b/src/pg_states.h index 605c817a..df81ed67 100644 --- a/src/pg_states.h +++ b/src/pg_states.h @@ -32,6 +32,7 @@ #define OBJ_DEGRADED 0x02 #define OBJ_INCOMPLETE 0x04 #define OBJ_MISPLACED 0x08 +#define OBJ_DELETED 0x10 #define OBJ_NEEDS_STABLE 0x10000 #define OBJ_NEEDS_ROLLBACK 0x20000 diff --git a/tests/run_tests.sh b/tests/run_tests.sh index 82fb5d98..122d03e0 100755 --- a/tests/run_tests.sh +++ b/tests/run_tests.sh @@ -14,6 +14,8 @@ SCHEME=ec ./test_change_pg_count.sh ./test_create_nomaxid.sh +./test_degraded_delete.sh + ./test_etcd_fail.sh ./test_failure_domain.sh diff --git a/tests/test_degraded_delete.sh b/tests/test_degraded_delete.sh new file mode 100755 index 00000000..389717bd --- /dev/null +++ b/tests/test_degraded_delete.sh @@ -0,0 +1,131 @@ +#!/bin/bash -ex + +# Run 3 OSDs + +. `dirname $0`/run_3osds.sh + +# Write inodes 1 and 2 + +LD_PRELOAD="build/src/libfio_vitastor.so" \ + fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=1M -direct=1 -iodepth=4 \ + -rw=write -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -runtime=10 + +LD_PRELOAD="build/src/libfio_vitastor.so" \ + fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=1M -direct=1 -iodepth=4 \ + -rw=write -etcd=$ETCD_URL -pool=1 -inode=2 -size=128M -runtime=10 + +LD_PRELOAD="build/src/libfio_vitastor.so" \ + fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4k -direct=1 -iodepth=16 \ + -rw=randwrite -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -runtime=10 &>/dev/null & + +sleep 5 + +# Stop OSD 1 + +kill -INT $OSD1_PID +sleep 2 + +# Remove inode 2 + +build/src/vitastor-cli rm-data --etcd_address $ETCD_URL --pool 1 --inode 2 + +# Run 3 more OSDs and move PG to 4,5,6 + +for i in $(seq 4 6); do + dd if=/dev/zero of=./testdata/test_osd$i.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1)) + build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-cli simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log & + eval OSD${i}_PID=$! +done + +$ETCDCTL put /vitastor/config/osd/1 '{"reweight":0}' +$ETCDCTL put /vitastor/config/osd/2 '{"reweight":0}' +$ETCDCTL put /vitastor/config/osd/3 '{"reweight":0}' + +# Wait for rebalance to finish + +wait_finish_rebalance() +{ + local sec=$1 + local st=$2 + local i=0 + while [[ $i -lt $sec ]]; do + if $ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e \ + '([ .[] | select(.state == ['$st'] and (.peers | contains([1]) | not) and (.peers | contains([2,3]) | not)) ] | length) == '$PG_COUNT; then + break + fi + sleep 1 + i=$((i+1)) + if [ $i -eq $sec ]; then + format_error "Rebalance couldn't finish in $sec seconds" + fi + done +} + +wait_finish_rebalance 60 '"active","left_on_dead"' + +# Stop OSD 2,3 + +kill -INT $OSD2_PID +kill -INT $OSD3_PID +sleep 2 + +# Verify that PGs are still active + +if ! ($ETCDCTL get /vitastor/pg/state/1/ --prefix --print-value-only | jq -s -e '[ .[] | select(.state == ["active","left_on_dead"]) ] | length == '$PG_COUNT); then + format_error "FAILED: $PG_COUNT PG(s) NOT UP" +fi + +# Start OSD 1 + +build/src/vitastor-osd --osd_num 1 --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-cli simple-offsets --format options ./testdata/test_osd1.bin 2>/dev/null) &>./testdata/osd1.log & +OSD1_PID=$! + +# Verify that inode 2 is removed and inode 1 is in place + +wait_repeer_1() +{ + local sec=$1 + local i=0 + while [[ $i -lt $sec ]]; do + if grep -q 'Repeer because of OSD 1' testdata/osd4.log testdata/osd5.log testdata/osd6.log; then + break + fi + sleep 1 + i=$((i+1)) + if [ $i -eq $sec ]; then + format_error "OSD 4/5/6 do not peer with older OSD 1" + fi + done +} + +wait_repeer_1 15 + +wait_finish_rebalance 15 '"active"' + +if [ "$SCHEME" = "replicated" ]; then + NOBJ=1024 +else + NOBJ=$((1024/(PG_SIZE-1))) +fi + +if ! ($ETCDCTL get /vitastor/pg/stats/1/1 --print-value-only | jq -s -e '.[0].object_count == '$NOBJ); then + format_error "FAILED: PG SHOULD CONTAIN EXACTLY 128 MB OF DATA, BUT IT DOESN'T" +fi + +qemu-img convert -S 4096 -p \ + -f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=1:size=4096" \ + -O raw ./testdata/inode1.bin + +qemu-img convert -S 4096 -p \ + -f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=2:size="$((128*1024*1024)) \ + -O raw ./testdata/inode2.bin + +if (dd if=/dev/zero bs=4096 count=1 | diff - ./testdata/inode1.bin); then + format_error "FAILED: INODE 1 SEEMS LOST" +fi + +if ! (dd if=/dev/zero bs=1M count=128 | diff - ./testdata/inode2.bin); then + format_error "FAILED: INODE 2 SEEMS RESTORED" +fi + +format_green OK