From d2a3f0c6dd5034271c7af11d7a1317ba84897684 Mon Sep 17 00:00:00 2001
From: Vitaliy Filippov
Date: Thu, 23 Jan 2020 21:43:45 +0300
Subject: [PATCH] Begin object state calculation

---
Review notes (placed below the "---" marker, so not part of the commit message):
- This text was recovered from a whitespace-collapsed copy of the patch. Line
  breaks were restored and checked against the original hunk line counts;
  indentation is reconstructed.
- Angle-bracket contents lost in that copy were restored only where the patch
  itself determines them. Bare "#include" lines and container declarations
  without template arguments in context/removed lines are left as found.
- The code differs from commit d2a3f0c6 in the places listed here, so the
  diffstat and hunk headers were recomputed and the "index" lines no longer
  describe the resulting blobs:
  * operator < for obj_ver_role returned true for equal elements (not a strict
    weak ordering) and ordered by role bits before version.
  * calc_object_states() overwrote all[0..n) with every OSD's listing instead
    of appending, skipped the first entry of the object following a
    "skip previous versions" run and remembered that object twice, counted
    n_matched only for unstable copies, and shifted an int for a 64-bit mask.

 osd.h           | 111 +++++++++++++++++++++++++-----
 osd_peering.cpp | 178 ++++++++++++++++++++++++++++++++++++++++++++++--
 test.cpp        |   8 ++-
 3 files changed, 269 insertions(+), 28 deletions(-)

diff --git a/osd.h b/osd.h
index 521c201d..65c25f69 100644
--- a/osd.h
+++ b/osd.h
@@ -10,7 +10,7 @@
 #include
 #include
 
-#include <map>
+#include <set>
 #include
 
 #include "blockstore.h"
@@ -96,27 +96,49 @@ struct osd_client_t
     int write_state = 0;
 };
 
-struct osd_pg_role_t
+struct osd_obj_loc_t
 {
-    // role = (stripe role: 1, 2, 3, ...) | (stable ? 0 : 1<<63)
     uint64_t role;
     uint64_t osd_num;
+    bool stable;
 };
 
-typedef std::vector<osd_pg_role_t> osd_acting_set_t;
+inline bool operator < (const osd_obj_loc_t &a, const osd_obj_loc_t &b)
+{
+    return a.role < b.role || a.role == b.role && a.osd_num < b.osd_num;
+}
+
+struct osd_obj_state_t
+{
+    std::vector<osd_obj_loc_t> loc;
+    uint64_t state = 0;
+    uint64_t object_count = 0;
+};
+
+struct osd_ver_override_t
+{
+    uint64_t max_ver;
+    uint64_t target_ver;
+};
+
+inline bool operator < (const osd_obj_state_t &a, const osd_obj_state_t &b)
+{
+    return a.loc < b.loc;
+}
 
 namespace std
 {
-    template<> struct hash<osd_acting_set_t>
+    template<> struct hash<osd_obj_state_t>
     {
-        inline size_t operator()(const osd_acting_set_t &s) const
+        inline size_t operator()(const osd_obj_state_t &s) const
         {
             size_t seed = 0;
-            for (int i = 0; i < s.size(); i++)
+            for (int i = 0; i < s.loc.size(); i++)
             {
                 // Copy-pasted from spp::hash_combine()
-                seed ^= (s[i].role + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
-                seed ^= (s[i].osd_num + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
+                seed ^= (s.loc[i].role + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
+                seed ^= (s.loc[i].osd_num + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
+                seed ^= ((s.loc[i].stable ? 1 : 0) + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
             }
             return seed;
         }
@@ -135,18 +157,21 @@ namespace std
 #define PG_HAS_MISPLACED (1<<6)
 
 // OSD object states
-#define OSD_CLEAN 0x01
-#define OSD_MISPLACED 0x02
-#define OSD_DEGRADED 0x03
-#define OSD_INCOMPLETE 0x04
-#define OSD_HALF_STABLE 0x10000
-#define OSD_NEEDS_ROLLBACK 0x20000
+#define OBJ_CLEAN 0x01
+#define OBJ_MISPLACED 0x02
+#define OBJ_DEGRADED 0x03
+#define OBJ_INCOMPLETE 0x04
+#define OBJ_NONSTABILIZED 0x10000
+#define OBJ_UNDERWRITTEN 0x20000
+#define OBJ_OVERCOPIED 0x40000
+#define OBJ_BUGGY 0x80000
 
 class osd_t;
 
 struct osd_pg_peering_state_t
 {
     osd_t* self;
+    // FIXME: add types for pg_num and osd_num?
     uint64_t pg_num;
     std::unordered_map list_ops;
     int list_done = 0;
@@ -155,19 +180,63 @@ struct osd_pg_peering_state_t
 struct osd_pg_t
 {
     int state;
+    uint64_t pg_size = 3, pg_minsize = 2;
     uint64_t pg_num;
-    uint64_t n_unfound = 0, n_degraded = 0, n_misplaced = 0;
-    std::vector<osd_pg_role_t> target_set;
+    // target_set = (role => osd_num). role starts from zero
+    std::vector<uint64_t> target_set;
     // moved object map. by default, each object is considered to reside on the target_set.
     // this map stores all objects that differ.
     // this map may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario
     // which is up to ~192 MB per 1 TB in the worst case scenario
-    std::unordered_map acting_set_ids;
-    std::map acting_sets;
-    spp::sparse_hash_map object_map;
+    std::set<osd_obj_state_t> state_dict;
+    spp::sparse_hash_map<object_id, const osd_obj_state_t*> obj_states;
+    spp::sparse_hash_map<object_id, osd_ver_override_t> ver_override;
     osd_pg_peering_state_t *peering_state = NULL;
 };
 
+// One object listing entry: an object version as reported by one OSD
+struct obj_ver_role
+{
+    object_id oid;
+    uint64_t version;
+    uint64_t osd_num;
+    bool is_stable;
+};
+
+// Max 64 replicas
+#define STRIPE_MASK 0x3F
+#define STRIPE_SHIFT 6
+
+// Sort order for merged object listings:
+// inode ASC, stripe (without role bits) ASC, version DESC, osd_num ASC.
+// Role bits are excluded so that all copies of an object form one group
+// with the newest version first. Equal elements compare as "not less",
+// as std::sort requires of its comparator.
+inline bool operator < (const obj_ver_role & a, const obj_ver_role & b)
+{
+    const auto a_stripe = a.oid.stripe >> STRIPE_SHIFT;
+    const auto b_stripe = b.oid.stripe >> STRIPE_SHIFT;
+    if (a.oid.inode != b.oid.inode)
+        return a.oid.inode < b.oid.inode;
+    if (a_stripe != b_stripe)
+        return a_stripe < b_stripe;
+    if (a.version != b.version)
+        return a.version > b.version;
+    return a.osd_num < b.osd_num;
+}
+
+// Running counters for the object currently examined by calc_object_states()
+struct osd_obj_state_check_t
+{
+    int start = 0;
+    object_id oid = { 0 };
+    uint64_t max_ver = 0;
+    uint64_t target_ver = 0;
+    uint64_t n_copies = 0, has_roles = 0, n_roles = 0, n_stable = 0, n_matched = 0;
+    bool is_buggy = false;
+    osd_obj_state_t state_obj;
+};
+
 struct osd_peer_def_t
 {
     uint64_t osd_num = 0;
@@ -234,6 +303,8 @@ class osd_t
     void init_primary();
     void handle_peers();
     void start_pg_peering(int i);
+    void calc_object_states(osd_pg_t &pg);
+    void remember_object(osd_pg_t &pg, osd_obj_state_check_t &st, std::vector<obj_ver_role> &all, int end);
 
     // op execution
     void exec_op(osd_op_t *cur_op);
diff --git a/osd_peering.cpp b/osd_peering.cpp
index 76a6ea08..6855582b 100644
--- a/osd_peering.cpp
+++ b/osd_peering.cpp
@@ -1,5 +1,8 @@
 #include
 #include
+
+#include <algorithm>
+
 #include "osd.h"
 
 void osd_t::init_primary()
@@ -14,8 +17,9 @@ void osd_t::init_primary()
     pgs.push_back((osd_pg_t){
         .state = PG_OFFLINE,
         .pg_num = 1,
-        .target_set = { { .role = 1, .osd_num = 1 }, { .role = 2, .osd_num = 2 }, { .role = 3, .osd_num = 3 } },
-        .object_map = spp::sparse_hash_map(),
+        .target_set = { 1, 2, 3 },
+        .obj_states = spp::sparse_hash_map<object_id, const osd_obj_state_t*>(),
+        .ver_override = spp::sparse_hash_map<object_id, osd_ver_override_t>(),
     });
     pg_count = 1;
     peering_state = 1;
@@ -144,9 +148,9 @@ void osd_t::handle_peers()
         {
             // Start PG peering
             pgs[0].state = PG_PEERING;
-            pgs[0].acting_set_ids.clear();
-            pgs[0].acting_sets.clear();
-            pgs[0].object_map.clear();
+            pgs[0].state_dict.clear();
+            pgs[0].obj_states.clear();
+            pgs[0].ver_override.clear();
             if (pgs[0].peering_state)
                 delete pgs[0].peering_state;
             peering_state = 2;
@@ -168,7 +172,7 @@ void osd_t::handle_peers()
                 }
                 else if (pgs[i].peering_state->list_done >= 3)
                 {
-                    // FIXME
+                    calc_object_states(pgs[i]);
                     peering_state = 0;
                 }
             }
@@ -193,6 +197,9 @@ void osd_t::start_pg_peering(int pg_idx)
                 "Got object list from OSD %lu (local): %d objects (%lu of them stable)\n",
                 ps->self->osd_num, bs_op->retval, bs_op->version
             );
+            op->buf = op->bs_op.buf;
+            op->reply.hdr.retval = op->bs_op.retval;
+            op->reply.sec_list.stable_count = op->bs_op.version;
             ps->list_done++;
         };
         pg.peering_state->list_ops[osd_num] = op;
@@ -228,3 +235,162 @@ void osd_t::start_pg_peering(int pg_idx)
         outbox_push(cl, op);
     }
 }
+
+// Record the state computed for the object described by st.
+// all[st.start .. end) are the listing entries that belong to it.
+// Objects in OBJ_CLEAN state are not stored at all.
+void osd_t::remember_object(osd_pg_t &pg, osd_obj_state_check_t &st, std::vector<obj_ver_role> &all, int end)
+{
+    // Remember the decision
+    uint64_t state = 0;
+    if (st.n_roles == pg.pg_size)
+    {
+        if (st.n_matched == pg.pg_size)
+            state = OBJ_CLEAN;
+        else
+            state = OBJ_MISPLACED;
+    }
+    else if (st.n_roles < pg.pg_minsize)
+        state = OBJ_INCOMPLETE;
+    else
+        state = OBJ_DEGRADED;
+    if (st.n_copies > pg.pg_size)
+        state |= OBJ_OVERCOPIED;
+    if (st.n_stable < st.n_copies)
+        state |= OBJ_NONSTABILIZED;
+    if (st.target_ver < st.max_ver)
+        state |= OBJ_UNDERWRITTEN;
+    if (st.is_buggy)
+        state |= OBJ_BUGGY;
+    if (state != OBJ_CLEAN)
+    {
+        st.state_obj.state = state;
+        st.state_obj.loc.clear();
+        for (int i = st.start; i < end; i++)
+        {
+            st.state_obj.loc.push_back((osd_obj_loc_t){
+                .role = (all[i].oid.stripe & STRIPE_MASK),
+                .osd_num = all[i].osd_num,
+                .stable = all[i].is_stable,
+            });
+        }
+        std::sort(st.state_obj.loc.begin(), st.state_obj.loc.end());
+        auto ins = pg.state_dict.insert(st.state_obj);
+        pg.obj_states[st.oid] = &(*(ins.first));
+        if (state & OBJ_UNDERWRITTEN)
+        {
+            pg.ver_override[st.oid] = {
+                .max_ver = st.max_ver,
+                .target_ver = st.target_ver,
+            };
+        }
+    }
+}
+
+// Merge the object listings collected in pg.peering_state->list_ops and derive
+// the state of every object from them. Entries are sorted so that all copies of
+// an object are adjacent, newest version first, and then scanned in one pass.
+void osd_t::calc_object_states(osd_pg_t &pg)
+{
+    // Copy all object lists into one array
+    std::vector<obj_ver_role> all;
+    auto ps = pg.peering_state;
+    for (auto e: ps->list_ops)
+    {
+        osd_op_t* op = e.second;
+        auto nstab = op->reply.sec_list.stable_count;
+        // NOTE(review): n is used as an entry count; a negative (error) retval is not handled here - confirm
+        auto n = op->reply.hdr.retval;
+        auto osd_num = clients[op->peer_fd].osd_num;
+        // Append after the entries already collected from other OSDs
+        uint64_t base = all.size();
+        all.resize(base + n);
+        obj_ver_id *ov = (obj_ver_id*)op->buf;
+        for (uint64_t i = 0; i < n; i++, ov++)
+        {
+            all[base + i] = {
+                .oid = ov->oid,
+                .version = ov->version,
+                .osd_num = osd_num,
+                // the first nstab entries of a listing are the stable ones
+                .is_stable = i < nstab,
+            };
+        }
+        free(op->buf);
+        op->buf = NULL;
+    }
+    // Sort
+    std::sort(all.begin(), all.end());
+    // Walk over it and check object states
+    int replica = 0;
+    osd_obj_state_check_t st;
+    for (int i = 0; i < all.size(); i++)
+    {
+        if (st.oid.inode != all[i].oid.inode ||
+            st.oid.stripe != (all[i].oid.stripe >> STRIPE_SHIFT))
+        {
+            if (st.oid.inode != 0)
+            {
+                // Remember object state
+                remember_object(pg, st, all, i);
+            }
+            st.start = i;
+            st.oid = { .inode = all[i].oid.inode, .stripe = all[i].oid.stripe >> STRIPE_SHIFT };
+            st.max_ver = st.target_ver = all[i].version;
+            st.has_roles = st.n_copies = st.n_roles = st.n_stable = st.n_matched = 0;
+            st.is_buggy = false;
+        }
+        if (st.target_ver != all[i].version)
+        {
+            if (st.n_stable > 0 || st.n_roles >= pg.pg_minsize)
+            {
+                // Version is either recoverable or stable, choose it as target and skip previous versions
+                remember_object(pg, st, all, i);
+                // Stop on the last entry of this object: the for loop's own i++ then moves to the next object
+                while (i+1 < all.size() && st.oid.inode == all[i+1].oid.inode &&
+                    st.oid.stripe == (all[i+1].oid.stripe >> STRIPE_SHIFT))
+                {
+                    i++;
+                }
+                // inode 0 means "no current object": this one is already remembered and must not be remembered again
+                st.oid.inode = 0;
+                continue;
+            }
+            else
+            {
+                // Remember that there are newer unrecoverable versions
+                st.target_ver = all[i].version;
+                st.has_roles = st.n_copies = st.n_roles = st.n_stable = st.n_matched = 0;
+            }
+        }
+        replica = (all[i].oid.stripe & STRIPE_MASK);
+        st.n_copies++;
+        if (replica >= pg.pg_size)
+        {
+            // FIXME In the future, check it against the PG epoch number to handle replication factor/scheme changes
+            st.is_buggy = true;
+        }
+        else
+        {
+            if (all[i].is_stable)
+            {
+                st.n_stable++;
+            }
+            // A copy counts as matched when it is on the target OSD for its role, whether stable or not
+            if (pg.target_set[replica] == all[i].osd_num)
+            {
+                st.n_matched++;
+            }
+            if (!(st.has_roles & ((uint64_t)1 << replica)))
+            {
+                st.has_roles = st.has_roles | ((uint64_t)1 << replica);
+                st.n_roles++;
+            }
+        }
+    }
+    if (st.oid.inode != 0)
+    {
+        // Remember object state
+        remember_object(pg, st, all, all.size());
+    }
+}
diff --git a/test.cpp b/test.cpp
index 9e35c013..7c73ce10 100644
--- a/test.cpp
+++ b/test.cpp
@@ -344,7 +344,11 @@ int main(int argc, char *argv[])
         };
     }
     printf("Sorting\n");
-    // sort takes 7 s
-    std::sort(to_sort.begin(), to_sort.end());
+    // sorting the whole array takes 7 s
+    //std::sort(to_sort.begin(), to_sort.end());
+    // sorting in 3 parts... almost the same, 6 s
+    std::sort(to_sort.begin(), to_sort.begin() + to_sort.size()/3);
+    std::sort(to_sort.begin() + to_sort.size()/3, to_sort.begin() + to_sort.size()*2/3);
+    std::sort(to_sort.begin() + to_sort.size()*2/3, to_sort.end());
     return 0;
 }