From 98efdb78bd4cd94df61d3f0decf6505816ea0278 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Fri, 24 Jan 2020 02:23:27 +0300 Subject: [PATCH] Extract object state calculation to a separate file and slightly test it --- Makefile | 15 ++-- blockstore_impl.h | 1 - osd.h | 161 ++++--------------------------------- osd_peering.cpp | 192 +++++++-------------------------------------- osd_peering_pg.cpp | 162 ++++++++++++++++++++++++++++++++++++++ osd_peering_pg.h | 142 +++++++++++++++++++++++++++++++++ test.cpp | 70 +++++++++++------ 7 files changed, 402 insertions(+), 341 deletions(-) create mode 100644 osd_peering_pg.cpp create mode 100644 osd_peering_pg.h diff --git a/Makefile b/Makefile index 08522a28d..80864b648 100644 --- a/Makefile +++ b/Makefile @@ -24,18 +24,21 @@ libblockstore.so: $(BLOCKSTORE_OBJS) libfio_blockstore.so: ./libblockstore.so fio_engine.cpp json11.o g++ $(CXXFLAGS) -shared -o libfio_blockstore.so fio_engine.cpp json11.o ./libblockstore.so -ltcmalloc_minimal -luring +OSD_OBJS := osd.o osd_exec_secondary.o osd_read.o osd_send.o osd_peering.o osd_peering_pg.o json11.o osd_exec_secondary.o: osd_exec_secondary.cpp osd.h osd_ops.h g++ $(CXXFLAGS) -c -o $@ $< osd_read.o: osd_read.cpp osd.h osd_ops.h g++ $(CXXFLAGS) -c -o $@ $< osd_send.o: osd_send.cpp osd.h osd_ops.h g++ $(CXXFLAGS) -c -o $@ $< -osd_peering.o: osd_peering.cpp osd.h osd_ops.h +osd_peering.o: osd_peering.cpp osd.h osd_ops.h osd_peering_pg.h g++ $(CXXFLAGS) -c -o $@ $< -osd.o: osd.cpp osd.h osd_ops.h +osd_peering_pg.o: osd_peering_pg.cpp object_id.h osd_peering_pg.h g++ $(CXXFLAGS) -c -o $@ $< -osd: ./libblockstore.so osd_main.cpp osd.h osd_ops.h osd.o osd_exec_secondary.o osd_read.o osd_send.o osd_peering.o json11.o - g++ $(CXXFLAGS) -o osd osd_main.cpp osd.o osd_exec_secondary.o osd_read.o osd_send.o osd_peering.o json11.o ./libblockstore.so -ltcmalloc_minimal -luring +osd.o: osd.cpp osd.h osd_ops.h osd_peering_pg.h + g++ $(CXXFLAGS) -c -o $@ $< +osd: ./libblockstore.so osd_main.cpp osd.h osd_ops.h $(OSD_OBJS) + g++ $(CXXFLAGS) -o osd osd_main.cpp $(OSD_OBJS) ./libblockstore.so -ltcmalloc_minimal -luring stub_osd: stub_osd.cpp osd_ops.h g++ $(CXXFLAGS) -o stub_osd stub_osd.cpp -ltcmalloc_minimal @@ -44,7 +47,7 @@ libfio_sec_osd.so: fio_sec_osd.cpp osd_ops.h test_blockstore: ./libblockstore.so test_blockstore.cpp g++ $(CXXFLAGS) -o test_blockstore test_blockstore.cpp ./libblockstore.so -ltcmalloc_minimal -luring -test: test.cpp - g++ $(CXXFLAGS) -o test test.cpp -luring +test: test.cpp osd_peering_pg.o + g++ $(CXXFLAGS) -o test test.cpp osd_peering_pg.o -luring test_allocator: test_allocator.cpp allocator.o g++ $(CXXFLAGS) -o test_allocator test_allocator.cpp allocator.o diff --git a/blockstore_impl.h b/blockstore_impl.h index 2697597f7..9c2591161 100644 --- a/blockstore_impl.h +++ b/blockstore_impl.h @@ -14,7 +14,6 @@ #include #include #include -#include #include #include "sparsepp/sparsepp/spp.h" diff --git a/osd.h b/osd.h index 65c25f698..88159b062 100644 --- a/osd.h +++ b/osd.h @@ -10,18 +10,15 @@ #include #include -#include #include #include "blockstore.h" #include "ringloop.h" #include "osd_ops.h" +#include "osd_peering_pg.h" #include "sparsepp/sparsepp/spp.h" -#define STRIPE_NUM(stripe) ((stripe) >> 4) -#define STRIPE_REPLICA(stripe) ((stripe) & 0xf) - #define OSD_OP_IN 0 #define OSD_OP_OUT 1 @@ -34,8 +31,13 @@ #define CL_WRITE_DATA 3 #define MAX_EPOLL_EVENTS 16 +#define PEER_CONNECTING 1 +#define PEER_CONNECTED 2 + //#define OSD_STUB +// FIXME: add types for pg_num and osd_num? + struct osd_op_t { int op_type; @@ -57,8 +59,13 @@ struct osd_op_t ~osd_op_t(); }; -#define PEER_CONNECTING 1 -#define PEER_CONNECTED 2 +struct osd_peer_def_t +{ + uint64_t osd_num = 0; + std::string addr; + int port = 0; + time_t last_connect_attempt = 0; +}; struct osd_client_t { @@ -96,144 +103,6 @@ struct osd_client_t int write_state = 0; }; -struct osd_obj_loc_t -{ - uint64_t role; - uint64_t osd_num; - bool stable; -}; - -inline bool operator < (const osd_obj_loc_t &a, const osd_obj_loc_t &b) -{ - return a.role < b.role || a.role == b.role && a.osd_num < b.osd_num; -} - -struct osd_obj_state_t -{ - std::vector loc; - uint64_t state = 0; - uint64_t object_count = 0; -}; - -struct osd_ver_override_t -{ - uint64_t max_ver; - uint64_t target_ver; -}; - -inline bool operator < (const osd_obj_state_t &a, const osd_obj_state_t &b) -{ - return a.loc < b.loc; -} - -namespace std -{ - template<> struct hash - { - inline size_t operator()(const osd_obj_state_t &s) const - { - size_t seed = 0; - for (int i = 0; i < s.loc.size(); i++) - { - // Copy-pasted from spp::hash_combine() - seed ^= (s.loc[i].role + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2)); - seed ^= (s.loc[i].osd_num + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2)); - seed ^= ((s.loc[i].stable ? 1 : 0) + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2)); - } - return seed; - } - }; -} - -// Placement group states -// Exactly one of these: -#define PG_OFFLINE (1<<0) -#define PG_PEERING (1<<1) -#define PG_INCOMPLETE (1<<2) -#define PG_ACTIVE (1<<3) -// Plus any of these: -#define PG_HAS_UNFOUND (1<<4) -#define PG_HAS_DEGRADED (1<<5) -#define PG_HAS_MISPLACED (1<<6) - -// OSD object states -#define OBJ_CLEAN 0x01 -#define OBJ_MISPLACED 0x02 -#define OBJ_DEGRADED 0x03 -#define OBJ_INCOMPLETE 0x04 -#define OBJ_NONSTABILIZED 0x10000 -#define OBJ_UNDERWRITTEN 0x20000 -#define OBJ_OVERCOPIED 0x40000 -#define OBJ_BUGGY 0x80000 - -class osd_t; - -struct osd_pg_peering_state_t -{ - osd_t* self; - // FIXME: add types for pg_num and osd_num? - uint64_t pg_num; - std::unordered_map list_ops; - int list_done = 0; -}; - -struct osd_pg_t -{ - int state; - uint64_t pg_size = 3, pg_minsize = 2; - uint64_t pg_num; - // target_set = (role => osd_num). role starts from zero - std::vector target_set; - // moved object map. by default, each object is considered to reside on the target_set. - // this map stores all objects that differ. - // this map may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario - // which is up to ~192 MB per 1 TB in the worst case scenario - std::set state_dict; - spp::sparse_hash_map obj_states; - spp::sparse_hash_map ver_override; - osd_pg_peering_state_t *peering_state = NULL; -}; - -struct obj_ver_role -{ - object_id oid; - uint64_t version; - uint64_t osd_num; - bool is_stable; -}; - -inline bool operator < (const obj_ver_role & a, const obj_ver_role & b) -{ - return a.oid < b.oid || - // object versions go in descending order - a.oid == b.oid && a.version > b.version || - a.oid == b.oid && a.version == b.version || - a.oid == b.oid && a.version == b.version && a.osd_num < b.osd_num; -} - -// Max 64 replicas -#define STRIPE_MASK 0x3F -#define STRIPE_SHIFT 6 - -struct osd_obj_state_check_t -{ - int start = 0; - object_id oid = { 0 }; - uint64_t max_ver = 0; - uint64_t target_ver = 0; - uint64_t n_copies = 0, has_roles = 0, n_roles = 0, n_stable = 0, n_matched = 0; - bool is_buggy = false; - osd_obj_state_t state_obj; -}; - -struct osd_peer_def_t -{ - uint64_t osd_num = 0; - std::string addr; - int port = 0; - time_t last_connect_attempt = 0; -}; - class osd_t { // config @@ -250,7 +119,7 @@ class osd_t // peer OSDs std::map osd_peer_fds; - std::vector pgs; + std::vector pgs; int peering_state = 0; unsigned pg_count = 0; @@ -292,8 +161,6 @@ class osd_t void init_primary(); void handle_peers(); void start_pg_peering(int i); - void calc_object_states(osd_pg_t &pg); - void remember_object(osd_pg_t &pg, osd_obj_state_check_t &st, std::vector &all, int end); // op execution void exec_op(osd_op_t *cur_op); diff --git a/osd_peering.cpp b/osd_peering.cpp index 6855582bd..c489c0cd4 100644 --- a/osd_peering.cpp +++ b/osd_peering.cpp @@ -14,12 +14,10 @@ void osd_t::init_primary() peers.push_back(parse_peer(config["peer2"])); if (peers[1].osd_num == peers[0].osd_num) throw std::runtime_error("peer1 and peer2 osd numbers are the same"); - pgs.push_back((osd_pg_t){ + pgs.push_back((pg_t){ .state = PG_OFFLINE, .pg_num = 1, .target_set = { 1, 2, 3 }, - .obj_states = spp::sparse_hash_map(), - .ver_override = spp::sparse_hash_map(), }); pg_count = 1; peering_state = 1; @@ -172,7 +170,7 @@ void osd_t::handle_peers() } else if (pgs[i].peering_state->list_done >= 3) { - calc_object_states(pgs[i]); + pgs[i].calc_object_states(); peering_state = 0; } } @@ -183,30 +181,36 @@ void osd_t::handle_peers() void osd_t::start_pg_peering(int pg_idx) { auto & pg = pgs[pg_idx]; - auto ps = pg.peering_state = new osd_pg_peering_state_t(); - ps->self = this; - ps->pg_num = pg_idx; // FIXME probably shouldn't be pg_idx + auto ps = pg.peering_state = new pg_peering_state_t(); { + uint64_t osd_num = this->osd_num; osd_op_t *op = new osd_op_t(); op->op_type = 0; op->peer_fd = 0; op->bs_op.opcode = BS_OP_LIST; - op->bs_op.callback = [ps, op](blockstore_op_t *bs_op) + op->bs_op.callback = [ps, op, osd_num](blockstore_op_t *bs_op) { + if (op->bs_op.retval < 0) + { + throw std::runtime_error("OP_LIST failed"); + } printf( "Got object list from OSD %lu (local): %d objects (%lu of them stable)\n", - ps->self->osd_num, bs_op->retval, bs_op->version + osd_num, bs_op->retval, bs_op->version ); - op->buf = op->bs_op.buf; - op->reply.hdr.retval = op->bs_op.retval; - op->reply.sec_list.stable_count = op->bs_op.version; + ps->list_results[osd_num] = { + .buf = (obj_ver_id*)op->bs_op.buf, + .total_count = (uint64_t)op->bs_op.retval, + .stable_count = op->bs_op.version, + }; ps->list_done++; + delete op; }; - pg.peering_state->list_ops[osd_num] = op; bs->enqueue_op(&op->bs_op); } for (int i = 0; i < peers.size(); i++) { + uint64_t osd_num = peers[i].osd_num; auto & cl = clients[osd_peer_fds[peers[i].osd_num]]; osd_op_t *op = new osd_op_t(); op->op_type = OSD_OP_OUT; @@ -222,161 +226,25 @@ void osd_t::start_pg_peering(int pg_idx) .pgtotal = 1, }, }; - op->callback = [ps](osd_op_t *op) + op->callback = [ps, osd_num](osd_op_t *op) { + if (op->reply.hdr.retval < 0) + { + throw std::runtime_error("OP_LIST failed"); + } printf( "Got object list from OSD %lu: %ld objects (%lu of them stable)\n", - ps->self->clients[op->peer_fd].osd_num, op->reply.hdr.retval, - op->reply.sec_list.stable_count + osd_num, op->reply.hdr.retval, op->reply.sec_list.stable_count ); + ps->list_results[osd_num] = { + .buf = (obj_ver_id*)op->buf, + .total_count = (uint64_t)op->reply.hdr.retval, + .stable_count = op->reply.sec_list.stable_count, + }; + op->buf = NULL; ps->list_done++; + delete op; }; - pg.peering_state->list_ops[cl.osd_num] = op; outbox_push(cl, op); } } - -void osd_t::remember_object(osd_pg_t &pg, osd_obj_state_check_t &st, std::vector &all, int end) -{ - // Remember the decision - uint64_t state = 0; - if (st.n_roles == pg.pg_size) - { - if (st.n_matched == pg.pg_size) - state = OBJ_CLEAN; - else - state = OBJ_MISPLACED; - } - else if (st.n_roles < pg.pg_minsize) - state = OBJ_INCOMPLETE; - else - state = OBJ_DEGRADED; - if (st.n_copies > pg.pg_size) - state |= OBJ_OVERCOPIED; - if (st.n_stable < st.n_copies) - state |= OBJ_NONSTABILIZED; - if (st.target_ver < st.max_ver) - state |= OBJ_UNDERWRITTEN; - if (st.is_buggy) - state |= OBJ_BUGGY; - if (state != OBJ_CLEAN) - { - st.state_obj.state = state; - st.state_obj.loc.clear(); - for (int i = st.start; i < end; i++) - { - st.state_obj.loc.push_back((osd_obj_loc_t){ - .role = (all[i].oid.stripe & STRIPE_MASK), - .osd_num = all[i].osd_num, - .stable = all[i].is_stable, - }); - } - std::sort(st.state_obj.loc.begin(), st.state_obj.loc.end()); - auto ins = pg.state_dict.insert(st.state_obj); - pg.obj_states[st.oid] = &(*(ins.first)); - if (state & OBJ_UNDERWRITTEN) - { - pg.ver_override[st.oid] = { - .max_ver = st.max_ver, - .target_ver = st.target_ver, - }; - } - } -} - -void osd_t::calc_object_states(osd_pg_t &pg) -{ - // Copy all object lists into one array - std::vector all; - auto ps = pg.peering_state; - for (auto e: ps->list_ops) - { - osd_op_t* op = e.second; - auto nstab = op->reply.sec_list.stable_count; - auto n = op->reply.hdr.retval; - auto osd_num = clients[op->peer_fd].osd_num; - all.resize(all.size() + n); - obj_ver_id *ov = (obj_ver_id*)op->buf; - for (uint64_t i = 0; i < n; i++, ov++) - { - all[i] = { - .oid = ov->oid, - .version = ov->version, - .osd_num = osd_num, - .is_stable = i < nstab, - }; - } - free(op->buf); - op->buf = NULL; - } - // Sort - std::sort(all.begin(), all.end()); - // Walk over it and check object states - int replica = 0; - osd_obj_state_check_t st; - for (int i = 0; i < all.size(); i++) - { - if (st.oid.inode != all[i].oid.inode || - st.oid.stripe != (all[i].oid.stripe >> STRIPE_SHIFT)) - { - if (st.oid.inode != 0) - { - // Remember object state - remember_object(pg, st, all, i); - } - st.start = i; - st.oid = { .inode = all[i].oid.inode, .stripe = all[i].oid.stripe >> STRIPE_SHIFT }; - st.max_ver = st.target_ver = all[i].version; - st.has_roles = st.n_copies = st.n_roles = st.n_stable = st.n_matched = 0; - st.is_buggy = false; - } - if (st.target_ver != all[i].version) - { - if (st.n_stable > 0 || st.n_roles >= pg.pg_minsize) - { - // Version is either recoverable or stable, choose it as target and skip previous versions - remember_object(pg, st, all, i); - while (i < all.size() && st.oid.inode == all[i].oid.inode && - st.oid.stripe == (all[i].oid.stripe >> STRIPE_SHIFT)) - { - i++; - } - continue; - } - else - { - // Remember that there are newer unrecoverable versions - st.target_ver = all[i].version; - st.has_roles = st.n_copies = st.n_roles = st.n_stable = st.n_matched = 0; - } - } - replica = (all[i].oid.stripe & STRIPE_MASK); - st.n_copies++; - if (replica >= pg.pg_size) - { - // FIXME In the future, check it against the PG epoch number to handle replication factor/scheme changes - st.is_buggy = true; - } - else - { - if (all[i].is_stable) - { - st.n_stable++; - } - else if (pg.target_set[replica] == all[i].osd_num) - { - st.n_matched++; - } - if (!(st.has_roles & (1 << replica))) - { - st.has_roles = st.has_roles | (1 << replica); - st.n_roles++; - } - } - } - if (st.oid.inode != 0) - { - // Remember object state - remember_object(pg, st, all, all.size()); - } -} diff --git a/osd_peering_pg.cpp b/osd_peering_pg.cpp new file mode 100644 index 000000000..a27326c7f --- /dev/null +++ b/osd_peering_pg.cpp @@ -0,0 +1,162 @@ +#include "osd_peering_pg.h" + +void pg_t::remember_object(pg_obj_state_check_t &st, std::vector &all, int end) +{ + auto & pg = *this; + // Remember the decision + uint64_t state = 0; + if (st.n_roles == pg.pg_size) + { + if (st.n_matched == pg.pg_size) + state = OBJ_CLEAN; + else + state = OBJ_MISPLACED; + } + else if (st.n_roles < pg.pg_minsize) + state = OBJ_INCOMPLETE; + else + state = OBJ_DEGRADED; + if (st.n_copies > pg.pg_size) + state |= OBJ_OVERCOPIED; + if (st.n_stable < st.n_copies) + state |= OBJ_NONSTABILIZED; + if (st.target_ver < st.max_ver) + state |= OBJ_UNDERWRITTEN; + if (st.is_buggy) + state |= OBJ_BUGGY; + if (state != OBJ_CLEAN) + { + st.osd_set.clear(); + for (int i = st.start; i < end; i++) + { + st.osd_set.push_back((pg_obj_loc_t){ + .role = (all[i].oid.stripe & STRIPE_MASK), + .osd_num = all[i].osd_num, + .stable = all[i].is_stable, + }); + } + std::sort(st.osd_set.begin(), st.osd_set.end()); + auto it = pg.state_dict.find(st.osd_set); + if (it == pg.state_dict.end()) + { + pg.state_dict[st.osd_set] = { + .osd_set = st.osd_set, + .state = state, + .object_count = 1, + }; + it = pg.state_dict.find(st.osd_set); + } + else + it->second.object_count++; + pg.obj_states[st.oid] = &it->second; + if (state & OBJ_UNDERWRITTEN) + { + pg.ver_override[st.oid] = { + .max_ver = st.max_ver, + .target_ver = st.target_ver, + }; + } + } + else + pg.clean_count++; +} + +void pg_t::calc_object_states() +{ + auto & pg = *this; + // Copy all object lists into one array + std::vector all; + auto ps = pg.peering_state; + for (auto it: ps->list_results) + { + auto nstab = it.second.stable_count; + auto n = it.second.total_count; + auto osd_num = it.first; + uint64_t start = all.size(); + all.resize(start + n); + obj_ver_id *ov = it.second.buf; + for (uint64_t i = 0; i < n; i++, ov++) + { + all[start+i] = { + .oid = ov->oid, + .version = ov->version, + .osd_num = osd_num, + .is_stable = i < nstab, + }; + } + free(it.second.buf); + it.second.buf = NULL; + } + ps->list_results.clear(); + // Sort + std::sort(all.begin(), all.end()); + // Walk over it and check object states + pg.clean_count = 0; + int replica = 0; + pg_obj_state_check_t st; + for (int i = 0; i < all.size(); i++) + { + if (st.oid.inode != all[i].oid.inode || + st.oid.stripe != (all[i].oid.stripe >> STRIPE_SHIFT)) + { + if (st.oid.inode != 0) + { + // Remember object state + remember_object(st, all, i); + } + st.start = i; + st.oid = { .inode = all[i].oid.inode, .stripe = all[i].oid.stripe >> STRIPE_SHIFT }; + st.max_ver = st.target_ver = all[i].version; + st.has_roles = st.n_copies = st.n_roles = st.n_stable = st.n_matched = 0; + st.is_buggy = false; + } + if (st.target_ver != all[i].version) + { + if (st.n_stable > 0 || st.n_roles >= pg.pg_minsize) + { + // Version is either recoverable or stable, choose it as target and skip previous versions + remember_object(st, all, i); + while (i < all.size() && st.oid.inode == all[i].oid.inode && + st.oid.stripe == (all[i].oid.stripe >> STRIPE_SHIFT)) + { + i++; + } + continue; + } + else + { + // Remember that there are newer unrecoverable versions + st.target_ver = all[i].version; + st.has_roles = st.n_copies = st.n_roles = st.n_stable = st.n_matched = 0; + } + } + replica = (all[i].oid.stripe & STRIPE_MASK); + st.n_copies++; + if (replica >= pg.pg_size) + { + // FIXME In the future, check it against the PG epoch number to handle replication factor/scheme changes + st.is_buggy = true; + } + else + { + if (all[i].is_stable) + { + st.n_stable++; + } + if (pg.target_set[replica] == all[i].osd_num) + { + st.n_matched++; + } + if (!(st.has_roles & (1 << replica))) + { + st.has_roles = st.has_roles | (1 << replica); + st.n_roles++; + } + } + } + if (st.oid.inode != 0) + { + // Remember object state + remember_object(st, all, all.size()); + } +} diff --git a/osd_peering_pg.h b/osd_peering_pg.h new file mode 100644 index 000000000..2d60f82c2 --- /dev/null +++ b/osd_peering_pg.h @@ -0,0 +1,142 @@ +#include +#include +#include + +#include "object_id.h" + +#include "sparsepp/sparsepp/spp.h" + +// Placement group states +// Exactly one of these: +#define PG_OFFLINE (1<<0) +#define PG_PEERING (1<<1) +#define PG_INCOMPLETE (1<<2) +#define PG_ACTIVE (1<<3) +// Plus any of these: +#define PG_HAS_UNFOUND (1<<4) +#define PG_HAS_DEGRADED (1<<5) +#define PG_HAS_MISPLACED (1<<6) + +// OSD object states +#define OBJ_CLEAN 0x01 +#define OBJ_MISPLACED 0x02 +#define OBJ_DEGRADED 0x03 +#define OBJ_INCOMPLETE 0x04 +#define OBJ_NONSTABILIZED 0x10000 +#define OBJ_UNDERWRITTEN 0x20000 +#define OBJ_OVERCOPIED 0x40000 +#define OBJ_BUGGY 0x80000 + +// Max 64 replicas +#define STRIPE_MASK 0x3F +#define STRIPE_SHIFT 6 + +struct pg_obj_loc_t +{ + uint64_t role; + uint64_t osd_num; + bool stable; +}; + +typedef std::vector pg_osd_set_t; + +struct pg_osd_set_state_t +{ + pg_osd_set_t osd_set; + uint64_t state = 0; + uint64_t object_count = 0; +}; + +struct pg_ver_override_t +{ + uint64_t max_ver; + uint64_t target_ver; +}; + +struct pg_list_result_t +{ + obj_ver_id *buf; + uint64_t total_count; + uint64_t stable_count; +}; + +struct pg_peering_state_t +{ + // osd_num -> list result + spp::sparse_hash_map list_results; + int list_done = 0; +}; + +struct pg_obj_state_check_t +{ + int start = 0; + object_id oid = { 0 }; + uint64_t max_ver = 0; + uint64_t target_ver = 0; + uint64_t n_copies = 0, has_roles = 0, n_roles = 0, n_stable = 0, n_matched = 0; + bool is_buggy = false; + pg_osd_set_t osd_set; +}; + +struct obj_ver_role +{ + object_id oid; + uint64_t version; + uint64_t osd_num; + bool is_stable; +}; + +struct pg_t +{ + int state; + uint64_t pg_size = 3, pg_minsize = 2; + uint64_t pg_num; + uint64_t clean_count = 0; + // target_set = (role => osd_num). role starts from zero + std::vector target_set; + // moved object map. by default, each object is considered to reside on the target_set. + // this map stores all objects that differ. + // this map may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario + // which is up to ~192 MB per 1 TB in the worst case scenario + std::map state_dict; + spp::sparse_hash_map obj_states; + spp::sparse_hash_map ver_override; + pg_peering_state_t *peering_state = NULL; + + void calc_object_states(); + void remember_object(pg_obj_state_check_t &st, std::vector &all, int end); +}; + +inline bool operator < (const pg_obj_loc_t &a, const pg_obj_loc_t &b) +{ + return a.role < b.role || a.role == b.role && a.osd_num < b.osd_num || + a.role == b.role && a.osd_num == b.osd_num && a.stable < b.stable; +} + +inline bool operator < (const obj_ver_role & a, const obj_ver_role & b) +{ + return a.oid < b.oid || + // object versions come in descending order + a.oid == b.oid && a.version > b.version || + a.oid == b.oid && a.version == b.version || + a.oid == b.oid && a.version == b.version && a.osd_num < b.osd_num; +} + +namespace std +{ + template<> struct hash + { + inline size_t operator()(const pg_osd_set_t &s) const + { + size_t seed = 0; + for (auto e: s) + { + // Copy-pasted from spp::hash_combine() + seed ^= (e.role + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2)); + seed ^= (e.osd_num + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2)); + seed ^= ((e.stable ? 1 : 0) + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2)); + } + return seed; + } + }; +} diff --git a/test.cpp b/test.cpp index 7c73ce103..eb7f9b8e5 100644 --- a/test.cpp +++ b/test.cpp @@ -25,6 +25,7 @@ #include "blockstore.h" #include "blockstore_impl.h" +#include "osd_peering_pg.h" static int setup_context(unsigned entries, struct io_uring *ring) { @@ -285,23 +286,7 @@ int main03(int argc, char *argv[]) return 0; } -struct obj_ver_role -{ - object_id oid; - uint64_t version; - uint32_t osd_num; - uint32_t is_stable; -}; - -inline bool operator < (const obj_ver_role & a, const obj_ver_role & b) -{ - return a.oid < b.oid || - a.oid == b.oid && a.version < b.version || - a.oid == b.oid && a.version == b.version || - a.oid == b.oid && a.version == b.version && a.osd_num < b.osd_num; -} - -int main(int argc, char *argv[]) +int main04(int argc, char *argv[]) { /*spp::sparse_hash_set osd1, osd2; // fill takes 18.9 s @@ -336,19 +321,54 @@ int main(int argc, char *argv[]) { to_sort[i] = { .oid = (object_id){ - .inode = rand() % 500, - .stripe = rand(), + .inode = (uint64_t)(rand() % 500), + .stripe = (uint64_t)rand(), }, - .version = rand(), - .osd_num = rand() % 16, + .version = (uint64_t)rand(), + .osd_num = (uint64_t)(rand() % 16), }; } printf("Sorting\n"); // sorting the whole array takes 7 s - //std::sort(to_sort.begin(), to_sort.end()); // sorting in 3 parts... almost the same, 6 s - std::sort(to_sort.begin(), to_sort.begin() + to_sort.size()/3); - std::sort(to_sort.begin() + to_sort.size()/3, to_sort.begin() + to_sort.size()*2/3); - std::sort(to_sort.begin() + to_sort.size()*2/3, to_sort.end()); + std::sort(to_sort.begin(), to_sort.end()); + return 0; +} + +int main(int argc, char *argv[]) +{ + // FIXME extract this into a test + pg_t pg = { + .state = PG_PEERING, + .pg_num = 1, + .target_set = { 1, 2, 3 }, + .peering_state = new pg_peering_state_t(), + }; + pg.peering_state->list_done = 3; + for (uint64_t osd_num = 1; osd_num <= 3; osd_num++) + { + pg_list_result_t r = { + .buf = (obj_ver_id*)malloc(sizeof(obj_ver_id) * 1024*1024*8), + .total_count = 1024*1024*8, + .stable_count = (uint64_t)(1024*1024*8 - (osd_num == 1 ? 10 : 0)), + }; + for (uint64_t i = 0; i < r.total_count; i++) + { + r.buf[i] = { + .oid = { + .inode = 1, + .stripe = (i << STRIPE_SHIFT) | (osd_num-1), + }, + .version = (uint64_t)(osd_num == 1 && i >= r.total_count - 10 ? 2 : 1), + }; + } + pg.peering_state->list_results[osd_num] = r; + } + pg.calc_object_states(); + printf("deviation variants=%ld clean=%lu\n", pg.state_dict.size(), pg.clean_count); + for (auto it: pg.state_dict) + { + printf("dev: state=%lx\n", it.second.state); + } return 0; }