diff --git a/object_id.h b/object_id.h index e08c7da5d..ccc14c7db 100644 --- a/object_id.h +++ b/object_id.h @@ -2,8 +2,12 @@ #include +// Max 64 replicas +#define STRIPE_MASK 0x3F +#define STRIPE_SHIFT 6 + // 16 bytes per object/stripe id -// stripe includes replica number in 4 least significant bits +// stripe includes replica number in 6 (or maybe 4, see above) least significant bits struct __attribute__((__packed__)) object_id { uint64_t inode; @@ -12,7 +16,7 @@ struct __attribute__((__packed__)) object_id inline uint64_t operator % (const object_id & a, const uint64_t b) { - return ((a.inode % b) * (0x100000000 % b) * (0x100000000 % b) + a.stripe % b) % b; + return ((a.inode % b) * (0x100000000 % b) * (0x100000000 % b) + (a.stripe >> STRIPE_SHIFT) % b) % b; } inline bool operator == (const object_id & a, const object_id & b) diff --git a/osd_peering.cpp b/osd_peering.cpp index c489c0cd4..a160ccdcb 100644 --- a/osd_peering.cpp +++ b/osd_peering.cpp @@ -180,6 +180,7 @@ void osd_t::handle_peers() void osd_t::start_pg_peering(int pg_idx) { + // FIXME: Set PG_INCOMPLETE if incomplete auto & pg = pgs[pg_idx]; auto ps = pg.peering_state = new pg_peering_state_t(); { diff --git a/osd_peering_pg.cpp b/osd_peering_pg.cpp index a27326c7f..e0c1c9dfc 100644 --- a/osd_peering_pg.cpp +++ b/osd_peering_pg.cpp @@ -1,33 +1,56 @@ #include "osd_peering_pg.h" -void pg_t::remember_object(pg_obj_state_check_t &st, std::vector &all, int end) +void pg_t::remember_object(pg_obj_state_check_t &st, std::vector &all) { auto & pg = *this; // Remember the decision uint64_t state = 0; - if (st.n_roles == pg.pg_size) + if (st.n_roles == pg.pg_cursize) { - if (st.n_matched == pg.pg_size) + if (st.n_matched == pg.pg_cursize) state = OBJ_CLEAN; else + { state = OBJ_MISPLACED; + pg.state = pg.state | PG_HAS_MISPLACED; + } } else if (st.n_roles < pg.pg_minsize) + { state = OBJ_INCOMPLETE; + pg.state = pg.state | PG_HAS_INCOMPLETE; + } else + { state = OBJ_DEGRADED; + pg.state = pg.state | PG_HAS_DEGRADED; + } if (st.n_copies > pg.pg_size) + { state |= OBJ_OVERCOPIED; + pg.state = pg.state | PG_HAS_UNCLEAN; + } if (st.n_stable < st.n_copies) - state |= OBJ_NONSTABILIZED; - if (st.target_ver < st.max_ver) - state |= OBJ_UNDERWRITTEN; + { + state |= OBJ_NEEDS_STABLE; + pg.state = pg.state | PG_HAS_UNCLEAN; + } + if (st.target_ver < st.max_ver || st.has_old_unstable) + { + state |= OBJ_NEEDS_ROLLBACK; + pg.state = pg.state | PG_HAS_UNCLEAN; + pg.ver_override[st.oid] = st.target_ver; + } if (st.is_buggy) + { state |= OBJ_BUGGY; + // FIXME: bring pg offline + throw std::runtime_error("buggy object state"); + } if (state != OBJ_CLEAN) { st.osd_set.clear(); - for (int i = st.start; i < end; i++) + for (int i = st.ver_start; i < st.ver_end; i++) { st.osd_set.push_back((pg_obj_loc_t){ .role = (all[i].oid.stripe & STRIPE_MASK), @@ -47,14 +70,52 @@ void pg_t::remember_object(pg_obj_state_check_t &st, std::vector & it = pg.state_dict.find(st.osd_set); } else - it->second.object_count++; - pg.obj_states[st.oid] = &it->second; - if (state & OBJ_UNDERWRITTEN) { - pg.ver_override[st.oid] = { - .max_ver = st.max_ver, - .target_ver = st.target_ver, - }; + it->second.object_count++; + } + pg.obj_states[st.oid] = &it->second; + if (st.target_ver < st.max_ver) + { + pg.ver_override[st.oid] = st.target_ver; + } + if (state & (OBJ_NEEDS_ROLLBACK | OBJ_NEEDS_STABLE)) + { + spp::sparse_hash_map pieces; + for (int i = st.obj_start; i < st.obj_end; i++) + { + auto & pcs = pieces[(obj_piece_id_t){ .oid = all[i].oid, .osd_num = all[i].osd_num }]; + if (!pcs.max_ver) + { + pcs.max_ver = all[i].version; + } + if (all[i].is_stable && !pcs.stable_ver) + { + pcs.stable_ver = all[i].version; + } + } + for (auto pp: pieces) + { + auto & pcs = pp.second; + if (pcs.stable_ver < pcs.max_ver) + { + auto & act = obj_stab_actions[pp.first]; + if (pcs.max_ver > st.target_ver) + { + act.rollback = true; + act.rollback_to = st.target_ver; + } + else if (pcs.max_ver < st.target_ver && pcs.stable_ver < pcs.max_ver) + { + act.rollback = true; + act.rollback_to = pcs.stable_ver; + } + if (pcs.max_ver >= st.target_ver && pcs.stable_ver < st.target_ver) + { + act.make_stable = true; + act.stable_to = st.target_ver; + } + } + } } } else @@ -102,30 +163,40 @@ void pg_t::calc_object_states() if (st.oid.inode != 0) { // Remember object state - remember_object(st, all, i); + st.obj_end = st.ver_end = i; + remember_object(st, all); } - st.start = i; + st.obj_start = st.ver_start = i; st.oid = { .inode = all[i].oid.inode, .stripe = all[i].oid.stripe >> STRIPE_SHIFT }; st.max_ver = st.target_ver = all[i].version; st.has_roles = st.n_copies = st.n_roles = st.n_stable = st.n_matched = 0; - st.is_buggy = false; + st.is_buggy = st.has_old_unstable = false; } - if (st.target_ver != all[i].version) + else if (st.target_ver != all[i].version) { if (st.n_stable > 0 || st.n_roles >= pg.pg_minsize) { // Version is either recoverable or stable, choose it as target and skip previous versions - remember_object(st, all, i); + st.ver_end = i; + i++; while (i < all.size() && st.oid.inode == all[i].oid.inode && st.oid.stripe == (all[i].oid.stripe >> STRIPE_SHIFT)) { + if (!all[i].is_stable) + { + st.has_old_unstable = true; + } i++; } + st.obj_end = i; + remember_object(st, all); + i--; continue; } else { // Remember that there are newer unrecoverable versions + st.ver_start = i; st.target_ver = all[i].version; st.has_roles = st.n_copies = st.n_roles = st.n_stable = st.n_matched = 0; } @@ -157,6 +228,8 @@ void pg_t::calc_object_states() if (st.oid.inode != 0) { // Remember object state - remember_object(st, all, all.size()); + st.obj_end = st.ver_end = all.size(); + remember_object(st, all); } + pg.state = pg.state | PG_ACTIVE; } diff --git a/osd_peering_pg.h b/osd_peering_pg.h index 2d60f82c2..cad21d0b3 100644 --- a/osd_peering_pg.h +++ b/osd_peering_pg.h @@ -13,24 +13,21 @@ #define PG_INCOMPLETE (1<<2) #define PG_ACTIVE (1<<3) // Plus any of these: -#define PG_HAS_UNFOUND (1<<4) +#define PG_HAS_INCOMPLETE (1<<4) #define PG_HAS_DEGRADED (1<<5) #define PG_HAS_MISPLACED (1<<6) +#define PG_HAS_UNCLEAN (1<<7) // OSD object states #define OBJ_CLEAN 0x01 #define OBJ_MISPLACED 0x02 #define OBJ_DEGRADED 0x03 #define OBJ_INCOMPLETE 0x04 -#define OBJ_NONSTABILIZED 0x10000 -#define OBJ_UNDERWRITTEN 0x20000 +#define OBJ_NEEDS_STABLE 0x10000 +#define OBJ_NEEDS_ROLLBACK 0x20000 #define OBJ_OVERCOPIED 0x40000 #define OBJ_BUGGY 0x80000 -// Max 64 replicas -#define STRIPE_MASK 0x3F -#define STRIPE_SHIFT 6 - struct pg_obj_loc_t { uint64_t role; @@ -47,12 +44,6 @@ struct pg_osd_set_state_t uint64_t object_count = 0; }; -struct pg_ver_override_t -{ - uint64_t max_ver; - uint64_t target_ver; -}; - struct pg_list_result_t { obj_ver_id *buf; @@ -69,12 +60,12 @@ struct pg_peering_state_t struct pg_obj_state_check_t { - int start = 0; + int obj_start = 0, obj_end = 0, ver_start = 0, ver_end = 0; object_id oid = { 0 }; uint64_t max_ver = 0; uint64_t target_ver = 0; uint64_t n_copies = 0, has_roles = 0, n_roles = 0, n_stable = 0, n_matched = 0; - bool is_buggy = false; + bool is_buggy = false, has_old_unstable = false; pg_osd_set_t osd_set; }; @@ -86,25 +77,45 @@ struct obj_ver_role bool is_stable; }; +struct obj_piece_id_t +{ + object_id oid; + uint64_t osd_num; +}; + +struct obj_piece_ver_t +{ + uint64_t max_ver = 0; + uint64_t stable_ver = 0; +}; + +struct obj_stab_action_t +{ + bool rollback = false, make_stable = false; + uint64_t stable_to = 0, rollback_to = 0; +}; + struct pg_t { int state; - uint64_t pg_size = 3, pg_minsize = 2; + uint64_t pg_cursize = 3, pg_size = 3, pg_minsize = 2; uint64_t pg_num; uint64_t clean_count = 0; - // target_set = (role => osd_num). role starts from zero + // target_set = (role => osd_num). role numbers start with zero + // when PG is degraded, target_set only includes 2 OSDs std::vector target_set; // moved object map. by default, each object is considered to reside on the target_set. // this map stores all objects that differ. - // this map may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario + // it may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario // which is up to ~192 MB per 1 TB in the worst case scenario std::map state_dict; spp::sparse_hash_map obj_states; - spp::sparse_hash_map ver_override; + std::map obj_stab_actions; + spp::sparse_hash_map ver_override; pg_peering_state_t *peering_state = NULL; void calc_object_states(); - void remember_object(pg_obj_state_check_t &st, std::vector &all, int end); + void remember_object(pg_obj_state_check_t &st, std::vector &all); }; inline bool operator < (const pg_obj_loc_t &a, const pg_obj_loc_t &b) @@ -122,6 +133,16 @@ inline bool operator < (const obj_ver_role & a, const obj_ver_role & b) a.oid == b.oid && a.version == b.version && a.osd_num < b.osd_num; } +inline bool operator == (const obj_piece_id_t & a, const obj_piece_id_t & b) +{ + return a.oid == b.oid && a.osd_num == b.osd_num; +} + +inline bool operator < (const obj_piece_id_t & a, const obj_piece_id_t & b) +{ + return a.oid < b.oid || a.oid == b.oid && a.osd_num < b.osd_num; +} + namespace std { template<> struct hash @@ -139,4 +160,15 @@ namespace std return seed; } }; + + template<> struct hash + { + inline size_t operator()(const obj_piece_id_t &s) const + { + size_t seed = std::hash()(s.oid); + // Copy-pasted from spp::hash_combine() + seed ^= (s.osd_num + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2)); + return seed; + } + }; } diff --git a/test.cpp b/test.cpp index eb7f9b8e5..fae4ae36a 100644 --- a/test.cpp +++ b/test.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -26,6 +27,7 @@ #include "blockstore.h" #include "blockstore_impl.h" #include "osd_peering_pg.h" +//#include "cpp-btree/btree_map.h" static int setup_context(unsigned entries, struct io_uring *ring) { @@ -335,7 +337,7 @@ int main04(int argc, char *argv[]) return 0; } -int main(int argc, char *argv[]) +int main05(int argc, char *argv[]) { // FIXME extract this into a test pg_t pg = { @@ -372,3 +374,50 @@ int main(int argc, char *argv[]) } return 0; } + +int main(int argc, char *argv[]) +{ + timeval fill_start, fill_end, filter_end; + spp::sparse_hash_map clean_db; + //std::map clean_db; + //btree::btree_map clean_db; + gettimeofday(&fill_start, NULL); + printf("filling\n"); + uint64_t total = 1024*1024*8*4; + clean_db.resize(total); + for (uint64_t i = 0; i < total; i++) + { + clean_db[(object_id){ + .inode = 1, + //.stripe = (i << STRIPE_SHIFT), + .stripe = (((367*i) % total) << STRIPE_SHIFT), + }] = (clean_entry){ + .version = 1, + .location = i << DEFAULT_ORDER, + }; + } + gettimeofday(&fill_end, NULL); + // no resize(): + // spp = 17.87s (seq), 41.81s (rand), 3.29s (seq+resize), 8.3s (rand+resize), ~1.3G RAM in all cases + // std::unordered_map = 6.14 sec, ~2.3G RAM + // std::map = 13 sec (seq), 5.54 sec (rand), ~2.5G RAM + // cpp-btree = 2.47 sec (seq) ~1.2G RAM, 20.6 sec (pseudo-random 367*i % total) ~1.5G RAM + printf("filled %.2f sec\n", (fill_end.tv_sec - fill_start.tv_sec) + (fill_end.tv_usec - fill_start.tv_usec) / 1000000.0); + for (int pg = 0; pg < 100; pg++) + { + obj_ver_id* buf1 = (obj_ver_id*)malloc(sizeof(obj_ver_id) * ((total+99)/100)); + int j = 0; + for (auto it: clean_db) + if ((it.first % 100) == pg) + buf1[j++] = { .oid = it.first, .version = it.second.version }; + free(buf1); + printf("filtered %d\n", j); + } + gettimeofday(&filter_end, NULL); + // spp = 42.15 sec / 60 sec (rand) + // std::unordered_map = 43.7 sec + // std::map = 156.13 sec + // cpp-btree = 21.87 sec (seq), 44.33 sec (rand) + printf("100 times filter %.2f sec\n", (filter_end.tv_sec - fill_end.tv_sec) + (filter_end.tv_usec - fill_end.tv_usec) / 1000000.0); + return 0; +}