Calculate required stabilize/rollback actions, add more map perf tests

blocking-uring-test
Vitaliy Filippov 2020-01-27 01:34:24 +03:00
parent 079f129390
commit 1447c44b68
5 changed files with 202 additions and 43 deletions

View File

@ -2,8 +2,12 @@
#include <stdint.h>
// Max 64 replicas
#define STRIPE_MASK 0x3F
#define STRIPE_SHIFT 6
// 16 bytes per object/stripe id
// stripe includes replica number in 4 least significant bits
// stripe includes replica number in 6 (or maybe 4, see above) least significant bits
struct __attribute__((__packed__)) object_id
{
uint64_t inode;
@ -12,7 +16,7 @@ struct __attribute__((__packed__)) object_id
inline uint64_t operator % (const object_id & a, const uint64_t b)
{
return ((a.inode % b) * (0x100000000 % b) * (0x100000000 % b) + a.stripe % b) % b;
return ((a.inode % b) * (0x100000000 % b) * (0x100000000 % b) + (a.stripe >> STRIPE_SHIFT) % b) % b;
}
inline bool operator == (const object_id & a, const object_id & b)

View File

@ -180,6 +180,7 @@ void osd_t::handle_peers()
void osd_t::start_pg_peering(int pg_idx)
{
// FIXME: Set PG_INCOMPLETE if incomplete
auto & pg = pgs[pg_idx];
auto ps = pg.peering_state = new pg_peering_state_t();
{

View File

@ -1,33 +1,56 @@
#include "osd_peering_pg.h"
void pg_t::remember_object(pg_obj_state_check_t &st, std::vector<obj_ver_role> &all, int end)
void pg_t::remember_object(pg_obj_state_check_t &st, std::vector<obj_ver_role> &all)
{
auto & pg = *this;
// Remember the decision
uint64_t state = 0;
if (st.n_roles == pg.pg_size)
if (st.n_roles == pg.pg_cursize)
{
if (st.n_matched == pg.pg_size)
if (st.n_matched == pg.pg_cursize)
state = OBJ_CLEAN;
else
{
state = OBJ_MISPLACED;
pg.state = pg.state | PG_HAS_MISPLACED;
}
}
else if (st.n_roles < pg.pg_minsize)
{
state = OBJ_INCOMPLETE;
pg.state = pg.state | PG_HAS_INCOMPLETE;
}
else
{
state = OBJ_DEGRADED;
pg.state = pg.state | PG_HAS_DEGRADED;
}
if (st.n_copies > pg.pg_size)
{
state |= OBJ_OVERCOPIED;
pg.state = pg.state | PG_HAS_UNCLEAN;
}
if (st.n_stable < st.n_copies)
state |= OBJ_NONSTABILIZED;
if (st.target_ver < st.max_ver)
state |= OBJ_UNDERWRITTEN;
{
state |= OBJ_NEEDS_STABLE;
pg.state = pg.state | PG_HAS_UNCLEAN;
}
if (st.target_ver < st.max_ver || st.has_old_unstable)
{
state |= OBJ_NEEDS_ROLLBACK;
pg.state = pg.state | PG_HAS_UNCLEAN;
pg.ver_override[st.oid] = st.target_ver;
}
if (st.is_buggy)
{
state |= OBJ_BUGGY;
// FIXME: bring pg offline
throw std::runtime_error("buggy object state");
}
if (state != OBJ_CLEAN)
{
st.osd_set.clear();
for (int i = st.start; i < end; i++)
for (int i = st.ver_start; i < st.ver_end; i++)
{
st.osd_set.push_back((pg_obj_loc_t){
.role = (all[i].oid.stripe & STRIPE_MASK),
@ -47,14 +70,52 @@ void pg_t::remember_object(pg_obj_state_check_t &st, std::vector<obj_ver_role> &
it = pg.state_dict.find(st.osd_set);
}
else
it->second.object_count++;
pg.obj_states[st.oid] = &it->second;
if (state & OBJ_UNDERWRITTEN)
{
pg.ver_override[st.oid] = {
.max_ver = st.max_ver,
.target_ver = st.target_ver,
};
it->second.object_count++;
}
pg.obj_states[st.oid] = &it->second;
if (st.target_ver < st.max_ver)
{
pg.ver_override[st.oid] = st.target_ver;
}
if (state & (OBJ_NEEDS_ROLLBACK | OBJ_NEEDS_STABLE))
{
spp::sparse_hash_map<obj_piece_id_t, obj_piece_ver_t> pieces;
for (int i = st.obj_start; i < st.obj_end; i++)
{
auto & pcs = pieces[(obj_piece_id_t){ .oid = all[i].oid, .osd_num = all[i].osd_num }];
if (!pcs.max_ver)
{
pcs.max_ver = all[i].version;
}
if (all[i].is_stable && !pcs.stable_ver)
{
pcs.stable_ver = all[i].version;
}
}
for (auto pp: pieces)
{
auto & pcs = pp.second;
if (pcs.stable_ver < pcs.max_ver)
{
auto & act = obj_stab_actions[pp.first];
if (pcs.max_ver > st.target_ver)
{
act.rollback = true;
act.rollback_to = st.target_ver;
}
else if (pcs.max_ver < st.target_ver && pcs.stable_ver < pcs.max_ver)
{
act.rollback = true;
act.rollback_to = pcs.stable_ver;
}
if (pcs.max_ver >= st.target_ver && pcs.stable_ver < st.target_ver)
{
act.make_stable = true;
act.stable_to = st.target_ver;
}
}
}
}
}
else
@ -102,30 +163,40 @@ void pg_t::calc_object_states()
if (st.oid.inode != 0)
{
// Remember object state
remember_object(st, all, i);
st.obj_end = st.ver_end = i;
remember_object(st, all);
}
st.start = i;
st.obj_start = st.ver_start = i;
st.oid = { .inode = all[i].oid.inode, .stripe = all[i].oid.stripe >> STRIPE_SHIFT };
st.max_ver = st.target_ver = all[i].version;
st.has_roles = st.n_copies = st.n_roles = st.n_stable = st.n_matched = 0;
st.is_buggy = false;
st.is_buggy = st.has_old_unstable = false;
}
if (st.target_ver != all[i].version)
else if (st.target_ver != all[i].version)
{
if (st.n_stable > 0 || st.n_roles >= pg.pg_minsize)
{
// Version is either recoverable or stable, choose it as target and skip previous versions
remember_object(st, all, i);
st.ver_end = i;
i++;
while (i < all.size() && st.oid.inode == all[i].oid.inode &&
st.oid.stripe == (all[i].oid.stripe >> STRIPE_SHIFT))
{
if (!all[i].is_stable)
{
st.has_old_unstable = true;
}
i++;
}
st.obj_end = i;
remember_object(st, all);
i--;
continue;
}
else
{
// Remember that there are newer unrecoverable versions
st.ver_start = i;
st.target_ver = all[i].version;
st.has_roles = st.n_copies = st.n_roles = st.n_stable = st.n_matched = 0;
}
@ -157,6 +228,8 @@ void pg_t::calc_object_states()
if (st.oid.inode != 0)
{
// Remember object state
remember_object(st, all, all.size());
st.obj_end = st.ver_end = all.size();
remember_object(st, all);
}
pg.state = pg.state | PG_ACTIVE;
}

View File

@ -13,24 +13,21 @@
#define PG_INCOMPLETE (1<<2)
#define PG_ACTIVE (1<<3)
// Plus any of these:
#define PG_HAS_UNFOUND (1<<4)
#define PG_HAS_INCOMPLETE (1<<4)
#define PG_HAS_DEGRADED (1<<5)
#define PG_HAS_MISPLACED (1<<6)
#define PG_HAS_UNCLEAN (1<<7)
// OSD object states
#define OBJ_CLEAN 0x01
#define OBJ_MISPLACED 0x02
#define OBJ_DEGRADED 0x03
#define OBJ_INCOMPLETE 0x04
#define OBJ_NONSTABILIZED 0x10000
#define OBJ_UNDERWRITTEN 0x20000
#define OBJ_NEEDS_STABLE 0x10000
#define OBJ_NEEDS_ROLLBACK 0x20000
#define OBJ_OVERCOPIED 0x40000
#define OBJ_BUGGY 0x80000
// Max 64 replicas
#define STRIPE_MASK 0x3F
#define STRIPE_SHIFT 6
struct pg_obj_loc_t
{
uint64_t role;
@ -47,12 +44,6 @@ struct pg_osd_set_state_t
uint64_t object_count = 0;
};
struct pg_ver_override_t
{
uint64_t max_ver;
uint64_t target_ver;
};
struct pg_list_result_t
{
obj_ver_id *buf;
@ -69,12 +60,12 @@ struct pg_peering_state_t
struct pg_obj_state_check_t
{
int start = 0;
int obj_start = 0, obj_end = 0, ver_start = 0, ver_end = 0;
object_id oid = { 0 };
uint64_t max_ver = 0;
uint64_t target_ver = 0;
uint64_t n_copies = 0, has_roles = 0, n_roles = 0, n_stable = 0, n_matched = 0;
bool is_buggy = false;
bool is_buggy = false, has_old_unstable = false;
pg_osd_set_t osd_set;
};
@ -86,25 +77,45 @@ struct obj_ver_role
bool is_stable;
};
// Identifies one piece of an object as seen on one OSD.
// Used as the key of pg_t::obj_stab_actions and of the temporary per-object
// piece map built in pg_t::remember_object(); equality, ordering and a
// std::hash specialization for it are defined elsewhere in this file.
struct obj_piece_id_t
{
// object id of the piece; remember_object() copies it from obj_ver_role::oid
object_id oid;
// OSD number; remember_object() copies it from obj_ver_role::osd_num
uint64_t osd_num;
};
// Version summary collected for a single object piece (see obj_piece_id_t).
// Both fields start at zero, and zero is treated as "not set yet" by
// pg_t::remember_object():
//  - max_ver    is taken from the first listed entry of the piece;
//  - stable_ver is taken from the first listed entry that is marked stable.
struct obj_piece_ver_t
{
    uint64_t max_ver = 0, stable_ver = 0;
};
// Stabilize/rollback action required for one object piece.
// A default-constructed value means "nothing to do"; pg_t::remember_object()
// raises the flags and fills in the matching version.
struct obj_stab_action_t
{
    // true when the piece has to be rolled back to rollback_to
    bool rollback = false;
    // true when the piece has to be marked stable up to stable_to
    bool make_stable = false;
    // version to stabilize to (meaningful only when make_stable is set)
    uint64_t stable_to = 0;
    // version to roll back to (meaningful only when rollback is set)
    uint64_t rollback_to = 0;
};
struct pg_t
{
int state;
uint64_t pg_size = 3, pg_minsize = 2;
uint64_t pg_cursize = 3, pg_size = 3, pg_minsize = 2;
uint64_t pg_num;
uint64_t clean_count = 0;
// target_set = (role => osd_num). role starts from zero
// target_set = (role => osd_num). role numbers start with zero
// when PG is degraded, target_set only includes 2 OSDs
std::vector<uint64_t> target_set;
// moved object map. by default, each object is considered to reside on the target_set.
// this map stores all objects that differ.
// this map may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario
// it may consume up to ~ (raw storage / object size) * 24 bytes,
// which is up to ~192 MB per 1 TB in the worst case scenario
std::map<pg_osd_set_t, pg_osd_set_state_t> state_dict;
spp::sparse_hash_map<object_id, pg_osd_set_state_t*> obj_states;
spp::sparse_hash_map<object_id, pg_ver_override_t> ver_override;
std::map<obj_piece_id_t, obj_stab_action_t> obj_stab_actions;
spp::sparse_hash_map<object_id, uint64_t> ver_override;
pg_peering_state_t *peering_state = NULL;
void calc_object_states();
void remember_object(pg_obj_state_check_t &st, std::vector<obj_ver_role> &all, int end);
void remember_object(pg_obj_state_check_t &st, std::vector<obj_ver_role> &all);
};
inline bool operator < (const pg_obj_loc_t &a, const pg_obj_loc_t &b)
@ -122,6 +133,16 @@ inline bool operator < (const obj_ver_role & a, const obj_ver_role & b)
a.oid == b.oid && a.version == b.version && a.osd_num < b.osd_num;
}
// Two piece ids are equal when both the object id and the OSD number match.
inline bool operator == (const obj_piece_id_t & a, const obj_piece_id_t & b)
{
    if (!(a.oid == b.oid))
    {
        return false;
    }
    return a.osd_num == b.osd_num;
}
// Lexicographic ordering of piece ids: first by object id, then by OSD number.
inline bool operator < (const obj_piece_id_t & a, const obj_piece_id_t & b)
{
    if (a.oid < b.oid)
    {
        return true;
    }
    if (!(a.oid == b.oid))
    {
        return false;
    }
    // Same object: the OSD number decides
    return a.osd_num < b.osd_num;
}
namespace std
{
template<> struct hash<pg_osd_set_t>
@ -139,4 +160,15 @@ namespace std
return seed;
}
};
// Hash of a piece id: the object id hash combined with the OSD number.
template<> struct hash<obj_piece_id_t>
{
    inline size_t operator()(const obj_piece_id_t &s) const
    {
        const size_t oid_hash = std::hash<object_id>()(s.oid);
        // Mixing step copy-pasted from spp::hash_combine()
        return oid_hash ^ (s.osd_num + 0xc6a4a7935bd1e995 + (oid_hash << 6) + (oid_hash >> 2));
    }
};
}

View File

@ -2,6 +2,7 @@
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdint.h>
@ -26,6 +27,7 @@
#include "blockstore.h"
#include "blockstore_impl.h"
#include "osd_peering_pg.h"
//#include "cpp-btree/btree_map.h"
static int setup_context(unsigned entries, struct io_uring *ring)
{
@ -335,7 +337,7 @@ int main04(int argc, char *argv[])
return 0;
}
int main(int argc, char *argv[])
int main05(int argc, char *argv[])
{
// FIXME extract this into a test
pg_t pg = {
@ -372,3 +374,50 @@ int main(int argc, char *argv[])
}
return 0;
}
// Map performance test.
// Phase 1 ("fill"): inserts `total` clean_entry records keyed by object_id
// (inode 1, pseudo-randomly ordered stripes) into the map and prints the time.
// Phase 2 ("filter"): 100 times scans the whole map, collecting the entries
// whose object_id % 100 equals the current pseudo-PG number into a temporary
// buffer, prints the number of matches and finally the total time.
// Returns 0 on success, 1 if the filter buffer cannot be allocated.
int main(int argc, char *argv[])
{
    timeval fill_start, fill_end, filter_end;
    spp::sparse_hash_map<object_id, clean_entry> clean_db;
    //std::map<object_id, clean_entry> clean_db;
    //btree::btree_map<object_id, clean_entry> clean_db;
    gettimeofday(&fill_start, NULL);
    printf("filling\n");
    uint64_t total = 1024*1024*8*4;
    clean_db.resize(total);
    for (uint64_t i = 0; i < total; i++)
    {
        clean_db[(object_id){
            .inode = 1,
            //.stripe = (i << STRIPE_SHIFT),
            .stripe = (((367*i) % total) << STRIPE_SHIFT),
        }] = (clean_entry){
            .version = 1,
            .location = i << DEFAULT_ORDER,
        };
    }
    gettimeofday(&fill_end, NULL);
    // no resize():
    // spp = 17.87s (seq), 41.81s (rand), 3.29s (seq+resize), 8.3s (rand+resize), ~1.3G RAM in all cases
    // std::unordered_map = 6.14 sec, ~2.3G RAM
    // std::map = 13 sec (seq), 5.54 sec (rand), ~2.5G RAM
    // cpp-btree = 2.47 sec (seq) ~1.2G RAM, 20.6 sec (pseudo-random 367*i % total) ~1.5G RAM
    printf("filled %.2f sec\n", (fill_end.tv_sec - fill_start.tv_sec) + (fill_end.tv_usec - fill_start.tv_usec) / 1000000.0);
    // The buffer is sized for a perfectly even spread of objects over 100 PGs
    const uint64_t buf_cap = (total+99)/100;
    for (int pg = 0; pg < 100; pg++)
    {
        obj_ver_id* buf1 = (obj_ver_id*)malloc(sizeof(obj_ver_id) * buf_cap);
        if (!buf1)
        {
            fprintf(stderr, "failed to allocate filter buffer\n");
            return 1;
        }
        int j = 0;
        // Iterate by reference: copying every key/value pair would be measured too
        for (const auto & it: clean_db)
        {
            if ((it.first % 100) == (uint64_t)pg)
            {
                // Never write past the buffer if the distribution is uneven;
                // matches are still counted so the printed number stays exact
                if ((uint64_t)j < buf_cap)
                    buf1[j] = { .oid = it.first, .version = it.second.version };
                j++;
            }
        }
        free(buf1);
        printf("filtered %d\n", j);
    }
    gettimeofday(&filter_end, NULL);
    // spp = 42.15 sec / 60 sec (rand)
    // std::unordered_map = 43.7 sec
    // std::map = 156.13 sec
    // cpp-btree = 21.87 sec (seq), 44.33 sec (rand)
    printf("100 times filter %.2f sec\n", (filter_end.tv_sec - fill_end.tv_sec) + (filter_end.tv_usec - fill_end.tv_usec) / 1000000.0);
    return 0;
}