forked from vitalif/vitastor
Extract object state calculation to a separate file and slightly test it
parent
d2a3f0c6dd
commit
98efdb78bd
@ -0,0 +1,162 @@
|
||||
#include "osd_peering_pg.h"
|
||||
|
||||
// Records the decision made about one object after its listing entries were examined.
//
// st  - counters collected for the object (roles, copies, stable copies, versions)
// all - merged object listing; entries [st.start, end) are the ones stored as the
//       object's location set
// end - index one past the last entry to store
//
// The base state is exactly one of OBJ_CLEAN / OBJ_MISPLACED / OBJ_DEGRADED /
// OBJ_INCOMPLETE; the OBJ_OVERCOPIED / OBJ_NONSTABILIZED / OBJ_UNDERWRITTEN /
// OBJ_BUGGY flags are OR-ed on top. Objects whose resulting state is exactly
// OBJ_CLEAN are only counted in clean_count; all others get an entry in
// obj_states pointing to a shared state_dict record keyed by their location set.
void pg_t::remember_object(pg_obj_state_check_t &st, std::vector<obj_ver_role> &all, int end)
{
    // Base state depends on how many distinct roles were found
    uint64_t obj_state;
    if (st.n_roles != pg_size)
        obj_state = (st.n_roles < pg_minsize ? OBJ_INCOMPLETE : OBJ_DEGRADED);
    else
        obj_state = (st.n_matched == pg_size ? OBJ_CLEAN : OBJ_MISPLACED);
    // Additional flags
    if (st.n_copies > pg_size)
        obj_state |= OBJ_OVERCOPIED;
    if (st.n_stable < st.n_copies)
        obj_state |= OBJ_NONSTABILIZED;
    if (st.target_ver < st.max_ver)
        obj_state |= OBJ_UNDERWRITTEN;
    if (st.is_buggy)
        obj_state |= OBJ_BUGGY;
    if (obj_state == OBJ_CLEAN)
    {
        // Clean objects are not tracked individually
        clean_count++;
        return;
    }
    // Collect the (role, osd, stable) triples of the object in sorted order
    st.osd_set.clear();
    for (int pos = st.start; pos < end; pos++)
    {
        const obj_ver_role & entry = all[pos];
        st.osd_set.push_back((pg_obj_loc_t){
            .role = (entry.oid.stripe & STRIPE_MASK),
            .osd_num = entry.osd_num,
            .stable = entry.is_stable,
        });
    }
    std::sort(st.osd_set.begin(), st.osd_set.end());
    // Objects with the same location set share one state_dict record
    auto dict_it = state_dict.find(st.osd_set);
    if (dict_it != state_dict.end())
    {
        dict_it->second.object_count++;
    }
    else
    {
        state_dict[st.osd_set] = {
            .osd_set = st.osd_set,
            .state = obj_state,
            .object_count = 1,
        };
        dict_it = state_dict.find(st.osd_set);
    }
    obj_states[st.oid] = &dict_it->second;
    if (obj_state & OBJ_UNDERWRITTEN)
    {
        // Remember both the newest seen version and the version chosen as target
        ver_override[st.oid] = {
            .max_ver = st.max_ver,
            .target_ver = st.target_ver,
        };
    }
}
|
||||
|
||||
void pg_t::calc_object_states()
|
||||
{
|
||||
auto & pg = *this;
|
||||
// Copy all object lists into one array
|
||||
std::vector<obj_ver_role> all;
|
||||
auto ps = pg.peering_state;
|
||||
for (auto it: ps->list_results)
|
||||
{
|
||||
auto nstab = it.second.stable_count;
|
||||
auto n = it.second.total_count;
|
||||
auto osd_num = it.first;
|
||||
uint64_t start = all.size();
|
||||
all.resize(start + n);
|
||||
obj_ver_id *ov = it.second.buf;
|
||||
for (uint64_t i = 0; i < n; i++, ov++)
|
||||
{
|
||||
all[start+i] = {
|
||||
.oid = ov->oid,
|
||||
.version = ov->version,
|
||||
.osd_num = osd_num,
|
||||
.is_stable = i < nstab,
|
||||
};
|
||||
}
|
||||
free(it.second.buf);
|
||||
it.second.buf = NULL;
|
||||
}
|
||||
ps->list_results.clear();
|
||||
// Sort
|
||||
std::sort(all.begin(), all.end());
|
||||
// Walk over it and check object states
|
||||
pg.clean_count = 0;
|
||||
int replica = 0;
|
||||
pg_obj_state_check_t st;
|
||||
for (int i = 0; i < all.size(); i++)
|
||||
{
|
||||
if (st.oid.inode != all[i].oid.inode ||
|
||||
st.oid.stripe != (all[i].oid.stripe >> STRIPE_SHIFT))
|
||||
{
|
||||
if (st.oid.inode != 0)
|
||||
{
|
||||
// Remember object state
|
||||
remember_object(st, all, i);
|
||||
}
|
||||
st.start = i;
|
||||
st.oid = { .inode = all[i].oid.inode, .stripe = all[i].oid.stripe >> STRIPE_SHIFT };
|
||||
st.max_ver = st.target_ver = all[i].version;
|
||||
st.has_roles = st.n_copies = st.n_roles = st.n_stable = st.n_matched = 0;
|
||||
st.is_buggy = false;
|
||||
}
|
||||
if (st.target_ver != all[i].version)
|
||||
{
|
||||
if (st.n_stable > 0 || st.n_roles >= pg.pg_minsize)
|
||||
{
|
||||
// Version is either recoverable or stable, choose it as target and skip previous versions
|
||||
remember_object(st, all, i);
|
||||
while (i < all.size() && st.oid.inode == all[i].oid.inode &&
|
||||
st.oid.stripe == (all[i].oid.stripe >> STRIPE_SHIFT))
|
||||
{
|
||||
i++;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Remember that there are newer unrecoverable versions
|
||||
st.target_ver = all[i].version;
|
||||
st.has_roles = st.n_copies = st.n_roles = st.n_stable = st.n_matched = 0;
|
||||
}
|
||||
}
|
||||
replica = (all[i].oid.stripe & STRIPE_MASK);
|
||||
st.n_copies++;
|
||||
if (replica >= pg.pg_size)
|
||||
{
|
||||
// FIXME In the future, check it against the PG epoch number to handle replication factor/scheme changes
|
||||
st.is_buggy = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (all[i].is_stable)
|
||||
{
|
||||
st.n_stable++;
|
||||
}
|
||||
if (pg.target_set[replica] == all[i].osd_num)
|
||||
{
|
||||
st.n_matched++;
|
||||
}
|
||||
if (!(st.has_roles & (1 << replica)))
|
||||
{
|
||||
st.has_roles = st.has_roles | (1 << replica);
|
||||
st.n_roles++;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (st.oid.inode != 0)
|
||||
{
|
||||
// Remember object state
|
||||
remember_object(st, all, all.size());
|
||||
}
|
||||
}
|
@ -0,0 +1,142 @@
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
#include "object_id.h"
|
||||
|
||||
#include "sparsepp/sparsepp/spp.h"
|
||||
|
||||
// Placement group states
// Exactly one of these:
#define PG_OFFLINE (1 << 0)
#define PG_PEERING (1 << 1)
#define PG_INCOMPLETE (1 << 2)
#define PG_ACTIVE (1 << 3)
// Plus any of these:
#define PG_HAS_UNFOUND (1 << 4)
#define PG_HAS_DEGRADED (1 << 5)
#define PG_HAS_MISPLACED (1 << 6)

// OSD object states
// Base state (a plain value, not a bit mask: OBJ_DEGRADED == 0x03)
#define OBJ_CLEAN 0x01
#define OBJ_MISPLACED 0x02
#define OBJ_DEGRADED 0x03
#define OBJ_INCOMPLETE 0x04
// Flags which are OR-ed onto the base state
#define OBJ_NONSTABILIZED 0x10000
#define OBJ_UNDERWRITTEN 0x20000
#define OBJ_OVERCOPIED 0x40000
#define OBJ_BUGGY 0x80000

// Max 64 replicas: the low STRIPE_SHIFT bits of a stripe number hold the role
#define STRIPE_SHIFT 6
#define STRIPE_MASK ((1 << STRIPE_SHIFT) - 1)
|
||||
|
||||
// One copy of an object: which role it has and which OSD holds it.
// Member order matters: the struct is filled with designated initializers.
struct pg_obj_loc_t
{
    // Role of the copy (low STRIPE_MASK bits of the stripe number)
    uint64_t role;
    // Number of the OSD holding the copy
    uint64_t osd_num;
    // Whether the listing reported the copy as stable
    bool stable;
};

// Full set of locations of one object
using pg_osd_set_t = std::vector<pg_obj_loc_t>;
|
||||
|
||||
struct pg_osd_set_state_t
|
||||
{
|
||||
pg_osd_set_t osd_set;
|
||||
uint64_t state = 0;
|
||||
uint64_t object_count = 0;
|
||||
};
|
||||
|
||||
// Version information kept for objects flagged OBJ_UNDERWRITTEN.
// Member order matters: the struct is filled with designated initializers.
struct pg_ver_override_t
{
    // Newest version seen for the object in the listings
    uint64_t max_ver;
    // Version chosen as the target one (lower than max_ver for such objects)
    uint64_t target_ver;
};
|
||||
|
||||
struct pg_list_result_t
|
||||
{
|
||||
obj_ver_id *buf;
|
||||
uint64_t total_count;
|
||||
uint64_t stable_count;
|
||||
};
|
||||
|
||||
struct pg_peering_state_t
|
||||
{
|
||||
// osd_num -> list result
|
||||
spp::sparse_hash_map<uint64_t, pg_list_result_t> list_results;
|
||||
int list_done = 0;
|
||||
};
|
||||
|
||||
struct pg_obj_state_check_t
|
||||
{
|
||||
int start = 0;
|
||||
object_id oid = { 0 };
|
||||
uint64_t max_ver = 0;
|
||||
uint64_t target_ver = 0;
|
||||
uint64_t n_copies = 0, has_roles = 0, n_roles = 0, n_stable = 0, n_matched = 0;
|
||||
bool is_buggy = false;
|
||||
pg_osd_set_t osd_set;
|
||||
};
|
||||
|
||||
struct obj_ver_role
|
||||
{
|
||||
object_id oid;
|
||||
uint64_t version;
|
||||
uint64_t osd_num;
|
||||
bool is_stable;
|
||||
};
|
||||
|
||||
struct pg_t
|
||||
{
|
||||
int state;
|
||||
uint64_t pg_size = 3, pg_minsize = 2;
|
||||
uint64_t pg_num;
|
||||
uint64_t clean_count = 0;
|
||||
// target_set = (role => osd_num). role starts from zero
|
||||
std::vector<uint64_t> target_set;
|
||||
// moved object map. by default, each object is considered to reside on the target_set.
|
||||
// this map stores all objects that differ.
|
||||
// this map may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario
|
||||
// which is up to ~192 MB per 1 TB in the worst case scenario
|
||||
std::map<pg_osd_set_t, pg_osd_set_state_t> state_dict;
|
||||
spp::sparse_hash_map<object_id, pg_osd_set_state_t*> obj_states;
|
||||
spp::sparse_hash_map<object_id, pg_ver_override_t> ver_override;
|
||||
pg_peering_state_t *peering_state = NULL;
|
||||
|
||||
void calc_object_states();
|
||||
void remember_object(pg_obj_state_check_t &st, std::vector<obj_ver_role> &all, int end);
|
||||
};
|
||||
|
||||
inline bool operator < (const pg_obj_loc_t &a, const pg_obj_loc_t &b)
|
||||
{
|
||||
return a.role < b.role || a.role == b.role && a.osd_num < b.osd_num ||
|
||||
a.role == b.role && a.osd_num == b.osd_num && a.stable < b.stable;
|
||||
}
|
||||
|
||||
inline bool operator < (const obj_ver_role & a, const obj_ver_role & b)
|
||||
{
|
||||
return a.oid < b.oid ||
|
||||
// object versions come in descending order
|
||||
a.oid == b.oid && a.version > b.version ||
|
||||
a.oid == b.oid && a.version == b.version ||
|
||||
a.oid == b.oid && a.version == b.version && a.osd_num < b.osd_num;
|
||||
}
|
||||
|
||||
namespace std
|
||||
{
|
||||
template<> struct hash<pg_osd_set_t>
|
||||
{
|
||||
inline size_t operator()(const pg_osd_set_t &s) const
|
||||
{
|
||||
size_t seed = 0;
|
||||
for (auto e: s)
|
||||
{
|
||||
// Copy-pasted from spp::hash_combine()
|
||||
seed ^= (e.role + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
|
||||
seed ^= (e.osd_num + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
|
||||
seed ^= ((e.stable ? 1 : 0) + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
|
||||
}
|
||||
return seed;
|
||||
}
|
||||
};
|
||||
}
|
Loading…
Reference in New Issue