forked from vitalif/vitastor
Fix unstable flushing, include extra OSDs with old object versions in osd_set
parent
fd8e1a8418
commit
036f4c5bf3
|
@ -90,6 +90,10 @@ void osd_t::handle_flush_op(pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t osd
|
|||
it->first.oid.inode != prev_it->first.oid.inode ||
|
||||
(it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK))
|
||||
{
|
||||
pg.ver_override.erase((object_id){
|
||||
.inode = prev_it->first.oid.inode,
|
||||
.stripe = (prev_it->first.oid.stripe & ~STRIPE_MASK),
|
||||
});
|
||||
auto wr_it = pg.write_queue.find((object_id){
|
||||
.inode = prev_it->first.oid.inode,
|
||||
.stripe = (prev_it->first.oid.stripe & ~STRIPE_MASK),
|
||||
|
|
|
@ -23,20 +23,21 @@ struct obj_piece_ver_t
|
|||
{
|
||||
uint64_t max_ver = 0;
|
||||
uint64_t stable_ver = 0;
|
||||
uint64_t max_target = 0;
|
||||
};
|
||||
|
||||
struct pg_obj_state_check_t
|
||||
{
|
||||
pg_t *pg;
|
||||
int i;
|
||||
std::vector<obj_ver_role> list;
|
||||
int list_pos;
|
||||
int obj_start = 0, obj_end = 0, ver_start = 0, ver_end = 0;
|
||||
object_id oid = { 0 };
|
||||
uint64_t max_ver = 0;
|
||||
uint64_t last_ver = 0;
|
||||
uint64_t target_ver = 0;
|
||||
uint64_t n_copies = 0, has_roles = 0, n_roles = 0, n_stable = 0, n_mismatched = 0;
|
||||
bool is_buggy = false, has_old_unstable = false;
|
||||
uint64_t n_unstable = 0, n_buggy = 0;
|
||||
pg_osd_set_t osd_set;
|
||||
|
||||
void walk();
|
||||
|
@ -50,10 +51,10 @@ void pg_obj_state_check_t::walk()
|
|||
pg->clean_count = 0;
|
||||
pg->total_count = 0;
|
||||
pg->state = 0;
|
||||
for (i = 0; i < list.size(); i++)
|
||||
for (list_pos = 0; list_pos < list.size(); list_pos++)
|
||||
{
|
||||
if (oid.inode != list[i].oid.inode ||
|
||||
oid.stripe != (list[i].oid.stripe & ~STRIPE_MASK))
|
||||
if (oid.inode != list[list_pos].oid.inode ||
|
||||
oid.stripe != (list[list_pos].oid.stripe & ~STRIPE_MASK))
|
||||
{
|
||||
if (oid.inode != 0)
|
||||
{
|
||||
|
@ -76,45 +77,45 @@ void pg_obj_state_check_t::walk()
|
|||
|
||||
void pg_obj_state_check_t::start_object()
|
||||
{
|
||||
obj_start = i;
|
||||
oid = { .inode = list[i].oid.inode, .stripe = list[i].oid.stripe & ~STRIPE_MASK };
|
||||
last_ver = max_ver = list[i].version;
|
||||
obj_start = list_pos;
|
||||
oid = { .inode = list[list_pos].oid.inode, .stripe = list[list_pos].oid.stripe & ~STRIPE_MASK };
|
||||
last_ver = max_ver = list[list_pos].version;
|
||||
target_ver = 0;
|
||||
ver_start = i;
|
||||
ver_start = list_pos;
|
||||
has_roles = n_copies = n_roles = n_stable = n_mismatched = 0;
|
||||
is_buggy = false;
|
||||
n_unstable = n_buggy = 0;
|
||||
}
|
||||
|
||||
void pg_obj_state_check_t::handle_version()
|
||||
{
|
||||
if (!target_ver && last_ver != list[i].version && (n_stable > 0 || n_roles >= pg->pg_minsize))
|
||||
if (!target_ver && last_ver != list[list_pos].version && (n_stable > 0 || n_roles >= pg->pg_minsize))
|
||||
{
|
||||
// Version is either stable or recoverable
|
||||
target_ver = last_ver;
|
||||
ver_end = i;
|
||||
ver_end = list_pos;
|
||||
}
|
||||
if (!target_ver)
|
||||
{
|
||||
if (last_ver != list[i].version)
|
||||
if (last_ver != list[list_pos].version)
|
||||
{
|
||||
ver_start = i;
|
||||
ver_start = list_pos;
|
||||
has_roles = n_copies = n_roles = n_stable = n_mismatched = 0;
|
||||
last_ver = list[i].version;
|
||||
last_ver = list[list_pos].version;
|
||||
}
|
||||
int replica = (list[i].oid.stripe & STRIPE_MASK);
|
||||
int replica = (list[list_pos].oid.stripe & STRIPE_MASK);
|
||||
n_copies++;
|
||||
if (replica >= pg->pg_size)
|
||||
{
|
||||
// FIXME In the future, check it against the PG epoch number to handle replication factor/scheme changes
|
||||
is_buggy = true;
|
||||
n_buggy++;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (list[i].is_stable)
|
||||
if (list[list_pos].is_stable)
|
||||
{
|
||||
n_stable++;
|
||||
}
|
||||
if (pg->cur_set[replica] != list[i].osd_num)
|
||||
if (pg->cur_set[replica] != list[list_pos].osd_num)
|
||||
{
|
||||
n_mismatched++;
|
||||
}
|
||||
|
@ -125,9 +126,9 @@ void pg_obj_state_check_t::handle_version()
|
|||
}
|
||||
}
|
||||
}
|
||||
else if (!list[i].is_stable)
|
||||
if (!list[list_pos].is_stable)
|
||||
{
|
||||
has_old_unstable = true;
|
||||
n_unstable++;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -137,11 +138,17 @@ void pg_obj_state_check_t::finish_object()
|
|||
{
|
||||
// Version is either stable or recoverable
|
||||
target_ver = last_ver;
|
||||
ver_end = i;
|
||||
ver_end = list_pos;
|
||||
}
|
||||
obj_end = i;
|
||||
obj_end = list_pos;
|
||||
// Remember the decision
|
||||
uint64_t state = OBJ_CLEAN;
|
||||
if (n_buggy > 0)
|
||||
{
|
||||
state = OBJ_BUGGY;
|
||||
// FIXME: bring pg offline
|
||||
throw std::runtime_error("buggy object state");
|
||||
}
|
||||
if (target_ver > 0)
|
||||
{
|
||||
if (n_roles < pg->pg_minsize)
|
||||
|
@ -169,29 +176,51 @@ void pg_obj_state_check_t::finish_object()
|
|||
state |= OBJ_MISPLACED;
|
||||
pg->state = pg->state | PG_HAS_MISPLACED;
|
||||
}
|
||||
if (n_stable < n_copies)
|
||||
{
|
||||
state |= OBJ_NEEDS_STABLE;
|
||||
pg->state = pg->state | PG_HAS_UNCLEAN;
|
||||
}
|
||||
}
|
||||
if (target_ver < max_ver || has_old_unstable)
|
||||
{
|
||||
state |= OBJ_NEEDS_ROLLBACK;
|
||||
pg->state = pg->state | PG_HAS_UNCLEAN;
|
||||
}
|
||||
if (is_buggy)
|
||||
{
|
||||
state |= OBJ_BUGGY;
|
||||
// FIXME: bring pg offline
|
||||
throw std::runtime_error("buggy object state");
|
||||
}
|
||||
pg->total_count++;
|
||||
if (state == OBJ_CLEAN)
|
||||
{
|
||||
pg->clean_count++;
|
||||
}
|
||||
else
|
||||
if (n_unstable > 0)
|
||||
{
|
||||
pg->state |= PG_HAS_UNCLEAN;
|
||||
std::unordered_map<obj_piece_id_t, obj_piece_ver_t> pieces;
|
||||
for (int i = obj_start; i < obj_end; i++)
|
||||
{
|
||||
auto & pcs = pieces[(obj_piece_id_t){ .oid = list[i].oid, .osd_num = list[i].osd_num }];
|
||||
if (!pcs.max_ver)
|
||||
{
|
||||
pcs.max_ver = list[i].version;
|
||||
}
|
||||
if (list[i].is_stable && !pcs.stable_ver)
|
||||
{
|
||||
pcs.stable_ver = list[i].version;
|
||||
}
|
||||
if (list[i].version <= target_ver)
|
||||
{
|
||||
pcs.max_target = list[i].version;
|
||||
}
|
||||
}
|
||||
for (auto pp: pieces)
|
||||
{
|
||||
auto & pcs = pp.second;
|
||||
if (pcs.stable_ver < pcs.max_ver)
|
||||
{
|
||||
auto & act = pg->flush_actions[pp.first];
|
||||
// osd_set doesn't include rollback/stable states, so don't include them in the state code either
|
||||
if (pcs.max_ver > target_ver)
|
||||
{
|
||||
//state |= OBJ_NEEDS_ROLLBACK;
|
||||
act.rollback = true;
|
||||
act.rollback_to = pcs.max_target;
|
||||
}
|
||||
if (pcs.stable_ver < (pcs.max_ver > target_ver ? pcs.max_target : pcs.max_ver))
|
||||
{
|
||||
//state |= OBJ_NEEDS_STABLE;
|
||||
act.make_stable = true;
|
||||
act.stable_to = pcs.max_ver > target_ver ? pcs.max_target : pcs.max_ver;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (state != OBJ_CLEAN || ver_end < obj_end)
|
||||
{
|
||||
osd_set.clear();
|
||||
for (int i = ver_start; i < ver_end; i++)
|
||||
|
@ -199,10 +228,45 @@ void pg_obj_state_check_t::finish_object()
|
|||
osd_set.push_back((pg_obj_loc_t){
|
||||
.role = (list[i].oid.stripe & STRIPE_MASK),
|
||||
.osd_num = list[i].osd_num,
|
||||
.stable = list[i].is_stable,
|
||||
.outdated = false,
|
||||
});
|
||||
}
|
||||
std::sort(osd_set.begin(), osd_set.end());
|
||||
}
|
||||
if (ver_end < obj_end)
|
||||
{
|
||||
// Check for outdated versions not present in the current target OSD set
|
||||
for (int i = ver_end; i < obj_end; i++)
|
||||
{
|
||||
int j;
|
||||
for (j = 0; j < osd_set.size(); j++)
|
||||
{
|
||||
if (osd_set[j].osd_num == list[i].osd_num)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (j >= osd_set.size() && pg->cur_set[list[i].oid.stripe & STRIPE_MASK] != list[i].osd_num)
|
||||
{
|
||||
osd_set.push_back((pg_obj_loc_t){
|
||||
.role = (list[i].oid.stripe & STRIPE_MASK),
|
||||
.osd_num = list[i].osd_num,
|
||||
.outdated = true,
|
||||
});
|
||||
state |= OBJ_MISPLACED;
|
||||
pg->state = pg->state | PG_HAS_MISPLACED;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (target_ver < max_ver)
|
||||
{
|
||||
pg->ver_override[oid] = target_ver;
|
||||
}
|
||||
if (state == OBJ_CLEAN)
|
||||
{
|
||||
pg->clean_count++;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto it = pg->state_dict.find(osd_set);
|
||||
if (it == pg->state_dict.end())
|
||||
{
|
||||
|
@ -213,9 +277,12 @@ void pg_obj_state_check_t::finish_object()
|
|||
read_target[i] = 0;
|
||||
}
|
||||
for (auto & o: osd_set)
|
||||
{
|
||||
if (!o.outdated)
|
||||
{
|
||||
read_target[o.role] = o.osd_num;
|
||||
}
|
||||
}
|
||||
pg->state_dict[osd_set] = {
|
||||
.read_target = read_target,
|
||||
.osd_set = osd_set,
|
||||
|
@ -229,49 +296,6 @@ void pg_obj_state_check_t::finish_object()
|
|||
it->second.object_count++;
|
||||
}
|
||||
pg->obj_states[oid] = &it->second;
|
||||
if (target_ver < max_ver)
|
||||
{
|
||||
pg->ver_override[oid] = target_ver;
|
||||
}
|
||||
if (state & (OBJ_NEEDS_ROLLBACK | OBJ_NEEDS_STABLE))
|
||||
{
|
||||
std::unordered_map<obj_piece_id_t, obj_piece_ver_t> pieces;
|
||||
for (int i = obj_start; i < obj_end; i++)
|
||||
{
|
||||
auto & pcs = pieces[(obj_piece_id_t){ .oid = list[i].oid, .osd_num = list[i].osd_num }];
|
||||
if (!pcs.max_ver)
|
||||
{
|
||||
pcs.max_ver = list[i].version;
|
||||
}
|
||||
if (list[i].is_stable && !pcs.stable_ver)
|
||||
{
|
||||
pcs.stable_ver = list[i].version;
|
||||
}
|
||||
}
|
||||
for (auto pp: pieces)
|
||||
{
|
||||
auto & pcs = pp.second;
|
||||
if (pcs.stable_ver < pcs.max_ver)
|
||||
{
|
||||
auto & act = pg->flush_actions[pp.first];
|
||||
if (pcs.max_ver > target_ver)
|
||||
{
|
||||
act.rollback = true;
|
||||
act.rollback_to = target_ver;
|
||||
}
|
||||
else if (pcs.max_ver < target_ver && pcs.stable_ver < pcs.max_ver)
|
||||
{
|
||||
act.rollback = true;
|
||||
act.rollback_to = pcs.stable_ver;
|
||||
}
|
||||
if (pcs.max_ver >= target_ver && pcs.stable_ver < target_ver)
|
||||
{
|
||||
act.make_stable = true;
|
||||
act.stable_to = target_ver;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -37,7 +37,7 @@ struct pg_obj_loc_t
|
|||
{
|
||||
uint64_t role;
|
||||
osd_num_t osd_num;
|
||||
bool stable;
|
||||
bool outdated;
|
||||
};
|
||||
|
||||
typedef std::vector<pg_obj_loc_t> pg_osd_set_t;
|
||||
|
@ -121,8 +121,9 @@ struct pg_t
|
|||
|
||||
inline bool operator < (const pg_obj_loc_t &a, const pg_obj_loc_t &b)
|
||||
{
|
||||
return a.role < b.role || a.role == b.role && a.osd_num < b.osd_num ||
|
||||
a.role == b.role && a.osd_num == b.osd_num && a.stable < b.stable;
|
||||
return a.role < b.role ||
|
||||
a.role == b.role && a.outdated < b.outdated ||
|
||||
a.role == b.role && a.outdated == b.outdated && a.osd_num < b.osd_num;
|
||||
}
|
||||
|
||||
inline bool operator == (const obj_piece_id_t & a, const obj_piece_id_t & b)
|
||||
|
@ -147,7 +148,6 @@ namespace std
|
|||
// Copy-pasted from spp::hash_combine()
|
||||
seed ^= (e.role + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
|
||||
seed ^= (e.osd_num + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
|
||||
seed ^= ((e.stable ? 1 : 0) + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
|
||||
}
|
||||
return seed;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue