2020-01-24 02:23:27 +03:00
|
|
|
#include "osd_peering_pg.h"
|
|
|
|
|
2020-03-23 00:39:53 +03:00
|
|
|
struct obj_ver_role
|
2020-01-24 02:23:27 +03:00
|
|
|
{
|
2020-03-23 00:39:53 +03:00
|
|
|
object_id oid;
|
|
|
|
uint64_t version;
|
|
|
|
uint64_t osd_num;
|
|
|
|
bool is_stable;
|
|
|
|
};
|
|
|
|
|
|
|
|
inline bool operator < (const obj_ver_role & a, const obj_ver_role & b)
|
|
|
|
{
|
2020-03-24 01:13:04 +03:00
|
|
|
// ORDER BY inode ASC, stripe & ~STRIPE_MASK ASC, version DESC, role ASC, osd_num ASC
|
2020-03-23 00:39:53 +03:00
|
|
|
return a.oid.inode < b.oid.inode || a.oid.inode == b.oid.inode && (
|
|
|
|
(a.oid.stripe & ~STRIPE_MASK) < (b.oid.stripe & ~STRIPE_MASK) ||
|
|
|
|
(a.oid.stripe & ~STRIPE_MASK) == (b.oid.stripe & ~STRIPE_MASK) && (
|
2020-03-24 01:13:04 +03:00
|
|
|
a.version > b.version ||
|
|
|
|
a.version == b.version && (
|
|
|
|
a.oid.stripe < b.oid.stripe ||
|
|
|
|
a.oid.stripe == b.oid.stripe && a.osd_num < b.osd_num
|
|
|
|
)
|
2020-03-23 00:39:53 +03:00
|
|
|
)
|
|
|
|
);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Version summary collected for one object piece while deciding
// which rollback / stabilize actions are required. Zero means "not seen yet".
struct obj_piece_ver_t
{
    // First version seen for the piece
    uint64_t max_ver = 0;
    // First stable version seen for the piece
    uint64_t stable_ver = 0;
    // First version seen that does not exceed the object's target version
    uint64_t max_target = 0;
};
|
|
|
|
|
|
|
|
struct pg_obj_state_check_t
|
|
|
|
{
|
|
|
|
pg_t *pg;
|
|
|
|
std::vector<obj_ver_role> list;
|
2020-03-23 15:44:29 +03:00
|
|
|
int list_pos;
|
2020-03-23 00:39:53 +03:00
|
|
|
int obj_start = 0, obj_end = 0, ver_start = 0, ver_end = 0;
|
|
|
|
object_id oid = { 0 };
|
|
|
|
uint64_t max_ver = 0;
|
|
|
|
uint64_t last_ver = 0;
|
|
|
|
uint64_t target_ver = 0;
|
|
|
|
uint64_t n_copies = 0, has_roles = 0, n_roles = 0, n_stable = 0, n_mismatched = 0;
|
2020-03-23 15:44:29 +03:00
|
|
|
uint64_t n_unstable = 0, n_buggy = 0;
|
2020-03-23 00:39:53 +03:00
|
|
|
pg_osd_set_t osd_set;
|
2020-04-27 14:32:59 +03:00
|
|
|
int log_level;
|
2020-03-23 00:39:53 +03:00
|
|
|
|
|
|
|
void walk();
|
|
|
|
void start_object();
|
|
|
|
void handle_version();
|
|
|
|
void finish_object();
|
|
|
|
};
|
|
|
|
|
|
|
|
void pg_obj_state_check_t::walk()
|
|
|
|
{
|
|
|
|
pg->clean_count = 0;
|
|
|
|
pg->total_count = 0;
|
|
|
|
pg->state = 0;
|
2020-03-23 15:44:29 +03:00
|
|
|
for (list_pos = 0; list_pos < list.size(); list_pos++)
|
2020-03-23 00:39:53 +03:00
|
|
|
{
|
2020-03-23 15:44:29 +03:00
|
|
|
if (oid.inode != list[list_pos].oid.inode ||
|
|
|
|
oid.stripe != (list[list_pos].oid.stripe & ~STRIPE_MASK))
|
2020-03-23 00:39:53 +03:00
|
|
|
{
|
|
|
|
if (oid.inode != 0)
|
|
|
|
{
|
|
|
|
finish_object();
|
|
|
|
}
|
|
|
|
start_object();
|
|
|
|
}
|
|
|
|
handle_version();
|
|
|
|
}
|
|
|
|
if (oid.inode != 0)
|
|
|
|
{
|
|
|
|
finish_object();
|
|
|
|
}
|
|
|
|
if (pg->pg_cursize < pg->pg_size)
|
|
|
|
{
|
2020-05-04 01:32:24 +03:00
|
|
|
pg->state |= PG_DEGRADED;
|
|
|
|
}
|
|
|
|
pg->state |= PG_ACTIVE;
|
|
|
|
if (pg->state == PG_ACTIVE && pg->cur_peers.size() < pg->all_peers.size())
|
|
|
|
{
|
|
|
|
pg->state |= PG_LEFT_ON_DEAD;
|
2020-03-23 00:39:53 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void pg_obj_state_check_t::start_object()
|
|
|
|
{
|
2020-03-23 15:44:29 +03:00
|
|
|
obj_start = list_pos;
|
|
|
|
oid = { .inode = list[list_pos].oid.inode, .stripe = list[list_pos].oid.stripe & ~STRIPE_MASK };
|
|
|
|
last_ver = max_ver = list[list_pos].version;
|
2020-03-23 00:39:53 +03:00
|
|
|
target_ver = 0;
|
2020-03-23 15:44:29 +03:00
|
|
|
ver_start = list_pos;
|
2020-03-23 00:39:53 +03:00
|
|
|
has_roles = n_copies = n_roles = n_stable = n_mismatched = 0;
|
2020-03-23 15:44:29 +03:00
|
|
|
n_unstable = n_buggy = 0;
|
2020-03-23 00:39:53 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void pg_obj_state_check_t::handle_version()
|
|
|
|
{
|
2020-03-23 15:44:29 +03:00
|
|
|
if (!target_ver && last_ver != list[list_pos].version && (n_stable > 0 || n_roles >= pg->pg_minsize))
|
2020-03-23 00:39:53 +03:00
|
|
|
{
|
|
|
|
// Version is either stable or recoverable
|
|
|
|
target_ver = last_ver;
|
2020-03-23 15:44:29 +03:00
|
|
|
ver_end = list_pos;
|
2020-03-23 00:39:53 +03:00
|
|
|
}
|
|
|
|
if (!target_ver)
|
|
|
|
{
|
2020-03-23 15:44:29 +03:00
|
|
|
if (last_ver != list[list_pos].version)
|
2020-03-23 00:39:53 +03:00
|
|
|
{
|
2020-03-23 15:44:29 +03:00
|
|
|
ver_start = list_pos;
|
2020-03-23 00:39:53 +03:00
|
|
|
has_roles = n_copies = n_roles = n_stable = n_mismatched = 0;
|
2020-03-23 15:44:29 +03:00
|
|
|
last_ver = list[list_pos].version;
|
2020-03-23 00:39:53 +03:00
|
|
|
}
|
2020-03-23 15:44:29 +03:00
|
|
|
int replica = (list[list_pos].oid.stripe & STRIPE_MASK);
|
2020-03-23 00:39:53 +03:00
|
|
|
n_copies++;
|
|
|
|
if (replica >= pg->pg_size)
|
|
|
|
{
|
2020-03-23 15:44:29 +03:00
|
|
|
n_buggy++;
|
2020-03-23 00:39:53 +03:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2020-03-23 15:44:29 +03:00
|
|
|
if (list[list_pos].is_stable)
|
2020-03-23 00:39:53 +03:00
|
|
|
{
|
|
|
|
n_stable++;
|
|
|
|
}
|
2020-03-23 15:44:29 +03:00
|
|
|
if (pg->cur_set[replica] != list[list_pos].osd_num)
|
2020-03-23 00:39:53 +03:00
|
|
|
{
|
|
|
|
n_mismatched++;
|
|
|
|
}
|
|
|
|
if (!(has_roles & (1 << replica)))
|
|
|
|
{
|
|
|
|
has_roles = has_roles | (1 << replica);
|
|
|
|
n_roles++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2020-03-23 15:44:29 +03:00
|
|
|
if (!list[list_pos].is_stable)
|
2020-03-23 00:39:53 +03:00
|
|
|
{
|
2020-03-23 15:44:29 +03:00
|
|
|
n_unstable++;
|
2020-03-23 00:39:53 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void pg_obj_state_check_t::finish_object()
|
|
|
|
{
|
|
|
|
if (!target_ver && (n_stable > 0 || n_roles >= pg->pg_minsize))
|
|
|
|
{
|
|
|
|
// Version is either stable or recoverable
|
|
|
|
target_ver = last_ver;
|
2020-03-23 15:44:29 +03:00
|
|
|
ver_end = list_pos;
|
2020-03-23 00:39:53 +03:00
|
|
|
}
|
2020-03-23 15:44:29 +03:00
|
|
|
obj_end = list_pos;
|
2020-01-24 02:23:27 +03:00
|
|
|
// Remember the decision
|
2020-03-24 01:13:04 +03:00
|
|
|
uint64_t state = 0;
|
2020-03-23 15:44:29 +03:00
|
|
|
if (n_buggy > 0)
|
|
|
|
{
|
|
|
|
state = OBJ_BUGGY;
|
|
|
|
// FIXME: bring pg offline
|
|
|
|
throw std::runtime_error("buggy object state");
|
|
|
|
}
|
|
|
|
if (n_unstable > 0)
|
|
|
|
{
|
|
|
|
pg->state |= PG_HAS_UNCLEAN;
|
|
|
|
std::unordered_map<obj_piece_id_t, obj_piece_ver_t> pieces;
|
|
|
|
for (int i = obj_start; i < obj_end; i++)
|
|
|
|
{
|
|
|
|
auto & pcs = pieces[(obj_piece_id_t){ .oid = list[i].oid, .osd_num = list[i].osd_num }];
|
|
|
|
if (!pcs.max_ver)
|
|
|
|
{
|
|
|
|
pcs.max_ver = list[i].version;
|
|
|
|
}
|
|
|
|
if (list[i].is_stable && !pcs.stable_ver)
|
|
|
|
{
|
|
|
|
pcs.stable_ver = list[i].version;
|
|
|
|
}
|
2020-05-23 15:01:47 +03:00
|
|
|
if (list[i].version <= target_ver && !pcs.max_target)
|
2020-03-23 15:44:29 +03:00
|
|
|
{
|
|
|
|
pcs.max_target = list[i].version;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (auto pp: pieces)
|
2020-03-05 20:58:52 +03:00
|
|
|
{
|
2020-03-23 15:44:29 +03:00
|
|
|
auto & pcs = pp.second;
|
|
|
|
if (pcs.stable_ver < pcs.max_ver)
|
|
|
|
{
|
|
|
|
auto & act = pg->flush_actions[pp.first];
|
|
|
|
// osd_set doesn't include rollback/stable states, so don't include them in the state code either
|
|
|
|
if (pcs.max_ver > target_ver)
|
|
|
|
{
|
|
|
|
act.rollback = true;
|
|
|
|
act.rollback_to = pcs.max_target;
|
|
|
|
}
|
|
|
|
if (pcs.stable_ver < (pcs.max_ver > target_ver ? pcs.max_target : pcs.max_ver))
|
|
|
|
{
|
|
|
|
act.make_stable = true;
|
|
|
|
act.stable_to = pcs.max_ver > target_ver ? pcs.max_target : pcs.max_ver;
|
|
|
|
}
|
|
|
|
}
|
2020-03-05 20:58:52 +03:00
|
|
|
}
|
2020-01-27 01:34:24 +03:00
|
|
|
}
|
2020-03-24 01:13:04 +03:00
|
|
|
if (!target_ver)
|
|
|
|
{
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (n_roles < pg->pg_minsize)
|
|
|
|
{
|
2020-04-27 14:32:59 +03:00
|
|
|
if (log_level > 1)
|
2020-03-24 01:13:04 +03:00
|
|
|
{
|
2020-04-27 14:32:59 +03:00
|
|
|
printf("Object is incomplete: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
|
2020-03-31 17:50:50 +03:00
|
|
|
}
|
2020-03-24 01:13:04 +03:00
|
|
|
state = OBJ_INCOMPLETE;
|
|
|
|
pg->state = pg->state | PG_HAS_INCOMPLETE;
|
|
|
|
}
|
|
|
|
else if (n_roles < pg->pg_cursize)
|
|
|
|
{
|
2020-04-27 14:32:59 +03:00
|
|
|
if (log_level > 1)
|
2020-03-24 01:13:04 +03:00
|
|
|
{
|
2020-04-27 14:32:59 +03:00
|
|
|
printf("Object is degraded: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
|
2020-05-23 14:48:54 +03:00
|
|
|
}
|
2020-03-24 01:13:04 +03:00
|
|
|
state = OBJ_DEGRADED;
|
|
|
|
pg->state = pg->state | PG_HAS_DEGRADED;
|
|
|
|
}
|
|
|
|
if (n_mismatched > 0)
|
|
|
|
{
|
2020-06-02 18:44:23 +03:00
|
|
|
if (n_roles >= pg->pg_cursize && log_level > 1)
|
|
|
|
{
|
|
|
|
printf("Object is misplaced: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
|
|
|
|
}
|
2020-03-24 01:13:04 +03:00
|
|
|
state |= OBJ_MISPLACED;
|
|
|
|
pg->state = pg->state | PG_HAS_MISPLACED;
|
|
|
|
}
|
2020-06-02 18:44:23 +03:00
|
|
|
if (log_level > 1 && (n_roles < pg->pg_cursize || n_mismatched > 0))
|
|
|
|
{
|
|
|
|
if (log_level > 2)
|
|
|
|
{
|
|
|
|
for (int i = obj_start; i < obj_end; i++)
|
|
|
|
{
|
|
|
|
printf("v%lu present on: osd %lu, role %ld%s\n", list[i].version, list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
for (int i = ver_start; i < ver_end; i++)
|
|
|
|
{
|
|
|
|
printf("Target version present on: osd %lu, role %ld%s\n", list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2020-03-24 01:13:04 +03:00
|
|
|
pg->total_count++;
|
|
|
|
if (state != 0 || ver_end < obj_end)
|
2020-01-27 01:34:24 +03:00
|
|
|
{
|
2020-03-23 15:44:29 +03:00
|
|
|
osd_set.clear();
|
|
|
|
for (int i = ver_start; i < ver_end; i++)
|
|
|
|
{
|
|
|
|
osd_set.push_back((pg_obj_loc_t){
|
|
|
|
.role = (list[i].oid.stripe & STRIPE_MASK),
|
|
|
|
.osd_num = list[i].osd_num,
|
|
|
|
.outdated = false,
|
|
|
|
});
|
|
|
|
}
|
2020-01-27 01:34:24 +03:00
|
|
|
}
|
2020-03-23 15:44:29 +03:00
|
|
|
if (ver_end < obj_end)
|
2020-01-27 01:34:24 +03:00
|
|
|
{
|
2020-03-23 15:44:29 +03:00
|
|
|
// Check for outdated versions not present in the current target OSD set
|
|
|
|
for (int i = ver_end; i < obj_end; i++)
|
|
|
|
{
|
|
|
|
int j;
|
|
|
|
for (j = 0; j < osd_set.size(); j++)
|
|
|
|
{
|
|
|
|
if (osd_set[j].osd_num == list[i].osd_num)
|
|
|
|
{
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (j >= osd_set.size() && pg->cur_set[list[i].oid.stripe & STRIPE_MASK] != list[i].osd_num)
|
|
|
|
{
|
|
|
|
osd_set.push_back((pg_obj_loc_t){
|
|
|
|
.role = (list[i].oid.stripe & STRIPE_MASK),
|
|
|
|
.osd_num = list[i].osd_num,
|
|
|
|
.outdated = true,
|
|
|
|
});
|
|
|
|
state |= OBJ_MISPLACED;
|
|
|
|
pg->state = pg->state | PG_HAS_MISPLACED;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (target_ver < max_ver)
|
|
|
|
{
|
|
|
|
pg->ver_override[oid] = target_ver;
|
2020-01-27 01:34:24 +03:00
|
|
|
}
|
2020-03-24 01:13:04 +03:00
|
|
|
if (state == 0)
|
2020-03-05 20:58:52 +03:00
|
|
|
{
|
2020-03-23 00:39:53 +03:00
|
|
|
pg->clean_count++;
|
2020-03-05 20:58:52 +03:00
|
|
|
}
|
|
|
|
else
|
2020-01-24 02:23:27 +03:00
|
|
|
{
|
2020-03-23 00:39:53 +03:00
|
|
|
auto it = pg->state_dict.find(osd_set);
|
|
|
|
if (it == pg->state_dict.end())
|
2020-01-24 02:23:27 +03:00
|
|
|
{
|
2020-02-02 00:05:56 +03:00
|
|
|
std::vector<uint64_t> read_target;
|
2020-03-23 00:39:53 +03:00
|
|
|
read_target.resize(pg->pg_size);
|
|
|
|
for (int i = 0; i < pg->pg_size; i++)
|
2020-02-02 00:05:56 +03:00
|
|
|
{
|
|
|
|
read_target[i] = 0;
|
|
|
|
}
|
2020-03-23 00:39:53 +03:00
|
|
|
for (auto & o: osd_set)
|
2020-02-02 00:05:56 +03:00
|
|
|
{
|
2020-03-23 15:44:29 +03:00
|
|
|
if (!o.outdated)
|
|
|
|
{
|
|
|
|
read_target[o.role] = o.osd_num;
|
|
|
|
}
|
2020-02-02 00:05:56 +03:00
|
|
|
}
|
2020-03-23 00:39:53 +03:00
|
|
|
pg->state_dict[osd_set] = {
|
2020-02-02 00:05:56 +03:00
|
|
|
.read_target = read_target,
|
2020-03-23 00:39:53 +03:00
|
|
|
.osd_set = osd_set,
|
2020-01-24 02:23:27 +03:00
|
|
|
.state = state,
|
|
|
|
.object_count = 1,
|
|
|
|
};
|
2020-03-23 00:39:53 +03:00
|
|
|
it = pg->state_dict.find(osd_set);
|
2020-01-24 02:23:27 +03:00
|
|
|
}
|
|
|
|
else
|
2020-01-27 01:34:24 +03:00
|
|
|
{
|
2020-01-24 02:23:27 +03:00
|
|
|
it->second.object_count++;
|
2020-01-27 01:34:24 +03:00
|
|
|
}
|
2020-03-24 01:13:04 +03:00
|
|
|
if (state & OBJ_INCOMPLETE)
|
|
|
|
{
|
|
|
|
pg->incomplete_objects[oid] = &it->second;
|
|
|
|
}
|
|
|
|
else if (state & OBJ_DEGRADED)
|
|
|
|
{
|
|
|
|
pg->degraded_objects[oid] = &it->second;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
pg->misplaced_objects[oid] = &it->second;
|
|
|
|
}
|
2020-01-24 02:23:27 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-02-25 01:20:45 +03:00
|
|
|
// FIXME: Write at least some tests for this function
|
2020-04-27 14:32:59 +03:00
|
|
|
void pg_t::calc_object_states(int log_level)
|
2020-01-24 02:23:27 +03:00
|
|
|
{
|
|
|
|
// Copy all object lists into one array
|
2020-03-23 00:39:53 +03:00
|
|
|
pg_obj_state_check_t st;
|
2020-04-27 14:32:59 +03:00
|
|
|
st.log_level = log_level;
|
2020-03-23 00:39:53 +03:00
|
|
|
st.pg = this;
|
|
|
|
auto ps = peering_state;
|
2020-01-24 02:23:27 +03:00
|
|
|
for (auto it: ps->list_results)
|
|
|
|
{
|
|
|
|
auto nstab = it.second.stable_count;
|
|
|
|
auto n = it.second.total_count;
|
|
|
|
auto osd_num = it.first;
|
2020-03-23 00:39:53 +03:00
|
|
|
uint64_t start = st.list.size();
|
|
|
|
st.list.resize(start + n);
|
2020-01-24 02:23:27 +03:00
|
|
|
obj_ver_id *ov = it.second.buf;
|
|
|
|
for (uint64_t i = 0; i < n; i++, ov++)
|
|
|
|
{
|
2020-03-23 00:39:53 +03:00
|
|
|
st.list[start+i] = {
|
2020-01-24 02:23:27 +03:00
|
|
|
.oid = ov->oid,
|
|
|
|
.version = ov->version,
|
|
|
|
.osd_num = osd_num,
|
|
|
|
.is_stable = i < nstab,
|
|
|
|
};
|
|
|
|
}
|
|
|
|
free(it.second.buf);
|
|
|
|
it.second.buf = NULL;
|
|
|
|
}
|
|
|
|
ps->list_results.clear();
|
|
|
|
// Sort
|
2020-03-23 00:39:53 +03:00
|
|
|
std::sort(st.list.begin(), st.list.end());
|
2020-01-24 02:23:27 +03:00
|
|
|
// Walk over it and check object states
|
2020-03-23 00:39:53 +03:00
|
|
|
st.walk();
|
2020-03-14 22:19:45 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void pg_t::print_state()
|
|
|
|
{
|
2020-02-11 02:30:46 +03:00
|
|
|
printf(
|
2020-05-01 01:32:34 +03:00
|
|
|
"[PG %u] is %s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pg_num,
|
|
|
|
(state & PG_STARTING) ? "starting" : "",
|
2020-03-14 22:19:45 +03:00
|
|
|
(state & PG_OFFLINE) ? "offline" : "",
|
|
|
|
(state & PG_PEERING) ? "peering" : "",
|
|
|
|
(state & PG_INCOMPLETE) ? "incomplete" : "",
|
|
|
|
(state & PG_ACTIVE) ? "active" : "",
|
2020-05-01 01:32:34 +03:00
|
|
|
(state & PG_STOPPING) ? "stopping" : "",
|
2020-03-14 22:19:45 +03:00
|
|
|
(state & PG_DEGRADED) ? " + degraded" : "",
|
2020-03-15 18:39:31 +03:00
|
|
|
(state & PG_HAS_INCOMPLETE) ? " + has_incomplete" : "",
|
2020-03-14 22:19:45 +03:00
|
|
|
(state & PG_HAS_DEGRADED) ? " + has_degraded" : "",
|
|
|
|
(state & PG_HAS_MISPLACED) ? " + has_misplaced" : "",
|
|
|
|
(state & PG_HAS_UNCLEAN) ? " + has_unclean" : "",
|
|
|
|
total_count
|
2020-02-11 02:30:46 +03:00
|
|
|
);
|
2020-01-24 02:23:27 +03:00
|
|
|
}
|