629 lines
22 KiB
C++
629 lines
22 KiB
C++
// Copyright (c) Vitaliy Filippov, 2019+
|
|
// License: VNPL-1.1 (see README.md for details)
|
|
|
|
#include "osd_primary.h"
|
|
|
|
#define SELF_FD -1
|
|
|
|
void osd_t::scrub_list(pool_pg_num_t pg_id, osd_num_t role_osd, object_id min_oid)
|
|
{
|
|
pool_id_t pool_id = pg_id.pool_id;
|
|
pg_num_t pg_num = pg_id.pg_num;
|
|
assert(!scrub_list_op);
|
|
if (role_osd == this->osd_num)
|
|
{
|
|
// Self
|
|
osd_op_t *op = new osd_op_t();
|
|
op->op_type = 0;
|
|
op->peer_fd = SELF_FD;
|
|
clock_gettime(CLOCK_REALTIME, &op->tv_begin);
|
|
op->bs_op = new blockstore_op_t();
|
|
op->bs_op->opcode = BS_OP_LIST;
|
|
op->bs_op->pg_alignment = st_cli.pool_config[pool_id].pg_stripe_size;
|
|
if (min_oid.inode != 0 || min_oid.stripe != 0)
|
|
op->bs_op->min_oid = min_oid;
|
|
else
|
|
{
|
|
op->bs_op->min_oid.inode = ((uint64_t)pool_id << (64 - POOL_ID_BITS));
|
|
op->bs_op->min_oid.stripe = 0;
|
|
}
|
|
op->bs_op->max_oid.inode = ((uint64_t)(pool_id+1) << (64 - POOL_ID_BITS)) - 1;
|
|
op->bs_op->max_oid.stripe = UINT64_MAX;
|
|
op->bs_op->list_stable_limit = scrub_list_limit;
|
|
op->bs_op->pg_count = pg_counts[pool_id];
|
|
op->bs_op->pg_number = pg_num-1;
|
|
op->bs_op->callback = [this, op](blockstore_op_t *bs_op)
|
|
{
|
|
scrub_list_op = NULL;
|
|
if (op->bs_op->retval < 0)
|
|
{
|
|
printf("Local OP_LIST failed: retval=%d\n", op->bs_op->retval);
|
|
force_stop(1);
|
|
return;
|
|
}
|
|
add_bs_subop_stats(op);
|
|
scrub_cur_list = {
|
|
.buf = (obj_ver_id*)op->bs_op->buf,
|
|
.total_count = (uint64_t)op->bs_op->retval,
|
|
.stable_count = op->bs_op->version,
|
|
};
|
|
delete op->bs_op;
|
|
op->bs_op = NULL;
|
|
delete op;
|
|
continue_scrub();
|
|
};
|
|
scrub_list_op = op;
|
|
bs->enqueue_op(op->bs_op);
|
|
}
|
|
else
|
|
{
|
|
// Peer
|
|
osd_op_t *op = new osd_op_t();
|
|
op->op_type = OSD_OP_OUT;
|
|
op->peer_fd = msgr.osd_peer_fds.at(role_osd);
|
|
op->req = (osd_any_op_t){
|
|
.sec_list = {
|
|
.header = {
|
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
|
.id = msgr.next_subop_id++,
|
|
.opcode = OSD_OP_SEC_LIST,
|
|
},
|
|
.list_pg = pg_num,
|
|
.pg_count = pg_counts[pool_id],
|
|
.pg_stripe_size = st_cli.pool_config[pool_id].pg_stripe_size,
|
|
.min_inode = min_oid.inode ? min_oid.inode : ((uint64_t)(pool_id) << (64 - POOL_ID_BITS)),
|
|
.max_inode = ((uint64_t)(pool_id+1) << (64 - POOL_ID_BITS)) - 1,
|
|
.min_stripe = min_oid.stripe,
|
|
.stable_limit = scrub_list_limit,
|
|
},
|
|
};
|
|
op->callback = [this, role_osd](osd_op_t *op)
|
|
{
|
|
scrub_list_op = NULL;
|
|
if (op->reply.hdr.retval < 0)
|
|
{
|
|
printf("Failed to get object list from OSD %ju (retval=%jd), disconnecting peer\n", role_osd, op->reply.hdr.retval);
|
|
int fail_fd = op->peer_fd;
|
|
delete op;
|
|
msgr.stop_client(fail_fd);
|
|
return;
|
|
}
|
|
scrub_cur_list = {
|
|
.buf = (obj_ver_id*)op->buf,
|
|
.total_count = (uint64_t)op->reply.hdr.retval,
|
|
.stable_count = op->reply.sec_list.stable_count,
|
|
};
|
|
// set op->buf to NULL so it doesn't get freed
|
|
op->buf = NULL;
|
|
delete op;
|
|
continue_scrub();
|
|
};
|
|
scrub_list_op = op;
|
|
msgr.outbox_push(op);
|
|
}
|
|
}
|
|
|
|
int osd_t::pick_next_scrub(object_id & next_oid)
|
|
{
|
|
if (!pgs.size())
|
|
{
|
|
if (scrub_cur_list.buf)
|
|
{
|
|
free(scrub_cur_list.buf);
|
|
scrub_cur_list = {};
|
|
scrub_last_pg = {};
|
|
}
|
|
return 0;
|
|
}
|
|
if (scrub_list_op)
|
|
{
|
|
return 1;
|
|
}
|
|
timespec tv_now;
|
|
clock_gettime(CLOCK_REALTIME, &tv_now);
|
|
bool rescan = scrub_last_pg.pool_id != 0 || scrub_last_pg.pg_num != 0;
|
|
// Restart scanning from the same PG as the last time
|
|
auto pg_it = pgs.lower_bound(scrub_last_pg);
|
|
if (pg_it == pgs.end() && rescan)
|
|
{
|
|
pg_it = pgs.begin();
|
|
rescan = false;
|
|
}
|
|
while (pg_it != pgs.end())
|
|
{
|
|
if ((pg_it->second.state & PG_ACTIVE) && pg_it->second.next_scrub && pg_it->second.next_scrub <= tv_now.tv_sec)
|
|
{
|
|
// Continue scrubbing from the next object
|
|
if (scrub_last_pg == pg_it->first)
|
|
{
|
|
while (scrub_list_pos < scrub_cur_list.total_count)
|
|
{
|
|
auto oid = scrub_cur_list.buf[scrub_list_pos].oid;
|
|
oid.stripe &= ~STRIPE_MASK;
|
|
scrub_list_pos++;
|
|
if (recovery_ops.find(oid) == recovery_ops.end() &&
|
|
scrub_ops.find(oid) == scrub_ops.end() &&
|
|
pg_it->second.write_queue.find(oid) == pg_it->second.write_queue.end())
|
|
{
|
|
next_oid = oid;
|
|
if (!(pg_it->second.state & PG_SCRUBBING))
|
|
{
|
|
// Currently scrubbing this PG
|
|
pg_it->second.state = pg_it->second.state | PG_SCRUBBING;
|
|
report_pg_state(pg_it->second);
|
|
}
|
|
return 2;
|
|
}
|
|
}
|
|
}
|
|
if (scrub_last_pg == pg_it->first &&
|
|
scrub_list_pos >= scrub_cur_list.total_count &&
|
|
scrub_cur_list.stable_count < scrub_list_limit)
|
|
{
|
|
// End of the list, mark this PG as scrubbed and go to the next PG
|
|
}
|
|
else
|
|
{
|
|
// Continue listing
|
|
object_id scrub_last_oid = {};
|
|
if (scrub_last_pg == pg_it->first && scrub_cur_list.stable_count > 0)
|
|
{
|
|
scrub_last_oid = scrub_cur_list.buf[scrub_cur_list.stable_count-1].oid;
|
|
scrub_last_oid.stripe++;
|
|
}
|
|
osd_num_t scrub_osd = 0;
|
|
for (osd_num_t pg_osd: pg_it->second.cur_set)
|
|
{
|
|
if (pg_osd == this->osd_num || scrub_osd == 0)
|
|
scrub_osd = pg_osd;
|
|
}
|
|
if (!(pg_it->second.state & PG_SCRUBBING))
|
|
{
|
|
// Currently scrubbing this PG
|
|
pg_it->second.state = pg_it->second.state | PG_SCRUBBING;
|
|
report_pg_state(pg_it->second);
|
|
}
|
|
if (scrub_cur_list.buf)
|
|
{
|
|
free(scrub_cur_list.buf);
|
|
scrub_cur_list = {};
|
|
scrub_list_pos = 0;
|
|
}
|
|
scrub_last_pg = pg_it->first;
|
|
scrub_list(pg_it->first, scrub_osd, scrub_last_oid);
|
|
return 1;
|
|
}
|
|
if (pg_it->second.state & PG_SCRUBBING)
|
|
{
|
|
scrub_last_pg = {};
|
|
pg_it->second.state = pg_it->second.state & ~PG_SCRUBBING;
|
|
pg_it->second.next_scrub = 0;
|
|
pg_it->second.history_changed = true;
|
|
report_pg_state(pg_it->second);
|
|
}
|
|
// The list is definitely not needed anymore
|
|
if (scrub_cur_list.buf)
|
|
{
|
|
free(scrub_cur_list.buf);
|
|
scrub_cur_list = {};
|
|
}
|
|
}
|
|
pg_it++;
|
|
if (pg_it == pgs.end() && rescan)
|
|
{
|
|
// Scan one more time to guarantee that there are no PGs to scrub
|
|
pg_it = pgs.begin();
|
|
rescan = false;
|
|
}
|
|
}
|
|
// Scanned all PGs - no more scrubs to do
|
|
return 0;
|
|
}
|
|
|
|
void osd_t::submit_scrub_op(object_id oid)
|
|
{
|
|
auto osd_op = new osd_op_t();
|
|
osd_op->op_type = OSD_OP_OUT;
|
|
osd_op->peer_fd = -1;
|
|
osd_op->req = (osd_any_op_t){
|
|
.rw = {
|
|
.header = {
|
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
|
.id = 1,
|
|
.opcode = OSD_OP_SCRUB,
|
|
},
|
|
.inode = oid.inode,
|
|
.offset = oid.stripe,
|
|
.len = 0,
|
|
},
|
|
};
|
|
if (log_level > 2)
|
|
{
|
|
printf("Submitting scrub for %jx:%jx\n", oid.inode, oid.stripe);
|
|
}
|
|
osd_op->callback = [this](osd_op_t *osd_op)
|
|
{
|
|
object_id oid = { .inode = osd_op->req.rw.inode, .stripe = osd_op->req.rw.offset };
|
|
if (osd_op->reply.hdr.retval < 0 && osd_op->reply.hdr.retval != -ENOENT)
|
|
{
|
|
// Scrub error
|
|
printf(
|
|
"Scrub failed with object %jx:%jx (PG %u/%u): error %jd\n",
|
|
oid.inode, oid.stripe, INODE_POOL(oid.inode),
|
|
map_to_pg(oid, st_cli.pool_config.at(INODE_POOL(oid.inode)).pg_stripe_size),
|
|
osd_op->reply.hdr.retval
|
|
);
|
|
}
|
|
else if (log_level > 2)
|
|
{
|
|
printf("Scrubbed %jx:%jx\n", oid.inode, oid.stripe);
|
|
}
|
|
delete osd_op;
|
|
if (scrub_sleep_ms)
|
|
{
|
|
this->tfd->set_timer(scrub_sleep_ms, false, [this, oid](int timer_id)
|
|
{
|
|
scrub_ops.erase(oid);
|
|
continue_scrub();
|
|
});
|
|
}
|
|
else
|
|
{
|
|
scrub_ops.erase(oid);
|
|
continue_scrub();
|
|
}
|
|
};
|
|
scrub_ops[oid] = osd_op;
|
|
exec_op(osd_op);
|
|
}
|
|
|
|
// Triggers scrub requests
|
|
// Scrub reads data from all replicas and compares it
|
|
// To scrub first we need to read objects listings
|
|
bool osd_t::continue_scrub()
|
|
{
|
|
if (scrub_list_op)
|
|
{
|
|
return true;
|
|
}
|
|
if (no_scrub)
|
|
{
|
|
// Return false = no more scrub work to do
|
|
scrub_cur_list = {};
|
|
scrub_last_pg = {};
|
|
scrub_nearest_ts = 0;
|
|
if (scrub_timer_id >= 0)
|
|
{
|
|
tfd->clear_timer(scrub_timer_id);
|
|
scrub_timer_id = -1;
|
|
}
|
|
for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
|
|
{
|
|
if (pg_it->second.state & PG_SCRUBBING)
|
|
{
|
|
pg_it->second.state = pg_it->second.state & ~PG_SCRUBBING;
|
|
report_pg_state(pg_it->second);
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
while (scrub_ops.size() < scrub_queue_depth)
|
|
{
|
|
object_id oid;
|
|
int r = pick_next_scrub(oid);
|
|
if (r == 2)
|
|
submit_scrub_op(oid);
|
|
else
|
|
return r;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
void osd_t::plan_scrub(pg_t & pg, bool report_state)
|
|
{
|
|
if ((pg.state & PG_ACTIVE) && !pg.next_scrub && auto_scrub)
|
|
{
|
|
timespec tv_now;
|
|
clock_gettime(CLOCK_REALTIME, &tv_now);
|
|
auto & pool_cfg = st_cli.pool_config.at(pg.pool_id);
|
|
auto interval = pool_cfg.scrub_interval ? pool_cfg.scrub_interval : global_scrub_interval;
|
|
if (pg.next_scrub != tv_now.tv_sec + interval)
|
|
{
|
|
pool_cfg.pg_config[pg.pg_num].next_scrub = pg.next_scrub = tv_now.tv_sec + interval;
|
|
pg.history_changed = true;
|
|
if (report_state)
|
|
report_pg_state(pg);
|
|
}
|
|
schedule_scrub(pg);
|
|
}
|
|
}
|
|
|
|
void osd_t::schedule_scrub(pg_t & pg)
|
|
{
|
|
if (!no_scrub && pg.next_scrub && (!scrub_nearest_ts || scrub_nearest_ts > pg.next_scrub))
|
|
{
|
|
scrub_nearest_ts = pg.next_scrub;
|
|
timespec tv_now;
|
|
clock_gettime(CLOCK_REALTIME, &tv_now);
|
|
if (scrub_timer_id >= 0)
|
|
{
|
|
tfd->clear_timer(scrub_timer_id);
|
|
scrub_timer_id = -1;
|
|
}
|
|
if (tv_now.tv_sec >= scrub_nearest_ts)
|
|
{
|
|
scrub_nearest_ts = 0;
|
|
peering_state = peering_state | OSD_SCRUBBING;
|
|
ringloop->wakeup();
|
|
}
|
|
else
|
|
{
|
|
scrub_timer_id = tfd->set_timer((scrub_nearest_ts-tv_now.tv_sec)*1000, false, [this](int timer_id)
|
|
{
|
|
scrub_timer_id = -1;
|
|
scrub_nearest_ts = 0;
|
|
peering_state = peering_state | OSD_SCRUBBING;
|
|
ringloop->wakeup();
|
|
});
|
|
}
|
|
}
|
|
}
|
|
|
|
void osd_t::continue_primary_scrub(osd_op_t *cur_op)
|
|
{
|
|
if (!cur_op->op_data && !prepare_primary_rw(cur_op))
|
|
return;
|
|
osd_primary_op_data_t *op_data = cur_op->op_data;
|
|
if (op_data->st == 1)
|
|
goto resume_1;
|
|
else if (op_data->st == 2)
|
|
goto resume_2;
|
|
{
|
|
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
|
|
cur_op->req.rw.len = bs_block_size * pg.pg_data_size;
|
|
// Determine version
|
|
auto vo_it = pg.ver_override.find(op_data->oid);
|
|
op_data->target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
|
|
// PG may have degraded or misplaced objects
|
|
op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
|
|
// Read all available chunks
|
|
int n_copies = 0;
|
|
op_data->degraded = false;
|
|
for (int role = 0; role < op_data->pg_size; role++)
|
|
{
|
|
op_data->stripes[role].write_buf = NULL;
|
|
op_data->stripes[role].read_start = 0;
|
|
op_data->stripes[role].read_end = bs_block_size;
|
|
if (op_data->prev_set[role] != 0)
|
|
{
|
|
n_copies++;
|
|
}
|
|
else
|
|
{
|
|
op_data->stripes[role].missing = true;
|
|
if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size)
|
|
{
|
|
op_data->degraded = true;
|
|
}
|
|
}
|
|
}
|
|
if (n_copies <= op_data->pg_data_size)
|
|
{
|
|
// Nothing to compare, even if we'd like to
|
|
finish_op(cur_op, 0);
|
|
return;
|
|
}
|
|
cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_size, 0);
|
|
// Submit reads
|
|
osd_op_t *subops = new osd_op_t[n_copies];
|
|
op_data->fact_ver = 0;
|
|
op_data->done = op_data->errors = op_data->errcode = 0;
|
|
op_data->n_subops = n_copies;
|
|
op_data->subops = subops;
|
|
int sent = submit_primary_subop_batch(SUBMIT_SCRUB_READ, op_data->oid.inode, op_data->target_ver,
|
|
op_data->stripes, op_data->prev_set, cur_op, 0, -1);
|
|
assert(sent == n_copies);
|
|
op_data->st = 1;
|
|
}
|
|
resume_1:
|
|
return;
|
|
resume_2:
|
|
if (op_data->errors > 0)
|
|
{
|
|
if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
|
|
{
|
|
// I/O or checksum error
|
|
int n_copies = 0;
|
|
for (int role = 0; role < op_data->pg_size; role++)
|
|
{
|
|
if (op_data->stripes[role].read_error)
|
|
{
|
|
op_data->stripes[role].missing = true;
|
|
if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size)
|
|
{
|
|
op_data->degraded = true;
|
|
}
|
|
}
|
|
else if (!op_data->stripes[role].missing)
|
|
{
|
|
n_copies++;
|
|
}
|
|
}
|
|
if (n_copies <= op_data->pg_data_size)
|
|
{
|
|
// Nothing to compare, just mark the object as corrupted
|
|
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
|
|
// FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
|
|
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false, false);
|
|
// Operation is treated as unsuccessful only if the object becomes unreadable
|
|
finish_op(cur_op, n_copies < op_data->pg_data_size ? op_data->errcode : 0);
|
|
return;
|
|
}
|
|
// Proceed, we can still compare chunks that were successfully read
|
|
}
|
|
else
|
|
{
|
|
finish_op(cur_op, op_data->errcode);
|
|
return;
|
|
}
|
|
}
|
|
bool inconsistent = false;
|
|
if (op_data->scheme == POOL_SCHEME_REPLICATED)
|
|
{
|
|
// Check that all chunks have returned the same data
|
|
int total = 0;
|
|
int eq_to[op_data->pg_size];
|
|
for (int role = 0; role < op_data->pg_size; role++)
|
|
{
|
|
eq_to[role] = -1;
|
|
if (op_data->stripes[role].read_end != 0 && !op_data->stripes[role].missing &&
|
|
!op_data->stripes[role].not_exists)
|
|
{
|
|
total++;
|
|
eq_to[role] = role;
|
|
for (int other = 0; other < role; other++)
|
|
{
|
|
// Only compare with unique chunks (eq_to[other] == other)
|
|
if (eq_to[other] == other && memcmp(op_data->stripes[role].read_buf, op_data->stripes[other].read_buf, bs_block_size) == 0)
|
|
{
|
|
eq_to[role] = eq_to[other];
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
int votes[op_data->pg_size];
|
|
for (int role = 0; role < op_data->pg_size; role++)
|
|
votes[role] = 0;
|
|
for (int role = 0; role < op_data->pg_size; role++)
|
|
{
|
|
if (eq_to[role] != -1)
|
|
votes[eq_to[role]]++;
|
|
}
|
|
int best = -1;
|
|
for (int role = 0; role < op_data->pg_size; role++)
|
|
{
|
|
if (votes[role] > (best >= 0 ? votes[best] : 0))
|
|
best = role;
|
|
}
|
|
if (best >= 0 && votes[best] < total)
|
|
{
|
|
bool unknown = false;
|
|
for (int role = 0; role < op_data->pg_size; role++)
|
|
{
|
|
if (role != best && votes[role] == votes[best])
|
|
{
|
|
unknown = true;
|
|
}
|
|
if (votes[role] > 0 && votes[role] < votes[best])
|
|
{
|
|
printf(
|
|
"[PG %u/%u] Object %jx:%jx v%ju copy on OSD %ju doesn't match %d other copies%s\n",
|
|
INODE_POOL(op_data->oid.inode), op_data->pg_num,
|
|
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver,
|
|
op_data->stripes[role].osd_num, votes[best],
|
|
scrub_find_best ? ", marking it as corrupted" : ""
|
|
);
|
|
if (scrub_find_best)
|
|
{
|
|
op_data->stripes[role].read_error = true;
|
|
}
|
|
}
|
|
}
|
|
if (!scrub_find_best)
|
|
{
|
|
unknown = true;
|
|
}
|
|
if (unknown)
|
|
{
|
|
// It's unknown which replica is good. There are multiple versions with no majority
|
|
// Mark all good replicas as ambiguous
|
|
best = -1;
|
|
inconsistent = true;
|
|
printf(
|
|
"[PG %u/%u] Object %jx:%jx v%ju is inconsistent: copies don't match. Use vitastor-cli fix to fix it\n",
|
|
INODE_POOL(op_data->oid.inode), op_data->pg_num,
|
|
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver
|
|
);
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
assert(op_data->scheme == POOL_SCHEME_EC || op_data->scheme == POOL_SCHEME_XOR);
|
|
auto good_subset = ec_find_good(
|
|
op_data->stripes, op_data->pg_size, op_data->pg_data_size, op_data->scheme == POOL_SCHEME_XOR,
|
|
bs_block_size, clean_entry_bitmap_size, scrub_ec_max_bruteforce
|
|
);
|
|
if (!good_subset.size())
|
|
{
|
|
inconsistent = true;
|
|
printf(
|
|
"[PG %u/%u] Object %jx:%jx v%ju is inconsistent: parity chunks don't match data. Use vitastor-cli fix to fix it\n",
|
|
INODE_POOL(op_data->oid.inode), op_data->pg_num,
|
|
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver
|
|
);
|
|
}
|
|
else
|
|
{
|
|
int total = 0;
|
|
for (int role = 0; role < op_data->pg_size; role++)
|
|
{
|
|
if (!op_data->stripes[role].missing)
|
|
{
|
|
total++;
|
|
op_data->stripes[role].read_error = true;
|
|
}
|
|
}
|
|
for (int role: good_subset)
|
|
{
|
|
op_data->stripes[role].read_error = false;
|
|
}
|
|
for (int role = 0; role < op_data->pg_size; role++)
|
|
{
|
|
if (!op_data->stripes[role].missing && op_data->stripes[role].read_error)
|
|
{
|
|
printf(
|
|
"[PG %u/%u] Object %jx:%jx v%ju chunk %d on OSD %ju doesn't match other chunks%s\n",
|
|
INODE_POOL(op_data->oid.inode), op_data->pg_num,
|
|
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver,
|
|
role, op_data->stripes[role].osd_num,
|
|
scrub_find_best ? ", marking it as corrupted" : ""
|
|
);
|
|
}
|
|
}
|
|
if (!scrub_find_best && good_subset.size() < total)
|
|
{
|
|
inconsistent = true;
|
|
printf(
|
|
"[PG %u/%u] Object %jx:%jx v%ju is marked as inconsistent because scrub_find_best is turned off. Use vitastor-cli fix to fix it\n",
|
|
INODE_POOL(op_data->oid.inode), op_data->pg_num,
|
|
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver
|
|
);
|
|
for (int role = 0; role < op_data->pg_size; role++)
|
|
{
|
|
if (!op_data->stripes[role].missing && op_data->stripes[role].read_error)
|
|
{
|
|
// Undo error locator marking chunk as bad
|
|
op_data->stripes[role].read_error = false;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
for (int role = 0; role < op_data->pg_size; role++)
|
|
{
|
|
if (op_data->stripes[role].osd_num != 0 &&
|
|
(op_data->stripes[role].read_error || op_data->stripes[role].not_exists) ||
|
|
inconsistent)
|
|
{
|
|
// Got at least 1 read error or mismatch, mark the object as corrupted
|
|
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
|
|
// FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
|
|
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false, inconsistent);
|
|
break;
|
|
}
|
|
}
|
|
finish_op(cur_op, 0);
|
|
}
|