Implement scrubbing "data path" - OSD_OP_SCRUB

test-double-alloc
Vitaliy Filippov 2023-02-13 02:59:38 +03:00
parent a6d846863b
commit 43b77d7619
9 changed files with 233 additions and 5 deletions

View File

@ -111,7 +111,7 @@ target_compile_options(vitastor_common PUBLIC -fPIC)
add_executable(vitastor-osd
osd_main.cpp osd.cpp osd_secondary.cpp osd_peering.cpp osd_flush.cpp osd_peering_pg.cpp
osd_primary.cpp osd_primary_chain.cpp osd_primary_sync.cpp osd_primary_write.cpp osd_primary_subops.cpp
osd_cluster.cpp osd_rmw.cpp
osd_cluster.cpp osd_rmw.cpp osd_scrub.cpp
)
target_link_libraries(vitastor-osd
vitastor_common

View File

@ -320,7 +320,8 @@ void osd_t::exec_op(osd_op_t *cur_op)
cur_op->req.hdr.opcode == OSD_OP_DELETE) &&
(cur_op->req.rw.len > OSD_RW_MAX ||
cur_op->req.rw.len % bs_bitmap_granularity ||
cur_op->req.rw.offset % bs_bitmap_granularity)))
cur_op->req.rw.offset % bs_bitmap_granularity)) ||
cur_op->req.hdr.opcode == OSD_OP_SCRUB && cur_op->peer_fd != -1)
{
// Bad command
finish_op(cur_op, -EINVAL);
@ -337,6 +338,7 @@ void osd_t::exec_op(osd_op_t *cur_op)
cur_op->req.hdr.opcode != OSD_OP_SEC_LIST &&
cur_op->req.hdr.opcode != OSD_OP_READ &&
cur_op->req.hdr.opcode != OSD_OP_SEC_READ_BMP &&
cur_op->req.hdr.opcode != OSD_OP_SCRUB &&
cur_op->req.hdr.opcode != OSD_OP_SHOW_CONFIG)
{
// Readonly mode
@ -367,6 +369,10 @@ void osd_t::exec_op(osd_op_t *cur_op)
{
continue_primary_del(cur_op);
}
else if (cur_op->req.hdr.opcode == OSD_OP_SCRUB)
{
continue_primary_scrub(cur_op);
}
else
{
exec_secondary(cur_op);

View File

@ -235,6 +235,7 @@ class osd_t
void autosync();
bool prepare_primary_rw(osd_op_t *cur_op);
void continue_primary_read(osd_op_t *cur_op);
void continue_primary_scrub(osd_op_t *cur_op);
void continue_primary_write(osd_op_t *cur_op);
void cancel_primary_write(osd_op_t *cur_op);
void continue_primary_sync(osd_op_t *cur_op);

View File

@ -21,4 +21,5 @@ const char* osd_op_names[] = {
"primary_delete",
"ping",
"sec_read_bmp",
"scrub",
};

View File

@ -29,7 +29,8 @@
#define OSD_OP_DELETE 14
#define OSD_OP_PING 15
#define OSD_OP_SEC_READ_BMP 16
#define OSD_OP_MAX 16
#define OSD_OP_SCRUB 17
#define OSD_OP_MAX 17
#define OSD_RW_MAX 64*1024*1024
#define OSD_PROTOCOL_VERSION 1

View File

@ -52,7 +52,9 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
finish_op(cur_op, -EINVAL);
return false;
}
int stripe_count = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg_it->second.pg_size);
// Scrub is similar to r/w, so it's also handled here
int stripe_count = (pool_cfg.scheme == POOL_SCHEME_REPLICATED
&& cur_op->req.hdr.opcode != OSD_OP_SCRUB ? 1 : pg_it->second.pg_size);
int chain_size = 0;
if (cur_op->req.hdr.opcode == OSD_OP_READ && cur_op->req.rw.meta_revision > 0)
{

View File

@ -9,6 +9,7 @@
#define SUBMIT_READ 0
#define SUBMIT_RMW_READ 1
#define SUBMIT_WRITE 2
#define SUBMIT_SCRUB_READ 3
struct unstable_osd_num_t
{

View File

@ -9,6 +9,7 @@ void osd_t::autosync()
{
autosync_op = new osd_op_t();
autosync_op->op_type = OSD_OP_IN;
autosync_op->peer_fd = -1;
autosync_op->req = (osd_any_op_t){
.sync = {
.header = {
@ -142,7 +143,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
for (int role = 0; role < op_data->pg_size; role++)
{
// We always submit zero-length writes to all replicas, even if the stripe is not modified
if (!(wr || !rep && stripes[role].read_end != 0 || zero_read == role))
if (!(wr || !rep && stripes[role].read_end != 0 || zero_read == role || submit_type == SUBMIT_SCRUB_READ))
{
continue;
}
@ -430,6 +431,10 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
{
continue_primary_del(cur_op);
}
else if (cur_op->req.hdr.opcode == OSD_OP_SCRUB)
{
continue_primary_scrub(cur_op);
}
else
{
throw std::runtime_error("BUG: unknown opcode");

211
src/osd_scrub.cpp Normal file
View File

@ -0,0 +1,211 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "osd_primary.h"
void osd_t::continue_primary_scrub(osd_op_t *cur_op)
{
if (!cur_op->op_data && !prepare_primary_rw(cur_op))
return;
osd_primary_op_data_t *op_data = cur_op->op_data;
if (op_data->st == 1)
goto resume_1;
else if (op_data->st == 2)
goto resume_2;
{
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
// Determine version
auto vo_it = pg.ver_override.find(op_data->oid);
op_data->target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
// PG may have degraded or misplaced objects
op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
// Read all available chunks
int n_copies = 0;
op_data->degraded = false;
for (int role = 0; role < op_data->pg_size; role++)
{
op_data->stripes[role].read_start = 0;
op_data->stripes[role].read_end = bs_block_size;
if (op_data->prev_set[role] != 0)
{
n_copies++;
}
else if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size)
{
op_data->degraded = true;
}
}
if (n_copies <= op_data->pg_data_size)
{
// Nothing to compare, even if we'd like to
finish_op(cur_op, 0);
return;
}
cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_size,
op_data->scheme != POOL_SCHEME_REPLICATED ? bs_block_size*(op_data->pg_size-op_data->pg_data_size) : 0);
// Submit reads
osd_op_t *subops = new osd_op_t[n_copies];
op_data->fact_ver = 0;
op_data->done = op_data->errors = op_data->errcode = 0;
op_data->n_subops = n_copies;
op_data->subops = subops;
int sent = submit_primary_subop_batch(SUBMIT_SCRUB_READ, op_data->oid.inode, op_data->target_ver,
op_data->stripes, op_data->prev_set, cur_op, 0, -1);
assert(sent == n_copies);
op_data->st = 1;
}
resume_1:
return;
resume_2:
if (op_data->errors > 0)
{
if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
{
// I/O or checksum error
int n_copies = 0;
for (int role = 0; role < op_data->pg_size; role++)
{
if (op_data->stripes[role].read_end != 0 &&
!op_data->stripes[role].read_error)
{
n_copies++;
}
}
if (n_copies <= op_data->pg_data_size)
{
// Nothing to compare, just mark the object as corrupted
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
// FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false);
// Operation is treated as unsuccessful only if the object becomes unreadable
finish_op(cur_op, n_copies < op_data->pg_data_size ? op_data->errcode : 0);
return;
}
// Proceed, we can still compare chunks that were successfully read
}
else
{
finish_op(cur_op, op_data->errcode);
return;
}
}
if (op_data->scheme == POOL_SCHEME_REPLICATED)
{
// Check that all chunks have returned the same data
int total = 0;
int eq_to[op_data->pg_size];
for (int role = 0; role < op_data->pg_size; role++)
{
eq_to[role] = -1;
if (op_data->stripes[role].read_end != 0 && !op_data->stripes[role].read_error)
{
total++;
eq_to[role] = role;
for (int other = 0; other < role; other++)
{
// Only compare with unique chunks (eq_to[other] == other)
if (eq_to[other] == other && memcmp(op_data->stripes[role].read_buf, op_data->stripes[other].read_buf, bs_block_size) == 0)
{
eq_to[role] = eq_to[other];
break;
}
}
}
}
int votes[op_data->pg_size];
for (int role = 0; role < op_data->pg_size; role++)
votes[role] = 0;
for (int role = 0; role < op_data->pg_size; role++)
{
if (eq_to[role] != -1)
votes[eq_to[role]]++;
}
int best = -1;
for (int role = 0; role < op_data->pg_size; role++)
{
if (best < 0 && votes[role] > 0 || votes[role] > votes[best])
best = role;
}
if (best > 0 && votes[best] < total)
{
// FIXME Add a flag to allow to skip such objects and not recover them automatically
bool unknown = false;
for (int role = 0; role < op_data->pg_size; role++)
{
if (role != best && votes[role] == votes[best])
unknown = true;
if (votes[role] > 0 && votes[role] < votes[best])
{
printf(
"[PG %u/%u] Object %lx:%lx copy on OSD %lu doesn't match %d other copies, marking it as corrupted\n",
INODE_POOL(op_data->oid.inode), op_data->pg_num,
op_data->oid.inode, op_data->oid.stripe, op_data->stripes[role].osd_num, votes[best]
);
op_data->stripes[role].read_error = true;
}
}
if (unknown)
{
// It's unknown which replica is good. There are multiple versions with no majority
best = -1;
}
}
}
else
{
assert(op_data->scheme == POOL_SCHEME_EC || op_data->scheme == POOL_SCHEME_XOR);
if (op_data->degraded)
{
// Reconstruct missing stripes
// XOR shouldn't come here as it only has 1 parity chunk
assert(op_data->scheme == POOL_SCHEME_EC);
reconstruct_stripes_ec(op_data->stripes, op_data->pg_size, op_data->pg_data_size, clean_entry_bitmap_size);
}
// Generate parity chunks and compare them with actual data
osd_num_t fake_osd_set[op_data->pg_size];
for (int i = 0; i < op_data->pg_size; i++)
{
fake_osd_set[i] = 1;
op_data->stripes[i].write_buf = i >= op_data->pg_data_size
? ((uint8_t*)cur_op->buf + (i-op_data->pg_data_size)*bs_block_size)
: op_data->stripes[i].read_buf;
}
if (op_data->scheme == POOL_SCHEME_XOR)
{
calc_rmw_parity_xor(op_data->stripes, op_data->pg_size, fake_osd_set, fake_osd_set, bs_block_size, clean_entry_bitmap_size);
}
else if (op_data->scheme == POOL_SCHEME_EC)
{
calc_rmw_parity_ec(op_data->stripes, op_data->pg_size, op_data->pg_data_size, fake_osd_set, fake_osd_set, bs_block_size, clean_entry_bitmap_size);
}
// Now compare that write_buf == read_buf
for (int role = op_data->pg_data_size; role < op_data->pg_size; role++)
{
if (op_data->stripes[role].osd_num != 0 && !op_data->stripes[role].read_error &&
memcmp(op_data->stripes[role].read_buf, op_data->stripes[role].write_buf, bs_block_size) != 0)
{
// Chunks don't match - something's wrong... but we don't know what :D
// FIXME: Try to locate errors (may be possible with >= 2 parity chunks)
printf(
"[PG %u/%u] Object %lx:%lx parity chunk %d on OSD %lu doesn't match data, marking it as corrupted\n",
INODE_POOL(op_data->oid.inode), op_data->pg_num,
op_data->oid.inode, op_data->oid.stripe,
role-op_data->pg_data_size, op_data->stripes[role].osd_num
);
op_data->stripes[role].read_error = true;
}
}
}
for (int role = 0; role < op_data->pg_size; role++)
{
if (op_data->stripes[role].osd_num != 0 && !op_data->stripes[role].read_error)
{
// Got at least 1 read error or mismatch, mark the object as corrupted
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
// FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false);
break;
}
}
finish_op(cur_op, 0);
}