Implement scrubbing "data path" - OSD_OP_SCRUB
parent
a6d846863b
commit
43b77d7619
|
@ -111,7 +111,7 @@ target_compile_options(vitastor_common PUBLIC -fPIC)
|
||||||
add_executable(vitastor-osd
|
add_executable(vitastor-osd
|
||||||
osd_main.cpp osd.cpp osd_secondary.cpp osd_peering.cpp osd_flush.cpp osd_peering_pg.cpp
|
osd_main.cpp osd.cpp osd_secondary.cpp osd_peering.cpp osd_flush.cpp osd_peering_pg.cpp
|
||||||
osd_primary.cpp osd_primary_chain.cpp osd_primary_sync.cpp osd_primary_write.cpp osd_primary_subops.cpp
|
osd_primary.cpp osd_primary_chain.cpp osd_primary_sync.cpp osd_primary_write.cpp osd_primary_subops.cpp
|
||||||
osd_cluster.cpp osd_rmw.cpp
|
osd_cluster.cpp osd_rmw.cpp osd_scrub.cpp
|
||||||
)
|
)
|
||||||
target_link_libraries(vitastor-osd
|
target_link_libraries(vitastor-osd
|
||||||
vitastor_common
|
vitastor_common
|
||||||
|
|
|
@ -320,7 +320,8 @@ void osd_t::exec_op(osd_op_t *cur_op)
|
||||||
cur_op->req.hdr.opcode == OSD_OP_DELETE) &&
|
cur_op->req.hdr.opcode == OSD_OP_DELETE) &&
|
||||||
(cur_op->req.rw.len > OSD_RW_MAX ||
|
(cur_op->req.rw.len > OSD_RW_MAX ||
|
||||||
cur_op->req.rw.len % bs_bitmap_granularity ||
|
cur_op->req.rw.len % bs_bitmap_granularity ||
|
||||||
cur_op->req.rw.offset % bs_bitmap_granularity)))
|
cur_op->req.rw.offset % bs_bitmap_granularity)) ||
|
||||||
|
cur_op->req.hdr.opcode == OSD_OP_SCRUB && cur_op->peer_fd != -1)
|
||||||
{
|
{
|
||||||
// Bad command
|
// Bad command
|
||||||
finish_op(cur_op, -EINVAL);
|
finish_op(cur_op, -EINVAL);
|
||||||
|
@ -337,6 +338,7 @@ void osd_t::exec_op(osd_op_t *cur_op)
|
||||||
cur_op->req.hdr.opcode != OSD_OP_SEC_LIST &&
|
cur_op->req.hdr.opcode != OSD_OP_SEC_LIST &&
|
||||||
cur_op->req.hdr.opcode != OSD_OP_READ &&
|
cur_op->req.hdr.opcode != OSD_OP_READ &&
|
||||||
cur_op->req.hdr.opcode != OSD_OP_SEC_READ_BMP &&
|
cur_op->req.hdr.opcode != OSD_OP_SEC_READ_BMP &&
|
||||||
|
cur_op->req.hdr.opcode != OSD_OP_SCRUB &&
|
||||||
cur_op->req.hdr.opcode != OSD_OP_SHOW_CONFIG)
|
cur_op->req.hdr.opcode != OSD_OP_SHOW_CONFIG)
|
||||||
{
|
{
|
||||||
// Readonly mode
|
// Readonly mode
|
||||||
|
@ -367,6 +369,10 @@ void osd_t::exec_op(osd_op_t *cur_op)
|
||||||
{
|
{
|
||||||
continue_primary_del(cur_op);
|
continue_primary_del(cur_op);
|
||||||
}
|
}
|
||||||
|
else if (cur_op->req.hdr.opcode == OSD_OP_SCRUB)
|
||||||
|
{
|
||||||
|
continue_primary_scrub(cur_op);
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
exec_secondary(cur_op);
|
exec_secondary(cur_op);
|
||||||
|
|
|
@ -235,6 +235,7 @@ class osd_t
|
||||||
void autosync();
|
void autosync();
|
||||||
bool prepare_primary_rw(osd_op_t *cur_op);
|
bool prepare_primary_rw(osd_op_t *cur_op);
|
||||||
void continue_primary_read(osd_op_t *cur_op);
|
void continue_primary_read(osd_op_t *cur_op);
|
||||||
|
void continue_primary_scrub(osd_op_t *cur_op);
|
||||||
void continue_primary_write(osd_op_t *cur_op);
|
void continue_primary_write(osd_op_t *cur_op);
|
||||||
void cancel_primary_write(osd_op_t *cur_op);
|
void cancel_primary_write(osd_op_t *cur_op);
|
||||||
void continue_primary_sync(osd_op_t *cur_op);
|
void continue_primary_sync(osd_op_t *cur_op);
|
||||||
|
|
|
@ -21,4 +21,5 @@ const char* osd_op_names[] = {
|
||||||
"primary_delete",
|
"primary_delete",
|
||||||
"ping",
|
"ping",
|
||||||
"sec_read_bmp",
|
"sec_read_bmp",
|
||||||
|
"scrub",
|
||||||
};
|
};
|
||||||
|
|
|
@ -29,7 +29,8 @@
|
||||||
#define OSD_OP_DELETE 14
|
#define OSD_OP_DELETE 14
|
||||||
#define OSD_OP_PING 15
|
#define OSD_OP_PING 15
|
||||||
#define OSD_OP_SEC_READ_BMP 16
|
#define OSD_OP_SEC_READ_BMP 16
|
||||||
#define OSD_OP_MAX 16
|
#define OSD_OP_SCRUB 17
|
||||||
|
#define OSD_OP_MAX 17
|
||||||
#define OSD_RW_MAX 64*1024*1024
|
#define OSD_RW_MAX 64*1024*1024
|
||||||
#define OSD_PROTOCOL_VERSION 1
|
#define OSD_PROTOCOL_VERSION 1
|
||||||
|
|
||||||
|
|
|
@ -52,7 +52,9 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
||||||
finish_op(cur_op, -EINVAL);
|
finish_op(cur_op, -EINVAL);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
int stripe_count = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg_it->second.pg_size);
|
// Scrub is similar to r/w, so it's also handled here
|
||||||
|
int stripe_count = (pool_cfg.scheme == POOL_SCHEME_REPLICATED
|
||||||
|
&& cur_op->req.hdr.opcode != OSD_OP_SCRUB ? 1 : pg_it->second.pg_size);
|
||||||
int chain_size = 0;
|
int chain_size = 0;
|
||||||
if (cur_op->req.hdr.opcode == OSD_OP_READ && cur_op->req.rw.meta_revision > 0)
|
if (cur_op->req.hdr.opcode == OSD_OP_READ && cur_op->req.rw.meta_revision > 0)
|
||||||
{
|
{
|
||||||
|
|
|
@ -9,6 +9,7 @@
|
||||||
#define SUBMIT_READ 0
|
#define SUBMIT_READ 0
|
||||||
#define SUBMIT_RMW_READ 1
|
#define SUBMIT_RMW_READ 1
|
||||||
#define SUBMIT_WRITE 2
|
#define SUBMIT_WRITE 2
|
||||||
|
#define SUBMIT_SCRUB_READ 3
|
||||||
|
|
||||||
struct unstable_osd_num_t
|
struct unstable_osd_num_t
|
||||||
{
|
{
|
||||||
|
|
|
@ -9,6 +9,7 @@ void osd_t::autosync()
|
||||||
{
|
{
|
||||||
autosync_op = new osd_op_t();
|
autosync_op = new osd_op_t();
|
||||||
autosync_op->op_type = OSD_OP_IN;
|
autosync_op->op_type = OSD_OP_IN;
|
||||||
|
autosync_op->peer_fd = -1;
|
||||||
autosync_op->req = (osd_any_op_t){
|
autosync_op->req = (osd_any_op_t){
|
||||||
.sync = {
|
.sync = {
|
||||||
.header = {
|
.header = {
|
||||||
|
@ -142,7 +143,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
|
||||||
for (int role = 0; role < op_data->pg_size; role++)
|
for (int role = 0; role < op_data->pg_size; role++)
|
||||||
{
|
{
|
||||||
// We always submit zero-length writes to all replicas, even if the stripe is not modified
|
// We always submit zero-length writes to all replicas, even if the stripe is not modified
|
||||||
if (!(wr || !rep && stripes[role].read_end != 0 || zero_read == role))
|
if (!(wr || !rep && stripes[role].read_end != 0 || zero_read == role || submit_type == SUBMIT_SCRUB_READ))
|
||||||
{
|
{
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -430,6 +431,10 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
|
||||||
{
|
{
|
||||||
continue_primary_del(cur_op);
|
continue_primary_del(cur_op);
|
||||||
}
|
}
|
||||||
|
else if (cur_op->req.hdr.opcode == OSD_OP_SCRUB)
|
||||||
|
{
|
||||||
|
continue_primary_scrub(cur_op);
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
throw std::runtime_error("BUG: unknown opcode");
|
throw std::runtime_error("BUG: unknown opcode");
|
||||||
|
|
|
@ -0,0 +1,211 @@
|
||||||
|
// Copyright (c) Vitaliy Filippov, 2019+
|
||||||
|
// License: VNPL-1.1 (see README.md for details)
|
||||||
|
|
||||||
|
#include "osd_primary.h"
|
||||||
|
|
||||||
|
void osd_t::continue_primary_scrub(osd_op_t *cur_op)
|
||||||
|
{
|
||||||
|
if (!cur_op->op_data && !prepare_primary_rw(cur_op))
|
||||||
|
return;
|
||||||
|
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||||
|
if (op_data->st == 1)
|
||||||
|
goto resume_1;
|
||||||
|
else if (op_data->st == 2)
|
||||||
|
goto resume_2;
|
||||||
|
{
|
||||||
|
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
|
||||||
|
// Determine version
|
||||||
|
auto vo_it = pg.ver_override.find(op_data->oid);
|
||||||
|
op_data->target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
|
||||||
|
// PG may have degraded or misplaced objects
|
||||||
|
op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
|
||||||
|
// Read all available chunks
|
||||||
|
int n_copies = 0;
|
||||||
|
op_data->degraded = false;
|
||||||
|
for (int role = 0; role < op_data->pg_size; role++)
|
||||||
|
{
|
||||||
|
op_data->stripes[role].read_start = 0;
|
||||||
|
op_data->stripes[role].read_end = bs_block_size;
|
||||||
|
if (op_data->prev_set[role] != 0)
|
||||||
|
{
|
||||||
|
n_copies++;
|
||||||
|
}
|
||||||
|
else if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size)
|
||||||
|
{
|
||||||
|
op_data->degraded = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (n_copies <= op_data->pg_data_size)
|
||||||
|
{
|
||||||
|
// Nothing to compare, even if we'd like to
|
||||||
|
finish_op(cur_op, 0);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_size,
|
||||||
|
op_data->scheme != POOL_SCHEME_REPLICATED ? bs_block_size*(op_data->pg_size-op_data->pg_data_size) : 0);
|
||||||
|
// Submit reads
|
||||||
|
osd_op_t *subops = new osd_op_t[n_copies];
|
||||||
|
op_data->fact_ver = 0;
|
||||||
|
op_data->done = op_data->errors = op_data->errcode = 0;
|
||||||
|
op_data->n_subops = n_copies;
|
||||||
|
op_data->subops = subops;
|
||||||
|
int sent = submit_primary_subop_batch(SUBMIT_SCRUB_READ, op_data->oid.inode, op_data->target_ver,
|
||||||
|
op_data->stripes, op_data->prev_set, cur_op, 0, -1);
|
||||||
|
assert(sent == n_copies);
|
||||||
|
op_data->st = 1;
|
||||||
|
}
|
||||||
|
resume_1:
|
||||||
|
return;
|
||||||
|
resume_2:
|
||||||
|
if (op_data->errors > 0)
|
||||||
|
{
|
||||||
|
if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
|
||||||
|
{
|
||||||
|
// I/O or checksum error
|
||||||
|
int n_copies = 0;
|
||||||
|
for (int role = 0; role < op_data->pg_size; role++)
|
||||||
|
{
|
||||||
|
if (op_data->stripes[role].read_end != 0 &&
|
||||||
|
!op_data->stripes[role].read_error)
|
||||||
|
{
|
||||||
|
n_copies++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (n_copies <= op_data->pg_data_size)
|
||||||
|
{
|
||||||
|
// Nothing to compare, just mark the object as corrupted
|
||||||
|
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
|
||||||
|
// FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
|
||||||
|
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false);
|
||||||
|
// Operation is treated as unsuccessful only if the object becomes unreadable
|
||||||
|
finish_op(cur_op, n_copies < op_data->pg_data_size ? op_data->errcode : 0);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
// Proceed, we can still compare chunks that were successfully read
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
finish_op(cur_op, op_data->errcode);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (op_data->scheme == POOL_SCHEME_REPLICATED)
|
||||||
|
{
|
||||||
|
// Check that all chunks have returned the same data
|
||||||
|
int total = 0;
|
||||||
|
int eq_to[op_data->pg_size];
|
||||||
|
for (int role = 0; role < op_data->pg_size; role++)
|
||||||
|
{
|
||||||
|
eq_to[role] = -1;
|
||||||
|
if (op_data->stripes[role].read_end != 0 && !op_data->stripes[role].read_error)
|
||||||
|
{
|
||||||
|
total++;
|
||||||
|
eq_to[role] = role;
|
||||||
|
for (int other = 0; other < role; other++)
|
||||||
|
{
|
||||||
|
// Only compare with unique chunks (eq_to[other] == other)
|
||||||
|
if (eq_to[other] == other && memcmp(op_data->stripes[role].read_buf, op_data->stripes[other].read_buf, bs_block_size) == 0)
|
||||||
|
{
|
||||||
|
eq_to[role] = eq_to[other];
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int votes[op_data->pg_size];
|
||||||
|
for (int role = 0; role < op_data->pg_size; role++)
|
||||||
|
votes[role] = 0;
|
||||||
|
for (int role = 0; role < op_data->pg_size; role++)
|
||||||
|
{
|
||||||
|
if (eq_to[role] != -1)
|
||||||
|
votes[eq_to[role]]++;
|
||||||
|
}
|
||||||
|
int best = -1;
|
||||||
|
for (int role = 0; role < op_data->pg_size; role++)
|
||||||
|
{
|
||||||
|
if (best < 0 && votes[role] > 0 || votes[role] > votes[best])
|
||||||
|
best = role;
|
||||||
|
}
|
||||||
|
if (best > 0 && votes[best] < total)
|
||||||
|
{
|
||||||
|
// FIXME Add a flag to allow to skip such objects and not recover them automatically
|
||||||
|
bool unknown = false;
|
||||||
|
for (int role = 0; role < op_data->pg_size; role++)
|
||||||
|
{
|
||||||
|
if (role != best && votes[role] == votes[best])
|
||||||
|
unknown = true;
|
||||||
|
if (votes[role] > 0 && votes[role] < votes[best])
|
||||||
|
{
|
||||||
|
printf(
|
||||||
|
"[PG %u/%u] Object %lx:%lx copy on OSD %lu doesn't match %d other copies, marking it as corrupted\n",
|
||||||
|
INODE_POOL(op_data->oid.inode), op_data->pg_num,
|
||||||
|
op_data->oid.inode, op_data->oid.stripe, op_data->stripes[role].osd_num, votes[best]
|
||||||
|
);
|
||||||
|
op_data->stripes[role].read_error = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (unknown)
|
||||||
|
{
|
||||||
|
// It's unknown which replica is good. There are multiple versions with no majority
|
||||||
|
best = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
assert(op_data->scheme == POOL_SCHEME_EC || op_data->scheme == POOL_SCHEME_XOR);
|
||||||
|
if (op_data->degraded)
|
||||||
|
{
|
||||||
|
// Reconstruct missing stripes
|
||||||
|
// XOR shouldn't come here as it only has 1 parity chunk
|
||||||
|
assert(op_data->scheme == POOL_SCHEME_EC);
|
||||||
|
reconstruct_stripes_ec(op_data->stripes, op_data->pg_size, op_data->pg_data_size, clean_entry_bitmap_size);
|
||||||
|
}
|
||||||
|
// Generate parity chunks and compare them with actual data
|
||||||
|
osd_num_t fake_osd_set[op_data->pg_size];
|
||||||
|
for (int i = 0; i < op_data->pg_size; i++)
|
||||||
|
{
|
||||||
|
fake_osd_set[i] = 1;
|
||||||
|
op_data->stripes[i].write_buf = i >= op_data->pg_data_size
|
||||||
|
? ((uint8_t*)cur_op->buf + (i-op_data->pg_data_size)*bs_block_size)
|
||||||
|
: op_data->stripes[i].read_buf;
|
||||||
|
}
|
||||||
|
if (op_data->scheme == POOL_SCHEME_XOR)
|
||||||
|
{
|
||||||
|
calc_rmw_parity_xor(op_data->stripes, op_data->pg_size, fake_osd_set, fake_osd_set, bs_block_size, clean_entry_bitmap_size);
|
||||||
|
}
|
||||||
|
else if (op_data->scheme == POOL_SCHEME_EC)
|
||||||
|
{
|
||||||
|
calc_rmw_parity_ec(op_data->stripes, op_data->pg_size, op_data->pg_data_size, fake_osd_set, fake_osd_set, bs_block_size, clean_entry_bitmap_size);
|
||||||
|
}
|
||||||
|
// Now compare that write_buf == read_buf
|
||||||
|
for (int role = op_data->pg_data_size; role < op_data->pg_size; role++)
|
||||||
|
{
|
||||||
|
if (op_data->stripes[role].osd_num != 0 && !op_data->stripes[role].read_error &&
|
||||||
|
memcmp(op_data->stripes[role].read_buf, op_data->stripes[role].write_buf, bs_block_size) != 0)
|
||||||
|
{
|
||||||
|
// Chunks don't match - something's wrong... but we don't know what :D
|
||||||
|
// FIXME: Try to locate errors (may be possible with >= 2 parity chunks)
|
||||||
|
printf(
|
||||||
|
"[PG %u/%u] Object %lx:%lx parity chunk %d on OSD %lu doesn't match data, marking it as corrupted\n",
|
||||||
|
INODE_POOL(op_data->oid.inode), op_data->pg_num,
|
||||||
|
op_data->oid.inode, op_data->oid.stripe,
|
||||||
|
role-op_data->pg_data_size, op_data->stripes[role].osd_num
|
||||||
|
);
|
||||||
|
op_data->stripes[role].read_error = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int role = 0; role < op_data->pg_size; role++)
|
||||||
|
{
|
||||||
|
if (op_data->stripes[role].osd_num != 0 && !op_data->stripes[role].read_error)
|
||||||
|
{
|
||||||
|
// Got at least 1 read error or mismatch, mark the object as corrupted
|
||||||
|
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
|
||||||
|
// FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
|
||||||
|
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
finish_op(cur_op, 0);
|
||||||
|
}
|
Loading…
Reference in New Issue