Allow scrub to fix corrupted object states

test-double-alloc
Vitaliy Filippov 2023-04-21 02:05:19 +03:00
parent 4bfd994341
commit 6ca20aa194
7 changed files with 71 additions and 21 deletions

View File

@ -73,7 +73,10 @@ Input:
write request is copied into the metadata area bitwise and stored there.
Output:
- retval = number of bytes actually read/written or negative error number (-EINVAL or -ENOSPC)
- retval = number of bytes actually read/written, or a negative error number:
-EINVAL = invalid input parameters
-ENOENT = requested object/version does not exist (reads only)
-ENOSPC = no space left in the store (writes only)
- version = the version actually read or written
## BS_OP_DELETE

View File

@ -124,10 +124,8 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
bool dirty_found = (dirty_it != dirty_db.end() && dirty_it->first.oid == read_op->oid);
if (!clean_found && !dirty_found)
{
// region is not allocated - return zeroes
memset(read_op->buf, 0, read_op->len);
read_op->version = 0;
read_op->retval = read_op->len;
read_op->retval = -ENOENT;
FINISH_OP(read_op);
return 2;
}
@ -149,8 +147,10 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
if (IS_DELETE(dirty.state))
{
assert(!result_version);
clean_found = false;
break;
read_op->version = 0;
read_op->retval = -ENOENT;
FINISH_OP(read_op);
return 2;
}
if (!result_version)
{
@ -238,12 +238,19 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
}
}
}
else if (fulfilled < read_op->len)
if (!result_version)
{
// fill remaining parts with zeroes
assert(fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0));
// May happen if there are entries in dirty_db but all of them are !version_ok
read_op->version = 0;
read_op->retval = -ENOENT;
FINISH_OP(read_op);
return 2;
}
if (fulfilled < read_op->len)
{
assert(fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0));
assert(fulfilled == read_op->len);
}
assert(fulfilled == read_op->len);
read_op->version = result_version;
if (!PRIV(read_op)->pending_ops)
{

View File

@ -329,21 +329,40 @@ pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_os
}
// Mark object chunk(s) as corrupted
int changes = 0;
for (auto & chunk: corrupted_set)
for (auto chunk_it = corrupted_set.begin(); chunk_it != corrupted_set.end(); )
{
bool corrupted = stripes[chunk.role].osd_num == chunk.osd_num && stripes[chunk.role].read_error;
if (corrupted)
auto & chunk = *chunk_it;
if (stripes[chunk.role].osd_num == chunk.osd_num)
{
if (!(chunk.loc_bad & LOC_CORRUPTED))
if (stripes[chunk.role].not_exists)
{
changes++;
chunk.loc_bad |= LOC_CORRUPTED;
corrupted_set.erase(chunk_it, chunk_it+1);
continue;
}
if (stripes[chunk.role].read_error && chunk.loc_bad != LOC_CORRUPTED)
{
changes++;
chunk.loc_bad = LOC_CORRUPTED;
}
else if (stripes[chunk.role].read_end > 0 && !stripes[chunk.role].missing &&
(chunk.loc_bad & LOC_CORRUPTED))
{
changes++;
chunk.loc_bad &= ~LOC_CORRUPTED;
}
}
else if (inconsistent && !(chunk.loc_bad & LOC_OUTDATED))
if (inconsistent && !chunk.loc_bad)
{
if (!(chunk.loc_bad & LOC_INCONSISTENT))
changes++;
changes++;
chunk.loc_bad |= LOC_INCONSISTENT;
}
else if (!inconsistent && (chunk.loc_bad & LOC_INCONSISTENT))
{
changes++;
chunk.loc_bad &= ~LOC_INCONSISTENT;
}
chunk_it++;
}
if (!changes)
{

View File

@ -344,6 +344,13 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
else
expected = 0;
osd_primary_op_data_t *op_data = cur_op->op_data;
if (retval == -ENOENT && opcode == OSD_OP_SEC_READ)
{
// ENOENT is not an error for almost all reads, except scrub
retval = expected;
memset(((osd_rmw_stripe_t*)subop->rmw_buf)->read_buf, 0, expected);
((osd_rmw_stripe_t*)subop->rmw_buf)->not_exists = true;
}
if (retval == expected && (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE))
{
uint64_t version = subop->reply.sec_rw.version;

View File

@ -1131,12 +1131,14 @@ std::vector<int> ec_find_good(osd_rmw_stripe_t *stripes, int pg_size, int pg_min
uint32_t chunk_size, uint32_t bitmap_size, int max_bruteforce)
{
std::vector<int> found_valid;
int cur_live[pg_size], live_count = 0;
int cur_live[pg_size], live_count = 0, exists_count = 0;
osd_num_t fake_osd_set[pg_size];
for (int role = 0; role < pg_size; role++)
{
if (!stripes[role].missing)
{
if (!stripes[role].not_exists)
exists_count++;
cur_live[live_count++] = role;
fake_osd_set[role] = role+1;
}
@ -1145,6 +1147,14 @@ std::vector<int> ec_find_good(osd_rmw_stripe_t *stripes, int pg_size, int pg_min
{
return std::vector<int>();
}
if (exists_count <= pg_minsize)
{
// Special case: user manually deleted some chunks
for (int role = 0; role < pg_size; role++)
if (!stripes[role].missing && !stripes[role].not_exists)
found_valid.push_back(role);
return found_valid;
}
// Try to locate errors using brute force if there aren't too many combinations
osd_rmw_stripe_t brute_stripes[pg_size];
int out_count = live_count-pg_minsize;

View File

@ -30,6 +30,7 @@ struct osd_rmw_stripe_t
osd_num_t osd_num;
bool missing: 1;
bool read_error: 1;
bool not_exists: 1;
};
// Here pg_minsize is the number of data chunks, not the minimum number of alive OSDs for the PG to operate

View File

@ -455,7 +455,8 @@ resume_2:
for (int role = 0; role < op_data->pg_size; role++)
{
eq_to[role] = -1;
if (op_data->stripes[role].read_end != 0 && !op_data->stripes[role].missing)
if (op_data->stripes[role].read_end != 0 && !op_data->stripes[role].missing &&
!op_data->stripes[role].not_exists)
{
total++;
eq_to[role] = role;
@ -560,7 +561,9 @@ resume_2:
}
for (int role = 0; role < op_data->pg_size; role++)
{
if (op_data->stripes[role].osd_num != 0 && op_data->stripes[role].read_error || inconsistent)
if (op_data->stripes[role].osd_num != 0 &&
(op_data->stripes[role].read_error || op_data->stripes[role].not_exists) ||
inconsistent)
{
// Got at least 1 read error or mismatch, mark the object as corrupted
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });