Allow scrub to fix corrupted object states

test-double-alloc
Vitaliy Filippov 2023-04-21 02:05:19 +03:00
parent 4bfd994341
commit 6ca20aa194
7 changed files with 71 additions and 21 deletions

View File

@ -73,7 +73,10 @@ Input:
write request is copied into the metadata area bitwise and stored there. write request is copied into the metadata area bitwise and stored there.
Output: Output:
- retval = number of bytes actually read/written or negative error number (-EINVAL or -ENOSPC) - retval = number of bytes actually read/written or a negative error number:
-EINVAL = invalid input parameters
-ENOENT = requested object/version does not exist for reads
-ENOSPC = no space left in the store for writes
- version = the version actually read or written - version = the version actually read or written
## BS_OP_DELETE ## BS_OP_DELETE

View File

@ -124,10 +124,8 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
bool dirty_found = (dirty_it != dirty_db.end() && dirty_it->first.oid == read_op->oid); bool dirty_found = (dirty_it != dirty_db.end() && dirty_it->first.oid == read_op->oid);
if (!clean_found && !dirty_found) if (!clean_found && !dirty_found)
{ {
// region is not allocated - return zeroes
memset(read_op->buf, 0, read_op->len);
read_op->version = 0; read_op->version = 0;
read_op->retval = read_op->len; read_op->retval = -ENOENT;
FINISH_OP(read_op); FINISH_OP(read_op);
return 2; return 2;
} }
@ -149,8 +147,10 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
if (IS_DELETE(dirty.state)) if (IS_DELETE(dirty.state))
{ {
assert(!result_version); assert(!result_version);
clean_found = false; read_op->version = 0;
break; read_op->retval = -ENOENT;
FINISH_OP(read_op);
return 2;
} }
if (!result_version) if (!result_version)
{ {
@ -238,12 +238,19 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
} }
} }
} }
else if (fulfilled < read_op->len) if (!result_version)
{ {
// fill remaining parts with zeroes // May happen if there are entries in dirty_db but all of them are !version_ok
assert(fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0)); read_op->version = 0;
read_op->retval = -ENOENT;
FINISH_OP(read_op);
return 2;
}
if (fulfilled < read_op->len)
{
assert(fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0));
assert(fulfilled == read_op->len);
} }
assert(fulfilled == read_op->len);
read_op->version = result_version; read_op->version = result_version;
if (!PRIV(read_op)->pending_ops) if (!PRIV(read_op)->pending_ops)
{ {

View File

@ -329,21 +329,40 @@ pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_os
} }
// Mark object chunk(s) as corrupted // Mark object chunk(s) as corrupted
int changes = 0; int changes = 0;
for (auto & chunk: corrupted_set) for (auto chunk_it = corrupted_set.begin(); chunk_it != corrupted_set.end(); )
{ {
bool corrupted = stripes[chunk.role].osd_num == chunk.osd_num && stripes[chunk.role].read_error; auto & chunk = *chunk_it;
if (corrupted) if (stripes[chunk.role].osd_num == chunk.osd_num)
{ {
if (!(chunk.loc_bad & LOC_CORRUPTED)) if (stripes[chunk.role].not_exists)
{
changes++; changes++;
chunk.loc_bad |= LOC_CORRUPTED; corrupted_set.erase(chunk_it, chunk_it+1);
continue;
}
if (stripes[chunk.role].read_error && chunk.loc_bad != LOC_CORRUPTED)
{
changes++;
chunk.loc_bad = LOC_CORRUPTED;
}
else if (stripes[chunk.role].read_end > 0 && !stripes[chunk.role].missing &&
(chunk.loc_bad & LOC_CORRUPTED))
{
changes++;
chunk.loc_bad &= ~LOC_CORRUPTED;
}
} }
else if (inconsistent && !(chunk.loc_bad & LOC_OUTDATED)) if (inconsistent && !chunk.loc_bad)
{ {
if (!(chunk.loc_bad & LOC_INCONSISTENT)) changes++;
changes++;
chunk.loc_bad |= LOC_INCONSISTENT; chunk.loc_bad |= LOC_INCONSISTENT;
} }
else if (!inconsistent && (chunk.loc_bad & LOC_INCONSISTENT))
{
changes++;
chunk.loc_bad &= ~LOC_INCONSISTENT;
}
chunk_it++;
} }
if (!changes) if (!changes)
{ {

View File

@ -344,6 +344,13 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
else else
expected = 0; expected = 0;
osd_primary_op_data_t *op_data = cur_op->op_data; osd_primary_op_data_t *op_data = cur_op->op_data;
if (retval == -ENOENT && opcode == OSD_OP_SEC_READ)
{
// ENOENT is not an error for almost all reads, except scrub
retval = expected;
memset(((osd_rmw_stripe_t*)subop->rmw_buf)->read_buf, 0, expected);
((osd_rmw_stripe_t*)subop->rmw_buf)->not_exists = true;
}
if (retval == expected && (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE)) if (retval == expected && (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE))
{ {
uint64_t version = subop->reply.sec_rw.version; uint64_t version = subop->reply.sec_rw.version;

View File

@ -1131,12 +1131,14 @@ std::vector<int> ec_find_good(osd_rmw_stripe_t *stripes, int pg_size, int pg_min
uint32_t chunk_size, uint32_t bitmap_size, int max_bruteforce) uint32_t chunk_size, uint32_t bitmap_size, int max_bruteforce)
{ {
std::vector<int> found_valid; std::vector<int> found_valid;
int cur_live[pg_size], live_count = 0; int cur_live[pg_size], live_count = 0, exists_count = 0;
osd_num_t fake_osd_set[pg_size]; osd_num_t fake_osd_set[pg_size];
for (int role = 0; role < pg_size; role++) for (int role = 0; role < pg_size; role++)
{ {
if (!stripes[role].missing) if (!stripes[role].missing)
{ {
if (!stripes[role].not_exists)
exists_count++;
cur_live[live_count++] = role; cur_live[live_count++] = role;
fake_osd_set[role] = role+1; fake_osd_set[role] = role+1;
} }
@ -1145,6 +1147,14 @@ std::vector<int> ec_find_good(osd_rmw_stripe_t *stripes, int pg_size, int pg_min
{ {
return std::vector<int>(); return std::vector<int>();
} }
if (exists_count <= pg_minsize)
{
// Special case: user manually deleted some chunks
for (int role = 0; role < pg_size; role++)
if (!stripes[role].missing && !stripes[role].not_exists)
found_valid.push_back(role);
return found_valid;
}
// Try to locate errors using brute force if there aren't too many combinations // Try to locate errors using brute force if there aren't too many combinations
osd_rmw_stripe_t brute_stripes[pg_size]; osd_rmw_stripe_t brute_stripes[pg_size];
int out_count = live_count-pg_minsize; int out_count = live_count-pg_minsize;

View File

@ -30,6 +30,7 @@ struct osd_rmw_stripe_t
osd_num_t osd_num; osd_num_t osd_num;
bool missing: 1; bool missing: 1;
bool read_error: 1; bool read_error: 1;
bool not_exists: 1;
}; };
// Here pg_minsize is the number of data chunks, not the minimum number of alive OSDs for the PG to operate // Here pg_minsize is the number of data chunks, not the minimum number of alive OSDs for the PG to operate

View File

@ -455,7 +455,8 @@ resume_2:
for (int role = 0; role < op_data->pg_size; role++) for (int role = 0; role < op_data->pg_size; role++)
{ {
eq_to[role] = -1; eq_to[role] = -1;
if (op_data->stripes[role].read_end != 0 && !op_data->stripes[role].missing) if (op_data->stripes[role].read_end != 0 && !op_data->stripes[role].missing &&
!op_data->stripes[role].not_exists)
{ {
total++; total++;
eq_to[role] = role; eq_to[role] = role;
@ -560,7 +561,9 @@ resume_2:
} }
for (int role = 0; role < op_data->pg_size; role++) for (int role = 0; role < op_data->pg_size; role++)
{ {
if (op_data->stripes[role].osd_num != 0 && op_data->stripes[role].read_error || inconsistent) if (op_data->stripes[role].osd_num != 0 &&
(op_data->stripes[role].read_error || op_data->stripes[role].not_exists) ||
inconsistent)
{ {
// Got at least 1 read error or mismatch, mark the object as corrupted // Got at least 1 read error or mismatch, mark the object as corrupted
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num }); auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });