Allow scrub to fix corrupted object states
parent
4bfd994341
commit
6ca20aa194
|
@ -73,7 +73,10 @@ Input:
|
|||
write request is copied into the metadata area bitwise and stored there.
|
||||
|
||||
Output:
|
||||
- retval = number of bytes actually read/written or negative error number (-EINVAL or -ENOSPC)
|
||||
- retval = number of bytes actually read/written or negative error number
|
||||
-EINVAL = invalid input parameters
|
||||
-ENOENT = requested object/version does not exist for reads
|
||||
-ENOSPC = no space left in the store for writes
|
||||
- version = the version actually read or written
|
||||
|
||||
## BS_OP_DELETE
|
||||
|
|
|
@ -124,10 +124,8 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
|||
bool dirty_found = (dirty_it != dirty_db.end() && dirty_it->first.oid == read_op->oid);
|
||||
if (!clean_found && !dirty_found)
|
||||
{
|
||||
// region is not allocated - return zeroes
|
||||
memset(read_op->buf, 0, read_op->len);
|
||||
read_op->version = 0;
|
||||
read_op->retval = read_op->len;
|
||||
read_op->retval = -ENOENT;
|
||||
FINISH_OP(read_op);
|
||||
return 2;
|
||||
}
|
||||
|
@ -149,8 +147,10 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
|||
if (IS_DELETE(dirty.state))
|
||||
{
|
||||
assert(!result_version);
|
||||
clean_found = false;
|
||||
break;
|
||||
read_op->version = 0;
|
||||
read_op->retval = -ENOENT;
|
||||
FINISH_OP(read_op);
|
||||
return 2;
|
||||
}
|
||||
if (!result_version)
|
||||
{
|
||||
|
@ -238,12 +238,19 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
|
|||
}
|
||||
}
|
||||
}
|
||||
else if (fulfilled < read_op->len)
|
||||
if (!result_version)
|
||||
{
|
||||
// fill remaining parts with zeroes
|
||||
assert(fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0));
|
||||
// May happen if there are entries in dirty_db but all of them are !version_ok
|
||||
read_op->version = 0;
|
||||
read_op->retval = -ENOENT;
|
||||
FINISH_OP(read_op);
|
||||
return 2;
|
||||
}
|
||||
if (fulfilled < read_op->len)
|
||||
{
|
||||
assert(fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0));
|
||||
assert(fulfilled == read_op->len);
|
||||
}
|
||||
assert(fulfilled == read_op->len);
|
||||
read_op->version = result_version;
|
||||
if (!PRIV(read_op)->pending_ops)
|
||||
{
|
||||
|
|
|
@ -329,21 +329,40 @@ pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_os
|
|||
}
|
||||
// Mark object chunk(s) as corrupted
|
||||
int changes = 0;
|
||||
for (auto & chunk: corrupted_set)
|
||||
for (auto chunk_it = corrupted_set.begin(); chunk_it != corrupted_set.end(); )
|
||||
{
|
||||
bool corrupted = stripes[chunk.role].osd_num == chunk.osd_num && stripes[chunk.role].read_error;
|
||||
if (corrupted)
|
||||
auto & chunk = *chunk_it;
|
||||
if (stripes[chunk.role].osd_num == chunk.osd_num)
|
||||
{
|
||||
if (!(chunk.loc_bad & LOC_CORRUPTED))
|
||||
if (stripes[chunk.role].not_exists)
|
||||
{
|
||||
changes++;
|
||||
chunk.loc_bad |= LOC_CORRUPTED;
|
||||
corrupted_set.erase(chunk_it, chunk_it+1);
|
||||
continue;
|
||||
}
|
||||
if (stripes[chunk.role].read_error && chunk.loc_bad != LOC_CORRUPTED)
|
||||
{
|
||||
changes++;
|
||||
chunk.loc_bad = LOC_CORRUPTED;
|
||||
}
|
||||
else if (stripes[chunk.role].read_end > 0 && !stripes[chunk.role].missing &&
|
||||
(chunk.loc_bad & LOC_CORRUPTED))
|
||||
{
|
||||
changes++;
|
||||
chunk.loc_bad &= ~LOC_CORRUPTED;
|
||||
}
|
||||
}
|
||||
else if (inconsistent && !(chunk.loc_bad & LOC_OUTDATED))
|
||||
if (inconsistent && !chunk.loc_bad)
|
||||
{
|
||||
if (!(chunk.loc_bad & LOC_INCONSISTENT))
|
||||
changes++;
|
||||
changes++;
|
||||
chunk.loc_bad |= LOC_INCONSISTENT;
|
||||
}
|
||||
else if (!inconsistent && (chunk.loc_bad & LOC_INCONSISTENT))
|
||||
{
|
||||
changes++;
|
||||
chunk.loc_bad &= ~LOC_INCONSISTENT;
|
||||
}
|
||||
chunk_it++;
|
||||
}
|
||||
if (!changes)
|
||||
{
|
||||
|
|
|
@ -344,6 +344,13 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
|
|||
else
|
||||
expected = 0;
|
||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||
if (retval == -ENOENT && opcode == OSD_OP_SEC_READ)
|
||||
{
|
||||
// ENOENT is not an error for almost all reads, except scrub
|
||||
retval = expected;
|
||||
memset(((osd_rmw_stripe_t*)subop->rmw_buf)->read_buf, 0, expected);
|
||||
((osd_rmw_stripe_t*)subop->rmw_buf)->not_exists = true;
|
||||
}
|
||||
if (retval == expected && (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE))
|
||||
{
|
||||
uint64_t version = subop->reply.sec_rw.version;
|
||||
|
|
|
@ -1131,12 +1131,14 @@ std::vector<int> ec_find_good(osd_rmw_stripe_t *stripes, int pg_size, int pg_min
|
|||
uint32_t chunk_size, uint32_t bitmap_size, int max_bruteforce)
|
||||
{
|
||||
std::vector<int> found_valid;
|
||||
int cur_live[pg_size], live_count = 0;
|
||||
int cur_live[pg_size], live_count = 0, exists_count = 0;
|
||||
osd_num_t fake_osd_set[pg_size];
|
||||
for (int role = 0; role < pg_size; role++)
|
||||
{
|
||||
if (!stripes[role].missing)
|
||||
{
|
||||
if (!stripes[role].not_exists)
|
||||
exists_count++;
|
||||
cur_live[live_count++] = role;
|
||||
fake_osd_set[role] = role+1;
|
||||
}
|
||||
|
@ -1145,6 +1147,14 @@ std::vector<int> ec_find_good(osd_rmw_stripe_t *stripes, int pg_size, int pg_min
|
|||
{
|
||||
return std::vector<int>();
|
||||
}
|
||||
if (exists_count <= pg_minsize)
|
||||
{
|
||||
// Special case: user manually deleted some chunks
|
||||
for (int role = 0; role < pg_size; role++)
|
||||
if (!stripes[role].missing && !stripes[role].not_exists)
|
||||
found_valid.push_back(role);
|
||||
return found_valid;
|
||||
}
|
||||
// Try to locate errors using brute force if there aren't too many combinations
|
||||
osd_rmw_stripe_t brute_stripes[pg_size];
|
||||
int out_count = live_count-pg_minsize;
|
||||
|
|
|
@ -30,6 +30,7 @@ struct osd_rmw_stripe_t
|
|||
osd_num_t osd_num;
|
||||
bool missing: 1;
|
||||
bool read_error: 1;
|
||||
bool not_exists: 1;
|
||||
};
|
||||
|
||||
// Here pg_minsize is the number of data chunks, not the minimum number of alive OSDs for the PG to operate
|
||||
|
|
|
@ -455,7 +455,8 @@ resume_2:
|
|||
for (int role = 0; role < op_data->pg_size; role++)
|
||||
{
|
||||
eq_to[role] = -1;
|
||||
if (op_data->stripes[role].read_end != 0 && !op_data->stripes[role].missing)
|
||||
if (op_data->stripes[role].read_end != 0 && !op_data->stripes[role].missing &&
|
||||
!op_data->stripes[role].not_exists)
|
||||
{
|
||||
total++;
|
||||
eq_to[role] = role;
|
||||
|
@ -560,7 +561,9 @@ resume_2:
|
|||
}
|
||||
for (int role = 0; role < op_data->pg_size; role++)
|
||||
{
|
||||
if (op_data->stripes[role].osd_num != 0 && op_data->stripes[role].read_error || inconsistent)
|
||||
if (op_data->stripes[role].osd_num != 0 &&
|
||||
(op_data->stripes[role].read_error || op_data->stripes[role].not_exists) ||
|
||||
inconsistent)
|
||||
{
|
||||
// Got at least 1 read error or mismatch, mark the object as corrupted
|
||||
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
|
||||
|
|
Loading…
Reference in New Issue