Retry failed reads (including chained and RMW) from other replicas

test-double-alloc
Vitaliy Filippov 2023-02-27 02:12:55 +03:00
parent bf2112653b
commit 8dc427b43c
9 changed files with 319 additions and 61 deletions

View File

@ -240,7 +240,8 @@ class osd_t
void continue_primary_sync(osd_op_t *cur_op);
void continue_primary_del(osd_op_t *cur_op);
bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
void remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t &pg);
void remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t &pg, bool report = true);
pg_osd_set_state_t *mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state, osd_rmw_stripe_t *stripes, bool ref);
void deref_object_state(pg_t & pg, pg_osd_set_state_t **object_state, bool deref);
bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);
@ -256,10 +257,11 @@ class osd_t
int submit_primary_sync_subops(osd_op_t *cur_op);
void submit_primary_stab_subops(osd_op_t *cur_op);
uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state);
uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, pg_osd_set_state_t **object_state);
void continue_chained_read(osd_op_t *cur_op);
int submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op);
void check_corrupted_chained(pg_t & pg, osd_op_t *cur_op);
void send_chained_read_results(pg_t & pg, osd_op_t *cur_op);
std::vector<osd_chain_read_t> collect_chained_read_requests(osd_op_t *cur_op);
int collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vector<bitmap_request_t> & bitmap_requests);

View File

@ -310,20 +310,17 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
if (osd_op->reply.hdr.retval < 0)
{
// Error recovering object
if (osd_op->reply.hdr.retval == -EPIPE)
{
// PG is stopped or one of the OSDs is gone, error is harmless
printf(
"[PG %u/%u] Recovery operation failed with object %lx:%lx\n",
INODE_POOL(op->oid.inode),
map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size),
op->oid.inode, op->oid.stripe
);
}
else
{
throw std::runtime_error("Failed to recover an object");
}
// EPIPE is totally harmless (peer is gone), others like EIO/EDOM may be not
printf(
"[PG %u/%u] Recovery operation failed with object %lx:%lx: error %ld\n",
INODE_POOL(op->oid.inode),
map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size),
op->oid.inode, op->oid.stripe, osd_op->reply.hdr.retval
);
}
else if (log_level > 2)
{
printf("Recovery operation done for %lx:%lx\n", op->oid.inode, op->oid.stripe);
}
// CAREFUL! op = &recovery_ops[op->oid]. Don't access op->* after recovery_ops.erase()
op->osd_op = NULL;

View File

@ -336,7 +336,7 @@ pg_osd_set_state_t* pg_t::add_object_to_state(const object_id oid, const uint64_
{
for (auto & o: osd_set)
{
if (!(o.loc_bad & LOC_OUTDATED))
if (!o.loc_bad)
{
read_target.push_back(o.osd_num);
}

View File

@ -90,6 +90,8 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
chain_size * (
// - copy of the chain
sizeof(inode_t) +
// - object states for every chain item
sizeof(void*) +
// - bitmap buffers for chained read
stripe_count * clean_entry_bitmap_size +
// - 'missing' flags for chained reads
@ -117,6 +119,8 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
{
op_data->read_chain = (inode_t*)data_buf;
data_buf = (uint8_t*)data_buf + sizeof(inode_t) * chain_size;
op_data->chain_states = (pg_osd_set_state_t**)data_buf;
data_buf = (uint8_t*)data_buf + sizeof(pg_osd_set_state_t*) * chain_size;
op_data->snapshot_bitmaps = data_buf;
data_buf = (uint8_t*)data_buf + chain_size * stripe_count * clean_entry_bitmap_size;
op_data->missing_flags = (uint8_t*)data_buf;
@ -131,6 +135,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
inode_it->second.parent_id != cur_op->req.rw.inode)
{
op_data->read_chain[chain_num++] = inode_it->second.parent_id;
op_data->chain_states[chain_num++] = NULL;
inode_it = st_cli.inode_config.find(inode_it->second.parent_id);
}
}
@ -138,12 +143,12 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
return true;
}
uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state)
uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, pg_osd_set_state_t **object_state)
{
if (!(pg.state & (PG_HAS_INCOMPLETE | PG_HAS_DEGRADED | PG_HAS_MISPLACED)))
{
*object_state = NULL;
return def;
return pg.cur_set.data();
}
auto st_it = pg.incomplete_objects.find(oid);
if (st_it != pg.incomplete_objects.end())
@ -164,7 +169,7 @@ uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_
return st_it->second->read_target.data();
}
*object_state = NULL;
return def;
return pg.cur_set.data();
}
void osd_t::continue_primary_read(osd_op_t *cur_op)
@ -183,6 +188,7 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
goto resume_1;
else if (op_data->st == 2)
goto resume_2;
resume_0:
cur_op->reply.rw.bitmap_len = 0;
{
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
@ -206,15 +212,17 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
// Determine version
auto vo_it = pg.ver_override.find(op_data->oid);
op_data->target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
op_data->prev_set = pg.cur_set.data();
if (pg.state != PG_ACTIVE)
{
// PG may be degraded or have misplaced objects
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
}
// PG may have degraded or misplaced objects
op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED)
{
// Fast happy-path
if (op_data->scheme == POOL_SCHEME_REPLICATED &&
op_data->object_state && (op_data->object_state->state & OBJ_INCOMPLETE))
{
finish_op(cur_op, -EIO);
return;
}
cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_data_size, 0);
submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, op_data->prev_set, cur_op);
op_data->st = 1;
@ -240,6 +248,14 @@ resume_1:
resume_2:
if (op_data->errors > 0)
{
if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
{
// I/O or checksum error
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
// FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false);
goto resume_0;
}
finish_op(cur_op, op_data->errcode);
return;
}
@ -278,15 +294,129 @@ resume_2:
finish_op(cur_op, cur_op->req.rw.len);
}
pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state, osd_rmw_stripe_t *stripes, bool ref)
{
pg_osd_set_state_t *object_state = NULL;
get_object_osd_set(pg, oid, &object_state);
if (prev_object_state != object_state)
{
// Object state changed in between by a parallel I/O operation, skip marking as failed
if (ref)
{
deref_object_state(pg, &prev_object_state, ref);
if (object_state)
object_state->ref_count++;
}
return object_state;
}
pg_osd_set_t corrupted_set;
if (object_state)
{
corrupted_set = object_state->osd_set;
}
else
{
for (int i = 0; i < pg.cur_set.size(); i++)
{
corrupted_set.push_back((pg_obj_loc_t){
.role = (pg.scheme == POOL_SCHEME_REPLICATED ? 0 : (uint64_t)i),
.osd_num = pg.cur_set[i],
});
}
}
// Mark object chunk(s) as corrupted
uint64_t has_roles = 0, n_roles = 0, n_copies = 0, n_corrupted = 0;
for (auto & chunk: corrupted_set)
{
bool corrupted = stripes[chunk.role].osd_num == chunk.osd_num && stripes[chunk.role].read_error;
if (corrupted && !(chunk.loc_bad & LOC_CORRUPTED))
n_corrupted++;
chunk.loc_bad = chunk.loc_bad | (corrupted ? LOC_CORRUPTED : 0);
if (!chunk.loc_bad)
{
if (pg.scheme == POOL_SCHEME_REPLICATED)
n_roles = 1;
else if (!(has_roles & (1 << chunk.role)))
{
n_roles++;
has_roles |= (1 << chunk.role);
}
n_copies++;
}
}
if (!n_corrupted)
{
// No chunks newly marked as corrupted - object is already marked or moved
return object_state;
}
int old_pg_state = pg.state;
if (object_state)
{
remove_object_from_state(oid, &object_state, pg, false);
deref_object_state(pg, &object_state, ref);
}
// Calculate object state
uint64_t obj_state = OBJ_CORRUPTED;
int pg_state_bits = PG_HAS_CORRUPTED;
this->corrupted_objects++;
pg.corrupted_count++;
if (log_level > 1)
{
printf("Marking object %lx:%lx corrupted: %lu chunks / %lu copies available, %lu corrupted\n",
oid.inode, oid.stripe, n_roles, n_copies, n_corrupted);
}
if (n_roles < pg.pg_data_size)
{
this->incomplete_objects++;
obj_state |= OBJ_INCOMPLETE;
pg_state_bits = PG_HAS_INCOMPLETE;
}
else if (n_roles < pg.pg_cursize)
{
this->degraded_objects++;
obj_state |= OBJ_DEGRADED;
pg_state_bits = PG_HAS_DEGRADED;
}
else
{
this->misplaced_objects++;
obj_state |= OBJ_MISPLACED;
pg_state_bits = PG_HAS_MISPLACED;
}
pg.state |= pg_state_bits;
if (pg.state != old_pg_state)
{
report_pg_state(pg);
if ((pg.state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED)) !=
(old_pg_state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED)))
{
peering_state = peering_state | OSD_RECOVERING;
if ((pg.state & PG_HAS_DEGRADED) != (old_pg_state & PG_HAS_DEGRADED))
{
// Restart recovery from degraded objects
recovery_last_degraded = true;
recovery_last_pg = {};
recovery_last_oid = {};
}
ringloop->wakeup();
}
}
// Insert object into the new state and retry
object_state = pg.add_object_to_state(oid, obj_state, corrupted_set);
if (ref)
object_state->ref_count++;
return object_state;
}
// Decrement pg_osd_set_state_t's object_count and change PG state accordingly
void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t & pg)
void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t & pg, bool report)
{
if (!*object_state)
{
return;
}
pg_osd_set_state_t *recheck_state = NULL;
get_object_osd_set(pg, oid, NULL, &recheck_state);
get_object_osd_set(pg, oid, &recheck_state);
if (recheck_state != *object_state)
{
recheck_state->ref_count++;
@ -295,6 +425,12 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **objec
return;
}
(*object_state)->object_count--;
if ((*object_state)->state & OBJ_CORRUPTED)
{
this->corrupted_objects--;
pg.corrupted_count--;
}
bool changed = false;
if ((*object_state)->state & OBJ_INCOMPLETE)
{
// Successful write means that object is not incomplete anymore
@ -303,7 +439,7 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **objec
if (!pg.incomplete_objects.size())
{
pg.state = pg.state & ~PG_HAS_INCOMPLETE;
report_pg_state(pg);
changed = true;
}
}
else if ((*object_state)->state & OBJ_DEGRADED)
@ -313,7 +449,7 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **objec
if (!pg.degraded_objects.size())
{
pg.state = pg.state & ~PG_HAS_DEGRADED;
report_pg_state(pg);
changed = true;
}
}
else if ((*object_state)->state & OBJ_MISPLACED)
@ -323,13 +459,17 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **objec
if (!pg.misplaced_objects.size())
{
pg.state = pg.state & ~PG_HAS_MISPLACED;
report_pg_state(pg);
changed = true;
}
}
else
{
throw std::runtime_error("BUG: Invalid object state: "+std::to_string((*object_state)->state));
}
if (changed && report)
{
report_pg_state(pg);
}
}
void osd_t::deref_object_state(pg_t & pg, pg_osd_set_state_t **object_state, bool deref)
@ -374,13 +514,14 @@ void osd_t::continue_primary_del(osd_op_t *cur_op)
}
resume_1:
// Determine which OSDs contain this object and delete it
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
if (op_data->object_state)
{
op_data->object_state->ref_count++;
}
// Submit 1 read to determine the actual version number
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
op_data->prev_set = NULL;
resume_2:
op_data->st = 2;
return;

View File

@ -50,6 +50,7 @@ struct osd_primary_op_data_t
// for read_bitmaps
void *snapshot_bitmaps;
inode_t *read_chain;
pg_osd_set_state_t **chain_states;
uint8_t *missing_flags;
int chain_size;
osd_chain_read_t *chain_reads;

View File

@ -40,10 +40,24 @@ resume_3:
resume_4:
if (op_data->errors > 0)
{
free(op_data->chain_reads);
op_data->chain_reads = NULL;
finish_op(cur_op, op_data->errcode);
return;
if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
{
// Handle corrupted reads and retry...
check_corrupted_chained(pg, cur_op);
free(cur_op->buf);
cur_op->buf = NULL;
free(op_data->chain_reads);
op_data->chain_reads = NULL;
// FIXME: We can in theory retry only specific parts instead of the whole operation
goto resume_1;
}
else
{
free(op_data->chain_reads);
op_data->chain_reads = NULL;
finish_op(cur_op, op_data->errcode);
return;
}
}
send_chained_read_results(pg, cur_op);
finish_op(cur_op, cur_op->req.rw.len);
@ -131,8 +145,7 @@ int osd_t::collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vector<bitm
object_id cur_oid = { .inode = op_data->read_chain[chain_num], .stripe = op_data->oid.stripe };
auto vo_it = pg.ver_override.find(cur_oid);
uint64_t target_version = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
pg_osd_set_state_t *object_state;
uint64_t* cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
uint64_t* cur_set = get_object_osd_set(pg, cur_oid, &op_data->chain_states[chain_num]);
if (pg.scheme == POOL_SCHEME_REPLICATED)
{
osd_num_t read_target = 0;
@ -247,6 +260,7 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
osd_op_t *subop = op_data->subops+subop_idx;
subop->op_type = OSD_OP_OUT;
// FIXME: Use the pre-allocated buffer
assert(!subop->buf);
subop->buf = malloc_or_die(sizeof(obj_ver_id)*(i+1-prev));
subop->req = (osd_any_op_t){
.sec_read_bmp = {
@ -375,6 +389,8 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
op_data->chain_read_count = chain_reads.size();
op_data->chain_reads = (osd_chain_read_t*)calloc_or_die(
1, sizeof(osd_chain_read_t) * chain_reads.size()
// FIXME: Allocate only <chain_reads.size()> instead of <chain_size> stripes
// (but it's slightly harder to handle in send_chained_read_results())
+ sizeof(osd_rmw_stripe_t) * stripe_count * op_data->chain_size
);
osd_rmw_stripe_t *chain_stripes = (osd_rmw_stripe_t*)(
@ -403,8 +419,7 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
uint64_t *cur_set = pg.cur_set.data();
if (pg.state != PG_ACTIVE)
{
pg_osd_set_state_t *object_state;
cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
cur_set = get_object_osd_set(pg, cur_oid, &op_data->chain_states[chain_reads[cri].chain_pos]);
if (op_data->scheme != POOL_SCHEME_REPLICATED)
{
if (extend_missing_stripes(stripes, cur_set, pg.pg_data_size, pg.pg_size) < 0)
@ -416,6 +431,17 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
}
op_data->degraded = 1;
}
else
{
auto cur_state = op_data->chain_states[chain_reads[cri].chain_pos];
if (cur_state && (cur_state->state & OBJ_INCOMPLETE))
{
free(op_data->chain_reads);
op_data->chain_reads = NULL;
finish_op(cur_op, -EIO);
return -1;
}
}
}
if (op_data->scheme == POOL_SCHEME_REPLICATED)
{
@ -433,6 +459,7 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
}
}
}
assert(!cur_op->buf);
cur_op->buf = memalign_or_die(MEM_ALIGNMENT, read_buffer_size);
void *cur_buf = cur_op->buf;
for (int cri = 0; cri < chain_reads.size(); cri++)
@ -468,12 +495,8 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
object_id cur_oid = { .inode = chain_reads[cri].inode, .stripe = op_data->oid.stripe };
auto vo_it = pg.ver_override.find(cur_oid);
uint64_t target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
uint64_t *cur_set = pg.cur_set.data();
if (pg.state != PG_ACTIVE)
{
pg_osd_set_state_t *object_state;
cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
}
auto cur_state = op_data->chain_states[chain_reads[cri].chain_pos];
uint64_t *cur_set = (pg.state != PG_ACTIVE && cur_state ? cur_state->read_target.data() : pg.cur_set.data());
int zero_read = -1;
if (op_data->scheme == POOL_SCHEME_REPLICATED)
{
@ -487,6 +510,33 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
return 0;
}
void osd_t::check_corrupted_chained(pg_t & pg, osd_op_t *cur_op)
{
osd_primary_op_data_t *op_data = cur_op->op_data;
int stripe_count = (pg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size);
osd_rmw_stripe_t *chain_stripes = (osd_rmw_stripe_t*)(
(uint8_t*)op_data->chain_reads + sizeof(osd_chain_read_t) * op_data->chain_read_count
);
for (int cri = 0; cri < op_data->chain_read_count; cri++)
{
object_id cur_oid = { .inode = op_data->chain_reads[cri].inode, .stripe = op_data->oid.stripe };
osd_rmw_stripe_t *stripes = chain_stripes + op_data->chain_reads[cri].chain_pos*stripe_count;
bool corrupted = false;
for (int i = 0; i < stripe_count; i++)
{
if (stripes[i].read_error)
{
corrupted = true;
break;
}
}
if (corrupted)
{
mark_object_corrupted(pg, cur_oid, op_data->chain_states[op_data->chain_reads[cri].chain_pos], stripes, false);
}
}
}
void osd_t::send_chained_read_results(pg_t & pg, osd_op_t *cur_op)
{
osd_primary_op_data_t *op_data = cur_op->op_data;

View File

@ -147,9 +147,9 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
continue;
}
osd_num_t role_osd_num = osd_set[role];
int stripe_num = rep ? 0 : role;
if (role_osd_num != 0)
{
int stripe_num = rep ? 0 : role;
osd_op_t *subop = op_data->subops + i;
uint32_t subop_len = wr
? stripes[stripe_num].write_end - stripes[stripe_num].write_start
@ -158,12 +158,16 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
{
subop_len = 0;
}
stripes[stripe_num].osd_num = role_osd_num;
stripes[stripe_num].read_error = false;
subop->bitmap = stripes[stripe_num].bmp_buf;
subop->bitmap_len = clean_entry_bitmap_size;
// Using rmw_buf to pass pointer to stripes. Dirty but should work
subop->rmw_buf = stripes+stripe_num;
if (role_osd_num == this->osd_num)
{
clock_gettime(CLOCK_REALTIME, &subop->tv_begin);
subop->op_type = (uint64_t)cur_op;
subop->bitmap = stripes[stripe_num].bmp_buf;
subop->bitmap_len = clean_entry_bitmap_size;
subop->bs_op = new blockstore_op_t({
.opcode = (uint64_t)(wr ? (rep ? BS_OP_WRITE_STABLE : BS_OP_WRITE) : BS_OP_READ),
.callback = [subop, this](blockstore_op_t *bs_subop)
@ -192,8 +196,6 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
else
{
subop->op_type = OSD_OP_OUT;
subop->bitmap = stripes[stripe_num].bmp_buf;
subop->bitmap_len = clean_entry_bitmap_size;
subop->req.sec_rw = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
@ -250,6 +252,10 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
}
i++;
}
else
{
stripes[stripe_num].osd_num = 0;
}
}
return i-subop_idx;
}
@ -339,9 +345,11 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE)
{
printf(
"%s subop to %lx:%lx v%lu failed on peer %d: retval = %d (expected %d)\n",
subop->peer_fd >= 0
? "%1$s subop to %2$lx:%3$lx v%4$lu failed on peer %7$d: retval = %5$d (expected %6$d)\n"
: "%1$s subop to %2$lx:%3$lx v%4$lu failed locally: retval = %5$d (expected %6$d)\n",
osd_op_names[opcode], subop->req.sec_rw.oid.inode, subop->req.sec_rw.oid.stripe, subop->req.sec_rw.version,
subop->peer_fd, retval, expected
retval, expected, subop->peer_fd
);
}
else
@ -351,22 +359,32 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
osd_op_names[opcode], subop->peer_fd, retval, expected
);
}
// Error priority: EIO > ENOSPC > EPIPE
if (op_data->errcode == 0 || retval == -EIO ||
if (opcode == OSD_OP_SEC_READ && (retval == -EIO || retval == -EDOM))
{
// We'll retry reads from other replica(s) on EIO/EDOM and mark object as corrupted
((osd_rmw_stripe_t*)subop->rmw_buf)->read_error = true;
}
subop->rmw_buf = NULL;
// Error priority: EIO > EDOM > ENOSPC > EPIPE
if (op_data->errcode == 0 ||
retval == -EIO ||
retval == -EDOM && (op_data->errcode == -ENOSPC || op_data->errcode == -EPIPE) ||
retval == -ENOSPC && op_data->errcode == -EPIPE)
{
op_data->errcode = retval;
}
op_data->errors++;
if (subop->peer_fd >= 0 && (opcode != OSD_OP_SEC_WRITE && opcode != OSD_OP_SEC_WRITE_STABLE ||
retval != -ENOSPC))
if (subop->peer_fd >= 0 && retval != -EDOM &&
(retval != -ENOSPC || opcode != OSD_OP_SEC_WRITE && opcode != OSD_OP_SEC_WRITE_STABLE) &&
(retval != -EIO || opcode != OSD_OP_SEC_READ))
{
// Drop connection on any error expect ENOSPC
// Drop connection on unexpected errors
msgr.stop_client(subop->peer_fd);
}
}
else
{
subop->rmw_buf = NULL;
op_data->done++;
if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE)
{

View File

@ -58,12 +58,13 @@ resume_1:
// Determine blocks to read and write
// Missing chunks are allowed to be overwritten even in incomplete objects
// FIXME: Allow to do small writes to the old (degraded/misplaced) OSD set for lower performance impact
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
if (op_data->object_state)
{
// Protect object_state from being freed by a parallel read operation changing it
op_data->object_state->ref_count++;
}
retry_1:
if (op_data->scheme == POOL_SCHEME_REPLICATED)
{
// Simplified algorithm
@ -73,6 +74,12 @@ resume_1:
if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
op_data->stripes[0].write_end != bs_block_size))
{
if (op_data->object_state->state & OBJ_INCOMPLETE)
{
// Refuse partial overwrite of an incomplete (corrupted) object
cur_op->reply.hdr.retval = -EIO;
goto continue_others;
}
// Object is degraded/misplaced and will be moved to <write_osd_set>
op_data->stripes[0].read_start = 0;
op_data->stripes[0].read_end = bs_block_size;
@ -91,13 +98,53 @@ resume_1:
}
}
// Read required blocks
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
{
if (op_data->object_state && (op_data->object_state->state & OBJ_INCOMPLETE))
{
// Allow to read version number (just version number!) from corrupted chunks
// to allow full overwrite of a corrupted object
bool found = false;
for (int role = 0; role < op_data->pg_size; role++)
{
if (op_data->prev_set[role] != 0 || op_data->stripes[role].read_end > op_data->stripes[role].read_start)
{
found = true;
break;
}
}
if (!found)
{
osd_num_t corrupted_target[op_data->pg_size];
for (int role = 0; role < op_data->pg_size; role++)
{
corrupted_target[role] = 0;
}
for (auto & loc: op_data->object_state->osd_set)
{
if (!(loc.loc_bad & LOC_OUTDATED) && !corrupted_target[loc.role])
{
corrupted_target[loc.role] = loc.osd_num;
}
}
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, corrupted_target, cur_op);
goto resume_2;
}
}
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
}
resume_2:
op_data->st = 2;
return;
resume_3:
if (op_data->errors > 0)
{
if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
{
// Mark object corrupted and retry
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, true);
op_data->prev_set = op_data->object_state ? op_data->object_state->read_target.data() : pg.cur_set.data();
goto retry_1;
}
deref_object_state(pg, &op_data->object_state, true);
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
return;

View File

@ -26,7 +26,9 @@ struct osd_rmw_stripe_t
// read_end=UINT32_MAX means to only read bitmap, but not data
uint32_t read_start, read_end;
uint32_t write_start, write_end;
bool missing;
osd_num_t osd_num;
bool missing: 1;
bool read_error: 1;
};
// Here pg_minsize is the number of data chunks, not the minimum number of alive OSDs for the PG to operate