Fix OSDs possibly dying with "map::at" errors when other OSDs are stopped
parent
9d3ba113aa
commit
61ebed144a
|
@ -96,7 +96,8 @@ struct rm_inode_t
|
|||
{
|
||||
osd_op_t *op = new osd_op_t();
|
||||
op->op_type = OSD_OP_OUT;
|
||||
op->peer_fd = parent->cli->msgr.osd_peer_fds[cur_list->rm_osd_num];
|
||||
// Existence was already checked above; at() is used anyway so that a missing entry throws instead of being silently inserted by operator[]
|
||||
op->peer_fd = parent->cli->msgr.osd_peer_fds.at(cur_list->rm_osd_num);
|
||||
op->req = (osd_any_op_t){
|
||||
.rw = {
|
||||
.header = {
|
||||
|
|
|
@ -200,7 +200,8 @@ void cluster_client_t::send_list(inode_list_osd_t *cur_list)
|
|||
auto & pool_cfg = st_cli.pool_config[cur_list->pg->lst->pool_id];
|
||||
osd_op_t *op = new osd_op_t();
|
||||
op->op_type = OSD_OP_OUT;
|
||||
op->peer_fd = msgr.osd_peer_fds[cur_list->osd_num];
|
||||
// Existence was already checked above; at() is used anyway so that a missing entry throws instead of being silently inserted by operator[]
|
||||
op->peer_fd = msgr.osd_peer_fds.at(cur_list->osd_num);
|
||||
op->req = (osd_any_op_t){
|
||||
.sec_list = {
|
||||
.header = {
|
||||
|
|
|
@ -211,7 +211,7 @@ class osd_t
|
|||
// flushing, recovery and backfill
|
||||
void submit_pg_flush_ops(pg_t & pg);
|
||||
void handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t peer_osd, int retval);
|
||||
void submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data);
|
||||
bool submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data);
|
||||
bool pick_next_recovery(osd_recovery_op_t &op);
|
||||
void submit_recovery_op(osd_recovery_op_t *op);
|
||||
bool continue_recovery();
|
||||
|
|
|
@ -47,7 +47,8 @@ void osd_t::submit_pg_flush_ops(pg_t & pg)
|
|||
if (l.second.size() > 0)
|
||||
{
|
||||
fb->flush_ops++;
|
||||
submit_flush_op(pg.pool_id, pg.pg_num, fb, true, l.first, l.second.size(), l.second.data());
|
||||
if (!submit_flush_op(pg.pool_id, pg.pg_num, fb, true, l.first, l.second.size(), l.second.data()))
|
||||
return;
|
||||
}
|
||||
}
|
||||
for (auto & l: fb->stable_lists)
|
||||
|
@ -55,7 +56,8 @@ void osd_t::submit_pg_flush_ops(pg_t & pg)
|
|||
if (l.second.size() > 0)
|
||||
{
|
||||
fb->flush_ops++;
|
||||
submit_flush_op(pg.pool_id, pg.pg_num, fb, false, l.first, l.second.size(), l.second.data());
|
||||
if (!submit_flush_op(pg.pool_id, pg.pg_num, fb, false, l.first, l.second.size(), l.second.data()))
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -160,7 +162,7 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
|
|||
}
|
||||
}
|
||||
|
||||
void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data)
|
||||
bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data)
|
||||
{
|
||||
osd_op_t *op = new osd_op_t();
|
||||
// Copy buffer so it gets freed along with the operation
|
||||
|
@ -188,10 +190,8 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
|
|||
else
|
||||
{
|
||||
// Peer
|
||||
int peer_fd = msgr.osd_peer_fds[peer_osd];
|
||||
op->op_type = OSD_OP_OUT;
|
||||
op->iov.push_back(op->buf, count * sizeof(obj_ver_id));
|
||||
op->peer_fd = peer_fd;
|
||||
op->req = (osd_any_op_t){
|
||||
.sec_stab = {
|
||||
.header = {
|
||||
|
@ -207,8 +207,21 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
|
|||
handle_flush_op(op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK, pool_id, pg_num, fb, peer_osd, op->reply.hdr.retval);
|
||||
delete op;
|
||||
};
|
||||
msgr.outbox_push(op);
|
||||
auto peer_fd_it = msgr.osd_peer_fds.find(peer_osd);
|
||||
if (peer_fd_it != msgr.osd_peer_fds.end())
|
||||
{
|
||||
op->peer_fd = peer_fd_it->second;
|
||||
msgr.outbox_push(op);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Fail it immediately
|
||||
op->reply.hdr.retval = -EPIPE;
|
||||
op->callback(op);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
|
||||
|
|
|
@ -340,7 +340,7 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p
|
|||
else
|
||||
{
|
||||
// Peer
|
||||
auto & cl = msgr.clients.at(msgr.osd_peer_fds[role_osd]);
|
||||
auto & cl = msgr.clients.at(msgr.osd_peer_fds.at(role_osd));
|
||||
osd_op_t *op = new osd_op_t();
|
||||
op->op_type = OSD_OP_OUT;
|
||||
op->peer_fd = cl->peer_fd;
|
||||
|
@ -419,7 +419,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
|||
// Peer
|
||||
osd_op_t *op = new osd_op_t();
|
||||
op->op_type = OSD_OP_OUT;
|
||||
op->peer_fd = msgr.osd_peer_fds[role_osd];
|
||||
op->peer_fd = msgr.osd_peer_fds.at(role_osd);
|
||||
op->req = (osd_any_op_t){
|
||||
.sec_list = {
|
||||
.header = {
|
||||
|
|
|
@ -246,7 +246,6 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
|
|||
// Send to a remote OSD
|
||||
osd_op_t *subop = op_data->subops+subop_idx;
|
||||
subop->op_type = OSD_OP_OUT;
|
||||
subop->peer_fd = msgr.osd_peer_fds.at(subop_osd_num);
|
||||
// FIXME: Use the pre-allocated buffer
|
||||
subop->buf = malloc_or_die(sizeof(obj_ver_id)*(i+1-prev));
|
||||
subop->req = (osd_any_op_t){
|
||||
|
@ -287,7 +286,18 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
|
|||
}
|
||||
handle_primary_subop(subop, cur_op);
|
||||
};
|
||||
msgr.outbox_push(subop);
|
||||
auto peer_fd_it = msgr.osd_peer_fds.find(subop_osd_num);
|
||||
if (peer_fd_it != msgr.osd_peer_fds.end())
|
||||
{
|
||||
subop->peer_fd = peer_fd_it->second;
|
||||
msgr.outbox_push(subop);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Fail it immediately
|
||||
subop->reply.hdr.retval = -EPIPE;
|
||||
subop->callback(subop);
|
||||
}
|
||||
subop_idx++;
|
||||
}
|
||||
prev = i+1;
|
||||
|
|
|
@ -182,7 +182,6 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
|
|||
else
|
||||
{
|
||||
subop->op_type = OSD_OP_OUT;
|
||||
subop->peer_fd = msgr.osd_peer_fds.at(role_osd_num);
|
||||
subop->bitmap = stripes[stripe_num].bmp_buf;
|
||||
subop->bitmap_len = clean_entry_bitmap_size;
|
||||
subop->req.sec_rw = {
|
||||
|
@ -225,7 +224,18 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
|
|||
{
|
||||
handle_primary_subop(subop, cur_op);
|
||||
};
|
||||
msgr.outbox_push(subop);
|
||||
auto peer_fd_it = msgr.osd_peer_fds.find(role_osd_num);
|
||||
if (peer_fd_it != msgr.osd_peer_fds.end())
|
||||
{
|
||||
subop->peer_fd = peer_fd_it->second;
|
||||
msgr.outbox_push(subop);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Fail it immediately
|
||||
subop->reply.hdr.retval = -EPIPE;
|
||||
subop->callback(subop);
|
||||
}
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
@ -463,7 +473,6 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
|
|||
else
|
||||
{
|
||||
subops[i].op_type = OSD_OP_OUT;
|
||||
subops[i].peer_fd = msgr.osd_peer_fds.at(chunk.osd_num);
|
||||
subops[i].req = (osd_any_op_t){ .sec_del = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
|
@ -477,7 +486,18 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
|
|||
{
|
||||
handle_primary_subop(subop, cur_op);
|
||||
};
|
||||
msgr.outbox_push(&subops[i]);
|
||||
auto peer_fd_it = msgr.osd_peer_fds.find(chunk.osd_num);
|
||||
if (peer_fd_it != msgr.osd_peer_fds.end())
|
||||
{
|
||||
subops[i].peer_fd = peer_fd_it->second;
|
||||
msgr.outbox_push(&subops[i]);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Fail it immediately
|
||||
subops[i].reply.hdr.retval = -EPIPE;
|
||||
subops[i].callback(&subops[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -567,7 +587,6 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
|
|||
else
|
||||
{
|
||||
subops[i].op_type = OSD_OP_OUT;
|
||||
subops[i].peer_fd = msgr.osd_peer_fds.at(stab_osd.osd_num);
|
||||
subops[i].req = (osd_any_op_t){ .sec_stab = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
|
@ -581,7 +600,18 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
|
|||
{
|
||||
handle_primary_subop(subop, cur_op);
|
||||
};
|
||||
msgr.outbox_push(&subops[i]);
|
||||
auto peer_fd_it = msgr.osd_peer_fds.find(stab_osd.osd_num);
|
||||
if (peer_fd_it != msgr.osd_peer_fds.end())
|
||||
{
|
||||
subops[i].peer_fd = peer_fd_it->second;
|
||||
msgr.outbox_push(&subops[i]);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Fail it immediately
|
||||
subops[i].reply.hdr.retval = -EPIPE;
|
||||
subops[i].callback(&subops[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue