Fix "can't get SQE, will fall out of sync with EPOLLET" when overflowing the ring

OSDs shouldn't crash or hang with long iodepths anymore
Vitaliy Filippov 2020-10-30 01:06:34 +03:00
parent 2ccb75974b
commit 23ea409081
3 changed files with 17 additions and 9 deletions

View File

@ -357,9 +357,6 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
## Known Problems
- OSDs may currently crash with "can't get SQE, will fall out of sync with EPOLLET"
if you try to load them with very long iodepths, because the io_uring queue (ring) has a limited size
and OSDs don't check whether it fills up.
- Object deletion requests may currently lead to 'incomplete' objects if your OSDs crash during
deletion, because proper handling of object cleanup in a cluster should be "three-phase"
and that is not implemented yet. The inode removal tool currently can't handle unclean

View File

@ -122,9 +122,6 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
{
return true;
}
cl->write_msg.msg_iov = cl->send_list.data();
cl->write_msg.msg_iovlen = cl->send_list.size();
cl->refs++;
if (ringloop && !use_sync_send_recv)
{
io_uring_sqe* sqe = ringloop->get_sqe();
@ -132,12 +129,18 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
{
return false;
}
cl->write_msg.msg_iov = cl->send_list.data();
cl->write_msg.msg_iovlen = cl->send_list.size();
cl->refs++;
ring_data_t* data = ((ring_data_t*)sqe->user_data);
data->callback = [this, cl](ring_data_t *data) { handle_send(data->res, cl); };
my_uring_prep_sendmsg(sqe, peer_fd, &cl->write_msg, 0);
}
else
{
cl->write_msg.msg_iov = cl->send_list.data();
cl->write_msg.msg_iovlen = cl->send_list.size();
cl->refs++;
int result = sendmsg(peer_fd, &cl->write_msg, MSG_NOSIGNAL);
if (result < 0)
{

View File

@ -66,10 +66,18 @@ void ring_loop_t::loop()
struct ring_data_t *d = (struct ring_data_t*)cqe->user_data;
if (d->callback)
{
d->res = cqe->res;
d->callback(d);
// First free ring_data item, then call the callback
// so it has at least 1 free slot for the next event
// which is required for EPOLLET to function properly
struct ring_data_t dl;
dl.iov = d->iov;
dl.res = cqe->res;
dl.callback.swap(d->callback);
free_ring_data[free_ring_data_ptr++] = d - ring_datas;
dl.callback(&dl);
}
free_ring_data[free_ring_data_ptr++] = d - ring_datas;
else
free_ring_data[free_ring_data_ptr++] = d - ring_datas;
io_uring_cqe_seen(&ring, cqe);
}
while (get_sqe_queue.size() > 0)