|
|
|
@ -6,6 +6,12 @@
|
|
|
|
|
|
|
|
|
|
#include "messenger.h"
|
|
|
|
|
|
|
|
|
|
#include <linux/errqueue.h>
|
|
|
|
|
|
|
|
|
|
#ifndef MSG_ZEROCOPY
|
|
|
|
|
#define MSG_ZEROCOPY 0
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
|
|
|
|
{
|
|
|
|
|
assert(cur_op->peer_fd);
|
|
|
|
@ -36,6 +42,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
|
|
|
|
}
|
|
|
|
|
auto & to_send_list = cl->write_msg.msg_iovlen ? cl->next_send_list : cl->send_list;
|
|
|
|
|
auto & to_outbox = cl->write_msg.msg_iovlen ? cl->next_outbox : cl->outbox;
|
|
|
|
|
auto & to_size = cl->write_msg.msg_iovlen ? cl->next_outbox_size : cl->outbox_size;
|
|
|
|
|
if (cur_op->op_type == OSD_OP_IN)
|
|
|
|
|
{
|
|
|
|
|
measure_exec(cur_op);
|
|
|
|
@ -46,6 +53,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
|
|
|
|
to_send_list.push_back((iovec){ .iov_base = cur_op->req.buf, .iov_len = OSD_PACKET_SIZE });
|
|
|
|
|
cl->sent_ops[cur_op->req.hdr.id] = cur_op;
|
|
|
|
|
}
|
|
|
|
|
to_size += OSD_PACKET_SIZE;
|
|
|
|
|
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = MSGR_SENDP_HDR });
|
|
|
|
|
// Bitmap
|
|
|
|
|
if (cur_op->op_type == OSD_OP_IN &&
|
|
|
|
@ -57,6 +65,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
|
|
|
|
.iov_len = cur_op->reply.sec_rw.attr_len,
|
|
|
|
|
});
|
|
|
|
|
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
|
|
|
|
|
to_size += cur_op->reply.sec_rw.attr_len;
|
|
|
|
|
}
|
|
|
|
|
else if (cur_op->op_type == OSD_OP_OUT &&
|
|
|
|
|
(cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE || cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE) &&
|
|
|
|
@ -67,6 +76,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
|
|
|
|
.iov_len = cur_op->req.sec_rw.attr_len,
|
|
|
|
|
});
|
|
|
|
|
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
|
|
|
|
|
to_size += cur_op->req.sec_rw.attr_len;
|
|
|
|
|
}
|
|
|
|
|
// Operation data
|
|
|
|
|
if ((cur_op->op_type == OSD_OP_IN
|
|
|
|
@ -86,14 +96,21 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
|
|
|
|
assert(cur_op->iov.buf[i].iov_base);
|
|
|
|
|
to_send_list.push_back(cur_op->iov.buf[i]);
|
|
|
|
|
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
|
|
|
|
|
to_size += cur_op->iov.buf[i].iov_len;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
|
|
|
|
|
{
|
|
|
|
|
if (cur_op->op_type == OSD_OP_IN && cur_op->reply.hdr.retval > 0)
|
|
|
|
|
{
|
|
|
|
|
to_send_list.push_back((iovec){ .iov_base = cur_op->buf, .iov_len = (size_t)cur_op->reply.hdr.retval });
|
|
|
|
|
to_size += cur_op->reply.hdr.retval;
|
|
|
|
|
}
|
|
|
|
|
else if (cur_op->op_type == OSD_OP_OUT && cur_op->req.sec_read_bmp.len > 0)
|
|
|
|
|
{
|
|
|
|
|
to_send_list.push_back((iovec){ .iov_base = cur_op->buf, .iov_len = (size_t)cur_op->req.sec_read_bmp.len });
|
|
|
|
|
to_size += cur_op->req.sec_read_bmp.len;
|
|
|
|
|
}
|
|
|
|
|
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
|
|
|
|
|
}
|
|
|
|
|
if (cur_op->op_type == OSD_OP_IN)
|
|
|
|
@ -177,17 +194,19 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
|
|
|
|
|
}
|
|
|
|
|
cl->write_msg.msg_iov = cl->send_list.data();
|
|
|
|
|
cl->write_msg.msg_iovlen = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX;
|
|
|
|
|
cl->write_msg.msg_flags = (cl->zerocopy_send && (cl->outbox_size/cl->send_list.size()) >= 4096 ? MSG_ZEROCOPY : 0);
|
|
|
|
|
cl->refs++;
|
|
|
|
|
ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
|
|
|
|
data->callback = [this, cl](ring_data_t *data) { handle_send(data->res, cl); };
|
|
|
|
|
my_uring_prep_sendmsg(sqe, peer_fd, &cl->write_msg, 0);
|
|
|
|
|
my_uring_prep_sendmsg(sqe, peer_fd, &cl->write_msg, cl->write_msg.msg_flags);
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
cl->write_msg.msg_iov = cl->send_list.data();
|
|
|
|
|
cl->write_msg.msg_iovlen = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX;
|
|
|
|
|
cl->write_msg.msg_flags = (cl->zerocopy_send && (cl->outbox_size/cl->send_list.size()) >= 4096 ? MSG_ZEROCOPY : 0);
|
|
|
|
|
cl->refs++;
|
|
|
|
|
int result = sendmsg(peer_fd, &cl->write_msg, MSG_NOSIGNAL);
|
|
|
|
|
int result = sendmsg(peer_fd, &cl->write_msg, MSG_NOSIGNAL | cl->write_msg.msg_flags);
|
|
|
|
|
if (result < 0)
|
|
|
|
|
{
|
|
|
|
|
result = -errno;
|
|
|
|
@ -197,6 +216,62 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Processes the result of a recvmsg(MSG_ERRQUEUE) that was issued to collect
// MSG_ZEROCOPY completion notifications for client <cl>.
//
// <res> is the receive result: 0 when a control message was dequeued, or a
// negative errno value when the receive failed.
//
// The function first drops the client reference that was taken before the
// receive was submitted. Then, for every zero-copy completion range found in
// cl->zerocopy_notification_msg, it deletes the operations remembered in
// cl->zerocopy_sent whose send numbers fall into that range and removes them
// from the list.
void osd_messenger_t::handle_zerocopy_notification(osd_client_t *cl, int res)
{
    cl->refs--;
    if (cl->peer_state == PEER_STOPPED)
    {
        // The client was stopped while the receive was in flight:
        // destroy it once nobody else holds a reference
        if (cl->refs <= 0)
        {
            delete cl;
        }
        return;
    }
    if (res != 0)
    {
        // Nothing usable was dequeued (for example, the error queue was empty)
        return;
    }
    if (cl->zerocopy_notification_msg.msg_flags & MSG_CTRUNC)
    {
        fprintf(stderr, "zero-copy send notification truncated on client socket %d\n", cl->peer_fd);
        return;
    }
    auto & sent = cl->zerocopy_sent;
    for (struct cmsghdr *cm = CMSG_FIRSTHDR(&cl->zerocopy_notification_msg); cm; cm = CMSG_NXTHDR(&cl->zerocopy_notification_msg, cm))
    {
        // The kernel reports zero-copy completions as IP_RECVERR on IPv4 sockets
        // and as IPV6_RECVERR on IPv6 sockets, so both variants must be accepted -
        // otherwise operations sent to IPv6 peers would never be released
        const bool is_recverr =
            (cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) ||
            (cm->cmsg_level == SOL_IPV6 && cm->cmsg_type == IPV6_RECVERR);
        if (!is_recverr)
        {
            continue;
        }
        struct sock_extended_err *serr = (struct sock_extended_err*)CMSG_DATA(cm);
        if (serr->ee_errno != 0 || serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY)
        {
            continue;
        }
        // Completed sends are numbered serr->ee_info .. serr->ee_data (inclusive).
        // Skip entries that precede the reported range
        size_t start = 0;
        while (start < sent.size() && sent[start].nsend < serr->ee_info)
        {
            start++;
        }
        size_t end = start;
        if (serr->ee_data < serr->ee_info)
        {
            // The counter has wrapped around: first take the entries
            // up to the point where send numbers restart from low values
            while (end < sent.size() && sent[end].nsend >= sent[start].nsend)
            {
                end++;
            }
        }
        while (end < sent.size() && sent[end].nsend <= serr->ee_data)
        {
            end++;
        }
        if (end > start)
        {
            // The kernel no longer references these buffers, so the operations may be freed
            for (size_t i = start; i < end; i++)
            {
                delete sent[i].op;
            }
            sent.erase(sent.begin() + start, sent.begin() + end);
        }
    }
}
|
|
|
|
|
|
|
|
|
|
void osd_messenger_t::send_replies()
|
|
|
|
|
{
|
|
|
|
|
for (int i = 0; i < write_ready_clients.size(); i++)
|
|
|
|
@ -224,16 +299,19 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
|
|
|
|
}
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
if (result < 0 && result != -EAGAIN && result != -EINTR)
|
|
|
|
|
if (result < 0 && result != -EAGAIN && result != -EINTR && result != -ENOBUFS)
|
|
|
|
|
{
|
|
|
|
|
// this is a client socket, so don't panic. just disconnect it
|
|
|
|
|
fprintf(stderr, "Client %d socket write error: %d (%s). Disconnecting client\n", cl->peer_fd, -result, strerror(-result));
|
|
|
|
|
stop_client(cl->peer_fd);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
bool used_zerocopy = false;
|
|
|
|
|
if (result >= 0)
|
|
|
|
|
{
|
|
|
|
|
used_zerocopy = (cl->write_msg.msg_flags & MSG_ZEROCOPY) ? true : false;
|
|
|
|
|
int done = 0;
|
|
|
|
|
int bytes_written = result;
|
|
|
|
|
while (result > 0 && done < cl->send_list.size())
|
|
|
|
|
{
|
|
|
|
|
iovec & iov = cl->send_list[done];
|
|
|
|
@ -242,7 +320,19 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
|
|
|
|
if (cl->outbox[done].flags & MSGR_SENDP_FREE)
|
|
|
|
|
{
|
|
|
|
|
// Reply fully sent
|
|
|
|
|
delete cl->outbox[done].op;
|
|
|
|
|
if (!used_zerocopy)
|
|
|
|
|
{
|
|
|
|
|
delete cl->outbox[done].op;
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
// With zero-copy send the difference is that we must keep the buffer (i.e. the operation)
|
|
|
|
|
// allocated until we get send notification from MSG_ERRQUEUE
|
|
|
|
|
cl->zerocopy_sent.push_back((msgr_zc_not_t){
|
|
|
|
|
.op = cl->outbox[done].op,
|
|
|
|
|
.nsend = cl->zerocopy_notification_idx,
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
result -= iov.iov_len;
|
|
|
|
|
done++;
|
|
|
|
@ -254,6 +344,11 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (used_zerocopy)
|
|
|
|
|
{
|
|
|
|
|
cl->zerocopy_notification_idx++;
|
|
|
|
|
}
|
|
|
|
|
cl->outbox_size -= bytes_written;
|
|
|
|
|
if (done > 0)
|
|
|
|
|
{
|
|
|
|
|
cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+done);
|
|
|
|
@ -263,8 +358,10 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
|
|
|
|
{
|
|
|
|
|
cl->send_list.insert(cl->send_list.end(), cl->next_send_list.begin(), cl->next_send_list.end());
|
|
|
|
|
cl->outbox.insert(cl->outbox.end(), cl->next_outbox.begin(), cl->next_outbox.end());
|
|
|
|
|
cl->outbox_size += cl->next_outbox_size;
|
|
|
|
|
cl->next_send_list.clear();
|
|
|
|
|
cl->next_outbox.clear();
|
|
|
|
|
cl->next_outbox_size = 0;
|
|
|
|
|
}
|
|
|
|
|
cl->write_state = cl->outbox.size() > 0 ? CL_WRITE_READY : 0;
|
|
|
|
|
#ifdef WITH_RDMA
|
|
|
|
@ -287,4 +384,34 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
|
|
|
|
{
|
|
|
|
|
write_ready_clients.push_back(cl->peer_fd);
|
|
|
|
|
}
|
|
|
|
|
if (used_zerocopy && (cl->zerocopy_notification_idx-cl->zerocopy_notification_prev) >= 16 &&
|
|
|
|
|
cl->zerocopy_sent.size() > 0)
|
|
|
|
|
{
|
|
|
|
|
cl->zerocopy_notification_prev = cl->zerocopy_notification_idx;
|
|
|
|
|
cl->zerocopy_notification_msg = {
|
|
|
|
|
.msg_control = cl->zerocopy_notification_buf,
|
|
|
|
|
.msg_controllen = sizeof(cl->zerocopy_notification_buf),
|
|
|
|
|
};
|
|
|
|
|
cl->refs++;
|
|
|
|
|
io_uring_sqe* sqe = NULL;
|
|
|
|
|
if (ringloop && !use_sync_send_recv)
|
|
|
|
|
{
|
|
|
|
|
sqe = ringloop->get_sqe();
|
|
|
|
|
}
|
|
|
|
|
if (!sqe)
|
|
|
|
|
{
|
|
|
|
|
int res = recvmsg(cl->peer_fd, &cl->zerocopy_notification_msg, MSG_ERRQUEUE|MSG_DONTWAIT);
|
|
|
|
|
if (res < 0)
|
|
|
|
|
{
|
|
|
|
|
res = -errno;
|
|
|
|
|
}
|
|
|
|
|
handle_zerocopy_notification(cl, res);
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
|
|
|
|
data->callback = [this, cl](ring_data_t *data) { handle_zerocopy_notification(cl, data->res); };
|
|
|
|
|
my_uring_prep_recvmsg(sqe, cl->peer_fd, &cl->zerocopy_notification_msg, MSG_ERRQUEUE);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|