Experiment: zero-copy TCP send

Vitaliy Filippov 2022-02-03 02:39:37 +03:00
parent 2697aae909
commit 385aca9d44
5 changed files with 193 additions and 14 deletions

View File

@ -30,6 +30,18 @@
будут использоваться обычные синхронные системные вызовы send/recv. Для OSD будут использоваться обычные синхронные системные вызовы send/recv. Для OSD
это бессмысленно, так как OSD в любом случае нуждается в io_uring, но, в это бессмысленно, так как OSD в любом случае нуждается в io_uring, но, в
принципе, это может применяться для клиентов со старыми версиями ядра. принципе, это может применяться для клиентов со старыми версиями ядра.
- name: use_zerocopy_send
type: bool
default: false
info: |
If true, OSDs and clients will attempt to use TCP zero-copy send
(MSG_ZEROCOPY) for big buffers. It's recommended to raise net.ipv4.tcp_wmem
and net.core.wmem_max sysctls when using this mode.
info_ru: |
Если установлено в true, то OSD и клиенты будут стараться использовать
TCP-отправку без копирования (MSG_ZEROCOPY) для больших буферов данных.
Рекомендуется поднять значения sysctl net.ipv4.tcp_wmem и net.core.wmem_max
при использовании этого режима.
- name: use_rdma - name: use_rdma
type: bool type: bool
default: true default: true

View File

@ -64,6 +64,7 @@ const etcd_tree = {
// client and osd // client and osd
tcp_header_buffer_size: 65536, tcp_header_buffer_size: 65536,
use_sync_send_recv: false, use_sync_send_recv: false,
use_zerocopy_send: false,
use_rdma: true, use_rdma: true,
rdma_device: null, // for example, "rocep5s0f0" rdma_device: null, // for example, "rocep5s0f0"
rdma_port_num: 1, rdma_port_num: 1,

View File

@ -39,6 +39,12 @@ void osd_messenger_t::init()
handle_rdma_events(); handle_rdma_events();
} }
} }
#endif
#ifndef SO_ZEROCOPY
if (log_level > 0)
{
fprintf(stderr, "Zero-copy TCP send is not supported in this build, ignoring\n");
}
#endif #endif
keepalive_timer_id = tfd->set_timer(1000, true, [this](int) keepalive_timer_id = tfd->set_timer(1000, true, [this](int)
{ {
@ -162,6 +168,8 @@ void osd_messenger_t::parse_config(const json11::Json & config)
this->receive_buffer_size = 65536; this->receive_buffer_size = 65536;
this->use_sync_send_recv = config["use_sync_send_recv"].bool_value() || this->use_sync_send_recv = config["use_sync_send_recv"].bool_value() ||
config["use_sync_send_recv"].uint64_value(); config["use_sync_send_recv"].uint64_value();
this->use_zerocopy_send = config["use_zerocopy_send"].bool_value() ||
config["use_zerocopy_send"].uint64_value();
this->peer_connect_interval = config["peer_connect_interval"].uint64_value(); this->peer_connect_interval = config["peer_connect_interval"].uint64_value();
if (!this->peer_connect_interval) if (!this->peer_connect_interval)
this->peer_connect_interval = 5; this->peer_connect_interval = 5;
@ -288,8 +296,7 @@ void osd_messenger_t::handle_connect_epoll(int peer_fd)
on_connect_peer(peer_osd, -result); on_connect_peer(peer_osd, -result);
return; return;
} }
int one = 1; set_socket_options(cl);
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
cl->peer_state = PEER_CONNECTED; cl->peer_state = PEER_CONNECTED;
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events) tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
{ {
@ -299,6 +306,23 @@ void osd_messenger_t::handle_connect_epoll(int peer_fd)
check_peer_config(cl); check_peer_config(cl);
} }
// Apply per-connection socket options to a client's socket.
//
// TCP_NODELAY is always requested. When the build has SO_ZEROCOPY and the
// messenger-wide use_zerocopy_send flag is on, SO_ZEROCOPY is requested too and
// the outcome is recorded in cl->zerocopy_send. A failure to enable it is only
// logged (at log_level > 0) and leaves cl->zerocopy_send untouched.
void osd_messenger_t::set_socket_options(osd_client_t *cl)
{
    int enable = 1;
    // Best-effort: the result of this call is intentionally not checked
    setsockopt(cl->peer_fd, SOL_TCP, TCP_NODELAY, &enable, sizeof(enable));
#ifdef SO_ZEROCOPY
    if (use_zerocopy_send)
    {
        const bool zc_enabled = setsockopt(cl->peer_fd, SOL_SOCKET, SO_ZEROCOPY, &enable, sizeof(enable)) == 0;
        if (zc_enabled)
        {
            cl->zerocopy_send = true;
        }
        else if (log_level > 0)
        {
            fprintf(stderr, "[OSD %lu] Failed to enable zero-copy send for client %d: %s\n", this->osd_num, cl->peer_fd, strerror(errno));
        }
    }
    else
    {
        // Zero-copy send is switched off in the configuration
        cl->zerocopy_send = false;
    }
#endif
}
void osd_messenger_t::handle_peer_epoll(int peer_fd, int epoll_events) void osd_messenger_t::handle_peer_epoll(int peer_fd, int epoll_events)
{ {
// Mark client as ready (i.e. some data is available) // Mark client as ready (i.e. some data is available)
@ -493,14 +517,13 @@ void osd_messenger_t::accept_connections(int listen_fd)
fprintf(stderr, "[OSD %lu] new client %d: connection from %s\n", this->osd_num, peer_fd, fprintf(stderr, "[OSD %lu] new client %d: connection from %s\n", this->osd_num, peer_fd,
addr_to_string(addr).c_str()); addr_to_string(addr).c_str());
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK); fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
int one = 1; auto cl = clients[peer_fd] = new osd_client_t();
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one)); cl->peer_addr = addr;
clients[peer_fd] = new osd_client_t(); cl->peer_port = ntohs(((sockaddr_in*)&addr)->sin_port);
clients[peer_fd]->peer_addr = addr; cl->peer_fd = peer_fd;
clients[peer_fd]->peer_port = ntohs(((sockaddr_in*)&addr)->sin_port); cl->peer_state = PEER_CONNECTED;
clients[peer_fd]->peer_fd = peer_fd; cl->in_buf = malloc_or_die(receive_buffer_size);
clients[peer_fd]->peer_state = PEER_CONNECTED; set_socket_options(cl);
clients[peer_fd]->in_buf = malloc_or_die(receive_buffer_size);
// Add FD to epoll // Add FD to epoll
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events) tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
{ {

View File

@ -45,6 +45,12 @@ struct msgr_sendp_t
int flags; int flags;
}; };
// Bookkeeping entry for an operation whose final buffer was sent with
// MSG_ZEROCOPY. handle_send() appends one of these instead of deleting the
// operation, and handle_zerocopy_notification() deletes the operation once a
// completion notification covering `nsend` is read from the socket error queue.
struct msgr_zc_not_t
{
// Operation kept alive until its zero-copy send is acknowledged
osd_op_t *op;
// Value of the client's zerocopy_notification_idx when the entry was queued;
// compared against the ee_info..ee_data range of a notification
uint32_t nsend;
};
struct osd_client_t struct osd_client_t
{ {
int refs = 0; int refs = 0;
@ -57,6 +63,7 @@ struct osd_client_t
int ping_time_remaining = 0; int ping_time_remaining = 0;
int idle_time_remaining = 0; int idle_time_remaining = 0;
osd_num_t osd_num = 0; osd_num_t osd_num = 0;
bool zerocopy_send = false;
void *in_buf = NULL; void *in_buf = NULL;
@ -87,6 +94,12 @@ struct osd_client_t
int write_state = 0; int write_state = 0;
std::vector<iovec> send_list, next_send_list; std::vector<iovec> send_list, next_send_list;
std::vector<msgr_sendp_t> outbox, next_outbox; std::vector<msgr_sendp_t> outbox, next_outbox;
std::vector<msgr_zc_not_t> zerocopy_sent;
uint64_t outbox_size = 0, next_outbox_size = 0;
uint32_t zerocopy_notification_idx = 0;
uint32_t zerocopy_notification_prev = 0;
uint8_t zerocopy_notification_buf[256];
struct msghdr zerocopy_notification_msg;
~osd_client_t() ~osd_client_t()
{ {
@ -127,6 +140,7 @@ protected:
int osd_ping_timeout = 0; int osd_ping_timeout = 0;
int log_level = 0; int log_level = 0;
bool use_sync_send_recv = false; bool use_sync_send_recv = false;
bool use_zerocopy_send = false;
#ifdef WITH_RDMA #ifdef WITH_RDMA
bool use_rdma = true; bool use_rdma = true;
@ -181,10 +195,12 @@ protected:
void check_peer_config(osd_client_t *cl); void check_peer_config(osd_client_t *cl);
void cancel_osd_ops(osd_client_t *cl); void cancel_osd_ops(osd_client_t *cl);
void cancel_op(osd_op_t *op); void cancel_op(osd_op_t *op);
void set_socket_options(osd_client_t *cl);
bool try_send(osd_client_t *cl); bool try_send(osd_client_t *cl);
void measure_exec(osd_op_t *cur_op); void measure_exec(osd_op_t *cur_op);
void handle_send(int result, osd_client_t *cl); void handle_send(int result, osd_client_t *cl);
void handle_zerocopy_notification(osd_client_t *cl, int res);
bool handle_read(int result, osd_client_t *cl); bool handle_read(int result, osd_client_t *cl);
bool handle_read_buffer(osd_client_t *cl, void *curbuf, int remain); bool handle_read_buffer(osd_client_t *cl, void *curbuf, int remain);

View File

@ -6,6 +6,12 @@
#include "messenger.h" #include "messenger.h"
#include <linux/errqueue.h>
#ifndef MSG_ZEROCOPY
#define MSG_ZEROCOPY 0
#endif
void osd_messenger_t::outbox_push(osd_op_t *cur_op) void osd_messenger_t::outbox_push(osd_op_t *cur_op)
{ {
assert(cur_op->peer_fd); assert(cur_op->peer_fd);
@ -36,6 +42,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
} }
auto & to_send_list = cl->write_msg.msg_iovlen ? cl->next_send_list : cl->send_list; auto & to_send_list = cl->write_msg.msg_iovlen ? cl->next_send_list : cl->send_list;
auto & to_outbox = cl->write_msg.msg_iovlen ? cl->next_outbox : cl->outbox; auto & to_outbox = cl->write_msg.msg_iovlen ? cl->next_outbox : cl->outbox;
auto & to_size = cl->write_msg.msg_iovlen ? cl->next_outbox_size : cl->outbox_size;
if (cur_op->op_type == OSD_OP_IN) if (cur_op->op_type == OSD_OP_IN)
{ {
measure_exec(cur_op); measure_exec(cur_op);
@ -46,6 +53,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
to_send_list.push_back((iovec){ .iov_base = cur_op->req.buf, .iov_len = OSD_PACKET_SIZE }); to_send_list.push_back((iovec){ .iov_base = cur_op->req.buf, .iov_len = OSD_PACKET_SIZE });
cl->sent_ops[cur_op->req.hdr.id] = cur_op; cl->sent_ops[cur_op->req.hdr.id] = cur_op;
} }
to_size += OSD_PACKET_SIZE;
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = MSGR_SENDP_HDR }); to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = MSGR_SENDP_HDR });
// Bitmap // Bitmap
if (cur_op->op_type == OSD_OP_IN && if (cur_op->op_type == OSD_OP_IN &&
@ -57,6 +65,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
.iov_len = cur_op->reply.sec_rw.attr_len, .iov_len = cur_op->reply.sec_rw.attr_len,
}); });
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 }); to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
to_size += cur_op->reply.sec_rw.attr_len;
} }
else if (cur_op->op_type == OSD_OP_OUT && else if (cur_op->op_type == OSD_OP_OUT &&
(cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE || cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE) && (cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE || cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE) &&
@ -67,6 +76,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
.iov_len = cur_op->req.sec_rw.attr_len, .iov_len = cur_op->req.sec_rw.attr_len,
}); });
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 }); to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
to_size += cur_op->req.sec_rw.attr_len;
} }
// Operation data // Operation data
if ((cur_op->op_type == OSD_OP_IN if ((cur_op->op_type == OSD_OP_IN
@ -86,14 +96,21 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
assert(cur_op->iov.buf[i].iov_base); assert(cur_op->iov.buf[i].iov_base);
to_send_list.push_back(cur_op->iov.buf[i]); to_send_list.push_back(cur_op->iov.buf[i]);
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 }); to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
to_size += cur_op->iov.buf[i].iov_len;
} }
} }
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ_BMP) if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
{ {
if (cur_op->op_type == OSD_OP_IN && cur_op->reply.hdr.retval > 0) if (cur_op->op_type == OSD_OP_IN && cur_op->reply.hdr.retval > 0)
{
to_send_list.push_back((iovec){ .iov_base = cur_op->buf, .iov_len = (size_t)cur_op->reply.hdr.retval }); to_send_list.push_back((iovec){ .iov_base = cur_op->buf, .iov_len = (size_t)cur_op->reply.hdr.retval });
to_size += cur_op->reply.hdr.retval;
}
else if (cur_op->op_type == OSD_OP_OUT && cur_op->req.sec_read_bmp.len > 0) else if (cur_op->op_type == OSD_OP_OUT && cur_op->req.sec_read_bmp.len > 0)
{
to_send_list.push_back((iovec){ .iov_base = cur_op->buf, .iov_len = (size_t)cur_op->req.sec_read_bmp.len }); to_send_list.push_back((iovec){ .iov_base = cur_op->buf, .iov_len = (size_t)cur_op->req.sec_read_bmp.len });
to_size += cur_op->req.sec_read_bmp.len;
}
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 }); to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
} }
if (cur_op->op_type == OSD_OP_IN) if (cur_op->op_type == OSD_OP_IN)
@ -177,17 +194,19 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
} }
cl->write_msg.msg_iov = cl->send_list.data(); cl->write_msg.msg_iov = cl->send_list.data();
cl->write_msg.msg_iovlen = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX; cl->write_msg.msg_iovlen = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX;
cl->write_msg.msg_flags = (cl->zerocopy_send && (cl->outbox_size/cl->send_list.size()) >= 4096 ? MSG_ZEROCOPY : 0);
cl->refs++; cl->refs++;
ring_data_t* data = ((ring_data_t*)sqe->user_data); ring_data_t* data = ((ring_data_t*)sqe->user_data);
data->callback = [this, cl](ring_data_t *data) { handle_send(data->res, cl); }; data->callback = [this, cl](ring_data_t *data) { handle_send(data->res, cl); };
my_uring_prep_sendmsg(sqe, peer_fd, &cl->write_msg, 0); my_uring_prep_sendmsg(sqe, peer_fd, &cl->write_msg, cl->write_msg.msg_flags);
} }
else else
{ {
cl->write_msg.msg_iov = cl->send_list.data(); cl->write_msg.msg_iov = cl->send_list.data();
cl->write_msg.msg_iovlen = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX; cl->write_msg.msg_iovlen = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX;
cl->write_msg.msg_flags = (cl->zerocopy_send && (cl->outbox_size/cl->send_list.size()) >= 4096 ? MSG_ZEROCOPY : 0);
cl->refs++; cl->refs++;
int result = sendmsg(peer_fd, &cl->write_msg, MSG_NOSIGNAL); int result = sendmsg(peer_fd, &cl->write_msg, MSG_NOSIGNAL | cl->write_msg.msg_flags);
if (result < 0) if (result < 0)
{ {
result = -errno; result = -errno;
@ -197,6 +216,62 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
return true; return true;
} }
// Returns true when zero-copy send number `seq` lies inside the inclusive
// notification range [lo, hi]. All three are 32-bit counters that may wrap
// around, so the comparison is done on distances from `lo` modulo 2^32.
static inline bool zerocopy_seq_acked(uint32_t seq, uint32_t lo, uint32_t hi)
{
    return static_cast<uint32_t>(seq - lo) <= static_cast<uint32_t>(hi - lo);
}

// Process the result of a recvmsg(MSG_ERRQUEUE) issued for zero-copy send
// completion notifications.
//
// cl  - client whose zerocopy_notification_msg was filled by the recvmsg call;
//       the reference taken when the call was issued is dropped here
// res - recvmsg result (or negative errno); any non-zero value is treated as
//       "nothing to process"
//
// For every zero-copy completion found in the control data, all operations in
// cl->zerocopy_sent whose send number is covered by the notification are
// deleted and removed from the list.
void osd_messenger_t::handle_zerocopy_notification(osd_client_t *cl, int res)
{
    cl->refs--;
    if (cl->peer_state == PEER_STOPPED)
    {
        // The client was stopped while the request was in flight;
        // destroy it if this was the last reference
        if (cl->refs <= 0)
        {
            delete cl;
        }
        return;
    }
    if (res != 0)
    {
        return;
    }
    if (cl->zerocopy_notification_msg.msg_flags & MSG_CTRUNC)
    {
        fprintf(stderr, "zero-copy send notification truncated on client socket %d\n", cl->peer_fd);
        return;
    }
    for (struct cmsghdr *cm = CMSG_FIRSTHDR(&cl->zerocopy_notification_msg); cm; cm = CMSG_NXTHDR(&cl->zerocopy_notification_msg, cm))
    {
        // Notifications arrive as IP_RECVERR on IPv4 sockets and as IPV6_RECVERR on IPv6 sockets
        const bool is_recverr =
            (cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) ||
            (cm->cmsg_level == SOL_IPV6 && cm->cmsg_type == IPV6_RECVERR);
        if (!is_recverr || cm->cmsg_len < CMSG_LEN(sizeof(struct sock_extended_err)))
        {
            continue;
        }
        const struct sock_extended_err *serr = (const struct sock_extended_err*)CMSG_DATA(cm);
        if (serr->ee_errno != 0 || serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY)
        {
            continue;
        }
        // Completed sends are numbered serr->ee_info .. serr->ee_data (inclusive, may wrap around).
        // Free every acknowledged operation and compact the remaining entries in place.
        auto & sent = cl->zerocopy_sent;
        size_t kept = 0;
        for (size_t i = 0; i < sent.size(); i++)
        {
            if (zerocopy_seq_acked(sent[i].nsend, serr->ee_info, serr->ee_data))
            {
                delete sent[i].op;
            }
            else
            {
                sent[kept++] = sent[i];
            }
        }
        sent.erase(sent.begin() + kept, sent.end());
    }
}
void osd_messenger_t::send_replies() void osd_messenger_t::send_replies()
{ {
for (int i = 0; i < write_ready_clients.size(); i++) for (int i = 0; i < write_ready_clients.size(); i++)
@ -224,16 +299,19 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
} }
return; return;
} }
if (result < 0 && result != -EAGAIN && result != -EINTR) if (result < 0 && result != -EAGAIN && result != -EINTR && result != -ENOBUFS)
{ {
// this is a client socket, so don't panic. just disconnect it // this is a client socket, so don't panic. just disconnect it
fprintf(stderr, "Client %d socket write error: %d (%s). Disconnecting client\n", cl->peer_fd, -result, strerror(-result)); fprintf(stderr, "Client %d socket write error: %d (%s). Disconnecting client\n", cl->peer_fd, -result, strerror(-result));
stop_client(cl->peer_fd); stop_client(cl->peer_fd);
return; return;
} }
bool used_zerocopy = false;
if (result >= 0) if (result >= 0)
{ {
used_zerocopy = (cl->write_msg.msg_flags & MSG_ZEROCOPY) ? true : false;
int done = 0; int done = 0;
int bytes_written = result;
while (result > 0 && done < cl->send_list.size()) while (result > 0 && done < cl->send_list.size())
{ {
iovec & iov = cl->send_list[done]; iovec & iov = cl->send_list[done];
@ -242,7 +320,19 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
if (cl->outbox[done].flags & MSGR_SENDP_FREE) if (cl->outbox[done].flags & MSGR_SENDP_FREE)
{ {
// Reply fully sent // Reply fully sent
delete cl->outbox[done].op; if (!used_zerocopy)
{
delete cl->outbox[done].op;
}
else
{
// With zero-copy send the difference is that we must keep the buffer (i.e. the operation)
// allocated until we get send notification from MSG_ERRQUEUE
cl->zerocopy_sent.push_back((msgr_zc_not_t){
.op = cl->outbox[done].op,
.nsend = cl->zerocopy_notification_idx,
});
}
} }
result -= iov.iov_len; result -= iov.iov_len;
done++; done++;
@ -254,6 +344,11 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
break; break;
} }
} }
if (used_zerocopy)
{
cl->zerocopy_notification_idx++;
}
cl->outbox_size -= bytes_written;
if (done > 0) if (done > 0)
{ {
cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+done); cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+done);
@ -263,8 +358,10 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
{ {
cl->send_list.insert(cl->send_list.end(), cl->next_send_list.begin(), cl->next_send_list.end()); cl->send_list.insert(cl->send_list.end(), cl->next_send_list.begin(), cl->next_send_list.end());
cl->outbox.insert(cl->outbox.end(), cl->next_outbox.begin(), cl->next_outbox.end()); cl->outbox.insert(cl->outbox.end(), cl->next_outbox.begin(), cl->next_outbox.end());
cl->outbox_size += cl->next_outbox_size;
cl->next_send_list.clear(); cl->next_send_list.clear();
cl->next_outbox.clear(); cl->next_outbox.clear();
cl->next_outbox_size = 0;
} }
cl->write_state = cl->outbox.size() > 0 ? CL_WRITE_READY : 0; cl->write_state = cl->outbox.size() > 0 ? CL_WRITE_READY : 0;
#ifdef WITH_RDMA #ifdef WITH_RDMA
@ -287,4 +384,34 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
{ {
write_ready_clients.push_back(cl->peer_fd); write_ready_clients.push_back(cl->peer_fd);
} }
if (used_zerocopy && (cl->zerocopy_notification_idx-cl->zerocopy_notification_prev) >= 16 &&
cl->zerocopy_sent.size() > 0)
{
cl->zerocopy_notification_prev = cl->zerocopy_notification_idx;
cl->zerocopy_notification_msg = {
.msg_control = cl->zerocopy_notification_buf,
.msg_controllen = sizeof(cl->zerocopy_notification_buf),
};
cl->refs++;
io_uring_sqe* sqe = NULL;
if (ringloop && !use_sync_send_recv)
{
sqe = ringloop->get_sqe();
}
if (!sqe)
{
int res = recvmsg(cl->peer_fd, &cl->zerocopy_notification_msg, MSG_ERRQUEUE|MSG_DONTWAIT);
if (res < 0)
{
res = -errno;
}
handle_zerocopy_notification(cl, res);
}
else
{
ring_data_t* data = ((ring_data_t*)sqe->user_data);
data->callback = [this, cl](ring_data_t *data) { handle_zerocopy_notification(cl, data->res); };
my_uring_prep_recvmsg(sqe, cl->peer_fd, &cl->zerocopy_notification_msg, MSG_ERRQUEUE);
}
}
} }