diff --git a/docs/params/network.yml b/docs/params/network.yml
index 8ccf1818..07627e11 100644
--- a/docs/params/network.yml
+++ b/docs/params/network.yml
@@ -30,6 +30,18 @@
     будут использоваться обычные синхронные системные вызовы send/recv. Для OSD
     это бессмысленно, так как OSD в любом случае нуждается в io_uring, но, в
     принципе, это может применяться для клиентов со старыми версиями ядра.
+- name: use_zerocopy_send
+  type: bool
+  default: false
+  info: |
+    If true, OSDs and clients will attempt to use TCP zero-copy send
+    (MSG_ZEROCOPY) for big buffers. It's recommended to raise net.ipv4.tcp_wmem
+    and net.core.wmem_max sysctls when using this mode.
+  info_ru: |
+    Если установлено в true, то OSD и клиенты будут стараться использовать
+    TCP-отправку без копирования (MSG_ZEROCOPY) для больших буферов данных.
+    Рекомендуется поднять значения sysctl net.ipv4.tcp_wmem и net.core.wmem_max
+    при использовании этого режима.
 - name: use_rdma
   type: bool
   default: true
diff --git a/mon/mon.js b/mon/mon.js
index a37d46e2..be243d49 100644
--- a/mon/mon.js
+++ b/mon/mon.js
@@ -64,6 +64,7 @@ const etcd_tree = {
             // client and osd
             tcp_header_buffer_size: 65536,
             use_sync_send_recv: false,
+            use_zerocopy_send: false,
             use_rdma: true,
             rdma_device: null, // for example, "rocep5s0f0"
             rdma_port_num: 1,
diff --git a/src/messenger.cpp b/src/messenger.cpp
index 3850f0e2..1e04cb67 100644
--- a/src/messenger.cpp
+++ b/src/messenger.cpp
@@ -39,6 +39,12 @@ void osd_messenger_t::init()
             handle_rdma_events();
         }
     }
+#endif
+#ifndef SO_ZEROCOPY
+    if (use_zerocopy_send && log_level > 0) // warn only when zero-copy send was actually requested
+    {
+        fprintf(stderr, "Zero-copy TCP send is not supported in this build, ignoring\n");
+    }
 #endif
     keepalive_timer_id = tfd->set_timer(1000, true, [this](int)
     {
@@ -162,6 +168,8 @@ void osd_messenger_t::parse_config(const json11::Json & config)
         this->receive_buffer_size = 65536;
     this->use_sync_send_recv = config["use_sync_send_recv"].bool_value() ||
         config["use_sync_send_recv"].uint64_value();
+    this->use_zerocopy_send =
config["use_zerocopy_send"].bool_value() ||
+        config["use_zerocopy_send"].uint64_value();
     this->peer_connect_interval = config["peer_connect_interval"].uint64_value();
     if (!this->peer_connect_interval)
         this->peer_connect_interval = 5;
@@ -288,8 +296,7 @@ void osd_messenger_t::handle_connect_epoll(int peer_fd)
         on_connect_peer(peer_osd, -result);
         return;
     }
-    int one = 1;
-    setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
+    set_socket_options(cl);
     cl->peer_state = PEER_CONNECTED;
     tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
     {
@@ -299,6 +306,23 @@ void osd_messenger_t::handle_connect_epoll(int peer_fd)
     check_peer_config(cl);
 }
 
+// Apply per-connection socket options to <cl>: TCP_NODELAY always, and
+// SO_ZEROCOPY when use_zerocopy_send is set and the build defines SO_ZEROCOPY.
+void osd_messenger_t::set_socket_options(osd_client_t *cl)
+{
+    int enable = 1;
+    setsockopt(cl->peer_fd, SOL_TCP, TCP_NODELAY, &enable, sizeof(enable));
+#ifdef SO_ZEROCOPY
+    // cl->zerocopy_send keeps its previous value when enabling SO_ZEROCOPY fails
+    if (!use_zerocopy_send)
+        cl->zerocopy_send = false;
+    else if (setsockopt(cl->peer_fd, SOL_SOCKET, SO_ZEROCOPY, &enable, sizeof(enable)) == 0)
+        cl->zerocopy_send = true;
+    else if (log_level > 0)
+        fprintf(stderr, "[OSD %lu] Failed to enable zero-copy send for client %d: %s\n", this->osd_num, cl->peer_fd, strerror(errno));
+#endif
+}
+
 void osd_messenger_t::handle_peer_epoll(int peer_fd, int epoll_events)
 {
     // Mark client as ready (i.e.
some data is available)
@@ -493,14 +517,13 @@ void osd_messenger_t::accept_connections(int listen_fd)
         fprintf(stderr, "[OSD %lu] new client %d: connection from %s\n", this->osd_num,
             peer_fd, addr_to_string(addr).c_str());
         fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
-        int one = 1;
-        setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
-        clients[peer_fd] = new osd_client_t();
-        clients[peer_fd]->peer_addr = addr;
-        clients[peer_fd]->peer_port = ntohs(((sockaddr_in*)&addr)->sin_port);
-        clients[peer_fd]->peer_fd = peer_fd;
-        clients[peer_fd]->peer_state = PEER_CONNECTED;
-        clients[peer_fd]->in_buf = malloc_or_die(receive_buffer_size);
+        auto cl = clients[peer_fd] = new osd_client_t();
+        cl->peer_addr = addr;
+        cl->peer_port = ntohs(((sockaddr_in*)&addr)->sin_port);
+        cl->peer_fd = peer_fd;
+        cl->peer_state = PEER_CONNECTED;
+        cl->in_buf = malloc_or_die(receive_buffer_size);
+        set_socket_options(cl);
         // Add FD to epoll
         tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
         {
diff --git a/src/messenger.h b/src/messenger.h
index d9d9e01a..4964deb9 100644
--- a/src/messenger.h
+++ b/src/messenger.h
@@ -45,6 +45,12 @@ struct msgr_sendp_t
     int flags;
 };
 
+struct msgr_zc_not_t
+{
+    osd_op_t *op; // operation kept allocated until its zero-copy send is confirmed
+    uint32_t nsend; // value of zerocopy_notification_idx when the operation was fully sent
+};
+
 struct osd_client_t
 {
     int refs = 0;
@@ -57,6 +63,7 @@ struct osd_client_t
     int ping_time_remaining = 0;
     int idle_time_remaining = 0;
     osd_num_t osd_num = 0;
+    bool zerocopy_send = false; // set by set_socket_options() when SO_ZEROCOPY was enabled on the socket
 
     void *in_buf = NULL;
 
@@ -87,6 +94,12 @@ struct osd_client_t
     int write_state = 0;
     std::vector<iovec> send_list, next_send_list;
     std::vector<msgr_sendp_t> outbox, next_outbox;
+    std::vector<msgr_zc_not_t> zerocopy_sent; // fully sent operations awaiting a zero-copy completion notification
+    uint64_t outbox_size = 0, next_outbox_size = 0; // bytes queued in send_list / next_send_list
+    uint32_t zerocopy_notification_idx = 0; // incremented in handle_send() after each zero-copy send
+    uint32_t zerocopy_notification_prev = 0; // zerocopy_notification_idx at the time notifications were last requested
+    uint8_t zerocopy_notification_buf[256]; // control-message buffer for recvmsg(MSG_ERRQUEUE)
+    struct msghdr zerocopy_notification_msg = {}; // re-filled in handle_send() before every notification read
 
     ~osd_client_t()
     {
@@ -127,6 +140,7 @@ protected:
     int osd_ping_timeout = 0;
     int log_level = 0;
     bool use_sync_send_recv = false;
+    bool
use_zerocopy_send = false;
 
 #ifdef WITH_RDMA
     bool use_rdma = true;
@@ -181,10 +195,12 @@ protected:
     void check_peer_config(osd_client_t *cl);
     void cancel_osd_ops(osd_client_t *cl);
     void cancel_op(osd_op_t *op);
+    void set_socket_options(osd_client_t *cl);
 
     bool try_send(osd_client_t *cl);
     void measure_exec(osd_op_t *cur_op);
     void handle_send(int result, osd_client_t *cl);
+    void handle_zerocopy_notification(osd_client_t *cl, int res);
 
     bool handle_read(int result, osd_client_t *cl);
     bool handle_read_buffer(osd_client_t *cl, void *curbuf, int remain);
diff --git a/src/msgr_send.cpp b/src/msgr_send.cpp
index 5248e641..64860548 100644
--- a/src/msgr_send.cpp
+++ b/src/msgr_send.cpp
@@ -6,6 +6,12 @@
 
 #include "messenger.h"
 
+#include <linux/errqueue.h>
+
+#ifndef MSG_ZEROCOPY
+#define MSG_ZEROCOPY 0
+#endif
+
 void osd_messenger_t::outbox_push(osd_op_t *cur_op)
 {
     assert(cur_op->peer_fd);
@@ -36,6 +42,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
     }
     auto & to_send_list = cl->write_msg.msg_iovlen ? cl->next_send_list : cl->send_list;
     auto & to_outbox = cl->write_msg.msg_iovlen ? cl->next_outbox : cl->outbox;
+    auto & to_size = cl->write_msg.msg_iovlen ?
cl->next_outbox_size : cl->outbox_size; if (cur_op->op_type == OSD_OP_IN) { measure_exec(cur_op); @@ -46,6 +53,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op) to_send_list.push_back((iovec){ .iov_base = cur_op->req.buf, .iov_len = OSD_PACKET_SIZE }); cl->sent_ops[cur_op->req.hdr.id] = cur_op; } + to_size += OSD_PACKET_SIZE; to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = MSGR_SENDP_HDR }); // Bitmap if (cur_op->op_type == OSD_OP_IN && @@ -57,6 +65,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op) .iov_len = cur_op->reply.sec_rw.attr_len, }); to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 }); + to_size += cur_op->reply.sec_rw.attr_len; } else if (cur_op->op_type == OSD_OP_OUT && (cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE || cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE) && @@ -67,6 +76,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op) .iov_len = cur_op->req.sec_rw.attr_len, }); to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 }); + to_size += cur_op->req.sec_rw.attr_len; } // Operation data if ((cur_op->op_type == OSD_OP_IN @@ -86,14 +96,21 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op) assert(cur_op->iov.buf[i].iov_base); to_send_list.push_back(cur_op->iov.buf[i]); to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 }); + to_size += cur_op->iov.buf[i].iov_len; } } if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ_BMP) { if (cur_op->op_type == OSD_OP_IN && cur_op->reply.hdr.retval > 0) + { to_send_list.push_back((iovec){ .iov_base = cur_op->buf, .iov_len = (size_t)cur_op->reply.hdr.retval }); + to_size += cur_op->reply.hdr.retval; + } else if (cur_op->op_type == OSD_OP_OUT && cur_op->req.sec_read_bmp.len > 0) + { to_send_list.push_back((iovec){ .iov_base = cur_op->buf, .iov_len = (size_t)cur_op->req.sec_read_bmp.len }); + to_size += cur_op->req.sec_read_bmp.len; + } to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 }); } if (cur_op->op_type == OSD_OP_IN) @@ -177,17 +194,19 @@ 
bool osd_messenger_t::try_send(osd_client_t *cl)
         }
         cl->write_msg.msg_iov = cl->send_list.data();
         cl->write_msg.msg_iovlen = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX;
+        cl->write_msg.msg_flags = (cl->zerocopy_send && (cl->outbox_size/cl->send_list.size()) >= 4096 ? MSG_ZEROCOPY : 0);
         cl->refs++;
         ring_data_t* data = ((ring_data_t*)sqe->user_data);
         data->callback = [this, cl](ring_data_t *data) { handle_send(data->res, cl); };
-        my_uring_prep_sendmsg(sqe, peer_fd, &cl->write_msg, 0);
+        my_uring_prep_sendmsg(sqe, peer_fd, &cl->write_msg, cl->write_msg.msg_flags);
     }
     else
     {
         cl->write_msg.msg_iov = cl->send_list.data();
         cl->write_msg.msg_iovlen = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX;
+        cl->write_msg.msg_flags = (cl->zerocopy_send && (cl->outbox_size/cl->send_list.size()) >= 4096 ? MSG_ZEROCOPY : 0);
         cl->refs++;
-        int result = sendmsg(peer_fd, &cl->write_msg, MSG_NOSIGNAL);
+        int result = sendmsg(peer_fd, &cl->write_msg, MSG_NOSIGNAL | cl->write_msg.msg_flags);
         if (result < 0)
         {
             result = -errno;
@@ -197,6 +216,62 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
     return true;
 }
 
+// Completion handler for a zero-copy notification read (recvmsg with MSG_ERRQUEUE).
+// <res> is that read's result; anything other than 0 is ignored. Frees the operations
+// in cl->zerocopy_sent that belong to the send range reported as completed.
+void osd_messenger_t::handle_zerocopy_notification(osd_client_t *cl, int res)
+{
+    // Drop the reference taken before the notification read was issued
+    cl->refs--;
+    if (cl->peer_state == PEER_STOPPED)
+    {
+        if (cl->refs <= 0)
+        {
+            delete cl;
+        }
+        return;
+    }
+    if (res != 0)
+    {
+        return;
+    }
+    if (cl->zerocopy_notification_msg.msg_flags & MSG_CTRUNC)
+    {
+        fprintf(stderr, "zero-copy send notification truncated on client socket %d\n", cl->peer_fd);
+        return;
+    }
+    for (struct cmsghdr *cm = CMSG_FIRSTHDR(&cl->zerocopy_notification_msg); cm; cm = CMSG_NXTHDR(&cl->zerocopy_notification_msg, cm))
+    {
+        // IPv4 sockets deliver the notification as SOL_IP/IP_RECVERR, IPv6 ones as SOL_IPV6/IPV6_RECVERR
+        if (!(cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) &&
+            !(cm->cmsg_level == SOL_IPV6 && cm->cmsg_type == IPV6_RECVERR))
+        {
+            continue;
+        }
+        struct sock_extended_err *serr = (struct sock_extended_err*)CMSG_DATA(cm);
+        if (serr->ee_errno != 0 || serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY)
+        {
+            continue;
+        }
+        // Completed sends are numbered serr->ee_info .. serr->ee_data (inclusive).
+        // The 32-bit counter may wrap around, so test membership by the modulo-2^32
+        // distance from ee_info instead of comparing raw values.
+        const uint32_t first = serr->ee_info;
+        const uint32_t span = serr->ee_data - first;
+        size_t start = 0;
+        while (start < cl->zerocopy_sent.size() && (uint32_t)(cl->zerocopy_sent[start].nsend - first) > span)
+            start++;
+        size_t end = start;
+        while (end < cl->zerocopy_sent.size() && (uint32_t)(cl->zerocopy_sent[end].nsend - first) <= span)
+            end++;
+        for (size_t i = start; i < end; i++)
+        {
+            delete cl->zerocopy_sent[i].op;
+        }
+        cl->zerocopy_sent.erase(cl->zerocopy_sent.begin() + start, cl->zerocopy_sent.begin() + end);
+    }
+}
+
 void osd_messenger_t::send_replies()
 {
     for (int i = 0; i < write_ready_clients.size(); i++)
@@ -224,16 +299,19 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
         }
         return;
     }
-    if (result < 0 && result != -EAGAIN && result != -EINTR)
+    if (result < 0 && result != -EAGAIN && result != -EINTR && result != -ENOBUFS)
     {
         // this is a client socket, so don't panic. just disconnect it
         fprintf(stderr, "Client %d socket write error: %d (%s). Disconnecting client\n", cl->peer_fd, -result, strerror(-result));
         stop_client(cl->peer_fd);
         return;
     }
+    bool used_zerocopy = false;
     if (result >= 0)
     {
+        used_zerocopy = (cl->write_msg.msg_flags & MSG_ZEROCOPY) ? true : false;
         int done = 0;
+        int bytes_written = result;
         while (result > 0 && done < cl->send_list.size())
         {
             iovec & iov = cl->send_list[done];
@@ -242,7 +320,19 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
                 if (cl->outbox[done].flags & MSGR_SENDP_FREE)
                 {
                     // Reply fully sent
-                    delete cl->outbox[done].op;
+                    if (!used_zerocopy)
+                    {
+                        delete cl->outbox[done].op;
+                    }
+                    else
+                    {
+                        // With zero-copy send the difference is that we must keep the buffer (i.e.
the operation) + // allocated until we get send notification from MSG_ERRQUEUE + cl->zerocopy_sent.push_back((msgr_zc_not_t){ + .op = cl->outbox[done].op, + .nsend = cl->zerocopy_notification_idx, + }); + } } result -= iov.iov_len; done++; @@ -254,6 +344,11 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl) break; } } + if (used_zerocopy) + { + cl->zerocopy_notification_idx++; + } + cl->outbox_size -= bytes_written; if (done > 0) { cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+done); @@ -263,8 +358,10 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl) { cl->send_list.insert(cl->send_list.end(), cl->next_send_list.begin(), cl->next_send_list.end()); cl->outbox.insert(cl->outbox.end(), cl->next_outbox.begin(), cl->next_outbox.end()); + cl->outbox_size += cl->next_outbox_size; cl->next_send_list.clear(); cl->next_outbox.clear(); + cl->next_outbox_size = 0; } cl->write_state = cl->outbox.size() > 0 ? CL_WRITE_READY : 0; #ifdef WITH_RDMA @@ -287,4 +384,34 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl) { write_ready_clients.push_back(cl->peer_fd); } + if (used_zerocopy && (cl->zerocopy_notification_idx-cl->zerocopy_notification_prev) >= 16 && + cl->zerocopy_sent.size() > 0) + { + cl->zerocopy_notification_prev = cl->zerocopy_notification_idx; + cl->zerocopy_notification_msg = { + .msg_control = cl->zerocopy_notification_buf, + .msg_controllen = sizeof(cl->zerocopy_notification_buf), + }; + cl->refs++; + io_uring_sqe* sqe = NULL; + if (ringloop && !use_sync_send_recv) + { + sqe = ringloop->get_sqe(); + } + if (!sqe) + { + int res = recvmsg(cl->peer_fd, &cl->zerocopy_notification_msg, MSG_ERRQUEUE|MSG_DONTWAIT); + if (res < 0) + { + res = -errno; + } + handle_zerocopy_notification(cl, res); + } + else + { + ring_data_t* data = ((ring_data_t*)sqe->user_data); + data->callback = [this, cl](ring_data_t *data) { handle_zerocopy_notification(cl, data->res); }; + my_uring_prep_recvmsg(sqe, cl->peer_fd, 
&cl->zerocopy_notification_msg, MSG_ERRQUEUE); + } + } }