#include "cluster_client.h" void cluster_client_t::read_requests() { for (int i = 0; i < read_ready_clients.size(); i++) { int peer_fd = read_ready_clients[i]; auto & cl = clients[peer_fd]; { timespec now; clock_gettime(CLOCK_REALTIME, &now); printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000); } io_uring_sqe* sqe = ringloop->get_sqe(); if (!sqe) { read_ready_clients.erase(read_ready_clients.begin(), read_ready_clients.begin() + i); return; } ring_data_t* data = ((ring_data_t*)sqe->user_data); if (!cl.read_op || cl.read_remaining < receive_buffer_size) { cl.read_iov.iov_base = cl.in_buf; cl.read_iov.iov_len = receive_buffer_size; } else { cl.read_iov.iov_base = cl.read_buf; cl.read_iov.iov_len = cl.read_remaining; } cl.read_msg.msg_iov = &cl.read_iov; cl.read_msg.msg_iovlen = 1; data->callback = [this, peer_fd](ring_data_t *data) { handle_read(data, peer_fd); }; my_uring_prep_recvmsg(sqe, peer_fd, &cl.read_msg, 0); } read_ready_clients.clear(); } void cluster_client_t::handle_read(ring_data_t *data, int peer_fd) { auto cl_it = clients.find(peer_fd); if (cl_it != clients.end()) { auto & cl = cl_it->second; if (data->res < 0 && data->res != -EAGAIN) { // this is a client socket, so don't panic. just disconnect it printf("Client %d socket read error: %d (%s). Disconnecting client\n", peer_fd, -data->res, strerror(-data->res)); stop_client(peer_fd); return; } if (data->res == -EAGAIN || cl.read_iov.iov_base == cl.in_buf && data->res < receive_buffer_size) { cl.read_ready--; if (cl.read_ready > 0) read_ready_clients.push_back(peer_fd); } else { read_ready_clients.push_back(peer_fd); } if (data->res == -EAGAIN) { return; } if (data->res > 0) { if (cl.read_iov.iov_base == cl.in_buf) { // Compose operation(s) from the buffer int remain = data->res; void *curbuf = cl.in_buf; while (remain > 0) { if (!cl.read_op) { cl.read_op = new osd_op_t; cl.read_op->peer_fd = peer_fd; cl.read_op->op_type = OSD_OP_IN; cl.read_buf = cl.read_op->req.buf; cl.read_remaining = OSD_PACKET_SIZE; cl.read_state = CL_READ_HDR; } if (cl.read_remaining > remain) { memcpy(cl.read_buf, curbuf, remain); cl.read_remaining -= remain; cl.read_buf += remain; remain = 0; if (cl.read_remaining <= 0) handle_finished_read(cl); } else { memcpy(cl.read_buf, curbuf, cl.read_remaining); curbuf += cl.read_remaining; remain -= cl.read_remaining; cl.read_remaining = 0; cl.read_buf = NULL; handle_finished_read(cl); } } } else { // Long data cl.read_remaining -= data->res; cl.read_buf += data->res; if (cl.read_remaining <= 0) { handle_finished_read(cl); } } } } } void cluster_client_t::handle_finished_read(osd_client_t & cl) { if (cl.read_state == CL_READ_HDR) { if (cl.read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC) handle_reply_hdr(&cl); else handle_op_hdr(&cl); } else if (cl.read_state == CL_READ_DATA) { // Operation is ready exec_op(cl.read_op); cl.read_op = NULL; cl.read_state = 0; } else if (cl.read_state == CL_READ_REPLY_DATA) { // Reply is ready auto req_it = cl.sent_ops.find(cl.read_reply_id); osd_op_t *request = req_it->second; cl.sent_ops.erase(req_it); cl.read_reply_id = 0; delete cl.read_op; cl.read_op = NULL; cl.read_state = 0; // Measure subop latency timespec tv_end; clock_gettime(CLOCK_REALTIME, &tv_end); stats.subop_stat_count[request->req.hdr.opcode]++; if (!stats.subop_stat_count[request->req.hdr.opcode]) { stats.subop_stat_count[request->req.hdr.opcode]++; stats.subop_stat_sum[request->req.hdr.opcode] = 0; } stats.subop_stat_sum[request->req.hdr.opcode] += ( (tv_end.tv_sec - 
request->tv_begin.tv_sec)*1000000 + (tv_end.tv_nsec - request->tv_begin.tv_nsec)/1000 ); request->callback(request); } else { assert(0); } } void cluster_client_t::handle_op_hdr(osd_client_t *cl) { osd_op_t *cur_op = cl->read_op; if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_READ) { if (cur_op->req.sec_rw.len > 0) cur_op->buf = memalign(MEM_ALIGNMENT, cur_op->req.sec_rw.len); cl->read_remaining = 0; } else if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE) { if (cur_op->req.sec_rw.len > 0) cur_op->buf = memalign(MEM_ALIGNMENT, cur_op->req.sec_rw.len); cl->read_remaining = cur_op->req.sec_rw.len; } else if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_STABILIZE || cur_op->req.hdr.opcode == OSD_OP_SECONDARY_ROLLBACK) { if (cur_op->req.sec_stab.len > 0) cur_op->buf = memalign(MEM_ALIGNMENT, cur_op->req.sec_stab.len); cl->read_remaining = cur_op->req.sec_stab.len; } else if (cur_op->req.hdr.opcode == OSD_OP_READ) { if (cur_op->req.rw.len > 0) cur_op->buf = memalign(MEM_ALIGNMENT, cur_op->req.rw.len); cl->read_remaining = 0; } else if (cur_op->req.hdr.opcode == OSD_OP_WRITE) { if (cur_op->req.rw.len > 0) cur_op->buf = memalign(MEM_ALIGNMENT, cur_op->req.rw.len); cl->read_remaining = cur_op->req.rw.len; } if (cl->read_remaining > 0) { // Read data cl->read_buf = cur_op->buf; cl->read_state = CL_READ_DATA; } else { // Operation is ready cl->read_op = NULL; cl->read_state = 0; exec_op(cur_op); } } void cluster_client_t::handle_reply_hdr(osd_client_t *cl) { osd_op_t *cur_op = cl->read_op; auto req_it = cl->sent_ops.find(cur_op->req.hdr.id); if (req_it == cl->sent_ops.end()) { // Command out of sync. Drop connection printf("Client %d command out of sync: id %lu\n", cl->peer_fd, cur_op->req.hdr.id); stop_client(cl->peer_fd); return; } osd_op_t *op = req_it->second; memcpy(op->reply.buf, cur_op->req.buf, OSD_PACKET_SIZE); if (op->reply.hdr.opcode == OSD_OP_SECONDARY_READ && op->reply.hdr.retval > 0) { // Read data. In this case we assume that the buffer is preallocated by the caller (!) assert(op->buf); cl->read_state = CL_READ_REPLY_DATA; cl->read_reply_id = op->req.hdr.id; cl->read_buf = op->buf; cl->read_remaining = op->reply.hdr.retval; } else if (op->reply.hdr.opcode == OSD_OP_SECONDARY_LIST && op->reply.hdr.retval > 0) { op->buf = memalign(MEM_ALIGNMENT, sizeof(obj_ver_id) * op->reply.hdr.retval); cl->read_state = CL_READ_REPLY_DATA; cl->read_reply_id = op->req.hdr.id; cl->read_buf = op->buf; cl->read_remaining = sizeof(obj_ver_id) * op->reply.hdr.retval; } else if (op->reply.hdr.opcode == OSD_OP_SHOW_CONFIG && op->reply.hdr.retval > 0) { op->buf = malloc(op->reply.hdr.retval); cl->read_state = CL_READ_REPLY_DATA; cl->read_reply_id = op->req.hdr.id; cl->read_buf = op->buf; cl->read_remaining = op->reply.hdr.retval; } else { delete cl->read_op; cl->read_state = 0; cl->read_op = NULL; cl->sent_ops.erase(req_it); // Measure subop latency timespec tv_end; clock_gettime(CLOCK_REALTIME, &tv_end); stats.subop_stat_count[op->req.hdr.opcode]++; if (!stats.subop_stat_count[op->req.hdr.opcode]) { stats.subop_stat_count[op->req.hdr.opcode]++; stats.subop_stat_sum[op->req.hdr.opcode] = 0; } stats.subop_stat_sum[op->req.hdr.opcode] += ( (tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 + (tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000 ); // Copy lambda to be unaffected by `delete op` std::function(op->callback)(op); } }