2019-12-11 14:18:19 +03:00
|
|
|
#include <sys/socket.h>
|
|
|
|
#include <sys/epoll.h>
|
2019-12-14 20:51:41 +03:00
|
|
|
#include <sys/poll.h>
|
2019-12-11 14:18:19 +03:00
|
|
|
#include <netinet/in.h>
|
|
|
|
#include <arpa/inet.h>
|
|
|
|
|
2019-12-15 01:11:51 +03:00
|
|
|
#include "osd.h"
|
2019-12-11 14:18:19 +03:00
|
|
|
|
2019-12-13 22:53:59 +03:00
|
|
|
// Values of a client's read_state.
// CL_READ_OP: the fixed-size operation packet (OSD_OP_PACKET_SIZE bytes) is being received.
#define CL_READ_OP 1
|
|
|
|
// CL_READ_DATA: the data payload that follows a write/stabilize operation packet is being received.
#define CL_READ_DATA 2
|
2019-12-14 20:51:41 +03:00
|
|
|
// Flag OR-ed into a client's write_state while a sendmsg request is prepared for it;
// it is cleared again when the send completion is handled.
#define SQE_SENT 0x100l
|
|
|
|
// Values of a client's write_state (0 means nothing is queued for sending).
// CL_WRITE_READY: at least one completed operation is queued, no reply is being sent yet.
#define CL_WRITE_READY 1
|
|
|
|
// CL_WRITE_REPLY: the fixed-size reply packet (OSD_REPLY_PACKET_SIZE bytes) is being sent.
#define CL_WRITE_REPLY 2
|
|
|
|
// CL_WRITE_DATA: the data payload of a successful read is being sent after the reply packet.
#define CL_WRITE_DATA 3
|
2019-12-13 22:53:59 +03:00
|
|
|
|
2019-12-15 14:49:10 +03:00
|
|
|
/**
 * Set up the OSD network front-end.
 *
 * Reads "bind_address" (default "0.0.0.0") and "bind_port" (default 11203, also used
 * when the configured value is 0 or greater than 65535) from the configuration,
 * creates a non-blocking listening TCP socket, creates an epoll instance that watches
 * the listening socket and registers this object as a consumer of the ring loop.
 *
 * @param config   configuration map the bind address and port are read from
 * @param bs       blockstore that executes the received operations
 * @param ringloop ring loop that calls loop()
 * @throws std::runtime_error when any socket/epoll setup step fails; every descriptor
 *         opened so far is closed before throwing
 */
osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop)
{
    bind_address = config["bind_address"];
    if (bind_address == "")
        bind_address = "0.0.0.0";
    bind_port = strtoull(config["bind_port"].c_str(), NULL, 10);
    if (!bind_port || bind_port > 65535)
        bind_port = 11203;

    this->bs = bs;
    this->ringloop = ringloop;

    listen_fd = socket(AF_INET, SOCK_STREAM, 0);
    if (listen_fd < 0)
    {
        throw std::runtime_error(std::string("socket: ") + strerror(errno));
    }
    int enable = 1;
    setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable));

    // Zero the whole structure so that no uninitialized bytes are passed to bind()
    sockaddr_in addr = {};
    int r;
    if ((r = inet_pton(AF_INET, bind_address.c_str(), &addr.sin_addr)) != 1)
    {
        close(listen_fd);
        throw std::runtime_error("bind address "+bind_address+(r == 0 ? " is not valid" : ": no ipv4 support"));
    }
    addr.sin_family = AF_INET;
    addr.sin_port = htons(bind_port);

    // In every error path below errno is saved first: close() is allowed to overwrite it
    if (bind(listen_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
    {
        int err = errno;
        close(listen_fd);
        throw std::runtime_error(std::string("bind: ") + strerror(err));
    }

    if (listen(listen_fd, listen_backlog) < 0)
    {
        int err = errno;
        close(listen_fd);
        throw std::runtime_error(std::string("listen: ") + strerror(err));
    }

    fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);

    epoll_fd = epoll_create(1);
    if (epoll_fd < 0)
    {
        int err = errno;
        close(listen_fd);
        throw std::runtime_error(std::string("epoll_create: ") + strerror(err));
    }

    epoll_event ev;
    ev.data.fd = listen_fd;
    ev.events = EPOLLIN;
    if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, listen_fd, &ev) < 0)
    {
        // The destructor does not run when the constructor throws, so release both descriptors here
        int err = errno;
        close(epoll_fd);
        close(listen_fd);
        throw std::runtime_error(std::string("epoll_ctl: ") + strerror(err));
    }

    consumer.loop = [this]() { loop(); };
    ringloop->register_consumer(consumer);
}
|
|
|
|
|
|
|
|
/**
 * Detach from the ring loop and release the descriptors opened by the constructor.
 */
osd_t::~osd_t()
{
    // Unregister first so that loop() is no longer invoked through the consumer
    ringloop->unregister_consumer(consumer);
    // Then drop the epoll instance and the listening socket, in that order
    ::close(epoll_fd);
    ::close(listen_fd);
}
|
|
|
|
|
2019-12-15 01:52:08 +03:00
|
|
|
/**
 * Request a shutdown of the OSD.
 *
 * Not implemented yet. The function used to flow off its end without returning a
 * value, which is undefined behaviour for a non-void function; it now returns an
 * explicit placeholder.
 *
 * @return currently always true
 */
bool osd_t::shutdown()
{
    // TODO: stop accepting clients and wait for in-flight operations.
    // NOTE(review): `true` is assumed to mean "nothing left to wait for" - confirm the
    // intended contract with the callers once the real implementation is written.
    return true;
}
|
|
|
|
|
2019-12-11 14:18:19 +03:00
|
|
|
void osd_t::loop()
|
|
|
|
{
|
2019-12-14 20:51:41 +03:00
|
|
|
if (wait_state == 0)
|
2019-12-11 14:18:19 +03:00
|
|
|
{
|
2019-12-14 20:51:41 +03:00
|
|
|
io_uring_sqe *sqe = ringloop->get_sqe();
|
|
|
|
if (!sqe)
|
2019-12-11 14:18:19 +03:00
|
|
|
{
|
2019-12-14 20:51:41 +03:00
|
|
|
wait_state = 0;
|
|
|
|
return;
|
2019-12-11 14:18:19 +03:00
|
|
|
}
|
2019-12-14 20:51:41 +03:00
|
|
|
ring_data_t *data = ((ring_data_t*)sqe->user_data);
|
|
|
|
my_uring_prep_poll_add(sqe, epoll_fd, POLLIN);
|
|
|
|
data->callback = [&](ring_data_t *data)
|
|
|
|
{
|
|
|
|
if (data->res < 0)
|
|
|
|
{
|
|
|
|
throw std::runtime_error(std::string("epoll failed: ") + strerror(-data->res));
|
|
|
|
}
|
|
|
|
handle_epoll_events();
|
|
|
|
};
|
|
|
|
wait_state = 1;
|
|
|
|
}
|
2019-12-16 21:37:13 +03:00
|
|
|
else if (wait_state == 2)
|
|
|
|
{
|
|
|
|
handle_epoll_events();
|
|
|
|
}
|
2019-12-14 20:51:41 +03:00
|
|
|
send_replies();
|
|
|
|
read_requests();
|
2019-12-11 14:18:19 +03:00
|
|
|
ringloop->submit();
|
|
|
|
}
|
|
|
|
|
2019-12-12 11:32:20 +03:00
|
|
|
#define MAX_EPOLL_EVENTS 16
|
|
|
|
|
|
|
|
int osd_t::handle_epoll_events()
|
2019-12-11 14:18:19 +03:00
|
|
|
{
|
2019-12-12 11:32:20 +03:00
|
|
|
epoll_event events[MAX_EPOLL_EVENTS];
|
2019-12-16 21:37:13 +03:00
|
|
|
int nfds = epoll_wait(epoll_fd, events, MAX_EPOLL_EVENTS, 0);
|
|
|
|
for (int i = 0; i < nfds; i++)
|
2019-12-12 11:32:20 +03:00
|
|
|
{
|
2019-12-16 21:37:13 +03:00
|
|
|
if (events[i].data.fd == listen_fd)
|
2019-12-12 11:32:20 +03:00
|
|
|
{
|
2019-12-16 21:37:13 +03:00
|
|
|
// Accept new connections
|
|
|
|
sockaddr_in addr;
|
|
|
|
socklen_t peer_addr_size = sizeof(addr);
|
|
|
|
int peer_fd;
|
|
|
|
while ((peer_fd = accept(listen_fd, (sockaddr*)&addr, &peer_addr_size)) >= 0)
|
2019-12-12 11:32:20 +03:00
|
|
|
{
|
2019-12-16 21:37:13 +03:00
|
|
|
char peer_str[256];
|
|
|
|
printf("osd: new client %d: connection from %s port %d\n", peer_fd, inet_ntop(AF_INET, &addr.sin_addr, peer_str, 256), ntohs(addr.sin_port));
|
|
|
|
fcntl(peer_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
|
|
|
|
clients[peer_fd] = {
|
|
|
|
.peer_addr = addr,
|
|
|
|
.peer_addr_size = peer_addr_size,
|
|
|
|
.peer_fd = peer_fd,
|
|
|
|
};
|
|
|
|
// Add FD to epoll
|
|
|
|
epoll_event ev;
|
|
|
|
ev.data.fd = peer_fd;
|
|
|
|
ev.events = EPOLLIN | EPOLLRDHUP;
|
|
|
|
if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, peer_fd, &ev) < 0)
|
2019-12-12 11:32:20 +03:00
|
|
|
{
|
2019-12-16 21:37:13 +03:00
|
|
|
throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
|
2019-12-12 11:32:20 +03:00
|
|
|
}
|
2019-12-16 21:37:13 +03:00
|
|
|
// Try to accept next connection
|
|
|
|
peer_addr_size = sizeof(addr);
|
2019-12-12 11:32:20 +03:00
|
|
|
}
|
2019-12-16 21:37:13 +03:00
|
|
|
if (peer_fd == -1 && errno != EAGAIN)
|
2019-12-12 11:32:20 +03:00
|
|
|
{
|
2019-12-16 21:37:13 +03:00
|
|
|
throw std::runtime_error(std::string("accept: ") + strerror(errno));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
auto & cl = clients[events[i].data.fd];
|
|
|
|
if (events[i].events & EPOLLRDHUP)
|
|
|
|
{
|
|
|
|
// Stop client
|
|
|
|
printf("osd: client %d disconnected\n", cl.peer_fd);
|
|
|
|
stop_client(cl.peer_fd);
|
|
|
|
}
|
|
|
|
else if (!cl.read_ready)
|
|
|
|
{
|
|
|
|
// Mark client as ready (i.e. some data is available)
|
|
|
|
cl.read_ready = true;
|
|
|
|
if (!cl.reading)
|
2019-12-12 11:32:20 +03:00
|
|
|
{
|
2019-12-16 21:37:13 +03:00
|
|
|
read_ready_clients.push_back(cl.peer_fd);
|
|
|
|
ringloop->wakeup();
|
2019-12-12 11:32:20 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2019-12-16 21:37:13 +03:00
|
|
|
wait_state = nfds == MAX_EPOLL_EVENTS ? 2 : 0;
|
|
|
|
return nfds;
|
2019-12-11 14:18:19 +03:00
|
|
|
}
|
2019-12-13 14:05:11 +03:00
|
|
|
|
|
|
|
void osd_t::stop_client(int peer_fd)
|
|
|
|
{
|
2019-12-13 22:53:59 +03:00
|
|
|
epoll_event ev;
|
2019-12-13 14:05:11 +03:00
|
|
|
ev.data.fd = peer_fd;
|
|
|
|
ev.events = EPOLLIN | EPOLLHUP;
|
|
|
|
if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, peer_fd, &ev) < 0)
|
|
|
|
{
|
|
|
|
throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
|
|
|
|
}
|
|
|
|
auto it = clients.find(peer_fd);
|
2019-12-14 20:51:41 +03:00
|
|
|
if (it->second.read_ready)
|
2019-12-13 14:05:11 +03:00
|
|
|
{
|
2019-12-13 22:53:59 +03:00
|
|
|
for (auto rit = read_ready_clients.begin(); rit != read_ready_clients.end(); rit++)
|
2019-12-13 14:05:11 +03:00
|
|
|
{
|
2019-12-13 20:12:31 +03:00
|
|
|
if (*rit == peer_fd)
|
2019-12-13 14:05:11 +03:00
|
|
|
{
|
2019-12-13 22:53:59 +03:00
|
|
|
read_ready_clients.erase(rit);
|
2019-12-13 14:05:11 +03:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2019-12-13 20:12:31 +03:00
|
|
|
clients.erase(it);
|
2019-12-13 14:05:11 +03:00
|
|
|
close(peer_fd);
|
|
|
|
}
|
|
|
|
|
2019-12-14 20:51:41 +03:00
|
|
|
void osd_t::read_requests()
|
2019-12-13 14:05:11 +03:00
|
|
|
{
|
2019-12-13 22:53:59 +03:00
|
|
|
for (int i = 0; i < read_ready_clients.size(); i++)
|
2019-12-13 14:05:11 +03:00
|
|
|
{
|
2019-12-13 22:53:59 +03:00
|
|
|
int peer_fd = read_ready_clients[i];
|
2019-12-13 14:05:11 +03:00
|
|
|
auto & cl = clients[peer_fd];
|
2019-12-13 22:53:59 +03:00
|
|
|
io_uring_sqe* sqe = ringloop->get_sqe();
|
2019-12-13 14:05:11 +03:00
|
|
|
if (!sqe)
|
|
|
|
{
|
2019-12-14 20:51:41 +03:00
|
|
|
read_ready_clients.erase(read_ready_clients.begin(), read_ready_clients.begin() + i);
|
2019-12-13 14:05:11 +03:00
|
|
|
return;
|
|
|
|
}
|
2019-12-13 22:53:59 +03:00
|
|
|
ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
|
|
|
if (!cl.read_buf)
|
|
|
|
{
|
|
|
|
// no reads in progress, so this is probably a new command
|
|
|
|
cl.read_op = new osd_op_t;
|
|
|
|
cl.read_buf = &cl.read_op->op_buf;
|
|
|
|
cl.read_remaining = OSD_OP_PACKET_SIZE;
|
|
|
|
cl.read_state = CL_READ_OP;
|
|
|
|
}
|
|
|
|
cl.read_iov.iov_base = cl.read_buf;
|
|
|
|
cl.read_iov.iov_len = cl.read_remaining;
|
|
|
|
cl.read_msg.msg_iov = &cl.read_iov;
|
|
|
|
cl.read_msg.msg_iovlen = 1;
|
2019-12-13 14:05:11 +03:00
|
|
|
data->callback = [this, peer_fd](ring_data_t *data) { handle_read(data, peer_fd); };
|
2019-12-13 22:53:59 +03:00
|
|
|
my_uring_prep_recvmsg(sqe, peer_fd, &cl.read_msg, 0);
|
2019-12-13 14:05:11 +03:00
|
|
|
cl.reading = true;
|
2019-12-13 22:53:59 +03:00
|
|
|
cl.read_ready = false;
|
2019-12-13 14:05:11 +03:00
|
|
|
}
|
2019-12-13 22:53:59 +03:00
|
|
|
read_ready_clients.clear();
|
2019-12-13 14:05:11 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * Completion handler of a recvmsg request prepared by read_requests().
 *
 * Advances the client's read state machine: once the operation packet is complete, a
 * data buffer is allocated for read/write/stabilize operations; write and stabilize
 * operations then continue with receiving their payload, everything else is passed to
 * enqueue_op() right away. Once the payload is complete the operation is enqueued too.
 *
 * Changes compared to the previous version:
 * - a failed buffer allocation disconnects the client instead of silently restarting
 *   the state machine with a NULL buffer (which leaked the operation);
 * - a zero-length payload is treated as already received instead of leaving the
 *   client waiting in CL_READ_DATA for bytes that will never arrive.
 *
 * @param data    ring completion data; data->res is the recvmsg result
 * @param peer_fd descriptor of the client the read was started for
 */
void osd_t::handle_read(ring_data_t *data, int peer_fd)
{
    auto cl_it = clients.find(peer_fd);
    if (cl_it == clients.end())
    {
        // The client was stopped while the read was in flight
        return;
    }
    auto & cl = cl_it->second;
    if (data->res < 0 && data->res != -EAGAIN)
    {
        // this is a client socket, so don't panic. just disconnect it
        printf("Client %d socket read error: %d (%s). Disconnecting client\n", peer_fd, -data->res, strerror(-data->res));
        stop_client(peer_fd);
        return;
    }
    cl.reading = false;
    if (cl.read_ready)
    {
        // More data was signalled while this read was in flight
        read_ready_clients.push_back(peer_fd);
    }
    if (data->res <= 0)
    {
        return;
    }
    cl.read_remaining -= data->res;
    cl.read_buf += data->res;
    if (cl.read_remaining > 0)
    {
        // Partial read: the next request continues at the advanced position
        return;
    }
    osd_op_t *cur_op = cl.read_op;
    cl.read_buf = NULL;
    bool op_ready = false;
    if (cl.read_state == CL_READ_OP)
    {
        const bool has_payload =
            cur_op->op.hdr.opcode == OSD_OP_SECONDARY_WRITE ||
            cur_op->op.hdr.opcode == OSD_OP_SECONDARY_STABILIZE;
        if (cur_op->op.hdr.opcode == OSD_OP_SECONDARY_READ || has_payload)
        {
            // Allocate a buffer
            // NOTE(review): the length comes straight from the peer and is only validated
            // later in enqueue_op() - consider rejecting oversized requests before allocating.
            cur_op->buf = memalign(512, cur_op->op.sec_rw.len);
            if (!cur_op->buf && cur_op->op.sec_rw.len > 0)
            {
                printf("Client %d: failed to allocate %llu bytes for the operation buffer. Disconnecting client\n",
                    peer_fd, (unsigned long long)cur_op->op.sec_rw.len);
                cl.read_op = NULL;
                delete cur_op;
                stop_client(peer_fd);
                return;
            }
        }
        if (has_payload && cur_op->op.sec_rw.len > 0)
        {
            // Read data
            cl.read_buf = cur_op->buf;
            cl.read_remaining = cur_op->op.sec_rw.len;
            cl.read_state = CL_READ_DATA;
        }
        else
        {
            op_ready = true;
        }
    }
    else if (cl.read_state == CL_READ_DATA)
    {
        op_ready = true;
    }
    if (op_ready)
    {
        // Command is ready
        cur_op->peer_fd = peer_fd;
        enqueue_op(cur_op);
        cl.read_op = NULL;
        cl.read_state = 0;
    }
}
|
|
|
|
|
2019-12-15 15:30:51 +03:00
|
|
|
/**
 * Queue a finished operation for sending its reply to the client that issued it.
 *
 * When the client is gone, the operation is simply deleted. Otherwise it is appended
 * to the client's completion queue; an idle client (write_state 0) is additionally
 * marked CL_WRITE_READY and put into write_ready_clients. The ring loop is woken up
 * so that send_replies() runs.
 *
 * @param cur_op the finished operation; ownership passes to this function
 */
void osd_t::blockstore_op_callback(osd_op_t *cur_op)
{
    auto found = clients.find(cur_op->peer_fd);
    if (found == clients.end())
    {
        // Nobody is left to receive the reply
        delete cur_op;
        return;
    }
    auto & client = found->second;
    const bool was_idle = (client.write_state == 0);
    if (was_idle)
    {
        client.write_state = CL_WRITE_READY;
        write_ready_clients.push_back(cur_op->peer_fd);
    }
    client.completions.push_back(cur_op);
    ringloop->wakeup();
}
|
|
|
|
|
|
|
|
/**
 * Validate a received operation and pass it to the blockstore.
 *
 * Operations with a wrong magic number, an opcode outside [OSD_OP_MIN, OSD_OP_MAX] or
 * - for reads and writes - a length/offset that is too large or misaligned are
 * answered with -EINVAL. OSD_OP_TEST_SYNC_STAB_ALL (only when allow_test_ops is set)
 * runs a sync followed by a stabilize of all unstable writes. Every other opcode is
 * translated into the corresponding blockstore operation.
 *
 * The reply is always produced through blockstore_op_callback().
 *
 * @param cur_op the received operation; its peer_fd must already be set
 */
void osd_t::enqueue_op(osd_op_t *cur_op)
{
    const auto opcode = cur_op->op.hdr.opcode;
    const bool is_rw = (opcode == OSD_OP_SECONDARY_READ || opcode == OSD_OP_SECONDARY_WRITE);
    const bool bad_header = (cur_op->op.hdr.magic != SECONDARY_OSD_OP_MAGIC ||
        opcode < OSD_OP_MIN || opcode > OSD_OP_MAX);
    // Grouped explicitly: the original expression relied on && binding tighter than ||
    const bool bad_rw = is_rw &&
        (cur_op->op.sec_rw.len > OSD_RW_MAX || cur_op->op.sec_rw.len % OSD_RW_ALIGN || cur_op->op.sec_rw.offset % OSD_RW_ALIGN);
    if (bad_header || bad_rw)
    {
        // Bad command
        cur_op->bs_op.retval = -EINVAL;
        blockstore_op_callback(cur_op);
        return;
    }
    if (opcode == OSD_OP_TEST_SYNC_STAB_ALL)
    {
        // Sync and stabilize all objects
        // This command is only valid for tests
        // FIXME: Dedup between here & fio_engine
        if (!allow_test_ops)
        {
            cur_op->bs_op.retval = -EINVAL;
            blockstore_op_callback(cur_op);
            return;
        }
        cur_op->bs_op.flags = OP_SYNC;
        cur_op->bs_op.callback = [this, cur_op](blockstore_op_t *op)
        {
            // Copy the captures into locals: op->callback is reassigned below, which
            // replaces the closure that is currently executing
            osd_t *self = this;
            osd_op_t *osd_op = cur_op;
            auto & unstable_writes = self->bs->get_unstable_writes();
            if (op->retval >= 0 && unstable_writes.size() > 0)
            {
                op->flags = OP_STABLE;
                op->len = unstable_writes.size();
                obj_ver_id *vers = new obj_ver_id[op->len];
                op->buf = vers;
                int i = 0;
                for (auto it = unstable_writes.begin(); it != unstable_writes.end(); it++, i++)
                {
                    vers[i] = {
                        .oid = it->first,
                        .version = it->second,
                    };
                }
                unstable_writes.clear();
                op->callback = [self, osd_op](blockstore_op_t *op)
                {
                    // Free the version list BEFORE reporting completion: `op` lives
                    // inside osd_op, which blockstore_op_callback() may delete
                    obj_ver_id *vers = (obj_ver_id*)op->buf;
                    op->buf = NULL;
                    delete[] vers;
                    self->blockstore_op_callback(osd_op);
                };
                self->bs->enqueue_op(op);
            }
            else
            {
                self->blockstore_op_callback(osd_op);
            }
        };
        bs->enqueue_op(&cur_op->bs_op);
        return;
    }
    // FIXME: LIST is not a blockstore op yet
    cur_op->bs_op.callback = [this, cur_op](blockstore_op_t* bs_op) { blockstore_op_callback(cur_op); };
    if (is_rw)
    {
        cur_op->bs_op.flags = (opcode == OSD_OP_SECONDARY_READ ? OP_READ : OP_WRITE);
        cur_op->bs_op.oid = cur_op->op.sec_rw.oid;
        cur_op->bs_op.version = cur_op->op.sec_rw.version;
        cur_op->bs_op.offset = cur_op->op.sec_rw.offset;
        cur_op->bs_op.len = cur_op->op.sec_rw.len;
        cur_op->bs_op.buf = cur_op->buf;
    }
    else if (opcode == OSD_OP_SECONDARY_SYNC)
    {
        cur_op->bs_op.flags = OP_SYNC;
    }
    else if (opcode == OSD_OP_SECONDARY_STABILIZE)
    {
        cur_op->bs_op.flags = OP_STABLE;
        // The payload is an array of obj_ver_id; the blockstore expects the element count
        cur_op->bs_op.len = cur_op->op.sec_stabilize.len/sizeof(obj_ver_id);
        cur_op->bs_op.buf = cur_op->buf;
    }
    else if (opcode == OSD_OP_SECONDARY_DELETE)
    {
        cur_op->bs_op.flags = OP_DELETE;
        cur_op->bs_op.oid = cur_op->op.sec_del.oid;
        cur_op->bs_op.version = cur_op->op.sec_del.version;
    }
    else
    {
        // Same fallback value the original conditional chain produced for other opcodes
        cur_op->bs_op.flags = -1;
    }
    bs->enqueue_op(&cur_op->bs_op);
}
|
|
|
|
|
|
|
|
void osd_t::send_replies()
|
|
|
|
{
|
|
|
|
for (int i = 0; i < write_ready_clients.size(); i++)
|
|
|
|
{
|
|
|
|
int peer_fd = write_ready_clients[i];
|
|
|
|
auto & cl = clients[peer_fd];
|
|
|
|
io_uring_sqe* sqe = ringloop->get_sqe();
|
|
|
|
if (!sqe)
|
|
|
|
{
|
|
|
|
write_ready_clients.erase(write_ready_clients.begin(), write_ready_clients.begin() + i);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
|
|
|
if (!cl.write_buf)
|
|
|
|
{
|
|
|
|
// pick next command
|
|
|
|
cl.write_op = cl.completions.front();
|
|
|
|
cl.completions.pop_front();
|
|
|
|
make_reply(cl.write_op);
|
|
|
|
cl.write_buf = &cl.write_op->reply_buf;
|
|
|
|
cl.write_remaining = OSD_REPLY_PACKET_SIZE;
|
|
|
|
cl.write_state = CL_WRITE_REPLY;
|
|
|
|
}
|
|
|
|
cl.write_iov.iov_base = cl.write_buf;
|
|
|
|
cl.write_iov.iov_len = cl.write_remaining;
|
|
|
|
cl.write_msg.msg_iov = &cl.write_iov;
|
|
|
|
cl.write_msg.msg_iovlen = 1;
|
|
|
|
data->callback = [this, peer_fd](ring_data_t *data) { handle_send(data, peer_fd); };
|
|
|
|
my_uring_prep_sendmsg(sqe, peer_fd, &cl.write_msg, 0);
|
|
|
|
cl.write_state = cl.write_state | SQE_SENT;
|
|
|
|
}
|
|
|
|
write_ready_clients.clear();
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * Fill the reply header of an operation from its request header and blockstore result.
 *
 * @param op operation whose reply header is written
 */
void osd_t::make_reply(osd_op_t *op)
{
    // Result reported by the blockstore (or set directly for rejected commands)
    op->reply.hdr.retval = op->bs_op.retval;
    // Echo the request id back in the reply
    op->reply.hdr.id = op->op.hdr.id;
    op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
}
|
|
|
|
|
|
|
|
/**
 * Completion handler of a sendmsg request prepared by send_replies().
 *
 * Advances the client's write state machine: after the reply packet of a successful
 * read has been sent, the read data follows; in every other case the operation is
 * finished and freed. The client is queued again while it has anything left to send.
 *
 * Changes compared to the previous version: OSD_OP_SECONDARY_LIST replies used to fall
 * through without finishing the operation, which leaked it and left the client in
 * CL_WRITE_REPLY with no buffer. They are now finished like any reply without a
 * payload. The goto into a sibling block is replaced by a flag.
 *
 * @param data    ring completion data; data->res is the sendmsg result
 * @param peer_fd descriptor of the client the send was started for
 */
void osd_t::handle_send(ring_data_t *data, int peer_fd)
{
    auto cl_it = clients.find(peer_fd);
    if (cl_it == clients.end())
    {
        // The client was stopped while the send was in flight
        return;
    }
    auto & cl = cl_it->second;
    if (data->res < 0 && data->res != -EAGAIN)
    {
        // this is a client socket, so don't panic. just disconnect it
        printf("Client %d socket write error: %d (%s). Disconnecting client\n", peer_fd, -data->res, strerror(-data->res));
        stop_client(peer_fd);
        return;
    }
    cl.write_state = cl.write_state & ~SQE_SENT;
    if (data->res > 0)
    {
        cl.write_remaining -= data->res;
        cl.write_buf += data->res;
        if (cl.write_remaining <= 0)
        {
            cl.write_buf = NULL;
            osd_op_t *cur_op = cl.write_op;
            bool op_done = false;
            if (cl.write_state == CL_WRITE_REPLY)
            {
                if (cur_op->op.hdr.opcode == OSD_OP_SECONDARY_READ &&
                    cur_op->reply.hdr.retval > 0)
                {
                    // Send data
                    cl.write_buf = cur_op->buf;
                    cl.write_remaining = cur_op->reply.hdr.retval;
                    cl.write_state = CL_WRITE_DATA;
                }
                else
                {
                    // FIXME: OSD_OP_SECONDARY_LIST has no payload support yet, so its
                    // reply is complete after the header like every other opcode
                    op_done = true;
                }
            }
            else if (cl.write_state == CL_WRITE_DATA)
            {
                op_done = true;
            }
            if (op_done)
            {
                // Done
                delete cur_op;
                cl.write_op = NULL;
                cl.write_state = cl.completions.size() > 0 ? CL_WRITE_READY : 0;
            }
        }
    }
    if (cl.write_state != 0)
    {
        // Partial send, payload still to send or more completions queued
        write_ready_clients.push_back(peer_fd);
    }
}
|