2019-12-11 14:18:19 +03:00
|
|
|
#include <sys/socket.h>
|
|
|
|
#include <sys/epoll.h>
|
2019-12-14 20:51:41 +03:00
|
|
|
#include <sys/poll.h>
|
2019-12-11 14:18:19 +03:00
|
|
|
#include <netinet/in.h>
|
2020-01-09 20:20:56 +03:00
|
|
|
#include <netinet/tcp.h>
|
2019-12-11 14:18:19 +03:00
|
|
|
#include <arpa/inet.h>
|
|
|
|
|
2019-12-15 01:11:51 +03:00
|
|
|
#include "osd.h"
|
2020-04-20 17:44:03 +03:00
|
|
|
#include "osd_http.h"
|
2019-12-11 14:18:19 +03:00
|
|
|
|
2020-03-31 17:50:50 +03:00
|
|
|
// Printable operation names. print_stats() indexes this table with the op number
// it iterates over; entry 0 is an empty placeholder.
const char* osd_op_names[] = {
    "",
    "read", "write", "sync",
    "stabilize", "rollback", "delete",
    "sync_stab_all", "list", "show_config",
    "primary_read", "primary_write", "primary_sync",
};
|
|
|
|
|
2019-12-15 14:49:10 +03:00
|
|
|
// Builds an OSD on top of an already created blockstore and ring loop:
// stores the configuration, creates the epoll descriptor and the timers,
// starts cluster initialisation and registers loop() as a ring loop consumer.
// Throws std::runtime_error if the epoll descriptor can't be created.
osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop)
{
    this->config = config;
    this->bs = bs;
    this->ringloop = ringloop;

    bs_block_size = bs->get_block_size();
    // FIXME: use bitmap granularity instead
    bs_disk_alignment = bs->get_disk_alignment();

    // Settings must be parsed first: print_stats_interval is used for the timer below
    parse_config(config);

    epoll_fd = epoll_create(1);
    if (epoll_fd < 0)
    {
        throw std::runtime_error(std::string("epoll_create: ") + strerror(errno));
    }

    // Periodic statistics output
    stats_tfd = new timerfd_interval(ringloop, print_stats_interval, [this]() { print_stats(); });

    tfd = new timerfd_manager_t(ringloop);

    init_cluster();

    consumer.loop = [this]()
    {
        loop();
    };
    ringloop->register_consumer(&consumer);
}
|
|
|
|
|
|
|
|
// Destroys all timers, unregisters the consumer from the ring loop and closes
// the epoll and listening descriptors.
osd_t::~osd_t()
{
    // `delete` on a null pointer does nothing, so the timers need no explicit guards
    delete tfd;
    tfd = NULL;

    delete stats_tfd;
    stats_tfd = NULL;

    delete sync_tfd;
    sync_tfd = NULL;

    delete consul_tfd;
    consul_tfd = NULL;

    ringloop->unregister_consumer(&consumer);
    close(epoll_fd);
    close(listen_fd);
}
|
|
|
|
|
|
|
|
// Releases everything the operation owns: the blockstore sub-operation and
// the op_data, rmw_buf and buf allocations.
osd_op_t::~osd_op_t()
{
    // Both `delete` and free() accept null pointers, so no guards are required
    delete bs_op;
    free(op_data);
    free(rmw_buf);
    // Note: reusing osd_op_t WILL currently lead to memory leaks
    // So we don't reuse it, but free it every time
    free(buf);
}
|
|
|
|
|
|
|
|
// Reads the OSD settings from `config` into the corresponding member fields,
// substituting defaults for values that are absent, zero or out of range.
// Throws std::runtime_error if "osd_num" is missing or evaluates to 0.
void osd_t::parse_config(blockstore_config_t & config)
{
    // Initial startup configuration
    consul_address = config["consul_address"];
    // find() returns the unsigned npos when there is no ':', so the result has to be
    // compared with npos - a ">= 0" test is always true
    const auto colon_pos = consul_address.find(':');
    consul_host = colon_pos != std::string::npos ? consul_address.substr(0, colon_pos) : consul_address;
    consul_prefix = config["consul_prefix"];
    if (consul_prefix == "")
        consul_prefix = "microceph";
    consul_report_interval = strtoull(config["consul_report_interval"].c_str(), NULL, 10);
    if (consul_report_interval <= 0)
        consul_report_interval = 30;
    osd_num = strtoull(config["osd_num"].c_str(), NULL, 10);
    if (!osd_num)
        throw std::runtime_error("osd_num is required in the configuration");
    // Primary mode stays enabled unless it is explicitly switched off
    run_primary = config["run_primary"] != "false" && config["run_primary"] != "0" && config["run_primary"] != "no";
    // Cluster configuration
    bind_address = config["bind_address"];
    if (bind_address == "")
        bind_address = "0.0.0.0";
    bind_port = stoull_full(config["bind_port"]);
    // An invalid port is replaced with 0
    if (bind_port <= 0 || bind_port > 65535)
        bind_port = 0;
    if (config.find("bind_port_range_start") != config.end())
        bind_port_range_start = stoull_full(config["bind_port_range_start"]);
    if (config.find("bind_port_range_end") != config.end())
        bind_port_range_end = stoull_full(config["bind_port_range_end"]);
    // Any other value leaves immediate_commit untouched
    if (config["immediate_commit"] == "all")
        immediate_commit = IMMEDIATE_ALL;
    else if (config["immediate_commit"] == "small")
        immediate_commit = IMMEDIATE_SMALL;
    autosync_interval = strtoull(config["autosync_interval"].c_str(), NULL, 10);
    if (autosync_interval < 0 || autosync_interval > MAX_AUTOSYNC_INTERVAL)
        autosync_interval = DEFAULT_AUTOSYNC_INTERVAL;
    recovery_queue_depth = strtoull(config["recovery_queue_depth"].c_str(), NULL, 10);
    if (recovery_queue_depth < 1 || recovery_queue_depth > MAX_RECOVERY_QUEUE)
        recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
    if (config["readonly"] == "true" || config["readonly"] == "1" || config["readonly"] == "yes")
        readonly = true;
    print_stats_interval = strtoull(config["print_stats_interval"].c_str(), NULL, 10);
    if (!print_stats_interval)
        print_stats_interval = 3;
    peer_connect_interval = strtoull(config["peer_connect_interval"].c_str(), NULL, 10);
    if (!peer_connect_interval)
        peer_connect_interval = 5;
    http_request_timeout = strtoull(config["http_request_timeout"].c_str(), NULL, 10);
    if (!http_request_timeout)
        http_request_timeout = 5;
    peer_connect_timeout = strtoull(config["peer_connect_timeout"].c_str(), NULL, 10);
    if (!peer_connect_timeout)
        peer_connect_timeout = 5;
}
|
2019-12-11 14:18:19 +03:00
|
|
|
|
2020-04-02 22:16:46 +03:00
|
|
|
void osd_t::bind_socket()
|
|
|
|
{
|
2019-12-11 14:18:19 +03:00
|
|
|
listen_fd = socket(AF_INET, SOCK_STREAM, 0);
|
|
|
|
if (listen_fd < 0)
|
|
|
|
{
|
|
|
|
throw std::runtime_error(std::string("socket: ") + strerror(errno));
|
|
|
|
}
|
2019-12-13 14:05:11 +03:00
|
|
|
int enable = 1;
|
|
|
|
setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable));
|
2019-12-11 14:18:19 +03:00
|
|
|
|
|
|
|
sockaddr_in addr;
|
2019-12-14 20:51:41 +03:00
|
|
|
int r;
|
|
|
|
if ((r = inet_pton(AF_INET, bind_address.c_str(), &addr.sin_addr)) != 1)
|
2019-12-11 14:18:19 +03:00
|
|
|
{
|
|
|
|
close(listen_fd);
|
|
|
|
throw std::runtime_error("bind address "+bind_address+(r == 0 ? " is not valid" : ": no ipv4 support"));
|
|
|
|
}
|
|
|
|
addr.sin_family = AF_INET;
|
|
|
|
|
2020-04-20 17:44:03 +03:00
|
|
|
if (bind_port == 0 && bind_port_range_start > 0 &&
|
|
|
|
bind_port_range_end > bind_port_range_start && bind_port_range_end < 65535)
|
2019-12-11 14:18:19 +03:00
|
|
|
{
|
2020-04-20 17:44:03 +03:00
|
|
|
for (listening_port = bind_port_range_start; listening_port != bind_port_range_end; listening_port++)
|
|
|
|
{
|
|
|
|
addr.sin_port = htons(listening_port);
|
|
|
|
if (bind(listen_fd, (sockaddr*)&addr, sizeof(addr)) == 0)
|
|
|
|
{
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (listening_port == bind_port_range_end)
|
|
|
|
{
|
|
|
|
listening_port = 0;
|
|
|
|
close(listen_fd);
|
|
|
|
throw std::runtime_error(std::string("bind: ") + strerror(errno));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
addr.sin_port = htons(bind_port);
|
|
|
|
if (bind(listen_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
|
|
|
|
{
|
|
|
|
close(listen_fd);
|
|
|
|
throw std::runtime_error(std::string("bind: ") + strerror(errno));
|
|
|
|
}
|
|
|
|
if (bind_port == 0)
|
|
|
|
{
|
|
|
|
socklen_t len = sizeof(addr);
|
|
|
|
if (getsockname(listen_fd, (sockaddr *)&addr, &len) == -1)
|
|
|
|
{
|
|
|
|
close(listen_fd);
|
|
|
|
throw std::runtime_error(std::string("getsockname: ") + strerror(errno));
|
|
|
|
}
|
|
|
|
listening_port = ntohs(addr.sin_port);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
listening_port = bind_port;
|
|
|
|
}
|
2019-12-11 14:18:19 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
if (listen(listen_fd, listen_backlog) < 0)
|
|
|
|
{
|
|
|
|
close(listen_fd);
|
|
|
|
throw std::runtime_error(std::string("listen: ") + strerror(errno));
|
|
|
|
}
|
|
|
|
|
2019-12-12 11:32:20 +03:00
|
|
|
fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
|
|
|
|
|
2019-12-13 22:53:59 +03:00
|
|
|
epoll_event ev;
|
2019-12-11 14:18:19 +03:00
|
|
|
ev.data.fd = listen_fd;
|
2020-02-29 01:46:12 +03:00
|
|
|
ev.events = EPOLLIN | EPOLLET;
|
2019-12-12 11:32:20 +03:00
|
|
|
if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, listen_fd, &ev) < 0)
|
2019-12-11 14:18:19 +03:00
|
|
|
{
|
2020-01-09 18:39:58 +03:00
|
|
|
close(listen_fd);
|
|
|
|
close(epoll_fd);
|
2019-12-11 14:18:19 +03:00
|
|
|
throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
|
|
|
|
}
|
2019-12-23 21:56:03 +03:00
|
|
|
}
|
|
|
|
|
2019-12-15 01:52:08 +03:00
|
|
|
bool osd_t::shutdown()
|
|
|
|
{
|
2019-12-19 22:16:04 +03:00
|
|
|
stopping = true;
|
|
|
|
if (inflight_ops > 0)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return bs->is_safe_to_stop();
|
2019-12-15 01:52:08 +03:00
|
|
|
}
|
|
|
|
|
2019-12-11 14:18:19 +03:00
|
|
|
void osd_t::loop()
|
|
|
|
{
|
2020-02-29 01:46:12 +03:00
|
|
|
if (!wait_state)
|
2019-12-16 21:37:13 +03:00
|
|
|
{
|
|
|
|
handle_epoll_events();
|
2020-02-29 01:46:12 +03:00
|
|
|
wait_state = 1;
|
2019-12-16 21:37:13 +03:00
|
|
|
}
|
2020-01-21 14:04:58 +03:00
|
|
|
handle_peers();
|
2019-12-14 20:51:41 +03:00
|
|
|
read_requests();
|
2020-02-29 01:46:12 +03:00
|
|
|
send_replies();
|
2019-12-11 14:18:19 +03:00
|
|
|
ringloop->submit();
|
|
|
|
}
|
|
|
|
|
2020-02-29 01:46:12 +03:00
|
|
|
void osd_t::handle_epoll_events()
|
2019-12-11 14:18:19 +03:00
|
|
|
{
|
2020-03-03 01:47:56 +03:00
|
|
|
io_uring_sqe *sqe = ringloop->get_sqe();
|
|
|
|
if (!sqe)
|
|
|
|
{
|
|
|
|
throw std::runtime_error("can't get SQE, will fall out of sync with EPOLLET");
|
|
|
|
}
|
|
|
|
ring_data_t *data = ((ring_data_t*)sqe->user_data);
|
|
|
|
my_uring_prep_poll_add(sqe, epoll_fd, POLLIN);
|
|
|
|
data->callback = [this](ring_data_t *data)
|
|
|
|
{
|
|
|
|
if (data->res < 0)
|
|
|
|
{
|
|
|
|
throw std::runtime_error(std::string("epoll failed: ") + strerror(-data->res));
|
|
|
|
}
|
|
|
|
handle_epoll_events();
|
|
|
|
};
|
|
|
|
ringloop->submit();
|
2020-02-29 01:46:12 +03:00
|
|
|
int nfds;
|
2019-12-12 11:32:20 +03:00
|
|
|
epoll_event events[MAX_EPOLL_EVENTS];
|
2020-02-29 01:46:12 +03:00
|
|
|
restart:
|
|
|
|
nfds = epoll_wait(epoll_fd, events, MAX_EPOLL_EVENTS, 0);
|
2019-12-16 21:37:13 +03:00
|
|
|
for (int i = 0; i < nfds; i++)
|
2019-12-12 11:32:20 +03:00
|
|
|
{
|
2019-12-16 21:37:13 +03:00
|
|
|
if (events[i].data.fd == listen_fd)
|
2019-12-12 11:32:20 +03:00
|
|
|
{
|
2019-12-16 21:37:13 +03:00
|
|
|
// Accept new connections
|
|
|
|
sockaddr_in addr;
|
|
|
|
socklen_t peer_addr_size = sizeof(addr);
|
|
|
|
int peer_fd;
|
|
|
|
while ((peer_fd = accept(listen_fd, (sockaddr*)&addr, &peer_addr_size)) >= 0)
|
2019-12-12 11:32:20 +03:00
|
|
|
{
|
2019-12-16 21:37:13 +03:00
|
|
|
char peer_str[256];
|
|
|
|
printf("osd: new client %d: connection from %s port %d\n", peer_fd, inet_ntop(AF_INET, &addr.sin_addr, peer_str, 256), ntohs(addr.sin_port));
|
|
|
|
fcntl(peer_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
|
2020-01-09 20:20:56 +03:00
|
|
|
int one = 1;
|
|
|
|
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
|
2019-12-16 21:37:13 +03:00
|
|
|
clients[peer_fd] = {
|
|
|
|
.peer_addr = addr,
|
2020-01-04 01:23:25 +03:00
|
|
|
.peer_port = ntohs(addr.sin_port),
|
2019-12-16 21:37:13 +03:00
|
|
|
.peer_fd = peer_fd,
|
2020-01-04 01:23:25 +03:00
|
|
|
.peer_state = PEER_CONNECTED,
|
2020-03-02 02:58:00 +03:00
|
|
|
.in_buf = malloc(receive_buffer_size),
|
2019-12-16 21:37:13 +03:00
|
|
|
};
|
|
|
|
// Add FD to epoll
|
|
|
|
epoll_event ev;
|
|
|
|
ev.data.fd = peer_fd;
|
2020-02-29 01:46:12 +03:00
|
|
|
ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
|
2019-12-16 21:37:13 +03:00
|
|
|
if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, peer_fd, &ev) < 0)
|
2019-12-12 11:32:20 +03:00
|
|
|
{
|
2019-12-16 21:37:13 +03:00
|
|
|
throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
|
2019-12-12 11:32:20 +03:00
|
|
|
}
|
2019-12-16 21:37:13 +03:00
|
|
|
// Try to accept next connection
|
|
|
|
peer_addr_size = sizeof(addr);
|
2019-12-12 11:32:20 +03:00
|
|
|
}
|
2019-12-16 21:37:13 +03:00
|
|
|
if (peer_fd == -1 && errno != EAGAIN)
|
2019-12-12 11:32:20 +03:00
|
|
|
{
|
2019-12-16 21:37:13 +03:00
|
|
|
throw std::runtime_error(std::string("accept: ") + strerror(errno));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2020-04-07 01:53:13 +03:00
|
|
|
auto cl_it = clients.find(events[i].data.fd);
|
|
|
|
if (cl_it != clients.end())
|
2020-02-09 18:22:29 +03:00
|
|
|
{
|
2020-04-07 01:53:13 +03:00
|
|
|
auto & cl = cl_it->second;
|
|
|
|
if (cl.peer_state == PEER_CONNECTING)
|
|
|
|
{
|
|
|
|
// Either OUT (connected) or HUP
|
|
|
|
handle_connect_result(cl.peer_fd);
|
|
|
|
}
|
|
|
|
else if (events[i].events & EPOLLRDHUP)
|
|
|
|
{
|
|
|
|
// Stop client
|
|
|
|
printf("osd: client %d disconnected\n", cl.peer_fd);
|
|
|
|
stop_client(cl.peer_fd);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
// Mark client as ready (i.e. some data is available)
|
|
|
|
cl.read_ready++;
|
|
|
|
if (cl.read_ready == 1)
|
|
|
|
{
|
|
|
|
read_ready_clients.push_back(cl.peer_fd);
|
|
|
|
ringloop->wakeup();
|
|
|
|
}
|
|
|
|
}
|
2019-12-16 21:37:13 +03:00
|
|
|
}
|
2020-02-29 01:46:12 +03:00
|
|
|
else
|
2019-12-16 21:37:13 +03:00
|
|
|
{
|
2020-04-07 01:53:13 +03:00
|
|
|
auto & cb = epoll_handlers[events[i].data.fd];
|
2020-04-12 02:08:00 +03:00
|
|
|
cb(events[i].data.fd, events[i].events);
|
2019-12-12 11:32:20 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2020-02-29 01:46:12 +03:00
|
|
|
if (nfds == MAX_EPOLL_EVENTS)
|
|
|
|
{
|
|
|
|
goto restart;
|
|
|
|
}
|
2019-12-11 14:18:19 +03:00
|
|
|
}
|
2019-12-13 14:05:11 +03:00
|
|
|
|
2020-02-11 13:59:10 +03:00
|
|
|
// Cancels every operation attached to the client: the ones in sent_ops,
// the ones in outbox and the current write_op, emptying all three.
void osd_t::cancel_osd_ops(osd_client_t & cl)
{
    for (const auto & sent: cl.sent_ops)
    {
        cancel_op(sent.second);
    }
    cl.sent_ops.clear();

    for (auto queued_op: cl.outbox)
    {
        cancel_op(queued_op);
    }
    cl.outbox.clear();

    if (!cl.write_op)
    {
        return;
    }
    cancel_op(cl.write_op);
    cl.write_op = NULL;
}
|
|
|
|
|
|
|
|
// Aborts a single operation with -EPIPE. Operations of type OSD_OP_OUT get a
// reply header filled in and their callback invoked; all other operations are
// completed through finish_op().
void osd_t::cancel_op(osd_op_t *op)
{
    if (op->op_type != OSD_OP_OUT)
    {
        finish_op(op, -EPIPE);
        return;
    }
    auto & reply_hdr = op->reply.hdr;
    reply_hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
    reply_hdr.id = op->req.hdr.id;
    reply_hdr.opcode = op->req.hdr.opcode;
    reply_hdr.retval = -EPIPE;
    // Copy lambda to be unaffected by `delete op`
    std::function<void(osd_op_t*)> on_complete(op->callback);
    on_complete(op);
}
|
|
|
|
|
2019-12-13 14:05:11 +03:00
|
|
|
void osd_t::stop_client(int peer_fd)
|
|
|
|
{
|
2020-02-11 13:59:10 +03:00
|
|
|
auto it = clients.find(peer_fd);
|
|
|
|
if (it == clients.end())
|
|
|
|
{
|
|
|
|
return;
|
|
|
|
}
|
2020-03-31 17:50:50 +03:00
|
|
|
osd_client_t cl = it->second;
|
2020-04-20 17:44:03 +03:00
|
|
|
if (cl.peer_state == PEER_CONNECTED)
|
2020-03-31 17:50:50 +03:00
|
|
|
{
|
2020-04-20 17:44:03 +03:00
|
|
|
if (cl.osd_num)
|
2020-04-19 01:01:26 +03:00
|
|
|
{
|
|
|
|
// Reload configuration from Consul when the connection is dropped
|
2020-04-20 17:44:03 +03:00
|
|
|
printf("[%lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl.osd_num);
|
2020-04-19 01:01:26 +03:00
|
|
|
peer_states.erase(cl.osd_num);
|
2020-04-20 17:44:03 +03:00
|
|
|
repeer_pgs(cl.osd_num);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
printf("[%lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
|
2020-04-19 01:01:26 +03:00
|
|
|
}
|
2020-03-31 17:50:50 +03:00
|
|
|
}
|
|
|
|
clients.erase(it);
|
2020-01-09 18:39:58 +03:00
|
|
|
if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, peer_fd, NULL) < 0)
|
2019-12-13 14:05:11 +03:00
|
|
|
{
|
|
|
|
throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
|
|
|
|
}
|
2020-02-11 13:59:10 +03:00
|
|
|
if (cl.osd_num)
|
2020-01-04 01:23:25 +03:00
|
|
|
{
|
2020-02-11 13:59:10 +03:00
|
|
|
// Cancel outbound operations
|
|
|
|
cancel_osd_ops(cl);
|
|
|
|
osd_peer_fds.erase(cl.osd_num);
|
2020-03-13 21:41:54 +03:00
|
|
|
peering_state |= OSD_CONNECTING_PEERS;
|
2020-01-04 01:23:25 +03:00
|
|
|
}
|
2020-02-11 13:59:10 +03:00
|
|
|
if (cl.read_op)
|
|
|
|
{
|
|
|
|
delete cl.read_op;
|
|
|
|
}
|
2019-12-17 01:44:08 +03:00
|
|
|
for (auto rit = read_ready_clients.begin(); rit != read_ready_clients.end(); rit++)
|
2019-12-13 14:05:11 +03:00
|
|
|
{
|
2019-12-17 01:44:08 +03:00
|
|
|
if (*rit == peer_fd)
|
2019-12-13 14:05:11 +03:00
|
|
|
{
|
2019-12-17 01:44:08 +03:00
|
|
|
read_ready_clients.erase(rit);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (auto wit = write_ready_clients.begin(); wit != write_ready_clients.end(); wit++)
|
|
|
|
{
|
|
|
|
if (*wit == peer_fd)
|
|
|
|
{
|
|
|
|
write_ready_clients.erase(wit);
|
|
|
|
break;
|
2019-12-13 14:05:11 +03:00
|
|
|
}
|
|
|
|
}
|
2020-03-02 02:58:00 +03:00
|
|
|
free(cl.in_buf);
|
2019-12-13 14:05:11 +03:00
|
|
|
close(peer_fd);
|
|
|
|
}
|
|
|
|
|
2019-12-28 01:25:55 +03:00
|
|
|
// Entry point for an incoming operation: records its start time, validates
// the request header and dispatches the operation to the matching handler.
// While the OSD is stopping the operation is simply deleted. Invalid requests
// are completed with -EINVAL.
void osd_t::exec_op(osd_op_t *cur_op)
{
    clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
    if (stopping)
    {
        // Throw operation away
        delete cur_op;
        return;
    }
    inflight_ops++;
    cur_op->send_list.push_back(cur_op->reply.buf, OSD_PACKET_SIZE);
    const auto opcode = cur_op->req.hdr.opcode;
    // Reject a wrong magic, an unknown opcode and oversized or misaligned read/write requests
    if ((cur_op->req.hdr.magic != SECONDARY_OSD_OP_MAGIC) ||
        (opcode < OSD_OP_MIN) || (opcode > OSD_OP_MAX) ||
        ((opcode == OSD_OP_SECONDARY_READ || opcode == OSD_OP_SECONDARY_WRITE) &&
            (cur_op->req.sec_rw.len > OSD_RW_MAX ||
            cur_op->req.sec_rw.len % bs_disk_alignment ||
            cur_op->req.sec_rw.offset % bs_disk_alignment)) ||
        ((opcode == OSD_OP_READ || opcode == OSD_OP_WRITE) &&
            (cur_op->req.rw.len > OSD_RW_MAX ||
            cur_op->req.rw.len % bs_disk_alignment ||
            cur_op->req.rw.offset % bs_disk_alignment)))
    {
        // Bad command
        finish_op(cur_op, -EINVAL);
        return;
    }
    if (opcode == OSD_OP_TEST_SYNC_STAB_ALL)
    {
        exec_sync_stab_all(cur_op);
    }
    else if (opcode == OSD_OP_SHOW_CONFIG)
    {
        exec_show_config(cur_op);
    }
    // FIXME: Do not handle operations immediately, manage some sort of a queue instead
    else if (opcode == OSD_OP_READ)
    {
        continue_primary_read(cur_op);
    }
    else if (opcode == OSD_OP_WRITE)
    {
        continue_primary_write(cur_op);
    }
    else if (opcode == OSD_OP_SYNC)
    {
        continue_primary_sync(cur_op);
    }
    else
    {
        // Everything else is handled by exec_secondary()
        exec_secondary(cur_op);
    }
}
|
2020-04-02 22:16:46 +03:00
|
|
|
|
2020-04-14 14:37:50 +03:00
|
|
|
void osd_t::reset_stats()
|
2020-04-02 22:16:46 +03:00
|
|
|
{
|
2020-04-14 14:37:50 +03:00
|
|
|
for (int p = 0; p < 2; p++)
|
2020-04-02 22:16:46 +03:00
|
|
|
{
|
2020-04-14 14:37:50 +03:00
|
|
|
for (int i = 0; i <= OSD_OP_MAX; i++)
|
2020-04-02 22:16:46 +03:00
|
|
|
{
|
2020-04-14 14:37:50 +03:00
|
|
|
if (op_stat_count[p][i] != 0)
|
|
|
|
{
|
|
|
|
op_stat_count[p][i] = 0;
|
|
|
|
op_stat_sum[p][i] = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (int i = 0; i <= OSD_OP_MAX; i++)
|
|
|
|
{
|
|
|
|
if (subop_stat_count[p][i] != 0)
|
|
|
|
{
|
|
|
|
subop_stat_count[p][i] = 0;
|
|
|
|
subop_stat_sum[p][i] = 0;
|
|
|
|
}
|
2020-04-02 22:16:46 +03:00
|
|
|
}
|
|
|
|
}
|
2020-04-14 14:37:50 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void osd_t::print_stats()
|
|
|
|
{
|
2020-04-02 22:16:46 +03:00
|
|
|
for (int i = 0; i <= OSD_OP_MAX; i++)
|
|
|
|
{
|
2020-04-14 14:37:50 +03:00
|
|
|
if (op_stat_count[0][i] != op_stat_count[1][i])
|
2020-04-02 22:16:46 +03:00
|
|
|
{
|
2020-04-14 14:37:50 +03:00
|
|
|
uint64_t avg = (op_stat_sum[0][i] - op_stat_sum[1][i])/(op_stat_count[0][i] - op_stat_count[1][i]);
|
|
|
|
printf("avg latency for op %d (%s): %ld us\n", i, osd_op_names[i], avg);
|
|
|
|
op_stat_count[1][i] = op_stat_count[0][i];
|
|
|
|
op_stat_sum[1][i] = op_stat_sum[0][i];
|
2020-04-02 22:16:46 +03:00
|
|
|
}
|
|
|
|
}
|
2020-04-14 14:37:50 +03:00
|
|
|
for (int i = 0; i <= OSD_OP_MAX; i++)
|
2020-04-02 22:16:46 +03:00
|
|
|
{
|
2020-04-14 14:37:50 +03:00
|
|
|
if (subop_stat_count[0][i] != subop_stat_count[1][i])
|
|
|
|
{
|
|
|
|
uint64_t avg = (subop_stat_sum[0][i] - subop_stat_sum[1][i])/(subop_stat_count[0][i] - subop_stat_count[1][i]);
|
|
|
|
printf("avg latency for subop %d (%s): %ld us\n", i, osd_op_names[i], avg);
|
2020-04-20 17:44:03 +03:00
|
|
|
subop_stat_count[1][i] = subop_stat_count[0][i];
|
|
|
|
subop_stat_sum[1][i] = subop_stat_sum[0][i];
|
2020-04-14 14:37:50 +03:00
|
|
|
}
|
2020-04-02 22:16:46 +03:00
|
|
|
}
|
|
|
|
if (incomplete_objects > 0)
|
|
|
|
{
|
|
|
|
printf("%lu object(s) incomplete\n", incomplete_objects);
|
|
|
|
}
|
|
|
|
if (degraded_objects > 0)
|
|
|
|
{
|
|
|
|
printf("%lu object(s) degraded\n", degraded_objects);
|
|
|
|
}
|
|
|
|
if (misplaced_objects > 0)
|
|
|
|
{
|
|
|
|
printf("%lu object(s) misplaced\n", misplaced_objects);
|
|
|
|
}
|
|
|
|
}
|