Compare commits

..

1 Commits

Author SHA1 Message Date
83939f5a22 Test ZCTR 2022-01-01 02:30:25 +03:00
6 changed files with 173 additions and 269 deletions

View File

@@ -42,6 +42,7 @@ struct op_buf_t
struct sec_data
{
int connect_fd;
int data_fd;
/* block_size = 1 << block_order (128KB by default) */
uint64_t block_order = 17, block_size = 1 << 17;
std::unordered_map<uint64_t, op_buf_t*> queue;
@@ -157,6 +158,7 @@ static void sec_cleanup(struct thread_data *td)
sec_data *bsd = (sec_data*)td->io_ops_data;
if (bsd)
{
close(bsd->data_fd);
close(bsd->connect_fd);
delete bsd;
}
@@ -170,20 +172,20 @@ static int sec_init(struct thread_data *td)
bsd->block_order = o->block_order == 0 ? 17 : o->block_order;
bsd->block_size = 1 << o->block_order;
sockaddr addr;
if (!string_to_addr(std::string(o->host ? o->host : "127.0.0.1"), false, o->port > 0 ? o->port : 11203, &addr))
struct sockaddr_storage addr = { 0 };
if (!string_to_addr(o->host ? o->host : "127.0.0.1", false, o->port > 0 ? o->port : 11203, (struct sockaddr*)&addr))
{
fprintf(stderr, "server address: %s is not valid\n", o->host ? o->host : "127.0.0.1");
return 1;
}
bsd->connect_fd = socket(addr.sa_family, SOCK_STREAM, 0);
bsd->connect_fd = socket(addr.ss_family, SOCK_STREAM, 0);
if (bsd->connect_fd < 0)
{
perror("socket");
return 1;
}
if (connect(bsd->connect_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
if (connect(bsd->connect_fd, (sockaddr*)&addr, addr.ss_family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6)) < 0)
{
perror("connect");
return 1;
@@ -199,6 +201,39 @@ static int sec_init(struct thread_data *td)
}
}
if (!string_to_addr(o->host ? o->host : "127.0.0.1", false, 1 + (o->port > 0 ? o->port : 11203), (sockaddr*)&addr))
{
fprintf(stderr, "server address: %s is not valid\n", o->host ? o->host : "127.0.0.1");
return 1;
}
bsd->data_fd = socket(addr.ss_family, SOCK_STREAM, 0);
if (bsd->data_fd < 0)
{
perror("socket");
return 1;
}
/* int mss = 4096;
if (setsockopt(bsd->data_fd, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) < 0)
{
perror("setsockopt TCP_MAXSEG");
return 1;
}*/
if (connect(bsd->data_fd, (sockaddr*)&addr, addr.ss_family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6)) < 0)
{
perror("connect");
return 1;
}
setsockopt(bsd->data_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
if (o->zerocopy_send)
{
if (setsockopt(bsd->data_fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)) < 0)
{
perror("setsockopt zerocopy");
return 1;
}
}
// FIXME: read config (block size) from OSD
return 0;
@@ -298,16 +333,18 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
bsd->op_n++;
bsd->queue[n] = op_buf;
iovec iov[2] = { { .iov_base = op.buf, .iov_len = OSD_PACKET_SIZE } };
int iovcnt = 1, wtotal = OSD_PACKET_SIZE;
if (io->ddir == DDIR_WRITE)
{
iov[iovcnt++] = { .iov_base = io->xfer_buf, .iov_len = io->xfer_buflen };
wtotal += io->xfer_buflen;
// It may make you laugh but ZCTR is only stable if we write data before header :-) O_o
if (write_blocking(bsd->data_fd, io->xfer_buf, io->xfer_buflen) != io->xfer_buflen)
{
perror("write");
exit(1);
}
}
if (sendv_blocking(bsd->connect_fd, iov, iovcnt, opt->zerocopy_send ? MSG_ZEROCOPY : 0) != wtotal)
if (write_blocking(bsd->connect_fd, op.buf, OSD_PACKET_SIZE) != OSD_PACKET_SIZE)
{
perror("sendmsg");
perror("write");
exit(1);
}
@@ -346,23 +383,21 @@ static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int
fprintf(stderr, "Short read: retval = %ld instead of %llu\n", reply.hdr.retval, io->xfer_buflen);
exit(1);
}
// Support bitmap
uint64_t bitmap = 0;
int iovcnt = 0;
iovec iov[2];
if (reply.sec_rw.attr_len > 0)
{
if (reply.sec_rw.attr_len <= 8)
iov[iovcnt++] = { .iov_base = &bitmap, .iov_len = reply.sec_rw.attr_len };
{
uint64_t bitmap = 0;
read_blocking(bsd->connect_fd, &bitmap, reply.sec_rw.attr_len);
}
else
iov[iovcnt++] = { .iov_base = (void*)(bitmap = (uint64_t)malloc(reply.sec_rw.attr_len)), .iov_len = reply.sec_rw.attr_len };
}
iov[iovcnt++] = { .iov_base = io->xfer_buf, .iov_len = io->xfer_buflen };
readv_blocking(bsd->connect_fd, iov, iovcnt);
if (reply.sec_rw.attr_len > 8)
{
free((void*)bitmap);
{
void *bitmap = malloc(reply.sec_rw.attr_len);
read_blocking(bsd->connect_fd, bitmap, reply.sec_rw.attr_len);
free(bitmap);
}
}
read_blocking(bsd->data_fd, io->xfer_buf, io->xfer_buflen);
}
else if (io->ddir == DDIR_WRITE)
{

View File

@@ -4,12 +4,10 @@
#include <unistd.h>
#include <fcntl.h>
#include <sys/socket.h>
#include <sys/random.h>
#include <sys/epoll.h>
#include <netinet/tcp.h>
#include <stdexcept>
#include "base64.h"
#include "addr_util.h"
#include "messenger.h"
@@ -196,7 +194,7 @@ void osd_messenger_t::connect_peer(uint64_t peer_osd, json11::Json peer_state)
try_connect_peer(peer_osd);
}
void osd_messenger_t::try_connect_peer(osd_num_t peer_osd)
void osd_messenger_t::try_connect_peer(uint64_t peer_osd)
{
auto wp_it = wanted_peers.find(peer_osd);
if (wp_it == wanted_peers.end() || wp_it->second.connecting ||
@@ -217,75 +215,40 @@ void osd_messenger_t::try_connect_peer(osd_num_t peer_osd)
wp.cur_addr = wp.address_list[wp.address_index].string_value();
wp.cur_port = wp.port;
wp.connecting = true;
try_connect_peer_addr(peer_osd, wp.cur_addr.c_str(), wp.cur_port, NULL, [this](osd_num_t peer_osd, int peer_fd)
{
if (peer_fd >= 0)
osd_peer_fds[peer_osd] = peer_fd;
on_connect_peer(peer_osd, peer_fd);
});
try_connect_peer_addr(peer_osd, wp.cur_addr.c_str(), wp.cur_port);
}
static std::string urandom_str(int bytes)
{
std::string str;
str.resize(bytes);
char *buf = (char*)str.data();
while (bytes > 0)
{
int r = getrandom(buf, bytes, 0);
if (r < 0)
throw std::runtime_error(std::string("getrandom: ") + strerror(errno));
buf += r;
bytes -= r;
}
return str;
}
void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port,
osd_client_t *meta_cl, std::function<void(osd_num_t, int)> connect_callback)
void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port)
{
assert(peer_osd != this->osd_num);
struct sockaddr addr;
if (!meta_cl)
if (!string_to_addr(peer_host, 0, peer_port, &addr))
{
if (!string_to_addr(peer_host, 0, peer_port, &addr))
{
connect_callback(peer_osd, -EINVAL);
return;
}
}
else
{
addr = meta_cl->peer_addr;
on_connect_peer(peer_osd, -EINVAL);
return;
}
int peer_fd = socket(addr.sa_family, SOCK_STREAM, 0);
if (peer_fd >= 0)
{
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
int r = connect(peer_fd, (sockaddr*)&addr, sizeof(addr));
if (r < 0 && errno != EINPROGRESS)
{
close(peer_fd);
peer_fd = -1;
}
}
if (peer_fd < 0)
{
connect_callback(peer_osd, -errno);
on_connect_peer(peer_osd, -errno);
return;
}
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
int r = connect(peer_fd, (sockaddr*)&addr, sizeof(addr));
if (r < 0 && errno != EINPROGRESS)
{
close(peer_fd);
on_connect_peer(peer_osd, -errno);
return;
}
clients[peer_fd] = new osd_client_t();
clients[peer_fd]->peer_addr = addr;
clients[peer_fd]->peer_port = ((struct sockaddr_in*)&addr)->sin_port;
clients[peer_fd]->peer_port = peer_port;
clients[peer_fd]->peer_fd = peer_fd;
clients[peer_fd]->peer_state = PEER_CONNECTING;
clients[peer_fd]->connect_timeout_id = -1;
clients[peer_fd]->connect_callback = connect_callback;
clients[peer_fd]->osd_num = peer_osd;
clients[peer_fd]->in_buf = malloc_or_die(receive_buffer_size);
clients[peer_fd]->data_for = meta_cl ? addr_to_string(meta_cl->peer_addr) : "";
clients[peer_fd]->data_connection_cookie = meta_cl
? meta_cl->data_connection_cookie : base64_encode(urandom_str(12));
tfd->set_fd_handler(peer_fd, true, [this](int peer_fd, int epoll_events)
{
// Either OUT (connected) or HUP
@@ -295,12 +258,10 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
{
clients[peer_fd]->connect_timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, peer_fd](int timer_id)
{
auto cl = clients.at(peer_fd);
auto connect_callback = cl->connect_callback;
cl->connect_callback = NULL;
osd_num_t peer_osd = cl->osd_num;
osd_num_t peer_osd = clients.at(peer_fd)->osd_num;
stop_client(peer_fd, true);
connect_callback(peer_osd, -EPIPE);
on_connect_peer(peer_osd, -EPIPE);
return;
});
}
}
@@ -322,10 +283,8 @@ void osd_messenger_t::handle_connect_epoll(int peer_fd)
}
if (result != 0)
{
auto connect_callback = cl->connect_callback;
cl->connect_callback = NULL;
stop_client(peer_fd, true);
connect_callback(peer_osd, -result);
on_connect_peer(peer_osd, -result);
return;
}
int one = 1;
@@ -405,11 +364,6 @@ void osd_messenger_t::on_connect_peer(osd_num_t peer_osd, int peer_fd)
void osd_messenger_t::check_peer_config(osd_client_t *cl)
{
json11::Json::object payload;
if (cl->data_connection_cookie != "")
{
payload["data_cookie"] = cl->data_connection_cookie;
}
osd_op_t *op = new osd_op_t();
op->op_type = OSD_OP_OUT;
op->peer_fd = cl->peer_fd;
@@ -422,33 +376,24 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
},
},
};
if (cl->data_for == "")
{
#ifdef WITH_RDMA
if (rdma_context)
if (rdma_context)
{
cl->rdma_conn = msgr_rdma_connection_t::create(rdma_context, rdma_max_send, rdma_max_recv, rdma_max_sge, rdma_max_msg);
if (cl->rdma_conn)
{
cl->rdma_conn = msgr_rdma_connection_t::create(rdma_context, rdma_max_send, rdma_max_recv, rdma_max_sge, rdma_max_msg);
if (cl->rdma_conn)
{
payload["connect_rdma"] = cl->rdma_conn->addr.to_string();
payload["rdma_max_msg"] = cl->rdma_conn->max_msg;
}
json11::Json payload = json11::Json::object {
{ "connect_rdma", cl->rdma_conn->addr.to_string() },
{ "rdma_max_msg", cl->rdma_conn->max_msg },
};
std::string payload_str = payload.dump();
op->req.show_conf.json_len = payload_str.size();
op->buf = malloc_or_die(payload_str.size());
op->iov.push_back(op->buf, payload_str.size());
memcpy(op->buf, payload_str.c_str(), payload_str.size());
}
}
#endif
}
else
{
// Mark it as a data connection
payload["data_for"] = cl->data_for;
}
if (payload.size())
{
std::string payload_str = json11::Json(payload).dump();
op->req.show_conf.json_len = payload_str.size();
op->buf = malloc_or_die(payload_str.size());
op->iov.push_back(op->buf, payload_str.size());
memcpy(op->buf, payload_str.c_str(), payload_str.size());
}
op->callback = [this, cl](osd_op_t *op)
{
std::string json_err;
@@ -481,30 +426,18 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
cl->osd_num, config["protocol_version"].uint64_value(), OSD_PROTOCOL_VERSION
);
}
else if (cl->data_for != "" && config["data_for"] != cl->data_for)
{
err = true;
fprintf(
stderr, "OSD %lu does not support separate data connections."
" Proceeding with a single connection\n", cl->osd_num
);
}
}
if (err)
{
osd_num_t peer_osd = cl->osd_num;
auto connect_callback = cl->connect_callback;
cl->connect_callback = NULL;
stop_client(op->peer_fd);
connect_callback(peer_osd, -EINVAL);
on_connect_peer(peer_osd, -1);
delete op;
return;
}
#ifdef WITH_RDMA
if (rdma_context && cl->rdma_conn && config["rdma_address"].is_string())
if (config["rdma_address"].is_string())
{
// Prevent creating data connection - we are trying RDMA
cl->data_connection_cookie = "";
msgr_rdma_address_t addr;
if (!msgr_rdma_address_t::from_string(config["rdma_address"].string_value().c_str(), &addr) ||
cl->rdma_conn->connect(&addr) != 0)
@@ -517,10 +450,8 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
cl->rdma_conn = NULL;
// FIXME: Keep TCP connection in this case
osd_num_t peer_osd = cl->osd_num;
auto connect_callback = cl->connect_callback;
cl->connect_callback = NULL;
stop_client(cl->peer_fd);
connect_callback(peer_osd, -EPIPE);
on_connect_peer(peer_osd, -1);
delete op;
return;
}
@@ -542,37 +473,8 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
}
}
#endif
if (cl->data_connection_cookie != "")
{
// Try to open second connection to the same address
try_connect_peer_addr(cl->osd_num, NULL, 0, cl, [this, peer_fd = cl->peer_fd](osd_num_t data_peer, int data_peer_fd)
{
auto cl_it = clients.find(peer_fd);
if (cl_it != clients.end())
{
// Proceed with or without the data connection
auto cl = cl_it->second;
if (data_peer_fd >= 0)
{
cl->data_connection_fd = data_peer_fd;
auto data_cl = clients.at(data_peer_fd);
data_cl->meta_connection_fd = cl->peer_fd;
}
osd_peer_fds[cl->osd_num] = cl->peer_fd;
on_connect_peer(cl->osd_num, cl->peer_fd);
}
else if (data_peer_fd >= 0)
{
stop_client(data_peer_fd);
}
});
}
else
{
auto connect_callback = cl->connect_callback;
cl->connect_callback = NULL;
connect_callback(cl->osd_num, cl->peer_fd);
}
osd_peer_fds[cl->osd_num] = cl->peer_fd;
on_connect_peer(cl->osd_num, cl->peer_fd);
delete op;
};
outbox_push(op);
@@ -598,7 +500,6 @@ void osd_messenger_t::accept_connections(int listen_fd)
clients[peer_fd]->peer_fd = peer_fd;
clients[peer_fd]->peer_state = PEER_CONNECTED;
clients[peer_fd]->in_buf = malloc_or_die(receive_buffer_size);
clients_by_addr[addr_to_string(addr)] = peer_fd;
// Add FD to epoll
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
{

View File

@@ -57,10 +57,6 @@ struct osd_client_t
int ping_time_remaining = 0;
int idle_time_remaining = 0;
osd_num_t osd_num = 0;
std::function<void(osd_num_t, int)> connect_callback;
int data_connection_fd = -1, meta_connection_fd = -1;
std::string data_connection_cookie, data_for;
void *in_buf = NULL;
@@ -152,7 +148,6 @@ public:
osd_num_t osd_num;
uint64_t next_subop_id = 1;
std::map<int, osd_client_t*> clients;
std::map<std::string, int> clients_by_addr;
std::map<osd_num_t, osd_wanted_peer_t> wanted_peers;
std::map<uint64_t, int> osd_peer_fds;
// op statistics
@@ -162,7 +157,6 @@ public:
void parse_config(const json11::Json & config);
void connect_peer(uint64_t osd_num, json11::Json peer_state);
void stop_client(int peer_fd, bool force = false, bool force_delete = false);
void break_data_client_pair(osd_client_t *cl);
void outbox_push(osd_op_t *cur_op);
std::function<void(osd_op_t*)> exec_op;
std::function<void(osd_num_t)> repeer_pgs;
@@ -180,8 +174,7 @@ public:
protected:
void try_connect_peer(uint64_t osd_num);
void try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port,
osd_client_t *meta_cl, std::function<void(osd_num_t, int)> connect_callback);
void try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port);
void handle_peer_epoll(int peer_fd, int epoll_events);
void handle_connect_epoll(int peer_fd);
void on_connect_peer(osd_num_t peer_osd, int peer_fd);

View File

@@ -4,7 +4,6 @@
#include <unistd.h>
#include <assert.h>
#include "addr_util.h"
#include "messenger.h"
void osd_messenger_t::cancel_osd_ops(osd_client_t *cl)
@@ -59,8 +58,7 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
{
if (cl->osd_num)
{
fprintf(stderr, "[OSD %lu] Stopping client %d (OSD %speer %lu)\n",
osd_num, peer_fd, cl->meta_connection_fd >= 0 ? " data" : "", cl->osd_num);
fprintf(stderr, "[OSD %lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl->osd_num);
}
else
{
@@ -70,7 +68,7 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
// First set state to STOPPED so another stop_client() call doesn't try to free it again
cl->refs++;
cl->peer_state = PEER_STOPPED;
if (cl->osd_num && cl->meta_connection_fd < 0)
if (cl->osd_num)
{
// ...and forget OSD peer
osd_peer_fds.erase(cl->osd_num);
@@ -102,17 +100,9 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
#endif
if (cl->osd_num)
{
if (cl->meta_connection_fd < 0)
{
// Then repeer PGs because cancel_op() callbacks can try to perform
// some actions and we need correct PG states to not do something silly
repeer_pgs(cl->osd_num);
}
else
{
// FIXME Try to re-establish data connection
// Only when the connection is outbound, but here it's always outbound
}
// Then repeer PGs because cancel_op() callbacks can try to perform
// some actions and we need correct PG states to not do something silly
repeer_pgs(cl->osd_num);
}
// Then cancel all operations
if (cl->read_op)
@@ -138,7 +128,6 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
delete cl->rdma_conn;
}
#endif
clients_by_addr.erase(addr_to_string(cl->peer_addr));
#endif
// Find the item again because it can be invalidated at this point
it = clients.find(peer_fd);
@@ -146,40 +135,9 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
{
clients.erase(it);
}
// Break metadata/data connection pair
if (cl->data_connection_fd >= 0)
{
// No sense to keep data connection when metadata connection is stopped
auto dc_it = clients.find(cl->data_connection_fd);
cl->data_connection_fd = -1;
if (dc_it != clients.end() && dc_it->second->meta_connection_fd == cl->peer_fd)
{
stop_client(dc_it->second->peer_fd);
}
}
break_data_client_pair(cl);
// Refcount and delete
cl->refs--;
if (cl->refs <= 0 || force_delete)
{
delete cl;
}
}
void osd_messenger_t::break_data_client_pair(osd_client_t *cl)
{
if (cl->meta_connection_fd >= 0)
{
auto dc_it = clients.find(cl->meta_connection_fd);
if (dc_it != clients.end() && dc_it->second->data_connection_fd == cl->peer_fd)
dc_it->second->data_connection_fd = -1;
cl->meta_connection_fd = -1;
}
if (cl->data_connection_fd >= 0)
{
auto dc_it = clients.find(cl->data_connection_fd);
if (dc_it != clients.end() && dc_it->second->meta_connection_fd == cl->peer_fd)
dc_it->second->meta_connection_fd = -1;
cl->data_connection_fd = -1;
}
}

View File

@@ -178,37 +178,6 @@ void osd_t::exec_show_config(osd_op_t *cur_op)
}
}
}
else
{
#endif
if (req_json["data_for"].is_string())
{
auto cli = msgr.clients.at(cur_op->peer_fd);
auto md_it = msgr.clients_by_addr.find(req_json["data_for"].string_value());
if (md_it != msgr.clients_by_addr.end())
{
int md_peer_fd = md_it->second;
auto md_it = msgr.clients.find(md_peer_fd);
if (md_it != msgr.clients.end() && md_it->second->data_connection_cookie != "" &&
req_json["data_cookie"].string_value() == md_it->second->data_connection_cookie)
{
// Break previous metadata/data connections for both FDs, if present
msgr.break_data_client_pair(cli);
msgr.break_data_client_pair(md_it->second);
// And setup the new pair
cli->meta_connection_fd = md_it->second->peer_fd;
md_it->second->data_connection_fd = cli->peer_fd;
wire_config["data_for"] = req_json["data_for"];
}
}
}
else if (req_json["data_cookie"].is_string())
{
auto cli = msgr.clients.at(cur_op->peer_fd);
cli->data_connection_cookie = req_json["data_cookie"].string_value();
}
#ifdef WITH_RDMA
}
#endif
if (cur_op->buf)
free(cur_op->buf);

View File

@@ -24,7 +24,9 @@
*/
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <poll.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <arpa/inet.h>
@@ -43,19 +45,32 @@
int bind_stub(std::string bind_address, int bind_port);
void run_stub(int peer_fd);
void run_stub(int peer_fd, int peer_data_fd);
int main(int narg, char *args[])
{
int listen_fd = bind_stub("0.0.0.0", 11203);
int listen_data_fd = bind_stub("0.0.0.0", 11204);
/* int mss = 8192;
if (setsockopt(listen_data_fd, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) < 0)
{
throw std::runtime_error(std::string("setsockopt TCP_MAXSEG: ") + strerror(errno));
}
int rcvlowat = 4096;
if (setsockopt(listen_data_fd, SOL_SOCKET, SO_RCVLOWAT, &rcvlowat, sizeof(rcvlowat)) < 0)
{
throw std::runtime_error(std::string("setsockopt SO_RCVLOWAT: ") + strerror(errno));
}*/
// Accept new connections
sockaddr addr;
socklen_t peer_addr_size = sizeof(addr);
int peer_fd;
socklen_t peer_addr_size;
int peer_fd, peer_data_fd;
const int one = 1;
while (1)
{
printf("stub_osd: waiting for 1 client\n");
peer_fd = accept(listen_fd, &addr, &peer_addr_size);
peer_addr_size = sizeof(addr);
peer_fd = accept(listen_fd, (sockaddr*)&addr, &peer_addr_size);
if (peer_fd == -1)
{
if (errno == EAGAIN)
@@ -63,15 +78,27 @@ int main(int narg, char *args[])
else
throw std::runtime_error(std::string("accept: ") + strerror(errno));
}
printf("stub_osd: new client %d: connection from %s\n", peer_fd,
addr_to_string(addr).c_str());
int one = 1;
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
run_stub(peer_fd);
close(peer_fd);
printf("stub_osd: client %d disconnected\n", peer_fd);
// Try to accept next connection
printf("stub_osd: new client %d: connection from %s\n", peer_fd,
addr_to_string(*((sockaddr*)&addr)).c_str());
printf("stub_osd: waiting for 1 data connection\n");
peer_addr_size = sizeof(addr);
peer_data_fd = accept(listen_data_fd, (sockaddr*)&addr, &peer_addr_size);
if (peer_data_fd == -1)
{
if (errno == EAGAIN)
continue;
else
throw std::runtime_error(std::string("accept: ") + strerror(errno));
}
setsockopt(peer_data_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
printf("stub_osd: new client %d: data connection from %s\n", peer_data_fd,
addr_to_string(*((sockaddr*)&addr)).c_str());
run_stub(peer_fd, peer_data_fd);
close(peer_data_fd);
close(peer_fd);
printf("stub_osd: client %d / data %d disconnected\n", peer_fd, peer_data_fd);
// Try to accept next connection
}
return 0;
}
@@ -80,13 +107,13 @@ int bind_stub(std::string bind_address, int bind_port)
{
int listen_backlog = 128;
sockaddr addr;
if (!string_to_addr(bind_address, 0, bind_port, &addr))
sockaddr_storage addr = { 0 };
if (!string_to_addr(bind_address, 0, bind_port, (sockaddr*)&addr))
{
throw std::runtime_error("bind address "+bind_address+" is not valid");
}
int listen_fd = socket(addr.sa_family, SOCK_STREAM, 0);
int listen_fd = socket(addr.ss_family, SOCK_STREAM, 0);
if (listen_fd < 0)
{
throw std::runtime_error(std::string("socket: ") + strerror(errno));
@@ -94,7 +121,7 @@ int bind_stub(std::string bind_address, int bind_port)
int enable = 1;
setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable));
if (bind(listen_fd, &addr, sizeof(addr)) < 0)
if (bind(listen_fd, (sockaddr*)&addr, addr.ss_family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6)) < 0)
{
close(listen_fd);
throw std::runtime_error(std::string("bind: ") + strerror(errno));
@@ -109,11 +136,16 @@ int bind_stub(std::string bind_address, int bind_port)
return listen_fd;
}
void run_stub(int peer_fd)
void run_stub(int peer_fd, int peer_data_fd)
{
osd_any_op_t op;
osd_any_reply_t reply = { 0 };
void *buf = NULL;
unsigned bufsize = 4*1024*1024;
void *buf = mmap(NULL, bufsize, PROT_READ, MAP_SHARED, peer_data_fd, 0);
if (buf == MAP_FAILED)
{
throw std::runtime_error(std::string("mmap: ") + strerror(errno));
}
while (1)
{
int r = read_blocking(peer_fd, op.buf, OSD_PACKET_SIZE);
@@ -132,19 +164,36 @@ void run_stub(int peer_fd)
if (op.hdr.opcode == OSD_OP_SEC_READ)
{
reply.hdr.retval = op.sec_rw.len;
buf = malloc(op.sec_rw.len);
void *buf = malloc(op.sec_rw.len);
r = write_blocking(peer_fd, reply.buf, OSD_PACKET_SIZE);
if (r == OSD_PACKET_SIZE)
r = write_blocking(peer_fd, buf, op.sec_rw.len);
r = write_blocking(peer_data_fd, buf, op.sec_rw.len);
free(buf);
if (r < op.sec_rw.len)
break;
}
else if (op.hdr.opcode == OSD_OP_SEC_WRITE || op.hdr.opcode == OSD_OP_SEC_WRITE_STABLE)
{
buf = malloc(op.sec_rw.len);
r = read_blocking(peer_fd, buf, op.sec_rw.len);
free(buf);
struct pollfd pfd = { .fd = peer_data_fd, .events = POLLIN };
poll(&pfd, 1, 10000);
struct tcp_zerocopy_receive zc = { .address = (uint64_t)buf, .length = op.sec_rw.len };
socklen_t zc_len = sizeof(zc);
r = getsockopt(peer_data_fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &zc_len);
r = r == -1 ? 0 : zc.length;
if (r > 0)
{
uint64_t hash = 0;
for (int k = 0; k < r/8; k++)
hash ^= ((uint64_t*)buf)[k];
printf("ZCTR: op=%lx r=%d len=%d skip=%d hash=%lx\n", op.hdr.id, r, zc.length, zc.recv_skip_hint, hash);
}
if (r < op.sec_rw.len)
{
int rest = op.sec_rw.len - r;
void *buf = malloc(rest);
r += read_blocking(peer_data_fd, buf, rest);
free(buf);
}
reply.hdr.retval = op.sec_rw.len;
if (r == op.sec_rw.len)
r = write_blocking(peer_fd, reply.buf, OSD_PACKET_SIZE);
@@ -166,5 +215,4 @@ void run_stub(int peer_fd)
break;
}
}
free(buf);
}