Compare commits
1 Commits
test-submi
...
blocking-u
Author | SHA1 | Date | |
---|---|---|---|
d2b37e083a |
18
Makefile
18
Makefile
@@ -2,7 +2,7 @@ BLOCKSTORE_OBJS := allocator.o blockstore.o blockstore_impl.o blockstore_init.o
|
|||||||
blockstore_write.o blockstore_sync.o blockstore_stable.o blockstore_rollback.o blockstore_flush.o crc32c.o ringloop.o timerfd_interval.o
|
blockstore_write.o blockstore_sync.o blockstore_stable.o blockstore_rollback.o blockstore_flush.o crc32c.o ringloop.o timerfd_interval.o
|
||||||
# -fsanitize=address
|
# -fsanitize=address
|
||||||
CXXFLAGS := -g -O3 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fPIC -fdiagnostics-color=always
|
CXXFLAGS := -g -O3 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fPIC -fdiagnostics-color=always
|
||||||
all: $(BLOCKSTORE_OBJS) libfio_blockstore.so osd libfio_sec_osd.so test_blockstore stub_osd stub_bench osd_test
|
all: $(BLOCKSTORE_OBJS) libfio_blockstore.so osd libfio_sec_osd.so test_blockstore stub_osd osd_test
|
||||||
clean:
|
clean:
|
||||||
rm -f *.o
|
rm -f *.o
|
||||||
|
|
||||||
@@ -14,7 +14,7 @@ allocator.o: allocator.cpp allocator.h
|
|||||||
g++ $(CXXFLAGS) -c -o $@ $<
|
g++ $(CXXFLAGS) -c -o $@ $<
|
||||||
ringloop.o: ringloop.cpp ringloop.h
|
ringloop.o: ringloop.cpp ringloop.h
|
||||||
g++ $(CXXFLAGS) -c -o $@ $<
|
g++ $(CXXFLAGS) -c -o $@ $<
|
||||||
timerfd_interval.o: timerfd_interval.cpp timerfd_interval.h ringloop.h
|
timerfd_interval.o: timerfd_interval.cpp timerfd_interval.h
|
||||||
g++ $(CXXFLAGS) -c -o $@ $<
|
g++ $(CXXFLAGS) -c -o $@ $<
|
||||||
|
|
||||||
%.o: %.cpp allocator.h blockstore_flush.h blockstore.h blockstore_impl.h blockstore_init.h blockstore_journal.h crc32c.h ringloop.h timerfd_interval.h object_id.h
|
%.o: %.cpp allocator.h blockstore_flush.h blockstore.h blockstore_impl.h blockstore_init.h blockstore_journal.h crc32c.h ringloop.h timerfd_interval.h object_id.h
|
||||||
@@ -26,13 +26,13 @@ libfio_blockstore.so: ./libblockstore.so fio_engine.cpp json11.o
|
|||||||
g++ $(CXXFLAGS) -shared -o libfio_blockstore.so fio_engine.cpp json11.o ./libblockstore.so -ltcmalloc_minimal -luring
|
g++ $(CXXFLAGS) -shared -o libfio_blockstore.so fio_engine.cpp json11.o ./libblockstore.so -ltcmalloc_minimal -luring
|
||||||
|
|
||||||
OSD_OBJS := osd.o osd_secondary.o osd_receive.o osd_send.o osd_peering.o osd_peering_pg.o osd_primary.o osd_rmw.o json11.o timerfd_interval.o
|
OSD_OBJS := osd.o osd_secondary.o osd_receive.o osd_send.o osd_peering.o osd_peering_pg.o osd_primary.o osd_rmw.o json11.o timerfd_interval.o
|
||||||
osd_secondary.o: osd_secondary.cpp osd.h osd_ops.h ringloop.h
|
osd_secondary.o: osd_secondary.cpp osd.h osd_ops.h
|
||||||
g++ $(CXXFLAGS) -c -o $@ $<
|
g++ $(CXXFLAGS) -c -o $@ $<
|
||||||
osd_receive.o: osd_receive.cpp osd.h osd_ops.h ringloop.h
|
osd_receive.o: osd_receive.cpp osd.h osd_ops.h
|
||||||
g++ $(CXXFLAGS) -c -o $@ $<
|
g++ $(CXXFLAGS) -c -o $@ $<
|
||||||
osd_send.o: osd_send.cpp osd.h osd_ops.h ringloop.h
|
osd_send.o: osd_send.cpp osd.h osd_ops.h
|
||||||
g++ $(CXXFLAGS) -c -o $@ $<
|
g++ $(CXXFLAGS) -c -o $@ $<
|
||||||
osd_peering.o: osd_peering.cpp osd.h osd_ops.h osd_peering_pg.h ringloop.h
|
osd_peering.o: osd_peering.cpp osd.h osd_ops.h osd_peering_pg.h
|
||||||
g++ $(CXXFLAGS) -c -o $@ $<
|
g++ $(CXXFLAGS) -c -o $@ $<
|
||||||
osd_peering_pg.o: osd_peering_pg.cpp object_id.h osd_peering_pg.h
|
osd_peering_pg.o: osd_peering_pg.cpp object_id.h osd_peering_pg.h
|
||||||
g++ $(CXXFLAGS) -c -o $@ $<
|
g++ $(CXXFLAGS) -c -o $@ $<
|
||||||
@@ -40,16 +40,14 @@ osd_rmw.o: osd_rmw.cpp osd_rmw.h xor.h
|
|||||||
g++ $(CXXFLAGS) -c -o $@ $<
|
g++ $(CXXFLAGS) -c -o $@ $<
|
||||||
osd_rmw_test: osd_rmw_test.cpp osd_rmw.cpp osd_rmw.h xor.h
|
osd_rmw_test: osd_rmw_test.cpp osd_rmw.cpp osd_rmw.h xor.h
|
||||||
g++ $(CXXFLAGS) -o $@ $<
|
g++ $(CXXFLAGS) -o $@ $<
|
||||||
osd_primary.o: osd_primary.cpp osd.h osd_ops.h osd_peering_pg.h xor.h ringloop.h
|
osd_primary.o: osd_primary.cpp osd.h osd_ops.h osd_peering_pg.h xor.h
|
||||||
g++ $(CXXFLAGS) -c -o $@ $<
|
g++ $(CXXFLAGS) -c -o $@ $<
|
||||||
osd.o: osd.cpp osd.h osd_ops.h osd_peering_pg.h ringloop.h
|
osd.o: osd.cpp osd.h osd_ops.h osd_peering_pg.h
|
||||||
g++ $(CXXFLAGS) -c -o $@ $<
|
g++ $(CXXFLAGS) -c -o $@ $<
|
||||||
osd: ./libblockstore.so osd_main.cpp osd.h osd_ops.h $(OSD_OBJS)
|
osd: ./libblockstore.so osd_main.cpp osd.h osd_ops.h $(OSD_OBJS)
|
||||||
g++ $(CXXFLAGS) -o osd osd_main.cpp $(OSD_OBJS) ./libblockstore.so -ltcmalloc_minimal -luring
|
g++ $(CXXFLAGS) -o osd osd_main.cpp $(OSD_OBJS) ./libblockstore.so -ltcmalloc_minimal -luring
|
||||||
stub_osd: stub_osd.cpp osd_ops.h rw_blocking.o
|
stub_osd: stub_osd.cpp osd_ops.h rw_blocking.o
|
||||||
g++ $(CXXFLAGS) -o stub_osd stub_osd.cpp rw_blocking.o -ltcmalloc_minimal
|
g++ $(CXXFLAGS) -o stub_osd stub_osd.cpp rw_blocking.o -ltcmalloc_minimal
|
||||||
stub_bench: stub_bench.cpp osd_ops.h rw_blocking.o
|
|
||||||
g++ $(CXXFLAGS) -o stub_bench stub_bench.cpp rw_blocking.o -ltcmalloc_minimal
|
|
||||||
rw_blocking.o: rw_blocking.cpp rw_blocking.h
|
rw_blocking.o: rw_blocking.cpp rw_blocking.h
|
||||||
g++ $(CXXFLAGS) -c -o $@ $<
|
g++ $(CXXFLAGS) -c -o $@ $<
|
||||||
osd_test: osd_test.cpp osd_ops.h rw_blocking.o
|
osd_test: osd_test.cpp osd_ops.h rw_blocking.o
|
||||||
|
@@ -196,6 +196,11 @@ void blockstore_impl_t::loop()
|
|||||||
{
|
{
|
||||||
flusher->loop();
|
flusher->loop();
|
||||||
}
|
}
|
||||||
|
int ret = ringloop->submit();
|
||||||
|
if (ret < 0)
|
||||||
|
{
|
||||||
|
throw std::runtime_error(std::string("io_uring_submit: ") + strerror(-ret));
|
||||||
|
}
|
||||||
if ((initial_ring_space - ringloop->space_left()) > 0)
|
if ((initial_ring_space - ringloop->space_left()) > 0)
|
||||||
{
|
{
|
||||||
live = true;
|
live = true;
|
||||||
|
@@ -261,18 +261,15 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
|
|||||||
bsd->op_n++;
|
bsd->op_n++;
|
||||||
bsd->queue[n] = io;
|
bsd->queue[n] = io;
|
||||||
|
|
||||||
iovec iov[2] = { { .iov_base = op.buf, .iov_len = OSD_PACKET_SIZE } };
|
if (write(bsd->connect_fd, op.buf, OSD_PACKET_SIZE) != OSD_PACKET_SIZE)
|
||||||
int iovcnt = 1, wtotal = OSD_PACKET_SIZE;
|
{
|
||||||
|
perror("write");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
if (io->ddir == DDIR_WRITE)
|
if (io->ddir == DDIR_WRITE)
|
||||||
{
|
{
|
||||||
iov[1] = { .iov_base = io->xfer_buf, .iov_len = io->xfer_buflen };
|
// Send data
|
||||||
wtotal += io->xfer_buflen;
|
write_blocking(bsd->connect_fd, io->xfer_buf, io->xfer_buflen);
|
||||||
iovcnt++;
|
|
||||||
}
|
|
||||||
if (writev_blocking(bsd->connect_fd, iov, iovcnt) != wtotal)
|
|
||||||
{
|
|
||||||
perror("writev");
|
|
||||||
exit(1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (io->error != 0)
|
if (io->error != 0)
|
||||||
|
39
osd.cpp
39
osd.cpp
@@ -50,7 +50,7 @@ osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringlo
|
|||||||
}
|
}
|
||||||
if (send_stat_count != 0)
|
if (send_stat_count != 0)
|
||||||
{
|
{
|
||||||
printf("avg latency to send stabilize subop: %ld us\n", send_stat_sum/send_stat_count);
|
printf("avg latency to send subops with data: %ld us\n", send_stat_sum/send_stat_count);
|
||||||
send_stat_count = 0;
|
send_stat_count = 0;
|
||||||
send_stat_sum = 0;
|
send_stat_sum = 0;
|
||||||
}
|
}
|
||||||
@@ -175,26 +175,11 @@ void osd_t::loop()
|
|||||||
handle_peers();
|
handle_peers();
|
||||||
read_requests();
|
read_requests();
|
||||||
send_replies();
|
send_replies();
|
||||||
|
ringloop->submit();
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_t::handle_epoll_events()
|
void osd_t::handle_epoll_events()
|
||||||
{
|
{
|
||||||
io_uring_sqe *sqe = ringloop->get_sqe();
|
|
||||||
if (!sqe)
|
|
||||||
{
|
|
||||||
throw std::runtime_error("can't get SQE, will fall out of sync with EPOLLET");
|
|
||||||
}
|
|
||||||
ring_data_t *data = ((ring_data_t*)sqe->user_data);
|
|
||||||
my_uring_prep_poll_add(sqe, epoll_fd, POLLIN);
|
|
||||||
data->callback = [this](ring_data_t *data)
|
|
||||||
{
|
|
||||||
if (data->res < 0)
|
|
||||||
{
|
|
||||||
throw std::runtime_error(std::string("epoll failed: ") + strerror(-data->res));
|
|
||||||
}
|
|
||||||
handle_epoll_events();
|
|
||||||
};
|
|
||||||
ringloop->submit();
|
|
||||||
int nfds;
|
int nfds;
|
||||||
epoll_event events[MAX_EPOLL_EVENTS];
|
epoll_event events[MAX_EPOLL_EVENTS];
|
||||||
restart:
|
restart:
|
||||||
@@ -211,7 +196,6 @@ restart:
|
|||||||
{
|
{
|
||||||
char peer_str[256];
|
char peer_str[256];
|
||||||
printf("osd: new client %d: connection from %s port %d\n", peer_fd, inet_ntop(AF_INET, &addr.sin_addr, peer_str, 256), ntohs(addr.sin_port));
|
printf("osd: new client %d: connection from %s port %d\n", peer_fd, inet_ntop(AF_INET, &addr.sin_addr, peer_str, 256), ntohs(addr.sin_port));
|
||||||
fcntl(peer_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
|
|
||||||
int one = 1;
|
int one = 1;
|
||||||
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
|
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
|
||||||
clients[peer_fd] = {
|
clients[peer_fd] = {
|
||||||
@@ -223,7 +207,7 @@ restart:
|
|||||||
// Add FD to epoll
|
// Add FD to epoll
|
||||||
epoll_event ev;
|
epoll_event ev;
|
||||||
ev.data.fd = peer_fd;
|
ev.data.fd = peer_fd;
|
||||||
ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
|
ev.events = EPOLLET | EPOLLRDHUP;
|
||||||
if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, peer_fd, &ev) < 0)
|
if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, peer_fd, &ev) < 0)
|
||||||
{
|
{
|
||||||
throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
|
throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
|
||||||
@@ -266,6 +250,21 @@ restart:
|
|||||||
{
|
{
|
||||||
goto restart;
|
goto restart;
|
||||||
}
|
}
|
||||||
|
io_uring_sqe *sqe = ringloop->get_sqe();
|
||||||
|
if (!sqe)
|
||||||
|
{
|
||||||
|
throw std::runtime_error("can't get SQE, will fall out of sync with EPOLLET");
|
||||||
|
}
|
||||||
|
ring_data_t *data = ((ring_data_t*)sqe->user_data);
|
||||||
|
my_uring_prep_poll_add(sqe, epoll_fd, POLLIN);
|
||||||
|
data->callback = [this](ring_data_t *data)
|
||||||
|
{
|
||||||
|
if (data->res < 0)
|
||||||
|
{
|
||||||
|
throw std::runtime_error(std::string("epoll failed: ") + strerror(-data->res));
|
||||||
|
}
|
||||||
|
handle_epoll_events();
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_t::cancel_osd_ops(osd_client_t & cl)
|
void osd_t::cancel_osd_ops(osd_client_t & cl)
|
||||||
@@ -349,7 +348,7 @@ void osd_t::stop_client(int peer_fd)
|
|||||||
|
|
||||||
void osd_t::exec_op(osd_op_t *cur_op)
|
void osd_t::exec_op(osd_op_t *cur_op)
|
||||||
{
|
{
|
||||||
clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
|
gettimeofday(&cur_op->tv_begin, NULL);
|
||||||
if (stopping)
|
if (stopping)
|
||||||
{
|
{
|
||||||
// Throw operation away
|
// Throw operation away
|
||||||
|
12
osd.h
12
osd.h
@@ -1,7 +1,7 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
#include <time.h>
|
#include <sys/time.h>
|
||||||
#include <sys/ioctl.h>
|
#include <sys/ioctl.h>
|
||||||
#include <sys/stat.h>
|
#include <sys/stat.h>
|
||||||
#include <fcntl.h>
|
#include <fcntl.h>
|
||||||
@@ -93,8 +93,8 @@ struct osd_primary_op_data_t;
|
|||||||
|
|
||||||
struct osd_op_t
|
struct osd_op_t
|
||||||
{
|
{
|
||||||
timespec tv_begin;
|
timeval tv_begin;
|
||||||
timespec tv_send;
|
timeval tv_send;
|
||||||
int op_type = OSD_OP_IN;
|
int op_type = OSD_OP_IN;
|
||||||
int peer_fd;
|
int peer_fd;
|
||||||
osd_any_op_t req;
|
osd_any_op_t req;
|
||||||
@@ -131,8 +131,8 @@ struct osd_client_t
|
|||||||
int read_ready = 0;
|
int read_ready = 0;
|
||||||
osd_op_t *read_op = NULL;
|
osd_op_t *read_op = NULL;
|
||||||
int read_reply_id = 0;
|
int read_reply_id = 0;
|
||||||
iovec read_iov;
|
iovec read_iov = { 0 };
|
||||||
msghdr read_msg;
|
msghdr read_msg = { 0 };
|
||||||
void *read_buf = NULL;
|
void *read_buf = NULL;
|
||||||
int read_remaining = 0;
|
int read_remaining = 0;
|
||||||
int read_state = 0;
|
int read_state = 0;
|
||||||
@@ -215,11 +215,11 @@ class osd_t
|
|||||||
// event loop, socket read/write
|
// event loop, socket read/write
|
||||||
void loop();
|
void loop();
|
||||||
void handle_epoll_events();
|
void handle_epoll_events();
|
||||||
|
bool try_receive(osd_client_t & cl);
|
||||||
void read_requests();
|
void read_requests();
|
||||||
void handle_read(ring_data_t *data, int peer_fd);
|
void handle_read(ring_data_t *data, int peer_fd);
|
||||||
void handle_op_hdr(osd_client_t *cl);
|
void handle_op_hdr(osd_client_t *cl);
|
||||||
void handle_reply_hdr(osd_client_t *cl);
|
void handle_reply_hdr(osd_client_t *cl);
|
||||||
bool try_send(osd_client_t & cl);
|
|
||||||
void send_replies();
|
void send_replies();
|
||||||
void handle_send(ring_data_t *data, int peer_fd);
|
void handle_send(ring_data_t *data, int peer_fd);
|
||||||
void outbox_push(osd_client_t & cl, osd_op_t *op);
|
void outbox_push(osd_client_t & cl, osd_op_t *op);
|
||||||
|
@@ -89,7 +89,7 @@ void osd_t::connect_peer(osd_num_t osd_num, const char *peer_host, int peer_port
|
|||||||
// Add FD to epoll (EPOLLOUT for tracking connect() result)
|
// Add FD to epoll (EPOLLOUT for tracking connect() result)
|
||||||
epoll_event ev;
|
epoll_event ev;
|
||||||
ev.data.fd = peer_fd;
|
ev.data.fd = peer_fd;
|
||||||
ev.events = EPOLLOUT | EPOLLIN | EPOLLRDHUP | EPOLLET;
|
ev.events = EPOLLOUT | EPOLLRDHUP | EPOLLET;
|
||||||
if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, peer_fd, &ev) < 0)
|
if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, peer_fd, &ev) < 0)
|
||||||
{
|
{
|
||||||
throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
|
throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
|
||||||
@@ -115,12 +115,13 @@ void osd_t::handle_connect_result(int peer_fd)
|
|||||||
}
|
}
|
||||||
int one = 1;
|
int one = 1;
|
||||||
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
|
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
|
||||||
|
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) & ~O_NONBLOCK);
|
||||||
// Disable EPOLLOUT on this fd
|
// Disable EPOLLOUT on this fd
|
||||||
cl.connect_callback = NULL;
|
cl.connect_callback = NULL;
|
||||||
cl.peer_state = PEER_CONNECTED;
|
cl.peer_state = PEER_CONNECTED;
|
||||||
epoll_event ev;
|
epoll_event ev;
|
||||||
ev.data.fd = peer_fd;
|
ev.data.fd = peer_fd;
|
||||||
ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET;
|
ev.events = EPOLLRDHUP | EPOLLET;
|
||||||
if (epoll_ctl(epoll_fd, EPOLL_CTL_MOD, peer_fd, &ev) < 0)
|
if (epoll_ctl(epoll_fd, EPOLL_CTL_MOD, peer_fd, &ev) < 0)
|
||||||
{
|
{
|
||||||
throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
|
throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
|
||||||
|
@@ -1,40 +1,46 @@
|
|||||||
#include "osd.h"
|
#include "osd.h"
|
||||||
|
|
||||||
|
bool osd_t::try_receive(osd_client_t & cl)
|
||||||
|
{
|
||||||
|
int peer_fd = cl.peer_fd;
|
||||||
|
io_uring_sqe* sqe = ringloop->get_sqe();
|
||||||
|
if (!sqe)
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
||||||
|
if (!cl.read_buf)
|
||||||
|
{
|
||||||
|
// no reads in progress
|
||||||
|
// so this is either a new command or a reply to a previously sent command
|
||||||
|
if (!cl.read_op)
|
||||||
|
{
|
||||||
|
cl.read_op = new osd_op_t;
|
||||||
|
cl.read_op->peer_fd = peer_fd;
|
||||||
|
}
|
||||||
|
cl.read_op->op_type = OSD_OP_IN;
|
||||||
|
cl.read_buf = &cl.read_op->req.buf;
|
||||||
|
cl.read_remaining = OSD_PACKET_SIZE;
|
||||||
|
cl.read_state = CL_READ_OP;
|
||||||
|
}
|
||||||
|
cl.read_iov.iov_base = cl.read_buf;
|
||||||
|
cl.read_iov.iov_len = cl.read_remaining;
|
||||||
|
cl.read_msg.msg_iov = &cl.read_iov;
|
||||||
|
cl.read_msg.msg_iovlen = 1;
|
||||||
|
data->callback = [this, peer_fd](ring_data_t *data) { handle_read(data, peer_fd); };
|
||||||
|
my_uring_prep_recvmsg(sqe, peer_fd, &cl.read_msg, 0);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
void osd_t::read_requests()
|
void osd_t::read_requests()
|
||||||
{
|
{
|
||||||
for (int i = 0; i < read_ready_clients.size(); i++)
|
for (auto & p: clients)
|
||||||
{
|
{
|
||||||
int peer_fd = read_ready_clients[i];
|
if (p.second.peer_state == PEER_CONNECTED && p.second.read_iov.iov_len == 0)
|
||||||
auto & cl = clients[peer_fd];
|
|
||||||
io_uring_sqe* sqe = ringloop->get_sqe();
|
|
||||||
if (!sqe)
|
|
||||||
{
|
{
|
||||||
read_ready_clients.erase(read_ready_clients.begin(), read_ready_clients.begin() + i);
|
try_receive(p.second);
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
|
||||||
if (!cl.read_buf)
|
|
||||||
{
|
|
||||||
// no reads in progress
|
|
||||||
// so this is either a new command or a reply to a previously sent command
|
|
||||||
if (!cl.read_op)
|
|
||||||
{
|
|
||||||
cl.read_op = new osd_op_t;
|
|
||||||
cl.read_op->peer_fd = peer_fd;
|
|
||||||
}
|
|
||||||
cl.read_op->op_type = OSD_OP_IN;
|
|
||||||
cl.read_buf = &cl.read_op->req.buf;
|
|
||||||
cl.read_remaining = OSD_PACKET_SIZE;
|
|
||||||
cl.read_state = CL_READ_OP;
|
|
||||||
}
|
|
||||||
cl.read_iov.iov_base = cl.read_buf;
|
|
||||||
cl.read_iov.iov_len = cl.read_remaining;
|
|
||||||
cl.read_msg.msg_iov = &cl.read_iov;
|
|
||||||
cl.read_msg.msg_iovlen = 1;
|
|
||||||
data->callback = [this, peer_fd](ring_data_t *data) { handle_read(data, peer_fd); };
|
|
||||||
my_uring_prep_recvmsg(sqe, peer_fd, &cl.read_msg, 0);
|
|
||||||
}
|
}
|
||||||
read_ready_clients.clear();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_t::handle_read(ring_data_t *data, int peer_fd)
|
void osd_t::handle_read(ring_data_t *data, int peer_fd)
|
||||||
@@ -43,11 +49,9 @@ void osd_t::handle_read(ring_data_t *data, int peer_fd)
|
|||||||
if (cl_it != clients.end())
|
if (cl_it != clients.end())
|
||||||
{
|
{
|
||||||
auto & cl = cl_it->second;
|
auto & cl = cl_it->second;
|
||||||
|
cl.read_iov.iov_len = 0;
|
||||||
if (data->res == -EAGAIN)
|
if (data->res == -EAGAIN)
|
||||||
{
|
{
|
||||||
cl.read_ready--;
|
|
||||||
if (cl.read_ready > 0)
|
|
||||||
read_ready_clients.push_back(peer_fd);
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
else if (data->res < 0)
|
else if (data->res < 0)
|
||||||
@@ -57,7 +61,6 @@ void osd_t::handle_read(ring_data_t *data, int peer_fd)
|
|||||||
stop_client(peer_fd);
|
stop_client(peer_fd);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
read_ready_clients.push_back(peer_fd);
|
|
||||||
if (data->res > 0)
|
if (data->res > 0)
|
||||||
{
|
{
|
||||||
cl.read_remaining -= data->res;
|
cl.read_remaining -= data->res;
|
||||||
@@ -92,12 +95,12 @@ void osd_t::handle_read(ring_data_t *data, int peer_fd)
|
|||||||
cl.read_reply_id = 0;
|
cl.read_reply_id = 0;
|
||||||
cl.read_state = 0;
|
cl.read_state = 0;
|
||||||
// Measure subop latency
|
// Measure subop latency
|
||||||
timespec tv_end;
|
timeval tv_end;
|
||||||
clock_gettime(CLOCK_REALTIME, &tv_end);
|
gettimeofday(&tv_end, NULL);
|
||||||
subop_stat_count[request->req.hdr.opcode]++;
|
subop_stat_count[request->req.hdr.opcode]++;
|
||||||
subop_stat_sum[request->req.hdr.opcode] += (
|
subop_stat_sum[request->req.hdr.opcode] += (
|
||||||
(tv_end.tv_sec - request->tv_begin.tv_sec)*1000000 +
|
(tv_end.tv_sec - request->tv_begin.tv_sec)*1000000 +
|
||||||
(tv_end.tv_nsec - request->tv_begin.tv_nsec)/1000
|
tv_end.tv_usec - request->tv_begin.tv_usec
|
||||||
);
|
);
|
||||||
request->callback(request);
|
request->callback(request);
|
||||||
}
|
}
|
||||||
@@ -192,12 +195,12 @@ void osd_t::handle_reply_hdr(osd_client_t *cl)
|
|||||||
cl->read_state = 0;
|
cl->read_state = 0;
|
||||||
cl->sent_ops.erase(req_it);
|
cl->sent_ops.erase(req_it);
|
||||||
// Measure subop latency
|
// Measure subop latency
|
||||||
timespec tv_end;
|
timeval tv_end;
|
||||||
clock_gettime(CLOCK_REALTIME, &tv_end);
|
gettimeofday(&tv_end, NULL);
|
||||||
subop_stat_count[op->req.hdr.opcode]++;
|
subop_stat_count[op->req.hdr.opcode]++;
|
||||||
subop_stat_sum[op->req.hdr.opcode] += (
|
subop_stat_sum[op->req.hdr.opcode] += (
|
||||||
(tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
|
(tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
|
||||||
(tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000
|
tv_end.tv_usec - op->tv_begin.tv_usec
|
||||||
);
|
);
|
||||||
op->callback(op);
|
op->callback(op);
|
||||||
}
|
}
|
||||||
|
104
osd_send.cpp
104
osd_send.cpp
@@ -5,52 +5,15 @@ void osd_t::outbox_push(osd_client_t & cl, osd_op_t *cur_op)
|
|||||||
assert(cur_op->peer_fd);
|
assert(cur_op->peer_fd);
|
||||||
if (cur_op->op_type == OSD_OP_OUT)
|
if (cur_op->op_type == OSD_OP_OUT)
|
||||||
{
|
{
|
||||||
clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
|
gettimeofday(&cur_op->tv_begin, NULL);
|
||||||
|
}
|
||||||
|
if (cl.write_state == 0)
|
||||||
|
{
|
||||||
|
cl.write_state = CL_WRITE_READY;
|
||||||
|
write_ready_clients.push_back(cur_op->peer_fd);
|
||||||
}
|
}
|
||||||
cl.outbox.push_back(cur_op);
|
cl.outbox.push_back(cur_op);
|
||||||
if (cl.write_op || cl.outbox.size() > 1 || !try_send(cl))
|
ringloop->wakeup();
|
||||||
{
|
|
||||||
if (cl.write_state == 0)
|
|
||||||
{
|
|
||||||
cl.write_state = CL_WRITE_READY;
|
|
||||||
write_ready_clients.push_back(cur_op->peer_fd);
|
|
||||||
}
|
|
||||||
ringloop->wakeup();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
bool osd_t::try_send(osd_client_t & cl)
|
|
||||||
{
|
|
||||||
int peer_fd = cl.peer_fd;
|
|
||||||
io_uring_sqe* sqe = ringloop->get_sqe();
|
|
||||||
if (!sqe)
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
|
||||||
if (!cl.write_op)
|
|
||||||
{
|
|
||||||
// pick next command
|
|
||||||
cl.write_op = cl.outbox.front();
|
|
||||||
cl.outbox.pop_front();
|
|
||||||
cl.write_state = CL_WRITE_REPLY;
|
|
||||||
clock_gettime(CLOCK_REALTIME, &cl.write_op->tv_send);
|
|
||||||
if (cl.write_op->op_type == OSD_OP_IN)
|
|
||||||
{
|
|
||||||
// Measure execution latency
|
|
||||||
timespec tv_end = cl.write_op->tv_send;
|
|
||||||
op_stat_count[cl.write_op->req.hdr.opcode]++;
|
|
||||||
op_stat_sum[cl.write_op->req.hdr.opcode] += (
|
|
||||||
(tv_end.tv_sec - cl.write_op->tv_begin.tv_sec)*1000000 +
|
|
||||||
(tv_end.tv_nsec - cl.write_op->tv_begin.tv_nsec)/1000
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
cl.write_msg.msg_iov = cl.write_op->send_list.get_iovec();
|
|
||||||
cl.write_msg.msg_iovlen = cl.write_op->send_list.get_size();
|
|
||||||
data->callback = [this, peer_fd](ring_data_t *data) { handle_send(data, peer_fd); };
|
|
||||||
my_uring_prep_sendmsg(sqe, peer_fd, &cl.write_msg, 0);
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_t::send_replies()
|
void osd_t::send_replies()
|
||||||
@@ -58,11 +21,40 @@ void osd_t::send_replies()
|
|||||||
for (int i = 0; i < write_ready_clients.size(); i++)
|
for (int i = 0; i < write_ready_clients.size(); i++)
|
||||||
{
|
{
|
||||||
int peer_fd = write_ready_clients[i];
|
int peer_fd = write_ready_clients[i];
|
||||||
if (!try_send(clients[peer_fd]))
|
auto & cl = clients[peer_fd];
|
||||||
|
io_uring_sqe* sqe = ringloop->get_sqe();
|
||||||
|
if (!sqe)
|
||||||
{
|
{
|
||||||
write_ready_clients.erase(write_ready_clients.begin(), write_ready_clients.begin() + i);
|
write_ready_clients.erase(write_ready_clients.begin(), write_ready_clients.begin() + i);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
||||||
|
if (!cl.write_op)
|
||||||
|
{
|
||||||
|
// pick next command
|
||||||
|
cl.write_op = cl.outbox.front();
|
||||||
|
cl.outbox.pop_front();
|
||||||
|
cl.write_state = CL_WRITE_REPLY;
|
||||||
|
if (cl.write_op->op_type == OSD_OP_OUT)
|
||||||
|
{
|
||||||
|
gettimeofday(&cl.write_op->tv_send, NULL);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Measure execution latency
|
||||||
|
timeval tv_end;
|
||||||
|
gettimeofday(&tv_end, NULL);
|
||||||
|
op_stat_count[cl.write_op->req.hdr.opcode]++;
|
||||||
|
op_stat_sum[cl.write_op->req.hdr.opcode] += (
|
||||||
|
(tv_end.tv_sec - cl.write_op->tv_begin.tv_sec)*1000000 +
|
||||||
|
tv_end.tv_usec - cl.write_op->tv_begin.tv_usec
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
cl.write_msg.msg_iov = cl.write_op->send_list.get_iovec();
|
||||||
|
cl.write_msg.msg_iovlen = cl.write_op->send_list.get_size();
|
||||||
|
data->callback = [this, peer_fd](ring_data_t *data) { handle_send(data, peer_fd); };
|
||||||
|
my_uring_prep_sendmsg(sqe, peer_fd, &cl.write_msg, 0);
|
||||||
}
|
}
|
||||||
write_ready_clients.clear();
|
write_ready_clients.clear();
|
||||||
}
|
}
|
||||||
@@ -101,22 +93,24 @@ void osd_t::handle_send(ring_data_t *data, int peer_fd)
|
|||||||
if (cur_op->send_list.sent >= cur_op->send_list.count)
|
if (cur_op->send_list.sent >= cur_op->send_list.count)
|
||||||
{
|
{
|
||||||
// Done
|
// Done
|
||||||
if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_STABILIZE)
|
|
||||||
{
|
|
||||||
timespec tv_end;
|
|
||||||
clock_gettime(CLOCK_REALTIME, &tv_end);
|
|
||||||
send_stat_count++;
|
|
||||||
send_stat_sum += (
|
|
||||||
(tv_end.tv_sec - cl.write_op->tv_send.tv_sec)*1000000 +
|
|
||||||
(tv_end.tv_nsec - cl.write_op->tv_send.tv_nsec)/1000
|
|
||||||
);
|
|
||||||
}
|
|
||||||
if (cur_op->op_type == OSD_OP_IN)
|
if (cur_op->op_type == OSD_OP_IN)
|
||||||
{
|
{
|
||||||
delete cur_op;
|
delete cur_op;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
// Measure subops with data
|
||||||
|
if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_STABILIZE ||
|
||||||
|
cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
|
||||||
|
{
|
||||||
|
timeval tv_end;
|
||||||
|
gettimeofday(&tv_end, NULL);
|
||||||
|
send_stat_count++;
|
||||||
|
send_stat_sum += (
|
||||||
|
(tv_end.tv_sec - cl.write_op->tv_send.tv_sec)*1000000 +
|
||||||
|
tv_end.tv_usec - cl.write_op->tv_send.tv_usec
|
||||||
|
);
|
||||||
|
}
|
||||||
cl.sent_ops[cl.write_op->req.hdr.id] = cl.write_op;
|
cl.sent_ops[cl.write_op->req.hdr.id] = cl.write_op;
|
||||||
}
|
}
|
||||||
cl.write_op = NULL;
|
cl.write_op = NULL;
|
||||||
|
@@ -146,7 +146,8 @@ public:
|
|||||||
}
|
}
|
||||||
inline int wait()
|
inline int wait()
|
||||||
{
|
{
|
||||||
return io_uring_submit_and_wait(&ring, 1);
|
struct io_uring_cqe *cqe;
|
||||||
|
return io_uring_wait_cqe(&ring, &cqe);
|
||||||
}
|
}
|
||||||
inline unsigned space_left()
|
inline unsigned space_left()
|
||||||
{
|
{
|
||||||
|
@@ -50,37 +50,3 @@ int write_blocking(int fd, void *write_buf, size_t remaining)
|
|||||||
}
|
}
|
||||||
return done;
|
return done;
|
||||||
}
|
}
|
||||||
|
|
||||||
int writev_blocking(int fd, iovec *iov, int iovcnt)
|
|
||||||
{
|
|
||||||
int v = 0;
|
|
||||||
int done = 0;
|
|
||||||
while (v < iovcnt)
|
|
||||||
{
|
|
||||||
ssize_t r = writev(fd, iov, iovcnt);
|
|
||||||
if (r < 0)
|
|
||||||
{
|
|
||||||
if (errno != EAGAIN && errno != EPIPE)
|
|
||||||
{
|
|
||||||
perror("writev");
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
while (v < iovcnt)
|
|
||||||
{
|
|
||||||
if (iov[v].iov_len > r)
|
|
||||||
{
|
|
||||||
iov[v].iov_len -= r;
|
|
||||||
iov[v].iov_base += r;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
v++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
done += r;
|
|
||||||
}
|
|
||||||
return done;
|
|
||||||
}
|
|
||||||
|
@@ -1,8 +1,6 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#include <sys/uio.h>
|
|
||||||
|
|
||||||
int read_blocking(int fd, void *read_buf, size_t remaining);
|
int read_blocking(int fd, void *read_buf, size_t remaining);
|
||||||
int write_blocking(int fd, void *write_buf, size_t remaining);
|
int write_blocking(int fd, void *write_buf, size_t remaining);
|
||||||
int writev_blocking(int fd, iovec *iov, int iovcnt);
|
|
||||||
|
148
stub_bench.cpp
148
stub_bench.cpp
@@ -1,148 +0,0 @@
|
|||||||
/**
|
|
||||||
* Stub benchmarker
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <sys/types.h>
|
|
||||||
#include <time.h>
|
|
||||||
#include <sys/socket.h>
|
|
||||||
#include <netinet/in.h>
|
|
||||||
#include <netinet/tcp.h>
|
|
||||||
#include <arpa/inet.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <unistd.h>
|
|
||||||
#include <fcntl.h>
|
|
||||||
#include <errno.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <signal.h>
|
|
||||||
|
|
||||||
#include <stdexcept>
|
|
||||||
|
|
||||||
#include "rw_blocking.h"
|
|
||||||
#include "osd_ops.h"
|
|
||||||
|
|
||||||
int connect_stub(const char *server_address, int server_port);
|
|
||||||
|
|
||||||
void run_bench(int peer_fd);
|
|
||||||
|
|
||||||
static uint64_t write_sum = 0, write_count = 0;
|
|
||||||
static uint64_t sync_sum = 0, sync_count = 0;
|
|
||||||
|
|
||||||
void handle_sigint(int sig)
|
|
||||||
{
|
|
||||||
printf("4k randwrite: %lu us avg\n", write_sum/write_count);
|
|
||||||
printf("sync: %lu us avg\n", sync_sum/sync_count);
|
|
||||||
exit(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int narg, char *args[])
|
|
||||||
{
|
|
||||||
signal(SIGINT, handle_sigint);
|
|
||||||
int peer_fd = connect_stub("127.0.0.1", 11203);
|
|
||||||
run_bench(peer_fd);
|
|
||||||
close(peer_fd);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int connect_stub(const char *server_address, int server_port)
|
|
||||||
{
|
|
||||||
struct sockaddr_in addr;
|
|
||||||
int r;
|
|
||||||
if ((r = inet_pton(AF_INET, server_address, &addr.sin_addr)) != 1)
|
|
||||||
{
|
|
||||||
fprintf(stderr, "server address: %s%s\n", server_address, r == 0 ? " is not valid" : ": no ipv4 support");
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
addr.sin_family = AF_INET;
|
|
||||||
addr.sin_port = htons(server_port);
|
|
||||||
int connect_fd = socket(AF_INET, SOCK_STREAM, 0);
|
|
||||||
if (connect_fd < 0)
|
|
||||||
{
|
|
||||||
perror("socket");
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
if (connect(connect_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
|
|
||||||
{
|
|
||||||
perror("connect");
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
int one = 1;
|
|
||||||
setsockopt(connect_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
|
|
||||||
return connect_fd;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool check_reply(int r, osd_any_op_t & op, osd_any_reply_t & reply, int expected)
|
|
||||||
{
|
|
||||||
if (r != OSD_PACKET_SIZE)
|
|
||||||
{
|
|
||||||
printf("read failed\n");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (reply.hdr.magic != SECONDARY_OSD_REPLY_MAGIC ||
|
|
||||||
reply.hdr.id != op.hdr.id || reply.hdr.opcode != op.hdr.opcode)
|
|
||||||
{
|
|
||||||
printf("bad reply: magic, id or opcode does not match request\n");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (reply.hdr.retval != expected)
|
|
||||||
{
|
|
||||||
printf("operation failed, retval=%ld (%s)\n", reply.hdr.retval, strerror(-reply.hdr.retval));
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
void run_bench(int peer_fd)
|
|
||||||
{
|
|
||||||
osd_any_op_t op;
|
|
||||||
osd_any_reply_t reply;
|
|
||||||
void *buf = NULL;
|
|
||||||
int r;
|
|
||||||
timespec tv_begin, tv_end;
|
|
||||||
clock_gettime(CLOCK_REALTIME, &tv_begin);
|
|
||||||
while (1)
|
|
||||||
{
|
|
||||||
// write
|
|
||||||
op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
|
|
||||||
op.hdr.id = 1;
|
|
||||||
op.hdr.opcode = OSD_OP_SECONDARY_WRITE;
|
|
||||||
op.sec_rw.oid.inode = 3;
|
|
||||||
op.sec_rw.oid.stripe = (rand() << 17) % (1 << 29); // 512 MB
|
|
||||||
op.sec_rw.version = 0;
|
|
||||||
op.sec_rw.len = 4096;
|
|
||||||
op.sec_rw.offset = (rand() * op.sec_rw.len) % (1 << 17);
|
|
||||||
buf = malloc(op.sec_rw.len);
|
|
||||||
memset(buf, rand() % 255, op.sec_rw.len);
|
|
||||||
r = write_blocking(peer_fd, op.buf, OSD_PACKET_SIZE) == OSD_PACKET_SIZE;
|
|
||||||
if (r)
|
|
||||||
r = write_blocking(peer_fd, buf, op.sec_rw.len) == op.sec_rw.len;
|
|
||||||
free(buf);
|
|
||||||
if (!r)
|
|
||||||
break;
|
|
||||||
r = read_blocking(peer_fd, reply.buf, OSD_PACKET_SIZE);
|
|
||||||
if (!check_reply(r, op, reply, op.sec_rw.len))
|
|
||||||
break;
|
|
||||||
clock_gettime(CLOCK_REALTIME, &tv_end);
|
|
||||||
write_count++;
|
|
||||||
write_sum += (
|
|
||||||
(tv_end.tv_sec - tv_begin.tv_sec)*1000000 +
|
|
||||||
tv_end.tv_nsec/1000 - tv_begin.tv_nsec/1000
|
|
||||||
);
|
|
||||||
// sync/stab
|
|
||||||
op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
|
|
||||||
op.hdr.id = 1;
|
|
||||||
op.hdr.opcode = OSD_OP_TEST_SYNC_STAB_ALL;
|
|
||||||
r = write_blocking(peer_fd, op.buf, OSD_PACKET_SIZE) == OSD_PACKET_SIZE;
|
|
||||||
if (!r)
|
|
||||||
break;
|
|
||||||
r = read_blocking(peer_fd, reply.buf, OSD_PACKET_SIZE);
|
|
||||||
if (!check_reply(r, op, reply, 0))
|
|
||||||
break;
|
|
||||||
clock_gettime(CLOCK_REALTIME, &tv_begin);
|
|
||||||
sync_count++;
|
|
||||||
sync_sum += (
|
|
||||||
(tv_begin.tv_sec - tv_end.tv_sec)*1000000 +
|
|
||||||
tv_begin.tv_nsec/1000 - tv_end.tv_nsec/1000
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
Reference in New Issue
Block a user