Compare commits
8 Commits
blocking-u
...
test-sq-po
Author | SHA1 | Date | |
---|---|---|---|
eabfe4faac | |||
c9f3654905 | |||
2a5ca4ff6f | |||
2575431176 | |||
20125db181 | |||
7eac7b6d55 | |||
79839ec31d | |||
9d96e4bf0b |
18
Makefile
18
Makefile
@@ -2,7 +2,7 @@ BLOCKSTORE_OBJS := allocator.o blockstore.o blockstore_impl.o blockstore_init.o
|
||||
blockstore_write.o blockstore_sync.o blockstore_stable.o blockstore_rollback.o blockstore_flush.o crc32c.o ringloop.o timerfd_interval.o
|
||||
# -fsanitize=address
|
||||
CXXFLAGS := -g -O3 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fPIC -fdiagnostics-color=always
|
||||
all: $(BLOCKSTORE_OBJS) libfio_blockstore.so osd libfio_sec_osd.so test_blockstore stub_osd osd_test
|
||||
all: $(BLOCKSTORE_OBJS) libfio_blockstore.so osd libfio_sec_osd.so test_blockstore stub_osd stub_bench osd_test
|
||||
clean:
|
||||
rm -f *.o
|
||||
|
||||
@@ -14,7 +14,7 @@ allocator.o: allocator.cpp allocator.h
|
||||
g++ $(CXXFLAGS) -c -o $@ $<
|
||||
ringloop.o: ringloop.cpp ringloop.h
|
||||
g++ $(CXXFLAGS) -c -o $@ $<
|
||||
timerfd_interval.o: timerfd_interval.cpp timerfd_interval.h
|
||||
timerfd_interval.o: timerfd_interval.cpp timerfd_interval.h ringloop.h
|
||||
g++ $(CXXFLAGS) -c -o $@ $<
|
||||
|
||||
%.o: %.cpp allocator.h blockstore_flush.h blockstore.h blockstore_impl.h blockstore_init.h blockstore_journal.h crc32c.h ringloop.h timerfd_interval.h object_id.h
|
||||
@@ -26,13 +26,13 @@ libfio_blockstore.so: ./libblockstore.so fio_engine.cpp json11.o
|
||||
g++ $(CXXFLAGS) -shared -o libfio_blockstore.so fio_engine.cpp json11.o ./libblockstore.so -ltcmalloc_minimal -luring
|
||||
|
||||
OSD_OBJS := osd.o osd_secondary.o osd_receive.o osd_send.o osd_peering.o osd_peering_pg.o osd_primary.o osd_rmw.o json11.o timerfd_interval.o
|
||||
osd_secondary.o: osd_secondary.cpp osd.h osd_ops.h
|
||||
osd_secondary.o: osd_secondary.cpp osd.h osd_ops.h ringloop.h
|
||||
g++ $(CXXFLAGS) -c -o $@ $<
|
||||
osd_receive.o: osd_receive.cpp osd.h osd_ops.h
|
||||
osd_receive.o: osd_receive.cpp osd.h osd_ops.h ringloop.h
|
||||
g++ $(CXXFLAGS) -c -o $@ $<
|
||||
osd_send.o: osd_send.cpp osd.h osd_ops.h
|
||||
osd_send.o: osd_send.cpp osd.h osd_ops.h ringloop.h
|
||||
g++ $(CXXFLAGS) -c -o $@ $<
|
||||
osd_peering.o: osd_peering.cpp osd.h osd_ops.h osd_peering_pg.h
|
||||
osd_peering.o: osd_peering.cpp osd.h osd_ops.h osd_peering_pg.h ringloop.h
|
||||
g++ $(CXXFLAGS) -c -o $@ $<
|
||||
osd_peering_pg.o: osd_peering_pg.cpp object_id.h osd_peering_pg.h
|
||||
g++ $(CXXFLAGS) -c -o $@ $<
|
||||
@@ -40,14 +40,16 @@ osd_rmw.o: osd_rmw.cpp osd_rmw.h xor.h
|
||||
g++ $(CXXFLAGS) -c -o $@ $<
|
||||
osd_rmw_test: osd_rmw_test.cpp osd_rmw.cpp osd_rmw.h xor.h
|
||||
g++ $(CXXFLAGS) -o $@ $<
|
||||
osd_primary.o: osd_primary.cpp osd.h osd_ops.h osd_peering_pg.h xor.h
|
||||
osd_primary.o: osd_primary.cpp osd.h osd_ops.h osd_peering_pg.h xor.h ringloop.h
|
||||
g++ $(CXXFLAGS) -c -o $@ $<
|
||||
osd.o: osd.cpp osd.h osd_ops.h osd_peering_pg.h
|
||||
osd.o: osd.cpp osd.h osd_ops.h osd_peering_pg.h ringloop.h
|
||||
g++ $(CXXFLAGS) -c -o $@ $<
|
||||
osd: ./libblockstore.so osd_main.cpp osd.h osd_ops.h $(OSD_OBJS)
|
||||
g++ $(CXXFLAGS) -o osd osd_main.cpp $(OSD_OBJS) ./libblockstore.so -ltcmalloc_minimal -luring
|
||||
stub_osd: stub_osd.cpp osd_ops.h rw_blocking.o
|
||||
g++ $(CXXFLAGS) -o stub_osd stub_osd.cpp rw_blocking.o -ltcmalloc_minimal
|
||||
stub_bench: stub_bench.cpp osd_ops.h rw_blocking.o
|
||||
g++ $(CXXFLAGS) -o stub_bench stub_bench.cpp rw_blocking.o -ltcmalloc_minimal
|
||||
rw_blocking.o: rw_blocking.cpp rw_blocking.h
|
||||
g++ $(CXXFLAGS) -c -o $@ $<
|
||||
osd_test: osd_test.cpp osd_ops.h rw_blocking.o
|
||||
|
@@ -287,8 +287,9 @@ resume_1:
|
||||
data->iov = (struct iovec){ it->buf, (size_t)it->len };
|
||||
data->callback = simple_callback_w;
|
||||
my_uring_prep_writev(
|
||||
sqe, bs->data_fd, &data->iov, 1, bs->data_offset + clean_loc + it->offset
|
||||
sqe, bs->data_fd_index, &data->iov, 1, bs->data_offset + clean_loc + it->offset
|
||||
);
|
||||
sqe->flags |= IOSQE_FIXED_FILE;
|
||||
wait_count++;
|
||||
}
|
||||
// Sync data before writing metadata
|
||||
@@ -320,8 +321,9 @@ resume_1:
|
||||
data->iov = (struct iovec){ meta_old.buf, bs->meta_block_size };
|
||||
data->callback = simple_callback_w;
|
||||
my_uring_prep_writev(
|
||||
sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + meta_old.sector
|
||||
sqe, bs->meta_fd_index, &data->iov, 1, bs->meta_offset + meta_old.sector
|
||||
);
|
||||
sqe->flags |= IOSQE_FIXED_FILE;
|
||||
wait_count++;
|
||||
}
|
||||
if (has_delete)
|
||||
@@ -342,8 +344,9 @@ resume_1:
|
||||
data->iov = (struct iovec){ meta_new.buf, bs->meta_block_size };
|
||||
data->callback = simple_callback_w;
|
||||
my_uring_prep_writev(
|
||||
sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + meta_new.sector
|
||||
sqe, bs->meta_fd_index, &data->iov, 1, bs->meta_offset + meta_new.sector
|
||||
);
|
||||
sqe->flags |= IOSQE_FIXED_FILE;
|
||||
wait_count++;
|
||||
resume_7:
|
||||
if (wait_count > 0)
|
||||
@@ -405,7 +408,8 @@ resume_1:
|
||||
((journal_entry_start*)flusher->journal_superblock)->crc32 = je_crc32((journal_entry*)flusher->journal_superblock);
|
||||
data->iov = (struct iovec){ flusher->journal_superblock, bs->journal_block_size };
|
||||
data->callback = simple_callback_w;
|
||||
my_uring_prep_writev(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset);
|
||||
my_uring_prep_writev(sqe, bs->journal_fd_index, &data->iov, 1, bs->journal.offset);
|
||||
sqe->flags |= IOSQE_FIXED_FILE;
|
||||
wait_count++;
|
||||
resume_13:
|
||||
if (wait_count > 0)
|
||||
@@ -485,8 +489,9 @@ bool journal_flusher_co::scan_dirty(int wait_base)
|
||||
data->iov = (struct iovec){ v.back().buf, (size_t)submit_len };
|
||||
data->callback = simple_callback_r;
|
||||
my_uring_prep_readv(
|
||||
sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + submit_offset
|
||||
sqe, bs->journal_fd_index, &data->iov, 1, bs->journal.offset + submit_offset
|
||||
);
|
||||
sqe->flags |= IOSQE_FIXED_FILE;
|
||||
wait_count++;
|
||||
}
|
||||
}
|
||||
@@ -568,8 +573,9 @@ bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_
|
||||
data->callback = simple_callback_r;
|
||||
wr.submitted = true;
|
||||
my_uring_prep_readv(
|
||||
sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + wr.sector
|
||||
sqe, bs->meta_fd_index, &data->iov, 1, bs->meta_offset + wr.sector
|
||||
);
|
||||
sqe->flags |= IOSQE_FIXED_FILE;
|
||||
wait_count++;
|
||||
}
|
||||
else
|
||||
@@ -638,7 +644,8 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
|
||||
await_sqe(0);
|
||||
data->iov = { 0 };
|
||||
data->callback = simple_callback_w;
|
||||
my_uring_prep_fsync(sqe, fsync_meta ? bs->meta_fd : bs->data_fd, IORING_FSYNC_DATASYNC);
|
||||
my_uring_prep_fsync(sqe, fsync_meta ? bs->meta_fd_index : bs->data_fd_index, IORING_FSYNC_DATASYNC);
|
||||
sqe->flags |= IOSQE_FIXED_FILE;
|
||||
cur_sync->state = 1;
|
||||
wait_count++;
|
||||
resume_1:
|
||||
|
@@ -216,6 +216,7 @@ class blockstore_impl_t
|
||||
int data_fd;
|
||||
uint64_t meta_size, meta_area, meta_len;
|
||||
uint64_t data_size, data_len;
|
||||
int meta_fd_index, data_fd_index, journal_fd_index;
|
||||
|
||||
void *metadata_buffer = NULL;
|
||||
|
||||
|
@@ -55,7 +55,8 @@ int blockstore_init_meta::loop()
|
||||
bs->meta_len - metadata_read > bs->metadata_buf_size ? bs->metadata_buf_size : bs->meta_len - metadata_read,
|
||||
};
|
||||
data->callback = [this](ring_data_t *data) { handle_event(data); };
|
||||
my_uring_prep_readv(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + metadata_read);
|
||||
my_uring_prep_readv(sqe, bs->meta_fd_index, &data->iov, 1, bs->meta_offset + metadata_read);
|
||||
sqe->flags |= IOSQE_FIXED_FILE;
|
||||
bs->ringloop->submit();
|
||||
submitted = (prev == 1 ? 2 : 1);
|
||||
prev = submitted;
|
||||
@@ -216,7 +217,8 @@ int blockstore_init_journal::loop()
|
||||
data = ((ring_data_t*)sqe->user_data);
|
||||
data->iov = { submitted_buf, bs->journal.block_size };
|
||||
data->callback = simple_callback;
|
||||
my_uring_prep_readv(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset);
|
||||
my_uring_prep_readv(sqe, bs->journal_fd_index, &data->iov, 1, bs->journal.offset);
|
||||
sqe->flags |= IOSQE_FIXED_FILE;
|
||||
bs->ringloop->submit();
|
||||
wait_count = 1;
|
||||
resume_1:
|
||||
@@ -254,7 +256,8 @@ resume_1:
|
||||
GET_SQE();
|
||||
data->iov = (struct iovec){ submitted_buf, 2*bs->journal.block_size };
|
||||
data->callback = simple_callback;
|
||||
my_uring_prep_writev(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset);
|
||||
my_uring_prep_writev(sqe, bs->journal_fd_index, &data->iov, 1, bs->journal.offset);
|
||||
sqe->flags |= IOSQE_FIXED_FILE;
|
||||
wait_count++;
|
||||
bs->ringloop->submit();
|
||||
resume_6:
|
||||
@@ -266,7 +269,8 @@ resume_1:
|
||||
if (!bs->disable_journal_fsync)
|
||||
{
|
||||
GET_SQE();
|
||||
my_uring_prep_fsync(sqe, bs->journal.fd, IORING_FSYNC_DATASYNC);
|
||||
my_uring_prep_fsync(sqe, bs->journal_fd_index, IORING_FSYNC_DATASYNC);
|
||||
sqe->flags |= IOSQE_FIXED_FILE;
|
||||
data->iov = { 0 };
|
||||
data->callback = simple_callback;
|
||||
wait_count++;
|
||||
@@ -325,7 +329,8 @@ resume_1:
|
||||
end - journal_pos < JOURNAL_BUFFER_SIZE ? end - journal_pos : JOURNAL_BUFFER_SIZE,
|
||||
};
|
||||
data->callback = [this](ring_data_t *data1) { handle_event(data1); };
|
||||
my_uring_prep_readv(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + journal_pos);
|
||||
my_uring_prep_readv(sqe, bs->journal_fd_index, &data->iov, 1, bs->journal.offset + journal_pos);
|
||||
sqe->flags |= IOSQE_FIXED_FILE;
|
||||
bs->ringloop->submit();
|
||||
}
|
||||
while (done.size() > 0)
|
||||
@@ -340,7 +345,8 @@ resume_1:
|
||||
GET_SQE();
|
||||
data->iov = { init_write_buf, bs->journal.block_size };
|
||||
data->callback = simple_callback;
|
||||
my_uring_prep_writev(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + init_write_sector);
|
||||
my_uring_prep_writev(sqe, bs->journal_fd_index, &data->iov, 1, bs->journal.offset + init_write_sector);
|
||||
sqe->flags |= IOSQE_FIXED_FILE;
|
||||
wait_count++;
|
||||
bs->ringloop->submit();
|
||||
resume_7:
|
||||
@@ -354,7 +360,8 @@ resume_1:
|
||||
GET_SQE();
|
||||
data->iov = { 0 };
|
||||
data->callback = simple_callback;
|
||||
my_uring_prep_fsync(sqe, bs->journal.fd, IORING_FSYNC_DATASYNC);
|
||||
my_uring_prep_fsync(sqe, bs->journal_fd_index, IORING_FSYNC_DATASYNC);
|
||||
sqe->flags |= IOSQE_FIXED_FILE;
|
||||
wait_count++;
|
||||
bs->ringloop->submit();
|
||||
}
|
||||
|
@@ -126,8 +126,9 @@ void prepare_journal_sector_write(journal_t & journal, int cur_sector, io_uring_
|
||||
};
|
||||
data->callback = cb;
|
||||
my_uring_prep_writev(
|
||||
sqe, journal.fd, &data->iov, 1, journal.offset + journal.sector_info[cur_sector].offset
|
||||
sqe, journal.fd_index, &data->iov, 1, journal.offset + journal.sector_info[cur_sector].offset
|
||||
);
|
||||
sqe->flags |= IOSQE_FIXED_FILE;
|
||||
}
|
||||
|
||||
journal_t::~journal_t()
|
||||
|
@@ -130,7 +130,7 @@ struct journal_sector_info_t
|
||||
|
||||
struct journal_t
|
||||
{
|
||||
int fd;
|
||||
int fd, fd_index;
|
||||
uint64_t device_size;
|
||||
bool inmemory = false;
|
||||
void *buffer = NULL;
|
||||
|
@@ -140,6 +140,10 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
|
||||
|
||||
void blockstore_impl_t::calc_lengths()
|
||||
{
|
||||
// register fds
|
||||
data_fd_index = ringloop->register_fd(data_fd);
|
||||
meta_fd_index = meta_fd == data_fd ? data_fd_index : ringloop->register_fd(meta_fd);
|
||||
journal.fd_index = journal_fd_index = journal.fd == meta_fd ? meta_fd_index : ringloop->register_fd(journal.fd);
|
||||
// data
|
||||
data_len = data_size - data_offset;
|
||||
if (data_fd == meta_fd && data_offset < meta_offset)
|
||||
|
@@ -31,10 +31,11 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_
|
||||
PRIV(op)->pending_ops++;
|
||||
my_uring_prep_readv(
|
||||
sqe,
|
||||
IS_JOURNAL(item_state) ? journal.fd : data_fd,
|
||||
IS_JOURNAL(item_state) ? journal_fd_index : data_fd_index,
|
||||
&data->iov, 1,
|
||||
(IS_JOURNAL(item_state) ? journal.offset : data_offset) + offset
|
||||
);
|
||||
sqe->flags |= IOSQE_FIXED_FILE;
|
||||
data->callback = [this, op](ring_data_t *data) { handle_read_event(data, op); };
|
||||
return 1;
|
||||
}
|
||||
|
@@ -78,7 +78,8 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
||||
if (!disable_data_fsync)
|
||||
{
|
||||
BS_SUBMIT_GET_SQE(sqe, data);
|
||||
my_uring_prep_fsync(sqe, data_fd, IORING_FSYNC_DATASYNC);
|
||||
my_uring_prep_fsync(sqe, data_fd_index, IORING_FSYNC_DATASYNC);
|
||||
sqe->flags |= IOSQE_FIXED_FILE;
|
||||
data->iov = { 0 };
|
||||
data->callback = cb;
|
||||
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
|
||||
@@ -161,7 +162,8 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
||||
if (!disable_journal_fsync)
|
||||
{
|
||||
BS_SUBMIT_GET_SQE(sqe, data);
|
||||
my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
|
||||
my_uring_prep_fsync(sqe, journal_fd_index, IORING_FSYNC_DATASYNC);
|
||||
sqe->flags |= IOSQE_FIXED_FILE;
|
||||
data->iov = { 0 };
|
||||
data->callback = cb;
|
||||
PRIV(op)->pending_ops = 1;
|
||||
|
@@ -122,8 +122,9 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||
data->iov.iov_len = op->len + stripe_offset + stripe_end; // to check it in the callback
|
||||
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
|
||||
my_uring_prep_writev(
|
||||
sqe, data_fd, PRIV(op)->iov_zerofill, vcnt, data_offset + (loc << block_order) + op->offset - stripe_offset
|
||||
sqe, data_fd_index, PRIV(op)->iov_zerofill, vcnt, data_offset + (loc << block_order) + op->offset - stripe_offset
|
||||
);
|
||||
sqe->flags |= IOSQE_FIXED_FILE;
|
||||
PRIV(op)->pending_ops = 1;
|
||||
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
|
||||
// Remember big write as unsynced
|
||||
@@ -198,8 +199,9 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||
data2->iov = (struct iovec){ op->buf, op->len };
|
||||
data2->callback = cb;
|
||||
my_uring_prep_writev(
|
||||
sqe2, journal.fd, &data2->iov, 1, journal.offset + journal.next_free
|
||||
sqe2, journal_fd_index, &data2->iov, 1, journal.offset + journal.next_free
|
||||
);
|
||||
sqe2->flags |= IOSQE_FIXED_FILE;
|
||||
PRIV(op)->pending_ops++;
|
||||
}
|
||||
else
|
||||
|
@@ -261,15 +261,18 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
|
||||
bsd->op_n++;
|
||||
bsd->queue[n] = io;
|
||||
|
||||
if (write(bsd->connect_fd, op.buf, OSD_PACKET_SIZE) != OSD_PACKET_SIZE)
|
||||
{
|
||||
perror("write");
|
||||
exit(1);
|
||||
}
|
||||
iovec iov[2] = { { .iov_base = op.buf, .iov_len = OSD_PACKET_SIZE } };
|
||||
int iovcnt = 1, wtotal = OSD_PACKET_SIZE;
|
||||
if (io->ddir == DDIR_WRITE)
|
||||
{
|
||||
// Send data
|
||||
write_blocking(bsd->connect_fd, io->xfer_buf, io->xfer_buflen);
|
||||
iov[1] = { .iov_base = io->xfer_buf, .iov_len = io->xfer_buflen };
|
||||
wtotal += io->xfer_buflen;
|
||||
iovcnt++;
|
||||
}
|
||||
if (writev_blocking(bsd->connect_fd, iov, iovcnt) != wtotal)
|
||||
{
|
||||
perror("writev");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (io->error != 0)
|
||||
|
44
osd.cpp
44
osd.cpp
@@ -50,7 +50,7 @@ osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringlo
|
||||
}
|
||||
if (send_stat_count != 0)
|
||||
{
|
||||
printf("avg latency to send subops with data: %ld us\n", send_stat_sum/send_stat_count);
|
||||
printf("avg latency to send stabilize subop: %ld us\n", send_stat_sum/send_stat_count);
|
||||
send_stat_count = 0;
|
||||
send_stat_sum = 0;
|
||||
}
|
||||
@@ -110,6 +110,7 @@ osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringlo
|
||||
close(listen_fd);
|
||||
throw std::runtime_error(std::string("epoll_create: ") + strerror(errno));
|
||||
}
|
||||
epoll_fd_index = ringloop->register_fd(epoll_fd);
|
||||
|
||||
epoll_event ev;
|
||||
ev.data.fd = listen_fd;
|
||||
@@ -180,6 +181,25 @@ void osd_t::loop()
|
||||
|
||||
void osd_t::handle_epoll_events()
|
||||
{
|
||||
io_uring_sqe *sqe = ringloop->get_sqe();
|
||||
if (!sqe)
|
||||
{
|
||||
throw std::runtime_error("can't get SQE, will fall out of sync with EPOLLET");
|
||||
}
|
||||
ring_data_t *data = ((ring_data_t*)sqe->user_data);
|
||||
data->allow_cancel = true;
|
||||
my_uring_prep_poll_add(sqe, epoll_fd_index, POLLIN);
|
||||
sqe->flags |= IOSQE_FIXED_FILE;
|
||||
data->callback = [this](ring_data_t *data)
|
||||
{
|
||||
if (data->res < 0 && data->res != -ECANCELED)
|
||||
{
|
||||
throw std::runtime_error(std::string("epoll failed: ") + strerror(-data->res));
|
||||
}
|
||||
handle_epoll_events();
|
||||
};
|
||||
ringloop->submit();
|
||||
// FIXME With SQ thread we have no guarantee that epoll request will be submitted right here...
|
||||
int nfds;
|
||||
epoll_event events[MAX_EPOLL_EVENTS];
|
||||
restart:
|
||||
@@ -203,12 +223,13 @@ restart:
|
||||
.peer_addr = addr,
|
||||
.peer_port = ntohs(addr.sin_port),
|
||||
.peer_fd = peer_fd,
|
||||
.peer_fd_index = ringloop->register_fd(peer_fd),
|
||||
.peer_state = PEER_CONNECTED,
|
||||
};
|
||||
// Add FD to epoll
|
||||
epoll_event ev;
|
||||
ev.data.fd = peer_fd;
|
||||
ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
|
||||
ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET;
|
||||
if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, peer_fd, &ev) < 0)
|
||||
{
|
||||
throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
|
||||
@@ -247,25 +268,10 @@ restart:
|
||||
}
|
||||
}
|
||||
}
|
||||
if (nfds == MAX_EPOLL_EVENTS)
|
||||
if (nfds > 0)
|
||||
{
|
||||
goto restart;
|
||||
}
|
||||
io_uring_sqe *sqe = ringloop->get_sqe();
|
||||
if (!sqe)
|
||||
{
|
||||
throw std::runtime_error("can't get SQE, will fall out of sync with EPOLLET");
|
||||
}
|
||||
ring_data_t *data = ((ring_data_t*)sqe->user_data);
|
||||
my_uring_prep_poll_add(sqe, epoll_fd, POLLIN);
|
||||
data->callback = [this](ring_data_t *data)
|
||||
{
|
||||
if (data->res < 0)
|
||||
{
|
||||
throw std::runtime_error(std::string("epoll failed: ") + strerror(-data->res));
|
||||
}
|
||||
handle_epoll_events();
|
||||
};
|
||||
}
|
||||
|
||||
void osd_t::cancel_osd_ops(osd_client_t & cl)
|
||||
@@ -349,7 +355,7 @@ void osd_t::stop_client(int peer_fd)
|
||||
|
||||
void osd_t::exec_op(osd_op_t *cur_op)
|
||||
{
|
||||
gettimeofday(&cur_op->tv_begin, NULL);
|
||||
clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
|
||||
if (stopping)
|
||||
{
|
||||
// Throw operation away
|
||||
|
11
osd.h
11
osd.h
@@ -1,7 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <sys/time.h>
|
||||
#include <time.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
@@ -93,8 +93,8 @@ struct osd_primary_op_data_t;
|
||||
|
||||
struct osd_op_t
|
||||
{
|
||||
timeval tv_begin;
|
||||
timeval tv_send;
|
||||
timespec tv_begin;
|
||||
timespec tv_send;
|
||||
int op_type = OSD_OP_IN;
|
||||
int peer_fd;
|
||||
osd_any_op_t req;
|
||||
@@ -122,7 +122,7 @@ struct osd_client_t
|
||||
{
|
||||
sockaddr_in peer_addr;
|
||||
int peer_port;
|
||||
int peer_fd;
|
||||
int peer_fd, peer_fd_index;
|
||||
int peer_state;
|
||||
std::function<void(osd_num_t, int)> connect_callback;
|
||||
osd_num_t osd_num = 0;
|
||||
@@ -196,7 +196,7 @@ class osd_t
|
||||
timerfd_interval *tick_tfd;
|
||||
|
||||
int wait_state = 0;
|
||||
int epoll_fd = 0;
|
||||
int epoll_fd = 0, epoll_fd_index = -1;
|
||||
int listen_fd = 0;
|
||||
ring_consumer_t consumer;
|
||||
|
||||
@@ -219,6 +219,7 @@ class osd_t
|
||||
void handle_read(ring_data_t *data, int peer_fd);
|
||||
void handle_op_hdr(osd_client_t *cl);
|
||||
void handle_reply_hdr(osd_client_t *cl);
|
||||
bool try_send(osd_client_t & cl);
|
||||
void send_replies();
|
||||
void handle_send(ring_data_t *data, int peer_fd);
|
||||
void outbox_push(osd_client_t & cl, osd_op_t *op);
|
||||
|
@@ -116,6 +116,7 @@ void osd_t::handle_connect_result(int peer_fd)
|
||||
int one = 1;
|
||||
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
|
||||
// Disable EPOLLOUT on this fd
|
||||
cl.peer_fd_index = ringloop->register_fd(peer_fd);
|
||||
cl.connect_callback = NULL;
|
||||
cl.peer_state = PEER_CONNECTED;
|
||||
epoll_event ev;
|
||||
|
@@ -32,36 +32,38 @@ void osd_t::read_requests()
|
||||
cl.read_msg.msg_iov = &cl.read_iov;
|
||||
cl.read_msg.msg_iovlen = 1;
|
||||
data->callback = [this, peer_fd](ring_data_t *data) { handle_read(data, peer_fd); };
|
||||
my_uring_prep_recvmsg(sqe, peer_fd, &cl.read_msg, 0);
|
||||
my_uring_prep_recvmsg(sqe, cl.peer_fd_index, &cl.read_msg, 0);
|
||||
sqe->flags |= IOSQE_FIXED_FILE;
|
||||
}
|
||||
read_ready_clients.clear();
|
||||
}
|
||||
|
||||
void osd_t::handle_read(ring_data_t *data, int peer_fd)
|
||||
{
|
||||
int res = data->res;
|
||||
auto cl_it = clients.find(peer_fd);
|
||||
if (cl_it != clients.end())
|
||||
{
|
||||
auto & cl = cl_it->second;
|
||||
if (data->res == -EAGAIN)
|
||||
if (res == -EAGAIN)
|
||||
{
|
||||
cl.read_ready--;
|
||||
if (cl.read_ready > 0)
|
||||
read_ready_clients.push_back(peer_fd);
|
||||
return;
|
||||
}
|
||||
else if (data->res < 0)
|
||||
else if (res < 0)
|
||||
{
|
||||
// this is a client socket, so don't panic. just disconnect it
|
||||
printf("Client %d socket read error: %d (%s). Disconnecting client\n", peer_fd, -data->res, strerror(-data->res));
|
||||
printf("Client %d socket read error: %d (%s). Disconnecting client\n", peer_fd, -res, strerror(-res));
|
||||
stop_client(peer_fd);
|
||||
return;
|
||||
}
|
||||
read_ready_clients.push_back(peer_fd);
|
||||
if (data->res > 0)
|
||||
if (res > 0)
|
||||
{
|
||||
cl.read_remaining -= data->res;
|
||||
cl.read_buf += data->res;
|
||||
cl.read_remaining -= res;
|
||||
cl.read_buf += res;
|
||||
if (cl.read_remaining <= 0)
|
||||
{
|
||||
cl.read_buf = NULL;
|
||||
@@ -92,12 +94,12 @@ void osd_t::handle_read(ring_data_t *data, int peer_fd)
|
||||
cl.read_reply_id = 0;
|
||||
cl.read_state = 0;
|
||||
// Measure subop latency
|
||||
timeval tv_end;
|
||||
gettimeofday(&tv_end, NULL);
|
||||
timespec tv_end;
|
||||
clock_gettime(CLOCK_REALTIME, &tv_end);
|
||||
subop_stat_count[request->req.hdr.opcode]++;
|
||||
subop_stat_sum[request->req.hdr.opcode] += (
|
||||
(tv_end.tv_sec - request->tv_begin.tv_sec)*1000000 +
|
||||
tv_end.tv_usec - request->tv_begin.tv_usec
|
||||
(tv_end.tv_nsec - request->tv_begin.tv_nsec)/1000
|
||||
);
|
||||
request->callback(request);
|
||||
}
|
||||
@@ -192,12 +194,12 @@ void osd_t::handle_reply_hdr(osd_client_t *cl)
|
||||
cl->read_state = 0;
|
||||
cl->sent_ops.erase(req_it);
|
||||
// Measure subop latency
|
||||
timeval tv_end;
|
||||
gettimeofday(&tv_end, NULL);
|
||||
timespec tv_end;
|
||||
clock_gettime(CLOCK_REALTIME, &tv_end);
|
||||
subop_stat_count[op->req.hdr.opcode]++;
|
||||
subop_stat_sum[op->req.hdr.opcode] += (
|
||||
(tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
|
||||
tv_end.tv_usec - op->tv_begin.tv_usec
|
||||
(tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000
|
||||
);
|
||||
op->callback(op);
|
||||
}
|
||||
|
122
osd_send.cpp
122
osd_send.cpp
@@ -5,15 +5,53 @@ void osd_t::outbox_push(osd_client_t & cl, osd_op_t *cur_op)
|
||||
assert(cur_op->peer_fd);
|
||||
if (cur_op->op_type == OSD_OP_OUT)
|
||||
{
|
||||
gettimeofday(&cur_op->tv_begin, NULL);
|
||||
}
|
||||
if (cl.write_state == 0)
|
||||
{
|
||||
cl.write_state = CL_WRITE_READY;
|
||||
write_ready_clients.push_back(cur_op->peer_fd);
|
||||
clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
|
||||
}
|
||||
cl.outbox.push_back(cur_op);
|
||||
ringloop->wakeup();
|
||||
if (cl.write_op || cl.outbox.size() > 1 || !try_send(cl))
|
||||
{
|
||||
if (cl.write_state == 0)
|
||||
{
|
||||
cl.write_state = CL_WRITE_READY;
|
||||
write_ready_clients.push_back(cur_op->peer_fd);
|
||||
}
|
||||
ringloop->wakeup();
|
||||
}
|
||||
}
|
||||
|
||||
bool osd_t::try_send(osd_client_t & cl)
|
||||
{
|
||||
int peer_fd = cl.peer_fd;
|
||||
io_uring_sqe* sqe = ringloop->get_sqe();
|
||||
if (!sqe)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
||||
if (!cl.write_op)
|
||||
{
|
||||
// pick next command
|
||||
cl.write_op = cl.outbox.front();
|
||||
cl.outbox.pop_front();
|
||||
cl.write_state = CL_WRITE_REPLY;
|
||||
clock_gettime(CLOCK_REALTIME, &cl.write_op->tv_send);
|
||||
if (cl.write_op->op_type == OSD_OP_IN)
|
||||
{
|
||||
// Measure execution latency
|
||||
timespec tv_end = cl.write_op->tv_send;
|
||||
op_stat_count[cl.write_op->req.hdr.opcode]++;
|
||||
op_stat_sum[cl.write_op->req.hdr.opcode] += (
|
||||
(tv_end.tv_sec - cl.write_op->tv_begin.tv_sec)*1000000 +
|
||||
(tv_end.tv_nsec - cl.write_op->tv_begin.tv_nsec)/1000
|
||||
);
|
||||
}
|
||||
}
|
||||
cl.write_msg.msg_iov = cl.write_op->send_list.get_iovec();
|
||||
cl.write_msg.msg_iovlen = cl.write_op->send_list.get_size();
|
||||
data->callback = [this, peer_fd](ring_data_t *data) { handle_send(data, peer_fd); };
|
||||
my_uring_prep_sendmsg(sqe, cl.peer_fd_index, &cl.write_msg, 0);
|
||||
sqe->flags |= IOSQE_FIXED_FILE;
|
||||
return true;
|
||||
}
|
||||
|
||||
void osd_t::send_replies()
|
||||
@@ -21,96 +59,66 @@ void osd_t::send_replies()
|
||||
for (int i = 0; i < write_ready_clients.size(); i++)
|
||||
{
|
||||
int peer_fd = write_ready_clients[i];
|
||||
auto & cl = clients[peer_fd];
|
||||
io_uring_sqe* sqe = ringloop->get_sqe();
|
||||
if (!sqe)
|
||||
if (!try_send(clients[peer_fd]))
|
||||
{
|
||||
write_ready_clients.erase(write_ready_clients.begin(), write_ready_clients.begin() + i);
|
||||
return;
|
||||
}
|
||||
ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
||||
if (!cl.write_op)
|
||||
{
|
||||
// pick next command
|
||||
cl.write_op = cl.outbox.front();
|
||||
cl.outbox.pop_front();
|
||||
cl.write_state = CL_WRITE_REPLY;
|
||||
if (cl.write_op->op_type == OSD_OP_OUT)
|
||||
{
|
||||
gettimeofday(&cl.write_op->tv_send, NULL);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Measure execution latency
|
||||
timeval tv_end;
|
||||
gettimeofday(&tv_end, NULL);
|
||||
op_stat_count[cl.write_op->req.hdr.opcode]++;
|
||||
op_stat_sum[cl.write_op->req.hdr.opcode] += (
|
||||
(tv_end.tv_sec - cl.write_op->tv_begin.tv_sec)*1000000 +
|
||||
tv_end.tv_usec - cl.write_op->tv_begin.tv_usec
|
||||
);
|
||||
}
|
||||
}
|
||||
cl.write_msg.msg_iov = cl.write_op->send_list.get_iovec();
|
||||
cl.write_msg.msg_iovlen = cl.write_op->send_list.get_size();
|
||||
data->callback = [this, peer_fd](ring_data_t *data) { handle_send(data, peer_fd); };
|
||||
my_uring_prep_sendmsg(sqe, peer_fd, &cl.write_msg, 0);
|
||||
}
|
||||
write_ready_clients.clear();
|
||||
}
|
||||
|
||||
void osd_t::handle_send(ring_data_t *data, int peer_fd)
|
||||
{
|
||||
int res = data->res;
|
||||
auto cl_it = clients.find(peer_fd);
|
||||
if (cl_it != clients.end())
|
||||
{
|
||||
auto & cl = cl_it->second;
|
||||
if (data->res < 0 && data->res != -EAGAIN)
|
||||
if (res < 0 && res != -EAGAIN)
|
||||
{
|
||||
// this is a client socket, so don't panic. just disconnect it
|
||||
printf("Client %d socket write error: %d (%s). Disconnecting client\n", peer_fd, -data->res, strerror(-data->res));
|
||||
printf("Client %d socket write error: %d (%s). Disconnecting client\n", peer_fd, -res, strerror(-res));
|
||||
stop_client(peer_fd);
|
||||
return;
|
||||
}
|
||||
if (data->res >= 0)
|
||||
if (res >= 0)
|
||||
{
|
||||
osd_op_t *cur_op = cl.write_op;
|
||||
while (data->res > 0 && cur_op->send_list.sent < cur_op->send_list.count)
|
||||
while (res > 0 && cur_op->send_list.sent < cur_op->send_list.count)
|
||||
{
|
||||
iovec & iov = cur_op->send_list.buf[cur_op->send_list.sent];
|
||||
if (iov.iov_len <= data->res)
|
||||
if (iov.iov_len <= res)
|
||||
{
|
||||
data->res -= iov.iov_len;
|
||||
res -= iov.iov_len;
|
||||
cur_op->send_list.sent++;
|
||||
}
|
||||
else
|
||||
{
|
||||
iov.iov_len -= data->res;
|
||||
iov.iov_base += data->res;
|
||||
iov.iov_len -= res;
|
||||
iov.iov_base += res;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (cur_op->send_list.sent >= cur_op->send_list.count)
|
||||
{
|
||||
// Done
|
||||
if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_STABILIZE)
|
||||
{
|
||||
timespec tv_end;
|
||||
clock_gettime(CLOCK_REALTIME, &tv_end);
|
||||
send_stat_count++;
|
||||
send_stat_sum += (
|
||||
(tv_end.tv_sec - cl.write_op->tv_send.tv_sec)*1000000 +
|
||||
(tv_end.tv_nsec - cl.write_op->tv_send.tv_nsec)/1000
|
||||
);
|
||||
}
|
||||
if (cur_op->op_type == OSD_OP_IN)
|
||||
{
|
||||
delete cur_op;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Measure subops with data
|
||||
if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_STABILIZE ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
|
||||
{
|
||||
timeval tv_end;
|
||||
gettimeofday(&tv_end, NULL);
|
||||
send_stat_count++;
|
||||
send_stat_sum += (
|
||||
(tv_end.tv_sec - cl.write_op->tv_send.tv_sec)*1000000 +
|
||||
tv_end.tv_usec - cl.write_op->tv_send.tv_usec
|
||||
);
|
||||
}
|
||||
cl.sent_ops[cl.write_op->req.hdr.id] = cl.write_op;
|
||||
}
|
||||
cl.write_op = NULL;
|
||||
|
126
ringloop.cpp
126
ringloop.cpp
@@ -1,14 +1,19 @@
|
||||
#include <set>
|
||||
|
||||
#include "ringloop.h"
|
||||
|
||||
ring_loop_t::ring_loop_t(int qd)
|
||||
{
|
||||
int ret = io_uring_queue_init(qd, &ring, 0);
|
||||
io_uring_params params = { 0 };
|
||||
params.flags = IORING_SETUP_SQPOLL;
|
||||
params.sq_thread_idle = 10;
|
||||
int ret = io_uring_queue_init_params(qd, &ring, ¶ms);
|
||||
if (ret < 0)
|
||||
{
|
||||
throw std::runtime_error(std::string("io_uring_queue_init: ") + strerror(-ret));
|
||||
}
|
||||
free_ring_data_ptr = *ring.cq.kring_entries;
|
||||
ring_datas = (struct ring_data_t*)malloc(sizeof(ring_data_t) * free_ring_data_ptr);
|
||||
ring_data_total = free_ring_data_ptr = *ring.cq.kring_entries;
|
||||
ring_datas = (ring_data_t*)malloc(sizeof(ring_data_t) * free_ring_data_ptr);
|
||||
free_ring_data = (int*)malloc(sizeof(int) * free_ring_data_ptr);
|
||||
if (!ring_datas || !free_ring_data)
|
||||
{
|
||||
@@ -16,6 +21,7 @@ ring_loop_t::ring_loop_t(int qd)
|
||||
}
|
||||
for (int i = 0; i < free_ring_data_ptr; i++)
|
||||
{
|
||||
ring_datas[i] = { 0 };
|
||||
free_ring_data[i] = i;
|
||||
}
|
||||
}
|
||||
@@ -27,6 +33,105 @@ ring_loop_t::~ring_loop_t()
|
||||
io_uring_queue_exit(&ring);
|
||||
}
|
||||
|
||||
void ring_loop_t::drain_events(void *completions_ptr)
|
||||
{
|
||||
std::set<ring_data_t*> & completions = *((std::set<ring_data_t*> *)completions_ptr);
|
||||
if (free_ring_data_ptr < ring_data_total)
|
||||
{
|
||||
// Try to cancel requests that are allowed to be canceled by the caller (epoll, timerfd and similar)
|
||||
for (int i = 0; i < ring_data_total; i++)
|
||||
{
|
||||
if (ring_datas[i].allow_cancel)
|
||||
{
|
||||
// allow_cancel may only be true while the operation is inflight
|
||||
io_uring_sqe *sqe = get_sqe();
|
||||
if (!sqe)
|
||||
{
|
||||
throw std::runtime_error("can't get SQE to cancel operation");
|
||||
}
|
||||
ring_data_t *data = (ring_data_t*)sqe->user_data;
|
||||
data->callback = NULL;
|
||||
ring_datas[i].res = -ECANCELED;
|
||||
my_uring_prep_cancel(sqe, &ring_datas[i], 0);
|
||||
// It seems (FIXME) cancel operations don't always get completions
|
||||
completions.insert(data);
|
||||
}
|
||||
}
|
||||
if (completions.size() > 0)
|
||||
{
|
||||
submit();
|
||||
}
|
||||
}
|
||||
int inflight = ring_data_total - free_ring_data_ptr;
|
||||
while (completions.size() < inflight)
|
||||
{
|
||||
io_uring_cqe *cqe;
|
||||
while (!io_uring_peek_cqe(&ring, &cqe))
|
||||
{
|
||||
ring_data_t *d = (ring_data_t*)cqe->user_data;
|
||||
d->res = cqe->res;
|
||||
d->allow_cancel = false;
|
||||
completions.insert(d);
|
||||
io_uring_cqe_seen(&ring, cqe);
|
||||
}
|
||||
if (completions.size() < inflight)
|
||||
{
|
||||
wait();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ring_loop_t::run_completions(void *completions_ptr)
|
||||
{
|
||||
std::set<ring_data_t*> & completions = *((std::set<ring_data_t*> *)completions_ptr);
|
||||
// Call event callbacks
|
||||
for (ring_data_t *d: completions)
|
||||
{
|
||||
free_ring_data[free_ring_data_ptr++] = d - ring_datas;
|
||||
if (d->callback)
|
||||
d->callback(d);
|
||||
}
|
||||
}
|
||||
|
||||
int ring_loop_t::register_fd(int fd)
|
||||
{
|
||||
std::set<ring_data_t*> completions;
|
||||
drain_events((void*)&completions);
|
||||
// Modify registered files
|
||||
int idx = reg_fds.size();
|
||||
reg_fds.push_back(fd);
|
||||
if (registered)
|
||||
{
|
||||
io_uring_unregister_files(&ring);
|
||||
}
|
||||
int ret = io_uring_register_files(&ring, reg_fds.data(), reg_fds.size());
|
||||
if (ret != 0)
|
||||
{
|
||||
throw std::runtime_error(std::string("io_uring_register_files_update: ") + strerror(-ret));
|
||||
}
|
||||
registered = 1;
|
||||
run_completions((void*)&completions);
|
||||
return idx;
|
||||
}
|
||||
|
||||
void ring_loop_t::unregister_fd(int fd_index)
|
||||
{
|
||||
std::set<ring_data_t*> completions;
|
||||
drain_events((void*)&completions);
|
||||
// Modify registered files
|
||||
reg_fds.erase(reg_fds.begin()+fd_index, reg_fds.begin()+fd_index+1);
|
||||
if (registered)
|
||||
{
|
||||
io_uring_unregister_files(&ring);
|
||||
}
|
||||
int ret = io_uring_register_files(&ring, reg_fds.data(), reg_fds.size());
|
||||
if (ret != 0)
|
||||
{
|
||||
throw std::runtime_error(std::string("io_uring_register_files_update: ") + strerror(-ret));
|
||||
}
|
||||
run_completions((void*)&completions);
|
||||
}
|
||||
|
||||
int ring_loop_t::register_consumer(ring_consumer_t & consumer)
|
||||
{
|
||||
consumer.number = consumers.size();
|
||||
@@ -50,17 +155,16 @@ void ring_loop_t::unregister_consumer(ring_consumer_t & consumer)
|
||||
|
||||
void ring_loop_t::loop()
|
||||
{
|
||||
struct io_uring_cqe *cqe;
|
||||
io_uring_cqe *cqe;
|
||||
while (!io_uring_peek_cqe(&ring, &cqe))
|
||||
{
|
||||
struct ring_data_t *d = (struct ring_data_t*)cqe->user_data;
|
||||
if (d->callback)
|
||||
{
|
||||
d->res = cqe->res;
|
||||
d->callback(d);
|
||||
}
|
||||
free_ring_data[free_ring_data_ptr++] = d - ring_datas;
|
||||
ring_data_t *d = (ring_data_t*)cqe->user_data;
|
||||
d->res = cqe->res;
|
||||
d->allow_cancel = false;
|
||||
io_uring_cqe_seen(&ring, cqe);
|
||||
free_ring_data[free_ring_data_ptr++] = d - ring_datas;
|
||||
if (d->callback)
|
||||
d->callback(d);
|
||||
}
|
||||
do
|
||||
{
|
||||
|
18
ringloop.h
18
ringloop.h
@@ -107,6 +107,7 @@ static inline void my_uring_prep_cancel(struct io_uring_sqe *sqe, void *user_dat
|
||||
struct ring_data_t
|
||||
{
|
||||
struct iovec iov; // for single-entry read/write operations
|
||||
bool allow_cancel;
|
||||
int res;
|
||||
std::function<void(ring_data_t*)> callback;
|
||||
};
|
||||
@@ -122,22 +123,35 @@ class ring_loop_t
|
||||
std::vector<ring_consumer_t> consumers;
|
||||
struct ring_data_t *ring_datas;
|
||||
int *free_ring_data;
|
||||
unsigned free_ring_data_ptr;
|
||||
unsigned free_ring_data_ptr, ring_data_total;
|
||||
bool loop_again;
|
||||
struct io_uring ring;
|
||||
int registered = 0;
|
||||
std::vector<int> reg_fds;
|
||||
void drain_events(void *completions_ptr);
|
||||
void run_completions(void *completions_ptr);
|
||||
|
||||
public:
|
||||
ring_loop_t(int qd);
|
||||
~ring_loop_t();
|
||||
int register_consumer(ring_consumer_t & consumer);
|
||||
void unregister_consumer(ring_consumer_t & consumer);
|
||||
|
||||
int register_fd(int fd);
|
||||
void unregister_fd(int fd_index);
|
||||
|
||||
inline struct io_uring_sqe* get_sqe()
|
||||
{
|
||||
if (free_ring_data_ptr == 0)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
struct io_uring_sqe* sqe = io_uring_get_sqe(&ring);
|
||||
if (sqe)
|
||||
io_uring_sqe_set_data(sqe, ring_datas + free_ring_data[--free_ring_data_ptr]);
|
||||
{
|
||||
ring_data_t *data = ring_datas + free_ring_data[--free_ring_data_ptr];
|
||||
io_uring_sqe_set_data(sqe, data);
|
||||
}
|
||||
return sqe;
|
||||
}
|
||||
inline int submit()
|
||||
|
@@ -50,3 +50,37 @@ int write_blocking(int fd, void *write_buf, size_t remaining)
|
||||
}
|
||||
return done;
|
||||
}
|
||||
|
||||
int writev_blocking(int fd, iovec *iov, int iovcnt)
|
||||
{
|
||||
int v = 0;
|
||||
int done = 0;
|
||||
while (v < iovcnt)
|
||||
{
|
||||
ssize_t r = writev(fd, iov, iovcnt);
|
||||
if (r < 0)
|
||||
{
|
||||
if (errno != EAGAIN && errno != EPIPE)
|
||||
{
|
||||
perror("writev");
|
||||
exit(1);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
while (v < iovcnt)
|
||||
{
|
||||
if (iov[v].iov_len > r)
|
||||
{
|
||||
iov[v].iov_len -= r;
|
||||
iov[v].iov_base += r;
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
v++;
|
||||
}
|
||||
}
|
||||
done += r;
|
||||
}
|
||||
return done;
|
||||
}
|
||||
|
@@ -1,6 +1,8 @@
|
||||
#pragma once
|
||||
|
||||
#include <unistd.h>
|
||||
#include <sys/uio.h>
|
||||
|
||||
int read_blocking(int fd, void *read_buf, size_t remaining);
|
||||
int write_blocking(int fd, void *write_buf, size_t remaining);
|
||||
int writev_blocking(int fd, iovec *iov, int iovcnt);
|
||||
|
148
stub_bench.cpp
Normal file
148
stub_bench.cpp
Normal file
@@ -0,0 +1,148 @@
|
||||
/**
|
||||
* Stub benchmarker
|
||||
*/
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <time.h>
|
||||
#include <sys/socket.h>
|
||||
#include <netinet/in.h>
|
||||
#include <netinet/tcp.h>
|
||||
#include <arpa/inet.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <stdlib.h>
|
||||
#include <signal.h>
|
||||
|
||||
#include <stdexcept>
|
||||
|
||||
#include "rw_blocking.h"
|
||||
#include "osd_ops.h"
|
||||
|
||||
int connect_stub(const char *server_address, int server_port);
|
||||
|
||||
void run_bench(int peer_fd);
|
||||
|
||||
static uint64_t write_sum = 0, write_count = 0;
|
||||
static uint64_t sync_sum = 0, sync_count = 0;
|
||||
|
||||
void handle_sigint(int sig)
|
||||
{
|
||||
printf("4k randwrite: %lu us avg\n", write_sum/write_count);
|
||||
printf("sync: %lu us avg\n", sync_sum/sync_count);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
int main(int narg, char *args[])
|
||||
{
|
||||
signal(SIGINT, handle_sigint);
|
||||
int peer_fd = connect_stub("127.0.0.1", 11203);
|
||||
run_bench(peer_fd);
|
||||
close(peer_fd);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int connect_stub(const char *server_address, int server_port)
|
||||
{
|
||||
struct sockaddr_in addr;
|
||||
int r;
|
||||
if ((r = inet_pton(AF_INET, server_address, &addr.sin_addr)) != 1)
|
||||
{
|
||||
fprintf(stderr, "server address: %s%s\n", server_address, r == 0 ? " is not valid" : ": no ipv4 support");
|
||||
return -1;
|
||||
}
|
||||
addr.sin_family = AF_INET;
|
||||
addr.sin_port = htons(server_port);
|
||||
int connect_fd = socket(AF_INET, SOCK_STREAM, 0);
|
||||
if (connect_fd < 0)
|
||||
{
|
||||
perror("socket");
|
||||
return -1;
|
||||
}
|
||||
if (connect(connect_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
|
||||
{
|
||||
perror("connect");
|
||||
return -1;
|
||||
}
|
||||
int one = 1;
|
||||
setsockopt(connect_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
|
||||
return connect_fd;
|
||||
}
|
||||
|
||||
bool check_reply(int r, osd_any_op_t & op, osd_any_reply_t & reply, int expected)
|
||||
{
|
||||
if (r != OSD_PACKET_SIZE)
|
||||
{
|
||||
printf("read failed\n");
|
||||
return false;
|
||||
}
|
||||
if (reply.hdr.magic != SECONDARY_OSD_REPLY_MAGIC ||
|
||||
reply.hdr.id != op.hdr.id || reply.hdr.opcode != op.hdr.opcode)
|
||||
{
|
||||
printf("bad reply: magic, id or opcode does not match request\n");
|
||||
return false;
|
||||
}
|
||||
if (reply.hdr.retval != expected)
|
||||
{
|
||||
printf("operation failed, retval=%ld (%s)\n", reply.hdr.retval, strerror(-reply.hdr.retval));
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void run_bench(int peer_fd)
|
||||
{
|
||||
osd_any_op_t op;
|
||||
osd_any_reply_t reply;
|
||||
void *buf = NULL;
|
||||
int r;
|
||||
timespec tv_begin, tv_end;
|
||||
clock_gettime(CLOCK_REALTIME, &tv_begin);
|
||||
while (1)
|
||||
{
|
||||
// write
|
||||
op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
|
||||
op.hdr.id = 1;
|
||||
op.hdr.opcode = OSD_OP_SECONDARY_WRITE;
|
||||
op.sec_rw.oid.inode = 3;
|
||||
op.sec_rw.oid.stripe = (rand() << 17) % (1 << 29); // 512 MB
|
||||
op.sec_rw.version = 0;
|
||||
op.sec_rw.len = 4096;
|
||||
op.sec_rw.offset = (rand() * op.sec_rw.len) % (1 << 17);
|
||||
buf = malloc(op.sec_rw.len);
|
||||
memset(buf, rand() % 255, op.sec_rw.len);
|
||||
r = write_blocking(peer_fd, op.buf, OSD_PACKET_SIZE) == OSD_PACKET_SIZE;
|
||||
if (r)
|
||||
r = write_blocking(peer_fd, buf, op.sec_rw.len) == op.sec_rw.len;
|
||||
free(buf);
|
||||
if (!r)
|
||||
break;
|
||||
r = read_blocking(peer_fd, reply.buf, OSD_PACKET_SIZE);
|
||||
if (!check_reply(r, op, reply, op.sec_rw.len))
|
||||
break;
|
||||
clock_gettime(CLOCK_REALTIME, &tv_end);
|
||||
write_count++;
|
||||
write_sum += (
|
||||
(tv_end.tv_sec - tv_begin.tv_sec)*1000000 +
|
||||
tv_end.tv_nsec/1000 - tv_begin.tv_nsec/1000
|
||||
);
|
||||
// sync/stab
|
||||
op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
|
||||
op.hdr.id = 1;
|
||||
op.hdr.opcode = OSD_OP_TEST_SYNC_STAB_ALL;
|
||||
r = write_blocking(peer_fd, op.buf, OSD_PACKET_SIZE) == OSD_PACKET_SIZE;
|
||||
if (!r)
|
||||
break;
|
||||
r = read_blocking(peer_fd, reply.buf, OSD_PACKET_SIZE);
|
||||
if (!check_reply(r, op, reply, 0))
|
||||
break;
|
||||
clock_gettime(CLOCK_REALTIME, &tv_begin);
|
||||
sync_count++;
|
||||
sync_sum += (
|
||||
(tv_begin.tv_sec - tv_end.tv_sec)*1000000 +
|
||||
tv_begin.tv_nsec/1000 - tv_end.tv_nsec/1000
|
||||
);
|
||||
}
|
||||
}
|
@@ -21,6 +21,7 @@ timerfd_interval::timerfd_interval(ring_loop_t *ringloop, int seconds, std::func
|
||||
}
|
||||
consumer.loop = [this]() { loop(); };
|
||||
ringloop->register_consumer(consumer);
|
||||
timerfd_index = ringloop->register_fd(timerfd);
|
||||
this->ringloop = ringloop;
|
||||
this->callback = cb;
|
||||
}
|
||||
@@ -44,10 +45,12 @@ void timerfd_interval::loop()
|
||||
return;
|
||||
}
|
||||
struct ring_data_t *data = ((ring_data_t*)sqe->user_data);
|
||||
my_uring_prep_poll_add(sqe, timerfd, POLLIN);
|
||||
my_uring_prep_poll_add(sqe, timerfd_index, POLLIN);
|
||||
sqe->flags |= IOSQE_FIXED_FILE;
|
||||
data->allow_cancel = true;
|
||||
data->callback = [&](ring_data_t *data)
|
||||
{
|
||||
if (data->res < 0)
|
||||
if (data->res < 0 && data->res != -ECANCELED)
|
||||
{
|
||||
throw std::runtime_error(std::string("waiting for timer failed: ") + strerror(-data->res));
|
||||
}
|
||||
|
@@ -5,7 +5,7 @@
|
||||
class timerfd_interval
|
||||
{
|
||||
int wait_state;
|
||||
int timerfd;
|
||||
int timerfd, timerfd_index;
|
||||
int status;
|
||||
ring_loop_t *ringloop;
|
||||
ring_consumer_t consumer;
|
||||
|
Reference in New Issue
Block a user