Compare commits

...

26 Commits

Author SHA1 Message Date
735b97fe33 Trace I/O operations (SQEs, recvmsg/sendmsg, uring_submit) 2020-06-09 00:52:29 +03:00
d56633843f Replace io_uring sendmsg/recvmsg with synchronous sendmsg/recvmsg 2020-06-09 00:52:29 +03:00
4dde8b8a42 Oops, fix fio_sec_osd block_order parsing 2020-06-09 00:52:00 +03:00
f5ccb154af Benchmark reads in stub_bench, too 2020-06-08 01:54:44 +03:00
73c80e2c39 Move accept_connections() to osd_messenger_t, add a simple uring OSD stub 2020-06-08 01:32:16 +03:00
437dc5b630 Implement a FIO engine for testing cluster I/O 2020-06-07 00:30:15 +03:00
226f5a2945 Allow to override block_size in fio_sec_osd 2020-06-07 00:10:13 +03:00
2187d06eac Add a parameter to pass the initial config to client 2020-06-07 00:10:12 +03:00
c573bc6bb3 (Probably almost) implement cluster client 2020-06-07 00:09:36 +03:00
2f6cf605a1 Rename cluster_client to osd_messenger 2020-06-04 12:57:54 +03:00
05ea97119f Fix BS_OP_LIST to account for deleted objects: only list the newest stable entry of each object
This allows list responses to be unaffected by journal flushes, which, in turn,
fixes PG peering when a peer OSD is replaying journal and journal contains deletions
2020-06-02 23:52:48 +03:00
571be0f380 Make deletions instantly stable
"2-phase" (write->stabilize) process is pointless for deletions because it
doesn't protect us from incomplete objects. This happens because it removes
the version information from metadata after stabilization. Deletions require
"3-phase" process with a potentially very long 3rd phase.

So, deletions will be allowed to generate degraded and incomplete objects.
To keep this from affecting users' ability to delete data, the cluster
will allow deleting whole inodes while storing a list of them in etcd.
Proper TRIM will remain impossible until the aforementioned "3-phase"
process is implemented, though.

By the way, this change also fixes a possible write stall after rebalancing
which was caused by the lack of "stabilize delete" operations.
2020-06-02 23:45:22 +03:00
985c309d7f Remove duplicate code between blockstore_{rollback,stable} and blockstore_init 2020-06-02 20:37:00 +03:00
a56f8cd14e Simplify handle_primary_subop() arguments 2020-06-02 18:44:23 +03:00
46e111272f Replace assert(this_it == cur_op) with if() for the case of PG repeering 2020-06-02 14:30:57 +03:00
165c204555 Fix BS_OP_DELETE (the implementation was untested up to this point) 2020-06-02 14:26:01 +03:00
af5cd45071 Oh crap, got SIGPIPE. Add MSG_NOSIGNAL 2020-06-02 11:41:08 +03:00
c3fe9ad0d1 Fix rebalancing writes (add a forgotten state resume) 2020-06-02 01:26:14 +03:00
0fcdeae18b Do not die if a peer is already stopped on flush error 2020-06-01 23:07:08 +03:00
e6a4b634f8 Fix possible write stall
The stall occurred during fio Q=128 random write tests with a low flusher_count (4).
It was caused by flushers being unable to flush the beginning of the journal
because it contained older writes to an object that also had writes at the very end
of the journal, after dirty_start.
2020-06-01 16:18:23 +03:00
c22e096943 Output journal offsets in debug trace in hex, add detailed "still waiting" messages 2020-06-01 16:18:19 +03:00
45b1c2fbf1 Fix canceling of write operations on PG re-peer (which led to use-after-free, too...) 2020-06-01 16:18:14 +03:00
3469bead67 Protect "delete this" with a stack refcounter
(to fix use-after-free, too, but "delete this" was a time bomb anyway)
2020-06-01 16:18:09 +03:00
3a5d488f19 Fix use-after-free in osd_flush.cpp 2020-06-01 01:56:24 +03:00
73e4e30b1f Auto-generate C++ header dependencies 2020-06-01 00:25:25 +03:00
5feff1ffb9 Slightly cleanup socket send/receive code 2020-05-31 15:03:27 +03:00
40 changed files with 2762 additions and 1190 deletions

46
Make-gen.pl Executable file
View File

@@ -0,0 +1,46 @@
#!/usr/bin/perl
# Prints Makefile rules for every *.cpp file in the current directory that has
# at least one `#include "..."` line. Each rule lists the .cpp file plus all
# quoted headers it pulls in, directly or through other scanned files.
use strict;

# $includes->{FILE}->{HEADER} = 1 for each `#include "HEADER"` found in FILE
my $includes = {};
foreach my $hit (split /\n/, `grep '^#include "' *.cpp *.h`)
{
    # grep prints matches as <file>:#include "<header>"
    next unless $hit =~ /^([^:]+):\#include "([^"]+)"/s;
    $includes->{$1}->{$2} = 1;
}

# Expand to the transitive closure: keep merging the include sets of included
# files into their includers until a full pass adds nothing new.
my $changed = 1;
while ($changed)
{
    $changed = 0;
    foreach my $src (keys %$includes)
    {
        my $own = $includes->{$src};
        foreach my $inc (keys %$own)
        {
            # Only files that were scanned themselves have an include set
            my $nested = $includes->{$inc} or next;
            foreach my $sub (keys %$nested)
            {
                next if $own->{$sub};
                $own->{$sub} = 1;
                $changed = 1;
            }
        }
    }
}

# Emit one compile rule per .cpp file, in sorted order for stable output
foreach my $src (sort keys %$includes)
{
    next unless $src =~ /\.cpp$/;
    (my $obj = $src) =~ s/\.cpp$/.o/s;
    my @prereqs = sort keys %{$includes->{$src}};
    print "$obj: $src ".join(" ", @prereqs)."\n";
    print "\tg++ \$(CXXFLAGS) -c -o \$\@ \$\<\n";
}

191
Makefile
View File

@@ -2,85 +2,44 @@ BLOCKSTORE_OBJS := allocator.o blockstore.o blockstore_impl.o blockstore_init.o
blockstore_write.o blockstore_sync.o blockstore_stable.o blockstore_rollback.o blockstore_flush.o crc32c.o ringloop.o
# -fsanitize=address
CXXFLAGS := -g -O3 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fPIC -fdiagnostics-color=always
all: $(BLOCKSTORE_OBJS) libfio_blockstore.so osd libfio_sec_osd.so stub_osd stub_bench osd_test dump_journal
all: libfio_blockstore.so osd libfio_sec_osd.so libfio_cluster.so stub_osd stub_uring_osd stub_bench osd_test dump_journal
clean:
rm -f *.o
crc32c.o: crc32c.c
g++ $(CXXFLAGS) -c -o $@ $<
json11.o: json11/json11.cpp
g++ $(CXXFLAGS) -c -o json11.o json11/json11.cpp
allocator.o: allocator.cpp allocator.h
g++ $(CXXFLAGS) -c -o $@ $<
ringloop.o: ringloop.cpp ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
timerfd_interval.o: timerfd_interval.cpp timerfd_interval.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
timerfd_manager.o: timerfd_manager.cpp timerfd_manager.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
%.o: %.cpp allocator.h blockstore_flush.h blockstore.h blockstore_impl.h blockstore_init.h blockstore_journal.h crc32c.h ringloop.h object_id.h
g++ $(CXXFLAGS) -c -o $@ $<
dump_journal: dump_journal.cpp crc32c.o blockstore_journal.h
g++ $(CXXFLAGS) -o $@ $< crc32c.o
libblockstore.so: $(BLOCKSTORE_OBJS)
g++ $(CXXFLAGS) -o libblockstore.so -shared $(BLOCKSTORE_OBJS) -ltcmalloc_minimal -luring
libfio_blockstore.so: ./libblockstore.so fio_engine.cpp json11.o
g++ $(CXXFLAGS) -shared -o libfio_blockstore.so fio_engine.cpp json11.o ./libblockstore.so -ltcmalloc_minimal -luring
g++ $(CXXFLAGS) -o $@ -shared $(BLOCKSTORE_OBJS) -ltcmalloc_minimal -luring
libfio_blockstore.so: ./libblockstore.so fio_engine.o json11.o
g++ $(CXXFLAGS) -shared -o $@ fio_engine.o json11.o ./libblockstore.so -ltcmalloc_minimal -luring
OSD_OBJS := osd.o osd_secondary.o osd_receive.o osd_send.o osd_peering.o osd_flush.o osd_peering_pg.o \
osd_primary.o osd_primary_subops.o etcd_state_client.o cluster_client.o osd_cluster.o http_client.o pg_states.o \
OSD_OBJS := osd.o osd_secondary.o msgr_receive.o msgr_send.o osd_peering.o osd_flush.o osd_peering_pg.o \
osd_primary.o osd_primary_subops.o etcd_state_client.o messenger.o osd_cluster.o http_client.o pg_states.o \
osd_rmw.o json11.o base64.o timerfd_manager.o
base64.o: base64.cpp base64.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_secondary.o: osd_secondary.cpp osd.h osd_ops.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_receive.o: osd_receive.cpp osd.h osd_ops.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_send.o: osd_send.cpp osd.h osd_ops.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_peering.o: osd_peering.cpp osd.h osd_ops.h osd_peering_pg.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_cluster.o: osd_cluster.cpp osd.h osd_ops.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
http_client.o: http_client.cpp http_client.h
g++ $(CXXFLAGS) -c -o $@ $<
etcd_state_client.o: etcd_state_client.cpp etcd_state_client.h http_client.h pg_states.h
g++ $(CXXFLAGS) -c -o $@ $<
cluster_client.o: cluster_client.cpp cluster_client.h osd_ops.h timerfd_manager.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_flush.o: osd_flush.cpp osd.h osd_ops.h osd_peering_pg.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_peering_pg.o: osd_peering_pg.cpp object_id.h osd_peering_pg.h pg_states.h
g++ $(CXXFLAGS) -c -o $@ $<
pg_states.o: pg_states.cpp pg_states.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_rmw.o: osd_rmw.cpp osd_rmw.h xor.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_rmw_test: osd_rmw_test.cpp osd_rmw.cpp osd_rmw.h xor.h
g++ $(CXXFLAGS) -o $@ $<
osd_primary.o: osd_primary.cpp osd_primary.h osd_rmw.h osd.h osd_ops.h osd_peering_pg.h xor.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_primary_subops.o: osd_primary_subops.cpp osd_primary.h osd_rmw.h osd.h osd_ops.h osd_peering_pg.h xor.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
osd.o: osd.cpp osd.h http_client.h osd_ops.h osd_peering_pg.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
osd: ./libblockstore.so osd_main.cpp osd.h osd_ops.h $(OSD_OBJS)
g++ $(CXXFLAGS) -o osd osd_main.cpp $(OSD_OBJS) ./libblockstore.so -ltcmalloc_minimal -luring
stub_osd: stub_osd.cpp osd_ops.h rw_blocking.o
g++ $(CXXFLAGS) -o stub_osd stub_osd.cpp rw_blocking.o -ltcmalloc_minimal
g++ $(CXXFLAGS) -o $@ osd_main.cpp $(OSD_OBJS) ./libblockstore.so -ltcmalloc_minimal -luring
stub_osd: stub_osd.o rw_blocking.o
g++ $(CXXFLAGS) -o $@ stub_osd.o rw_blocking.o -ltcmalloc_minimal
STUB_URING_OSD_OBJS := stub_uring_osd.o epoll_manager.o messenger.o msgr_send.o msgr_receive.o ringloop.o timerfd_manager.o json11.o
stub_uring_osd: $(STUB_URING_OSD_OBJS)
g++ $(CXXFLAGS) -o $@ -ltcmalloc_minimal $(STUB_URING_OSD_OBJS) -luring
stub_bench: stub_bench.cpp osd_ops.h rw_blocking.o
g++ $(CXXFLAGS) -o stub_bench stub_bench.cpp rw_blocking.o -ltcmalloc_minimal
rw_blocking.o: rw_blocking.cpp rw_blocking.h
g++ $(CXXFLAGS) -c -o $@ $<
g++ $(CXXFLAGS) -o $@ stub_bench.cpp rw_blocking.o -ltcmalloc_minimal
osd_test: osd_test.cpp osd_ops.h rw_blocking.o
g++ $(CXXFLAGS) -o osd_test osd_test.cpp rw_blocking.o -ltcmalloc_minimal
g++ $(CXXFLAGS) -o $@ osd_test.cpp rw_blocking.o -ltcmalloc_minimal
osd_peering_pg_test: osd_peering_pg_test.cpp osd_peering_pg.o
g++ $(CXXFLAGS) -o $@ $< osd_peering_pg.o -ltcmalloc_minimal
libfio_sec_osd.so: fio_sec_osd.cpp osd_ops.h rw_blocking.o
g++ $(CXXFLAGS) -ltcmalloc_minimal -shared -o libfio_sec_osd.so fio_sec_osd.cpp rw_blocking.o -luring
libfio_sec_osd.so: fio_sec_osd.o rw_blocking.o
g++ $(CXXFLAGS) -ltcmalloc_minimal -shared -o $@ fio_sec_osd.o rw_blocking.o
FIO_CLUSTER_OBJS := fio_cluster.o cluster_client.o epoll_manager.o etcd_state_client.o \
messenger.o msgr_send.o msgr_receive.o ringloop.o json11.o http_client.o pg_states.o timerfd_manager.o base64.o
libfio_cluster.so: $(FIO_CLUSTER_OBJS)
g++ $(CXXFLAGS) -ltcmalloc_minimal -shared -o $@ $(FIO_CLUSTER_OBJS) -luring
test_blockstore: ./libblockstore.so test_blockstore.cpp timerfd_interval.o
g++ $(CXXFLAGS) -o test_blockstore test_blockstore.cpp timerfd_interval.o ./libblockstore.so -ltcmalloc_minimal -luring
@@ -88,3 +47,107 @@ test: test.cpp osd_peering_pg.o
g++ $(CXXFLAGS) -o test test.cpp osd_peering_pg.o -luring -lm
test_allocator: test_allocator.cpp allocator.o
g++ $(CXXFLAGS) -o test_allocator test_allocator.cpp allocator.o
crc32c.o: crc32c.c crc32c.h
g++ $(CXXFLAGS) -c -o $@ $<
json11.o: json11/json11.cpp
g++ $(CXXFLAGS) -c -o json11.o json11/json11.cpp
# Autogenerated
allocator.o: allocator.cpp allocator.h
g++ $(CXXFLAGS) -c -o $@ $<
base64.o: base64.cpp base64.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore.o: blockstore.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_flush.o: blockstore_flush.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_impl.o: blockstore_impl.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_init.o: blockstore_init.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_journal.o: blockstore_journal.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_open.o: blockstore_open.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_read.o: blockstore_read.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_rollback.o: blockstore_rollback.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_stable.o: blockstore_stable.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_sync.o: blockstore_sync.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_write.o: blockstore_write.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
cluster_client.o: cluster_client.cpp cluster_client.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
dump_journal.o: dump_journal.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
epoll_manager.o: epoll_manager.cpp epoll_manager.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
etcd_state_client.o: etcd_state_client.cpp base64.h etcd_state_client.h http_client.h json11/json11.hpp object_id.h osd_id.h osd_ops.h pg_states.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
fio_cluster.o: fio_cluster.cpp cluster_client.h epoll_manager.h etcd_state_client.h fio/fio.h fio/optgroup.h http_client.h json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
fio_engine.o: fio_engine.cpp blockstore.h fio/fio.h fio/optgroup.h json11/json11.hpp object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
fio_sec_osd.o: fio_sec_osd.cpp fio/fio.h fio/optgroup.h object_id.h osd_id.h osd_ops.h rw_blocking.h
g++ $(CXXFLAGS) -c -o $@ $<
http_client.o: http_client.cpp http_client.h json11/json11.hpp timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
messenger.o: messenger.cpp json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
msgr_receive.o: msgr_receive.cpp json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
msgr_send.o: msgr_send.cpp json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd.o: osd.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_cluster.o: osd_cluster.cpp base64.h blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_flush.o: osd_flush.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_main.o: osd_main.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_peering.o: osd_peering.cpp base64.h blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_peering_pg.o: osd_peering_pg.cpp cpp-btree/btree_map.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_peering_pg_test.o: osd_peering_pg_test.cpp cpp-btree/btree_map.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_primary.o: osd_primary.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h osd_primary.h osd_rmw.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_primary_subops.o: osd_primary_subops.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h osd_primary.h osd_rmw.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_rmw.o: osd_rmw.cpp object_id.h osd_id.h osd_rmw.h xor.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_rmw_test.o: osd_rmw_test.cpp object_id.h osd_id.h osd_rmw.cpp osd_rmw.h test_pattern.h xor.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_secondary.o: osd_secondary.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_test.o: osd_test.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h test_pattern.h
g++ $(CXXFLAGS) -c -o $@ $<
pg_states.o: pg_states.cpp pg_states.h
g++ $(CXXFLAGS) -c -o $@ $<
ringloop.o: ringloop.cpp ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
rw_blocking.o: rw_blocking.cpp rw_blocking.h
g++ $(CXXFLAGS) -c -o $@ $<
stub_bench.o: stub_bench.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h
g++ $(CXXFLAGS) -c -o $@ $<
stub_osd.o: stub_osd.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h
g++ $(CXXFLAGS) -c -o $@ $<
stub_uring_osd.o: stub_uring_osd.cpp epoll_manager.h json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
test.o: test.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
test_allocator.o: test_allocator.cpp allocator.h
g++ $(CXXFLAGS) -c -o $@ $<
test_blockstore.o: test_blockstore.cpp blockstore.h object_id.h ringloop.h timerfd_interval.h
g++ $(CXXFLAGS) -c -o $@ $<
timerfd_interval.o: timerfd_interval.cpp ringloop.h timerfd_interval.h
g++ $(CXXFLAGS) -c -o $@ $<
timerfd_manager.o: timerfd_manager.cpp timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<

View File

@@ -7,8 +7,8 @@ journal_flusher_t::journal_flusher_t(int flusher_count, blockstore_impl_t *bs)
dequeuing = false;
active_flushers = 0;
syncing_flushers = 0;
sync_threshold = bs->journal_block_size / sizeof(journal_entry_stable);
journal_trim_interval = sync_threshold;
flusher_start_threshold = bs->journal_block_size / sizeof(journal_entry_stable);
journal_trim_interval = flusher_start_threshold;
journal_trim_counter = 0;
journal_superblock = bs->journal.inmemory ? bs->journal.buffer : memalign(MEM_ALIGNMENT, bs->journal_block_size);
co = new journal_flusher_co[flusher_count];
@@ -33,6 +33,12 @@ journal_flusher_co::journal_flusher_co()
);
}
wait_count--;
if (!wait_count)
{
timespec now;
clock_gettime(CLOCK_REALTIME, &now);
printf("finished %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
}
};
simple_callback_w = [this](ring_data_t* data)
{
@@ -45,6 +51,12 @@ journal_flusher_co::journal_flusher_co()
);
}
wait_count--;
if (!wait_count)
{
timespec now;
clock_gettime(CLOCK_REALTIME, &now);
printf("finished %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
}
};
}
@@ -81,7 +93,7 @@ void journal_flusher_t::enqueue_flush(obj_ver_id ov)
flush_versions[ov.oid] = ov.version;
flush_queue.push_back(ov.oid);
}
if (!dequeuing && flush_queue.size() >= sync_threshold)
if (!dequeuing && flush_queue.size() >= flusher_start_threshold)
{
dequeuing = true;
bs->ringloop->wakeup();
@@ -101,21 +113,32 @@ void journal_flusher_t::unshift_flush(obj_ver_id ov)
flush_versions[ov.oid] = ov.version;
flush_queue.push_front(ov.oid);
}
if (!dequeuing && flush_queue.size() >= sync_threshold)
if (!dequeuing && flush_queue.size() >= flusher_start_threshold)
{
dequeuing = true;
bs->ringloop->wakeup();
}
}
void journal_flusher_t::force_start()
void journal_flusher_t::request_trim()
{
dequeuing = true;
trim_wanted++;
bs->ringloop->wakeup();
}
void journal_flusher_t::release_trim()
{
trim_wanted--;
}
#define await_sqe(label) \
resume_##label:\
{\
timespec now;\
clock_gettime(CLOCK_REALTIME, &now);\
printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);\
}\
sqe = bs->get_sqe();\
if (!sqe)\
{\
@@ -181,11 +204,71 @@ resume_0:
(bs->journal.dirty_start >= bs->journal.used_start ||
dirty_end->second.journal_sector < bs->journal.used_start))
{
// We can't flush journal sectors that are still written to
flusher->enqueue_flush(cur);
flusher->dequeuing = false;
wait_state = 0;
return true;
// We can't flush journal sectors that are still written to
// However, as we group flushes by oid, current oid may have older writes to flush!
// And it may even block writes if we don't flush the older version
// (if it's in the beginning of the journal)...
// So first try to find an older version of the same object to flush.
bool found = false;
while (dirty_end != bs->dirty_db.begin())
{
dirty_end--;
if (dirty_end->first.oid != cur.oid)
{
break;
}
if (!(dirty_end->second.journal_sector >= bs->journal.dirty_start &&
(bs->journal.dirty_start >= bs->journal.used_start ||
dirty_end->second.journal_sector < bs->journal.used_start)))
{
found = true;
cur.version = dirty_end->first.version;
break;
}
}
if (!found)
{
// Try other objects
int search_left = flusher->flush_queue.size() - 1;
#ifdef BLOCKSTORE_DEBUG
printf("Flusher overran writers (dirty_start=%08lx) - searching for older flushes (%d left)\n", bs->journal.dirty_start, search_left);
#endif
while (search_left > 0)
{
cur.oid = flusher->flush_queue.front();
cur.version = flusher->flush_versions[cur.oid];
flusher->flush_queue.pop_front();
flusher->flush_versions.erase(cur.oid);
dirty_end = bs->dirty_db.find(cur);
if (dirty_end != bs->dirty_db.end())
{
if (dirty_end->second.journal_sector >= bs->journal.dirty_start &&
(bs->journal.dirty_start >= bs->journal.used_start ||
dirty_end->second.journal_sector < bs->journal.used_start))
{
#ifdef BLOCKSTORE_DEBUG
printf("Write %lu:%lu v%lu is too new: offset=%08lx\n", cur.oid.inode, cur.oid.stripe, cur.version, dirty_end->second.journal_sector);
#endif
flusher->enqueue_flush(cur);
}
else
{
break;
}
}
search_left--;
}
if (search_left <= 0)
{
#ifdef BLOCKSTORE_DEBUG
printf("No older flushes, stopping\n");
#endif
flusher->dequeuing = false;
wait_state = 0;
return true;
}
}
}
repeat_it = flusher->sync_to_repeat.find(cur.oid);
if (repeat_it != flusher->sync_to_repeat.end())
@@ -209,32 +292,26 @@ resume_0:
#endif
flusher->active_flushers++;
resume_1:
// Find it in clean_db
clean_it = bs->clean_db.find(cur.oid);
old_clean_loc = (clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX);
// Scan dirty versions of the object
if (!scan_dirty(1))
{
wait_state += 1;
return false;
}
if (copy_count == 0 && clean_loc == UINT64_MAX && !has_delete && !has_empty)
// Writes and deletes shouldn't happen at the same time
assert(!(copy_count > 0 || has_writes) || !has_delete);
if (copy_count == 0 && !has_writes && !has_delete || has_delete && old_clean_loc == UINT64_MAX)
{
// Nothing to flush
flusher->active_flushers--;
repeat_it = flusher->sync_to_repeat.find(cur.oid);
if (repeat_it != flusher->sync_to_repeat.end() && repeat_it->second > cur.version)
{
// Requeue version
flusher->unshift_flush({ .oid = cur.oid, .version = repeat_it->second });
}
flusher->sync_to_repeat.erase(repeat_it);
wait_state = 0;
goto resume_0;
bs->erase_dirty(dirty_start, std::next(dirty_end), clean_loc);
goto trim_journal;
}
// Find it in clean_db
clean_it = bs->clean_db.find(cur.oid);
old_clean_loc = (clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX);
if (clean_loc == UINT64_MAX)
{
if (copy_count > 0 && has_delete || old_clean_loc == UINT64_MAX)
if (old_clean_loc == UINT64_MAX)
{
// Object not allocated. This is a bug.
char err[1024];
@@ -405,8 +482,9 @@ resume_1:
}
// Update clean_db and dirty_db, free old data locations
update_clean_db();
trim_journal:
// Clear unused part of the journal every <journal_trim_interval> flushes
if (!((++flusher->journal_trim_counter) % flusher->journal_trim_interval))
if (!((++flusher->journal_trim_counter) % flusher->journal_trim_interval) || flusher->trim_wanted > 0)
{
flusher->journal_trim_counter = 0;
if (bs->journal.trim())
@@ -436,7 +514,7 @@ resume_1:
}
// All done
#ifdef BLOCKSTORE_DEBUG
printf("Flushed %lu:%lu v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version);
printf("Flushed %lu:%lu v%lu (%ld left)\n", cur.oid.inode, cur.oid.stripe, cur.version, flusher->flush_queue.size());
#endif
flusher->active_flushers--;
repeat_it = flusher->sync_to_repeat.find(cur.oid);
@@ -464,7 +542,7 @@ bool journal_flusher_co::scan_dirty(int wait_base)
copy_count = 0;
clean_loc = UINT64_MAX;
has_delete = false;
has_empty = false;
has_writes = false;
skip_copy = false;
clean_init_bitmap = false;
while (1)
@@ -472,11 +550,8 @@ bool journal_flusher_co::scan_dirty(int wait_base)
if (dirty_it->second.state == ST_J_STABLE && !skip_copy)
{
// First we submit all reads
if (dirty_it->second.len == 0)
{
has_empty = true;
}
else
has_writes = true;
if (dirty_it->second.len != 0)
{
offset = dirty_it->second.offset;
end_offset = dirty_it->second.offset + dirty_it->second.len;
@@ -518,6 +593,7 @@ bool journal_flusher_co::scan_dirty(int wait_base)
else if (dirty_it->second.state == ST_D_STABLE && !skip_copy)
{
// There is an unflushed big write. Copy small writes in its position
has_writes = true;
clean_loc = dirty_it->second.location;
clean_init_bitmap = true;
clean_bitmap_offset = dirty_it->second.offset;

View File

@@ -45,7 +45,7 @@ class journal_flusher_co
std::map<object_id, uint64_t>::iterator repeat_it;
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;
bool skip_copy, has_delete, has_empty;
bool skip_copy, has_delete, has_writes;
blockstore_clean_db_t::iterator clean_it;
std::vector<copy_buffer_t> v;
std::vector<copy_buffer_t>::iterator it;
@@ -73,9 +73,10 @@ public:
// Journal flusher itself
class journal_flusher_t
{
int trim_wanted = 0;
bool dequeuing;
int flusher_count;
int sync_threshold;
int flusher_start_threshold;
journal_flusher_co *co;
blockstore_impl_t *bs;
friend class journal_flusher_co;
@@ -96,7 +97,8 @@ public:
~journal_flusher_t();
void loop();
bool is_active();
void force_start();
void request_trim();
void release_trim();
void enqueue_flush(obj_ver_id oid);
void unshift_flush(obj_ver_id oid);
};

View File

@@ -124,12 +124,6 @@ void blockstore_impl_t::loop()
if (PRIV(op)->wait_for)
{
check_wait(op);
#ifdef BLOCKSTORE_DEBUG
if (PRIV(op)->wait_for)
{
printf("still waiting for %d\n", PRIV(op)->wait_for);
}
#endif
if (PRIV(op)->wait_for == WAIT_SQE)
{
break;
@@ -150,7 +144,7 @@ void blockstore_impl_t::loop()
{
dequeue_op = dequeue_read(op);
}
else if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_DELETE)
else if (op->opcode == BS_OP_WRITE)
{
if (has_writes == 2)
{
@@ -160,6 +154,16 @@ void blockstore_impl_t::loop()
dequeue_op = dequeue_write(op);
has_writes = dequeue_op ? 1 : 2;
}
else if (op->opcode == BS_OP_DELETE)
{
if (has_writes == 2)
{
// Some writes could not be submitted
break;
}
dequeue_op = dequeue_del(op);
has_writes = dequeue_op ? 1 : 2;
}
else if (op->opcode == BS_OP_SYNC)
{
// wait for all small writes to be submitted
@@ -271,6 +275,9 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
if (ringloop->space_left() < PRIV(op)->wait_detail)
{
// stop submission if there's still no free space
#ifdef BLOCKSTORE_DEBUG
printf("Still waiting for %lu SQE(s)\n", PRIV(op)->wait_detail);
#endif
return;
}
PRIV(op)->wait_for = 0;
@@ -280,8 +287,12 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
if (journal.used_start == PRIV(op)->wait_detail)
{
// do not submit
#ifdef BLOCKSTORE_DEBUG
printf("Still waiting to flush journal offset %08lx\n", PRIV(op)->wait_detail);
#endif
return;
}
flusher->release_trim();
PRIV(op)->wait_for = 0;
}
else if (PRIV(op)->wait_for == WAIT_JOURNAL_BUFFER)
@@ -291,6 +302,9 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
journal.sector_info[next].dirty)
{
// do not submit
#ifdef BLOCKSTORE_DEBUG
printf("Still waiting for a journal buffer\n");
#endif
return;
}
PRIV(op)->wait_for = 0;
@@ -299,6 +313,9 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
{
if (!data_alloc->get_free_count() && !flusher->is_active())
{
#ifdef BLOCKSTORE_DEBUG
printf("Still waiting for free space on the data device\n");
#endif
return;
}
PRIV(op)->wait_for = 0;
@@ -363,7 +380,7 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
}
};
}
if (op->opcode == BS_OP_WRITE && !enqueue_write(op))
if ((op->opcode == BS_OP_WRITE || op->opcode == BS_OP_DELETE) && !enqueue_write(op))
{
std::function<void (blockstore_op_t*)>(op->callback)(op);
return;
@@ -390,9 +407,31 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
ringloop->wakeup();
}
// Binary-searches list[search_start, search_end) for an entry whose oid equals <oid>
// and, if one is found, overwrites its version with <version>.
// The searched range has to be ordered ascending by oid for the bisection to work.
// Returns true when an entry was found and updated, false otherwise.
static bool replace_stable(object_id oid, uint64_t version, int search_start, int search_end, obj_ver_id* list)
{
    int lo = search_start;
    int hi = search_end;
    while (lo < hi)
    {
        // Midpoint computed as an offset from lo, not as (lo+hi)/2
        const int mid = lo + (hi - lo) / 2;
        obj_ver_id & entry = list[mid];
        if (oid < entry.oid)
        {
            // Target, if present, is in the lower half
            hi = mid;
            continue;
        }
        if (entry.oid < oid)
        {
            // Target, if present, is in the upper half
            lo = mid + 1;
            continue;
        }
        // Neither is less than the other: this is the matching entry
        entry.version = version;
        return true;
    }
    return false;
}
void blockstore_impl_t::process_list(blockstore_op_t *op)
{
// Count objects
// Check PG
uint32_t list_pg = op->offset;
uint32_t pg_count = op->len;
uint64_t pg_stripe_size = op->oid.stripe;
@@ -402,70 +441,131 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
FINISH_OP(op);
return;
}
uint64_t stable_count = 0;
if (pg_count > 0)
{
for (auto it = clean_db.begin(); it != clean_db.end(); it++)
{
uint32_t pg = (it->first.inode + it->first.stripe / pg_stripe_size) % pg_count;
if (pg == list_pg)
{
stable_count++;
}
}
}
else
{
stable_count = clean_db.size();
}
uint64_t total_count = stable_count;
for (auto it = dirty_db.begin(); it != dirty_db.end(); it++)
{
if (!pg_count || ((it->first.oid.inode + it->first.oid.stripe / pg_stripe_size) % pg_count) == list_pg)
{
if (IS_STABLE(it->second.state))
{
stable_count++;
}
total_count++;
}
}
// Allocate memory
op->version = stable_count;
op->retval = total_count;
op->buf = malloc(sizeof(obj_ver_id) * total_count);
if (!op->buf)
// Copy clean_db entries (sorted)
int stable_count = 0, stable_alloc = clean_db.size() / (pg_count ? pg_count : 1);
obj_ver_id *stable = (obj_ver_id*)malloc(sizeof(obj_ver_id) * stable_alloc);
if (!stable)
{
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
obj_ver_id *vers = (obj_ver_id*)op->buf;
int i = 0;
for (auto it = clean_db.begin(); it != clean_db.end(); it++)
{
if (!pg_count || ((it->first.inode + it->first.stripe / pg_stripe_size) % pg_count) == list_pg)
{
vers[i++] = {
if (stable_count >= stable_alloc)
{
stable_alloc += 32768;
stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
if (!stable)
{
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
}
stable[stable_count++] = {
.oid = it->first,
.version = it->second.version,
};
}
}
int j = stable_count;
int clean_stable_count = stable_count;
// Copy dirty_db entries (sorted, too)
int unstable_count = 0, unstable_alloc = 0;
obj_ver_id *unstable = NULL;
for (auto it = dirty_db.begin(); it != dirty_db.end(); it++)
{
if (!pg_count || ((it->first.oid.inode + it->first.oid.stripe / pg_stripe_size) % pg_count) == list_pg)
{
if (IS_STABLE(it->second.state))
if (IS_DELETE(it->second.state))
{
vers[i++] = it->first;
// Deletions are always stable, so try to zero out two possible entries
if (!replace_stable(it->first.oid, 0, 0, clean_stable_count, stable))
{
replace_stable(it->first.oid, 0, clean_stable_count, stable_count, stable);
}
}
else if (IS_STABLE(it->second.state))
{
// First try to replace a clean stable version in the first part of the list
if (!replace_stable(it->first.oid, it->first.version, 0, clean_stable_count, stable))
{
// Then try to replace the last dirty stable version in the second part of the list
if (stable[stable_count-1].oid == it->first.oid)
{
stable[stable_count-1].version = it->first.version;
}
else
{
if (stable_count >= stable_alloc)
{
stable_alloc += 32768;
stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
if (!stable)
{
if (unstable)
free(unstable);
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
}
stable[stable_count++] = it->first;
}
}
}
else
{
vers[j++] = it->first;
if (unstable_count >= unstable_alloc)
{
unstable_alloc += 32768;
unstable = (obj_ver_id*)realloc(unstable, sizeof(obj_ver_id) * unstable_alloc);
if (!unstable)
{
if (stable)
free(stable);
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
}
unstable[unstable_count++] = it->first;
}
}
}
// Remove zeroed out stable entries
int j = 0;
for (int i = 0; i < stable_count; i++)
{
if (stable[i].version != 0)
{
stable[j++] = stable[i];
}
}
stable_count = j;
if (stable_count+unstable_count > stable_alloc)
{
stable_alloc = stable_count+unstable_count;
stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
if (!stable)
{
if (unstable)
free(unstable);
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
}
// Copy unstable entries
for (int i = 0; i < unstable_count; i++)
{
stable[j++] = unstable[i];
}
free(unstable);
op->version = stable_count;
op->retval = stable_count+unstable_count;
op->buf = stable;
FINISH_OP(op);
}

View File

@@ -62,6 +62,11 @@
struct ring_data_t *data = ((ring_data_t*)sqe->user_data)
#define BS_SUBMIT_GET_ONLY_SQE(sqe) \
{\
timespec now;\
clock_gettime(CLOCK_REALTIME, &now);\
printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);\
}\
struct io_uring_sqe *sqe = get_sqe();\
if (!sqe)\
{\
@@ -71,6 +76,11 @@
}
#define BS_SUBMIT_GET_SQE_DECL(sqe) \
{\
timespec now;\
clock_gettime(CLOCK_REALTIME, &now);\
printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);\
}\
sqe = get_sqe();\
if (!sqe)\
{\
@@ -286,12 +296,14 @@ class blockstore_impl_t
// Stabilize
int dequeue_stable(blockstore_op_t *op);
int continue_stable(blockstore_op_t *op);
void mark_stable(const obj_ver_id & ov);
void handle_stable_event(ring_data_t *data, blockstore_op_t *op);
void stabilize_object(object_id oid, uint64_t max_ver);
// Rollback
int dequeue_rollback(blockstore_op_t *op);
int continue_rollback(blockstore_op_t *op);
void mark_rolled_back(const obj_ver_id & ov);
void handle_rollback_event(ring_data_t *data, blockstore_op_t *op);
void erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc);

View File

@@ -404,7 +404,7 @@ resume_1:
bs->journal.trim();
bs->journal.dirty_start = bs->journal.next_free;
printf(
"Journal entries loaded: %lu, free journal space: %lu bytes (%lu..%lu is used), free blocks: %lu / %lu\n",
"Journal entries loaded: %lu, free journal space: %lu bytes (%08lx..%08lx is used), free blocks: %lu / %lu\n",
entries_loaded,
(bs->journal.next_free >= bs->journal.used_start
? bs->journal.len-bs->journal.block_size - (bs->journal.next_free-bs->journal.used_start)
@@ -475,7 +475,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
if (location != je->small_write.data_offset)
{
char err[1024];
snprintf(err, 1024, "BUG: calculated journal data offset (%lu) != stored journal data offset (%lu)", location, je->small_write.data_offset);
snprintf(err, 1024, "BUG: calculated journal data offset (%08lx) != stored journal data offset (%08lx)", location, je->small_write.data_offset);
throw std::runtime_error(err);
}
uint32_t data_crc32 = 0;
@@ -537,7 +537,10 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
});
bs->journal.used_sectors[proc_pos]++;
#ifdef BLOCKSTORE_DEBUG
printf("journal offset %lu is used by %lu:%lu v%lu\n", proc_pos, ov.oid.inode, ov.oid.stripe, ov.version);
printf(
"journal offset %08lx is used by %lu:%lu v%lu (%lu refs)\n",
proc_pos, ov.oid.inode, ov.oid.stripe, ov.version, bs->journal.used_sectors[proc_pos]
);
#endif
auto & unstab = bs->unstable_writes[ov.oid];
unstab = unstab < ov.version ? ov.version : unstab;
@@ -584,33 +587,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
.oid = je->stable.oid,
.version = je->stable.version,
};
auto it = bs->dirty_db.find(ov);
if (it == bs->dirty_db.end())
{
// journal contains a legitimate STABLE entry for a non-existing dirty write
// this probably means that journal was trimmed between WRITE and STABLE entries
// skip it
}
else
{
while (1)
{
it->second.state = (it->second.state == ST_D_SYNCED
? ST_D_STABLE
: (it->second.state == ST_DEL_SYNCED ? ST_DEL_STABLE : ST_J_STABLE));
if (it == bs->dirty_db.begin())
break;
it--;
if (it->first.oid != ov.oid || IS_STABLE(it->second.state))
break;
}
bs->flusher->enqueue_flush(ov);
}
auto unstab_it = bs->unstable_writes.find(ov.oid);
if (unstab_it != bs->unstable_writes.end() && unstab_it->second <= ov.version)
{
bs->unstable_writes.erase(unstab_it);
}
bs->mark_stable(ov);
}
else if (je->type == JE_ROLLBACK)
{
@@ -618,70 +595,39 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
printf("je_rollback oid=%lu:%lu ver=%lu\n", je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version);
#endif
// rollback dirty writes of <oid> up to <version>
auto it = bs->dirty_db.lower_bound((obj_ver_id){
obj_ver_id ov = {
.oid = je->rollback.oid,
.version = UINT64_MAX,
});
if (it != bs->dirty_db.begin())
{
uint64_t max_unstable = 0;
auto rm_start = it;
auto rm_end = it;
it--;
while (it->first.oid == je->rollback.oid &&
it->first.version > je->rollback.version &&
!IS_IN_FLIGHT(it->second.state) &&
!IS_STABLE(it->second.state))
{
if (it->first.oid != je->rollback.oid)
break;
else if (it->first.version <= je->rollback.version)
{
if (!IS_STABLE(it->second.state))
max_unstable = it->first.version;
break;
}
else if (IS_STABLE(it->second.state))
break;
// Remove entry
rm_start = it;
if (it == bs->dirty_db.begin())
break;
it--;
}
if (rm_start != rm_end)
{
bs->erase_dirty(rm_start, rm_end, UINT64_MAX);
}
auto unstab_it = bs->unstable_writes.find(je->rollback.oid);
if (unstab_it != bs->unstable_writes.end())
{
if (max_unstable == 0)
bs->unstable_writes.erase(unstab_it);
else
unstab_it->second = max_unstable;
}
}
.version = je->rollback.version,
};
bs->mark_rolled_back(ov);
}
else if (je->type == JE_DELETE)
{
#ifdef BLOCKSTORE_DEBUG
printf("je_delete oid=%lu:%lu ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version);
#endif
// oid, version
obj_ver_id ov = {
.oid = je->del.oid,
.version = je->del.version,
};
bs->dirty_db.emplace(ov, (dirty_entry){
.state = ST_DEL_SYNCED,
.flags = 0,
.location = 0,
.offset = 0,
.len = 0,
.journal_sector = proc_pos,
});
bs->journal.used_sectors[proc_pos]++;
auto clean_it = bs->clean_db.find(je->del.oid);
if (clean_it == bs->clean_db.end() ||
clean_it->second.version < je->del.version)
{
// oid, version
obj_ver_id ov = {
.oid = je->del.oid,
.version = je->del.version,
};
bs->dirty_db.emplace(ov, (dirty_entry){
.state = ST_DEL_SYNCED,
.flags = 0,
.location = 0,
.offset = 0,
.len = 0,
.journal_sector = proc_pos,
});
bs->journal.used_sectors[proc_pos]++;
// Deletions are treated as immediately stable, because
// "2-phase commit" (write->stabilize) isn't sufficient for them anyway
bs->mark_stable(ov);
}
}
started = true;
pos += je->size;

View File

@@ -101,7 +101,7 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
: bs->journal.used_start - bs->journal.next_free)
);
PRIV(op)->wait_for = WAIT_JOURNAL;
bs->flusher->force_start();
bs->flusher->request_trim();
PRIV(op)->wait_detail = bs->journal.used_start;
return 0;
}
@@ -180,8 +180,8 @@ bool journal_t::trim()
auto journal_used_it = used_sectors.lower_bound(used_start);
#ifdef BLOCKSTORE_DEBUG
printf(
"Trimming journal (used_start=%08lx, next_free=%08lx, first_used=%08lx, usage_count=%08lx)\n",
used_start, next_free,
"Trimming journal (used_start=%08lx, next_free=%08lx, dirty_start=%08lx, new_start=%08lx, new_refcount=%ld)\n",
used_start, next_free, dirty_start,
journal_used_it == used_sectors.end() ? 0 : journal_used_it->first,
journal_used_it == used_sectors.end() ? 0 : journal_used_it->second
);

View File

@@ -77,33 +77,6 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
}
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{
// FIXME This is here only for the purpose of tracking unstable_writes. Remove if not required
// FIXME ...aaaand this is similar to blockstore_init.cpp - maybe dedup it?
auto dirty_it = dirty_db.lower_bound((obj_ver_id){
.oid = v->oid,
.version = UINT64_MAX,
});
uint64_t max_unstable = 0;
while (dirty_it != dirty_db.begin())
{
dirty_it--;
if (dirty_it->first.oid != v->oid)
break;
else if (dirty_it->first.version <= v->version)
{
if (!IS_STABLE(dirty_it->second.state))
max_unstable = dirty_it->first.version;
break;
}
}
auto unstab_it = unstable_writes.find(v->oid);
if (unstab_it != unstable_writes.end())
{
if (max_unstable == 0)
unstable_writes.erase(unstab_it);
else
unstab_it->second = max_unstable;
}
journal_entry_rollback *je = (journal_entry_rollback*)
prefill_single_journal_entry(journal, JE_ROLLBACK, sizeof(journal_entry_rollback));
journal.sector_info[journal.cur_sector].dirty = false;
@@ -161,26 +134,7 @@ resume_5:
int i;
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{
// Erase dirty_db entries
auto rm_end = dirty_db.lower_bound((obj_ver_id){
.oid = v->oid,
.version = UINT64_MAX,
});
auto rm_start = rm_end;
assert(rm_start != dirty_db.begin());
rm_start--;
while (1)
{
if (rm_start->first.oid != v->oid || rm_start->first.version <= v->version)
{
rm_start++;
break;
}
if (rm_start == dirty_db.begin())
break;
rm_start--;
}
erase_dirty(rm_start, rm_end, UINT64_MAX);
mark_rolled_back(*v);
}
journal.trim();
inflight_writes--;
@@ -190,6 +144,54 @@ resume_5:
return 1;
}
// Roll object <ov.oid> back to version <ov.version>.
//
// Walks the dirty_db entries of the object from the newest one downwards and
// erases every entry that is newer than <ov.version>, stopping at the first
// entry that is in flight or already stable. Afterwards unstable_writes[ov.oid]
// is either updated to the newest remaining unstable version or removed
// if no unstable version remains.
//
// @param ov  object id and the version to roll back to (this version itself is kept)
void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
{
    // First entry past the last possible version of this object
    // (presumably dirty_db is ordered by (oid, version) - the backwards walk below relies on it)
    auto it = dirty_db.lower_bound((obj_ver_id){
        .oid = ov.oid,
        .version = UINT64_MAX,
    });
    if (it != dirty_db.begin())
    {
        uint64_t max_unstable = 0;
        auto rm_start = it;
        auto rm_end = it;
        it--;
        // The stop conditions live inside the loop body: when they are part of the
        // loop condition the "version <= ov.version" branch is unreachable and
        // max_unstable is never computed
        while (1)
        {
            if (it->first.oid != ov.oid)
            {
                // Ran out of entries of this object
                break;
            }
            else if (it->first.version <= ov.version)
            {
                // Newest surviving version: remember it if it is still unstable
                if (!IS_STABLE(it->second.state))
                    max_unstable = it->first.version;
                break;
            }
            else if (IS_IN_FLIGHT(it->second.state) || IS_STABLE(it->second.state))
            {
                // In-flight and stable entries are never rolled back
                break;
            }
            // Remove entry
            rm_start = it;
            if (it == dirty_db.begin())
                break;
            it--;
        }
        if (rm_start != rm_end)
        {
            erase_dirty(rm_start, rm_end, UINT64_MAX);
        }
        auto unstab_it = unstable_writes.find(ov.oid);
        if (unstab_it != unstable_writes.end())
        {
            if (max_unstable == 0)
                unstable_writes.erase(unstab_it);
            else
                unstab_it->second = max_unstable;
        }
    }
}
void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t *op)
{
live = true;
@@ -225,11 +227,13 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
#endif
data_alloc->set(dirty_it->second.location >> block_order, false);
}
#ifdef BLOCKSTORE_DEBUG
printf("remove usage of journal offset %lu by %lu:%lu v%lu\n", dirty_it->second.journal_sector,
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
#endif
int used = --journal.used_sectors[dirty_it->second.journal_sector];
#ifdef BLOCKSTORE_DEBUG
printf(
"remove usage of journal offset %08lx by %lu:%lu v%lu (%d refs)\n", dirty_it->second.journal_sector,
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, used
);
#endif
if (used == 0)
{
journal.used_sectors.erase(dirty_it->second.journal_sector);

View File

@@ -109,12 +109,6 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{
// FIXME: Only stabilize versions that aren't stable yet
auto unstab_it = unstable_writes.find(v->oid);
if (unstab_it != unstable_writes.end() &&
unstab_it->second <= v->version)
{
unstable_writes.erase(unstab_it);
}
journal_entry_stable *je = (journal_entry_stable*)
prefill_single_journal_entry(journal, JE_STABLE, sizeof(journal_entry_stable));
journal.sector_info[journal.cur_sector].dirty = false;
@@ -153,6 +147,11 @@ resume_2:
resume_3:
if (!disable_journal_fsync)
{
{
timespec now;
clock_gettime(CLOCK_REALTIME, &now);
printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
}
io_uring_sqe *sqe = get_sqe();
if (!sqe)
{
@@ -174,42 +173,7 @@ resume_5:
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{
// Mark all dirty_db entries up to op->version as stable
auto dirty_it = dirty_db.find(*v);
if (dirty_it != dirty_db.end())
{
while (1)
{
if (dirty_it->second.state == ST_J_SYNCED)
{
dirty_it->second.state = ST_J_STABLE;
}
else if (dirty_it->second.state == ST_D_SYNCED)
{
dirty_it->second.state = ST_D_STABLE;
}
else if (dirty_it->second.state == ST_DEL_SYNCED)
{
dirty_it->second.state = ST_DEL_STABLE;
}
else if (IS_STABLE(dirty_it->second.state))
{
break;
}
if (dirty_it == dirty_db.begin())
{
break;
}
dirty_it--;
if (dirty_it->first.oid != v->oid)
{
break;
}
}
#ifdef BLOCKSTORE_DEBUG
printf("enqueue_flush %lu:%lu v%lu\n", v->oid.inode, v->oid.stripe, v->version);
#endif
flusher->enqueue_flush(*v);
}
mark_stable(*v);
}
inflight_writes--;
// Acknowledge op
@@ -218,6 +182,52 @@ resume_5:
return 1;
}
void blockstore_impl_t::mark_stable(const obj_ver_id & v)
{
auto dirty_it = dirty_db.find(v);
if (dirty_it != dirty_db.end())
{
while (1)
{
if (dirty_it->second.state == ST_J_SYNCED)
{
dirty_it->second.state = ST_J_STABLE;
}
else if (dirty_it->second.state == ST_D_SYNCED)
{
dirty_it->second.state = ST_D_STABLE;
}
else if (dirty_it->second.state == ST_DEL_SYNCED)
{
dirty_it->second.state = ST_DEL_STABLE;
}
else if (IS_STABLE(dirty_it->second.state))
{
break;
}
if (dirty_it == dirty_db.begin())
{
break;
}
dirty_it--;
if (dirty_it->first.oid != v.oid)
{
break;
}
}
#ifdef BLOCKSTORE_DEBUG
printf("enqueue_flush %lu:%lu v%lu\n", v.oid.inode, v.oid.stripe, v.version);
#endif
flusher->enqueue_flush(v);
}
auto unstab_it = unstable_writes.find(v.oid);
if (unstab_it != unstable_writes.end() &&
unstab_it->second <= v.version)
{
unstable_writes.erase(unstab_it);
}
}
void blockstore_impl_t::handle_stable_event(ring_data_t *data, blockstore_op_t *op)
{
live = true;
@@ -232,6 +242,11 @@ void blockstore_impl_t::handle_stable_event(ring_data_t *data, blockstore_op_t *
PRIV(op)->pending_ops--;
if (PRIV(op)->pending_ops == 0)
{
{
timespec now;
clock_gettime(CLOCK_REALTIME, &now);
printf("finished %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
}
PRIV(op)->op_state++;
if (!continue_stable(op))
{

View File

@@ -133,7 +133,11 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
journal.sector_info[journal.cur_sector].dirty = false;
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
#ifdef BLOCKSTORE_DEBUG
printf("journal offset %lu is used by %lu:%lu v%lu\n", dirty_db[*it].journal_sector, it->oid.inode, it->oid.stripe, it->version);
printf(
"journal offset %08lx is used by %lu:%lu v%lu (%lu refs)\n",
dirty_db[*it].journal_sector, it->oid.inode, it->oid.stripe, it->version,
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
);
#endif
je->oid = it->oid;
je->version = it->version;
@@ -271,7 +275,16 @@ void blockstore_impl_t::ack_one_sync(blockstore_op_t *op)
#endif
auto & unstab = unstable_writes[it->oid];
unstab = unstab < it->version ? it->version : unstab;
dirty_db[*it].state = dirty_db[*it].state == ST_DEL_WRITTEN ? ST_DEL_SYNCED : ST_J_SYNCED;
if (dirty_db[*it].state == ST_DEL_WRITTEN)
{
dirty_db[*it].state = ST_DEL_SYNCED;
// Deletions are treated as immediately stable
mark_stable(*it);
}
else /* == ST_J_WRITTEN */
{
dirty_db[*it].state = ST_J_SYNCED;
}
}
in_progress_syncs.erase(PRIV(op)->in_progress_ptr);
op->retval = 0;

View File

@@ -100,6 +100,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
.oid = op->oid,
.version = op->version,
});
assert(dirty_it != dirty_db.end());
if (dirty_it->second.state == ST_J_WAIT_BIG)
{
return 0;
@@ -213,7 +214,11 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
#ifdef BLOCKSTORE_DEBUG
printf("journal offset %lu is used by %lu:%lu v%lu\n", dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
printf(
"journal offset %08lx is used by %lu:%lu v%lu (%lu refs)\n",
dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
);
#endif
// Figure out where data will be
journal.next_free = (journal.next_free + op->len) <= journal.len ? journal.next_free : journal_block_size;
@@ -288,6 +293,7 @@ int blockstore_impl_t::continue_write(blockstore_op_t *op)
.oid = op->oid,
.version = op->version,
});
assert(dirty_it != dirty_db.end());
if (PRIV(op)->op_state == 2)
goto resume_2;
else if (PRIV(op)->op_state == 4)
@@ -296,6 +302,11 @@ int blockstore_impl_t::continue_write(blockstore_op_t *op)
return 1;
resume_2:
// Only for the immediate_commit mode: prepare and submit big_write journal entry
{
timespec now;
clock_gettime(CLOCK_REALTIME, &now);
printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
}
sqe = get_sqe();
if (!sqe)
{
@@ -306,7 +317,11 @@ resume_2:
journal.sector_info[journal.cur_sector].dirty = false;
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
#ifdef BLOCKSTORE_DEBUG
printf("journal offset %lu is used by %lu:%lu v%lu\n", journal.sector_info[journal.cur_sector].offset, op->oid.inode, op->oid.stripe, op->version);
printf(
"journal offset %08lx is used by %lu:%lu v%lu (%lu refs)\n",
journal.sector_info[journal.cur_sector].offset, op->oid.inode, op->oid.stripe, op->version,
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
);
#endif
je->oid = op->oid;
je->version = op->version;
@@ -323,6 +338,11 @@ resume_2:
return 1;
resume_4:
// Switch object state
{
timespec now;
clock_gettime(CLOCK_REALTIME, &now);
printf("write_done %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
}
#ifdef BLOCKSTORE_DEBUG
printf("Ack write %lu:%lu v%lu = %d\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
#endif
@@ -345,6 +365,11 @@ resume_4:
else if (dirty_it->second.state == ST_DEL_SUBMITTED)
{
dirty_it->second.state = imm ? ST_DEL_SYNCED : ST_DEL_WRITTEN;
if (imm)
{
// Deletions are treated as immediately stable
mark_stable(dirty_it->first);
}
}
if (immediate_commit == IMMEDIATE_ALL)
{
@@ -403,6 +428,10 @@ void blockstore_impl_t::release_journal_sectors(blockstore_op_t *op)
{
// We know for sure that we won't write into this sector anymore
uint64_t new_ds = journal.sector_info[s-1].offset + journal.block_size;
if (new_ds >= journal.len)
{
new_ds = journal.block_size;
}
if ((journal.dirty_start + (journal.dirty_start >= journal.used_start ? 0 : journal.len)) <
(new_ds + (new_ds >= journal.used_start ? 0 : journal.len)))
{
@@ -423,6 +452,7 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
.oid = op->oid,
.version = op->version,
});
assert(dirty_it != dirty_db.end());
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, 1, sizeof(journal_entry_del), 0))
{
@@ -457,7 +487,11 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
#ifdef BLOCKSTORE_DEBUG
printf("journal offset %lu is used by %lu:%lu v%lu\n", dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
printf(
"journal offset %08lx is used by %lu:%lu v%lu (%lu refs)\n",
dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
);
#endif
je->oid = op->oid;
je->version = op->version;

View File

@@ -1,358 +1,349 @@
#include <unistd.h>
#include <fcntl.h>
#include <sys/socket.h>
#include <sys/epoll.h>
#include <netinet/tcp.h>
#include "cluster_client.h"
osd_op_t::~osd_op_t()
cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
{
assert(!bs_op);
if (op_data)
this->ringloop = ringloop;
this->tfd = tfd;
msgr.tfd = tfd;
msgr.ringloop = ringloop;
msgr.repeer_pgs = [this](osd_num_t peer_osd)
{
free(op_data);
}
if (rmw_buf)
// peer_osd just connected or dropped connection
if (msgr.osd_peer_fds.find(peer_osd) != msgr.osd_peer_fds.end())
{
// really connected :)
continue_ops();
}
};
st_cli.tfd = tfd;
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
st_cli.on_change_osd_state_hook = [this](uint64_t peer_osd) { on_change_osd_state_hook(peer_osd); };
st_cli.on_change_hook = [this](json11::Json::object & changes) { on_change_hook(changes); };
st_cli.on_load_pgs_hook = [this](bool success) { on_load_pgs_hook(success); };
log_level = config["log_level"].int64_value();
st_cli.parse_config(config);
st_cli.load_global_config();
}
void cluster_client_t::continue_ops()
{
for (auto op_it = unsent_ops.begin(); op_it != unsent_ops.end(); )
{
free(rmw_buf);
}
if (buf)
{
// Note: reusing osd_op_t WILL currently lead to memory leaks
// So we don't reuse it, but free it every time
free(buf);
cluster_op_t *op = *op_it;
if (op->needs_reslice && !op->sent_count)
{
op->parts.clear();
op->done_count = 0;
op->needs_reslice = false;
}
if (!op->parts.size())
{
unsent_ops.erase(op_it++);
execute(op);
continue;
}
if (!op->needs_reslice)
{
for (auto & op_part: op->parts)
{
if (!op_part.sent && !op_part.done)
{
try_send(op, &op_part);
}
}
if (op->sent_count == op->parts.size() - op->done_count)
{
unsent_ops.erase(op_it++);
sent_ops.insert(op);
}
else
op_it++;
}
else
op_it++;
}
}
void cluster_client_t::connect_peer(uint64_t peer_osd, json11::Json address_list, int port)
// Despite the name, this returns the binary logarithm (bit order) of <value>
// when <value> is an exact power of two, and 64 otherwise. 64 can never be a
// valid order of a 64-bit number, so callers test the result with ">= 64".
// Zero is not a power of two and also yields 64.
static uint32_t is_power_of_two(uint64_t value)
{
    // A power of two has exactly one bit set
    if (value == 0 || (value & (value - 1)) != 0)
    {
        return 64;
    }
    uint32_t order = 0;
    while (value > 1)
    {
        value = value >> 1;
        order++;
    }
    return order;
}
// Apply the loaded cluster configuration to the client.
//
// Reads block_size, disk_alignment, bitmap_granularity, pg_stripe_size,
// immediate_commit, peer_connect_interval and peer_connect_timeout from <config>.
// Zero (or non-numeric) values fall back to the corresponding DEFAULT_* constant.
// Throws std::runtime_error("Bad block size") when the block size is not a power
// of two or lies outside [MIN_BLOCK_SIZE, MAX_BLOCK_SIZE).
void cluster_client_t::on_load_config_hook(json11::Json::object & config)
{
    // Blockstore geometry
    bs_block_size = config["block_size"].uint64_value();
    if (!bs_block_size)
    {
        bs_block_size = DEFAULT_BLOCK_SIZE;
    }
    bs_disk_alignment = config["disk_alignment"].uint64_value();
    if (!bs_disk_alignment)
    {
        bs_disk_alignment = DEFAULT_DISK_ALIGNMENT;
    }
    bs_bitmap_granularity = config["bitmap_granularity"].uint64_value();
    if (!bs_bitmap_granularity)
    {
        bs_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
    }
    // is_power_of_two() returns 64 for anything that is not a power of two
    const uint32_t block_order = is_power_of_two(bs_block_size);
    const bool size_in_range = !(bs_block_size < MIN_BLOCK_SIZE || bs_block_size >= MAX_BLOCK_SIZE);
    if (block_order >= 64 || !size_in_range)
    {
        throw std::runtime_error("Bad block size");
    }
    // pg_stripe_size is only touched when the key is present
    if (config.find("pg_stripe_size") != config.end())
    {
        pg_stripe_size = config["pg_stripe_size"].uint64_value();
        if (!pg_stripe_size)
        {
            pg_stripe_size = DEFAULT_PG_STRIPE_SIZE;
        }
    }
    // Cluster-wide immediate_commit mode
    if (config["immediate_commit"] == "all")
    {
        immediate_commit = true;
    }
    // Messenger reconnection parameters
    msgr.peer_connect_interval = config["peer_connect_interval"].uint64_value();
    if (!msgr.peer_connect_interval)
    {
        msgr.peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
    }
    msgr.peer_connect_timeout = config["peer_connect_timeout"].uint64_value();
    if (!msgr.peer_connect_timeout)
    {
        msgr.peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
    }
}
// Called when the PG configuration load finishes.
// On success, remembers the number of configured PGs and resumes queued operations;
// on failure nothing is changed.
void cluster_client_t::on_load_pgs_hook(bool success)
{
    if (!success)
    {
        return;
    }
    pg_count = st_cli.pg_config.size();
    continue_ops();
}
// Called when the cluster state changes.
// If the number of PGs differs from the remembered one, every queued and every
// already sent operation is flagged for re-slicing and the new PG count is stored.
// Queued operations are then resumed in any case.
void cluster_client_t::on_change_hook(json11::Json::object & changes)
{
    const auto new_pg_count = st_cli.pg_config.size();
    if (pg_count != new_pg_count)
    {
        // At this point, all operations should be suspended
        // And they need to be resliced!
        for (auto queued_op: unsent_ops)
            queued_op->needs_reslice = true;
        for (auto inflight_op: sent_ops)
            inflight_op->needs_reslice = true;
        pg_count = new_pg_count;
    }
    continue_ops();
}
// Called when the state of OSD <peer_osd> changes.
// Only OSDs the messenger currently wants to be connected to are (re)connected,
// using the peer state known to the state client.
void cluster_client_t::on_change_osd_state_hook(uint64_t peer_osd)
{
    const bool wanted = msgr.wanted_peers.find(peer_osd) != msgr.wanted_peers.end();
    if (!wanted)
    {
        return;
    }
    msgr.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
}
// FIXME: Implement OSD_OP_SYNC for immediate_commit == false
// Start executing a cluster operation.
//
// - OSD_OP_SYNC in the immediate_commit mode completes at once with retval 0.
// - Anything that is not a non-empty READ or WRITE on a non-zero inode, aligned
//   to bs_disk_alignment, completes at once with retval -EINVAL.
// - While the configuration or the PG list is not loaded yet, the operation is
//   only queued in unsent_ops; continue_ops() re-executes operations without parts.
// - Otherwise the request is sliced into one part per object stripe it touches,
//   each part is passed to try_send(), and the operation lands in sent_ops when
//   every part was sent or in unsent_ops when at least one has to be retried.
//
// The callback is invoked synchronously in the first two cases.
void cluster_client_t::execute(cluster_op_t *op)
{
    if (op->opcode == OSD_OP_SYNC && immediate_commit)
    {
        // Syncs are not required in the immediate_commit mode
        op->retval = 0;
        std::function<void(cluster_op_t*)>(op->callback)(op);
        return;
    }
    // Only reads and writes are supported here. The opcode has to be compared
    // with OSD_OP_WRITE: OSD_OP_OUT is an osd_op_t op_type, not an opcode, and
    // comparing against it rejected every write.
    // NOTE(review): bs_disk_alignment is used before the "config loaded" check
    // below - confirm that it has a non-zero initial value.
    if ((op->opcode != OSD_OP_READ && op->opcode != OSD_OP_WRITE) || !op->inode || !op->len ||
        op->offset % bs_disk_alignment || op->len % bs_disk_alignment)
    {
        op->retval = -EINVAL;
        std::function<void(cluster_op_t*)>(op->callback)(op);
        return;
    }
    if (!pg_stripe_size || !pg_count)
    {
        // Config or PG list is not loaded yet.
        // pg_count is checked too because it is used as a divisor below
        unsent_ops.insert(op);
        return;
    }
    if (op->opcode == OSD_OP_WRITE && !immediate_commit)
    {
        // Copy operation
        cluster_op_t *op_copy = new cluster_op_t();
        op_copy->opcode = op->opcode;
        op_copy->inode = op->inode;
        op_copy->offset = op->offset;
        op_copy->len = op->len;
        op_copy->buf = malloc(op->len);
        memcpy(op_copy->buf, op->buf, op->len);
        unsynced_ops.push_back(op_copy);
        unsynced_bytes += op->len;
        if (inmemory_commit)
        {
            // Immediately acknowledge write and continue with the copy
            op->retval = op->len;
            std::function<void(cluster_op_t*)>(op->callback)(op);
            op = op_copy;
        }
        if (unsynced_bytes >= inmemory_dirty_limit)
        {
            // Push an extra SYNC operation
        }
    }
    // Slice the request into individual object stripe requests
    // Primary OSDs still operate individual stripes, but their size is multiplied by PG minsize in case of EC
    uint64_t pg_block_size = bs_block_size * pg_part_count;
    uint64_t first_stripe = (op->offset / pg_block_size) * pg_block_size;
    uint64_t last_stripe = ((op->offset + op->len + pg_block_size - 1) / pg_block_size - 1) * pg_block_size;
    int part_count = 0;
    for (uint64_t stripe = first_stripe; stripe <= last_stripe; stripe += pg_block_size)
    {
        if (op->offset < (stripe+pg_block_size) && (op->offset+op->len) > stripe)
        {
            part_count++;
        }
    }
    op->parts.resize(part_count);
    bool resend = false;
    int i = 0;
    for (uint64_t stripe = first_stripe; stripe <= last_stripe; stripe += pg_block_size)
    {
        uint64_t stripe_end = stripe + pg_block_size;
        if (op->offset < stripe_end && (op->offset+op->len) > stripe)
        {
            pg_num_t pg_num = (op->inode + stripe/pg_stripe_size) % pg_count + 1;
            // Intersection of the request with this stripe. The length must be
            // counted from <begin>, not from the stripe start, otherwise a part
            // that begins in the middle of a stripe comes out too long
            uint64_t begin = op->offset < stripe ? stripe : op->offset;
            uint64_t end = (op->offset+op->len) > stripe_end ? stripe_end : op->offset+op->len;
            op->parts[i] = {
                .parent = op,
                .offset = begin,
                .len = (uint32_t)(end - begin),
                .pg_num = pg_num,
                .buf = op->buf + (begin - op->offset),
                .sent = false,
                .done = false,
            };
            if (!try_send(op, &op->parts[i]))
            {
                // Part needs to be sent later
                resend = true;
            }
            i++;
        }
    }
    if (resend)
    {
        unsent_ops.insert(op);
    }
    else
    {
        sent_ops.insert(op);
    }
}
void cluster_client_t::try_connect_peer(uint64_t peer_osd)
bool cluster_client_t::try_send(cluster_op_t *op, cluster_op_part_t *part)
{
auto wp_it = wanted_peers.find(peer_osd);
if (wp_it == wanted_peers.end())
auto pg_it = st_cli.pg_config.find(part->pg_num);
if (pg_it != st_cli.pg_config.end() &&
!pg_it->second.pause && pg_it->second.cur_primary)
{
return;
}
if (osd_peer_fds.find(peer_osd) != osd_peer_fds.end())
{
wanted_peers.erase(peer_osd);
return;
}
auto & wp = wp_it->second;
if (wp.address_index >= wp.address_list.array_items().size())
{
return;
}
wp.cur_addr = wp.address_list[wp.address_index].string_value();
wp.cur_port = wp.port;
try_connect_peer_addr(peer_osd, wp.cur_addr.c_str(), wp.cur_port);
}
void cluster_client_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port)
{
struct sockaddr_in addr;
int r;
if ((r = inet_pton(AF_INET, peer_host, &addr.sin_addr)) != 1)
{
on_connect_peer(peer_osd, -EINVAL);
return;
}
addr.sin_family = AF_INET;
addr.sin_port = htons(peer_port ? peer_port : 11203);
int peer_fd = socket(AF_INET, SOCK_STREAM, 0);
if (peer_fd < 0)
{
on_connect_peer(peer_osd, -errno);
return;
}
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
int timeout_id = -1;
if (peer_connect_timeout > 0)
{
timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, peer_fd](int timer_id)
osd_num_t primary_osd = pg_it->second.cur_primary;
auto peer_it = msgr.osd_peer_fds.find(primary_osd);
if (peer_it != msgr.osd_peer_fds.end())
{
osd_num_t peer_osd = clients[peer_fd].osd_num;
stop_client(peer_fd);
on_connect_peer(peer_osd, -EIO);
return;
});
}
r = connect(peer_fd, (sockaddr*)&addr, sizeof(addr));
if (r < 0 && errno != EINPROGRESS)
{
close(peer_fd);
on_connect_peer(peer_osd, -errno);
return;
}
assert(peer_osd != this->osd_num);
clients[peer_fd] = (osd_client_t){
.peer_addr = addr,
.peer_port = peer_port,
.peer_fd = peer_fd,
.peer_state = PEER_CONNECTING,
.connect_timeout_id = timeout_id,
.osd_num = peer_osd,
.in_buf = malloc(receive_buffer_size),
};
tfd->set_fd_handler(peer_fd, [this](int peer_fd, int epoll_events)
{
// Either OUT (connected) or HUP
handle_connect_epoll(peer_fd);
});
}
void cluster_client_t::handle_connect_epoll(int peer_fd)
{
auto & cl = clients[peer_fd];
if (cl.connect_timeout_id >= 0)
{
tfd->clear_timer(cl.connect_timeout_id);
cl.connect_timeout_id = -1;
}
osd_num_t peer_osd = cl.osd_num;
int result = 0;
socklen_t result_len = sizeof(result);
if (getsockopt(peer_fd, SOL_SOCKET, SO_ERROR, &result, &result_len) < 0)
{
result = errno;
}
if (result != 0)
{
stop_client(peer_fd);
on_connect_peer(peer_osd, -result);
return;
}
int one = 1;
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
cl.peer_state = PEER_CONNECTED;
// FIXME Disable EPOLLOUT on this fd
tfd->set_fd_handler(peer_fd, [this](int peer_fd, int epoll_events)
{
handle_peer_epoll(peer_fd, epoll_events);
});
// Check OSD number
check_peer_config(cl);
}
void cluster_client_t::handle_peer_epoll(int peer_fd, int epoll_events)
{
// Mark client as ready (i.e. some data is available)
if (epoll_events & EPOLLRDHUP)
{
// Stop client
printf("[OSD %lu] client %d disconnected\n", this->osd_num, peer_fd);
stop_client(peer_fd);
}
else if (epoll_events & EPOLLIN)
{
// Mark client as ready (i.e. some data is available)
auto & cl = clients[peer_fd];
cl.read_ready++;
if (cl.read_ready == 1)
{
read_ready_clients.push_back(cl.peer_fd);
ringloop->wakeup();
}
}
}
void cluster_client_t::on_connect_peer(osd_num_t peer_osd, int peer_fd)
{
auto & wp = wanted_peers.at(peer_osd);
wp.connecting = false;
if (peer_fd < 0)
{
printf("Failed to connect to peer OSD %lu address %s port %d: %s\n", peer_osd, wp.cur_addr.c_str(), wp.cur_port, strerror(-peer_fd));
if (wp.address_changed)
{
wp.address_changed = false;
wp.address_index = 0;
try_connect_peer(peer_osd);
}
else if (wp.address_index < wp.address_list.array_items().size()-1)
{
// Try other addresses
wp.address_index++;
try_connect_peer(peer_osd);
}
else
{
// Retry again in <peer_connect_interval> seconds
wp.last_connect_attempt = time(NULL);
wp.address_index = 0;
tfd->set_timer(1000*peer_connect_interval, false, [this, peer_osd](int)
int peer_fd = peer_it->second;
part->osd_num = primary_osd;
part->sent = true;
op->sent_count++;
part->op = {
.op_type = OSD_OP_OUT,
.peer_fd = peer_fd,
.req = { .rw = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = op_id++,
.opcode = op->opcode,
},
.inode = op->inode,
.offset = part->offset,
.len = part->len,
} },
.callback = [this, part](osd_op_t *op_part)
{
handle_op_part(part);
},
};
part->op.send_list.push_back(part->op.req.buf, OSD_PACKET_SIZE);
if (op->opcode == OSD_OP_WRITE)
{
try_connect_peer(peer_osd);
});
}
return;
}
printf("Connected with peer OSD %lu (fd %d)\n", peer_osd, peer_fd);
wanted_peers.erase(peer_osd);
repeer_pgs(peer_osd);
}
void cluster_client_t::check_peer_config(osd_client_t & cl)
{
osd_op_t *op = new osd_op_t();
op->op_type = OSD_OP_OUT;
op->send_list.push_back(op->req.buf, OSD_PACKET_SIZE);
op->peer_fd = cl.peer_fd;
op->req = {
.show_conf = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = this->next_subop_id++,
.opcode = OSD_OP_SHOW_CONFIG,
},
},
};
op->callback = [this](osd_op_t *op)
{
osd_client_t & cl = clients[op->peer_fd];
std::string json_err;
json11::Json config;
bool err = false;
if (op->reply.hdr.retval < 0)
{
err = true;
printf("Failed to get config from OSD %lu (retval=%ld), disconnecting peer\n", cl.osd_num, op->reply.hdr.retval);
}
else
{
config = json11::Json::parse(std::string((char*)op->buf), json_err);
if (json_err != "")
{
err = true;
printf("Failed to get config from OSD %lu: bad JSON: %s, disconnecting peer\n", cl.osd_num, json_err.c_str());
part->op.send_list.push_back(part->buf, part->len);
}
else if (config["osd_num"].uint64_value() != cl.osd_num)
else
{
err = true;
printf("Connected to OSD %lu instead of OSD %lu, peer state is outdated, disconnecting peer\n", config["osd_num"].uint64_value(), cl.osd_num);
on_connect_peer(cl.osd_num, -1);
part->op.buf = part->buf;
}
msgr.outbox_push(&part->op);
return true;
}
if (err)
else if (msgr.wanted_peers.find(primary_osd) == msgr.wanted_peers.end())
{
stop_client(op->peer_fd);
delete op;
return;
msgr.connect_peer(primary_osd, st_cli.peer_states[primary_osd]);
}
osd_peer_fds[cl.osd_num] = cl.peer_fd;
on_connect_peer(cl.osd_num, cl.peer_fd);
delete op;
};
outbox_push(op);
}
return false;
}
void cluster_client_t::cancel_osd_ops(osd_client_t & cl)
void cluster_client_t::handle_op_part(cluster_op_part_t *part)
{
for (auto p: cl.sent_ops)
cluster_op_t *op = part->parent;
part->sent = false;
op->sent_count--;
part->op.buf = NULL;
if (part->op.reply.hdr.retval != part->op.req.rw.len)
{
cancel_out_op(p.second);
// Operation failed, retry
printf(
"Operation part failed on OSD %lu: retval=%ld (expected %u), reconnecting\n",
part->osd_num, part->op.reply.hdr.retval, part->op.req.rw.len
);
msgr.stop_client(part->op.peer_fd);
if (op->sent_count == op->parts.size() - op->done_count - 1)
{
// Resend later when OSDs come up
// FIXME: Check for different types of errors
// FIXME: Repeat operations after a small timeout, for the case when OSD is coming up
sent_ops.erase(op);
unsent_ops.insert(op);
}
if (op->sent_count == 0 && op->needs_reslice)
{
// PG count has changed, reslice the operation
unsent_ops.erase(op);
op->parts.clear();
op->done_count = 0;
op->needs_reslice = false;
execute(op);
}
}
cl.sent_ops.clear();
for (auto op: cl.outbox)
else
{
cancel_out_op(op);
}
cl.outbox.clear();
if (cl.write_op)
{
cancel_out_op(cl.write_op);
cl.write_op = NULL;
}
}
// Completes an outbound operation with -EPIPE: fills in a reply header that
// mirrors the request (id, opcode) and invokes the operation's callback.
void cluster_client_t::cancel_out_op(osd_op_t *op)
{
    auto & reply_hdr = op->reply.hdr;
    reply_hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
    reply_hdr.id = op->req.hdr.id;
    reply_hdr.opcode = op->req.hdr.opcode;
    reply_hdr.retval = -EPIPE;
    // The callback is invoked through a copy so that it stays valid
    // even if it ends up doing `delete op`
    std::function<void(osd_op_t*)> callback_copy(op->callback);
    callback_copy(op);
}
void cluster_client_t::stop_client(int peer_fd)
{
assert(peer_fd != 0);
auto it = clients.find(peer_fd);
if (it == clients.end())
{
return;
}
uint64_t repeer_osd = 0;
osd_client_t cl = it->second;
if (cl.peer_state == PEER_CONNECTED)
{
if (cl.osd_num)
{
// Reload configuration from etcd when the connection is dropped
printf("[OSD %lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl.osd_num);
repeer_osd = cl.osd_num;
}
else
{
printf("[OSD %lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
}
}
clients.erase(it);
tfd->set_fd_handler(peer_fd, NULL);
if (cl.osd_num)
{
osd_peer_fds.erase(cl.osd_num);
// Cancel outbound operations
cancel_osd_ops(cl);
}
if (cl.read_op)
{
delete cl.read_op;
cl.read_op = NULL;
}
for (auto rit = read_ready_clients.begin(); rit != read_ready_clients.end(); rit++)
{
if (*rit == peer_fd)
{
read_ready_clients.erase(rit);
break;
}
}
for (auto wit = write_ready_clients.begin(); wit != write_ready_clients.end(); wit++)
{
if (*wit == peer_fd)
{
write_ready_clients.erase(wit);
break;
}
}
free(cl.in_buf);
assert(peer_fd != 0);
close(peer_fd);
if (repeer_osd)
{
repeer_pgs(repeer_osd);
// OK
part->done = true;
op->done_count++;
if (op->done_count >= op->parts.size())
{
// Finished!
sent_ops.erase(op);
op->retval = op->len;
std::function<void(cluster_op_t*)>(op->callback)(op);
}
}
}

View File

@@ -1,209 +1,80 @@
#pragma once
#include <sys/types.h>
#include <stdint.h>
#include <arpa/inet.h>
#include <malloc.h>
#include "messenger.h"
#include "etcd_state_client.h"
#include <set>
#include <map>
#include <deque>
#include <vector>
#define MIN_BLOCK_SIZE 4*1024
#define MAX_BLOCK_SIZE 128*1024*1024
#define DEFAULT_BLOCK_SIZE 128*1024
#define DEFAULT_PG_STRIPE_SIZE 4*1024*1024
#define DEFAULT_DISK_ALIGNMENT 4096
#define DEFAULT_BITMAP_GRANULARITY 4096
#include "json11/json11.hpp"
#include "osd_ops.h"
#include "timerfd_manager.h"
#include "ringloop.h"
struct cluster_op_t;
#define OSD_OP_IN 0
#define OSD_OP_OUT 1
#define CL_READ_HDR 1
#define CL_READ_DATA 2
#define CL_READ_REPLY_DATA 3
#define CL_WRITE_READY 1
#define CL_WRITE_REPLY 2
#define MAX_EPOLL_EVENTS 64
#define OSD_OP_INLINE_BUF_COUNT 16
#define PEER_CONNECTING 1
#define PEER_CONNECTED 2
struct osd_op_buf_list_t
struct cluster_op_part_t
{
int count = 0, alloc = 0, sent = 0;
iovec *buf = NULL;
iovec inline_buf[OSD_OP_INLINE_BUF_COUNT];
~osd_op_buf_list_t()
{
if (buf && buf != inline_buf)
{
free(buf);
}
}
inline iovec* get_iovec()
{
return (buf ? buf : inline_buf) + sent;
}
inline int get_size()
{
return count - sent;
}
// Appends one buffer (base pointer + length) to the list.
// The embedded inline_buf array is used for the first OSD_OP_INLINE_BUF_COUNT
// entries; after that the list moves to a heap array whose capacity doubles
// every time it fills up.
inline void push_back(void *nbuf, size_t len)
{
    if (count >= alloc)
    {
        if (!alloc)
        {
            // First use: start with the embedded array, no heap allocation needed
            alloc = OSD_OP_INLINE_BUF_COUNT;
            buf = inline_buf;
        }
        else if (buf == inline_buf)
        {
            // The inline array is full: move its contents to a heap array
            int old = alloc;
            // The previous formula ((alloc/16)*16 + 1) produced 17 from 16 and then
            // 17 again, so the array stopped growing and the next write overflowed it
            alloc = old * 2;
            buf = (iovec*)malloc(sizeof(iovec) * alloc);
            memcpy(buf, inline_buf, sizeof(iovec)*old);
        }
        else
        {
            // Already on the heap: grow geometrically
            alloc = alloc * 2;
            buf = (iovec*)realloc(buf, sizeof(iovec) * alloc);
        }
    }
    buf[count++] = { .iov_base = nbuf, .iov_len = len };
}
cluster_op_t *parent;
uint64_t offset;
uint32_t len;
pg_num_t pg_num;
osd_num_t osd_num;
void *buf;
bool sent;
bool done;
osd_op_t op;
};
struct blockstore_op_t;
struct osd_primary_op_data_t;
struct osd_op_t
struct cluster_op_t
{
timespec tv_begin;
uint64_t op_type = OSD_OP_IN;
int peer_fd;
osd_any_op_t req;
osd_any_reply_t reply;
blockstore_op_t *bs_op = NULL;
void *buf = NULL;
void *rmw_buf = NULL;
osd_primary_op_data_t* op_data = NULL;
std::function<void(osd_op_t*)> callback;
osd_op_buf_list_t send_list;
~osd_op_t();
uint64_t opcode; // OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC
uint64_t inode;
uint64_t offset;
uint64_t len;
int retval;
void *buf;
std::function<void(cluster_op_t*)> callback;
protected:
bool needs_reslice = false;
int sent_count = 0, done_count = 0;
std::vector<cluster_op_part_t> parts;
friend class cluster_client_t;
};
// Per-connection state kept in the `clients` map, keyed by the socket fd.
// Holds the peer address, the connection state and the read/write progress.
struct osd_client_t
{
sockaddr_in peer_addr;
int peer_port;
int peer_fd;
// PEER_CONNECTING or PEER_CONNECTED
int peer_state;
// Id of the connect timeout timer, -1 when no timer is pending
int connect_timeout_id = -1;
// Non-zero when the peer is an OSD; 0 is logged as a "regular client" by stop_client()
osd_num_t osd_num = 0;

// Receive buffer; allocated with malloc(receive_buffer_size) for outgoing connections and freed in stop_client()
void *in_buf = NULL;

// Read state
// Incremented by handle_peer_epoll() on every EPOLLIN event
int read_ready = 0;
osd_op_t *read_op = NULL;
int read_reply_id = 0;
iovec read_iov;
msghdr read_msg;
void *read_buf = NULL;
int read_remaining = 0;
// NOTE(review): presumably one of the CL_READ_* constants - confirm
int read_state = 0;

// Outbound operations sent to this peer
std::map<int, osd_op_t*> sent_ops;

// Outbound messages (replies or requests)
std::deque<osd_op_t*> outbox;

// PGs dirtied by this client's primary-writes (FIXME to drop the connection)
std::set<pg_num_t> dirty_pgs;

// Write state
osd_op_t *write_op = NULL;
msghdr write_msg;
// NOTE(review): presumably one of the CL_WRITE_* constants - confirm
int write_state = 0;
};
// Bookkeeping for an OSD peer we want to be connected to but are not yet.
struct osd_wanted_peer_t
{
// JSON array of candidate addresses, tried one by one in index order
json11::Json address_list;
// Port used for every address of the list
int port;
// time(NULL) of the moment when the whole address list was last exhausted;
// compared against peer_connect_interval before a new attempt is started
time_t last_connect_attempt;
// `connecting` is reset by on_connect_peer(); NOTE(review): where it is set is not visible here - confirm.
// `address_changed` makes the next failure handler restart from the first address
bool connecting, address_changed;
// Index of the address currently being tried
int address_index;
// Address and port of the attempt in progress, used in the failure log message
std::string cur_addr;
int cur_port;
};
// Zero-initialized operation counters, one slot per value in 0..OSD_OP_MAX.
// NOTE(review): slots are presumably indexed by opcode and `sum` presumably
// accumulates latency - confirm against the code that updates them.
struct osd_op_stats_t
{
uint64_t op_stat_sum[OSD_OP_MAX+1] = { 0 };
uint64_t op_stat_count[OSD_OP_MAX+1] = { 0 };
uint64_t op_stat_bytes[OSD_OP_MAX+1] = { 0 };
uint64_t subop_stat_sum[OSD_OP_MAX+1] = { 0 };
uint64_t subop_stat_count[OSD_OP_MAX+1] = { 0 };
};
struct cluster_client_t
class cluster_client_t
{
timerfd_manager_t *tfd;
ring_loop_t *ringloop;
// osd_num_t is only for logging and asserts
osd_num_t osd_num;
int receive_buffer_size = 9000;
int peer_connect_interval = 5;
int peer_connect_timeout = 5;
int log_level = 0;
uint64_t pg_part_count = 2;
uint64_t pg_stripe_size = 0;
uint64_t bs_block_size = 0;
uint64_t bs_disk_alignment = 0;
uint64_t bs_bitmap_granularity = 0;
uint64_t pg_count = 0;
bool immediate_commit = false;
bool inmemory_commit = false;
uint64_t inmemory_dirty_limit = 32*1024*1024;
int log_level;
std::map<osd_num_t, osd_wanted_peer_t> wanted_peers;
std::map<uint64_t, int> osd_peer_fds;
uint64_t next_subop_id = 1;
uint64_t op_id = 1;
etcd_state_client_t st_cli;
osd_messenger_t msgr;
std::set<cluster_op_t*> sent_ops, unsent_ops;
// unsynced operations are copied in memory to allow replay when cluster isn't in the immediate_commit mode
std::vector<cluster_op_t*> unsynced_ops;
uint64_t unsynced_bytes = 0;
std::map<int, osd_client_t> clients;
std::vector<int> read_ready_clients;
std::vector<int> write_ready_clients;
public:
cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
void execute(cluster_op_t *op);
// op statistics
osd_op_stats_t stats;
// public
void connect_peer(uint64_t osd_num, json11::Json address_list, int port);
void stop_client(int peer_fd);
void outbox_push(osd_op_t *cur_op);
std::function<void(osd_op_t*)> exec_op;
std::function<void(osd_num_t)> repeer_pgs;
// private
void try_connect_peer(uint64_t osd_num);
void try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port);
void handle_connect_epoll(int peer_fd);
void handle_peer_epoll(int peer_fd, int epoll_events);
void on_connect_peer(osd_num_t peer_osd, int peer_fd);
void check_peer_config(osd_client_t & cl);
void cancel_osd_ops(osd_client_t & cl);
void cancel_out_op(osd_op_t *op);
bool try_send(osd_client_t & cl);
void send_replies();
void handle_send(ring_data_t *data, int peer_fd);
void read_requests();
void handle_read(ring_data_t *data, int peer_fd);
void handle_finished_read(osd_client_t & cl);
void handle_op_hdr(osd_client_t *cl);
void handle_reply_hdr(osd_client_t *cl);
protected:
void continue_ops();
void on_load_config_hook(json11::Json::object & cfg);
void on_load_pgs_hook(bool success);
void on_change_hook(json11::Json::object & changes);
void on_change_osd_state_hook(uint64_t peer_osd);
bool try_send(cluster_op_t *op, cluster_op_part_t *part);
void handle_op_part(cluster_op_part_t *part);
};

92
epoll_manager.cpp Normal file
View File

@@ -0,0 +1,92 @@
#include <sys/epoll.h>
#include <sys/poll.h>
#include <unistd.h>
#include "epoll_manager.h"
#define MAX_EPOLL_EVENTS 64
// Creates the epoll instance, the timer manager that registers its fds through
// set_fd_handler(), and arms the first poll of the epoll fd on the ring loop.
// Throws std::runtime_error when the epoll instance can't be created.
epoll_manager_t::epoll_manager_t(ring_loop_t *ringloop)
{
    this->ringloop = ringloop;
    const int new_epoll_fd = epoll_create(1);
    epoll_fd = new_epoll_fd;
    if (new_epoll_fd < 0)
    {
        throw std::runtime_error(std::string("epoll_create: ") + strerror(errno));
    }
    // The timer manager gets a callback that forwards its fd registrations to us
    auto register_fd = [this](int fd, std::function<void(int, int)> handler)
    {
        set_fd_handler(fd, handler);
    };
    tfd = new timerfd_manager_t(register_fd);
    handle_epoll_events();
}
// Destroys the owned timer manager first, then closes the epoll fd.
epoll_manager_t::~epoll_manager_t()
{
    // `delete` on a null pointer is a no-op, so no explicit check is required
    delete tfd;
    tfd = NULL;
    close(epoll_fd);
}
// Installs, replaces or (when `handler` is empty) removes the callback of `fd`.
// Registered fds are watched edge-triggered for input, output and peer hang-up.
// Throws std::runtime_error when epoll_ctl() fails, except for ENOENT on removal.
void epoll_manager_t::set_fd_handler(int fd, std::function<void(int, int)> handler)
{
    if (!handler)
    {
        // Removal; an fd that was never registered (ENOENT) is not an error
        const int del_res = epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, NULL);
        if (del_res < 0 && errno != ENOENT)
        {
            throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
        }
        epoll_handlers.erase(fd);
        return;
    }
    // A known fd is modified in place, a new one is added
    const int ctl_op = epoll_handlers.count(fd) > 0 ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
    epoll_event watch;
    watch.data.fd = fd;
    watch.events = EPOLLOUT | EPOLLIN | EPOLLRDHUP | EPOLLET;
    if (epoll_ctl(epoll_fd, ctl_op, fd, &watch) < 0)
    {
        throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
    }
    epoll_handlers[fd] = handler;
}
// Re-arms a poll of the epoll fd on the ring loop and then dispatches all
// currently pending epoll events to the registered handlers.
// Throws std::runtime_error when no SQE is available or when the poll fails.
void epoll_manager_t::handle_epoll_events()
{
    {
        // Trace output: timestamp of every epoll pass
        timespec now;
        clock_gettime(CLOCK_REALTIME, &now);
        printf("epoll %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
    }
    // The poll is armed BEFORE draining the events: the fds are edge-triggered,
    // so an event arriving between the drain and the re-arm would be lost otherwise
    io_uring_sqe *sqe = ringloop->get_sqe();
    if (!sqe)
    {
        throw std::runtime_error("can't get SQE, will fall out of sync with EPOLLET");
    }
    ring_data_t *data = ((ring_data_t*)sqe->user_data);
    my_uring_prep_poll_add(sqe, epoll_fd, POLLIN);
    data->callback = [this](ring_data_t *data)
    {
        if (data->res < 0)
        {
            throw std::runtime_error(std::string("epoll failed: ") + strerror(-data->res));
        }
        handle_epoll_events();
    };
    ringloop->submit();
    int nfds;
    epoll_event events[MAX_EPOLL_EVENTS];
    do
    {
        nfds = epoll_wait(epoll_fd, events, MAX_EPOLL_EVENTS, 0);
        for (int i = 0; i < nfds; i++)
        {
            // A handler invoked earlier in this batch may have unregistered this fd.
            // Look it up without inserting and skip the event if it is gone
            // (operator[] would insert an empty std::function and calling it throws).
            auto handler_it = epoll_handlers.find(events[i].data.fd);
            if (handler_it == epoll_handlers.end())
            {
                continue;
            }
            // Handlers are allowed to replace or remove themselves through
            // set_fd_handler(), which destroys the stored std::function.
            // Invoke a copy so the running callable stays alive.
            std::function<void(int, int)> cb = handler_it->second;
            cb(events[i].data.fd, events[i].events);
        }
    } while (nfds == MAX_EPOLL_EVENTS);
}

20
epoll_manager.h Normal file
View File

@@ -0,0 +1,20 @@
#pragma once
#include <map>
#include "ringloop.h"
#include "timerfd_manager.h"
// Dispatches epoll events to per-fd callbacks, driven by the ring loop.
// Owns the timer manager `tfd`: it is created in the constructor and deleted
// in the destructor.
class epoll_manager_t
{
// epoll instance, created in the constructor and closed in the destructor
int epoll_fd;
// Ring loop on which the epoll fd itself is polled (not owned)
ring_loop_t *ringloop;
// fd -> callback(fd, epoll_events)
std::map<int, std::function<void(int, int)>> epoll_handlers;
public:
epoll_manager_t(ring_loop_t *ringloop);
~epoll_manager_t();
// Installs or replaces the callback of `fd`; an empty handler unregisters the fd
void set_fd_handler(int fd, std::function<void(int, int)> handler);
// Re-arms the poll of the epoll fd and dispatches pending events
void handle_epoll_events();

// Timer manager owned by this object
timerfd_manager_t *tfd;
};

View File

@@ -43,6 +43,53 @@ void etcd_state_client_t::etcd_call(std::string api, json11::Json payload, int t
http_request_json(tfd, etcd_address, req, timeout, callback);
}
// Reads etcd connection settings from `config`:
// - "etcd_address": either a comma-separated string or an array of strings.
//   Empty entries are skipped; an address without a '/' gets "/v3" appended.
// - "etcd_prefix": key prefix, "/microceph" when empty; a leading '/' is added if missing.
// - "log_level": integer log level.
void etcd_state_client_t::parse_config(json11::Json & config)
{
    // Normalizes one address and stores it
    auto add_address = [this](std::string addr)
    {
        if (addr.empty())
        {
            return;
        }
        // std::string::find() returns the unsigned npos when nothing is found,
        // so it has to be compared with npos - a `< 0` check is never true
        if (addr.find('/') == std::string::npos)
        {
            addr += "/v3";
        }
        this->etcd_addresses.push_back(addr);
    };
    this->etcd_addresses.clear();
    if (config["etcd_address"].is_string())
    {
        const std::string ea = config["etcd_address"].string_value();
        size_t start = 0;
        while (true)
        {
            const size_t comma = ea.find(',', start);
            if (comma == std::string::npos)
            {
                // Last (or only) item of the list
                add_address(ea.substr(start));
                break;
            }
            add_address(ea.substr(start, comma - start));
            start = comma + 1;
        }
    }
    else if (config["etcd_address"].array_items().size())
    {
        for (auto & ea: config["etcd_address"].array_items())
        {
            add_address(ea.string_value());
        }
    }
    this->etcd_prefix = config["etcd_prefix"].string_value();
    if (this->etcd_prefix == "")
    {
        this->etcd_prefix = "/microceph";
    }
    else if (this->etcd_prefix[0] != '/')
    {
        this->etcd_prefix = "/"+this->etcd_prefix;
    }
    this->log_level = config["log_level"].int64_value();
}
void etcd_state_client_t::start_etcd_watcher()
{
std::string etcd_address = etcd_addresses[rand() % etcd_addresses.size()];
@@ -93,7 +140,10 @@ void etcd_state_client_t::start_etcd_watcher()
parse_state(kv.first, kv.second);
}
// React to changes
on_change_hook(changes);
if (on_change_hook != NULL)
{
on_change_hook(changes);
}
}
}
if (msg->eof)
@@ -208,7 +258,7 @@ void etcd_state_client_t::load_pgs()
},
};
json11::Json::object req = { { "success", txn } };
json11::Json checks = load_pgs_checks_hook();
json11::Json checks = load_pgs_checks_hook != NULL ? load_pgs_checks_hook() : json11::Json();
if (checks.array_items().size() > 0)
{
req["compare"] = checks;

View File

@@ -1,5 +1,6 @@
#pragma once
#include "osd_id.h"
#include "http_client.h"
#include "timerfd_manager.h"
@@ -56,4 +57,5 @@ struct etcd_state_client_t
void load_global_config();
void load_pgs();
void parse_state(const std::string & key, const json11::Json & value);
void parse_config(json11::Json & config);
};

298
fio_cluster.cpp Normal file
View File

@@ -0,0 +1,298 @@
// FIO engine to test cluster I/O
//
// Random write:
//
// fio -thread -ioengine=./libfio_cluster.so -name=test -bs=4k -direct=1 -fsync=16 -iodepth=16 -rw=randwrite \
// -etcd=127.0.0.1:2379 [-etcd_prefix=/microceph] -size=1000M
//
// Linear write:
//
// fio -thread -ioengine=./libfio_cluster.so -name=test -bs=128k -direct=1 -fsync=32 -iodepth=32 -rw=write \
// -etcd=127.0.0.1:2379 [-etcd_prefix=/microceph] -size=1000M
//
// Random read (run with -iodepth=32 or -iodepth=1):
//
// fio -thread -ioengine=./libfio_cluster.so -name=test -bs=4k -direct=1 -iodepth=32 -rw=randread \
// -etcd=127.0.0.1:2379 [-etcd_prefix=/microceph] -size=1000M
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <vector>
#include <unordered_map>
#include "epoll_manager.h"
#include "cluster_client.h"
extern "C" {
#define CONFIG_HAVE_GETTID
#define CONFIG_PWRITEV2
#include "fio/fio.h"
#include "fio/optgroup.h"
}
// Per-thread engine state, stored in td->io_ops_data.
struct sec_data
{
// Event loop, epoll manager and cluster client; created in sec_init(), deleted in sec_cleanup()
ring_loop_t *ringloop = NULL;
epoll_manager_t *epmgr = NULL;
cluster_client_t *cli = NULL;
// True when the last queued operation was a sync; a repeated sync is then completed immediately
bool last_sync = false;
/* The list of completed io_u structs. */
std::vector<io_u*> completed;
// op_n: number of operations queued so far (used as the trace id); inflight: operations not yet completed
uint64_t op_n = 0, inflight = 0;
// Print a trace line when an operation completes
bool trace = false;
};
// Engine options filled by fio according to the `options` table (via offsetof).
struct sec_options
{
// NOTE(review): unused leading member, presumably the padding fio engines keep at offset 0 - confirm
int __pad;
// etcd address, HOST:PORT[/PATH]
char *etcd_host = NULL;
// etcd key prefix; sec_init() falls back to "/microceph" when it is NULL
char *etcd_prefix = NULL;
// Inode the test I/O is issued against
int inode = 0;
// Non-zero enables tracing of operations
int trace = 0;
};
// Command-line options of the engine. Each entry stores its value into the
// sec_options member at offset `off1`; the table ends with a NULL-named entry.
static struct fio_option options[] = {
    {
        .name   = "etcd",
        .lname  = "etcd address",
        .type   = FIO_OPT_STR_STORE,
        .off1   = offsetof(struct sec_options, etcd_host),
        .help   = "etcd address in the form HOST:PORT[/PATH]",
        .category = FIO_OPT_C_ENGINE,
        .group  = FIO_OPT_G_FILENAME,
    },
    {
        // Was also named "etcd", which clashed with the address option above
        // and made the documented -etcd_prefix option unavailable
        .name   = "etcd_prefix",
        .lname  = "etcd key prefix",
        .type   = FIO_OPT_STR_STORE,
        .off1   = offsetof(struct sec_options, etcd_prefix),
        .help   = "etcd key prefix, by default /microceph",
        .category = FIO_OPT_C_ENGINE,
        .group  = FIO_OPT_G_FILENAME,
    },
    {
        .name   = "inode",
        .lname  = "inode to run tests on",
        .type   = FIO_OPT_INT,
        .off1   = offsetof(struct sec_options, inode),
        .help   = "inode to run tests on (1 by default)",
        .category = FIO_OPT_C_ENGINE,
        .group  = FIO_OPT_G_FILENAME,
    },
    {
        .name   = "osd_trace",
        .lname  = "OSD trace",
        .type   = FIO_OPT_BOOL,
        .off1   = offsetof(struct sec_options, trace),
        .help   = "Trace OSD operations",
        .def    = "0",
        .category = FIO_OPT_C_ENGINE,
        .group  = FIO_OPT_G_FILENAME,
    },
    {
        .name = NULL,
    },
};
static int sec_setup(struct thread_data *td)
{
sec_data *bsd;
bsd = new sec_data;
if (!bsd)
{
td_verror(td, errno, "calloc");
return 1;
}
td->io_ops_data = bsd;
if (!td->files_index)
{
add_file(td, "osd_cluster", 0, 0);
td->o.nr_files = td->o.nr_files ? : 1;
td->o.open_files++;
}
return 0;
}
static void sec_cleanup(struct thread_data *td)
{
sec_data *bsd = (sec_data*)td->io_ops_data;
if (bsd)
{
delete bsd->cli;
delete bsd->epmgr;
delete bsd->ringloop;
bsd->cli = NULL;
bsd->epmgr = NULL;
bsd->ringloop = NULL;
}
}
/* Connect to the server from each thread. */
static int sec_init(struct thread_data *td)
{
sec_options *o = (sec_options*)td->eo;
sec_data *bsd = (sec_data*)td->io_ops_data;
json11::Json cfg = json11::Json::object {
{ "etcd_address", std::string(o->etcd_host) },
{ "etcd_prefix", std::string(o->etcd_prefix ? o->etcd_prefix : "/microceph") },
};
bsd->ringloop = new ring_loop_t(512);
bsd->epmgr = new epoll_manager_t(bsd->ringloop);
bsd->cli = new cluster_client_t(bsd->ringloop, bsd->epmgr->tfd, cfg);
bsd->trace = o->trace ? true : false;
return 0;
}
/* Begin read or write request. */
static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
{
sec_options *opt = (sec_options*)td->eo;
sec_data *bsd = (sec_data*)td->io_ops_data;
int n = bsd->op_n;
fio_ro_check(td, io);
if (io->ddir == DDIR_SYNC && bsd->last_sync)
{
return FIO_Q_COMPLETED;
}
io->engine_data = bsd;
cluster_op_t *op = new cluster_op_t;
switch (io->ddir)
{
case DDIR_READ:
op->opcode = OSD_OP_READ;
op->inode = opt->inode;
op->offset = io->offset;
op->len = io->xfer_buflen;
op->buf = io->xfer_buf;
bsd->last_sync = false;
break;
case DDIR_WRITE:
op->opcode = OSD_OP_WRITE;
op->inode = opt->inode;
op->offset = io->offset;
op->len = io->xfer_buflen;
op->buf = io->xfer_buf;
bsd->last_sync = false;
break;
case DDIR_SYNC:
op->opcode = OSD_OP_SYNC;
bsd->last_sync = true;
break;
default:
io->error = EINVAL;
return FIO_Q_COMPLETED;
}
op->callback = [io, n](cluster_op_t *op)
{
io->error = op->retval < 0 ? -op->retval : 0;
sec_data *bsd = (sec_data*)io->engine_data;
bsd->inflight--;
bsd->completed.push_back(io);
if (bsd->trace)
{
printf("--- %s n=%d retval=%d\n", io->ddir == DDIR_READ ? "READ" :
(io->ddir == DDIR_WRITE ? "WRITE" : "SYNC"), n, op->retval);
}
delete op;
};
if (opt->trace)
{
printf("+++ %s # %d\n", io->ddir == DDIR_READ ? "READ" :
(io->ddir == DDIR_WRITE ? "WRITE" : "SYNC"), n);
}
io->error = 0;
bsd->inflight++;
bsd->op_n++;
bsd->cli->execute(op);
if (io->error != 0)
return FIO_Q_COMPLETED;
return FIO_Q_QUEUED;
}
static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int max, const struct timespec *t)
{
sec_data *bsd = (sec_data*)td->io_ops_data;
while (true)
{
bsd->ringloop->loop();
if (bsd->completed.size() >= min)
break;
bsd->ringloop->wait();
}
return bsd->completed.size();
}
static struct io_u *sec_event(struct thread_data *td, int event)
{
sec_data *bsd = (sec_data*)td->io_ops_data;
if (bsd->completed.size() == 0)
return NULL;
/* FIXME We ignore the event number and assume fio calls us exactly once for [0..nr_events-1] */
struct io_u *ev = bsd->completed.back();
bsd->completed.pop_back();
return ev;
}
// fio "io_u_init" hook: a fresh io_u has no engine data until sec_queue() sets it.
static int sec_io_u_init(struct thread_data *td, struct io_u *io)
{
io->engine_data = NULL;
return 0;
}
// fio "io_u_free" hook: nothing to release, engine_data points to the shared per-thread state.
static void sec_io_u_free(struct thread_data *td, struct io_u *io)
{
}
// fio "open_file" hook: does nothing and reports success.
static int sec_open_file(struct thread_data *td, struct fio_file *f)
{
return 0;
}
// fio "invalidate" hook: does nothing and reports success.
static int sec_invalidate(struct thread_data *td, struct fio_file *f)
{
return 0;
}
// Engine descriptor: binds the sec_* hooks above and the option table to fio.
struct ioengine_ops ioengine = {
.name = "microceph_cluster",
.version = FIO_IOOPS_VERSION,
.flags = FIO_MEMALIGN | FIO_DISKLESSIO | FIO_NOEXTEND,
.setup = sec_setup,
.init = sec_init,
.queue = sec_queue,
.getevents = sec_getevents,
.event = sec_event,
.cleanup = sec_cleanup,
.open_file = sec_open_file,
.invalidate = sec_invalidate,
.io_u_init = sec_io_u_init,
.io_u_free = sec_io_u_free,
// fio allocates and fills a sec_options of this size from `options`
.option_struct_size = sizeof(struct sec_options),
.options = options,
};
// Registers the engine with fio; `fio_init` is a fio-provided annotation
// (NOTE(review): presumably a constructor attribute run at library load - confirm).
static void fio_init fio_sec_register(void)
{
register_ioengine(&ioengine);
}
// Counterpart of fio_sec_register(): removes the engine from fio.
static void fio_exit fio_sec_unregister(void)
{
unregister_ioengine(&ioengine);
}

View File

@@ -5,7 +5,7 @@
// Random write:
//
// fio -thread -ioengine=./libfio_sec_osd.so -name=test -bs=4k -direct=1 -fsync=16 -iodepth=16 -rw=randwrite \
// -host=127.0.0.1 -port=11203 [-single_primary=1] -size=1000M
// -host=127.0.0.1 -port=11203 [-block_size_order=17] [-single_primary=1] -size=1000M
//
// Linear write:
//
@@ -53,6 +53,7 @@ struct sec_options
int port = 0;
int single_primary = 0;
int trace = 0;
int block_order = 17;
};
static struct fio_option options[] = {
@@ -74,6 +75,15 @@ static struct fio_option options[] = {
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_FILENAME,
},
{
.name = "block_size_order",
.lname = "Blockstore block size order",
.type = FIO_OPT_INT,
.off1 = offsetof(struct sec_options, block_order),
.help = "Blockstore block size order (size = 2^order)",
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_FILENAME,
},
{
.name = "single_primary",
.lname = "Single Primary",
@@ -140,6 +150,8 @@ static int sec_init(struct thread_data *td)
{
sec_options *o = (sec_options*)td->eo;
sec_data *bsd = (sec_data*)td->io_ops_data;
bsd->block_order = o->block_order == 0 ? 17 : o->block_order;
bsd->block_size = 1 << o->block_order;
struct sockaddr_in addr;
int r;

View File

@@ -50,8 +50,15 @@ struct http_co_t
websocket_t ws;
int onstack = 0;
bool ended = false;
~http_co_t();
inline void stackin() { onstack++; }
inline void stackout() { onstack--; if (!onstack && ended) end(); }
inline void end() { ended = true; if (!onstack) { delete this; } }
void start_connection();
void handle_events();
void handle_connect_result();
void submit_read();
void submit_send();
@@ -137,7 +144,7 @@ void websocket_t::post_message(int type, const std::string & msg)
void websocket_t::close()
{
delete co;
co->end();
}
http_co_t::~http_co_t()
@@ -173,14 +180,15 @@ http_co_t::~http_co_t()
void http_co_t::start_connection()
{
stackin();
int port = extract_port(host);
struct sockaddr_in addr;
int r;
if ((r = inet_pton(AF_INET, host.c_str(), &addr.sin_addr)) != 1)
{
parsed.error_code = ENXIO;
// FIXME 'delete this' is ugly...
delete this;
stackout();
end();
return;
}
addr.sin_family = AF_INET;
@@ -189,7 +197,8 @@ void http_co_t::start_connection()
if (peer_fd < 0)
{
parsed.error_code = errno;
delete this;
stackout();
end();
return;
}
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
@@ -201,71 +210,81 @@ void http_co_t::start_connection()
{
parsed.error_code = ETIME;
}
delete this;
end();
});
}
tfd->set_fd_handler(peer_fd, [this](int peer_fd, int epoll_events)
{
this->epoll_events |= epoll_events;
if (state == HTTP_CO_CONNECTING)
{
handle_connect_result();
}
else
{
if (this->epoll_events & EPOLLIN)
{
submit_read();
}
else if (this->epoll_events & (EPOLLRDHUP|EPOLLERR))
{
delete this;
}
}
});
epoll_events = 0;
// Finally call connect
r = ::connect(peer_fd, (sockaddr*)&addr, sizeof(addr));
if (r < 0 && errno != EINPROGRESS)
{
parsed.error_code = errno;
delete this;
stackout();
end();
return;
}
tfd->set_fd_handler(peer_fd, [this](int peer_fd, int epoll_events)
{
this->epoll_events |= epoll_events;
handle_events();
});
state = HTTP_CO_CONNECTING;
stackout();
}
void http_co_t::handle_events()
{
stackin();
while (epoll_events)
{
if (state == HTTP_CO_CONNECTING)
{
handle_connect_result();
}
else
{
epoll_events &= ~EPOLLOUT;
if (epoll_events & EPOLLIN)
{
submit_read();
}
else if (epoll_events & (EPOLLRDHUP|EPOLLERR))
{
end();
break;
}
}
}
stackout();
}
void http_co_t::handle_connect_result()
{
if (epoll_events & (EPOLLOUT | EPOLLERR))
stackin();
int result = 0;
socklen_t result_len = sizeof(result);
if (getsockopt(peer_fd, SOL_SOCKET, SO_ERROR, &result, &result_len) < 0)
{
int result = 0;
socklen_t result_len = sizeof(result);
if (getsockopt(peer_fd, SOL_SOCKET, SO_ERROR, &result, &result_len) < 0)
{
result = errno;
}
if (result != 0)
{
parsed.error_code = result;
delete this;
return;
}
int one = 1;
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
state = HTTP_CO_SENDING_REQUEST;
submit_send();
result = errno;
}
else
if (result != 0)
{
delete this;
parsed.error_code = result;
stackout();
end();
return;
}
int one = 1;
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
state = HTTP_CO_SENDING_REQUEST;
submit_send();
stackout();
}
void http_co_t::submit_read()
{
stackin();
int res;
again:
if (rbuf.size() != READ_BUFFER_SIZE)
{
rbuf.resize(READ_BUFFER_SIZE);
@@ -273,39 +292,30 @@ again:
read_iov = { .iov_base = rbuf.data(), .iov_len = READ_BUFFER_SIZE };
read_msg.msg_iov = &read_iov;
read_msg.msg_iovlen = 1;
epoll_events = epoll_events & ~EPOLLIN;
res = recvmsg(peer_fd, &read_msg, 0);
if (res < 0)
{
res = -errno;
}
if (res == -EAGAIN)
if (res == -EAGAIN || res == 0)
{
res = 0;
epoll_events = epoll_events & ~EPOLLIN;
}
if (res < 0)
else if (res < 0)
{
delete this;
return;
end();
}
response += std::string(rbuf.data(), res);
if (res == READ_BUFFER_SIZE)
else if (res > 0)
{
goto again;
}
if (!handle_read())
{
return;
}
if (res < READ_BUFFER_SIZE && (epoll_events & (EPOLLRDHUP|EPOLLERR)))
{
delete this;
return;
response += std::string(rbuf.data(), res);
handle_read();
}
stackout();
}
void http_co_t::submit_send()
{
stackin();
int res;
again:
if (sent < request.size())
@@ -313,7 +323,7 @@ again:
send_iov = (iovec){ .iov_base = (void*)(request.c_str()+sent), .iov_len = request.size()-sent };
send_msg.msg_iov = &send_iov;
send_msg.msg_iovlen = 1;
res = sendmsg(peer_fd, &send_msg, 0);
res = sendmsg(peer_fd, &send_msg, MSG_NOSIGNAL);
if (res < 0)
{
res = -errno;
@@ -324,14 +334,17 @@ again:
}
else if (res < 0)
{
delete this;
stackout();
end();
return;
}
sent += res;
if (state == HTTP_CO_SENDING_REQUEST)
{
if (sent >= request.size())
{
state = HTTP_CO_REQUEST_SENT;
}
else
goto again;
}
@@ -342,10 +355,12 @@ again:
goto again;
}
}
stackout();
}
bool http_co_t::handle_read()
{
stackin();
if (state == HTTP_CO_REQUEST_SENT)
{
int pos = response.find("\r\n\r\n");
@@ -380,7 +395,8 @@ bool http_co_t::handle_read()
if (!target_response_size)
{
// Sorry, unsupported response
delete this;
stackout();
end();
return false;
}
}
@@ -388,7 +404,8 @@ bool http_co_t::handle_read()
}
if (state == HTTP_CO_HEADERS_RECEIVED && target_response_size > 0 && response.size() >= target_response_size)
{
delete this;
stackout();
end();
return false;
}
if (state == HTTP_CO_CHUNKED && response.size() > 0)
@@ -416,7 +433,8 @@ bool http_co_t::handle_read()
}
if (parsed.eof)
{
delete this;
stackout();
end();
return false;
}
if (want_streaming && parsed.body.size() > 0)
@@ -433,11 +451,13 @@ bool http_co_t::handle_read()
parsed.body = "";
}
}
stackout();
return true;
}
void http_co_t::post_message(int type, const std::string & msg)
{
stackin();
if (state == HTTP_CO_WEBSOCKET)
{
request += ws_format_frame(type, msg.size());
@@ -449,6 +469,7 @@ void http_co_t::post_message(int type, const std::string & msg)
ws_outbox += ws_format_frame(type, msg.size());
ws_outbox += msg;
}
stackout();
}
uint64_t stoull_full(const std::string & str, int base)

398
messenger.cpp Normal file
View File

@@ -0,0 +1,398 @@
#include <unistd.h>
#include <fcntl.h>
#include <sys/socket.h>
#include <sys/epoll.h>
#include <netinet/tcp.h>
#include "messenger.h"
// Releases the buffers owned by the operation. The blockstore operation and
// the primary-op data must already be detached when the object is destroyed.
osd_op_t::~osd_op_t()
{
    assert(!bs_op);
    assert(!op_data);
    // free() accepts NULL, so both buffers are released unconditionally
    free(rmw_buf);
    // Note: reusing osd_op_t WILL currently lead to memory leaks
    // So we don't reuse it, but free it every time
    free(buf);
}
// Records (or refreshes) the addresses and port of a peer OSD we want to be
// connected to and starts a connection attempt unless one is already running
// or the previous round of attempts ended less than peer_connect_interval
// seconds ago.
void osd_messenger_t::connect_peer(uint64_t peer_osd, json11::Json peer_state)
{
    const int peer_port = (int)peer_state["port"].int64_value();
    if (wanted_peers.count(peer_osd) == 0)
    {
        // Unknown peer: create the entry, all remaining fields start zeroed
        wanted_peers[peer_osd] = (osd_wanted_peer_t){
            .address_list = peer_state["addresses"],
            .port = peer_port,
        };
    }
    else
    {
        // Known peer: only refresh where it can be reached
        auto & known = wanted_peers[peer_osd];
        known.address_list = peer_state["addresses"];
        known.port = peer_port;
    }
    auto & wanted = wanted_peers[peer_osd];
    wanted.address_changed = true;
    const bool interval_passed = (time(NULL) - wanted.last_connect_attempt) >= peer_connect_interval;
    if (!wanted.connecting && interval_passed)
    {
        try_connect_peer(peer_osd);
    }
}
// Starts a connection attempt to the current address of a wanted peer.
// Does nothing for an unknown peer or when the address index is past the end
// of the list; a peer that already has a connection is dropped from the list.
void osd_messenger_t::try_connect_peer(uint64_t peer_osd)
{
    auto wanted_it = wanted_peers.find(peer_osd);
    if (wanted_it == wanted_peers.end())
    {
        return;
    }
    if (osd_peer_fds.count(peer_osd) > 0)
    {
        // Already connected, the peer isn't "wanted" anymore
        wanted_peers.erase(peer_osd);
        return;
    }
    auto & wanted = wanted_it->second;
    const auto & addresses = wanted.address_list.array_items();
    if (addresses.size() <= wanted.address_index)
    {
        return;
    }
    // Remember what is being tried, it is used when the attempt fails
    wanted.cur_addr = wanted.address_list[wanted.address_index].string_value();
    wanted.cur_port = wanted.port;
    try_connect_peer_addr(peer_osd, wanted.cur_addr.c_str(), wanted.cur_port);
}
// Starts a non-blocking TCP connection to `peer_host`:`peer_port` (port 11203
// when peer_port is 0) on behalf of OSD `peer_osd`.
// Immediate failures are reported through on_connect_peer(peer_osd, -errno).
// Otherwise a client entry in PEER_CONNECTING state is registered and the
// result is handled later by handle_connect_epoll() or by the connect timeout.
void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port)
{
    // Zero the whole structure, including the padding bytes
    struct sockaddr_in addr = {};
    int r;
    if ((r = inet_pton(AF_INET, peer_host, &addr.sin_addr)) != 1)
    {
        on_connect_peer(peer_osd, -EINVAL);
        return;
    }
    addr.sin_family = AF_INET;
    addr.sin_port = htons(peer_port ? peer_port : 11203);
    int peer_fd = socket(AF_INET, SOCK_STREAM, 0);
    if (peer_fd < 0)
    {
        on_connect_peer(peer_osd, -errno);
        return;
    }
    fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
    r = connect(peer_fd, (sockaddr*)&addr, sizeof(addr));
    if (r < 0 && errno != EINPROGRESS)
    {
        // Save errno first: close() is allowed to overwrite it
        int connect_errno = errno;
        close(peer_fd);
        on_connect_peer(peer_osd, -connect_errno);
        return;
    }
    assert(peer_osd != this->osd_num);
    // The timeout is armed only once the attempt is really in progress.
    // Arming it before connect() left a stray timer behind when connect()
    // failed immediately; it would later fire for an fd that is already
    // closed and may have been reused by another connection.
    int timeout_id = -1;
    if (peer_connect_timeout > 0)
    {
        timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, peer_fd](int timer_id)
        {
            osd_num_t peer_osd = clients[peer_fd].osd_num;
            stop_client(peer_fd);
            on_connect_peer(peer_osd, -EIO);
            return;
        });
    }
    clients[peer_fd] = (osd_client_t){
        .peer_addr = addr,
        .peer_port = peer_port,
        .peer_fd = peer_fd,
        .peer_state = PEER_CONNECTING,
        .connect_timeout_id = timeout_id,
        .osd_num = peer_osd,
        .in_buf = malloc(receive_buffer_size),
    };
    tfd->set_fd_handler(peer_fd, [this](int peer_fd, int epoll_events)
    {
        // Either OUT (connected) or HUP
        handle_connect_epoll(peer_fd);
    });
}
// Event handler for a socket with an outstanding non-blocking connect().
// Cancels the connect timeout, reads the outcome from SO_ERROR and either reports
// the failure via on_connect_peer() or switches the client to PEER_CONNECTED and
// starts the peer configuration check.
void osd_messenger_t::handle_connect_epoll(int peer_fd)
{
    auto & conn = clients[peer_fd];
    const int pending_timer = conn.connect_timeout_id;
    if (pending_timer >= 0)
    {
        tfd->clear_timer(pending_timer);
        conn.connect_timeout_id = -1;
    }
    // Copied out because <conn> is destroyed by stop_client() on the error path
    osd_num_t peer_osd = conn.osd_num;
    int so_error = 0;
    socklen_t so_error_len = sizeof(so_error);
    if (getsockopt(peer_fd, SOL_SOCKET, SO_ERROR, &so_error, &so_error_len) < 0)
        so_error = errno;
    if (so_error != 0)
    {
        stop_client(peer_fd);
        on_connect_peer(peer_osd, -so_error);
        return;
    }
    int enable = 1;
    setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &enable, sizeof(enable));
    conn.peer_state = PEER_CONNECTED;
    // FIXME Disable EPOLLOUT on this fd
    // From now on the socket is served by the regular peer handler
    tfd->set_fd_handler(peer_fd, [this](int peer_fd, int epoll_events)
    {
        handle_peer_epoll(peer_fd, epoll_events);
    });
    // Check OSD number
    check_peer_config(conn);
}
// Event handler for an established peer/client socket.
// EPOLLRDHUP takes precedence and disconnects the client; EPOLLIN bumps the
// client's read_ready counter and, on the 0 -> 1 transition, queues the fd for
// reading and wakes up the ring loop.
void osd_messenger_t::handle_peer_epoll(int peer_fd, int epoll_events)
{
    if (epoll_events & EPOLLRDHUP)
    {
        // Stop client
        printf("[OSD %lu] client %d disconnected\n", this->osd_num, peer_fd);
        stop_client(peer_fd);
        return;
    }
    if (!(epoll_events & EPOLLIN))
        return;
    // Mark client as ready (i.e. some data is available)
    auto & conn = clients[peer_fd];
    if (++conn.read_ready == 1)
    {
        read_ready_clients.push_back(conn.peer_fd);
        ringloop->wakeup();
    }
}
// Reports the outcome of a connection attempt to a wanted peer.
// peer_fd >= 0 is the descriptor of the established connection;
// peer_fd < 0 is a negated errno value describing the failure.
// Throws std::out_of_range (from map::at) if <peer_osd> is not a wanted peer.
void osd_messenger_t::on_connect_peer(osd_num_t peer_osd, int peer_fd)
{
    auto & wanted = wanted_peers.at(peer_osd);
    wanted.connecting = false;
    if (peer_fd >= 0)
    {
        printf("Connected with peer OSD %lu (fd %d)\n", peer_osd, peer_fd);
        wanted_peers.erase(peer_osd);
        repeer_pgs(peer_osd);
        return;
    }
    printf("Failed to connect to peer OSD %lu address %s port %d: %s\n", peer_osd, wanted.cur_addr.c_str(), wanted.cur_port, strerror(-peer_fd));
    if (wanted.address_changed)
    {
        // The address list was replaced meanwhile: start over from its first entry
        wanted.address_changed = false;
        wanted.address_index = 0;
        try_connect_peer(peer_osd);
        return;
    }
    if (wanted.address_index < wanted.address_list.array_items().size()-1)
    {
        // Try other addresses
        wanted.address_index++;
        try_connect_peer(peer_osd);
        return;
    }
    // Retry again in <peer_connect_interval> seconds
    wanted.last_connect_attempt = time(NULL);
    wanted.address_index = 0;
    tfd->set_timer(1000*peer_connect_interval, false, [this, peer_osd](int)
    {
        try_connect_peer(peer_osd);
    });
}
// Sends OSD_OP_SHOW_CONFIG to a freshly connected peer and verifies that the OSD
// number it reports matches the one we intended to connect to.
// On success the connection is published in <osd_peer_fds> and on_connect_peer()
// is notified; on any failure the connection is dropped.
void osd_messenger_t::check_peer_config(osd_client_t & cl)
{
    osd_op_t *op = new osd_op_t();
    op->op_type = OSD_OP_OUT;
    op->send_list.push_back(op->req.buf, OSD_PACKET_SIZE);
    op->peer_fd = cl.peer_fd;
    op->req = {
        .show_conf = {
            .header = {
                .magic = SECONDARY_OSD_OP_MAGIC,
                .id = this->next_subop_id++,
                .opcode = OSD_OP_SHOW_CONFIG,
            },
        },
    };
    op->callback = [this](osd_op_t *op)
    {
        // The callback is also invoked by cancel_op() from inside stop_client(),
        // i.e. after the client was erased. Looking it up with operator[] would
        // re-create an empty client and the stop_client() call below would close
        // the descriptor a second time. In that case there is nothing left to do.
        auto cl_it = clients.find(op->peer_fd);
        if (cl_it == clients.end())
        {
            delete op;
            return;
        }
        osd_client_t & cl = cl_it->second;
        std::string json_err;
        json11::Json config;
        bool err = false;
        if (op->reply.hdr.retval < 0)
        {
            err = true;
            printf("Failed to get config from OSD %lu (retval=%ld), disconnecting peer\n", cl.osd_num, op->reply.hdr.retval);
        }
        else
        {
            // The payload is exactly <retval> bytes received from the network and is not
            // guaranteed to be NUL-terminated (and is absent when retval == 0), so it
            // must be bounded explicitly. A terminator inside the payload, if the peer
            // sends one, still ends the text.
            std::string json_text;
            if (op->buf && op->reply.hdr.retval > 0)
            {
                json_text.assign((const char*)op->buf, (size_t)op->reply.hdr.retval);
                size_t nul_pos = json_text.find('\0');
                if (nul_pos != std::string::npos)
                {
                    json_text.resize(nul_pos);
                }
            }
            config = json11::Json::parse(json_text, json_err);
            if (json_err != "")
            {
                err = true;
                printf("Failed to get config from OSD %lu: bad JSON: %s, disconnecting peer\n", cl.osd_num, json_err.c_str());
            }
            else if (config["osd_num"].uint64_value() != cl.osd_num)
            {
                err = true;
                printf("Connected to OSD %lu instead of OSD %lu, peer state is outdated, disconnecting peer\n", config["osd_num"].uint64_value(), cl.osd_num);
                on_connect_peer(cl.osd_num, -1);
            }
        }
        if (err)
        {
            stop_client(op->peer_fd);
            delete op;
            return;
        }
        osd_peer_fds[cl.osd_num] = cl.peer_fd;
        on_connect_peer(cl.osd_num, cl.peer_fd);
        delete op;
    };
    outbox_push(op);
}
// Cancels everything the client still has in flight towards its peer:
// operations awaiting a reply, operations queued for sending and the
// operation that is currently being written, in that order.
void osd_messenger_t::cancel_osd_ops(osd_client_t & cl)
{
    for (const auto & id_and_op: cl.sent_ops)
        cancel_op(id_and_op.second);
    cl.sent_ops.clear();

    for (osd_op_t *queued: cl.outbox)
        cancel_op(queued);
    cl.outbox.clear();

    if (!cl.write_op)
        return;
    cancel_op(cl.write_op);
    cl.write_op = NULL;
}
// Cancels a single operation.
// Outbound operations are completed with a synthetic -EPIPE reply through their
// callback (which is expected to dispose of the operation); any other operation
// is simply destroyed.
void osd_messenger_t::cancel_op(osd_op_t *op)
{
    if (op->op_type != OSD_OP_OUT)
    {
        // This function is only called in stop_client(), so it's fine to destroy the operation
        delete op;
        return;
    }
    auto & reply_hdr = op->reply.hdr;
    reply_hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
    reply_hdr.id = op->req.hdr.id;
    reply_hdr.opcode = op->req.hdr.opcode;
    reply_hdr.retval = -EPIPE;
    // Copy lambda to be unaffected by `delete op`
    std::function<void(osd_op_t*)> on_done(op->callback);
    on_done(op);
}
// Disconnects a client or peer and releases everything attached to its descriptor.
// Unknown descriptors are ignored, so calling this twice for the same fd is harmless
// as long as the fd number has not been reused in between.
// If the client was an established connection to another OSD, repeer_pgs() is
// invoked for that OSD at the very end.
void osd_messenger_t::stop_client(int peer_fd)
{
assert(peer_fd != 0);
auto it = clients.find(peer_fd);
if (it == clients.end())
{
return;
}
uint64_t repeer_osd = 0;
// Work on a copy: the map entry is erased before the operations are cancelled,
// so callbacks running below no longer find this client in <clients>
osd_client_t cl = it->second;
if (cl.peer_state == PEER_CONNECTED)
{
if (cl.osd_num)
{
// Reload configuration from etcd when the connection is dropped
printf("[OSD %lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl.osd_num);
repeer_osd = cl.osd_num;
}
else
{
printf("[OSD %lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
}
}
clients.erase(it);
// Unregister the descriptor from the event loop
tfd->set_fd_handler(peer_fd, NULL);
if (cl.osd_num)
{
osd_peer_fds.erase(cl.osd_num);
// Cancel outbound operations
cancel_osd_ops(cl);
}
// Drop the partially received operation, if any
if (cl.read_op)
{
delete cl.read_op;
cl.read_op = NULL;
}
// Remove the descriptor from the pending-read queue (first occurrence only)
for (auto rit = read_ready_clients.begin(); rit != read_ready_clients.end(); rit++)
{
if (*rit == peer_fd)
{
read_ready_clients.erase(rit);
break;
}
}
// Remove the descriptor from the pending-write queue (first occurrence only)
for (auto wit = write_ready_clients.begin(); wit != write_ready_clients.end(); wit++)
{
if (*wit == peer_fd)
{
write_ready_clients.erase(wit);
break;
}
}
free(cl.in_buf);
close(peer_fd);
// Notify the owner last, when the client is completely gone
if (repeer_osd)
{
repeer_pgs(repeer_osd);
}
}
// Accepts all connections pending on the (non-blocking) listening socket and
// registers each of them as a connected client served by handle_peer_epoll().
//
// Returns when accept() reports that no more connections are pending.
// Throws std::runtime_error on an unrecoverable accept() error.
void osd_messenger_t::accept_connections(int listen_fd)
{
    while (true)
    {
        sockaddr_in addr;
        socklen_t peer_addr_size = sizeof(addr);
        int peer_fd = accept(listen_fd, (sockaddr*)&addr, &peer_addr_size);
        if (peer_fd < 0)
        {
            if (errno == EINTR || errno == ECONNABORTED)
            {
                // Interrupted by a signal, or the remote side gave up before we
                // accepted it. Neither is a reason to take the whole process down:
                // just try the next pending connection.
                continue;
            }
            if (errno == EAGAIN || errno == EWOULDBLOCK)
            {
                // Nothing more to accept
                break;
            }
            throw std::runtime_error(std::string("accept: ") + strerror(errno));
        }
        assert(peer_fd != 0);
        char peer_str[256];
        printf("[OSD %lu] new client %d: connection from %s port %d\n", this->osd_num, peer_fd,
            inet_ntop(AF_INET, &addr.sin_addr, peer_str, 256), ntohs(addr.sin_port));
        fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
        int one = 1;
        setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
        clients[peer_fd] = {
            .peer_addr = addr,
            .peer_port = ntohs(addr.sin_port),
            .peer_fd = peer_fd,
            .peer_state = PEER_CONNECTED,
            .in_buf = malloc(receive_buffer_size),
        };
        // Add FD to epoll
        tfd->set_fd_handler(peer_fd, [this](int peer_fd, int epoll_events)
        {
            handle_peer_epoll(peer_fd, epoll_events);
        });
    }
}

213
messenger.h Normal file
View File

@@ -0,0 +1,213 @@
#pragma once
#include <sys/types.h>
#include <stdint.h>
#include <arpa/inet.h>
#include <malloc.h>
#include <set>
#include <map>
#include <deque>
#include <vector>
#include "json11/json11.hpp"
#include "osd_ops.h"
#include "timerfd_manager.h"
#include "ringloop.h"
#define OSD_OP_IN 0
#define OSD_OP_OUT 1
#define CL_READ_HDR 1
#define CL_READ_DATA 2
#define CL_READ_REPLY_DATA 3
#define CL_WRITE_READY 1
#define CL_WRITE_REPLY 2
#define OSD_OP_INLINE_BUF_COUNT 16
#define PEER_CONNECTING 1
#define PEER_CONNECTED 2
#define DEFAULT_PEER_CONNECT_INTERVAL 5
#define DEFAULT_PEER_CONNECT_TIMEOUT 5
// A growable list of iovec entries describing the data to be sent for one operation.
//
// The first OSD_OP_INLINE_BUF_COUNT entries are kept in <inline_buf> to avoid heap
// allocations for typical operations; larger lists move to a heap array.
// <sent> counts the leading entries that were already sent completely.
//
// NOTE: the type owns <buf> but has implicit copy operations - copying an instance
// would alias (and double-free) the heap array, so instances must not be copied.
struct osd_op_buf_list_t
{
    int count = 0, alloc = 0, sent = 0;
    iovec *buf = NULL;
    iovec inline_buf[OSD_OP_INLINE_BUF_COUNT];

    ~osd_op_buf_list_t()
    {
        if (buf && buf != inline_buf)
        {
            free(buf);
        }
    }

    // Returns the first entry that is not sent yet
    inline iovec* get_iovec()
    {
        return (buf ? buf : inline_buf) + sent;
    }

    // Returns the number of entries that are not sent yet
    inline int get_size()
    {
        return count - sent;
    }

    // Appends a buffer of <len> bytes starting at <nbuf>; the memory is not copied
    inline void push_back(void *nbuf, size_t len)
    {
        if (count >= alloc)
        {
            if (!alloc)
            {
                // First use: start with the inline storage
                alloc = OSD_OP_INLINE_BUF_COUNT;
                buf = inline_buf;
            }
            else
            {
                // Grow to the next multiple of 16 entries. The previous formula,
                // (alloc/16)*16 + 1, yields 17 for both 16 and 17, i.e. it stopped
                // growing after the first step and the 18th entry was written
                // past the end of the heap array.
                int old = alloc;
                alloc = (alloc/16 + 1) * 16;
                if (buf == inline_buf)
                {
                    buf = (iovec*)malloc(sizeof(iovec) * alloc);
                    memcpy(buf, inline_buf, sizeof(iovec)*old);
                }
                else
                {
                    buf = (iovec*)realloc(buf, sizeof(iovec) * alloc);
                }
            }
        }
        buf[count].iov_base = nbuf;
        buf[count].iov_len = len;
        count++;
    }
};
struct blockstore_op_t;
struct osd_primary_op_data_t;
// A single operation travelling through the messenger: either a request received
// from a client (OSD_OP_IN) or a request sent by us to a peer (OSD_OP_OUT).
struct osd_op_t
{
// Start time of the operation
timespec tv_begin;
// OSD_OP_IN or OSD_OP_OUT
uint64_t op_type = OSD_OP_IN;
// Descriptor of the client this operation belongs to.
// Has no default initializer - must be set before the operation is queued
int peer_fd;
// Request and reply packets
osd_any_op_t req;
osd_any_reply_t reply;
// Must be NULL when the operation is destroyed (asserted in the destructor)
blockstore_op_t *bs_op = NULL;
// Data buffers, released with free() in the destructor
void *buf = NULL;
void *rmw_buf = NULL;
// Must be NULL when the operation is destroyed (asserted in the destructor)
osd_primary_op_data_t* op_data = NULL;
// Completion callback; for outbound operations it is also invoked on cancellation
std::function<void(osd_op_t*)> callback;
// Buffers to transmit for this operation
osd_op_buf_list_t send_list;
~osd_op_t();
};
// Per-connection state of the messenger, for both regular clients and OSD peers.
// NOTE: instances are created with designated initializers, which rely on the
// declaration order of the leading members - do not reorder them.
struct osd_client_t
{
sockaddr_in peer_addr;
int peer_port;
int peer_fd;
// PEER_CONNECTING or PEER_CONNECTED
int peer_state;
// Timer guarding an outgoing connection attempt, -1 if there is none
int connect_timeout_id = -1;
// Number of the peer OSD, 0 for regular (non-OSD) clients
osd_num_t osd_num = 0;
// Receive buffer, allocated with malloc() and released in stop_client()
void *in_buf = NULL;
// Read state
// Incremented on every EPOLLIN event; the fd is queued for reading on 0 -> 1
int read_ready = 0;
osd_op_t *read_op = NULL;
int read_reply_id = 0;
iovec read_iov;
msghdr read_msg;
void *read_buf = NULL;
int read_remaining = 0;
// 0 or one of CL_READ_*
int read_state = 0;
// Incoming operations
std::vector<osd_op_t*> received_ops;
// Outbound operations
std::deque<osd_op_t*> outbox;
// Operations awaiting a reply (cancelled when the client is stopped)
std::map<int, osd_op_t*> sent_ops;
// PGs dirtied by this client's primary-writes (FIXME to drop the connection)
std::set<pg_num_t> dirty_pgs;
// Write state
// Operation currently being written to the socket
osd_op_t *write_op = NULL;
msghdr write_msg;
int write_state = 0;
};
// Bookkeeping for an OSD we want to be connected to but are not connected to yet.
// NOTE: the scalar members have no default initializers; the visible code only
// creates instances in ways that zero them (map operator[] / aggregate initialization).
struct osd_wanted_peer_t
{
// JSON array of address strings to try, in order
json11::Json address_list;
int port;
// Time when the whole address list was last tried without success
time_t last_connect_attempt;
// connecting: cleared by on_connect_peer(), checked by connect_peer()
// address_changed: set by connect_peer(), makes the next failure restart from the first address
bool connecting, address_changed;
// Index in address_list of the address to try next
int address_index;
// Endpoint of the current attempt (used in the failure log message)
std::string cur_addr;
int cur_port;
};
// Zero-initialized statistic counters, one slot per opcode (0..OSD_OP_MAX),
// kept separately for operations and for sub-operations.
struct osd_op_stats_t
{
    uint64_t op_stat_sum[OSD_OP_MAX+1] = {};
    uint64_t op_stat_count[OSD_OP_MAX+1] = {};
    uint64_t op_stat_bytes[OSD_OP_MAX+1] = {};
    uint64_t subop_stat_sum[OSD_OP_MAX+1] = {};
    uint64_t subop_stat_count[OSD_OP_MAX+1] = {};
};
// Network messenger: keeps track of connected clients and peer OSDs, establishes
// outgoing connections to wanted peers and moves operations in and out of sockets.
// The owner must set <tfd>, <ringloop>, <exec_op> and <repeer_pgs> before use
// (none of them has a default value).
struct osd_messenger_t
{
// Timer and fd-event registration (set_timer / clear_timer / set_fd_handler)
timerfd_manager_t *tfd;
// Woken up when a client becomes readable
ring_loop_t *ringloop;
// osd_num_t is only for logging and asserts
osd_num_t osd_num;
// Size of the per-client receive buffer, in bytes
int receive_buffer_size = 9000;
// Seconds between reconnection rounds / seconds before a connect attempt is abandoned
int peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
int peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
int log_level = 0;
// Peers we are trying to connect to, by OSD number
std::map<osd_num_t, osd_wanted_peer_t> wanted_peers;
// Descriptors of established and verified peer connections, by OSD number
std::map<uint64_t, int> osd_peer_fds;
// ID for the next outbound operation header
uint64_t next_subop_id = 1;
// All connections, by descriptor
std::map<int, osd_client_t> clients;
// Descriptors waiting to be read from / written to
std::vector<int> read_ready_clients;
std::vector<int> write_ready_clients;
// op statistics
osd_op_stats_t stats;
public:
void connect_peer(uint64_t osd_num, json11::Json peer_state);
void stop_client(int peer_fd);
void outbox_push(osd_op_t *cur_op);
// Callback supplied by the owner: executes a fully received operation
std::function<void(osd_op_t*)> exec_op;
// Callback supplied by the owner: invoked when a peer OSD gets connected or disconnected
std::function<void(osd_num_t)> repeer_pgs;
void handle_peer_epoll(int peer_fd, int epoll_events);
void read_requests();
void send_replies();
void accept_connections(int listen_fd);
protected:
void try_connect_peer(uint64_t osd_num);
void try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port);
void handle_connect_epoll(int peer_fd);
void on_connect_peer(osd_num_t peer_osd, int peer_fd);
void check_peer_config(osd_client_t & cl);
void cancel_osd_ops(osd_client_t & cl);
void cancel_op(osd_op_t *op);
bool try_send(osd_client_t & cl);
void handle_send(int result, int peer_fd);
bool handle_read(int result, int peer_fd);
void handle_finished_read(osd_client_t & cl);
void handle_op_hdr(osd_client_t *cl);
void handle_reply_hdr(osd_client_t *cl);
};

View File

@@ -1,18 +1,11 @@
#include "cluster_client.h"
#include "messenger.h"
void cluster_client_t::read_requests()
void osd_messenger_t::read_requests()
{
for (int i = 0; i < read_ready_clients.size(); i++)
while (read_ready_clients.size() > 0)
{
int peer_fd = read_ready_clients[i];
int peer_fd = read_ready_clients[0];
auto & cl = clients[peer_fd];
io_uring_sqe* sqe = ringloop->get_sqe();
if (!sqe)
{
read_ready_clients.erase(read_ready_clients.begin(), read_ready_clients.begin() + i);
return;
}
ring_data_t* data = ((ring_data_t*)sqe->user_data);
if (!cl.read_op || cl.read_remaining < receive_buffer_size)
{
cl.read_iov.iov_base = cl.in_buf;
@@ -25,26 +18,35 @@ void cluster_client_t::read_requests()
}
cl.read_msg.msg_iov = &cl.read_iov;
cl.read_msg.msg_iovlen = 1;
data->callback = [this, peer_fd](ring_data_t *data) { handle_read(data, peer_fd); };
my_uring_prep_recvmsg(sqe, peer_fd, &cl.read_msg, 0);
read_ready_clients.erase(read_ready_clients.begin(), read_ready_clients.begin() + 1);
int result = recvmsg(peer_fd, &cl.read_msg, 0);
if (result < 0)
{
result = -errno;
}
{
timespec now;
clock_gettime(CLOCK_REALTIME, &now);
printf("recvmsg done %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
}
handle_read(result, peer_fd);
}
read_ready_clients.clear();
}
void cluster_client_t::handle_read(ring_data_t *data, int peer_fd)
bool osd_messenger_t::handle_read(int result, int peer_fd)
{
auto cl_it = clients.find(peer_fd);
if (cl_it != clients.end())
{
auto & cl = cl_it->second;
if (data->res < 0 && data->res != -EAGAIN)
if (result < 0 && result != -EAGAIN)
{
// this is a client socket, so don't panic. just disconnect it
printf("Client %d socket read error: %d (%s). Disconnecting client\n", peer_fd, -data->res, strerror(-data->res));
printf("Client %d socket read error: %d (%s). Disconnecting client\n", peer_fd, -result, strerror(-result));
stop_client(peer_fd);
return;
return false;
}
if (data->res == -EAGAIN || cl.read_iov.iov_base == cl.in_buf && data->res < receive_buffer_size)
if (result == -EAGAIN || result < cl.read_iov.iov_len)
{
cl.read_ready--;
if (cl.read_ready > 0)
@@ -54,16 +56,12 @@ void cluster_client_t::handle_read(ring_data_t *data, int peer_fd)
{
read_ready_clients.push_back(peer_fd);
}
if (data->res == -EAGAIN)
{
return;
}
if (data->res > 0)
if (result > 0)
{
if (cl.read_iov.iov_base == cl.in_buf)
{
// Compose operation(s) from the buffer
int remain = data->res;
int remain = result;
void *curbuf = cl.in_buf;
while (remain > 0)
{
@@ -99,18 +97,23 @@ void cluster_client_t::handle_read(ring_data_t *data, int peer_fd)
else
{
// Long data
cl.read_remaining -= data->res;
cl.read_buf += data->res;
cl.read_remaining -= result;
cl.read_buf += result;
if (cl.read_remaining <= 0)
{
handle_finished_read(cl);
}
}
if (result >= cl.read_iov.iov_len)
{
return true;
}
}
}
return false;
}
void cluster_client_t::handle_finished_read(osd_client_t & cl)
void osd_messenger_t::handle_finished_read(osd_client_t & cl)
{
if (cl.read_state == CL_READ_HDR)
{
@@ -122,6 +125,7 @@ void cluster_client_t::handle_finished_read(osd_client_t & cl)
else if (cl.read_state == CL_READ_DATA)
{
// Operation is ready
cl.received_ops.push_back(cl.read_op);
exec_op(cl.read_op);
cl.read_op = NULL;
cl.read_state = 0;
@@ -157,7 +161,7 @@ void cluster_client_t::handle_finished_read(osd_client_t & cl)
}
}
void cluster_client_t::handle_op_hdr(osd_client_t *cl)
void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
{
osd_op_t *cur_op = cl->read_op;
if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_READ)
@@ -202,11 +206,12 @@ void cluster_client_t::handle_op_hdr(osd_client_t *cl)
// Operation is ready
cl->read_op = NULL;
cl->read_state = 0;
cl->received_ops.push_back(cur_op);
exec_op(cur_op);
}
}
void cluster_client_t::handle_reply_hdr(osd_client_t *cl)
void osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
{
osd_op_t *cur_op = cl->read_op;
auto req_it = cl->sent_ops.find(cur_op->req.hdr.id);
@@ -219,7 +224,7 @@ void cluster_client_t::handle_reply_hdr(osd_client_t *cl)
}
osd_op_t *op = req_it->second;
memcpy(op->reply.buf, cur_op->req.buf, OSD_PACKET_SIZE);
if (op->reply.hdr.opcode == OSD_OP_SECONDARY_READ &&
if ((op->reply.hdr.opcode == OSD_OP_SECONDARY_READ || op->reply.hdr.opcode == OSD_OP_READ) &&
op->reply.hdr.retval > 0)
{
// Read data. In this case we assume that the buffer is preallocated by the caller (!)
@@ -229,8 +234,7 @@ void cluster_client_t::handle_reply_hdr(osd_client_t *cl)
cl->read_buf = op->buf;
cl->read_remaining = op->reply.hdr.retval;
}
else if (op->reply.hdr.opcode == OSD_OP_SECONDARY_LIST &&
op->reply.hdr.retval > 0)
else if (op->reply.hdr.opcode == OSD_OP_SECONDARY_LIST && op->reply.hdr.retval > 0)
{
op->buf = memalign(MEM_ALIGNMENT, sizeof(obj_ver_id) * op->reply.hdr.retval);
cl->read_state = CL_READ_REPLY_DATA;
@@ -238,8 +242,7 @@ void cluster_client_t::handle_reply_hdr(osd_client_t *cl)
cl->read_buf = op->buf;
cl->read_remaining = sizeof(obj_ver_id) * op->reply.hdr.retval;
}
else if (op->reply.hdr.opcode == OSD_OP_SHOW_CONFIG &&
op->reply.hdr.retval > 0)
else if (op->reply.hdr.opcode == OSD_OP_SHOW_CONFIG && op->reply.hdr.retval > 0)
{
op->buf = malloc(op->reply.hdr.retval);
cl->read_state = CL_READ_REPLY_DATA;

View File

@@ -1,6 +1,6 @@
#include "cluster_client.h"
#include "messenger.h"
void cluster_client_t::outbox_push(osd_op_t *cur_op)
void osd_messenger_t::outbox_push(osd_op_t *cur_op)
{
assert(cur_op->peer_fd);
auto & cl = clients.at(cur_op->peer_fd);
@@ -8,6 +8,25 @@ void cluster_client_t::outbox_push(osd_op_t *cur_op)
{
clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
}
else
{
// Check that operation actually belongs to this client
bool found = false;
for (auto it = cl.received_ops.begin(); it != cl.received_ops.end(); it++)
{
if (*it == cur_op)
{
found = true;
cl.received_ops.erase(it, it+1);
break;
}
}
if (!found)
{
delete cur_op;
return;
}
}
cl.outbox.push_back(cur_op);
if (cl.write_op || cl.outbox.size() > 1 || !try_send(cl))
{
@@ -20,15 +39,9 @@ void cluster_client_t::outbox_push(osd_op_t *cur_op)
}
}
bool cluster_client_t::try_send(osd_client_t & cl)
bool osd_messenger_t::try_send(osd_client_t & cl)
{
int peer_fd = cl.peer_fd;
io_uring_sqe* sqe = ringloop->get_sqe();
if (!sqe)
{
return false;
}
ring_data_t* data = ((ring_data_t*)sqe->user_data);
if (!cl.write_op)
{
// pick next command
@@ -65,53 +78,56 @@ bool cluster_client_t::try_send(osd_client_t & cl)
}
cl.write_msg.msg_iov = cl.write_op->send_list.get_iovec();
cl.write_msg.msg_iovlen = cl.write_op->send_list.get_size();
data->callback = [this, peer_fd](ring_data_t *data) { handle_send(data, peer_fd); };
my_uring_prep_sendmsg(sqe, peer_fd, &cl.write_msg, 0);
int result = sendmsg(peer_fd, &cl.write_msg, MSG_NOSIGNAL);
if (result < 0)
result = -errno;
{
timespec now;
clock_gettime(CLOCK_REALTIME, &now);
printf("sendmsg done %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
}
handle_send(result, peer_fd);
return true;
}
void cluster_client_t::send_replies()
void osd_messenger_t::send_replies()
{
for (int i = 0; i < write_ready_clients.size(); i++)
while (write_ready_clients.size() > 0)
{
int peer_fd = write_ready_clients[i];
if (!try_send(clients[peer_fd]))
{
write_ready_clients.erase(write_ready_clients.begin(), write_ready_clients.begin() + i);
return;
}
auto & cl = clients[write_ready_clients[0]];
write_ready_clients.erase(write_ready_clients.begin(), write_ready_clients.begin() + 1);
try_send(cl);
}
write_ready_clients.clear();
}
void cluster_client_t::handle_send(ring_data_t *data, int peer_fd)
void osd_messenger_t::handle_send(int result, int peer_fd)
{
auto cl_it = clients.find(peer_fd);
if (cl_it != clients.end())
{
auto & cl = cl_it->second;
if (data->res < 0 && data->res != -EAGAIN)
if (result < 0 && result != -EAGAIN)
{
// this is a client socket, so don't panic. just disconnect it
printf("Client %d socket write error: %d (%s). Disconnecting client\n", peer_fd, -data->res, strerror(-data->res));
printf("Client %d socket write error: %d (%s). Disconnecting client\n", peer_fd, -result, strerror(-result));
stop_client(peer_fd);
return;
}
if (data->res >= 0)
if (result >= 0)
{
osd_op_t *cur_op = cl.write_op;
while (data->res > 0 && cur_op->send_list.sent < cur_op->send_list.count)
while (result > 0 && cur_op->send_list.sent < cur_op->send_list.count)
{
iovec & iov = cur_op->send_list.buf[cur_op->send_list.sent];
if (iov.iov_len <= data->res)
if (iov.iov_len <= result)
{
data->res -= iov.iov_len;
result -= iov.iov_len;
cur_op->send_list.sent++;
}
else
{
iov.iov_len -= data->res;
iov.iov_base += data->res;
iov.iov_len -= result;
iov.iov_base += result;
break;
}
}

69
osd.cpp
View File

@@ -7,6 +7,8 @@
#include "osd.h"
#define MAX_EPOLL_EVENTS 64
const char* osd_op_names[] = {
"",
"read",
@@ -73,29 +75,9 @@ osd_t::~osd_t()
void osd_t::parse_config(blockstore_config_t & config)
{
int pos;
// Initial startup configuration
{
std::string ea = config["etcd_address"];
while (1)
{
pos = ea.find(',');
std::string addr = pos >= 0 ? ea.substr(0, pos) : ea;
if (addr.length() > 0)
{
if (addr.find('/') < 0)
addr += "/v3";
st_cli.etcd_addresses.push_back(addr);
}
if (pos >= 0)
ea = ea.substr(pos+1);
else
break;
}
}
st_cli.etcd_prefix = config["etcd_prefix"];
if (st_cli.etcd_prefix == "")
st_cli.etcd_prefix = "/microceph";
json11::Json json_config = json11::Json(config);
st_cli.parse_config(json_config);
etcd_report_interval = strtoull(config["etcd_report_interval"].c_str(), NULL, 10);
if (etcd_report_interval <= 0)
etcd_report_interval = 30;
@@ -143,12 +125,11 @@ void osd_t::parse_config(blockstore_config_t & config)
print_stats_interval = 3;
c_cli.peer_connect_interval = strtoull(config["peer_connect_interval"].c_str(), NULL, 10);
if (!c_cli.peer_connect_interval)
c_cli.peer_connect_interval = 5;
c_cli.peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
c_cli.peer_connect_timeout = strtoull(config["peer_connect_timeout"].c_str(), NULL, 10);
if (!c_cli.peer_connect_timeout)
c_cli.peer_connect_timeout = 5;
c_cli.peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
log_level = strtoull(config["log_level"].c_str(), NULL, 10);
st_cli.log_level = log_level;
c_cli.log_level = log_level;
}
@@ -260,6 +241,11 @@ void osd_t::set_fd_handler(int fd, std::function<void(int, int)> handler)
void osd_t::handle_epoll_events()
{
{
timespec now;
clock_gettime(CLOCK_REALTIME, &now);
printf("epoll %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
}
io_uring_sqe *sqe = ringloop->get_sqe();
if (!sqe)
{
@@ -284,38 +270,7 @@ restart:
{
if (events[i].data.fd == listen_fd)
{
// Accept new connections
sockaddr_in addr;
socklen_t peer_addr_size = sizeof(addr);
int peer_fd;
while ((peer_fd = accept(listen_fd, (sockaddr*)&addr, &peer_addr_size)) >= 0)
{
assert(peer_fd != 0);
char peer_str[256];
printf("[OSD %lu] new client %d: connection from %s port %d\n", this->osd_num, peer_fd,
inet_ntop(AF_INET, &addr.sin_addr, peer_str, 256), ntohs(addr.sin_port));
fcntl(peer_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
int one = 1;
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
c_cli.clients[peer_fd] = {
.peer_addr = addr,
.peer_port = ntohs(addr.sin_port),
.peer_fd = peer_fd,
.peer_state = PEER_CONNECTED,
.in_buf = malloc(c_cli.receive_buffer_size),
};
// Add FD to epoll
set_fd_handler(peer_fd, [this](int peer_fd, int epoll_events)
{
c_cli.handle_peer_epoll(peer_fd, epoll_events);
});
// Try to accept next connection
peer_addr_size = sizeof(addr);
}
if (peer_fd == -1 && errno != EAGAIN)
{
throw std::runtime_error(std::string("accept: ") + strerror(errno));
}
c_cli.accept_connections(listen_fd);
}
else
{

11
osd.h
View File

@@ -17,7 +17,7 @@
#include "ringloop.h"
#include "timerfd_manager.h"
#include "osd_peering_pg.h"
#include "cluster_client.h"
#include "messenger.h"
#include "etcd_state_client.h"
#define OSD_LOADING_PGS 0x01
@@ -78,7 +78,7 @@ class osd_t
// cluster state
etcd_state_client_t st_cli;
cluster_client_t c_cli;
osd_messenger_t c_cli;
int etcd_failed_attempts = 0;
std::string etcd_lease_id;
json11::Json self_state;
@@ -187,15 +187,16 @@ class osd_t
bool prepare_primary_rw(osd_op_t *cur_op);
void continue_primary_read(osd_op_t *cur_op);
void continue_primary_write(osd_op_t *cur_op);
void cancel_primary_write(osd_op_t *cur_op);
void continue_primary_sync(osd_op_t *cur_op);
void continue_primary_del(osd_op_t *cur_op);
bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
void remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg);
bool finalize_primary_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
void handle_primary_subop(uint64_t opcode, osd_op_t *cur_op, int retval, int expected, uint64_t version);
bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);
void handle_primary_bs_subop(osd_op_t *subop);
void add_bs_subop_stats(osd_op_t *subop);
void pg_cancel_write_queue(pg_t & pg, object_id oid, int retval);
void pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid, int retval);
void submit_primary_subops(int submit_type, int read_pg_size, const uint64_t* osd_set, osd_op_t *cur_op);
void submit_primary_del_subops(osd_op_t *cur_op, uint64_t *cur_set, pg_osd_set_t & loc_set);
void submit_primary_sync_subops(osd_op_t *cur_op);

View File

@@ -1,40 +0,0 @@
// Draft: splits the read/write request in cur_op into per-object parts.
// Every stripe is 2 blocks long; the first block maps to role 1 and the second
// block maps to role 2 of the same stripe.
// NOTE(review): this is a sketch - `cur_op` and `bs` are not declared here and
// `std::vector` lacks its element type, so the function does not compile as is.
void slice()
{
// Slice the request into blockstore requests to individual objects
// Primary OSD still operates individual stripes, except they're twice the size of the blockstore's stripe.
std::vector read_parts;
int block = bs->get_block_size();
// Indexes of the first and the last stripe touched by the request
uint64_t stripe1 = cur_op->req.rw.offset / block / 2;
uint64_t stripe2 = (cur_op->req.rw.offset + cur_op->req.rw.len + block*2 - 1) / block / 2 - 1;
for (uint64_t s = stripe1; s <= stripe2; s++)
{
// Requested range inside stripe <s>, relative to the beginning of the stripe
uint64_t start = s == stripe1 ? cur_op->req.rw.offset - stripe1*block*2 : 0;
uint64_t end = s == stripe2 ? cur_op->req.rw.offset + cur_op->req.rw.len - stripe2*block*2 : block*2;
// The range touches the first block of the stripe
if (start < block)
{
read_parts.push_back({
.role = 1,
.oid = {
.inode = cur_op->req.rw.inode,
.stripe = (s << STRIPE_ROLE_BITS) | 1,
},
.version = UINT64_MAX,
.offset = start,
.len = (block < end ? block : end) - start,
});
}
// The range touches the second block of the stripe
if (end > block)
{
read_parts.push_back({
.role = 2,
.oid = {
.inode = cur_op->req.rw.inode,
.stripe = (s << STRIPE_ROLE_BITS) | 2,
},
.version = UINT64_MAX,
.offset = (start > block ? start-block : 0),
// NOTE(review): <end> is relative to the stripe, not to the second block, so this
// length looks too large by <block> bytes - confirm before reusing this code
.len = end - (start > block ? start-block : 0),
});
}
}
}

View File

@@ -83,7 +83,7 @@ void osd_t::parse_test_peer(std::string peer)
{ "addresses", json11::Json::array { addr } },
{ "port", port },
};
c_cli.connect_peer(peer_osd, json11::Json::array { addr }, port);
c_cli.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
}
json11::Json osd_t::get_osd_state()
@@ -211,7 +211,7 @@ void osd_t::on_change_osd_state_hook(uint64_t peer_osd)
{
if (c_cli.wanted_peers.find(peer_osd) != c_cli.wanted_peers.end())
{
c_cli.connect_peer(peer_osd, st_cli.peer_states[peer_osd]["addresses"], st_cli.peer_states[peer_osd]["port"].int64_value());
c_cli.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
}
}
@@ -229,8 +229,14 @@ void osd_t::on_load_config_hook(json11::Json::object & global_config)
{
if (this->config.find(cfg_var.first) == this->config.end())
{
// FIXME Convert int to str
osd_config[cfg_var.first] = cfg_var.second.string_value();
if (cfg_var.second.is_string())
{
osd_config[cfg_var.first] = cfg_var.second.string_value();
}
else
{
osd_config[cfg_var.first] = cfg_var.second.dump();
}
}
}
parse_config(osd_config);
@@ -556,7 +562,7 @@ void osd_t::apply_pg_config()
{
if (pg_osd != this->osd_num && c_cli.osd_peer_fds.find(pg_osd) == c_cli.osd_peer_fds.end())
{
c_cli.connect_peer(pg_osd, st_cli.peer_states[pg_osd]["addresses"], st_cli.peer_states[pg_osd]["port"].int64_value());
c_cli.connect_peer(pg_osd, st_cli.peer_states[pg_osd]);
}
}
start_pg_peering(pg_num);

View File

@@ -78,9 +78,12 @@ void osd_t::handle_flush_op(bool rollback, pg_num_t pg_num, pg_flush_batch_t *fb
}
else
{
printf("Error while doing flush on OSD %lu: %s\n", osd_num, strerror(-retval));
assert(c_cli.osd_peer_fds.find(peer_osd) != c_cli.osd_peer_fds.end());
c_cli.stop_client(c_cli.osd_peer_fds[peer_osd]);
printf("Error while doing flush on OSD %lu: %d (%s)\n", osd_num, retval, strerror(-retval));
auto fd_it = c_cli.osd_peer_fds.find(peer_osd);
if (fd_it != c_cli.osd_peer_fds.end())
{
c_cli.stop_client(fd_it->second);
}
return;
}
}
@@ -270,9 +273,10 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
throw std::runtime_error("Failed to recover an object");
}
}
// CAREFUL! op = &recovery_ops[op->oid]. Don't access op->* after recovery_ops.erase()
op->osd_op = NULL;
recovery_ops.erase(op->oid);
delete osd_op;
op->osd_op = NULL;
continue_recovery();
};
exec_op(op->osd_op);

View File

@@ -120,7 +120,7 @@ void osd_t::start_pg_peering(pg_num_t pg_num)
pg.flush_batch = NULL;
for (auto p: pg.write_queue)
{
finish_op(p.second, -EPIPE);
cancel_primary_write(p.second);
}
pg.write_queue.clear();
for (auto it = unstable_writes.begin(); it != unstable_writes.end(); )
@@ -132,7 +132,6 @@ void osd_t::start_pg_peering(pg_num_t pg_num)
else
it++;
}
pg.inflight = 0;
dirty_pgs.erase(pg.pg_num);
// Calculate current write OSD set
pg.pg_cursize = 0;
@@ -188,7 +187,7 @@ void osd_t::start_pg_peering(pg_num_t pg_num)
}
else if (c_cli.wanted_peers.find(pg_osd) == c_cli.wanted_peers.end())
{
c_cli.connect_peer(pg_osd, st_cli.peer_states[pg_osd]["addresses"], st_cli.peer_states[pg_osd]["port"].int64_value());
c_cli.connect_peer(pg_osd, st_cli.peer_states[pg_osd]);
}
}
pg.cur_peers.insert(pg.cur_peers.begin(), cur_peers.begin(), cur_peers.end());

View File

@@ -206,17 +206,6 @@ void pg_obj_state_check_t::finish_object()
if (log_level > 1)
{
printf("Object is incomplete: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
for (int i = ver_start; i < ver_end; i++)
{
printf("Present on: osd %lu, role %ld%s\n", list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
}
}
if (log_level > 2)
{
for (int i = obj_start; i < obj_end; i++)
{
printf("v%lu present on: osd %lu, role %ld%s\n", list[i].version, list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
}
}
state = OBJ_INCOMPLETE;
pg->state = pg->state | PG_HAS_INCOMPLETE;
@@ -226,11 +215,21 @@ void pg_obj_state_check_t::finish_object()
if (log_level > 1)
{
printf("Object is degraded: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
for (int i = ver_start; i < ver_end; i++)
{
printf("Present on: osd %lu, role %ld%s\n", list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
}
}
state = OBJ_DEGRADED;
pg->state = pg->state | PG_HAS_DEGRADED;
}
if (n_mismatched > 0)
{
if (n_roles >= pg->pg_cursize && log_level > 1)
{
printf("Object is misplaced: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
}
state |= OBJ_MISPLACED;
pg->state = pg->state | PG_HAS_MISPLACED;
}
if (log_level > 1 && (n_roles < pg->pg_cursize || n_mismatched > 0))
{
if (log_level > 2)
{
for (int i = obj_start; i < obj_end; i++)
@@ -238,13 +237,13 @@ void pg_obj_state_check_t::finish_object()
printf("v%lu present on: osd %lu, role %ld%s\n", list[i].version, list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
}
}
state = OBJ_DEGRADED;
pg->state = pg->state | PG_HAS_DEGRADED;
}
if (n_mismatched > 0)
{
state |= OBJ_MISPLACED;
pg->state = pg->state | PG_HAS_MISPLACED;
else
{
for (int i = ver_start; i < ver_end; i++)
{
printf("Target version present on: osd %lu, role %ld%s\n", list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
}
}
}
pg->total_count++;
if (state != 0 || ver_end < obj_end)

View File

@@ -13,9 +13,14 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
{
// PG number is calculated from the offset
// Our EC scheme stores data in fixed chunks equal to (K*block size)
// But we must not use K in the process of calculating the PG number
// So we calculate the PG number using a separate setting which should be per-inode (FIXME)
pg_num_t pg_num = (cur_op->req.rw.inode + cur_op->req.rw.offset / pg_stripe_size) % pg_count + 1;
// K = pg_minsize and will be a property of the inode. Now it's hardcoded (FIXME)
uint64_t pg_block_size = bs_block_size * 2;
object_id oid = {
.inode = cur_op->req.rw.inode,
// oid.stripe = starting offset of the parity stripe
.stripe = (cur_op->req.rw.offset/pg_block_size)*pg_block_size,
};
pg_num_t pg_num = (cur_op->req.rw.inode + oid.stripe/pg_stripe_size) % pg_count + 1;
auto pg_it = pgs.find(pg_num);
if (pg_it == pgs.end() || !(pg_it->second.state & PG_ACTIVE))
{
@@ -23,13 +28,6 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
finish_op(cur_op, -EPIPE);
return false;
}
uint64_t pg_block_size = bs_block_size * pg_it->second.pg_minsize;
object_id oid = {
.inode = cur_op->req.rw.inode,
// oid.stripe = starting offset of the parity stripe, so it can be mapped back to the PG
.stripe = (cur_op->req.rw.offset / pg_stripe_size) * pg_stripe_size +
((cur_op->req.rw.offset % pg_stripe_size) / pg_block_size) * pg_block_size
};
if ((cur_op->req.rw.offset + cur_op->req.rw.len) > (oid.stripe + pg_block_size) ||
(cur_op->req.rw.offset % bs_disk_alignment) != 0 ||
(cur_op->req.rw.len % bs_disk_alignment) != 0)
@@ -198,7 +196,9 @@ void osd_t::continue_primary_write(osd_op_t *cur_op)
else if (op_data->st == 6) goto resume_6;
else if (op_data->st == 7) goto resume_7;
else if (op_data->st == 8) goto resume_8;
else if (op_data->st == 9) goto resume_9;
assert(op_data->st == 0);
printf("primary_write\n");
if (!check_write_queue(cur_op, pg))
{
return;
@@ -218,7 +218,7 @@ resume_2:
resume_3:
if (op_data->errors > 0)
{
pg_cancel_write_queue(pg, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
return;
}
// Save version override for parallel reads
@@ -233,7 +233,7 @@ resume_4:
resume_5:
if (op_data->errors > 0)
{
pg_cancel_write_queue(pg, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
return;
}
if (op_data->fact_ver == 1)
@@ -263,12 +263,13 @@ resume_5:
submit_primary_del_subops(cur_op, pg.cur_set.data(), op_data->object_state->osd_set);
if (op_data->n_subops > 0)
{
resume_8:
op_data->st = 8;
return;
resume_8:
resume_9:
if (op_data->errors > 0)
{
pg_cancel_write_queue(pg, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
return;
}
}
@@ -282,7 +283,7 @@ resume_8:
// FIXME: Check for immediate_commit == IMMEDIATE_SMALL
resume_6:
resume_7:
if (!finalize_primary_write(cur_op, pg, pg.cur_loc_set, 6))
if (!remember_unstable_write(cur_op, pg, pg.cur_loc_set, 6))
{
return;
}
@@ -291,17 +292,19 @@ resume_7:
// Continue other write operations to the same object
auto next_it = pg.write_queue.find(oid);
auto this_it = next_it;
next_it++;
pg.write_queue.erase(this_it);
if (next_it != pg.write_queue.end() &&
next_it->first == oid)
if (this_it != pg.write_queue.end() && this_it->second == cur_op)
{
osd_op_t *next_op = next_it->second;
continue_primary_write(next_op);
next_it++;
pg.write_queue.erase(this_it);
if (next_it != pg.write_queue.end() && next_it->first == oid)
{
osd_op_t *next_op = next_it->second;
continue_primary_write(next_op);
}
}
}
bool osd_t::finalize_primary_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state)
bool osd_t::remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state)
{
osd_primary_op_data_t *op_data = cur_op->op_data;
if (op_data->st == base_state)
@@ -347,7 +350,7 @@ resume_7:
op_data->unstable_write_osds = NULL;
if (op_data->errors > 0)
{
pg_cancel_write_queue(pg, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
return false;
}
}
@@ -387,6 +390,7 @@ void osd_t::continue_primary_sync(osd_op_t *cur_op)
else if (op_data->st == 5) goto resume_5;
else if (op_data->st == 6) goto resume_6;
assert(op_data->st == 0);
printf("primary_sync\n");
if (syncs_in_progress.size() > 0)
{
// Wait for previous syncs, if any
@@ -594,8 +598,6 @@ void osd_t::continue_primary_del(osd_op_t *cur_op)
else if (op_data->st == 3) goto resume_3;
else if (op_data->st == 4) goto resume_4;
else if (op_data->st == 5) goto resume_5;
else if (op_data->st == 6) goto resume_6;
else if (op_data->st == 7) goto resume_7;
assert(op_data->st == 0);
// Delete is forbidden even in active PGs if they're also degraded or have previous dead OSDs
if (pg.state & (PG_DEGRADED | PG_LEFT_ON_DEAD))
@@ -618,7 +620,7 @@ resume_2:
resume_3:
if (op_data->errors > 0)
{
pg_cancel_write_queue(pg, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
return;
}
// Save version override for parallel reads
@@ -632,17 +634,11 @@ resume_4:
resume_5:
if (op_data->errors > 0)
{
pg_cancel_write_queue(pg, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
return;
}
// Remove version override
pg.ver_override.erase(op_data->oid);
resume_6:
resume_7:
if (!finalize_primary_write(cur_op, pg, op_data->object_state ? op_data->object_state->osd_set : pg.cur_loc_set, 6))
{
return;
}
// Adjust PG stats after "instant stabilize", because we need object_state above
if (!op_data->object_state)
{
@@ -658,12 +654,15 @@ resume_7:
// Continue other write operations to the same object
auto next_it = pg.write_queue.find(oid);
auto this_it = next_it;
next_it++;
pg.write_queue.erase(this_it);
if (next_it != pg.write_queue.end() &&
next_it->first == oid)
if (this_it != pg.write_queue.end() && this_it->second == cur_op)
{
osd_op_t *next_op = next_it->second;
continue_primary_write(next_op);
next_it++;
pg.write_queue.erase(this_it);
if (next_it != pg.write_queue.end() &&
next_it->first == oid)
{
osd_op_t *next_op = next_it->second;
continue_primary_write(next_op);
}
}
}

View File

@@ -33,15 +33,24 @@ void osd_t::autosync()
void osd_t::finish_op(osd_op_t *cur_op, int retval)
{
inflight_ops--;
if (cur_op->op_data && cur_op->op_data->pg_num > 0)
if (cur_op->op_data)
{
auto & pg = pgs[cur_op->op_data->pg_num];
pg.inflight--;
assert(pg.inflight >= 0);
if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
if (cur_op->op_data->pg_num > 0)
{
finish_stop_pg(pg);
auto & pg = pgs[cur_op->op_data->pg_num];
pg.inflight--;
assert(pg.inflight >= 0);
if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
{
finish_stop_pg(pg);
}
}
assert(!cur_op->op_data->subops);
assert(!cur_op->op_data->unstable_write_osds);
assert(!cur_op->op_data->unstable_writes);
assert(!cur_op->op_data->dirty_pgs);
free(cur_op->op_data);
cur_op->op_data = NULL;
}
if (!cur_op->peer_fd)
{
@@ -129,6 +138,13 @@ void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t*
.len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
.buf = w ? stripes[role].write_buf : stripes[role].read_buf,
});
#ifdef OSD_DEBUG
printf(
"Submit %s to local: %lu:%lu v%lu %u-%u\n", w ? "write" : "read",
op_data->oid.inode, op_data->oid.stripe | role, op_version,
subops[i].bs_op->offset, subops[i].bs_op->len
);
#endif
bs->enqueue_op(subops[i].bs_op);
}
else
@@ -150,6 +166,13 @@ void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t*
.offset = w ? stripes[role].write_start : stripes[role].read_start,
.len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
};
#ifdef OSD_DEBUG
printf(
"Submit %s to osd %lu: %lu:%lu v%lu %u-%u\n", w ? "write" : "read", role_osd_num,
op_data->oid.inode, op_data->oid.stripe | role, op_version,
subops[i].req.sec_rw.offset, subops[i].req.sec_rw.len
);
#endif
subops[i].buf = w ? stripes[role].write_buf : stripes[role].read_buf;
if (w && stripes[role].write_end > 0)
{
@@ -161,10 +184,7 @@ void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t*
subop->reply.hdr.retval != subop->req.sec_rw.len ? subop->peer_fd : -1;
// so it doesn't get freed
subop->buf = NULL;
handle_primary_subop(
subop->req.hdr.opcode, cur_op, subop->reply.hdr.retval,
subop->req.sec_rw.len, subop->reply.sec_rw.version
);
handle_primary_subop(subop, cur_op);
if (fail_fd >= 0)
{
// write operation failed, drop the connection
@@ -204,12 +224,16 @@ void osd_t::handle_primary_bs_subop(osd_op_t *subop)
);
}
add_bs_subop_stats(subop);
uint64_t opcode = bs_op_to_osd_op[bs_op->opcode];
int retval = bs_op->retval;
uint64_t version = bs_op->version;
subop->req.hdr.opcode = bs_op_to_osd_op[bs_op->opcode];
subop->reply.hdr.retval = bs_op->retval;
if (bs_op->opcode == BS_OP_READ || bs_op->opcode == BS_OP_WRITE)
{
subop->req.sec_rw.len = bs_op->len;
subop->reply.sec_rw.version = bs_op->version;
}
delete bs_op;
subop->bs_op = NULL;
handle_primary_subop(opcode, cur_op, retval, expected, version);
handle_primary_subop(subop, cur_op);
}
void osd_t::add_bs_subop_stats(osd_op_t *subop)
@@ -235,8 +259,12 @@ void osd_t::add_bs_subop_stats(osd_op_t *subop)
}
}
void osd_t::handle_primary_subop(uint64_t opcode, osd_op_t *cur_op, int retval, int expected, uint64_t version)
void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
{
uint64_t opcode = subop->req.hdr.opcode;
int retval = subop->reply.hdr.retval;
int expected = opcode == OSD_OP_SECONDARY_READ || opcode == OSD_OP_SECONDARY_WRITE
? subop->req.sec_rw.len : 0;
osd_primary_op_data_t *op_data = cur_op->op_data;
if (retval != expected)
{
@@ -252,6 +280,12 @@ void osd_t::handle_primary_subop(uint64_t opcode, osd_op_t *cur_op, int retval,
op_data->done++;
if (opcode == OSD_OP_SECONDARY_READ || opcode == OSD_OP_SECONDARY_WRITE)
{
uint64_t version = subop->reply.sec_rw.version;
#ifdef OSD_DEBUG
uint64_t peer_osd = c_cli.clients.find(subop->peer_fd) != c_cli.clients.end()
? c_cli.clients[subop->peer_fd].osd_num : osd_num;
printf("subop %lu from osd %lu: version = %lu\n", opcode, peer_osd, version);
#endif
if (op_data->fact_ver != 0 && op_data->fact_ver != version)
{
throw std::runtime_error(
@@ -290,6 +324,23 @@ void osd_t::handle_primary_subop(uint64_t opcode, osd_op_t *cur_op, int retval,
}
}
// Cancel a primary write operation with -EPIPE.
// If the operation still has subops attached, it is not finished here; it is
// only marked as failed so the failure is picked up when the subops complete.
void osd_t::cancel_primary_write(osd_op_t *cur_op)
{
    auto *data = cur_op->op_data;
    if (!data || !data->subops)
    {
        // Nothing is in flight for this operation: finish it right away
        finish_op(cur_op, -EPIPE);
        return;
    }
    // The operation is waiting for subops which were already sent to peer OSDs,
    // so it can't simply be thrown away. Account for one extra EPIPE error instead.
    data->errors++;
    data->epipe++;
    // Caution: `done` has to be a signed counter, it may drop to -1 here
    data->done--;
}
void osd_t::submit_primary_del_subops(osd_op_t *cur_op, uint64_t *cur_set, pg_osd_set_t & loc_set)
{
osd_primary_op_data_t *op_data = cur_op->op_data;
@@ -354,7 +405,7 @@ void osd_t::submit_primary_del_subops(osd_op_t *cur_op, uint64_t *cur_set, pg_os
subops[i].callback = [cur_op, this](osd_op_t *subop)
{
int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
handle_primary_subop(OSD_OP_SECONDARY_DELETE, cur_op, subop->reply.hdr.retval, 0, 0);
handle_primary_subop(subop, cur_op);
if (fail_fd >= 0)
{
// delete operation failed, drop the connection
@@ -407,7 +458,7 @@ void osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
subops[i].callback = [cur_op, this](osd_op_t *subop)
{
int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
handle_primary_subop(OSD_OP_SECONDARY_SYNC, cur_op, subop->reply.hdr.retval, 0, 0);
handle_primary_subop(subop, cur_op);
if (fail_fd >= 0)
{
// sync operation failed, drop the connection
@@ -462,7 +513,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
subops[i].callback = [cur_op, this](osd_op_t *subop)
{
int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
handle_primary_subop(OSD_OP_SECONDARY_STABILIZE, cur_op, subop->reply.hdr.retval, 0, 0);
handle_primary_subop(subop, cur_op);
if (fail_fd >= 0)
{
// stabilize operation failed, drop the connection
@@ -474,9 +525,20 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
}
}
void osd_t::pg_cancel_write_queue(pg_t & pg, object_id oid, int retval)
void osd_t::pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid, int retval)
{
auto st_it = pg.write_queue.find(oid), it = st_it;
finish_op(first_op, retval);
if (it != pg.write_queue.end() && it->second == first_op)
{
it++;
}
else
{
// Write queue doesn't match the first operation.
// first_op is a leftover operation from the previous peering of the same PG.
return;
}
while (it != pg.write_queue.end() && it->first == oid)
{
finish_op(it->second, retval);

View File

@@ -19,6 +19,8 @@
int connect_osd(const char *osd_address, int osd_port);
uint64_t test_read(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t version, uint64_t offset, uint64_t len);
uint64_t test_write(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t version, uint64_t pattern);
void* test_primary_read(int connect_fd, uint64_t inode, uint64_t offset, uint64_t len);
@@ -105,7 +107,7 @@ int main3(int narg, char *args[])
return 0;
}
int main(int narg, char *args[])
int main4(int narg, char *args[])
{
int connect_fd;
// Cluster write (sync not implemented yet)
@@ -117,6 +119,15 @@ int main(int narg, char *args[])
return 0;
}
// Test entry point: connects to a hard-coded OSD address and runs a single
// test_read() against a hard-coded object, then closes the connection.
int main(int narg, char *args[])
{
    int connect_fd = connect_osd("192.168.7.2", 43051);
    test_read(connect_fd, 1, 1039663104, UINT64_MAX, 0, 128*1024);
    close(connect_fd);
    return 0;
}
int connect_osd(const char *osd_address, int osd_port)
{
struct sockaddr_in addr;
@@ -167,6 +178,66 @@ bool check_reply(int r, osd_any_op_t & op, osd_any_reply_t & reply, int expected
return true;
}
// Debug helper: read an object from a secondary OSD and list its versions.
//
// 1) Sends OSD_OP_SECONDARY_READ for <inode>:<stripe> at <version>, reads
//    <len> bytes of payload starting at <offset>, discards the payload and
//    prints the version reported in the reply.
// 2) Sends OSD_OP_SECONDARY_LIST (list_pg=1, pg_count=1, 4 MiB pg_stripe_size)
//    and prints every returned entry whose inode matches and whose stripe
//    matches after masking off the low 12 bits.
//
// Always returns 0; failures are only reported through check_reply()/perror().
uint64_t test_read(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t version, uint64_t offset, uint64_t len)
{
    osd_any_op_t op;
    osd_any_reply_t reply;
    op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
    op.hdr.id = 1;
    op.hdr.opcode = OSD_OP_SECONDARY_READ;
    op.sec_rw.oid = {
        .inode = inode,
        .stripe = stripe,
    };
    op.sec_rw.version = version;
    op.sec_rw.offset = offset;
    op.sec_rw.len = len;
    void *data = memalign(MEM_ALIGNMENT, op.sec_rw.len);
    write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
    int r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
    if (!check_reply(r, op, reply, op.sec_rw.len))
    {
        free(data);
        return 0;
    }
    r = read_blocking(connect_fd, data, len);
    if (r != len)
    {
        free(data);
        perror("read data");
        return 0;
    }
    // The payload itself is not inspected, only the reply header is reported
    free(data);
    printf("Read %lu:%lu v%lu = v%lu\n", inode, stripe, version, reply.sec_rw.version);
    // Reuse the same request structure for the listing
    op.hdr.opcode = OSD_OP_SECONDARY_LIST;
    op.sec_list.list_pg = 1;
    op.sec_list.pg_count = 1;
    op.sec_list.pg_stripe_size = 4*1024*1024;
    write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
    r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
    if (reply.hdr.retval < 0 || !check_reply(r, op, reply, reply.hdr.retval))
    {
        return 0;
    }
    // retval is used as the number of obj_ver_id entries following the reply header
    data = memalign(MEM_ALIGNMENT, sizeof(obj_ver_id)*reply.hdr.retval);
    r = read_blocking(connect_fd, data, sizeof(obj_ver_id)*reply.hdr.retval);
    if (r != sizeof(obj_ver_id)*reply.hdr.retval)
    {
        free(data);
        perror("read data");
        return 0;
    }
    obj_ver_id *ov = (obj_ver_id*)data;
    for (int i = 0; i < reply.hdr.retval; i++)
    {
        if (ov[i].oid.inode == inode && (ov[i].oid.stripe & ~(4096-1)) == (stripe & ~(4096-1)))
        {
            // Entries with index below stable_count are reported as stable
            printf("list: %lu:%lu v%lu stable=%d\n", ov[i].oid.inode, ov[i].oid.stripe, ov[i].version, i < reply.sec_list.stable_count ? 1 : 0);
        }
    }
    // Release the listing buffer (it was leaked on this path before)
    free(data);
    return 0;
}
uint64_t test_write(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t version, uint64_t pattern)
{
osd_any_op_t op;

View File

@@ -4,6 +4,8 @@
#define _LARGEFILE64_SOURCE
#endif
#include <stdio.h>
#include <time.h>
#include <string.h>
#include <assert.h>
#include <liburing.h>
@@ -158,7 +160,13 @@ public:
}
inline int submit()
{
return io_uring_submit(&ring);
int r = io_uring_submit(&ring);
{
timespec now;
clock_gettime(CLOCK_REALTIME, &now);
printf("submit %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
}
return r;
}
inline int wait()
{

View File

@@ -51,6 +51,40 @@ int write_blocking(int fd, void *write_buf, size_t remaining)
return done;
}
// Read from <fd> until all buffers described by iov[0..iovcnt) are filled
// or until end-of-stream is reached.
//
// The iovec array is consumed in place: after a partial read the current
// entry's iov_base/iov_len are advanced past the bytes already received, so
// the caller must not rely on the array contents afterwards.
//
// Returns the total number of bytes read. The result is smaller than the
// requested total only when the stream ends early. EAGAIN/EPIPE are retried;
// any other error is printed and terminates the process.
int readv_blocking(int fd, iovec *iov, int iovcnt)
{
    int v = 0;
    int done = 0;
    while (v < iovcnt)
    {
        // Only hand over the buffers which still have free space
        ssize_t r = readv(fd, iov + v, iovcnt - v);
        if (r < 0)
        {
            if (errno != EAGAIN && errno != EPIPE)
            {
                perror("readv");
                exit(1);
            }
            continue;
        }
        if (r == 0)
        {
            // End of stream: nothing more will arrive, don't spin forever
            break;
        }
        done += r;
        // Distribute the received bytes over the buffers
        while (v < iovcnt)
        {
            if (iov[v].iov_len > static_cast<size_t>(r))
            {
                // Partially filled buffer: resume right after the received bytes
                iov[v].iov_len -= r;
                iov[v].iov_base = static_cast<char*>(iov[v].iov_base) + r;
                break;
            }
            // Buffer is completely filled: the rest belongs to the next one
            r -= iov[v].iov_len;
            v++;
        }
    }
    return done;
}
int writev_blocking(int fd, iovec *iov, int iovcnt)
{
int v = 0;

View File

@@ -5,4 +5,5 @@
int read_blocking(int fd, void *read_buf, size_t remaining);
int write_blocking(int fd, void *write_buf, size_t remaining);
int readv_blocking(int fd, iovec *iov, int iovcnt);
int writev_blocking(int fd, iovec *iov, int iovcnt);

View File

@@ -25,20 +25,37 @@ int connect_stub(const char *server_address, int server_port);
void run_bench(int peer_fd);
static uint64_t read_sum = 0, read_count = 0;
static uint64_t write_sum = 0, write_count = 0;
static uint64_t sync_sum = 0, sync_count = 0;
void handle_sigint(int sig)
{
printf("4k randwrite: %lu us avg\n", write_sum/write_count);
printf("sync: %lu us avg\n", sync_sum/sync_count);
printf("4k randread: %lu us avg\n", read_count ? read_sum/read_count : 0);
printf("4k randwrite: %lu us avg\n", write_count ? write_sum/write_count : 0);
printf("sync: %lu us avg\n", sync_count ? sync_sum/sync_count : 0);
exit(0);
}
int main(int narg, char *args[])
{
if (narg < 2)
{
printf("USAGE: %s SERVER_IP [PORT]\n", args[0]);
return 1;
}
int port = 11203;
if (narg >= 3)
{
port = atoi(args[2]);
if (port <= 0 || port >= 65536)
{
printf("Bad port number\n");
return 1;
}
}
signal(SIGINT, handle_sigint);
int peer_fd = connect_stub("127.0.0.1", 11203);
int peer_fd = connect_stub(args[1], port);
run_bench(peer_fd);
close(peer_fd);
return 0;
@@ -98,10 +115,37 @@ void run_bench(int peer_fd)
osd_any_reply_t reply;
void *buf = NULL;
int r;
iovec iov[2];
timespec tv_begin, tv_end;
clock_gettime(CLOCK_REALTIME, &tv_begin);
while (1)
{
// read
op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
op.hdr.id = 1;
op.hdr.opcode = OSD_OP_SECONDARY_READ;
op.sec_rw.oid.inode = 3;
op.sec_rw.oid.stripe = (rand() << 17) % (1 << 29); // 512 MB
op.sec_rw.version = 0;
op.sec_rw.len = 4096;
op.sec_rw.offset = (rand() * op.sec_rw.len) % (1 << 17);
r = write_blocking(peer_fd, op.buf, OSD_PACKET_SIZE) == OSD_PACKET_SIZE;
if (!r)
break;
buf = malloc(op.sec_rw.len);
iov[0] = { reply.buf, OSD_PACKET_SIZE };
iov[1] = { buf, op.sec_rw.len };
r = readv_blocking(peer_fd, iov, 2) == (OSD_PACKET_SIZE + op.sec_rw.len);
free(buf);
if (!r || !check_reply(OSD_PACKET_SIZE, op, reply, op.sec_rw.len))
break;
clock_gettime(CLOCK_REALTIME, &tv_end);
read_count++;
read_sum += (
(tv_end.tv_sec - tv_begin.tv_sec)*1000000 +
tv_end.tv_nsec/1000 - tv_begin.tv_nsec/1000
);
tv_begin = tv_end;
// write
op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
op.hdr.id = 1;
@@ -113,9 +157,9 @@ void run_bench(int peer_fd)
op.sec_rw.offset = (rand() * op.sec_rw.len) % (1 << 17);
buf = malloc(op.sec_rw.len);
memset(buf, rand() % 255, op.sec_rw.len);
r = write_blocking(peer_fd, op.buf, OSD_PACKET_SIZE) == OSD_PACKET_SIZE;
if (r)
r = write_blocking(peer_fd, buf, op.sec_rw.len) == op.sec_rw.len;
iov[0] = { op.buf, OSD_PACKET_SIZE };
iov[1] = { buf, op.sec_rw.len };
r = writev_blocking(peer_fd, iov, 2) == (OSD_PACKET_SIZE + op.sec_rw.len);
free(buf);
if (!r)
break;
@@ -128,6 +172,7 @@ void run_bench(int peer_fd)
(tv_end.tv_sec - tv_begin.tv_sec)*1000000 +
tv_end.tv_nsec/1000 - tv_begin.tv_nsec/1000
);
tv_begin = tv_end;
// sync/stab
op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
op.hdr.id = 1;
@@ -138,11 +183,12 @@ void run_bench(int peer_fd)
r = read_blocking(peer_fd, reply.buf, OSD_PACKET_SIZE);
if (!check_reply(r, op, reply, 0))
break;
clock_gettime(CLOCK_REALTIME, &tv_begin);
clock_gettime(CLOCK_REALTIME, &tv_end);
sync_count++;
sync_sum += (
(tv_begin.tv_sec - tv_end.tv_sec)*1000000 +
tv_begin.tv_nsec/1000 - tv_end.tv_nsec/1000
(tv_end.tv_sec - tv_begin.tv_sec)*1000000 +
tv_end.tv_nsec/1000 - tv_begin.tv_nsec/1000
);
tv_begin = tv_end;
}
}

129
stub_uring_osd.cpp Normal file
View File

@@ -0,0 +1,129 @@
/**
* Stub "OSD" implemented on top of osd_messenger to test & compare
* network performance with sync read/write and io_uring
*/
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <arpa/inet.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <stdlib.h>
#include <stdexcept>
#include "ringloop.h"
#include "epoll_manager.h"
#include "messenger.h"
int bind_stub(const char *bind_address, int bind_port);
void stub_exec_op(osd_messenger_t *msgr, osd_op_t *op);
int main(int narg, char *args[])
{
ring_consumer_t looper;
ring_loop_t *ringloop = new ring_loop_t(512);
epoll_manager_t *epmgr = new epoll_manager_t(ringloop);
osd_messenger_t *msgr = new osd_messenger_t();
msgr->osd_num = 1351;
msgr->tfd = epmgr->tfd;
msgr->ringloop = ringloop;
msgr->repeer_pgs = [](osd_num_t) {};
msgr->exec_op = [msgr](osd_op_t *op) { stub_exec_op(msgr, op); };
// Accept new connections
int listen_fd = bind_stub("0.0.0.0", 11203);
epmgr->set_fd_handler(listen_fd, [listen_fd, msgr](int fd, int events)
{
msgr->accept_connections(listen_fd);
});
looper.loop = [msgr, ringloop]()
{
msgr->read_requests();
msgr->send_replies();
ringloop->submit();
};
ringloop->register_consumer(&looper);
printf("stub_uring_osd: waiting for clients\n");
while (true)
{
ringloop->loop();
ringloop->wait();
}
delete msgr;
delete epmgr;
delete ringloop;
return 0;
}
// Create a non-blocking IPv4 TCP listening socket bound to
// <bind_address>:<bind_port>.
//
// Returns the listening file descriptor.
// Throws std::runtime_error if the socket can't be created, the address is
// not a valid IPv4 address, or bind()/listen()/fcntl() fail. The socket is
// closed before throwing.
int bind_stub(const char *bind_address, int bind_port)
{
    const int listen_backlog = 128;

    int listen_fd = socket(AF_INET, SOCK_STREAM, 0);
    if (listen_fd < 0)
    {
        throw std::runtime_error(std::string("socket: ") + strerror(errno));
    }
    // Best effort: failing to set SO_REUSEADDR is not fatal
    int enable = 1;
    setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable));

    // Zero-initialize so that padding (sin_zero) doesn't contain stack garbage
    sockaddr_in addr = {};
    int r;
    if ((r = inet_pton(AF_INET, bind_address, &addr.sin_addr)) != 1)
    {
        close(listen_fd);
        throw std::runtime_error("bind address "+std::string(bind_address)+(r == 0 ? " is not valid" : ": no ipv4 support"));
    }
    addr.sin_family = AF_INET;
    addr.sin_port = htons(bind_port);

    if (bind(listen_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
    {
        // Save errno first: close() may overwrite it
        int err = errno;
        close(listen_fd);
        throw std::runtime_error(std::string("bind: ") + strerror(err));
    }
    if (listen(listen_fd, listen_backlog) < 0)
    {
        int err = errno;
        close(listen_fd);
        throw std::runtime_error(std::string("listen: ") + strerror(err));
    }

    // Switch to non-blocking mode and report a failure instead of silently
    // returning a blocking socket
    int flags = fcntl(listen_fd, F_GETFL, 0);
    if (flags < 0 || fcntl(listen_fd, F_SETFL, flags | O_NONBLOCK) < 0)
    {
        int err = errno;
        close(listen_fd);
        throw std::runtime_error(std::string("fcntl: ") + strerror(err));
    }
    return listen_fd;
}
// Stub request handler: replies to every operation without touching storage.
// - OSD_OP_SECONDARY_READ: replies with retval = requested length followed by
//   a freshly malloc()ed buffer of that length (contents are not filled in)
// - OSD_OP_SECONDARY_WRITE: replies with retval = requested length
// - OSD_OP_TEST_SYNC_STAB_ALL: replies with retval = 0
// - anything else: logs the opcode and replies with -EINVAL
void stub_exec_op(osd_messenger_t *msgr, osd_op_t *op)
{
    const auto opcode = op->req.hdr.opcode;

    // The reply header echoes the request id and opcode
    op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
    op->reply.hdr.id = op->req.hdr.id;
    op->reply.hdr.opcode = opcode;
    op->send_list.push_back(op->reply.buf, OSD_PACKET_SIZE);

    if (opcode == OSD_OP_SECONDARY_READ || opcode == OSD_OP_SECONDARY_WRITE)
    {
        // Both reads and writes report the full requested length as the result
        op->reply.hdr.retval = op->req.sec_rw.len;
        if (opcode == OSD_OP_SECONDARY_READ)
        {
            // Reads additionally send a payload of the requested size
            op->buf = malloc(op->req.sec_rw.len);
            op->send_list.push_back(op->buf, op->req.sec_rw.len);
        }
    }
    else if (opcode == OSD_OP_TEST_SYNC_STAB_ALL)
    {
        op->reply.hdr.retval = 0;
    }
    else
    {
        printf("client %d: unsupported stub opcode: %lu\n", op->peer_fd, op->req.hdr.opcode);
        op->reply.hdr.retval = -EINVAL;
    }

    msgr->outbox_push(op);
}