Compare commits

..

3 Commits

Author SHA1 Message Date
c414a90abc TRACE 2020-05-28 12:41:08 +03:00
36fe7d394b EPOLLLT 2020-05-28 12:41:08 +03:00
540137dd23 Submit 2020-05-28 12:41:08 +03:00
41 changed files with 1236 additions and 2683 deletions

View File

@@ -1,46 +0,0 @@
#!/usr/bin/perl
# Generate Makefile compile rules for every .cpp file, listing each file's
# full transitive set of locally-included headers (#include "...") as
# prerequisites. Output is meant to be pasted into / included by a Makefile.
use strict;
use warnings;

my $deps = parse_deps(scalar `grep '^#include "' *.cpp *.h`);
close_deps($deps);
print_rules($deps);

# parse_deps($grep_output) -> hashref
# Parse grep output lines of the form `file:#include "header"` into a
# two-level map: { file => { header => 1, ... }, ... }.
sub parse_deps
{
    my ($text) = @_;
    my $deps = {};
    for my $line (split /\n/, $text)
    {
        # File name is everything up to the first colon; header is the
        # quoted include target.
        if ($line =~ /^([^:]+):\#include "([^"]+)"/)
        {
            $deps->{$1}{$2} = 1;
        }
    }
    return $deps;
}

# close_deps($deps) -> $deps
# Expand the dependency map in place to its transitive closure: if A
# includes B and B includes C, record C as a dependency of A. Iterates
# until a full pass adds nothing new (fixed point).
sub close_deps
{
    my ($deps) = @_;
    my $added;
    do
    {
        $added = 0;
        for my $file (keys %$deps)
        {
            # `keys` snapshots the list, so adding entries below is safe.
            for my $dep (keys %{$deps->{$file}})
            {
                # Headers with no includes of their own have no entry.
                next unless $deps->{$dep};
                for my $subdep (keys %{$deps->{$dep}})
                {
                    if (!$deps->{$file}{$subdep})
                    {
                        $added = 1;
                        $deps->{$file}{$subdep} = 1;
                    }
                }
            }
        }
    } while ($added);
    return $deps;
}

# print_rules($deps)
# Emit one `obj.o: src.cpp headers...` rule plus a g++ recipe per .cpp
# file. Headers are sorted for deterministic output.
sub print_rules
{
    my ($deps) = @_;
    for my $file (sort keys %$deps)
    {
        # Only .cpp files become objects; headers are prerequisites only.
        next unless $file =~ /\.cpp$/;
        my $obj = $file;
        $obj =~ s/\.cpp$/.o/;
        print "$obj: $file ".join(" ", sort keys %{$deps->{$file}})."\n";
        print "\tg++ \$(CXXFLAGS) -c -o \$\@ \$\<\n";
    }
}

191
Makefile
View File

@@ -2,44 +2,85 @@ BLOCKSTORE_OBJS := allocator.o blockstore.o blockstore_impl.o blockstore_init.o
blockstore_write.o blockstore_sync.o blockstore_stable.o blockstore_rollback.o blockstore_flush.o crc32c.o ringloop.o blockstore_write.o blockstore_sync.o blockstore_stable.o blockstore_rollback.o blockstore_flush.o crc32c.o ringloop.o
# -fsanitize=address # -fsanitize=address
CXXFLAGS := -g -O3 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fPIC -fdiagnostics-color=always CXXFLAGS := -g -O3 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fPIC -fdiagnostics-color=always
all: libfio_blockstore.so osd libfio_sec_osd.so libfio_cluster.so stub_osd stub_uring_osd stub_bench osd_test dump_journal all: $(BLOCKSTORE_OBJS) libfio_blockstore.so osd libfio_sec_osd.so stub_osd stub_bench osd_test dump_journal
clean: clean:
rm -f *.o rm -f *.o
crc32c.o: crc32c.c
g++ $(CXXFLAGS) -c -o $@ $<
json11.o: json11/json11.cpp
g++ $(CXXFLAGS) -c -o json11.o json11/json11.cpp
allocator.o: allocator.cpp allocator.h
g++ $(CXXFLAGS) -c -o $@ $<
ringloop.o: ringloop.cpp ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
timerfd_interval.o: timerfd_interval.cpp timerfd_interval.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
timerfd_manager.o: timerfd_manager.cpp timerfd_manager.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
%.o: %.cpp allocator.h blockstore_flush.h blockstore.h blockstore_impl.h blockstore_init.h blockstore_journal.h crc32c.h ringloop.h object_id.h
g++ $(CXXFLAGS) -c -o $@ $<
dump_journal: dump_journal.cpp crc32c.o blockstore_journal.h dump_journal: dump_journal.cpp crc32c.o blockstore_journal.h
g++ $(CXXFLAGS) -o $@ $< crc32c.o g++ $(CXXFLAGS) -o $@ $< crc32c.o
libblockstore.so: $(BLOCKSTORE_OBJS) libblockstore.so: $(BLOCKSTORE_OBJS)
g++ $(CXXFLAGS) -o $@ -shared $(BLOCKSTORE_OBJS) -ltcmalloc_minimal -luring g++ $(CXXFLAGS) -o libblockstore.so -shared $(BLOCKSTORE_OBJS) -ltcmalloc_minimal -luring
libfio_blockstore.so: ./libblockstore.so fio_engine.o json11.o libfio_blockstore.so: ./libblockstore.so fio_engine.cpp json11.o
g++ $(CXXFLAGS) -shared -o $@ fio_engine.o json11.o ./libblockstore.so -ltcmalloc_minimal -luring g++ $(CXXFLAGS) -shared -o libfio_blockstore.so fio_engine.cpp json11.o ./libblockstore.so -ltcmalloc_minimal -luring
OSD_OBJS := osd.o osd_secondary.o msgr_receive.o msgr_send.o osd_peering.o osd_flush.o osd_peering_pg.o \ OSD_OBJS := osd.o osd_secondary.o osd_receive.o osd_send.o osd_peering.o osd_flush.o osd_peering_pg.o \
osd_primary.o osd_primary_subops.o etcd_state_client.o messenger.o osd_cluster.o http_client.o pg_states.o \ osd_primary.o osd_primary_subops.o etcd_state_client.o cluster_client.o osd_cluster.o http_client.o pg_states.o \
osd_rmw.o json11.o base64.o timerfd_manager.o osd_rmw.o json11.o base64.o timerfd_manager.o
base64.o: base64.cpp base64.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_secondary.o: osd_secondary.cpp osd.h osd_ops.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_receive.o: osd_receive.cpp osd.h osd_ops.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_send.o: osd_send.cpp osd.h osd_ops.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_peering.o: osd_peering.cpp osd.h osd_ops.h osd_peering_pg.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_cluster.o: osd_cluster.cpp osd.h osd_ops.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
http_client.o: http_client.cpp http_client.h
g++ $(CXXFLAGS) -c -o $@ $<
etcd_state_client.o: etcd_state_client.cpp etcd_state_client.h http_client.h pg_states.h
g++ $(CXXFLAGS) -c -o $@ $<
cluster_client.o: cluster_client.cpp cluster_client.h osd_ops.h timerfd_manager.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_flush.o: osd_flush.cpp osd.h osd_ops.h osd_peering_pg.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_peering_pg.o: osd_peering_pg.cpp object_id.h osd_peering_pg.h pg_states.h
g++ $(CXXFLAGS) -c -o $@ $<
pg_states.o: pg_states.cpp pg_states.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_rmw.o: osd_rmw.cpp osd_rmw.h xor.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_rmw_test: osd_rmw_test.cpp osd_rmw.cpp osd_rmw.h xor.h
g++ $(CXXFLAGS) -o $@ $<
osd_primary.o: osd_primary.cpp osd_primary.h osd_rmw.h osd.h osd_ops.h osd_peering_pg.h xor.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_primary_subops.o: osd_primary_subops.cpp osd_primary.h osd_rmw.h osd.h osd_ops.h osd_peering_pg.h xor.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
osd.o: osd.cpp osd.h http_client.h osd_ops.h osd_peering_pg.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
osd: ./libblockstore.so osd_main.cpp osd.h osd_ops.h $(OSD_OBJS) osd: ./libblockstore.so osd_main.cpp osd.h osd_ops.h $(OSD_OBJS)
g++ $(CXXFLAGS) -o $@ osd_main.cpp $(OSD_OBJS) ./libblockstore.so -ltcmalloc_minimal -luring g++ $(CXXFLAGS) -o osd osd_main.cpp $(OSD_OBJS) ./libblockstore.so -ltcmalloc_minimal -luring
stub_osd: stub_osd.cpp osd_ops.h rw_blocking.o
stub_osd: stub_osd.o rw_blocking.o g++ $(CXXFLAGS) -o stub_osd stub_osd.cpp rw_blocking.o -ltcmalloc_minimal
g++ $(CXXFLAGS) -o $@ stub_osd.o rw_blocking.o -ltcmalloc_minimal
STUB_URING_OSD_OBJS := stub_uring_osd.o epoll_manager.o messenger.o msgr_send.o msgr_receive.o ringloop.o timerfd_manager.o json11.o
stub_uring_osd: $(STUB_URING_OSD_OBJS)
g++ $(CXXFLAGS) -o $@ -ltcmalloc_minimal $(STUB_URING_OSD_OBJS) -luring
stub_bench: stub_bench.cpp osd_ops.h rw_blocking.o stub_bench: stub_bench.cpp osd_ops.h rw_blocking.o
g++ $(CXXFLAGS) -o $@ stub_bench.cpp rw_blocking.o -ltcmalloc_minimal g++ $(CXXFLAGS) -o stub_bench stub_bench.cpp rw_blocking.o -ltcmalloc_minimal
rw_blocking.o: rw_blocking.cpp rw_blocking.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_test: osd_test.cpp osd_ops.h rw_blocking.o osd_test: osd_test.cpp osd_ops.h rw_blocking.o
g++ $(CXXFLAGS) -o $@ osd_test.cpp rw_blocking.o -ltcmalloc_minimal g++ $(CXXFLAGS) -o osd_test osd_test.cpp rw_blocking.o -ltcmalloc_minimal
osd_peering_pg_test: osd_peering_pg_test.cpp osd_peering_pg.o osd_peering_pg_test: osd_peering_pg_test.cpp osd_peering_pg.o
g++ $(CXXFLAGS) -o $@ $< osd_peering_pg.o -ltcmalloc_minimal g++ $(CXXFLAGS) -o $@ $< osd_peering_pg.o -ltcmalloc_minimal
libfio_sec_osd.so: fio_sec_osd.o rw_blocking.o libfio_sec_osd.so: fio_sec_osd.cpp osd_ops.h rw_blocking.o
g++ $(CXXFLAGS) -ltcmalloc_minimal -shared -o $@ fio_sec_osd.o rw_blocking.o g++ $(CXXFLAGS) -ltcmalloc_minimal -shared -o libfio_sec_osd.so fio_sec_osd.cpp rw_blocking.o -luring
FIO_CLUSTER_OBJS := fio_cluster.o cluster_client.o epoll_manager.o etcd_state_client.o \
messenger.o msgr_send.o msgr_receive.o ringloop.o json11.o http_client.o pg_states.o timerfd_manager.o base64.o
libfio_cluster.so: $(FIO_CLUSTER_OBJS)
g++ $(CXXFLAGS) -ltcmalloc_minimal -shared -o $@ $(FIO_CLUSTER_OBJS) -luring
test_blockstore: ./libblockstore.so test_blockstore.cpp timerfd_interval.o test_blockstore: ./libblockstore.so test_blockstore.cpp timerfd_interval.o
g++ $(CXXFLAGS) -o test_blockstore test_blockstore.cpp timerfd_interval.o ./libblockstore.so -ltcmalloc_minimal -luring g++ $(CXXFLAGS) -o test_blockstore test_blockstore.cpp timerfd_interval.o ./libblockstore.so -ltcmalloc_minimal -luring
@@ -47,107 +88,3 @@ test: test.cpp osd_peering_pg.o
g++ $(CXXFLAGS) -o test test.cpp osd_peering_pg.o -luring -lm g++ $(CXXFLAGS) -o test test.cpp osd_peering_pg.o -luring -lm
test_allocator: test_allocator.cpp allocator.o test_allocator: test_allocator.cpp allocator.o
g++ $(CXXFLAGS) -o test_allocator test_allocator.cpp allocator.o g++ $(CXXFLAGS) -o test_allocator test_allocator.cpp allocator.o
crc32c.o: crc32c.c crc32c.h
g++ $(CXXFLAGS) -c -o $@ $<
json11.o: json11/json11.cpp
g++ $(CXXFLAGS) -c -o json11.o json11/json11.cpp
# Autogenerated
allocator.o: allocator.cpp allocator.h
g++ $(CXXFLAGS) -c -o $@ $<
base64.o: base64.cpp base64.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore.o: blockstore.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_flush.o: blockstore_flush.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_impl.o: blockstore_impl.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_init.o: blockstore_init.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_journal.o: blockstore_journal.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_open.o: blockstore_open.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_read.o: blockstore_read.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_rollback.o: blockstore_rollback.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_stable.o: blockstore_stable.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_sync.o: blockstore_sync.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_write.o: blockstore_write.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
cluster_client.o: cluster_client.cpp cluster_client.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
dump_journal.o: dump_journal.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
epoll_manager.o: epoll_manager.cpp epoll_manager.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
etcd_state_client.o: etcd_state_client.cpp base64.h etcd_state_client.h http_client.h json11/json11.hpp object_id.h osd_id.h osd_ops.h pg_states.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
fio_cluster.o: fio_cluster.cpp cluster_client.h epoll_manager.h etcd_state_client.h fio/fio.h fio/optgroup.h http_client.h json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
fio_engine.o: fio_engine.cpp blockstore.h fio/fio.h fio/optgroup.h json11/json11.hpp object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
fio_sec_osd.o: fio_sec_osd.cpp fio/fio.h fio/optgroup.h object_id.h osd_id.h osd_ops.h rw_blocking.h
g++ $(CXXFLAGS) -c -o $@ $<
http_client.o: http_client.cpp http_client.h json11/json11.hpp timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
messenger.o: messenger.cpp json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
msgr_receive.o: msgr_receive.cpp json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
msgr_send.o: msgr_send.cpp json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd.o: osd.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_cluster.o: osd_cluster.cpp base64.h blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_flush.o: osd_flush.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_main.o: osd_main.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_peering.o: osd_peering.cpp base64.h blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_peering_pg.o: osd_peering_pg.cpp cpp-btree/btree_map.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_peering_pg_test.o: osd_peering_pg_test.cpp cpp-btree/btree_map.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_primary.o: osd_primary.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h osd_primary.h osd_rmw.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_primary_subops.o: osd_primary_subops.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h osd_primary.h osd_rmw.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_rmw.o: osd_rmw.cpp object_id.h osd_id.h osd_rmw.h xor.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_rmw_test.o: osd_rmw_test.cpp object_id.h osd_id.h osd_rmw.cpp osd_rmw.h test_pattern.h xor.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_secondary.o: osd_secondary.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_test.o: osd_test.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h test_pattern.h
g++ $(CXXFLAGS) -c -o $@ $<
pg_states.o: pg_states.cpp pg_states.h
g++ $(CXXFLAGS) -c -o $@ $<
ringloop.o: ringloop.cpp ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
rw_blocking.o: rw_blocking.cpp rw_blocking.h
g++ $(CXXFLAGS) -c -o $@ $<
stub_bench.o: stub_bench.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h
g++ $(CXXFLAGS) -c -o $@ $<
stub_osd.o: stub_osd.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h
g++ $(CXXFLAGS) -c -o $@ $<
stub_uring_osd.o: stub_uring_osd.cpp epoll_manager.h json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
test.o: test.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
test_allocator.o: test_allocator.cpp allocator.h
g++ $(CXXFLAGS) -c -o $@ $<
test_blockstore.o: test_blockstore.cpp blockstore.h object_id.h ringloop.h timerfd_interval.h
g++ $(CXXFLAGS) -c -o $@ $<
timerfd_interval.o: timerfd_interval.cpp ringloop.h timerfd_interval.h
g++ $(CXXFLAGS) -c -o $@ $<
timerfd_manager.o: timerfd_manager.cpp timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<

View File

@@ -7,8 +7,8 @@ journal_flusher_t::journal_flusher_t(int flusher_count, blockstore_impl_t *bs)
dequeuing = false; dequeuing = false;
active_flushers = 0; active_flushers = 0;
syncing_flushers = 0; syncing_flushers = 0;
flusher_start_threshold = bs->journal_block_size / sizeof(journal_entry_stable); sync_threshold = bs->journal_block_size / sizeof(journal_entry_stable);
journal_trim_interval = flusher_start_threshold; journal_trim_interval = sync_threshold;
journal_trim_counter = 0; journal_trim_counter = 0;
journal_superblock = bs->journal.inmemory ? bs->journal.buffer : memalign(MEM_ALIGNMENT, bs->journal_block_size); journal_superblock = bs->journal.inmemory ? bs->journal.buffer : memalign(MEM_ALIGNMENT, bs->journal_block_size);
co = new journal_flusher_co[flusher_count]; co = new journal_flusher_co[flusher_count];
@@ -81,7 +81,7 @@ void journal_flusher_t::enqueue_flush(obj_ver_id ov)
flush_versions[ov.oid] = ov.version; flush_versions[ov.oid] = ov.version;
flush_queue.push_back(ov.oid); flush_queue.push_back(ov.oid);
} }
if (!dequeuing && flush_queue.size() >= flusher_start_threshold) if (!dequeuing && flush_queue.size() >= sync_threshold)
{ {
dequeuing = true; dequeuing = true;
bs->ringloop->wakeup(); bs->ringloop->wakeup();
@@ -101,27 +101,26 @@ void journal_flusher_t::unshift_flush(obj_ver_id ov)
flush_versions[ov.oid] = ov.version; flush_versions[ov.oid] = ov.version;
flush_queue.push_front(ov.oid); flush_queue.push_front(ov.oid);
} }
if (!dequeuing && flush_queue.size() >= flusher_start_threshold) if (!dequeuing && flush_queue.size() >= sync_threshold)
{ {
dequeuing = true; dequeuing = true;
bs->ringloop->wakeup(); bs->ringloop->wakeup();
} }
} }
void journal_flusher_t::request_trim() void journal_flusher_t::force_start()
{ {
dequeuing = true; dequeuing = true;
trim_wanted++;
bs->ringloop->wakeup(); bs->ringloop->wakeup();
} }
void journal_flusher_t::release_trim()
{
trim_wanted--;
}
#define await_sqe(label) \ #define await_sqe(label) \
resume_##label:\ resume_##label:\
{\
timespec now;\
clock_gettime(CLOCK_REALTIME, &now);\
printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);\
}\
sqe = bs->get_sqe();\ sqe = bs->get_sqe();\
if (!sqe)\ if (!sqe)\
{\ {\
@@ -187,71 +186,11 @@ resume_0:
(bs->journal.dirty_start >= bs->journal.used_start || (bs->journal.dirty_start >= bs->journal.used_start ||
dirty_end->second.journal_sector < bs->journal.used_start)) dirty_end->second.journal_sector < bs->journal.used_start))
{ {
flusher->enqueue_flush(cur);
// We can't flush journal sectors that are still written to // We can't flush journal sectors that are still written to
// However, as we group flushes by oid, current oid may have older writes to flush! flusher->enqueue_flush(cur);
// And it may even block writes if we don't flush the older version flusher->dequeuing = false;
// (if it's in the beginning of the journal)... wait_state = 0;
// So first try to find an older version of the same object to flush. return true;
bool found = false;
while (dirty_end != bs->dirty_db.begin())
{
dirty_end--;
if (dirty_end->first.oid != cur.oid)
{
break;
}
if (!(dirty_end->second.journal_sector >= bs->journal.dirty_start &&
(bs->journal.dirty_start >= bs->journal.used_start ||
dirty_end->second.journal_sector < bs->journal.used_start)))
{
found = true;
cur.version = dirty_end->first.version;
break;
}
}
if (!found)
{
// Try other objects
int search_left = flusher->flush_queue.size() - 1;
#ifdef BLOCKSTORE_DEBUG
printf("Flusher overran writers (dirty_start=%08lx) - searching for older flushes (%d left)\n", bs->journal.dirty_start, search_left);
#endif
while (search_left > 0)
{
cur.oid = flusher->flush_queue.front();
cur.version = flusher->flush_versions[cur.oid];
flusher->flush_queue.pop_front();
flusher->flush_versions.erase(cur.oid);
dirty_end = bs->dirty_db.find(cur);
if (dirty_end != bs->dirty_db.end())
{
if (dirty_end->second.journal_sector >= bs->journal.dirty_start &&
(bs->journal.dirty_start >= bs->journal.used_start ||
dirty_end->second.journal_sector < bs->journal.used_start))
{
#ifdef BLOCKSTORE_DEBUG
printf("Write %lu:%lu v%lu is too new: offset=%08lx\n", cur.oid.inode, cur.oid.stripe, cur.version, dirty_end->second.journal_sector);
#endif
flusher->enqueue_flush(cur);
}
else
{
break;
}
}
search_left--;
}
if (search_left <= 0)
{
#ifdef BLOCKSTORE_DEBUG
printf("No older flushes, stopping\n");
#endif
flusher->dequeuing = false;
wait_state = 0;
return true;
}
}
} }
repeat_it = flusher->sync_to_repeat.find(cur.oid); repeat_it = flusher->sync_to_repeat.find(cur.oid);
if (repeat_it != flusher->sync_to_repeat.end()) if (repeat_it != flusher->sync_to_repeat.end())
@@ -275,26 +214,32 @@ resume_0:
#endif #endif
flusher->active_flushers++; flusher->active_flushers++;
resume_1: resume_1:
// Find it in clean_db
clean_it = bs->clean_db.find(cur.oid);
old_clean_loc = (clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX);
// Scan dirty versions of the object // Scan dirty versions of the object
if (!scan_dirty(1)) if (!scan_dirty(1))
{ {
wait_state += 1; wait_state += 1;
return false; return false;
} }
// Writes and deletes shouldn't happen at the same time if (copy_count == 0 && clean_loc == UINT64_MAX && !has_delete && !has_empty)
assert(!(copy_count > 0 || has_writes) || !has_delete);
if (copy_count == 0 && !has_writes && !has_delete || has_delete && old_clean_loc == UINT64_MAX)
{ {
// Nothing to flush // Nothing to flush
bs->erase_dirty(dirty_start, std::next(dirty_end), clean_loc); flusher->active_flushers--;
goto trim_journal; repeat_it = flusher->sync_to_repeat.find(cur.oid);
if (repeat_it != flusher->sync_to_repeat.end() && repeat_it->second > cur.version)
{
// Requeue version
flusher->unshift_flush({ .oid = cur.oid, .version = repeat_it->second });
}
flusher->sync_to_repeat.erase(repeat_it);
wait_state = 0;
goto resume_0;
} }
// Find it in clean_db
clean_it = bs->clean_db.find(cur.oid);
old_clean_loc = (clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX);
if (clean_loc == UINT64_MAX) if (clean_loc == UINT64_MAX)
{ {
if (old_clean_loc == UINT64_MAX) if (copy_count > 0 && has_delete || old_clean_loc == UINT64_MAX)
{ {
// Object not allocated. This is a bug. // Object not allocated. This is a bug.
char err[1024]; char err[1024];
@@ -465,9 +410,8 @@ resume_1:
} }
// Update clean_db and dirty_db, free old data locations // Update clean_db and dirty_db, free old data locations
update_clean_db(); update_clean_db();
trim_journal:
// Clear unused part of the journal every <journal_trim_interval> flushes // Clear unused part of the journal every <journal_trim_interval> flushes
if (!((++flusher->journal_trim_counter) % flusher->journal_trim_interval) || flusher->trim_wanted > 0) if (!((++flusher->journal_trim_counter) % flusher->journal_trim_interval))
{ {
flusher->journal_trim_counter = 0; flusher->journal_trim_counter = 0;
if (bs->journal.trim()) if (bs->journal.trim())
@@ -497,7 +441,7 @@ resume_1:
} }
// All done // All done
#ifdef BLOCKSTORE_DEBUG #ifdef BLOCKSTORE_DEBUG
printf("Flushed %lu:%lu v%lu (%ld left)\n", cur.oid.inode, cur.oid.stripe, cur.version, flusher->flush_queue.size()); printf("Flushed %lu:%lu v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version);
#endif #endif
flusher->active_flushers--; flusher->active_flushers--;
repeat_it = flusher->sync_to_repeat.find(cur.oid); repeat_it = flusher->sync_to_repeat.find(cur.oid);
@@ -525,7 +469,7 @@ bool journal_flusher_co::scan_dirty(int wait_base)
copy_count = 0; copy_count = 0;
clean_loc = UINT64_MAX; clean_loc = UINT64_MAX;
has_delete = false; has_delete = false;
has_writes = false; has_empty = false;
skip_copy = false; skip_copy = false;
clean_init_bitmap = false; clean_init_bitmap = false;
while (1) while (1)
@@ -533,8 +477,11 @@ bool journal_flusher_co::scan_dirty(int wait_base)
if (dirty_it->second.state == ST_J_STABLE && !skip_copy) if (dirty_it->second.state == ST_J_STABLE && !skip_copy)
{ {
// First we submit all reads // First we submit all reads
has_writes = true; if (dirty_it->second.len == 0)
if (dirty_it->second.len != 0) {
has_empty = true;
}
else
{ {
offset = dirty_it->second.offset; offset = dirty_it->second.offset;
end_offset = dirty_it->second.offset + dirty_it->second.len; end_offset = dirty_it->second.offset + dirty_it->second.len;
@@ -576,7 +523,6 @@ bool journal_flusher_co::scan_dirty(int wait_base)
else if (dirty_it->second.state == ST_D_STABLE && !skip_copy) else if (dirty_it->second.state == ST_D_STABLE && !skip_copy)
{ {
// There is an unflushed big write. Copy small writes in its position // There is an unflushed big write. Copy small writes in its position
has_writes = true;
clean_loc = dirty_it->second.location; clean_loc = dirty_it->second.location;
clean_init_bitmap = true; clean_init_bitmap = true;
clean_bitmap_offset = dirty_it->second.offset; clean_bitmap_offset = dirty_it->second.offset;

View File

@@ -45,7 +45,7 @@ class journal_flusher_co
std::map<object_id, uint64_t>::iterator repeat_it; std::map<object_id, uint64_t>::iterator repeat_it;
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w; std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;
bool skip_copy, has_delete, has_writes; bool skip_copy, has_delete, has_empty;
blockstore_clean_db_t::iterator clean_it; blockstore_clean_db_t::iterator clean_it;
std::vector<copy_buffer_t> v; std::vector<copy_buffer_t> v;
std::vector<copy_buffer_t>::iterator it; std::vector<copy_buffer_t>::iterator it;
@@ -73,10 +73,9 @@ public:
// Journal flusher itself // Journal flusher itself
class journal_flusher_t class journal_flusher_t
{ {
int trim_wanted = 0;
bool dequeuing; bool dequeuing;
int flusher_count; int flusher_count;
int flusher_start_threshold; int sync_threshold;
journal_flusher_co *co; journal_flusher_co *co;
blockstore_impl_t *bs; blockstore_impl_t *bs;
friend class journal_flusher_co; friend class journal_flusher_co;
@@ -97,8 +96,7 @@ public:
~journal_flusher_t(); ~journal_flusher_t();
void loop(); void loop();
bool is_active(); bool is_active();
void request_trim(); void force_start();
void release_trim();
void enqueue_flush(obj_ver_id oid); void enqueue_flush(obj_ver_id oid);
void unshift_flush(obj_ver_id oid); void unshift_flush(obj_ver_id oid);
}; };

View File

@@ -124,6 +124,12 @@ void blockstore_impl_t::loop()
if (PRIV(op)->wait_for) if (PRIV(op)->wait_for)
{ {
check_wait(op); check_wait(op);
#ifdef BLOCKSTORE_DEBUG
if (PRIV(op)->wait_for)
{
printf("still waiting for %d\n", PRIV(op)->wait_for);
}
#endif
if (PRIV(op)->wait_for == WAIT_SQE) if (PRIV(op)->wait_for == WAIT_SQE)
{ {
break; break;
@@ -144,7 +150,7 @@ void blockstore_impl_t::loop()
{ {
dequeue_op = dequeue_read(op); dequeue_op = dequeue_read(op);
} }
else if (op->opcode == BS_OP_WRITE) else if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_DELETE)
{ {
if (has_writes == 2) if (has_writes == 2)
{ {
@@ -154,16 +160,6 @@ void blockstore_impl_t::loop()
dequeue_op = dequeue_write(op); dequeue_op = dequeue_write(op);
has_writes = dequeue_op ? 1 : 2; has_writes = dequeue_op ? 1 : 2;
} }
else if (op->opcode == BS_OP_DELETE)
{
if (has_writes == 2)
{
// Some writes could not be submitted
break;
}
dequeue_op = dequeue_del(op);
has_writes = dequeue_op ? 1 : 2;
}
else if (op->opcode == BS_OP_SYNC) else if (op->opcode == BS_OP_SYNC)
{ {
// wait for all small writes to be submitted // wait for all small writes to be submitted
@@ -275,9 +271,6 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
if (ringloop->space_left() < PRIV(op)->wait_detail) if (ringloop->space_left() < PRIV(op)->wait_detail)
{ {
// stop submission if there's still no free space // stop submission if there's still no free space
#ifdef BLOCKSTORE_DEBUG
printf("Still waiting for %lu SQE(s)\n", PRIV(op)->wait_detail);
#endif
return; return;
} }
PRIV(op)->wait_for = 0; PRIV(op)->wait_for = 0;
@@ -287,12 +280,8 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
if (journal.used_start == PRIV(op)->wait_detail) if (journal.used_start == PRIV(op)->wait_detail)
{ {
// do not submit // do not submit
#ifdef BLOCKSTORE_DEBUG
printf("Still waiting to flush journal offset %08lx\n", PRIV(op)->wait_detail);
#endif
return; return;
} }
flusher->release_trim();
PRIV(op)->wait_for = 0; PRIV(op)->wait_for = 0;
} }
else if (PRIV(op)->wait_for == WAIT_JOURNAL_BUFFER) else if (PRIV(op)->wait_for == WAIT_JOURNAL_BUFFER)
@@ -302,9 +291,6 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
journal.sector_info[next].dirty) journal.sector_info[next].dirty)
{ {
// do not submit // do not submit
#ifdef BLOCKSTORE_DEBUG
printf("Still waiting for a journal buffer\n");
#endif
return; return;
} }
PRIV(op)->wait_for = 0; PRIV(op)->wait_for = 0;
@@ -313,9 +299,6 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
{ {
if (!data_alloc->get_free_count() && !flusher->is_active()) if (!data_alloc->get_free_count() && !flusher->is_active())
{ {
#ifdef BLOCKSTORE_DEBUG
printf("Still waiting for free space on the data device\n");
#endif
return; return;
} }
PRIV(op)->wait_for = 0; PRIV(op)->wait_for = 0;
@@ -380,7 +363,7 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
} }
}; };
} }
if ((op->opcode == BS_OP_WRITE || op->opcode == BS_OP_DELETE) && !enqueue_write(op)) if (op->opcode == BS_OP_WRITE && !enqueue_write(op))
{ {
std::function<void (blockstore_op_t*)>(op->callback)(op); std::function<void (blockstore_op_t*)>(op->callback)(op);
return; return;
@@ -407,31 +390,9 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
ringloop->wakeup(); ringloop->wakeup();
} }
static bool replace_stable(object_id oid, uint64_t version, int search_start, int search_end, obj_ver_id* list)
{
while (search_start < search_end)
{
int pos = search_start+(search_end-search_start)/2;
if (oid < list[pos].oid)
{
search_end = pos;
}
else if (list[pos].oid < oid)
{
search_start = pos+1;
}
else
{
list[pos].version = version;
return true;
}
}
return false;
}
void blockstore_impl_t::process_list(blockstore_op_t *op) void blockstore_impl_t::process_list(blockstore_op_t *op)
{ {
// Check PG // Count objects
uint32_t list_pg = op->offset; uint32_t list_pg = op->offset;
uint32_t pg_count = op->len; uint32_t pg_count = op->len;
uint64_t pg_stripe_size = op->oid.stripe; uint64_t pg_stripe_size = op->oid.stripe;
@@ -441,131 +402,70 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
FINISH_OP(op); FINISH_OP(op);
return; return;
} }
// Copy clean_db entries (sorted) uint64_t stable_count = 0;
int stable_count = 0, stable_alloc = clean_db.size() / (pg_count ? pg_count : 1); if (pg_count > 0)
obj_ver_id *stable = (obj_ver_id*)malloc(sizeof(obj_ver_id) * stable_alloc); {
if (!stable) for (auto it = clean_db.begin(); it != clean_db.end(); it++)
{
uint32_t pg = (it->first.inode + it->first.stripe / pg_stripe_size) % pg_count;
if (pg == list_pg)
{
stable_count++;
}
}
}
else
{
stable_count = clean_db.size();
}
uint64_t total_count = stable_count;
for (auto it = dirty_db.begin(); it != dirty_db.end(); it++)
{
if (!pg_count || ((it->first.oid.inode + it->first.oid.stripe / pg_stripe_size) % pg_count) == list_pg)
{
if (IS_STABLE(it->second.state))
{
stable_count++;
}
total_count++;
}
}
// Allocate memory
op->version = stable_count;
op->retval = total_count;
op->buf = malloc(sizeof(obj_ver_id) * total_count);
if (!op->buf)
{ {
op->retval = -ENOMEM; op->retval = -ENOMEM;
FINISH_OP(op); FINISH_OP(op);
return; return;
} }
obj_ver_id *vers = (obj_ver_id*)op->buf;
int i = 0;
for (auto it = clean_db.begin(); it != clean_db.end(); it++) for (auto it = clean_db.begin(); it != clean_db.end(); it++)
{ {
if (!pg_count || ((it->first.inode + it->first.stripe / pg_stripe_size) % pg_count) == list_pg) if (!pg_count || ((it->first.inode + it->first.stripe / pg_stripe_size) % pg_count) == list_pg)
{ {
if (stable_count >= stable_alloc) vers[i++] = {
{
stable_alloc += 32768;
stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
if (!stable)
{
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
}
stable[stable_count++] = {
.oid = it->first, .oid = it->first,
.version = it->second.version, .version = it->second.version,
}; };
} }
} }
int clean_stable_count = stable_count; int j = stable_count;
// Copy dirty_db entries (sorted, too)
int unstable_count = 0, unstable_alloc = 0;
obj_ver_id *unstable = NULL;
for (auto it = dirty_db.begin(); it != dirty_db.end(); it++) for (auto it = dirty_db.begin(); it != dirty_db.end(); it++)
{ {
if (!pg_count || ((it->first.oid.inode + it->first.oid.stripe / pg_stripe_size) % pg_count) == list_pg) if (!pg_count || ((it->first.oid.inode + it->first.oid.stripe / pg_stripe_size) % pg_count) == list_pg)
{ {
if (IS_DELETE(it->second.state)) if (IS_STABLE(it->second.state))
{ {
// Deletions are always stable, so try to zero out two possible entries vers[i++] = it->first;
if (!replace_stable(it->first.oid, 0, 0, clean_stable_count, stable))
{
replace_stable(it->first.oid, 0, clean_stable_count, stable_count, stable);
}
}
else if (IS_STABLE(it->second.state))
{
// First try to replace a clean stable version in the first part of the list
if (!replace_stable(it->first.oid, it->first.version, 0, clean_stable_count, stable))
{
// Then try to replace the last dirty stable version in the second part of the list
if (stable[stable_count-1].oid == it->first.oid)
{
stable[stable_count-1].version = it->first.version;
}
else
{
if (stable_count >= stable_alloc)
{
stable_alloc += 32768;
stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
if (!stable)
{
if (unstable)
free(unstable);
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
}
stable[stable_count++] = it->first;
}
}
} }
else else
{ {
if (unstable_count >= unstable_alloc) vers[j++] = it->first;
{
unstable_alloc += 32768;
unstable = (obj_ver_id*)realloc(unstable, sizeof(obj_ver_id) * unstable_alloc);
if (!unstable)
{
if (stable)
free(stable);
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
}
unstable[unstable_count++] = it->first;
} }
} }
} }
// Remove zeroed out stable entries
int j = 0;
for (int i = 0; i < stable_count; i++)
{
if (stable[i].version != 0)
{
stable[j++] = stable[i];
}
}
stable_count = j;
if (stable_count+unstable_count > stable_alloc)
{
stable_alloc = stable_count+unstable_count;
stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
if (!stable)
{
if (unstable)
free(unstable);
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
}
// Copy unstable entries
for (int i = 0; i < unstable_count; i++)
{
stable[j++] = unstable[i];
}
free(unstable);
op->version = stable_count;
op->retval = stable_count+unstable_count;
op->buf = stable;
FINISH_OP(op); FINISH_OP(op);
} }

View File

@@ -62,6 +62,11 @@
struct ring_data_t *data = ((ring_data_t*)sqe->user_data) struct ring_data_t *data = ((ring_data_t*)sqe->user_data)
#define BS_SUBMIT_GET_ONLY_SQE(sqe) \ #define BS_SUBMIT_GET_ONLY_SQE(sqe) \
{\
timespec now;\
clock_gettime(CLOCK_REALTIME, &now);\
printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);\
}\
struct io_uring_sqe *sqe = get_sqe();\ struct io_uring_sqe *sqe = get_sqe();\
if (!sqe)\ if (!sqe)\
{\ {\
@@ -71,6 +76,11 @@
} }
#define BS_SUBMIT_GET_SQE_DECL(sqe) \ #define BS_SUBMIT_GET_SQE_DECL(sqe) \
{\
timespec now;\
clock_gettime(CLOCK_REALTIME, &now);\
printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);\
}\
sqe = get_sqe();\ sqe = get_sqe();\
if (!sqe)\ if (!sqe)\
{\ {\
@@ -286,14 +296,12 @@ class blockstore_impl_t
// Stabilize // Stabilize
int dequeue_stable(blockstore_op_t *op); int dequeue_stable(blockstore_op_t *op);
int continue_stable(blockstore_op_t *op); int continue_stable(blockstore_op_t *op);
void mark_stable(const obj_ver_id & ov);
void handle_stable_event(ring_data_t *data, blockstore_op_t *op); void handle_stable_event(ring_data_t *data, blockstore_op_t *op);
void stabilize_object(object_id oid, uint64_t max_ver); void stabilize_object(object_id oid, uint64_t max_ver);
// Rollback // Rollback
int dequeue_rollback(blockstore_op_t *op); int dequeue_rollback(blockstore_op_t *op);
int continue_rollback(blockstore_op_t *op); int continue_rollback(blockstore_op_t *op);
void mark_rolled_back(const obj_ver_id & ov);
void handle_rollback_event(ring_data_t *data, blockstore_op_t *op); void handle_rollback_event(ring_data_t *data, blockstore_op_t *op);
void erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc); void erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc);

View File

@@ -404,7 +404,7 @@ resume_1:
bs->journal.trim(); bs->journal.trim();
bs->journal.dirty_start = bs->journal.next_free; bs->journal.dirty_start = bs->journal.next_free;
printf( printf(
"Journal entries loaded: %lu, free journal space: %lu bytes (%08lx..%08lx is used), free blocks: %lu / %lu\n", "Journal entries loaded: %lu, free journal space: %lu bytes (%lu..%lu is used), free blocks: %lu / %lu\n",
entries_loaded, entries_loaded,
(bs->journal.next_free >= bs->journal.used_start (bs->journal.next_free >= bs->journal.used_start
? bs->journal.len-bs->journal.block_size - (bs->journal.next_free-bs->journal.used_start) ? bs->journal.len-bs->journal.block_size - (bs->journal.next_free-bs->journal.used_start)
@@ -475,7 +475,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
if (location != je->small_write.data_offset) if (location != je->small_write.data_offset)
{ {
char err[1024]; char err[1024];
snprintf(err, 1024, "BUG: calculated journal data offset (%08lx) != stored journal data offset (%08lx)", location, je->small_write.data_offset); snprintf(err, 1024, "BUG: calculated journal data offset (%lu) != stored journal data offset (%lu)", location, je->small_write.data_offset);
throw std::runtime_error(err); throw std::runtime_error(err);
} }
uint32_t data_crc32 = 0; uint32_t data_crc32 = 0;
@@ -537,10 +537,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
}); });
bs->journal.used_sectors[proc_pos]++; bs->journal.used_sectors[proc_pos]++;
#ifdef BLOCKSTORE_DEBUG #ifdef BLOCKSTORE_DEBUG
printf( printf("journal offset %lu is used by %lu:%lu v%lu\n", proc_pos, ov.oid.inode, ov.oid.stripe, ov.version);
"journal offset %08lx is used by %lu:%lu v%lu (%lu refs)\n",
proc_pos, ov.oid.inode, ov.oid.stripe, ov.version, bs->journal.used_sectors[proc_pos]
);
#endif #endif
auto & unstab = bs->unstable_writes[ov.oid]; auto & unstab = bs->unstable_writes[ov.oid];
unstab = unstab < ov.version ? ov.version : unstab; unstab = unstab < ov.version ? ov.version : unstab;
@@ -587,7 +584,33 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
.oid = je->stable.oid, .oid = je->stable.oid,
.version = je->stable.version, .version = je->stable.version,
}; };
bs->mark_stable(ov); auto it = bs->dirty_db.find(ov);
if (it == bs->dirty_db.end())
{
// journal contains a legitimate STABLE entry for a non-existing dirty write
// this probably means that journal was trimmed between WRITE and STABLE entries
// skip it
}
else
{
while (1)
{
it->second.state = (it->second.state == ST_D_SYNCED
? ST_D_STABLE
: (it->second.state == ST_DEL_SYNCED ? ST_DEL_STABLE : ST_J_STABLE));
if (it == bs->dirty_db.begin())
break;
it--;
if (it->first.oid != ov.oid || IS_STABLE(it->second.state))
break;
}
bs->flusher->enqueue_flush(ov);
}
auto unstab_it = bs->unstable_writes.find(ov.oid);
if (unstab_it != bs->unstable_writes.end() && unstab_it->second <= ov.version)
{
bs->unstable_writes.erase(unstab_it);
}
} }
else if (je->type == JE_ROLLBACK) else if (je->type == JE_ROLLBACK)
{ {
@@ -595,39 +618,70 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
printf("je_rollback oid=%lu:%lu ver=%lu\n", je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version); printf("je_rollback oid=%lu:%lu ver=%lu\n", je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version);
#endif #endif
// rollback dirty writes of <oid> up to <version> // rollback dirty writes of <oid> up to <version>
obj_ver_id ov = { auto it = bs->dirty_db.lower_bound((obj_ver_id){
.oid = je->rollback.oid, .oid = je->rollback.oid,
.version = je->rollback.version, .version = UINT64_MAX,
}; });
bs->mark_rolled_back(ov); if (it != bs->dirty_db.begin())
{
uint64_t max_unstable = 0;
auto rm_start = it;
auto rm_end = it;
it--;
while (it->first.oid == je->rollback.oid &&
it->first.version > je->rollback.version &&
!IS_IN_FLIGHT(it->second.state) &&
!IS_STABLE(it->second.state))
{
if (it->first.oid != je->rollback.oid)
break;
else if (it->first.version <= je->rollback.version)
{
if (!IS_STABLE(it->second.state))
max_unstable = it->first.version;
break;
}
else if (IS_STABLE(it->second.state))
break;
// Remove entry
rm_start = it;
if (it == bs->dirty_db.begin())
break;
it--;
}
if (rm_start != rm_end)
{
bs->erase_dirty(rm_start, rm_end, UINT64_MAX);
}
auto unstab_it = bs->unstable_writes.find(je->rollback.oid);
if (unstab_it != bs->unstable_writes.end())
{
if (max_unstable == 0)
bs->unstable_writes.erase(unstab_it);
else
unstab_it->second = max_unstable;
}
}
} }
else if (je->type == JE_DELETE) else if (je->type == JE_DELETE)
{ {
#ifdef BLOCKSTORE_DEBUG #ifdef BLOCKSTORE_DEBUG
printf("je_delete oid=%lu:%lu ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version); printf("je_delete oid=%lu:%lu ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version);
#endif #endif
auto clean_it = bs->clean_db.find(je->del.oid); // oid, version
if (clean_it == bs->clean_db.end() || obj_ver_id ov = {
clean_it->second.version < je->del.version) .oid = je->del.oid,
{ .version = je->del.version,
// oid, version };
obj_ver_id ov = { bs->dirty_db.emplace(ov, (dirty_entry){
.oid = je->del.oid, .state = ST_DEL_SYNCED,
.version = je->del.version, .flags = 0,
}; .location = 0,
bs->dirty_db.emplace(ov, (dirty_entry){ .offset = 0,
.state = ST_DEL_SYNCED, .len = 0,
.flags = 0, .journal_sector = proc_pos,
.location = 0, });
.offset = 0, bs->journal.used_sectors[proc_pos]++;
.len = 0,
.journal_sector = proc_pos,
});
bs->journal.used_sectors[proc_pos]++;
// Deletions are treated as immediately stable, because
// "2-phase commit" (write->stabilize) isn't sufficient for them anyway
bs->mark_stable(ov);
}
} }
started = true; started = true;
pos += je->size; pos += je->size;

View File

@@ -101,7 +101,7 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
: bs->journal.used_start - bs->journal.next_free) : bs->journal.used_start - bs->journal.next_free)
); );
PRIV(op)->wait_for = WAIT_JOURNAL; PRIV(op)->wait_for = WAIT_JOURNAL;
bs->flusher->request_trim(); bs->flusher->force_start();
PRIV(op)->wait_detail = bs->journal.used_start; PRIV(op)->wait_detail = bs->journal.used_start;
return 0; return 0;
} }
@@ -180,8 +180,8 @@ bool journal_t::trim()
auto journal_used_it = used_sectors.lower_bound(used_start); auto journal_used_it = used_sectors.lower_bound(used_start);
#ifdef BLOCKSTORE_DEBUG #ifdef BLOCKSTORE_DEBUG
printf( printf(
"Trimming journal (used_start=%08lx, next_free=%08lx, dirty_start=%08lx, new_start=%08lx, new_refcount=%ld)\n", "Trimming journal (used_start=%08lx, next_free=%08lx, first_used=%08lx, usage_count=%08lx)\n",
used_start, next_free, dirty_start, used_start, next_free,
journal_used_it == used_sectors.end() ? 0 : journal_used_it->first, journal_used_it == used_sectors.end() ? 0 : journal_used_it->first,
journal_used_it == used_sectors.end() ? 0 : journal_used_it->second journal_used_it == used_sectors.end() ? 0 : journal_used_it->second
); );

View File

@@ -77,6 +77,33 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
} }
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++) for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{ {
// FIXME This is here only for the purpose of tracking unstable_writes. Remove if not required
// FIXME ...aaaand this is similar to blockstore_init.cpp - maybe dedup it?
auto dirty_it = dirty_db.lower_bound((obj_ver_id){
.oid = v->oid,
.version = UINT64_MAX,
});
uint64_t max_unstable = 0;
while (dirty_it != dirty_db.begin())
{
dirty_it--;
if (dirty_it->first.oid != v->oid)
break;
else if (dirty_it->first.version <= v->version)
{
if (!IS_STABLE(dirty_it->second.state))
max_unstable = dirty_it->first.version;
break;
}
}
auto unstab_it = unstable_writes.find(v->oid);
if (unstab_it != unstable_writes.end())
{
if (max_unstable == 0)
unstable_writes.erase(unstab_it);
else
unstab_it->second = max_unstable;
}
journal_entry_rollback *je = (journal_entry_rollback*) journal_entry_rollback *je = (journal_entry_rollback*)
prefill_single_journal_entry(journal, JE_ROLLBACK, sizeof(journal_entry_rollback)); prefill_single_journal_entry(journal, JE_ROLLBACK, sizeof(journal_entry_rollback));
journal.sector_info[journal.cur_sector].dirty = false; journal.sector_info[journal.cur_sector].dirty = false;
@@ -134,7 +161,26 @@ resume_5:
int i; int i;
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++) for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{ {
mark_rolled_back(*v); // Erase dirty_db entries
auto rm_end = dirty_db.lower_bound((obj_ver_id){
.oid = v->oid,
.version = UINT64_MAX,
});
auto rm_start = rm_end;
assert(rm_start != dirty_db.begin());
rm_start--;
while (1)
{
if (rm_start->first.oid != v->oid || rm_start->first.version <= v->version)
{
rm_start++;
break;
}
if (rm_start == dirty_db.begin())
break;
rm_start--;
}
erase_dirty(rm_start, rm_end, UINT64_MAX);
} }
journal.trim(); journal.trim();
inflight_writes--; inflight_writes--;
@@ -144,54 +190,6 @@ resume_5:
return 1; return 1;
} }
void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
{
auto it = dirty_db.lower_bound((obj_ver_id){
.oid = ov.oid,
.version = UINT64_MAX,
});
if (it != dirty_db.begin())
{
uint64_t max_unstable = 0;
auto rm_start = it;
auto rm_end = it;
it--;
while (it->first.oid == ov.oid &&
it->first.version > ov.version &&
!IS_IN_FLIGHT(it->second.state) &&
!IS_STABLE(it->second.state))
{
if (it->first.oid != ov.oid)
break;
else if (it->first.version <= ov.version)
{
if (!IS_STABLE(it->second.state))
max_unstable = it->first.version;
break;
}
else if (IS_STABLE(it->second.state))
break;
// Remove entry
rm_start = it;
if (it == dirty_db.begin())
break;
it--;
}
if (rm_start != rm_end)
{
erase_dirty(rm_start, rm_end, UINT64_MAX);
}
auto unstab_it = unstable_writes.find(ov.oid);
if (unstab_it != unstable_writes.end())
{
if (max_unstable == 0)
unstable_writes.erase(unstab_it);
else
unstab_it->second = max_unstable;
}
}
}
void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t *op) void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t *op)
{ {
live = true; live = true;
@@ -227,13 +225,11 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
#endif #endif
data_alloc->set(dirty_it->second.location >> block_order, false); data_alloc->set(dirty_it->second.location >> block_order, false);
} }
int used = --journal.used_sectors[dirty_it->second.journal_sector];
#ifdef BLOCKSTORE_DEBUG #ifdef BLOCKSTORE_DEBUG
printf( printf("remove usage of journal offset %lu by %lu:%lu v%lu\n", dirty_it->second.journal_sector,
"remove usage of journal offset %08lx by %lu:%lu v%lu (%d refs)\n", dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, used
);
#endif #endif
int used = --journal.used_sectors[dirty_it->second.journal_sector];
if (used == 0) if (used == 0)
{ {
journal.used_sectors.erase(dirty_it->second.journal_sector); journal.used_sectors.erase(dirty_it->second.journal_sector);

View File

@@ -109,6 +109,12 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++) for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{ {
// FIXME: Only stabilize versions that aren't stable yet // FIXME: Only stabilize versions that aren't stable yet
auto unstab_it = unstable_writes.find(v->oid);
if (unstab_it != unstable_writes.end() &&
unstab_it->second <= v->version)
{
unstable_writes.erase(unstab_it);
}
journal_entry_stable *je = (journal_entry_stable*) journal_entry_stable *je = (journal_entry_stable*)
prefill_single_journal_entry(journal, JE_STABLE, sizeof(journal_entry_stable)); prefill_single_journal_entry(journal, JE_STABLE, sizeof(journal_entry_stable));
journal.sector_info[journal.cur_sector].dirty = false; journal.sector_info[journal.cur_sector].dirty = false;
@@ -147,6 +153,11 @@ resume_2:
resume_3: resume_3:
if (!disable_journal_fsync) if (!disable_journal_fsync)
{ {
{
timespec now;
clock_gettime(CLOCK_REALTIME, &now);
printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
}
io_uring_sqe *sqe = get_sqe(); io_uring_sqe *sqe = get_sqe();
if (!sqe) if (!sqe)
{ {
@@ -168,7 +179,42 @@ resume_5:
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++) for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{ {
// Mark all dirty_db entries up to op->version as stable // Mark all dirty_db entries up to op->version as stable
mark_stable(*v); auto dirty_it = dirty_db.find(*v);
if (dirty_it != dirty_db.end())
{
while (1)
{
if (dirty_it->second.state == ST_J_SYNCED)
{
dirty_it->second.state = ST_J_STABLE;
}
else if (dirty_it->second.state == ST_D_SYNCED)
{
dirty_it->second.state = ST_D_STABLE;
}
else if (dirty_it->second.state == ST_DEL_SYNCED)
{
dirty_it->second.state = ST_DEL_STABLE;
}
else if (IS_STABLE(dirty_it->second.state))
{
break;
}
if (dirty_it == dirty_db.begin())
{
break;
}
dirty_it--;
if (dirty_it->first.oid != v->oid)
{
break;
}
}
#ifdef BLOCKSTORE_DEBUG
printf("enqueue_flush %lu:%lu v%lu\n", v->oid.inode, v->oid.stripe, v->version);
#endif
flusher->enqueue_flush(*v);
}
} }
inflight_writes--; inflight_writes--;
// Acknowledge op // Acknowledge op
@@ -177,52 +223,6 @@ resume_5:
return 1; return 1;
} }
void blockstore_impl_t::mark_stable(const obj_ver_id & v)
{
auto dirty_it = dirty_db.find(v);
if (dirty_it != dirty_db.end())
{
while (1)
{
if (dirty_it->second.state == ST_J_SYNCED)
{
dirty_it->second.state = ST_J_STABLE;
}
else if (dirty_it->second.state == ST_D_SYNCED)
{
dirty_it->second.state = ST_D_STABLE;
}
else if (dirty_it->second.state == ST_DEL_SYNCED)
{
dirty_it->second.state = ST_DEL_STABLE;
}
else if (IS_STABLE(dirty_it->second.state))
{
break;
}
if (dirty_it == dirty_db.begin())
{
break;
}
dirty_it--;
if (dirty_it->first.oid != v.oid)
{
break;
}
}
#ifdef BLOCKSTORE_DEBUG
printf("enqueue_flush %lu:%lu v%lu\n", v.oid.inode, v.oid.stripe, v.version);
#endif
flusher->enqueue_flush(v);
}
auto unstab_it = unstable_writes.find(v.oid);
if (unstab_it != unstable_writes.end() &&
unstab_it->second <= v.version)
{
unstable_writes.erase(unstab_it);
}
}
void blockstore_impl_t::handle_stable_event(ring_data_t *data, blockstore_op_t *op) void blockstore_impl_t::handle_stable_event(ring_data_t *data, blockstore_op_t *op)
{ {
live = true; live = true;

View File

@@ -133,11 +133,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
journal.sector_info[journal.cur_sector].dirty = false; journal.sector_info[journal.cur_sector].dirty = false;
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++; journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
#ifdef BLOCKSTORE_DEBUG #ifdef BLOCKSTORE_DEBUG
printf( printf("journal offset %lu is used by %lu:%lu v%lu\n", dirty_db[*it].journal_sector, it->oid.inode, it->oid.stripe, it->version);
"journal offset %08lx is used by %lu:%lu v%lu (%lu refs)\n",
dirty_db[*it].journal_sector, it->oid.inode, it->oid.stripe, it->version,
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
);
#endif #endif
je->oid = it->oid; je->oid = it->oid;
je->version = it->version; je->version = it->version;
@@ -275,16 +271,7 @@ void blockstore_impl_t::ack_one_sync(blockstore_op_t *op)
#endif #endif
auto & unstab = unstable_writes[it->oid]; auto & unstab = unstable_writes[it->oid];
unstab = unstab < it->version ? it->version : unstab; unstab = unstab < it->version ? it->version : unstab;
if (dirty_db[*it].state == ST_DEL_WRITTEN) dirty_db[*it].state = dirty_db[*it].state == ST_DEL_WRITTEN ? ST_DEL_SYNCED : ST_J_SYNCED;
{
dirty_db[*it].state = ST_DEL_SYNCED;
// Deletions are treated as immediately stable
mark_stable(*it);
}
else /* == ST_J_WRITTEN */
{
dirty_db[*it].state = ST_J_SYNCED;
}
} }
in_progress_syncs.erase(PRIV(op)->in_progress_ptr); in_progress_syncs.erase(PRIV(op)->in_progress_ptr);
op->retval = 0; op->retval = 0;

View File

@@ -100,7 +100,6 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
.oid = op->oid, .oid = op->oid,
.version = op->version, .version = op->version,
}); });
assert(dirty_it != dirty_db.end());
if (dirty_it->second.state == ST_J_WAIT_BIG) if (dirty_it->second.state == ST_J_WAIT_BIG)
{ {
return 0; return 0;
@@ -214,11 +213,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset; dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++; journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
#ifdef BLOCKSTORE_DEBUG #ifdef BLOCKSTORE_DEBUG
printf( printf("journal offset %lu is used by %lu:%lu v%lu\n", dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
"journal offset %08lx is used by %lu:%lu v%lu (%lu refs)\n",
dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
);
#endif #endif
// Figure out where data will be // Figure out where data will be
journal.next_free = (journal.next_free + op->len) <= journal.len ? journal.next_free : journal_block_size; journal.next_free = (journal.next_free + op->len) <= journal.len ? journal.next_free : journal_block_size;
@@ -293,7 +288,6 @@ int blockstore_impl_t::continue_write(blockstore_op_t *op)
.oid = op->oid, .oid = op->oid,
.version = op->version, .version = op->version,
}); });
assert(dirty_it != dirty_db.end());
if (PRIV(op)->op_state == 2) if (PRIV(op)->op_state == 2)
goto resume_2; goto resume_2;
else if (PRIV(op)->op_state == 4) else if (PRIV(op)->op_state == 4)
@@ -302,6 +296,11 @@ int blockstore_impl_t::continue_write(blockstore_op_t *op)
return 1; return 1;
resume_2: resume_2:
// Only for the immediate_commit mode: prepare and submit big_write journal entry // Only for the immediate_commit mode: prepare and submit big_write journal entry
{
timespec now;
clock_gettime(CLOCK_REALTIME, &now);
printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
}
sqe = get_sqe(); sqe = get_sqe();
if (!sqe) if (!sqe)
{ {
@@ -312,11 +311,7 @@ resume_2:
journal.sector_info[journal.cur_sector].dirty = false; journal.sector_info[journal.cur_sector].dirty = false;
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++; journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
#ifdef BLOCKSTORE_DEBUG #ifdef BLOCKSTORE_DEBUG
printf( printf("journal offset %lu is used by %lu:%lu v%lu\n", journal.sector_info[journal.cur_sector].offset, op->oid.inode, op->oid.stripe, op->version);
"journal offset %08lx is used by %lu:%lu v%lu (%lu refs)\n",
journal.sector_info[journal.cur_sector].offset, op->oid.inode, op->oid.stripe, op->version,
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
);
#endif #endif
je->oid = op->oid; je->oid = op->oid;
je->version = op->version; je->version = op->version;
@@ -333,6 +328,11 @@ resume_2:
return 1; return 1;
resume_4: resume_4:
// Switch object state // Switch object state
{
timespec now;
clock_gettime(CLOCK_REALTIME, &now);
printf("write_done %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
}
#ifdef BLOCKSTORE_DEBUG #ifdef BLOCKSTORE_DEBUG
printf("Ack write %lu:%lu v%lu = %d\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state); printf("Ack write %lu:%lu v%lu = %d\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
#endif #endif
@@ -355,11 +355,6 @@ resume_4:
else if (dirty_it->second.state == ST_DEL_SUBMITTED) else if (dirty_it->second.state == ST_DEL_SUBMITTED)
{ {
dirty_it->second.state = imm ? ST_DEL_SYNCED : ST_DEL_WRITTEN; dirty_it->second.state = imm ? ST_DEL_SYNCED : ST_DEL_WRITTEN;
if (imm)
{
// Deletions are treated as immediately stable
mark_stable(dirty_it->first);
}
} }
if (immediate_commit == IMMEDIATE_ALL) if (immediate_commit == IMMEDIATE_ALL)
{ {
@@ -418,10 +413,6 @@ void blockstore_impl_t::release_journal_sectors(blockstore_op_t *op)
{ {
// We know for sure that we won't write into this sector anymore // We know for sure that we won't write into this sector anymore
uint64_t new_ds = journal.sector_info[s-1].offset + journal.block_size; uint64_t new_ds = journal.sector_info[s-1].offset + journal.block_size;
if (new_ds >= journal.len)
{
new_ds = journal.block_size;
}
if ((journal.dirty_start + (journal.dirty_start >= journal.used_start ? 0 : journal.len)) < if ((journal.dirty_start + (journal.dirty_start >= journal.used_start ? 0 : journal.len)) <
(new_ds + (new_ds >= journal.used_start ? 0 : journal.len))) (new_ds + (new_ds >= journal.used_start ? 0 : journal.len)))
{ {
@@ -442,7 +433,6 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
.oid = op->oid, .oid = op->oid,
.version = op->version, .version = op->version,
}); });
assert(dirty_it != dirty_db.end());
blockstore_journal_check_t space_check(this); blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, 1, sizeof(journal_entry_del), 0)) if (!space_check.check_available(op, 1, sizeof(journal_entry_del), 0))
{ {
@@ -477,11 +467,7 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset; dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++; journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
#ifdef BLOCKSTORE_DEBUG #ifdef BLOCKSTORE_DEBUG
printf( printf("journal offset %lu is used by %lu:%lu v%lu\n", dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
"journal offset %08lx is used by %lu:%lu v%lu (%lu refs)\n",
dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
);
#endif #endif
je->oid = op->oid; je->oid = op->oid;
je->version = op->version; je->version = op->version;

View File

@@ -1,349 +1,357 @@
#include <unistd.h>
#include <fcntl.h>
#include <sys/socket.h>
#include <sys/epoll.h>
#include <netinet/tcp.h>
#include "cluster_client.h" #include "cluster_client.h"
cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config) osd_op_t::~osd_op_t()
{ {
this->ringloop = ringloop; assert(!bs_op);
this->tfd = tfd; if (op_data)
msgr.tfd = tfd;
msgr.ringloop = ringloop;
msgr.repeer_pgs = [this](osd_num_t peer_osd)
{ {
// peer_osd just connected or dropped connection free(op_data);
if (msgr.osd_peer_fds.find(peer_osd) != msgr.osd_peer_fds.end()) }
{ if (rmw_buf)
// really connected :) {
continue_ops(); free(rmw_buf);
} }
}; if (buf)
{
st_cli.tfd = tfd; // Note: reusing osd_op_t WILL currently lead to memory leaks
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); }; // So we don't reuse it, but free it every time
st_cli.on_change_osd_state_hook = [this](uint64_t peer_osd) { on_change_osd_state_hook(peer_osd); }; free(buf);
st_cli.on_change_hook = [this](json11::Json::object & changes) { on_change_hook(changes); }; }
st_cli.on_load_pgs_hook = [this](bool success) { on_load_pgs_hook(success); };
log_level = config["log_level"].int64_value();
st_cli.parse_config(config);
st_cli.load_global_config();
} }
void cluster_client_t::continue_ops() void cluster_client_t::connect_peer(uint64_t peer_osd, json11::Json address_list, int port)
{ {
for (auto op_it = unsent_ops.begin(); op_it != unsent_ops.end(); ) if (wanted_peers.find(peer_osd) == wanted_peers.end())
{ {
cluster_op_t *op = *op_it; wanted_peers[peer_osd] = (osd_wanted_peer_t){
if (op->needs_reslice && !op->sent_count) .address_list = address_list,
.port = port,
};
}
else
{
wanted_peers[peer_osd].address_list = address_list;
wanted_peers[peer_osd].port = port;
}
wanted_peers[peer_osd].address_changed = true;
if (!wanted_peers[peer_osd].connecting &&
(time(NULL) - wanted_peers[peer_osd].last_connect_attempt) >= peer_connect_interval)
{
try_connect_peer(peer_osd);
}
}
void cluster_client_t::try_connect_peer(uint64_t peer_osd)
{
auto wp_it = wanted_peers.find(peer_osd);
if (wp_it == wanted_peers.end())
{
return;
}
if (osd_peer_fds.find(peer_osd) != osd_peer_fds.end())
{
wanted_peers.erase(peer_osd);
return;
}
auto & wp = wp_it->second;
if (wp.address_index >= wp.address_list.array_items().size())
{
return;
}
wp.cur_addr = wp.address_list[wp.address_index].string_value();
wp.cur_port = wp.port;
try_connect_peer_addr(peer_osd, wp.cur_addr.c_str(), wp.cur_port);
}
void cluster_client_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port)
{
struct sockaddr_in addr;
int r;
if ((r = inet_pton(AF_INET, peer_host, &addr.sin_addr)) != 1)
{
on_connect_peer(peer_osd, -EINVAL);
return;
}
addr.sin_family = AF_INET;
addr.sin_port = htons(peer_port ? peer_port : 11203);
int peer_fd = socket(AF_INET, SOCK_STREAM, 0);
if (peer_fd < 0)
{
on_connect_peer(peer_osd, -errno);
return;
}
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
int timeout_id = -1;
if (peer_connect_timeout > 0)
{
timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, peer_fd](int timer_id)
{ {
op->parts.clear(); osd_num_t peer_osd = clients[peer_fd].osd_num;
op->done_count = 0; stop_client(peer_fd);
op->needs_reslice = false; on_connect_peer(peer_osd, -EIO);
return;
});
}
r = connect(peer_fd, (sockaddr*)&addr, sizeof(addr));
if (r < 0 && errno != EINPROGRESS)
{
close(peer_fd);
on_connect_peer(peer_osd, -errno);
return;
}
assert(peer_osd != this->osd_num);
clients[peer_fd] = (osd_client_t){
.peer_addr = addr,
.peer_port = peer_port,
.peer_fd = peer_fd,
.peer_state = PEER_CONNECTING,
.connect_timeout_id = timeout_id,
.osd_num = peer_osd,
.in_buf = malloc(receive_buffer_size),
};
tfd->set_fd_handler(peer_fd, true, [this](int peer_fd, int epoll_events)
{
// Either OUT (connected) or HUP
handle_connect_epoll(peer_fd);
});
}
// Completion of a non-blocking connect(): inspect SO_ERROR and either
// promote the client to PEER_CONNECTED or report the failure upstream.
void cluster_client_t::handle_connect_epoll(int peer_fd)
{
    auto & cl = clients[peer_fd];
    if (cl.connect_timeout_id >= 0)
    {
        // The attempt finished before the timeout - cancel it
        tfd->clear_timer(cl.connect_timeout_id);
        cl.connect_timeout_id = -1;
    }
    osd_num_t peer_osd = cl.osd_num;
    int so_error = 0;
    socklen_t so_len = sizeof(so_error);
    if (getsockopt(peer_fd, SOL_SOCKET, SO_ERROR, &so_error, &so_len) < 0)
    {
        so_error = errno;
    }
    if (so_error != 0)
    {
        stop_client(peer_fd);
        on_connect_peer(peer_osd, -so_error);
        return;
    }
    int enable = 1;
    setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &enable, sizeof(enable));
    cl.peer_state = PEER_CONNECTED;
    // From now on treat epoll notifications as normal read/hangup events
    tfd->set_fd_handler(peer_fd, false, [this](int fd, int events)
    {
        handle_peer_epoll(fd, events);
    });
    // Check OSD number reported by the peer
    check_peer_config(cl);
}
// React to epoll events on an established peer connection:
// EPOLLRDHUP drops the client, EPOLLIN marks it read-ready.
// (Reconstructed: the closing braces were mangled by diff-column interleaving.)
void cluster_client_t::handle_peer_epoll(int peer_fd, int epoll_events)
{
    if (epoll_events & EPOLLRDHUP)
    {
        // Peer hung up - stop the client
        printf("[OSD %lu] client %d disconnected\n", this->osd_num, peer_fd);
        stop_client(peer_fd);
    }
    else if (epoll_events & EPOLLIN)
    {
        // Mark client as ready (i.e. some data is available)
        auto & cl = clients[peer_fd];
        cl.read_ready++;
        if (cl.read_ready == 1)
        {
            read_ready_clients.push_back(cl.peer_fd);
            ringloop->wakeup();
        }
    }
}
// Connection attempt finished. peer_fd >= 0 means success; a negative value
// is -errno. On failure, retry: first any changed address, then the next
// address in the list, finally re-schedule after peer_connect_interval.
// (Reconstructed from diff-column residue.)
void cluster_client_t::on_connect_peer(osd_num_t peer_osd, int peer_fd)
{
    auto & wp = wanted_peers.at(peer_osd);
    wp.connecting = false;
    if (peer_fd < 0)
    {
        printf("Failed to connect to peer OSD %lu address %s port %d: %s\n", peer_osd, wp.cur_addr.c_str(), wp.cur_port, strerror(-peer_fd));
        if (wp.address_changed)
        {
            // The peer's address list changed while connecting - restart from it
            wp.address_changed = false;
            wp.address_index = 0;
            try_connect_peer(peer_osd);
        }
        else if (wp.address_index < wp.address_list.array_items().size()-1)
        {
            // Try other addresses
            wp.address_index++;
            try_connect_peer(peer_osd);
        }
        else
        {
            // Retry again in <peer_connect_interval> seconds
            wp.last_connect_attempt = time(NULL);
            wp.address_index = 0;
            tfd->set_timer(1000*peer_connect_interval, false, [this, peer_osd](int)
            {
                try_connect_peer(peer_osd);
            });
        }
        return;
    }
    printf("Connected with peer OSD %lu (fd %d)\n", peer_osd, peer_fd);
    wanted_peers.erase(peer_osd);
    repeer_pgs(peer_osd);
}
// Return log2(value) if value is a power of two, otherwise 64 (sentinel
// meaning "not a power of two"). Note: value <= 1 returns 0.
// (Reconstructed: interleaved diff residue removed from the body.)
static uint32_t is_power_of_two(uint64_t value)
{
    uint32_t l = 0;
    while (value > 1)
    {
        if (value & 1)
        {
            // A set low bit with more bits above it => not a power of two
            return 64;
        }
        value = value >> 1;
        l++;
    }
    return l;
}
// Apply the global cluster configuration loaded from etcd.
// Missing or zero values fall back to compile-time defaults; an invalid
// block size is fatal.
void cluster_client_t::on_load_config_hook(json11::Json::object & config)
{
    bs_block_size = config["block_size"].uint64_value();
    bs_disk_alignment = config["disk_alignment"].uint64_value();
    bs_bitmap_granularity = config["bitmap_granularity"].uint64_value();
    if (bs_block_size == 0)
    {
        bs_block_size = DEFAULT_BLOCK_SIZE;
    }
    if (bs_disk_alignment == 0)
    {
        bs_disk_alignment = DEFAULT_DISK_ALIGNMENT;
    }
    if (bs_bitmap_granularity == 0)
    {
        bs_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
    }
    // Block size must be a power of two within [MIN_BLOCK_SIZE, MAX_BLOCK_SIZE)
    uint32_t block_order = is_power_of_two(bs_block_size);
    if (block_order >= 64 || bs_block_size < MIN_BLOCK_SIZE || bs_block_size >= MAX_BLOCK_SIZE)
    {
        throw std::runtime_error("Bad block size");
    }
    if (config.find("pg_stripe_size") != config.end())
    {
        pg_stripe_size = config["pg_stripe_size"].uint64_value();
        if (pg_stripe_size == 0)
        {
            pg_stripe_size = DEFAULT_PG_STRIPE_SIZE;
        }
    }
    if (config["immediate_commit"] == "all")
    {
        // Cluster-wide immediate_commit mode: explicit syncs are unnecessary
        immediate_commit = true;
    }
    msgr.peer_connect_interval = config["peer_connect_interval"].uint64_value();
    if (msgr.peer_connect_interval == 0)
    {
        msgr.peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
    }
    msgr.peer_connect_timeout = config["peer_connect_timeout"].uint64_value();
    if (msgr.peer_connect_timeout == 0)
    {
        msgr.peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
    }
}
// PG configuration was (re)loaded from etcd; on success, record the PG
// count and resume any suspended operations.
void cluster_client_t::on_load_pgs_hook(bool success)
{
    if (!success)
    {
        return;
    }
    pg_count = st_cli.pg_config.size();
    continue_ops();
}
// Handle a watched etcd change notification. A PG count change invalidates
// the slicing of every pending operation.
void cluster_client_t::on_change_hook(json11::Json::object & changes)
{
    auto new_pg_count = st_cli.pg_config.size();
    if (pg_count != new_pg_count)
    {
        // At this point, all operations should be suspended
        // And they need to be resliced!
        for (auto op: unsent_ops)
            op->needs_reslice = true;
        for (auto op: sent_ops)
            op->needs_reslice = true;
        pg_count = new_pg_count;
    }
    continue_ops();
}
// An OSD's state changed in etcd: (re)connect to it, but only if it is
// still on the wanted-peers list.
void cluster_client_t::on_change_osd_state_hook(uint64_t peer_osd)
{
    auto it = msgr.wanted_peers.find(peer_osd);
    if (it == msgr.wanted_peers.end())
    {
        return;
    }
    msgr.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
}
// FIXME: Implement OSD_OP_SYNC for immediate_commit == false
void cluster_client_t::execute(cluster_op_t *op)
{
if (op->opcode == OSD_OP_SYNC && immediate_commit)
{
// Syncs are not required in the immediate_commit mode
op->retval = 0;
std::function<void(cluster_op_t*)>(op->callback)(op);
return; return;
} }
if (op->opcode != OSD_OP_READ && op->opcode != OSD_OP_OUT || !op->inode || !op->len || printf("Connected with peer OSD %lu (fd %d)\n", peer_osd, peer_fd);
op->offset % bs_disk_alignment || op->len % bs_disk_alignment) wanted_peers.erase(peer_osd);
repeer_pgs(peer_osd);
}
// Verify a freshly connected peer by sending OSD_OP_SHOW_CONFIG and checking
// that the reported osd_num matches the one we intended to reach. On any
// failure the peer is dropped; on success it is registered in osd_peer_fds.
// (Reconstructed from diff-column residue.)
void cluster_client_t::check_peer_config(osd_client_t & cl)
{
    osd_op_t *op = new osd_op_t();
    op->op_type = OSD_OP_OUT;
    op->send_list.push_back(op->req.buf, OSD_PACKET_SIZE);
    op->peer_fd = cl.peer_fd;
    op->req = {
        .show_conf = {
            .header = {
                .magic = SECONDARY_OSD_OP_MAGIC,
                .id = this->next_subop_id++,
                .opcode = OSD_OP_SHOW_CONFIG,
            },
        },
    };
    op->callback = [this](osd_op_t *op)
    {
        osd_client_t & cl = clients[op->peer_fd];
        std::string json_err;
        json11::Json config;
        bool err = false;
        if (op->reply.hdr.retval < 0)
        {
            err = true;
            printf("Failed to get config from OSD %lu (retval=%ld), disconnecting peer\n", cl.osd_num, op->reply.hdr.retval);
        }
        else
        {
            config = json11::Json::parse(std::string((char*)op->buf), json_err);
            if (json_err != "")
            {
                err = true;
                printf("Failed to get config from OSD %lu: bad JSON: %s, disconnecting peer\n", cl.osd_num, json_err.c_str());
            }
            else if (config["osd_num"].uint64_value() != cl.osd_num)
            {
                // Peer state in etcd was stale - we reached a different OSD
                err = true;
                printf("Connected to OSD %lu instead of OSD %lu, peer state is outdated, disconnecting peer\n", config["osd_num"].uint64_value(), cl.osd_num);
                on_connect_peer(cl.osd_num, -1);
            }
        }
        if (err)
        {
            stop_client(op->peer_fd);
            delete op;
            return;
        }
        osd_peer_fds[cl.osd_num] = cl.peer_fd;
        on_connect_peer(cl.osd_num, cl.peer_fd);
        delete op;
    };
    outbox_push(op);
}
// Fail every outbound operation associated with a client: replies we are
// still waiting for, queued-but-unsent ops, and a partially-written op.
// (Reconstructed from diff-column residue.)
void cluster_client_t::cancel_osd_ops(osd_client_t & cl)
{
    for (auto p: cl.sent_ops)
    {
        cancel_out_op(p.second);
    }
    cl.sent_ops.clear();
    for (auto op: cl.outbox)
    {
        cancel_out_op(op);
    }
    cl.outbox.clear();
    if (cl.write_op)
    {
        cancel_out_op(cl.write_op);
        cl.write_op = NULL;
    }
}
bool cluster_client_t::try_send(cluster_op_t *op, cluster_op_part_t *part) void cluster_client_t::cancel_out_op(osd_op_t *op)
{ {
auto pg_it = st_cli.pg_config.find(part->pg_num); op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
if (pg_it != st_cli.pg_config.end() && op->reply.hdr.id = op->req.hdr.id;
!pg_it->second.pause && pg_it->second.cur_primary) op->reply.hdr.opcode = op->req.hdr.opcode;
{ op->reply.hdr.retval = -EPIPE;
osd_num_t primary_osd = pg_it->second.cur_primary; // Copy lambda to be unaffected by `delete op`
auto peer_it = msgr.osd_peer_fds.find(primary_osd); std::function<void(osd_op_t*)>(op->callback)(op);
if (peer_it != msgr.osd_peer_fds.end())
{
int peer_fd = peer_it->second;
part->osd_num = primary_osd;
part->sent = true;
op->sent_count++;
part->op = {
.op_type = OSD_OP_OUT,
.peer_fd = peer_fd,
.req = { .rw = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = op_id++,
.opcode = op->opcode,
},
.inode = op->inode,
.offset = part->offset,
.len = part->len,
} },
.callback = [this, part](osd_op_t *op_part)
{
handle_op_part(part);
},
};
part->op.send_list.push_back(part->op.req.buf, OSD_PACKET_SIZE);
if (op->opcode == OSD_OP_WRITE)
{
part->op.send_list.push_back(part->buf, part->len);
}
else
{
part->op.buf = part->buf;
}
msgr.outbox_push(&part->op);
return true;
}
else if (msgr.wanted_peers.find(primary_osd) == msgr.wanted_peers.end())
{
msgr.connect_peer(primary_osd, st_cli.peer_states[primary_osd]);
}
}
return false;
} }
void cluster_client_t::handle_op_part(cluster_op_part_t *part) void cluster_client_t::stop_client(int peer_fd)
{ {
cluster_op_t *op = part->parent; assert(peer_fd != 0);
part->sent = false; auto it = clients.find(peer_fd);
op->sent_count--; if (it == clients.end())
part->op.buf = NULL;
if (part->op.reply.hdr.retval != part->op.req.rw.len)
{ {
// Operation failed, retry return;
printf( }
"Operation part failed on OSD %lu: retval=%ld (expected %u), reconnecting\n", uint64_t repeer_osd = 0;
part->osd_num, part->op.reply.hdr.retval, part->op.req.rw.len osd_client_t cl = it->second;
); if (cl.peer_state == PEER_CONNECTED)
msgr.stop_client(part->op.peer_fd); {
if (op->sent_count == op->parts.size() - op->done_count - 1) if (cl.osd_num)
{ {
// Resend later when OSDs come up // Reload configuration from etcd when the connection is dropped
// FIXME: Check for different types of errors printf("[OSD %lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl.osd_num);
// FIXME: Repeat operations after a small timeout, for the case when OSD is coming up repeer_osd = cl.osd_num;
sent_ops.erase(op);
unsent_ops.insert(op);
} }
if (op->sent_count == 0 && op->needs_reslice) else
{ {
// PG count has changed, reslice the operation printf("[OSD %lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
unsent_ops.erase(op);
op->parts.clear();
op->done_count = 0;
op->needs_reslice = false;
execute(op);
} }
} }
else clients.erase(it);
tfd->set_fd_handler(peer_fd, false, NULL);
if (cl.osd_num)
{ {
// OK osd_peer_fds.erase(cl.osd_num);
part->done = true; // Cancel outbound operations
op->done_count++; cancel_osd_ops(cl);
if (op->done_count >= op->parts.size()) }
if (cl.read_op)
{
delete cl.read_op;
cl.read_op = NULL;
}
for (auto rit = read_ready_clients.begin(); rit != read_ready_clients.end(); rit++)
{
if (*rit == peer_fd)
{ {
// Finished! read_ready_clients.erase(rit);
sent_ops.erase(op); break;
op->retval = op->len;
std::function<void(cluster_op_t*)>(op->callback)(op);
} }
} }
for (auto wit = write_ready_clients.begin(); wit != write_ready_clients.end(); wit++)
{
if (*wit == peer_fd)
{
write_ready_clients.erase(wit);
break;
}
}
free(cl.in_buf);
assert(peer_fd != 0);
close(peer_fd);
if (repeer_osd)
{
repeer_pgs(repeer_osd);
}
} }

View File

@@ -1,80 +1,209 @@
#pragma once #pragma once
#include "messenger.h" #include <sys/types.h>
#include "etcd_state_client.h" #include <stdint.h>
#include <arpa/inet.h>
#include <malloc.h>
#define MIN_BLOCK_SIZE 4*1024 #include <set>
#define MAX_BLOCK_SIZE 128*1024*1024 #include <map>
#define DEFAULT_BLOCK_SIZE 128*1024 #include <deque>
#define DEFAULT_PG_STRIPE_SIZE 4*1024*1024 #include <vector>
#define DEFAULT_DISK_ALIGNMENT 4096
#define DEFAULT_BITMAP_GRANULARITY 4096
struct cluster_op_t; #include "json11/json11.hpp"
#include "osd_ops.h"
#include "timerfd_manager.h"
#include "ringloop.h"
struct cluster_op_part_t #define OSD_OP_IN 0
#define OSD_OP_OUT 1
#define CL_READ_HDR 1
#define CL_READ_DATA 2
#define CL_READ_REPLY_DATA 3
#define CL_WRITE_READY 1
#define CL_WRITE_REPLY 2
#define MAX_EPOLL_EVENTS 64
#define OSD_OP_INLINE_BUF_COUNT 16
#define PEER_CONNECTING 1
#define PEER_CONNECTED 2
struct osd_op_buf_list_t
{ {
cluster_op_t *parent; int count = 0, alloc = 0, sent = 0;
uint64_t offset; iovec *buf = NULL;
uint32_t len; iovec inline_buf[OSD_OP_INLINE_BUF_COUNT];
pg_num_t pg_num;
osd_num_t osd_num; ~osd_op_buf_list_t()
void *buf; {
bool sent; if (buf && buf != inline_buf)
bool done; {
osd_op_t op; free(buf);
}
}
inline iovec* get_iovec()
{
return (buf ? buf : inline_buf) + sent;
}
inline int get_size()
{
return count - sent;
}
inline void push_back(void *nbuf, size_t len)
{
if (count >= alloc)
{
if (!alloc)
{
alloc = OSD_OP_INLINE_BUF_COUNT;
buf = inline_buf;
}
else if (buf == inline_buf)
{
int old = alloc;
alloc = ((alloc/16)*16 + 1);
buf = (iovec*)malloc(sizeof(iovec) * alloc);
memcpy(buf, inline_buf, sizeof(iovec)*old);
}
else
{
alloc = ((alloc/16)*16 + 1);
buf = (iovec*)realloc(buf, sizeof(iovec) * alloc);
}
}
buf[count++] = { .iov_base = nbuf, .iov_len = len };
}
}; };
struct cluster_op_t struct blockstore_op_t;
struct osd_primary_op_data_t;
struct osd_op_t
{ {
uint64_t opcode; // OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC timespec tv_begin;
uint64_t inode; uint64_t op_type = OSD_OP_IN;
uint64_t offset; int peer_fd;
uint64_t len; osd_any_op_t req;
int retval; osd_any_reply_t reply;
void *buf; blockstore_op_t *bs_op = NULL;
std::function<void(cluster_op_t*)> callback; void *buf = NULL;
protected: void *rmw_buf = NULL;
bool needs_reslice = false; osd_primary_op_data_t* op_data = NULL;
int sent_count = 0, done_count = 0; std::function<void(osd_op_t*)> callback;
std::vector<cluster_op_part_t> parts;
friend class cluster_client_t; osd_op_buf_list_t send_list;
~osd_op_t();
}; };
class cluster_client_t struct osd_client_t
{
sockaddr_in peer_addr;
int peer_port;
int peer_fd;
int peer_state;
int connect_timeout_id = -1;
osd_num_t osd_num = 0;
void *in_buf = NULL;
// Read state
int read_ready = 0;
osd_op_t *read_op = NULL;
int read_reply_id = 0;
iovec read_iov;
msghdr read_msg;
void *read_buf = NULL;
int read_remaining = 0;
int read_state = 0;
// Outbound operations sent to this peer
std::map<int, osd_op_t*> sent_ops;
// Outbound messages (replies or requests)
std::deque<osd_op_t*> outbox;
// PGs dirtied by this client's primary-writes (FIXME to drop the connection)
std::set<pg_num_t> dirty_pgs;
// Write state
osd_op_t *write_op = NULL;
msghdr write_msg;
int write_state = 0;
};
struct osd_wanted_peer_t
{
json11::Json address_list;
int port;
time_t last_connect_attempt;
bool connecting, address_changed;
int address_index;
std::string cur_addr;
int cur_port;
};
struct osd_op_stats_t
{
uint64_t op_stat_sum[OSD_OP_MAX+1] = { 0 };
uint64_t op_stat_count[OSD_OP_MAX+1] = { 0 };
uint64_t op_stat_bytes[OSD_OP_MAX+1] = { 0 };
uint64_t subop_stat_sum[OSD_OP_MAX+1] = { 0 };
uint64_t subop_stat_count[OSD_OP_MAX+1] = { 0 };
};
struct cluster_client_t
{ {
timerfd_manager_t *tfd; timerfd_manager_t *tfd;
ring_loop_t *ringloop; ring_loop_t *ringloop;
uint64_t pg_part_count = 2; // osd_num_t is only for logging and asserts
uint64_t pg_stripe_size = 0; osd_num_t osd_num;
uint64_t bs_block_size = 0; int receive_buffer_size = 9000;
uint64_t bs_disk_alignment = 0; int peer_connect_interval = 5;
uint64_t bs_bitmap_granularity = 0; int peer_connect_timeout = 5;
uint64_t pg_count = 0; int log_level = 0;
bool immediate_commit = false;
bool inmemory_commit = false;
uint64_t inmemory_dirty_limit = 32*1024*1024;
int log_level;
uint64_t op_id = 1; std::map<osd_num_t, osd_wanted_peer_t> wanted_peers;
etcd_state_client_t st_cli; std::map<uint64_t, int> osd_peer_fds;
osd_messenger_t msgr; uint64_t next_subop_id = 1;
std::set<cluster_op_t*> sent_ops, unsent_ops;
// unsynced operations are copied in memory to allow replay when cluster isn't in the immediate_commit mode
std::vector<cluster_op_t*> unsynced_ops;
uint64_t unsynced_bytes = 0;
public: std::map<int, osd_client_t> clients;
cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config); std::vector<int> read_ready_clients;
void execute(cluster_op_t *op); std::vector<int> write_ready_clients;
protected: // op statistics
void continue_ops(); osd_op_stats_t stats;
void on_load_config_hook(json11::Json::object & cfg);
void on_load_pgs_hook(bool success); // public
void on_change_hook(json11::Json::object & changes); void connect_peer(uint64_t osd_num, json11::Json address_list, int port);
void on_change_osd_state_hook(uint64_t peer_osd); void stop_client(int peer_fd);
bool try_send(cluster_op_t *op, cluster_op_part_t *part); void outbox_push(osd_op_t *cur_op);
void handle_op_part(cluster_op_part_t *part); std::function<void(osd_op_t*)> exec_op;
std::function<void(osd_num_t)> repeer_pgs;
// private
void try_connect_peer(uint64_t osd_num);
void try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port);
void handle_connect_epoll(int peer_fd);
void handle_peer_epoll(int peer_fd, int epoll_events);
void on_connect_peer(osd_num_t peer_osd, int peer_fd);
void check_peer_config(osd_client_t & cl);
void cancel_osd_ops(osd_client_t & cl);
void cancel_out_op(osd_op_t *op);
bool try_send(osd_client_t & cl);
void send_replies();
void handle_send(ring_data_t *data, int peer_fd);
void read_requests();
void handle_read(ring_data_t *data, int peer_fd);
void handle_finished_read(osd_client_t & cl);
void handle_op_hdr(osd_client_t *cl);
void handle_reply_hdr(osd_client_t *cl);
}; };

View File

@@ -1,87 +0,0 @@
#include <sys/epoll.h>
#include <sys/poll.h>
#include <unistd.h>
#include "epoll_manager.h"
#define MAX_EPOLL_EVENTS 64
// Construct an epoll manager bound to the given io_uring event loop.
// Creates the epoll fd, a timerfd manager whose timer fds are registered
// back into this epoll instance, and arms the first poll on the epoll fd.
epoll_manager_t::epoll_manager_t(ring_loop_t *ringloop)
{
    this->ringloop = ringloop;
    epoll_fd = epoll_create(1);
    if (epoll_fd < 0)
    {
        throw std::runtime_error(std::string("epoll_create: ") + strerror(errno));
    }
    // timerfd_manager_t reports its fds here so they are watched by this epoll
    tfd = new timerfd_manager_t([this](int fd, std::function<void(int, int)> handler) { set_fd_handler(fd, handler); });
    // Start polling the epoll fd through the io_uring loop
    handle_epoll_events();
}
// Destroy the manager: free the timer manager first (it may still hold
// registered fds), then close the epoll fd itself.
epoll_manager_t::~epoll_manager_t()
{
    // `delete` on NULL is a no-op, so no guard is needed
    delete tfd;
    tfd = NULL;
    close(epoll_fd);
}
// Register, replace or (with handler == NULL) unregister the epoll handler
// for a file descriptor. Handlers receive (fd, epoll_events).
void epoll_manager_t::set_fd_handler(int fd, std::function<void(int, int)> handler)
{
    if (handler == NULL)
    {
        // Unregister; a missing fd (ENOENT) is not an error
        if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, NULL) < 0 && errno != ENOENT)
        {
            throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
        }
        epoll_handlers.erase(fd);
        return;
    }
    int op = epoll_handlers.find(fd) != epoll_handlers.end() ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
    epoll_event ev;
    ev.data.fd = fd;
    ev.events = EPOLLOUT | EPOLLIN | EPOLLRDHUP | EPOLLET;
    if (epoll_ctl(epoll_fd, op, fd, &ev) < 0)
    {
        throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
    }
    epoll_handlers[fd] = handler;
}
// Wait for epoll readiness through io_uring and dispatch pending events.
// Re-arms itself from its own completion callback, so exactly one poll SQE
// for the epoll fd is in flight at any time.
void epoll_manager_t::handle_epoll_events()
{
    io_uring_sqe *sqe = ringloop->get_sqe();
    if (!sqe)
    {
        // With edge-triggered epoll a missed wakeup cannot be recovered,
        // so failing to queue the poll is fatal
        throw std::runtime_error("can't get SQE, will fall out of sync with EPOLLET");
    }
    ring_data_t *data = ((ring_data_t*)sqe->user_data);
    my_uring_prep_poll_add(sqe, epoll_fd, POLLIN);
    data->callback = [this](ring_data_t *data)
    {
        if (data->res < 0)
        {
            throw std::runtime_error(std::string("epoll failed: ") + strerror(-data->res));
        }
        // Re-arm and drain again on the next readiness notification
        handle_epoll_events();
    };
    ringloop->submit();
    // Drain currently pending events without blocking (timeout 0); a full
    // batch may mean more events are queued, so repeat until a short batch
    int nfds;
    epoll_event events[MAX_EPOLL_EVENTS];
    do
    {
        nfds = epoll_wait(epoll_fd, events, MAX_EPOLL_EVENTS, 0);
        for (int i = 0; i < nfds; i++)
        {
            auto & cb = epoll_handlers[events[i].data.fd];
            cb(events[i].data.fd, events[i].events);
        }
    } while (nfds == MAX_EPOLL_EVENTS);
}

View File

@@ -1,20 +0,0 @@
#pragma once
#include <map>
#include "ringloop.h"
#include "timerfd_manager.h"
// Bridges epoll into an io_uring event loop: fds registered here are watched
// by a single epoll fd, which is itself polled through the ring.
class epoll_manager_t
{
    int epoll_fd;
    ring_loop_t *ringloop;
    // Per-fd callbacks, invoked with (fd, epoll_events)
    std::map<int, std::function<void(int, int)>> epoll_handlers;
public:
    epoll_manager_t(ring_loop_t *ringloop);
    ~epoll_manager_t();
    // handler == NULL unregisters the fd
    void set_fd_handler(int fd, std::function<void(int, int)> handler);
    void handle_epoll_events();
    // Timer manager whose timer fds are serviced by this epoll instance
    timerfd_manager_t *tfd;
};

View File

@@ -43,53 +43,6 @@ void etcd_state_client_t::etcd_call(std::string api, json11::Json payload, int t
http_request_json(tfd, etcd_address, req, timeout, callback); http_request_json(tfd, etcd_address, req, timeout, callback);
} }
// Parse etcd connection settings from the configuration: addresses (either
// a comma-separated string or a JSON array), the key prefix and log level.
// Addresses without an explicit path get "/v3" appended.
void etcd_state_client_t::parse_config(json11::Json & config)
{
    this->etcd_addresses.clear();
    if (config["etcd_address"].is_string())
    {
        std::string ea = config["etcd_address"].string_value();
        while (1)
        {
            // FIX: std::string::find() returns an unsigned size_type, so the
            // previous `int pos`/`find(...) < 0` comparisons were wrong:
            // in particular `addr.find('/') < 0` was always false and the
            // "/v3" suffix was never appended
            size_t pos = ea.find(',');
            std::string addr = pos != std::string::npos ? ea.substr(0, pos) : ea;
            if (addr.length() > 0)
            {
                if (addr.find('/') == std::string::npos)
                    addr += "/v3";
                this->etcd_addresses.push_back(addr);
            }
            if (pos != std::string::npos)
                ea = ea.substr(pos+1);
            else
                break;
        }
    }
    else if (config["etcd_address"].array_items().size())
    {
        for (auto & ea: config["etcd_address"].array_items())
        {
            std::string addr = ea.string_value();
            if (addr != "")
            {
                if (addr.find('/') == std::string::npos)
                    addr += "/v3";
                this->etcd_addresses.push_back(addr);
            }
        }
    }
    this->etcd_prefix = config["etcd_prefix"].string_value();
    if (this->etcd_prefix == "")
    {
        this->etcd_prefix = "/microceph";
    }
    else if (this->etcd_prefix[0] != '/')
    {
        // Normalize the prefix to always start with '/'
        this->etcd_prefix = "/"+this->etcd_prefix;
    }
    this->log_level = config["log_level"].int64_value();
}
void etcd_state_client_t::start_etcd_watcher() void etcd_state_client_t::start_etcd_watcher()
{ {
std::string etcd_address = etcd_addresses[rand() % etcd_addresses.size()]; std::string etcd_address = etcd_addresses[rand() % etcd_addresses.size()];
@@ -140,10 +93,7 @@ void etcd_state_client_t::start_etcd_watcher()
parse_state(kv.first, kv.second); parse_state(kv.first, kv.second);
} }
// React to changes // React to changes
if (on_change_hook != NULL) on_change_hook(changes);
{
on_change_hook(changes);
}
} }
} }
if (msg->eof) if (msg->eof)
@@ -258,7 +208,7 @@ void etcd_state_client_t::load_pgs()
}, },
}; };
json11::Json::object req = { { "success", txn } }; json11::Json::object req = { { "success", txn } };
json11::Json checks = load_pgs_checks_hook != NULL ? load_pgs_checks_hook() : json11::Json(); json11::Json checks = load_pgs_checks_hook();
if (checks.array_items().size() > 0) if (checks.array_items().size() > 0)
{ {
req["compare"] = checks; req["compare"] = checks;

View File

@@ -1,6 +1,5 @@
#pragma once #pragma once
#include "osd_id.h"
#include "http_client.h" #include "http_client.h"
#include "timerfd_manager.h" #include "timerfd_manager.h"
@@ -57,5 +56,4 @@ struct etcd_state_client_t
void load_global_config(); void load_global_config();
void load_pgs(); void load_pgs();
void parse_state(const std::string & key, const json11::Json & value); void parse_state(const std::string & key, const json11::Json & value);
void parse_config(json11::Json & config);
}; };

View File

@@ -1,298 +0,0 @@
// FIO engine to test cluster I/O
//
// Random write:
//
// fio -thread -ioengine=./libfio_cluster.so -name=test -bs=4k -direct=1 -fsync=16 -iodepth=16 -rw=randwrite \
// -etcd=127.0.0.1:2379 [-etcd_prefix=/microceph] -size=1000M
//
// Linear write:
//
// fio -thread -ioengine=./libfio_cluster.so -name=test -bs=128k -direct=1 -fsync=32 -iodepth=32 -rw=write \
// -etcd=127.0.0.1:2379 [-etcd_prefix=/microceph] -size=1000M
//
// Random read (run with -iodepth=32 or -iodepth=1):
//
// fio -thread -ioengine=./libfio_cluster.so -name=test -bs=4k -direct=1 -iodepth=32 -rw=randread \
// -etcd=127.0.0.1:2379 [-etcd_prefix=/microceph] -size=1000M
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <vector>
#include <unordered_map>
#include "epoll_manager.h"
#include "cluster_client.h"
extern "C" {
#define CONFIG_HAVE_GETTID
#define CONFIG_PWRITEV2
#include "fio/fio.h"
#include "fio/optgroup.h"
}
// Per-thread engine state for the cluster fio ioengine
struct sec_data
{
    ring_loop_t *ringloop = NULL;
    epoll_manager_t *epmgr = NULL;
    cluster_client_t *cli = NULL;
    // true if the last queued op was a sync (repeated fsyncs are collapsed)
    bool last_sync = false;
    /* The list of completed io_u structs. */
    std::vector<io_u*> completed;
    // op_n: sequence number used for tracing; inflight: queued, not yet completed
    uint64_t op_n = 0, inflight = 0;
    bool trace = false;
};
// Engine options filled in by fio from the options[] table below
struct sec_options
{
    int __pad;                  // NOTE(review): presumably keeps option offsets non-zero for fio - confirm
    char *etcd_host = NULL;     // etcd address in the form HOST:PORT[/PATH]
    char *etcd_prefix = NULL;   // etcd key prefix, defaults to /microceph
    int inode = 0;              // inode to run tests on
    int trace = 0;              // enable per-operation tracing
};
// fio option table. FIX: the second entry was duplicated as "etcd" although
// it sets etcd_prefix (lname/help/off1 all refer to the prefix), which made
// the prefix option unreachable from the command line.
static struct fio_option options[] = {
    {
        .name = "etcd",
        .lname = "etcd address",
        .type = FIO_OPT_STR_STORE,
        .off1 = offsetof(struct sec_options, etcd_host),
        .help = "etcd address in the form HOST:PORT[/PATH]",
        .category = FIO_OPT_C_ENGINE,
        .group = FIO_OPT_G_FILENAME,
    },
    {
        .name = "etcd_prefix",
        .lname = "etcd key prefix",
        .type = FIO_OPT_STR_STORE,
        .off1 = offsetof(struct sec_options, etcd_prefix),
        .help = "etcd key prefix, by default /microceph",
        .category = FIO_OPT_C_ENGINE,
        .group = FIO_OPT_G_FILENAME,
    },
    {
        .name = "inode",
        .lname = "inode to run tests on",
        .type = FIO_OPT_INT,
        .off1 = offsetof(struct sec_options, inode),
        .help = "inode to run tests on (1 by default)",
        .category = FIO_OPT_C_ENGINE,
        .group = FIO_OPT_G_FILENAME,
    },
    {
        .name = "osd_trace",
        .lname = "OSD trace",
        .type = FIO_OPT_BOOL,
        .off1 = offsetof(struct sec_options, trace),
        .help = "Trace OSD operations",
        .def = "0",
        .category = FIO_OPT_C_ENGINE,
        .group = FIO_OPT_G_FILENAME,
    },
    {
        .name = NULL,
    },
};
// Called once per fio thread before init: allocate the per-thread state and
// make sure fio has at least one (synthetic) file to operate on.
static int sec_setup(struct thread_data *td)
{
    sec_data *bsd;
    bsd = new sec_data;
    if (!bsd)
    {
        // NOTE(review): `new` throws std::bad_alloc instead of returning NULL,
        // so this branch looks unreachable - likely left over from calloc()
        td_verror(td, errno, "calloc");
        return 1;
    }
    td->io_ops_data = bsd;
    if (!td->files_index)
    {
        // Register a placeholder file so fio's open/size accounting works
        add_file(td, "osd_cluster", 0, 0);
        td->o.nr_files = td->o.nr_files ? : 1;
        td->o.open_files++;
    }
    return 0;
}
// Tear down the per-thread state created by sec_init()/sec_setup().
static void sec_cleanup(struct thread_data *td)
{
    sec_data *bsd = (sec_data*)td->io_ops_data;
    if (!bsd)
        return;
    // Destroy in reverse order of construction
    delete bsd->cli;
    delete bsd->epmgr;
    delete bsd->ringloop;
    bsd->cli = NULL;
    bsd->epmgr = NULL;
    bsd->ringloop = NULL;
}
/* Connect to the server from each thread. */
static int sec_init(struct thread_data *td)
{
sec_options *o = (sec_options*)td->eo;
sec_data *bsd = (sec_data*)td->io_ops_data;
json11::Json cfg = json11::Json::object {
{ "etcd_address", std::string(o->etcd_host) },
{ "etcd_prefix", std::string(o->etcd_prefix ? o->etcd_prefix : "/microceph") },
};
bsd->ringloop = new ring_loop_t(512);
bsd->epmgr = new epoll_manager_t(bsd->ringloop);
bsd->cli = new cluster_client_t(bsd->ringloop, bsd->epmgr->tfd, cfg);
bsd->trace = o->trace ? true : false;
return 0;
}
/* Begin read or write request. */
static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
{
sec_options *opt = (sec_options*)td->eo;
sec_data *bsd = (sec_data*)td->io_ops_data;
int n = bsd->op_n;
fio_ro_check(td, io);
if (io->ddir == DDIR_SYNC && bsd->last_sync)
{
return FIO_Q_COMPLETED;
}
io->engine_data = bsd;
cluster_op_t *op = new cluster_op_t;
switch (io->ddir)
{
case DDIR_READ:
op->opcode = OSD_OP_READ;
op->inode = opt->inode;
op->offset = io->offset;
op->len = io->xfer_buflen;
op->buf = io->xfer_buf;
bsd->last_sync = false;
break;
case DDIR_WRITE:
op->opcode = OSD_OP_WRITE;
op->inode = opt->inode;
op->offset = io->offset;
op->len = io->xfer_buflen;
op->buf = io->xfer_buf;
bsd->last_sync = false;
break;
case DDIR_SYNC:
op->opcode = OSD_OP_SYNC;
bsd->last_sync = true;
break;
default:
io->error = EINVAL;
return FIO_Q_COMPLETED;
}
op->callback = [io, n](cluster_op_t *op)
{
io->error = op->retval < 0 ? -op->retval : 0;
sec_data *bsd = (sec_data*)io->engine_data;
bsd->inflight--;
bsd->completed.push_back(io);
if (bsd->trace)
{
printf("--- %s n=%d retval=%d\n", io->ddir == DDIR_READ ? "READ" :
(io->ddir == DDIR_WRITE ? "WRITE" : "SYNC"), n, op->retval);
}
delete op;
};
if (opt->trace)
{
printf("+++ %s # %d\n", io->ddir == DDIR_READ ? "READ" :
(io->ddir == DDIR_WRITE ? "WRITE" : "SYNC"), n);
}
io->error = 0;
bsd->inflight++;
bsd->op_n++;
bsd->cli->execute(op);
if (io->error != 0)
return FIO_Q_COMPLETED;
return FIO_Q_QUEUED;
}
// Run the event loop until at least <min> operations have completed,
// then report how many completions are available.
static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int max, const struct timespec *t)
{
    sec_data *bsd = (sec_data*)td->io_ops_data;
    bsd->ringloop->loop();
    while (bsd->completed.size() < min)
    {
        bsd->ringloop->wait();
        bsd->ringloop->loop();
    }
    return bsd->completed.size();
}
// Hand one completed io_u back to fio, or NULL if none remain.
static struct io_u *sec_event(struct thread_data *td, int event)
{
    sec_data *bsd = (sec_data*)td->io_ops_data;
    if (bsd->completed.empty())
        return NULL;
    /* FIXME We ignore the event number and assume fio calls us exactly once for [0..nr_events-1] */
    io_u *done = bsd->completed.back();
    bsd->completed.pop_back();
    return done;
}
// Per-io_u init: no engine-private data is attached to individual io_u's;
// engine_data is later used only as a back-pointer to sec_data
static int sec_io_u_init(struct thread_data *td, struct io_u *io)
{
    io->engine_data = NULL;
    return 0;
}

// Nothing to free - see sec_io_u_init()
static void sec_io_u_free(struct thread_data *td, struct io_u *io)
{
}

// Files are virtual here, so opening is a no-op
static int sec_open_file(struct thread_data *td, struct fio_file *f)
{
    return 0;
}

// No page cache to invalidate for a network-backed engine
static int sec_invalidate(struct thread_data *td, struct fio_file *f)
{
    return 0;
}
// fio ioengine descriptor: diskless, memory-aligned, no file extension
struct ioengine_ops ioengine = {
    .name = "microceph_cluster",
    .version = FIO_IOOPS_VERSION,
    .flags = FIO_MEMALIGN | FIO_DISKLESSIO | FIO_NOEXTEND,
    .setup = sec_setup,
    .init = sec_init,
    .queue = sec_queue,
    .getevents = sec_getevents,
    .event = sec_event,
    .cleanup = sec_cleanup,
    .open_file = sec_open_file,
    .invalidate = sec_invalidate,
    .io_u_init = sec_io_u_init,
    .io_u_free = sec_io_u_free,
    .option_struct_size = sizeof(struct sec_options),
    .options = options,
};
// Register/unregister the engine with fio at shared-object load/unload time
static void fio_init fio_sec_register(void)
{
    register_ioengine(&ioengine);
}

static void fio_exit fio_sec_unregister(void)
{
    unregister_ioengine(&ioengine);
}

View File

@@ -5,7 +5,7 @@
// Random write: // Random write:
// //
// fio -thread -ioengine=./libfio_sec_osd.so -name=test -bs=4k -direct=1 -fsync=16 -iodepth=16 -rw=randwrite \ // fio -thread -ioengine=./libfio_sec_osd.so -name=test -bs=4k -direct=1 -fsync=16 -iodepth=16 -rw=randwrite \
// -host=127.0.0.1 -port=11203 [-block_size_order=17] [-single_primary=1] -size=1000M // -host=127.0.0.1 -port=11203 [-single_primary=1] -size=1000M
// //
// Linear write: // Linear write:
// //
@@ -53,7 +53,6 @@ struct sec_options
int port = 0; int port = 0;
int single_primary = 0; int single_primary = 0;
int trace = 0; int trace = 0;
int block_order = 17;
}; };
static struct fio_option options[] = { static struct fio_option options[] = {
@@ -75,15 +74,6 @@ static struct fio_option options[] = {
.category = FIO_OPT_C_ENGINE, .category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_FILENAME, .group = FIO_OPT_G_FILENAME,
}, },
{
.name = "block_size_order",
.lname = "Blockstore block size order",
.type = FIO_OPT_INT,
.off1 = offsetof(struct sec_options, block_order),
.help = "Blockstore block size order (size = 2^order)",
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_FILENAME,
},
{ {
.name = "single_primary", .name = "single_primary",
.lname = "Single Primary", .lname = "Single Primary",
@@ -150,8 +140,6 @@ static int sec_init(struct thread_data *td)
{ {
sec_options *o = (sec_options*)td->eo; sec_options *o = (sec_options*)td->eo;
sec_data *bsd = (sec_data*)td->io_ops_data; sec_data *bsd = (sec_data*)td->io_ops_data;
bsd->block_order = o->block_order == 0 ? 17 : o->block_order;
bsd->block_size = 1 << o->block_order;
struct sockaddr_in addr; struct sockaddr_in addr;
int r; int r;

View File

@@ -50,15 +50,8 @@ struct http_co_t
websocket_t ws; websocket_t ws;
int onstack = 0;
bool ended = false;
~http_co_t(); ~http_co_t();
inline void stackin() { onstack++; }
inline void stackout() { onstack--; if (!onstack && ended) end(); }
inline void end() { ended = true; if (!onstack) { delete this; } }
void start_connection(); void start_connection();
void handle_events();
void handle_connect_result(); void handle_connect_result();
void submit_read(); void submit_read();
void submit_send(); void submit_send();
@@ -144,7 +137,7 @@ void websocket_t::post_message(int type, const std::string & msg)
void websocket_t::close() void websocket_t::close()
{ {
co->end(); delete co;
} }
http_co_t::~http_co_t() http_co_t::~http_co_t()
@@ -156,7 +149,7 @@ http_co_t::~http_co_t()
} }
if (peer_fd >= 0) if (peer_fd >= 0)
{ {
tfd->set_fd_handler(peer_fd, NULL); tfd->set_fd_handler(peer_fd, false, NULL);
close(peer_fd); close(peer_fd);
peer_fd = -1; peer_fd = -1;
} }
@@ -180,15 +173,14 @@ http_co_t::~http_co_t()
void http_co_t::start_connection() void http_co_t::start_connection()
{ {
stackin();
int port = extract_port(host); int port = extract_port(host);
struct sockaddr_in addr; struct sockaddr_in addr;
int r; int r;
if ((r = inet_pton(AF_INET, host.c_str(), &addr.sin_addr)) != 1) if ((r = inet_pton(AF_INET, host.c_str(), &addr.sin_addr)) != 1)
{ {
parsed.error_code = ENXIO; parsed.error_code = ENXIO;
stackout(); // FIXME 'delete this' is ugly...
end(); delete this;
return; return;
} }
addr.sin_family = AF_INET; addr.sin_family = AF_INET;
@@ -197,8 +189,7 @@ void http_co_t::start_connection()
if (peer_fd < 0) if (peer_fd < 0)
{ {
parsed.error_code = errno; parsed.error_code = errno;
stackout(); delete this;
end();
return; return;
} }
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK); fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
@@ -210,81 +201,69 @@ void http_co_t::start_connection()
{ {
parsed.error_code = ETIME; parsed.error_code = ETIME;
} }
end(); delete this;
}); });
} }
tfd->set_fd_handler(peer_fd, true, [this](int peer_fd, int epoll_events)
{
this->epoll_events |= epoll_events;
handle_connect_result();
});
epoll_events = 0; epoll_events = 0;
// Finally call connect // Finally call connect
r = ::connect(peer_fd, (sockaddr*)&addr, sizeof(addr)); r = ::connect(peer_fd, (sockaddr*)&addr, sizeof(addr));
if (r < 0 && errno != EINPROGRESS) if (r < 0 && errno != EINPROGRESS)
{ {
parsed.error_code = errno; parsed.error_code = errno;
stackout(); delete this;
end();
return; return;
} }
tfd->set_fd_handler(peer_fd, [this](int peer_fd, int epoll_events)
{
this->epoll_events |= epoll_events;
handle_events();
});
state = HTTP_CO_CONNECTING; state = HTTP_CO_CONNECTING;
stackout();
}
void http_co_t::handle_events()
{
stackin();
while (epoll_events)
{
if (state == HTTP_CO_CONNECTING)
{
handle_connect_result();
}
else
{
epoll_events &= ~EPOLLOUT;
if (epoll_events & EPOLLIN)
{
submit_read();
}
else if (epoll_events & (EPOLLRDHUP|EPOLLERR))
{
end();
break;
}
}
}
stackout();
} }
void http_co_t::handle_connect_result() void http_co_t::handle_connect_result()
{ {
stackin(); if (epoll_events & (EPOLLOUT | EPOLLERR))
int result = 0;
socklen_t result_len = sizeof(result);
if (getsockopt(peer_fd, SOL_SOCKET, SO_ERROR, &result, &result_len) < 0)
{ {
result = errno; int result = 0;
socklen_t result_len = sizeof(result);
if (getsockopt(peer_fd, SOL_SOCKET, SO_ERROR, &result, &result_len) < 0)
{
result = errno;
}
if (result != 0)
{
parsed.error_code = result;
delete this;
return;
}
int one = 1;
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
{
this->epoll_events |= epoll_events;
if (this->epoll_events & EPOLLIN)
{
submit_read();
}
else if (this->epoll_events & (EPOLLRDHUP|EPOLLERR))
{
delete this;
}
});
state = HTTP_CO_SENDING_REQUEST;
submit_send();
} }
if (result != 0) else
{ {
parsed.error_code = result; delete this;
stackout();
end();
return;
} }
int one = 1;
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
state = HTTP_CO_SENDING_REQUEST;
submit_send();
stackout();
} }
void http_co_t::submit_read() void http_co_t::submit_read()
{ {
stackin();
int res; int res;
again:
if (rbuf.size() != READ_BUFFER_SIZE) if (rbuf.size() != READ_BUFFER_SIZE)
{ {
rbuf.resize(READ_BUFFER_SIZE); rbuf.resize(READ_BUFFER_SIZE);
@@ -292,30 +271,39 @@ void http_co_t::submit_read()
read_iov = { .iov_base = rbuf.data(), .iov_len = READ_BUFFER_SIZE }; read_iov = { .iov_base = rbuf.data(), .iov_len = READ_BUFFER_SIZE };
read_msg.msg_iov = &read_iov; read_msg.msg_iov = &read_iov;
read_msg.msg_iovlen = 1; read_msg.msg_iovlen = 1;
epoll_events = epoll_events & ~EPOLLIN;
res = recvmsg(peer_fd, &read_msg, 0); res = recvmsg(peer_fd, &read_msg, 0);
if (res < 0) if (res < 0)
{ {
res = -errno; res = -errno;
} }
if (res == -EAGAIN || res == 0) if (res == -EAGAIN)
{ {
epoll_events = epoll_events & ~EPOLLIN; res = 0;
} }
else if (res < 0) if (res < 0)
{ {
end(); delete this;
return;
} }
else if (res > 0) response += std::string(rbuf.data(), res);
if (res == READ_BUFFER_SIZE)
{ {
response += std::string(rbuf.data(), res); goto again;
handle_read(); }
if (!handle_read())
{
return;
}
if (res < READ_BUFFER_SIZE && (epoll_events & (EPOLLRDHUP|EPOLLERR)))
{
delete this;
return;
} }
stackout();
} }
void http_co_t::submit_send() void http_co_t::submit_send()
{ {
stackin();
int res; int res;
again: again:
if (sent < request.size()) if (sent < request.size())
@@ -323,7 +311,7 @@ again:
send_iov = (iovec){ .iov_base = (void*)(request.c_str()+sent), .iov_len = request.size()-sent }; send_iov = (iovec){ .iov_base = (void*)(request.c_str()+sent), .iov_len = request.size()-sent };
send_msg.msg_iov = &send_iov; send_msg.msg_iov = &send_iov;
send_msg.msg_iovlen = 1; send_msg.msg_iovlen = 1;
res = sendmsg(peer_fd, &send_msg, MSG_NOSIGNAL); res = sendmsg(peer_fd, &send_msg, 0);
if (res < 0) if (res < 0)
{ {
res = -errno; res = -errno;
@@ -334,17 +322,14 @@ again:
} }
else if (res < 0) else if (res < 0)
{ {
stackout(); delete this;
end();
return; return;
} }
sent += res; sent += res;
if (state == HTTP_CO_SENDING_REQUEST) if (state == HTTP_CO_SENDING_REQUEST)
{ {
if (sent >= request.size()) if (sent >= request.size())
{
state = HTTP_CO_REQUEST_SENT; state = HTTP_CO_REQUEST_SENT;
}
else else
goto again; goto again;
} }
@@ -355,12 +340,10 @@ again:
goto again; goto again;
} }
} }
stackout();
} }
bool http_co_t::handle_read() bool http_co_t::handle_read()
{ {
stackin();
if (state == HTTP_CO_REQUEST_SENT) if (state == HTTP_CO_REQUEST_SENT)
{ {
int pos = response.find("\r\n\r\n"); int pos = response.find("\r\n\r\n");
@@ -395,8 +378,7 @@ bool http_co_t::handle_read()
if (!target_response_size) if (!target_response_size)
{ {
// Sorry, unsupported response // Sorry, unsupported response
stackout(); delete this;
end();
return false; return false;
} }
} }
@@ -404,8 +386,7 @@ bool http_co_t::handle_read()
} }
if (state == HTTP_CO_HEADERS_RECEIVED && target_response_size > 0 && response.size() >= target_response_size) if (state == HTTP_CO_HEADERS_RECEIVED && target_response_size > 0 && response.size() >= target_response_size)
{ {
stackout(); delete this;
end();
return false; return false;
} }
if (state == HTTP_CO_CHUNKED && response.size() > 0) if (state == HTTP_CO_CHUNKED && response.size() > 0)
@@ -433,8 +414,7 @@ bool http_co_t::handle_read()
} }
if (parsed.eof) if (parsed.eof)
{ {
stackout(); delete this;
end();
return false; return false;
} }
if (want_streaming && parsed.body.size() > 0) if (want_streaming && parsed.body.size() > 0)
@@ -451,13 +431,11 @@ bool http_co_t::handle_read()
parsed.body = ""; parsed.body = "";
} }
} }
stackout();
return true; return true;
} }
void http_co_t::post_message(int type, const std::string & msg) void http_co_t::post_message(int type, const std::string & msg)
{ {
stackin();
if (state == HTTP_CO_WEBSOCKET) if (state == HTTP_CO_WEBSOCKET)
{ {
request += ws_format_frame(type, msg.size()); request += ws_format_frame(type, msg.size());
@@ -469,7 +447,6 @@ void http_co_t::post_message(int type, const std::string & msg)
ws_outbox += ws_format_frame(type, msg.size()); ws_outbox += ws_format_frame(type, msg.size());
ws_outbox += msg; ws_outbox += msg;
} }
stackout();
} }
uint64_t stoull_full(const std::string & str, int base) uint64_t stoull_full(const std::string & str, int base)

View File

@@ -1,398 +0,0 @@
#include <unistd.h>
#include <fcntl.h>
#include <sys/socket.h>
#include <sys/epoll.h>
#include <netinet/tcp.h>
#include "messenger.h"
// Destructor: an osd_op_t must not be destroyed while it still owns a
// blockstore operation or primary-op data - those are released by their
// owners before the op itself is deleted.
osd_op_t::~osd_op_t()
{
    assert(!bs_op);
    assert(!op_data);
    if (rmw_buf != NULL)
    {
        free(rmw_buf);
    }
    // Note: reusing osd_op_t WILL currently lead to memory leaks,
    // so operations are never reused - the buffer is freed every time
    if (buf != NULL)
    {
        free(buf);
    }
}
// Register (or refresh) the intent to connect to <peer_osd>, taking the
// address list and port from its etcd-reported state, and kick off a
// connection attempt unless one is in progress or was tried too recently.
void osd_messenger_t::connect_peer(uint64_t peer_osd, json11::Json peer_state)
{
    if (wanted_peers.find(peer_osd) == wanted_peers.end())
    {
        wanted_peers[peer_osd] = (osd_wanted_peer_t){
            .address_list = peer_state["addresses"],
            .port = (int)peer_state["port"].int64_value(),
        };
    }
    else
    {
        // Peer is already wanted - just refresh its address list and port
        wanted_peers[peer_osd].address_list = peer_state["addresses"];
        wanted_peers[peer_osd].port = (int)peer_state["port"].int64_value();
    }
    // Mark the address data as fresh so a failing attempt restarts from index 0
    wanted_peers[peer_osd].address_changed = true;
    if (!wanted_peers[peer_osd].connecting &&
        (time(NULL) - wanted_peers[peer_osd].last_connect_attempt) >= peer_connect_interval)
    {
        try_connect_peer(peer_osd);
    }
}
// Try to connect to the currently selected address of <peer_osd>.
// No-op if the peer is no longer wanted; if a connection already exists,
// the "wanted" entry is simply dropped.
void osd_messenger_t::try_connect_peer(uint64_t peer_osd)
{
    auto wp_it = wanted_peers.find(peer_osd);
    if (wp_it == wanted_peers.end())
    {
        return;
    }
    if (osd_peer_fds.find(peer_osd) != osd_peer_fds.end())
    {
        // Already connected
        wanted_peers.erase(peer_osd);
        return;
    }
    auto & wp = wp_it->second;
    if (wp.address_index >= wp.address_list.array_items().size())
    {
        // All addresses exhausted; on_connect_peer() resets the index on failure
        return;
    }
    wp.cur_addr = wp.address_list[wp.address_index].string_value();
    wp.cur_port = wp.port;
    // NOTE(review): wp.connecting is checked in connect_peer() but never set
    // to true here - verify it is set elsewhere, otherwise attempts may overlap
    try_connect_peer_addr(peer_osd, wp.cur_addr.c_str(), wp.cur_port);
}
// Start a non-blocking TCP connection to <peer_host>:<peer_port> for
// OSD <peer_osd>. Registers the socket in clients[], arms an optional
// connect timeout and an epoll handler; the outcome is reported
// asynchronously via handle_connect_epoll() -> on_connect_peer().
void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port)
{
    struct sockaddr_in addr;
    int r;
    if ((r = inet_pton(AF_INET, peer_host, &addr.sin_addr)) != 1)
    {
        // Not a valid IPv4 address
        on_connect_peer(peer_osd, -EINVAL);
        return;
    }
    addr.sin_family = AF_INET;
    addr.sin_port = htons(peer_port ? peer_port : 11203); // 11203 = default OSD port
    int peer_fd = socket(AF_INET, SOCK_STREAM, 0);
    if (peer_fd < 0)
    {
        on_connect_peer(peer_osd, -errno);
        return;
    }
    fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
    int timeout_id = -1;
    if (peer_connect_timeout > 0)
    {
        // Abort the attempt if it doesn't complete within peer_connect_timeout seconds
        timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, peer_fd](int timer_id)
        {
            // Look up the OSD number before stop_client() erases the clients[] entry
            osd_num_t peer_osd = clients[peer_fd].osd_num;
            stop_client(peer_fd);
            on_connect_peer(peer_osd, -EIO);
            return;
        });
    }
    r = connect(peer_fd, (sockaddr*)&addr, sizeof(addr));
    if (r < 0 && errno != EINPROGRESS)
    {
        close(peer_fd);
        on_connect_peer(peer_osd, -errno);
        return;
    }
    // Connecting to ourselves would be a configuration/state bug
    assert(peer_osd != this->osd_num);
    clients[peer_fd] = (osd_client_t){
        .peer_addr = addr,
        .peer_port = peer_port,
        .peer_fd = peer_fd,
        .peer_state = PEER_CONNECTING,
        .connect_timeout_id = timeout_id,
        .osd_num = peer_osd,
        .in_buf = malloc(receive_buffer_size),
    };
    tfd->set_fd_handler(peer_fd, [this](int peer_fd, int epoll_events)
    {
        // Either OUT (connected) or HUP
        handle_connect_epoll(peer_fd);
    });
}
// Epoll callback for a socket in PEER_CONNECTING state: reads the
// asynchronous connect() result via SO_ERROR; on success switches the
// client to PEER_CONNECTED, installs the regular data handler and
// verifies the peer's identity with check_peer_config().
void osd_messenger_t::handle_connect_epoll(int peer_fd)
{
    auto & cl = clients[peer_fd];
    if (cl.connect_timeout_id >= 0)
    {
        // The attempt finished (either way) - disarm the connect timeout
        tfd->clear_timer(cl.connect_timeout_id);
        cl.connect_timeout_id = -1;
    }
    osd_num_t peer_osd = cl.osd_num;
    int result = 0;
    socklen_t result_len = sizeof(result);
    if (getsockopt(peer_fd, SOL_SOCKET, SO_ERROR, &result, &result_len) < 0)
    {
        result = errno;
    }
    if (result != 0)
    {
        // connect() failed asynchronously - tear down and report
        stop_client(peer_fd);
        on_connect_peer(peer_osd, -result);
        return;
    }
    int one = 1;
    setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
    cl.peer_state = PEER_CONNECTED;
    // FIXME Disable EPOLLOUT on this fd
    tfd->set_fd_handler(peer_fd, [this](int peer_fd, int epoll_events)
    {
        handle_peer_epoll(peer_fd, epoll_events);
    });
    // Check OSD number
    check_peer_config(cl);
}
// Epoll callback for an established connection: a hangup tears the
// client down, incoming data marks it read-ready and wakes the ring loop.
void osd_messenger_t::handle_peer_epoll(int peer_fd, int epoll_events)
{
    if (epoll_events & EPOLLRDHUP)
    {
        // Peer closed its side of the connection - drop the client
        printf("[OSD %lu] client %d disconnected\n", this->osd_num, peer_fd);
        stop_client(peer_fd);
        return;
    }
    if (!(epoll_events & EPOLLIN))
    {
        return;
    }
    // Some data is available - mark the client as read-ready and
    // enqueue it only on the 0 -> 1 transition to avoid duplicates
    auto & cl = clients[peer_fd];
    if (++cl.read_ready == 1)
    {
        read_ready_clients.push_back(cl.peer_fd);
        ringloop->wakeup();
    }
}
// Completion callback for a peer connection attempt.
// <peer_fd> is the connected socket on success, or a negative errno on
// failure - in which case the next address is tried, or a retry timer armed
// once all addresses have been exhausted.
void osd_messenger_t::on_connect_peer(osd_num_t peer_osd, int peer_fd)
{
    auto & wp = wanted_peers.at(peer_osd);
    wp.connecting = false;
    if (peer_fd < 0)
    {
        printf("Failed to connect to peer OSD %lu address %s port %d: %s\n", peer_osd, wp.cur_addr.c_str(), wp.cur_port, strerror(-peer_fd));
        if (wp.address_changed)
        {
            // The address list was refreshed mid-attempt - restart from its beginning
            wp.address_changed = false;
            wp.address_index = 0;
            try_connect_peer(peer_osd);
        }
        else if (wp.address_index < wp.address_list.array_items().size()-1)
        {
            // Try other addresses
            wp.address_index++;
            try_connect_peer(peer_osd);
        }
        else
        {
            // Retry again in <peer_connect_interval> seconds
            wp.last_connect_attempt = time(NULL);
            wp.address_index = 0;
            tfd->set_timer(1000*peer_connect_interval, false, [this, peer_osd](int)
            {
                try_connect_peer(peer_osd);
            });
        }
        return;
    }
    printf("Connected with peer OSD %lu (fd %d)\n", peer_osd, peer_fd);
    // Connection established - the peer is no longer "wanted"; re-peer its PGs
    wanted_peers.erase(peer_osd);
    repeer_pgs(peer_osd);
}
// Send OSD_OP_SHOW_CONFIG to a newly connected peer to verify it really
// is the OSD we wanted (peer state in etcd may be outdated). On success
// the peer is registered in osd_peer_fds; on any error or identity
// mismatch the connection is dropped.
void osd_messenger_t::check_peer_config(osd_client_t & cl)
{
    osd_op_t *op = new osd_op_t();
    op->op_type = OSD_OP_OUT;
    op->send_list.push_back(op->req.buf, OSD_PACKET_SIZE);
    op->peer_fd = cl.peer_fd;
    op->req = {
        .show_conf = {
            .header = {
                .magic = SECONDARY_OSD_OP_MAGIC,
                .id = this->next_subop_id++,
                .opcode = OSD_OP_SHOW_CONFIG,
            },
        },
    };
    // Runs when the SHOW_CONFIG reply arrives (or the op is cancelled)
    op->callback = [this](osd_op_t *op)
    {
        osd_client_t & cl = clients[op->peer_fd];
        std::string json_err;
        json11::Json config;
        bool err = false;
        if (op->reply.hdr.retval < 0)
        {
            err = true;
            printf("Failed to get config from OSD %lu (retval=%ld), disconnecting peer\n", cl.osd_num, op->reply.hdr.retval);
        }
        else
        {
            // The reply payload is the peer's configuration as JSON text
            // NOTE(review): assumes op->buf is NUL-terminated - confirm the reader guarantees it
            config = json11::Json::parse(std::string((char*)op->buf), json_err);
            if (json_err != "")
            {
                err = true;
                printf("Failed to get config from OSD %lu: bad JSON: %s, disconnecting peer\n", cl.osd_num, json_err.c_str());
            }
            else if (config["osd_num"].uint64_value() != cl.osd_num)
            {
                err = true;
                printf("Connected to OSD %lu instead of OSD %lu, peer state is outdated, disconnecting peer\n", config["osd_num"].uint64_value(), cl.osd_num);
                // Report failure so the connect logic can retry with fresh state
                on_connect_peer(cl.osd_num, -1);
            }
        }
        if (err)
        {
            stop_client(op->peer_fd);
            delete op;
            return;
        }
        // Identity confirmed - register the peer connection
        osd_peer_fds[cl.osd_num] = cl.peer_fd;
        on_connect_peer(cl.osd_num, cl.peer_fd);
        delete op;
    };
    outbox_push(op);
}
// Cancel every outbound operation associated with a client: ops already
// sent and awaiting replies, queued ones, and the partially written one,
// if any. Used while the connection is being dropped.
void osd_messenger_t::cancel_osd_ops(osd_client_t & cl)
{
    for (auto & kv: cl.sent_ops)
    {
        cancel_op(kv.second);
    }
    cl.sent_ops.clear();
    for (auto queued_op: cl.outbox)
    {
        cancel_op(queued_op);
    }
    cl.outbox.clear();
    if (cl.write_op != NULL)
    {
        cancel_op(cl.write_op);
        cl.write_op = NULL;
    }
}
// Fail a single operation with -EPIPE. Outbound (OSD_OP_OUT) operations
// are completed through their callback so the issuer can react;
// inbound ones are simply destroyed.
void osd_messenger_t::cancel_op(osd_op_t *op)
{
    if (op->op_type == OSD_OP_OUT)
    {
        // Fabricate an -EPIPE reply mirroring the request header
        op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
        op->reply.hdr.id = op->req.hdr.id;
        op->reply.hdr.opcode = op->req.hdr.opcode;
        op->reply.hdr.retval = -EPIPE;
        // Copy lambda to be unaffected by `delete op`
        std::function<void(osd_op_t*)>(op->callback)(op);
    }
    else
    {
        // This function is only called in stop_client(), so it's fine to destroy the operation
        delete op;
    }
}
// Tear down a client connection: deregister from epoll, cancel its
// outbound operations, free its buffers, close the socket, and - if it
// was a peer OSD - trigger re-peering of the affected PGs.
// Safe to call for an fd that is already gone (no-op then).
void osd_messenger_t::stop_client(int peer_fd)
{
    assert(peer_fd != 0);
    auto it = clients.find(peer_fd);
    if (it == clients.end())
    {
        return;
    }
    uint64_t repeer_osd = 0;
    // Copy the entry: the map element is erased below but its fields are still needed
    osd_client_t cl = it->second;
    if (cl.peer_state == PEER_CONNECTED)
    {
        if (cl.osd_num)
        {
            // Reload configuration from etcd when the connection is dropped
            printf("[OSD %lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl.osd_num);
            repeer_osd = cl.osd_num;
        }
        else
        {
            printf("[OSD %lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
        }
    }
    clients.erase(it);
    // NULL handler removes the fd from epoll
    tfd->set_fd_handler(peer_fd, NULL);
    if (cl.osd_num)
    {
        osd_peer_fds.erase(cl.osd_num);
        // Cancel outbound operations
        cancel_osd_ops(cl);
    }
    if (cl.read_op)
    {
        delete cl.read_op;
        cl.read_op = NULL;
    }
    // Remove the fd from both ready queues (it appears at most once in each)
    for (auto rit = read_ready_clients.begin(); rit != read_ready_clients.end(); rit++)
    {
        if (*rit == peer_fd)
        {
            read_ready_clients.erase(rit);
            break;
        }
    }
    for (auto wit = write_ready_clients.begin(); wit != write_ready_clients.end(); wit++)
    {
        if (*wit == peer_fd)
        {
            write_ready_clients.erase(wit);
            break;
        }
    }
    free(cl.in_buf);
    close(peer_fd);
    if (repeer_osd)
    {
        repeer_pgs(repeer_osd);
    }
}
// Accept all pending connections on the (non-blocking) listening socket,
// looping until accept() returns EAGAIN. Each new client is made
// non-blocking, registered in clients[] and added to epoll.
void osd_messenger_t::accept_connections(int listen_fd)
{
    // Accept new connections
    sockaddr_in addr;
    socklen_t peer_addr_size = sizeof(addr);
    int peer_fd;
    while ((peer_fd = accept(listen_fd, (sockaddr*)&addr, &peer_addr_size)) >= 0)
    {
        assert(peer_fd != 0);
        char peer_str[256];
        printf("[OSD %lu] new client %d: connection from %s port %d\n", this->osd_num, peer_fd,
            inet_ntop(AF_INET, &addr.sin_addr, peer_str, 256), ntohs(addr.sin_port));
        fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
        int one = 1;
        setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
        clients[peer_fd] = {
            .peer_addr = addr,
            .peer_port = ntohs(addr.sin_port),
            .peer_fd = peer_fd,
            .peer_state = PEER_CONNECTED,
            .in_buf = malloc(receive_buffer_size),
        };
        // Add FD to epoll
        tfd->set_fd_handler(peer_fd, [this](int peer_fd, int epoll_events)
        {
            handle_peer_epoll(peer_fd, epoll_events);
        });
        // Try to accept next connection
        peer_addr_size = sizeof(addr);
    }
    if (peer_fd == -1 && errno != EAGAIN)
    {
        throw std::runtime_error(std::string("accept: ") + strerror(errno));
    }
}

View File

@@ -1,213 +0,0 @@
#pragma once
#include <sys/types.h>
#include <stdint.h>
#include <arpa/inet.h>
#include <malloc.h>
#include <set>
#include <map>
#include <deque>
#include <vector>
#include "json11/json11.hpp"
#include "osd_ops.h"
#include "timerfd_manager.h"
#include "ringloop.h"
#define OSD_OP_IN 0
#define OSD_OP_OUT 1
#define CL_READ_HDR 1
#define CL_READ_DATA 2
#define CL_READ_REPLY_DATA 3
#define CL_WRITE_READY 1
#define CL_WRITE_REPLY 2
#define OSD_OP_INLINE_BUF_COUNT 16

#define PEER_CONNECTING 1
#define PEER_CONNECTED 2
#define DEFAULT_PEER_CONNECT_INTERVAL 5
#define DEFAULT_PEER_CONNECT_TIMEOUT 5

// Growable list of iovecs for scatter-gather sends.
// Starts on a small inline buffer (no heap allocation for the common case
// of few segments) and moves to the heap when it outgrows it.
// <sent> tracks how many leading iovecs have already been transmitted.
struct osd_op_buf_list_t
{
    int count = 0, alloc = 0, sent = 0;
    iovec *buf = NULL;
    iovec inline_buf[OSD_OP_INLINE_BUF_COUNT];

    ~osd_op_buf_list_t()
    {
        // Only free heap storage; inline_buf lives inside the object
        if (buf && buf != inline_buf)
        {
            free(buf);
        }
    }

    // First iovec that still has to be sent
    inline iovec* get_iovec()
    {
        return (buf ? buf : inline_buf) + sent;
    }

    // Number of iovecs that still have to be sent
    inline int get_size()
    {
        return count - sent;
    }

    // Append one buffer to the list, growing storage when needed
    inline void push_back(void *nbuf, size_t len)
    {
        if (count >= alloc)
        {
            if (!alloc)
            {
                // First use: point at the inline storage
                alloc = OSD_OP_INLINE_BUF_COUNT;
                buf = inline_buf;
            }
            else if (buf == inline_buf)
            {
                // Outgrew the inline buffer: move to the heap.
                // Grow geometrically to keep appends amortized O(1) - the
                // previous "(alloc/16)*16 + 1" formula grew capacity by a
                // single element, causing a realloc on every push.
                int old = alloc;
                alloc = alloc*2;
                buf = (iovec*)malloc(sizeof(iovec) * alloc);
                memcpy(buf, inline_buf, sizeof(iovec)*old);
            }
            else
            {
                alloc = alloc*2;
                buf = (iovec*)realloc(buf, sizeof(iovec) * alloc);
            }
        }
        buf[count++] = { .iov_base = nbuf, .iov_len = len };
    }
};
struct blockstore_op_t;
struct osd_primary_op_data_t;
// A single OSD operation: either received from a client (OSD_OP_IN)
// or sent to another OSD (OSD_OP_OUT).
struct osd_op_t
{
    timespec tv_begin;                       // start timestamp, used for latency stats
    uint64_t op_type = OSD_OP_IN;            // OSD_OP_IN or OSD_OP_OUT
    int peer_fd;                             // socket this operation belongs to
    osd_any_op_t req;                        // request packet
    osd_any_reply_t reply;                   // reply packet
    blockstore_op_t *bs_op = NULL;           // attached blockstore op; must be released before destruction (asserted)
    void *buf = NULL;                        // payload buffer, owned (freed by the destructor)
    void *rmw_buf = NULL;                    // read-modify-write scratch buffer, owned
    osd_primary_op_data_t* op_data = NULL;   // primary-OSD state; must be released before destruction (asserted)
    std::function<void(osd_op_t*)> callback; // completion callback
    osd_op_buf_list_t send_list;             // iovecs queued for sending

    ~osd_op_t();
};
// Per-connection state for an accepted client or an outbound peer link
struct osd_client_t
{
    sockaddr_in peer_addr;
    int peer_port;
    int peer_fd;
    int peer_state;                  // PEER_CONNECTING or PEER_CONNECTED
    int connect_timeout_id = -1;     // timer id while connecting, -1 = none armed
    osd_num_t osd_num = 0;           // 0 for regular (non-OSD) clients
    void *in_buf = NULL;             // receive buffer (receive_buffer_size bytes, freed in stop_client)
    // Read state
    int read_ready = 0;              // >0: epoll reported data available on this fd
    osd_op_t *read_op = NULL;        // operation currently being received
    int read_reply_id = 0;           // id of the reply currently being read
    iovec read_iov;
    msghdr read_msg;
    void *read_buf = NULL;
    int read_remaining = 0;          // bytes left in the current read phase
    int read_state = 0;              // CL_READ_HDR / CL_READ_DATA / CL_READ_REPLY_DATA
    // Incoming operations
    std::vector<osd_op_t*> received_ops;
    // Outbound operations
    std::deque<osd_op_t*> outbox;       // queued, not yet sent
    std::map<int, osd_op_t*> sent_ops;  // sent, awaiting replies, keyed by op id
    // PGs dirtied by this client's primary-writes (FIXME to drop the connection)
    std::set<pg_num_t> dirty_pgs;
    // Write state
    osd_op_t *write_op = NULL;       // operation currently being sent
    msghdr write_msg;
    int write_state = 0;             // CL_WRITE_READY / CL_WRITE_REPLY
};
// Connection intent for a peer OSD: the address list to try, retry
// bookkeeping, and the address/port of the attempt in progress.
// All POD members are zero-initialized so a stack-constructed instance
// starts in a sane state (previously they were left indeterminate;
// existing designated-initializer and map value-init uses already
// zeroed them, so this is behavior-compatible).
struct osd_wanted_peer_t
{
    json11::Json address_list;        // candidate addresses (JSON array of strings)
    int port = 0;                     // TCP port reported by the peer
    time_t last_connect_attempt = 0;  // time of the last failed round, throttles retries
    bool connecting = false;          // a connection attempt is in progress
    bool address_changed = false;     // address_list was refreshed during an attempt
    int address_index = 0;            // index of the next address to try
    std::string cur_addr;             // address currently being tried
    int cur_port = 0;                 // port currently being tried
};
// Cumulative operation statistics, indexed by opcode (0..OSD_OP_MAX)
struct osd_op_stats_t
{
    // Client-facing operations: summed execution time, count, and payload
    // bytes (time units are whatever the caller accumulates - TODO confirm)
    uint64_t op_stat_sum[OSD_OP_MAX+1] = { 0 };
    uint64_t op_stat_count[OSD_OP_MAX+1] = { 0 };
    uint64_t op_stat_bytes[OSD_OP_MAX+1] = { 0 };
    // Sub-operations sent to other OSDs: summed time and count
    uint64_t subop_stat_sum[OSD_OP_MAX+1] = { 0 };
    uint64_t subop_stat_count[OSD_OP_MAX+1] = { 0 };
};
// Network messenger: owns all peer/client connections of an OSD,
// maintains the wanted-peer list and collects per-opcode statistics.
struct osd_messenger_t
{
    timerfd_manager_t *tfd;     // timers and epoll fd registration
    ring_loop_t *ringloop;      // io_uring event loop, woken when clients become read-ready
    // osd_num_t is only for logging and asserts
    osd_num_t osd_num;
    int receive_buffer_size = 9000;                             // per-client input buffer size, bytes
    int peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;  // seconds between retry rounds
    int peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;    // seconds before a connect attempt is aborted
    int log_level = 0;

    std::map<osd_num_t, osd_wanted_peer_t> wanted_peers;  // peers we still need to connect to
    std::map<uint64_t, int> osd_peer_fds;                 // established peer connections: osd_num -> fd
    uint64_t next_subop_id = 1;                           // id generator for outbound sub-operations

    std::map<int, osd_client_t> clients;   // all connections, keyed by fd
    std::vector<int> read_ready_clients;   // fds with pending incoming data
    std::vector<int> write_ready_clients;  // fds ready for more outgoing data

    // op statistics
    osd_op_stats_t stats;

public:
    void connect_peer(uint64_t osd_num, json11::Json peer_state);
    void stop_client(int peer_fd);
    void outbox_push(osd_op_t *cur_op);
    std::function<void(osd_op_t*)> exec_op;     // set by the owner: executes an incoming operation
    std::function<void(osd_num_t)> repeer_pgs;  // set by the owner: re-peers PGs after peer (dis)connect
    void handle_peer_epoll(int peer_fd, int epoll_events);
    void read_requests();
    void send_replies();
    void accept_connections(int listen_fd);
protected:
    void try_connect_peer(uint64_t osd_num);
    void try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port);
    void handle_connect_epoll(int peer_fd);
    void on_connect_peer(osd_num_t peer_osd, int peer_fd);
    void check_peer_config(osd_client_t & cl);
    void cancel_osd_ops(osd_client_t & cl);
    void cancel_op(osd_op_t *op);
    bool try_send(osd_client_t & cl);
    void handle_send(int result, int peer_fd);
    bool handle_read(int result, int peer_fd);
    void handle_finished_read(osd_client_t & cl);
    void handle_op_hdr(osd_client_t *cl);
    void handle_reply_hdr(osd_client_t *cl);
};

83
osd.cpp
View File

@@ -7,8 +7,6 @@
#include "osd.h" #include "osd.h"
#define MAX_EPOLL_EVENTS 64
const char* osd_op_names[] = { const char* osd_op_names[] = {
"", "",
"read", "read",
@@ -44,7 +42,7 @@ osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringlo
throw std::runtime_error(std::string("epoll_create: ") + strerror(errno)); throw std::runtime_error(std::string("epoll_create: ") + strerror(errno));
} }
this->tfd = new timerfd_manager_t([this](int fd, std::function<void(int, int)> handler) { set_fd_handler(fd, handler); }); this->tfd = new timerfd_manager_t([this](int fd, bool out, std::function<void(int, int)> handler) { set_fd_handler(fd, out, handler); });
this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id) this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
{ {
print_stats(); print_stats();
@@ -75,9 +73,29 @@ osd_t::~osd_t()
void osd_t::parse_config(blockstore_config_t & config) void osd_t::parse_config(blockstore_config_t & config)
{ {
int pos;
// Initial startup configuration // Initial startup configuration
json11::Json json_config = json11::Json(config); {
st_cli.parse_config(json_config); std::string ea = config["etcd_address"];
while (1)
{
pos = ea.find(',');
std::string addr = pos >= 0 ? ea.substr(0, pos) : ea;
if (addr.length() > 0)
{
if (addr.find('/') < 0)
addr += "/v3";
st_cli.etcd_addresses.push_back(addr);
}
if (pos >= 0)
ea = ea.substr(pos+1);
else
break;
}
}
st_cli.etcd_prefix = config["etcd_prefix"];
if (st_cli.etcd_prefix == "")
st_cli.etcd_prefix = "/microceph";
etcd_report_interval = strtoull(config["etcd_report_interval"].c_str(), NULL, 10); etcd_report_interval = strtoull(config["etcd_report_interval"].c_str(), NULL, 10);
if (etcd_report_interval <= 0) if (etcd_report_interval <= 0)
etcd_report_interval = 30; etcd_report_interval = 30;
@@ -125,11 +143,12 @@ void osd_t::parse_config(blockstore_config_t & config)
print_stats_interval = 3; print_stats_interval = 3;
c_cli.peer_connect_interval = strtoull(config["peer_connect_interval"].c_str(), NULL, 10); c_cli.peer_connect_interval = strtoull(config["peer_connect_interval"].c_str(), NULL, 10);
if (!c_cli.peer_connect_interval) if (!c_cli.peer_connect_interval)
c_cli.peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL; c_cli.peer_connect_interval = 5;
c_cli.peer_connect_timeout = strtoull(config["peer_connect_timeout"].c_str(), NULL, 10); c_cli.peer_connect_timeout = strtoull(config["peer_connect_timeout"].c_str(), NULL, 10);
if (!c_cli.peer_connect_timeout) if (!c_cli.peer_connect_timeout)
c_cli.peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT; c_cli.peer_connect_timeout = 5;
log_level = strtoull(config["log_level"].c_str(), NULL, 10); log_level = strtoull(config["log_level"].c_str(), NULL, 10);
st_cli.log_level = log_level;
c_cli.log_level = log_level; c_cli.log_level = log_level;
} }
@@ -183,7 +202,7 @@ void osd_t::bind_socket()
epoll_event ev; epoll_event ev;
ev.data.fd = listen_fd; ev.data.fd = listen_fd;
ev.events = EPOLLIN | EPOLLET; ev.events = EPOLLIN;
if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, listen_fd, &ev) < 0) if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, listen_fd, &ev) < 0)
{ {
close(listen_fd); close(listen_fd);
@@ -215,14 +234,14 @@ void osd_t::loop()
ringloop->submit(); ringloop->submit();
} }
void osd_t::set_fd_handler(int fd, std::function<void(int, int)> handler) void osd_t::set_fd_handler(int fd, bool out, std::function<void(int, int)> handler)
{ {
if (handler != NULL) if (handler != NULL)
{ {
bool exists = epoll_handlers.find(fd) != epoll_handlers.end(); bool exists = epoll_handlers.find(fd) != epoll_handlers.end();
epoll_event ev; epoll_event ev;
ev.data.fd = fd; ev.data.fd = fd;
ev.events = EPOLLOUT | EPOLLIN | EPOLLRDHUP | EPOLLET; ev.events = EPOLLIN | (out ? EPOLLOUT : 0) | EPOLLRDHUP;
if (epoll_ctl(epoll_fd, exists ? EPOLL_CTL_MOD : EPOLL_CTL_ADD, fd, &ev) < 0) if (epoll_ctl(epoll_fd, exists ? EPOLL_CTL_MOD : EPOLL_CTL_ADD, fd, &ev) < 0)
{ {
throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno)); throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
@@ -241,11 +260,18 @@ void osd_t::set_fd_handler(int fd, std::function<void(int, int)> handler)
void osd_t::handle_epoll_events() void osd_t::handle_epoll_events()
{ {
wait_state = 0;
{
timespec now;
clock_gettime(CLOCK_REALTIME, &now);
printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
}
io_uring_sqe *sqe = ringloop->get_sqe(); io_uring_sqe *sqe = ringloop->get_sqe();
if (!sqe) if (!sqe)
{ {
throw std::runtime_error("can't get SQE, will fall out of sync with EPOLLET"); return;
} }
wait_state = 1;
ring_data_t *data = ((ring_data_t*)sqe->user_data); ring_data_t *data = ((ring_data_t*)sqe->user_data);
my_uring_prep_poll_add(sqe, epoll_fd, POLLIN); my_uring_prep_poll_add(sqe, epoll_fd, POLLIN);
data->callback = [this](ring_data_t *data) data->callback = [this](ring_data_t *data)
@@ -256,7 +282,6 @@ void osd_t::handle_epoll_events()
} }
handle_epoll_events(); handle_epoll_events();
}; };
ringloop->submit();
int nfds; int nfds;
epoll_event events[MAX_EPOLL_EVENTS]; epoll_event events[MAX_EPOLL_EVENTS];
restart: restart:
@@ -265,7 +290,38 @@ restart:
{ {
if (events[i].data.fd == listen_fd) if (events[i].data.fd == listen_fd)
{ {
c_cli.accept_connections(listen_fd); // Accept new connections
sockaddr_in addr;
socklen_t peer_addr_size = sizeof(addr);
int peer_fd;
while ((peer_fd = accept(listen_fd, (sockaddr*)&addr, &peer_addr_size)) >= 0)
{
assert(peer_fd != 0);
char peer_str[256];
printf("[OSD %lu] new client %d: connection from %s port %d\n", this->osd_num, peer_fd,
inet_ntop(AF_INET, &addr.sin_addr, peer_str, 256), ntohs(addr.sin_port));
fcntl(peer_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
int one = 1;
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
c_cli.clients[peer_fd] = {
.peer_addr = addr,
.peer_port = ntohs(addr.sin_port),
.peer_fd = peer_fd,
.peer_state = PEER_CONNECTED,
.in_buf = malloc(c_cli.receive_buffer_size),
};
// Add FD to epoll
set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
{
c_cli.handle_peer_epoll(peer_fd, epoll_events);
});
// Try to accept next connection
peer_addr_size = sizeof(addr);
}
if (peer_fd == -1 && errno != EAGAIN)
{
throw std::runtime_error(std::string("accept: ") + strerror(errno));
}
} }
else else
{ {
@@ -273,6 +329,7 @@ restart:
cb(events[i].data.fd, events[i].events); cb(events[i].data.fd, events[i].events);
} }
} }
printf("%d events\n", nfds);
if (nfds == MAX_EPOLL_EVENTS) if (nfds == MAX_EPOLL_EVENTS)
{ {
goto restart; goto restart;

13
osd.h
View File

@@ -17,7 +17,7 @@
#include "ringloop.h" #include "ringloop.h"
#include "timerfd_manager.h" #include "timerfd_manager.h"
#include "osd_peering_pg.h" #include "osd_peering_pg.h"
#include "messenger.h" #include "cluster_client.h"
#include "etcd_state_client.h" #include "etcd_state_client.h"
#define OSD_LOADING_PGS 0x01 #define OSD_LOADING_PGS 0x01
@@ -78,7 +78,7 @@ class osd_t
// cluster state // cluster state
etcd_state_client_t st_cli; etcd_state_client_t st_cli;
osd_messenger_t c_cli; cluster_client_t c_cli;
int etcd_failed_attempts = 0; int etcd_failed_attempts = 0;
std::string etcd_lease_id; std::string etcd_lease_id;
json11::Json self_state; json11::Json self_state;
@@ -149,7 +149,7 @@ class osd_t
// event loop, socket read/write // event loop, socket read/write
void loop(); void loop();
void set_fd_handler(int fd, std::function<void(int, int)> handler); void set_fd_handler(int fd, bool out, std::function<void(int, int)> handler);
void handle_epoll_events(); void handle_epoll_events();
// peer handling (primary OSD logic) // peer handling (primary OSD logic)
@@ -187,16 +187,15 @@ class osd_t
bool prepare_primary_rw(osd_op_t *cur_op); bool prepare_primary_rw(osd_op_t *cur_op);
void continue_primary_read(osd_op_t *cur_op); void continue_primary_read(osd_op_t *cur_op);
void continue_primary_write(osd_op_t *cur_op); void continue_primary_write(osd_op_t *cur_op);
void cancel_primary_write(osd_op_t *cur_op);
void continue_primary_sync(osd_op_t *cur_op); void continue_primary_sync(osd_op_t *cur_op);
void continue_primary_del(osd_op_t *cur_op); void continue_primary_del(osd_op_t *cur_op);
bool check_write_queue(osd_op_t *cur_op, pg_t & pg); bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
void remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg); void remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg);
bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state); bool finalize_primary_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op); void handle_primary_subop(uint64_t opcode, osd_op_t *cur_op, int retval, int expected, uint64_t version);
void handle_primary_bs_subop(osd_op_t *subop); void handle_primary_bs_subop(osd_op_t *subop);
void add_bs_subop_stats(osd_op_t *subop); void add_bs_subop_stats(osd_op_t *subop);
void pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid, int retval); void pg_cancel_write_queue(pg_t & pg, object_id oid, int retval);
void submit_primary_subops(int submit_type, int read_pg_size, const uint64_t* osd_set, osd_op_t *cur_op); void submit_primary_subops(int submit_type, int read_pg_size, const uint64_t* osd_set, osd_op_t *cur_op);
void submit_primary_del_subops(osd_op_t *cur_op, uint64_t *cur_set, pg_osd_set_t & loc_set); void submit_primary_del_subops(osd_op_t *cur_op, uint64_t *cur_set, pg_osd_set_t & loc_set);
void submit_primary_sync_subops(osd_op_t *cur_op); void submit_primary_sync_subops(osd_op_t *cur_op);

40
osd_client.cpp Normal file
View File

@@ -0,0 +1,40 @@
// Split a primary read request into blockstore reads of individual objects.
// Each "double-stripe" <s> spans two blockstore objects with roles 1 and 2,
// whose object ids are (s << STRIPE_ROLE_BITS) | role.
// NOTE(review): this file looks like a draft - `std::vector read_parts` has
// no element type, and `bs`, `cur_op` and STRIPE_ROLE_BITS are not declared
// in this file; it cannot compile as-is.
void slice()
{
    // Slice the request into blockstore requests to individual objects
    // Primary OSD still operates individual stripes, except they're twice the size of the blockstore's stripe.
    std::vector read_parts;
    int block = bs->get_block_size();
    // First and last double-stripe touched by [offset, offset+len)
    uint64_t stripe1 = cur_op->req.rw.offset / block / 2;
    uint64_t stripe2 = (cur_op->req.rw.offset + cur_op->req.rw.len + block*2 - 1) / block / 2 - 1;
    for (uint64_t s = stripe1; s <= stripe2; s++)
    {
        // Byte range of the request within this double-stripe
        uint64_t start = s == stripe1 ? cur_op->req.rw.offset - stripe1*block*2 : 0;
        uint64_t end = s == stripe2 ? cur_op->req.rw.offset + cur_op->req.rw.len - stripe2*block*2 : block*2;
        if (start < block)
        {
            // Part of the range falls into the first half (role 1)
            read_parts.push_back({
                .role = 1,
                .oid = {
                    .inode = cur_op->req.rw.inode,
                    .stripe = (s << STRIPE_ROLE_BITS) | 1,
                },
                .version = UINT64_MAX, // UINT64_MAX presumably means "read latest version" - confirm
                .offset = start,
                .len = (block < end ? block : end) - start,
            });
        }
        if (end > block)
        {
            // Part of the range falls into the second half (role 2)
            read_parts.push_back({
                .role = 2,
                .oid = {
                    .inode = cur_op->req.rw.inode,
                    .stripe = (s << STRIPE_ROLE_BITS) | 2,
                },
                .version = UINT64_MAX,
                .offset = (start > block ? start-block : 0),
                .len = end - (start > block ? start-block : 0),
            });
        }
    }
}

View File

@@ -83,7 +83,7 @@ void osd_t::parse_test_peer(std::string peer)
{ "addresses", json11::Json::array { addr } }, { "addresses", json11::Json::array { addr } },
{ "port", port }, { "port", port },
}; };
c_cli.connect_peer(peer_osd, st_cli.peer_states[peer_osd]); c_cli.connect_peer(peer_osd, json11::Json::array { addr }, port);
} }
json11::Json osd_t::get_osd_state() json11::Json osd_t::get_osd_state()
@@ -211,7 +211,7 @@ void osd_t::on_change_osd_state_hook(uint64_t peer_osd)
{ {
if (c_cli.wanted_peers.find(peer_osd) != c_cli.wanted_peers.end()) if (c_cli.wanted_peers.find(peer_osd) != c_cli.wanted_peers.end())
{ {
c_cli.connect_peer(peer_osd, st_cli.peer_states[peer_osd]); c_cli.connect_peer(peer_osd, st_cli.peer_states[peer_osd]["addresses"], st_cli.peer_states[peer_osd]["port"].int64_value());
} }
} }
@@ -229,14 +229,8 @@ void osd_t::on_load_config_hook(json11::Json::object & global_config)
{ {
if (this->config.find(cfg_var.first) == this->config.end()) if (this->config.find(cfg_var.first) == this->config.end())
{ {
if (cfg_var.second.is_string()) // FIXME Convert int to str
{ osd_config[cfg_var.first] = cfg_var.second.string_value();
osd_config[cfg_var.first] = cfg_var.second.string_value();
}
else
{
osd_config[cfg_var.first] = cfg_var.second.dump();
}
} }
} }
parse_config(osd_config); parse_config(osd_config);
@@ -562,7 +556,7 @@ void osd_t::apply_pg_config()
{ {
if (pg_osd != this->osd_num && c_cli.osd_peer_fds.find(pg_osd) == c_cli.osd_peer_fds.end()) if (pg_osd != this->osd_num && c_cli.osd_peer_fds.find(pg_osd) == c_cli.osd_peer_fds.end())
{ {
c_cli.connect_peer(pg_osd, st_cli.peer_states[pg_osd]); c_cli.connect_peer(pg_osd, st_cli.peer_states[pg_osd]["addresses"], st_cli.peer_states[pg_osd]["port"].int64_value());
} }
} }
start_pg_peering(pg_num); start_pg_peering(pg_num);

View File

@@ -78,12 +78,9 @@ void osd_t::handle_flush_op(bool rollback, pg_num_t pg_num, pg_flush_batch_t *fb
} }
else else
{ {
printf("Error while doing flush on OSD %lu: %d (%s)\n", osd_num, retval, strerror(-retval)); printf("Error while doing flush on OSD %lu: %s\n", osd_num, strerror(-retval));
auto fd_it = c_cli.osd_peer_fds.find(peer_osd); assert(c_cli.osd_peer_fds.find(peer_osd) != c_cli.osd_peer_fds.end());
if (fd_it != c_cli.osd_peer_fds.end()) c_cli.stop_client(c_cli.osd_peer_fds[peer_osd]);
{
c_cli.stop_client(fd_it->second);
}
return; return;
} }
} }
@@ -273,10 +270,9 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
throw std::runtime_error("Failed to recover an object"); throw std::runtime_error("Failed to recover an object");
} }
} }
// CAREFUL! op = &recovery_ops[op->oid]. Don't access op->* after recovery_ops.erase()
op->osd_op = NULL;
recovery_ops.erase(op->oid); recovery_ops.erase(op->oid);
delete osd_op; delete osd_op;
op->osd_op = NULL;
continue_recovery(); continue_recovery();
}; };
exec_op(op->osd_op); exec_op(op->osd_op);

View File

@@ -120,7 +120,7 @@ void osd_t::start_pg_peering(pg_num_t pg_num)
pg.flush_batch = NULL; pg.flush_batch = NULL;
for (auto p: pg.write_queue) for (auto p: pg.write_queue)
{ {
cancel_primary_write(p.second); finish_op(p.second, -EPIPE);
} }
pg.write_queue.clear(); pg.write_queue.clear();
for (auto it = unstable_writes.begin(); it != unstable_writes.end(); ) for (auto it = unstable_writes.begin(); it != unstable_writes.end(); )
@@ -132,6 +132,7 @@ void osd_t::start_pg_peering(pg_num_t pg_num)
else else
it++; it++;
} }
pg.inflight = 0;
dirty_pgs.erase(pg.pg_num); dirty_pgs.erase(pg.pg_num);
// Calculate current write OSD set // Calculate current write OSD set
pg.pg_cursize = 0; pg.pg_cursize = 0;
@@ -187,7 +188,7 @@ void osd_t::start_pg_peering(pg_num_t pg_num)
} }
else if (c_cli.wanted_peers.find(pg_osd) == c_cli.wanted_peers.end()) else if (c_cli.wanted_peers.find(pg_osd) == c_cli.wanted_peers.end())
{ {
c_cli.connect_peer(pg_osd, st_cli.peer_states[pg_osd]); c_cli.connect_peer(pg_osd, st_cli.peer_states[pg_osd]["addresses"], st_cli.peer_states[pg_osd]["port"].int64_value());
} }
} }
pg.cur_peers.insert(pg.cur_peers.begin(), cur_peers.begin(), cur_peers.end()); pg.cur_peers.insert(pg.cur_peers.begin(), cur_peers.begin(), cur_peers.end());

View File

@@ -206,6 +206,17 @@ void pg_obj_state_check_t::finish_object()
if (log_level > 1) if (log_level > 1)
{ {
printf("Object is incomplete: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver); printf("Object is incomplete: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
for (int i = ver_start; i < ver_end; i++)
{
printf("Present on: osd %lu, role %ld%s\n", list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
}
}
if (log_level > 2)
{
for (int i = obj_start; i < obj_end; i++)
{
printf("v%lu present on: osd %lu, role %ld%s\n", list[i].version, list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
}
} }
state = OBJ_INCOMPLETE; state = OBJ_INCOMPLETE;
pg->state = pg->state | PG_HAS_INCOMPLETE; pg->state = pg->state | PG_HAS_INCOMPLETE;
@@ -215,21 +226,11 @@ void pg_obj_state_check_t::finish_object()
if (log_level > 1) if (log_level > 1)
{ {
printf("Object is degraded: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver); printf("Object is degraded: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
for (int i = ver_start; i < ver_end; i++)
{
printf("Present on: osd %lu, role %ld%s\n", list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
}
} }
state = OBJ_DEGRADED;
pg->state = pg->state | PG_HAS_DEGRADED;
}
if (n_mismatched > 0)
{
if (n_roles >= pg->pg_cursize && log_level > 1)
{
printf("Object is misplaced: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
}
state |= OBJ_MISPLACED;
pg->state = pg->state | PG_HAS_MISPLACED;
}
if (log_level > 1 && (n_roles < pg->pg_cursize || n_mismatched > 0))
{
if (log_level > 2) if (log_level > 2)
{ {
for (int i = obj_start; i < obj_end; i++) for (int i = obj_start; i < obj_end; i++)
@@ -237,13 +238,13 @@ void pg_obj_state_check_t::finish_object()
printf("v%lu present on: osd %lu, role %ld%s\n", list[i].version, list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : ""); printf("v%lu present on: osd %lu, role %ld%s\n", list[i].version, list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
} }
} }
else state = OBJ_DEGRADED;
{ pg->state = pg->state | PG_HAS_DEGRADED;
for (int i = ver_start; i < ver_end; i++) }
{ if (n_mismatched > 0)
printf("Target version present on: osd %lu, role %ld%s\n", list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : ""); {
} state |= OBJ_MISPLACED;
} pg->state = pg->state | PG_HAS_MISPLACED;
} }
pg->total_count++; pg->total_count++;
if (state != 0 || ver_end < obj_end) if (state != 0 || ver_end < obj_end)

View File

@@ -13,14 +13,9 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
{ {
// PG number is calculated from the offset // PG number is calculated from the offset
// Our EC scheme stores data in fixed chunks equal to (K*block size) // Our EC scheme stores data in fixed chunks equal to (K*block size)
// K = pg_minsize and will be a property of the inode. Not it's hardcoded (FIXME) // But we must not use K in the process of calculating the PG number
uint64_t pg_block_size = bs_block_size * 2; // So we calculate the PG number using a separate setting which should be per-inode (FIXME)
object_id oid = { pg_num_t pg_num = (cur_op->req.rw.inode + cur_op->req.rw.offset / pg_stripe_size) % pg_count + 1;
.inode = cur_op->req.rw.inode,
// oid.stripe = starting offset of the parity stripe
.stripe = (cur_op->req.rw.offset/pg_block_size)*pg_block_size,
};
pg_num_t pg_num = (cur_op->req.rw.inode + oid.stripe/pg_stripe_size) % pg_count + 1;
auto pg_it = pgs.find(pg_num); auto pg_it = pgs.find(pg_num);
if (pg_it == pgs.end() || !(pg_it->second.state & PG_ACTIVE)) if (pg_it == pgs.end() || !(pg_it->second.state & PG_ACTIVE))
{ {
@@ -28,6 +23,13 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
finish_op(cur_op, -EPIPE); finish_op(cur_op, -EPIPE);
return false; return false;
} }
uint64_t pg_block_size = bs_block_size * pg_it->second.pg_minsize;
object_id oid = {
.inode = cur_op->req.rw.inode,
// oid.stripe = starting offset of the parity stripe, so it can be mapped back to the PG
.stripe = (cur_op->req.rw.offset / pg_stripe_size) * pg_stripe_size +
((cur_op->req.rw.offset % pg_stripe_size) / pg_block_size) * pg_block_size
};
if ((cur_op->req.rw.offset + cur_op->req.rw.len) > (oid.stripe + pg_block_size) || if ((cur_op->req.rw.offset + cur_op->req.rw.len) > (oid.stripe + pg_block_size) ||
(cur_op->req.rw.offset % bs_disk_alignment) != 0 || (cur_op->req.rw.offset % bs_disk_alignment) != 0 ||
(cur_op->req.rw.len % bs_disk_alignment) != 0) (cur_op->req.rw.len % bs_disk_alignment) != 0)
@@ -196,8 +198,8 @@ void osd_t::continue_primary_write(osd_op_t *cur_op)
else if (op_data->st == 6) goto resume_6; else if (op_data->st == 6) goto resume_6;
else if (op_data->st == 7) goto resume_7; else if (op_data->st == 7) goto resume_7;
else if (op_data->st == 8) goto resume_8; else if (op_data->st == 8) goto resume_8;
else if (op_data->st == 9) goto resume_9;
assert(op_data->st == 0); assert(op_data->st == 0);
printf("primary_write\n");
if (!check_write_queue(cur_op, pg)) if (!check_write_queue(cur_op, pg))
{ {
return; return;
@@ -217,7 +219,7 @@ resume_2:
resume_3: resume_3:
if (op_data->errors > 0) if (op_data->errors > 0)
{ {
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO); pg_cancel_write_queue(pg, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
return; return;
} }
// Save version override for parallel reads // Save version override for parallel reads
@@ -232,7 +234,7 @@ resume_4:
resume_5: resume_5:
if (op_data->errors > 0) if (op_data->errors > 0)
{ {
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO); pg_cancel_write_queue(pg, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
return; return;
} }
if (op_data->fact_ver == 1) if (op_data->fact_ver == 1)
@@ -262,13 +264,12 @@ resume_5:
submit_primary_del_subops(cur_op, pg.cur_set.data(), op_data->object_state->osd_set); submit_primary_del_subops(cur_op, pg.cur_set.data(), op_data->object_state->osd_set);
if (op_data->n_subops > 0) if (op_data->n_subops > 0)
{ {
resume_8:
op_data->st = 8; op_data->st = 8;
return; return;
resume_9: resume_8:
if (op_data->errors > 0) if (op_data->errors > 0)
{ {
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO); pg_cancel_write_queue(pg, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
return; return;
} }
} }
@@ -282,7 +283,7 @@ resume_9:
// FIXME: Check for immediate_commit == IMMEDIATE_SMALL // FIXME: Check for immediate_commit == IMMEDIATE_SMALL
resume_6: resume_6:
resume_7: resume_7:
if (!remember_unstable_write(cur_op, pg, pg.cur_loc_set, 6)) if (!finalize_primary_write(cur_op, pg, pg.cur_loc_set, 6))
{ {
return; return;
} }
@@ -291,19 +292,17 @@ resume_7:
// Continue other write operations to the same object // Continue other write operations to the same object
auto next_it = pg.write_queue.find(oid); auto next_it = pg.write_queue.find(oid);
auto this_it = next_it; auto this_it = next_it;
if (this_it != pg.write_queue.end() && this_it->second == cur_op) next_it++;
pg.write_queue.erase(this_it);
if (next_it != pg.write_queue.end() &&
next_it->first == oid)
{ {
next_it++; osd_op_t *next_op = next_it->second;
pg.write_queue.erase(this_it); continue_primary_write(next_op);
if (next_it != pg.write_queue.end() && next_it->first == oid)
{
osd_op_t *next_op = next_it->second;
continue_primary_write(next_op);
}
} }
} }
bool osd_t::remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state) bool osd_t::finalize_primary_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state)
{ {
osd_primary_op_data_t *op_data = cur_op->op_data; osd_primary_op_data_t *op_data = cur_op->op_data;
if (op_data->st == base_state) if (op_data->st == base_state)
@@ -349,7 +348,7 @@ resume_7:
op_data->unstable_write_osds = NULL; op_data->unstable_write_osds = NULL;
if (op_data->errors > 0) if (op_data->errors > 0)
{ {
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO); pg_cancel_write_queue(pg, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
return false; return false;
} }
} }
@@ -389,6 +388,7 @@ void osd_t::continue_primary_sync(osd_op_t *cur_op)
else if (op_data->st == 5) goto resume_5; else if (op_data->st == 5) goto resume_5;
else if (op_data->st == 6) goto resume_6; else if (op_data->st == 6) goto resume_6;
assert(op_data->st == 0); assert(op_data->st == 0);
printf("primary_sync\n");
if (syncs_in_progress.size() > 0) if (syncs_in_progress.size() > 0)
{ {
// Wait for previous syncs, if any // Wait for previous syncs, if any
@@ -596,6 +596,8 @@ void osd_t::continue_primary_del(osd_op_t *cur_op)
else if (op_data->st == 3) goto resume_3; else if (op_data->st == 3) goto resume_3;
else if (op_data->st == 4) goto resume_4; else if (op_data->st == 4) goto resume_4;
else if (op_data->st == 5) goto resume_5; else if (op_data->st == 5) goto resume_5;
else if (op_data->st == 6) goto resume_6;
else if (op_data->st == 7) goto resume_7;
assert(op_data->st == 0); assert(op_data->st == 0);
// Delete is forbidden even in active PGs if they're also degraded or have previous dead OSDs // Delete is forbidden even in active PGs if they're also degraded or have previous dead OSDs
if (pg.state & (PG_DEGRADED | PG_LEFT_ON_DEAD)) if (pg.state & (PG_DEGRADED | PG_LEFT_ON_DEAD))
@@ -618,7 +620,7 @@ resume_2:
resume_3: resume_3:
if (op_data->errors > 0) if (op_data->errors > 0)
{ {
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO); pg_cancel_write_queue(pg, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
return; return;
} }
// Save version override for parallel reads // Save version override for parallel reads
@@ -632,11 +634,17 @@ resume_4:
resume_5: resume_5:
if (op_data->errors > 0) if (op_data->errors > 0)
{ {
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO); pg_cancel_write_queue(pg, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
return; return;
} }
// Remove version override // Remove version override
pg.ver_override.erase(op_data->oid); pg.ver_override.erase(op_data->oid);
resume_6:
resume_7:
if (!finalize_primary_write(cur_op, pg, op_data->object_state ? op_data->object_state->osd_set : pg.cur_loc_set, 6))
{
return;
}
// Adjust PG stats after "instant stabilize", because we need object_state above // Adjust PG stats after "instant stabilize", because we need object_state above
if (!op_data->object_state) if (!op_data->object_state)
{ {
@@ -652,15 +660,12 @@ resume_5:
// Continue other write operations to the same object // Continue other write operations to the same object
auto next_it = pg.write_queue.find(oid); auto next_it = pg.write_queue.find(oid);
auto this_it = next_it; auto this_it = next_it;
if (this_it != pg.write_queue.end() && this_it->second == cur_op) next_it++;
pg.write_queue.erase(this_it);
if (next_it != pg.write_queue.end() &&
next_it->first == oid)
{ {
next_it++; osd_op_t *next_op = next_it->second;
pg.write_queue.erase(this_it); continue_primary_write(next_op);
if (next_it != pg.write_queue.end() &&
next_it->first == oid)
{
osd_op_t *next_op = next_it->second;
continue_primary_write(next_op);
}
} }
} }

View File

@@ -33,24 +33,15 @@ void osd_t::autosync()
void osd_t::finish_op(osd_op_t *cur_op, int retval) void osd_t::finish_op(osd_op_t *cur_op, int retval)
{ {
inflight_ops--; inflight_ops--;
if (cur_op->op_data) if (cur_op->op_data && cur_op->op_data->pg_num > 0)
{ {
if (cur_op->op_data->pg_num > 0) auto & pg = pgs[cur_op->op_data->pg_num];
pg.inflight--;
assert(pg.inflight >= 0);
if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
{ {
auto & pg = pgs[cur_op->op_data->pg_num]; finish_stop_pg(pg);
pg.inflight--;
assert(pg.inflight >= 0);
if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
{
finish_stop_pg(pg);
}
} }
assert(!cur_op->op_data->subops);
assert(!cur_op->op_data->unstable_write_osds);
assert(!cur_op->op_data->unstable_writes);
assert(!cur_op->op_data->dirty_pgs);
free(cur_op->op_data);
cur_op->op_data = NULL;
} }
if (!cur_op->peer_fd) if (!cur_op->peer_fd)
{ {
@@ -138,13 +129,6 @@ void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t*
.len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start, .len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
.buf = w ? stripes[role].write_buf : stripes[role].read_buf, .buf = w ? stripes[role].write_buf : stripes[role].read_buf,
}); });
#ifdef OSD_DEBUG
printf(
"Submit %s to local: %lu:%lu v%lu %u-%u\n", w ? "write" : "read",
op_data->oid.inode, op_data->oid.stripe | role, op_version,
subops[i].bs_op->offset, subops[i].bs_op->len
);
#endif
bs->enqueue_op(subops[i].bs_op); bs->enqueue_op(subops[i].bs_op);
} }
else else
@@ -166,13 +150,6 @@ void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t*
.offset = w ? stripes[role].write_start : stripes[role].read_start, .offset = w ? stripes[role].write_start : stripes[role].read_start,
.len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start, .len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
}; };
#ifdef OSD_DEBUG
printf(
"Submit %s to osd %lu: %lu:%lu v%lu %u-%u\n", w ? "write" : "read", role_osd_num,
op_data->oid.inode, op_data->oid.stripe | role, op_version,
subops[i].req.sec_rw.offset, subops[i].req.sec_rw.len
);
#endif
subops[i].buf = w ? stripes[role].write_buf : stripes[role].read_buf; subops[i].buf = w ? stripes[role].write_buf : stripes[role].read_buf;
if (w && stripes[role].write_end > 0) if (w && stripes[role].write_end > 0)
{ {
@@ -184,7 +161,10 @@ void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t*
subop->reply.hdr.retval != subop->req.sec_rw.len ? subop->peer_fd : -1; subop->reply.hdr.retval != subop->req.sec_rw.len ? subop->peer_fd : -1;
// so it doesn't get freed // so it doesn't get freed
subop->buf = NULL; subop->buf = NULL;
handle_primary_subop(subop, cur_op); handle_primary_subop(
subop->req.hdr.opcode, cur_op, subop->reply.hdr.retval,
subop->req.sec_rw.len, subop->reply.sec_rw.version
);
if (fail_fd >= 0) if (fail_fd >= 0)
{ {
// write operation failed, drop the connection // write operation failed, drop the connection
@@ -224,16 +204,12 @@ void osd_t::handle_primary_bs_subop(osd_op_t *subop)
); );
} }
add_bs_subop_stats(subop); add_bs_subop_stats(subop);
subop->req.hdr.opcode = bs_op_to_osd_op[bs_op->opcode]; uint64_t opcode = bs_op_to_osd_op[bs_op->opcode];
subop->reply.hdr.retval = bs_op->retval; int retval = bs_op->retval;
if (bs_op->opcode == BS_OP_READ || bs_op->opcode == BS_OP_WRITE) uint64_t version = bs_op->version;
{
subop->req.sec_rw.len = bs_op->len;
subop->reply.sec_rw.version = bs_op->version;
}
delete bs_op; delete bs_op;
subop->bs_op = NULL; subop->bs_op = NULL;
handle_primary_subop(subop, cur_op); handle_primary_subop(opcode, cur_op, retval, expected, version);
} }
void osd_t::add_bs_subop_stats(osd_op_t *subop) void osd_t::add_bs_subop_stats(osd_op_t *subop)
@@ -259,12 +235,8 @@ void osd_t::add_bs_subop_stats(osd_op_t *subop)
} }
} }
void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op) void osd_t::handle_primary_subop(uint64_t opcode, osd_op_t *cur_op, int retval, int expected, uint64_t version)
{ {
uint64_t opcode = subop->req.hdr.opcode;
int retval = subop->reply.hdr.retval;
int expected = opcode == OSD_OP_SECONDARY_READ || opcode == OSD_OP_SECONDARY_WRITE
? subop->req.sec_rw.len : 0;
osd_primary_op_data_t *op_data = cur_op->op_data; osd_primary_op_data_t *op_data = cur_op->op_data;
if (retval != expected) if (retval != expected)
{ {
@@ -280,12 +252,6 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
op_data->done++; op_data->done++;
if (opcode == OSD_OP_SECONDARY_READ || opcode == OSD_OP_SECONDARY_WRITE) if (opcode == OSD_OP_SECONDARY_READ || opcode == OSD_OP_SECONDARY_WRITE)
{ {
uint64_t version = subop->reply.sec_rw.version;
#ifdef OSD_DEBUG
uint64_t peer_osd = c_cli.clients.find(subop->peer_fd) != c_cli.clients.end()
? c_cli.clients[subop->peer_fd].osd_num : osd_num;
printf("subop %lu from osd %lu: version = %lu\n", opcode, peer_osd, version);
#endif
if (op_data->fact_ver != 0 && op_data->fact_ver != version) if (op_data->fact_ver != 0 && op_data->fact_ver != version)
{ {
throw std::runtime_error( throw std::runtime_error(
@@ -324,23 +290,6 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
} }
} }
void osd_t::cancel_primary_write(osd_op_t *cur_op)
{
if (cur_op->op_data && cur_op->op_data->subops)
{
// Primary-write operation is waiting for subops, subops
// are sent to peer OSDs, so we can't just throw them away.
// Mark them with an extra EPIPE.
cur_op->op_data->errors++;
cur_op->op_data->epipe++;
cur_op->op_data->done--; // Caution: `done` must be signed because may become -1 here
}
else
{
finish_op(cur_op, -EPIPE);
}
}
void osd_t::submit_primary_del_subops(osd_op_t *cur_op, uint64_t *cur_set, pg_osd_set_t & loc_set) void osd_t::submit_primary_del_subops(osd_op_t *cur_op, uint64_t *cur_set, pg_osd_set_t & loc_set)
{ {
osd_primary_op_data_t *op_data = cur_op->op_data; osd_primary_op_data_t *op_data = cur_op->op_data;
@@ -405,7 +354,7 @@ void osd_t::submit_primary_del_subops(osd_op_t *cur_op, uint64_t *cur_set, pg_os
subops[i].callback = [cur_op, this](osd_op_t *subop) subops[i].callback = [cur_op, this](osd_op_t *subop)
{ {
int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1; int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
handle_primary_subop(subop, cur_op); handle_primary_subop(OSD_OP_SECONDARY_DELETE, cur_op, subop->reply.hdr.retval, 0, 0);
if (fail_fd >= 0) if (fail_fd >= 0)
{ {
// delete operation failed, drop the connection // delete operation failed, drop the connection
@@ -458,7 +407,7 @@ void osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
subops[i].callback = [cur_op, this](osd_op_t *subop) subops[i].callback = [cur_op, this](osd_op_t *subop)
{ {
int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1; int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
handle_primary_subop(subop, cur_op); handle_primary_subop(OSD_OP_SECONDARY_SYNC, cur_op, subop->reply.hdr.retval, 0, 0);
if (fail_fd >= 0) if (fail_fd >= 0)
{ {
// sync operation failed, drop the connection // sync operation failed, drop the connection
@@ -513,7 +462,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
subops[i].callback = [cur_op, this](osd_op_t *subop) subops[i].callback = [cur_op, this](osd_op_t *subop)
{ {
int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1; int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
handle_primary_subop(subop, cur_op); handle_primary_subop(OSD_OP_SECONDARY_STABILIZE, cur_op, subop->reply.hdr.retval, 0, 0);
if (fail_fd >= 0) if (fail_fd >= 0)
{ {
// sync operation failed, drop the connection // sync operation failed, drop the connection
@@ -525,20 +474,9 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
} }
} }
void osd_t::pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid, int retval) void osd_t::pg_cancel_write_queue(pg_t & pg, object_id oid, int retval)
{ {
auto st_it = pg.write_queue.find(oid), it = st_it; auto st_it = pg.write_queue.find(oid), it = st_it;
finish_op(first_op, retval);
if (it != pg.write_queue.end() && it->second == first_op)
{
it++;
}
else
{
// Write queue doesn't match the first operation.
// first_op is a leftover operation from the previous peering of the same PG.
return;
}
while (it != pg.write_queue.end() && it->first == oid) while (it != pg.write_queue.end() && it->first == oid)
{ {
finish_op(it->second, retval); finish_op(it->second, retval);

View File

@@ -1,11 +1,23 @@
#include "messenger.h" #include "cluster_client.h"
void osd_messenger_t::read_requests() void cluster_client_t::read_requests()
{ {
while (read_ready_clients.size() > 0) for (int i = 0; i < read_ready_clients.size(); i++)
{ {
int peer_fd = read_ready_clients[0]; int peer_fd = read_ready_clients[i];
auto & cl = clients[peer_fd]; auto & cl = clients[peer_fd];
{
timespec now;
clock_gettime(CLOCK_REALTIME, &now);
printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
}
io_uring_sqe* sqe = ringloop->get_sqe();
if (!sqe)
{
read_ready_clients.erase(read_ready_clients.begin(), read_ready_clients.begin() + i);
return;
}
ring_data_t* data = ((ring_data_t*)sqe->user_data);
if (!cl.read_op || cl.read_remaining < receive_buffer_size) if (!cl.read_op || cl.read_remaining < receive_buffer_size)
{ {
cl.read_iov.iov_base = cl.in_buf; cl.read_iov.iov_base = cl.in_buf;
@@ -18,30 +30,26 @@ void osd_messenger_t::read_requests()
} }
cl.read_msg.msg_iov = &cl.read_iov; cl.read_msg.msg_iov = &cl.read_iov;
cl.read_msg.msg_iovlen = 1; cl.read_msg.msg_iovlen = 1;
read_ready_clients.erase(read_ready_clients.begin(), read_ready_clients.begin() + 1); data->callback = [this, peer_fd](ring_data_t *data) { handle_read(data, peer_fd); };
int result = recvmsg(peer_fd, &cl.read_msg, 0); my_uring_prep_recvmsg(sqe, peer_fd, &cl.read_msg, 0);
if (result < 0)
{
result = -errno;
}
handle_read(result, peer_fd);
} }
read_ready_clients.clear();
} }
bool osd_messenger_t::handle_read(int result, int peer_fd) void cluster_client_t::handle_read(ring_data_t *data, int peer_fd)
{ {
auto cl_it = clients.find(peer_fd); auto cl_it = clients.find(peer_fd);
if (cl_it != clients.end()) if (cl_it != clients.end())
{ {
auto & cl = cl_it->second; auto & cl = cl_it->second;
if (result < 0 && result != -EAGAIN) if (data->res < 0 && data->res != -EAGAIN)
{ {
// this is a client socket, so don't panic. just disconnect it // this is a client socket, so don't panic. just disconnect it
printf("Client %d socket read error: %d (%s). Disconnecting client\n", peer_fd, -result, strerror(-result)); printf("Client %d socket read error: %d (%s). Disconnecting client\n", peer_fd, -data->res, strerror(-data->res));
stop_client(peer_fd); stop_client(peer_fd);
return false; return;
} }
if (result == -EAGAIN || result < cl.read_iov.iov_len) if (data->res == -EAGAIN || cl.read_iov.iov_base == cl.in_buf && data->res < receive_buffer_size)
{ {
cl.read_ready--; cl.read_ready--;
if (cl.read_ready > 0) if (cl.read_ready > 0)
@@ -51,12 +59,16 @@ bool osd_messenger_t::handle_read(int result, int peer_fd)
{ {
read_ready_clients.push_back(peer_fd); read_ready_clients.push_back(peer_fd);
} }
if (result > 0) if (data->res == -EAGAIN)
{
return;
}
if (data->res > 0)
{ {
if (cl.read_iov.iov_base == cl.in_buf) if (cl.read_iov.iov_base == cl.in_buf)
{ {
// Compose operation(s) from the buffer // Compose operation(s) from the buffer
int remain = result; int remain = data->res;
void *curbuf = cl.in_buf; void *curbuf = cl.in_buf;
while (remain > 0) while (remain > 0)
{ {
@@ -92,23 +104,18 @@ bool osd_messenger_t::handle_read(int result, int peer_fd)
else else
{ {
// Long data // Long data
cl.read_remaining -= result; cl.read_remaining -= data->res;
cl.read_buf += result; cl.read_buf += data->res;
if (cl.read_remaining <= 0) if (cl.read_remaining <= 0)
{ {
handle_finished_read(cl); handle_finished_read(cl);
} }
} }
if (result >= cl.read_iov.iov_len)
{
return true;
}
} }
} }
return false;
} }
void osd_messenger_t::handle_finished_read(osd_client_t & cl) void cluster_client_t::handle_finished_read(osd_client_t & cl)
{ {
if (cl.read_state == CL_READ_HDR) if (cl.read_state == CL_READ_HDR)
{ {
@@ -120,7 +127,6 @@ void osd_messenger_t::handle_finished_read(osd_client_t & cl)
else if (cl.read_state == CL_READ_DATA) else if (cl.read_state == CL_READ_DATA)
{ {
// Operation is ready // Operation is ready
cl.received_ops.push_back(cl.read_op);
exec_op(cl.read_op); exec_op(cl.read_op);
cl.read_op = NULL; cl.read_op = NULL;
cl.read_state = 0; cl.read_state = 0;
@@ -156,7 +162,7 @@ void osd_messenger_t::handle_finished_read(osd_client_t & cl)
} }
} }
void osd_messenger_t::handle_op_hdr(osd_client_t *cl) void cluster_client_t::handle_op_hdr(osd_client_t *cl)
{ {
osd_op_t *cur_op = cl->read_op; osd_op_t *cur_op = cl->read_op;
if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_READ) if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_READ)
@@ -201,12 +207,11 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
// Operation is ready // Operation is ready
cl->read_op = NULL; cl->read_op = NULL;
cl->read_state = 0; cl->read_state = 0;
cl->received_ops.push_back(cur_op);
exec_op(cur_op); exec_op(cur_op);
} }
} }
void osd_messenger_t::handle_reply_hdr(osd_client_t *cl) void cluster_client_t::handle_reply_hdr(osd_client_t *cl)
{ {
osd_op_t *cur_op = cl->read_op; osd_op_t *cur_op = cl->read_op;
auto req_it = cl->sent_ops.find(cur_op->req.hdr.id); auto req_it = cl->sent_ops.find(cur_op->req.hdr.id);
@@ -219,7 +224,7 @@ void osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
} }
osd_op_t *op = req_it->second; osd_op_t *op = req_it->second;
memcpy(op->reply.buf, cur_op->req.buf, OSD_PACKET_SIZE); memcpy(op->reply.buf, cur_op->req.buf, OSD_PACKET_SIZE);
if ((op->reply.hdr.opcode == OSD_OP_SECONDARY_READ || op->reply.hdr.opcode == OSD_OP_READ) && if (op->reply.hdr.opcode == OSD_OP_SECONDARY_READ &&
op->reply.hdr.retval > 0) op->reply.hdr.retval > 0)
{ {
// Read data. In this case we assume that the buffer is preallocated by the caller (!) // Read data. In this case we assume that the buffer is preallocated by the caller (!)
@@ -229,7 +234,8 @@ void osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
cl->read_buf = op->buf; cl->read_buf = op->buf;
cl->read_remaining = op->reply.hdr.retval; cl->read_remaining = op->reply.hdr.retval;
} }
else if (op->reply.hdr.opcode == OSD_OP_SECONDARY_LIST && op->reply.hdr.retval > 0) else if (op->reply.hdr.opcode == OSD_OP_SECONDARY_LIST &&
op->reply.hdr.retval > 0)
{ {
op->buf = memalign(MEM_ALIGNMENT, sizeof(obj_ver_id) * op->reply.hdr.retval); op->buf = memalign(MEM_ALIGNMENT, sizeof(obj_ver_id) * op->reply.hdr.retval);
cl->read_state = CL_READ_REPLY_DATA; cl->read_state = CL_READ_REPLY_DATA;
@@ -237,7 +243,8 @@ void osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
cl->read_buf = op->buf; cl->read_buf = op->buf;
cl->read_remaining = sizeof(obj_ver_id) * op->reply.hdr.retval; cl->read_remaining = sizeof(obj_ver_id) * op->reply.hdr.retval;
} }
else if (op->reply.hdr.opcode == OSD_OP_SHOW_CONFIG && op->reply.hdr.retval > 0) else if (op->reply.hdr.opcode == OSD_OP_SHOW_CONFIG &&
op->reply.hdr.retval > 0)
{ {
op->buf = malloc(op->reply.hdr.retval); op->buf = malloc(op->reply.hdr.retval);
cl->read_state = CL_READ_REPLY_DATA; cl->read_state = CL_READ_REPLY_DATA;

View File

@@ -1,6 +1,6 @@
#include "messenger.h" #include "cluster_client.h"
void osd_messenger_t::outbox_push(osd_op_t *cur_op) void cluster_client_t::outbox_push(osd_op_t *cur_op)
{ {
assert(cur_op->peer_fd); assert(cur_op->peer_fd);
auto & cl = clients.at(cur_op->peer_fd); auto & cl = clients.at(cur_op->peer_fd);
@@ -8,25 +8,6 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
{ {
clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin); clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
} }
else
{
// Check that operation actually belongs to this client
bool found = false;
for (auto it = cl.received_ops.begin(); it != cl.received_ops.end(); it++)
{
if (*it == cur_op)
{
found = true;
cl.received_ops.erase(it, it+1);
break;
}
}
if (!found)
{
delete cur_op;
return;
}
}
cl.outbox.push_back(cur_op); cl.outbox.push_back(cur_op);
if (cl.write_op || cl.outbox.size() > 1 || !try_send(cl)) if (cl.write_op || cl.outbox.size() > 1 || !try_send(cl))
{ {
@@ -37,11 +18,24 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
} }
ringloop->wakeup(); ringloop->wakeup();
} }
else
ringloop->submit();
} }
bool osd_messenger_t::try_send(osd_client_t & cl) bool cluster_client_t::try_send(osd_client_t & cl)
{ {
int peer_fd = cl.peer_fd; int peer_fd = cl.peer_fd;
{
timespec now;
clock_gettime(CLOCK_REALTIME, &now);
printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
}
io_uring_sqe* sqe = ringloop->get_sqe();
if (!sqe)
{
return false;
}
ring_data_t* data = ((ring_data_t*)sqe->user_data);
if (!cl.write_op) if (!cl.write_op)
{ {
// pick next command // pick next command
@@ -78,51 +72,53 @@ bool osd_messenger_t::try_send(osd_client_t & cl)
} }
cl.write_msg.msg_iov = cl.write_op->send_list.get_iovec(); cl.write_msg.msg_iov = cl.write_op->send_list.get_iovec();
cl.write_msg.msg_iovlen = cl.write_op->send_list.get_size(); cl.write_msg.msg_iovlen = cl.write_op->send_list.get_size();
int result = sendmsg(peer_fd, &cl.write_msg, MSG_NOSIGNAL); data->callback = [this, peer_fd](ring_data_t *data) { handle_send(data, peer_fd); };
if (result < 0) my_uring_prep_sendmsg(sqe, peer_fd, &cl.write_msg, 0);
result = -errno;
handle_send(result, peer_fd);
return true; return true;
} }
void osd_messenger_t::send_replies() void cluster_client_t::send_replies()
{ {
while (write_ready_clients.size() > 0) for (int i = 0; i < write_ready_clients.size(); i++)
{ {
auto & cl = clients[write_ready_clients[0]]; int peer_fd = write_ready_clients[i];
write_ready_clients.erase(write_ready_clients.begin(), write_ready_clients.begin() + 1); if (!try_send(clients[peer_fd]))
try_send(cl); {
write_ready_clients.erase(write_ready_clients.begin(), write_ready_clients.begin() + i);
return;
}
} }
write_ready_clients.clear();
} }
void osd_messenger_t::handle_send(int result, int peer_fd) void cluster_client_t::handle_send(ring_data_t *data, int peer_fd)
{ {
auto cl_it = clients.find(peer_fd); auto cl_it = clients.find(peer_fd);
if (cl_it != clients.end()) if (cl_it != clients.end())
{ {
auto & cl = cl_it->second; auto & cl = cl_it->second;
if (result < 0 && result != -EAGAIN) if (data->res < 0 && data->res != -EAGAIN)
{ {
// this is a client socket, so don't panic. just disconnect it // this is a client socket, so don't panic. just disconnect it
printf("Client %d socket write error: %d (%s). Disconnecting client\n", peer_fd, -result, strerror(-result)); printf("Client %d socket write error: %d (%s). Disconnecting client\n", peer_fd, -data->res, strerror(-data->res));
stop_client(peer_fd); stop_client(peer_fd);
return; return;
} }
if (result >= 0) if (data->res >= 0)
{ {
osd_op_t *cur_op = cl.write_op; osd_op_t *cur_op = cl.write_op;
while (result > 0 && cur_op->send_list.sent < cur_op->send_list.count) while (data->res > 0 && cur_op->send_list.sent < cur_op->send_list.count)
{ {
iovec & iov = cur_op->send_list.buf[cur_op->send_list.sent]; iovec & iov = cur_op->send_list.buf[cur_op->send_list.sent];
if (iov.iov_len <= result) if (iov.iov_len <= data->res)
{ {
result -= iov.iov_len; data->res -= iov.iov_len;
cur_op->send_list.sent++; cur_op->send_list.sent++;
} }
else else
{ {
iov.iov_len -= result; iov.iov_len -= data->res;
iov.iov_base += result; iov.iov_base += data->res;
break; break;
} }
} }

View File

@@ -19,8 +19,6 @@
int connect_osd(const char *osd_address, int osd_port); int connect_osd(const char *osd_address, int osd_port);
uint64_t test_read(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t version, uint64_t offset, uint64_t len);
uint64_t test_write(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t version, uint64_t pattern); uint64_t test_write(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t version, uint64_t pattern);
void* test_primary_read(int connect_fd, uint64_t inode, uint64_t offset, uint64_t len); void* test_primary_read(int connect_fd, uint64_t inode, uint64_t offset, uint64_t len);
@@ -107,7 +105,7 @@ int main3(int narg, char *args[])
return 0; return 0;
} }
int main4(int narg, char *args[]) int main(int narg, char *args[])
{ {
int connect_fd; int connect_fd;
// Cluster write (sync not implemented yet) // Cluster write (sync not implemented yet)
@@ -119,15 +117,6 @@ int main4(int narg, char *args[])
return 0; return 0;
} }
int main(int narg, char *args[])
{
int connect_fd;
connect_fd = connect_osd("192.168.7.2", 43051);
test_read(connect_fd, 1, 1039663104, UINT64_MAX, 0, 128*1024);
close(connect_fd);
return 0;
}
int connect_osd(const char *osd_address, int osd_port) int connect_osd(const char *osd_address, int osd_port)
{ {
struct sockaddr_in addr; struct sockaddr_in addr;
@@ -178,66 +167,6 @@ bool check_reply(int r, osd_any_op_t & op, osd_any_reply_t & reply, int expected
return true; return true;
} }
uint64_t test_read(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t version, uint64_t offset, uint64_t len)
{
osd_any_op_t op;
osd_any_reply_t reply;
op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
op.hdr.id = 1;
op.hdr.opcode = OSD_OP_SECONDARY_READ;
op.sec_rw.oid = {
.inode = inode,
.stripe = stripe,
};
op.sec_rw.version = version;
op.sec_rw.offset = offset;
op.sec_rw.len = len;
void *data = memalign(MEM_ALIGNMENT, op.sec_rw.len);
write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
int r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
if (!check_reply(r, op, reply, op.sec_rw.len))
{
free(data);
return 0;
}
r = read_blocking(connect_fd, data, len);
if (r != len)
{
free(data);
perror("read data");
return 0;
}
free(data);
printf("Read %lu:%lu v%lu = v%lu\n", inode, stripe, version, reply.sec_rw.version);
op.hdr.opcode = OSD_OP_SECONDARY_LIST;
op.sec_list.list_pg = 1;
op.sec_list.pg_count = 1;
op.sec_list.pg_stripe_size = 4*1024*1024;
write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
if (reply.hdr.retval < 0 || !check_reply(r, op, reply, reply.hdr.retval))
{
return 0;
}
data = memalign(MEM_ALIGNMENT, sizeof(obj_ver_id)*reply.hdr.retval);
r = read_blocking(connect_fd, data, sizeof(obj_ver_id)*reply.hdr.retval);
if (r != sizeof(obj_ver_id)*reply.hdr.retval)
{
free(data);
perror("read data");
return 0;
}
obj_ver_id *ov = (obj_ver_id*)data;
for (int i = 0; i < reply.hdr.retval; i++)
{
if (ov[i].oid.inode == inode && (ov[i].oid.stripe & ~(4096-1)) == (stripe & ~(4096-1)))
{
printf("list: %lu:%lu v%lu stable=%d\n", ov[i].oid.inode, ov[i].oid.stripe, ov[i].version, i < reply.sec_list.stable_count ? 1 : 0);
}
}
return 0;
}
uint64_t test_write(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t version, uint64_t pattern) uint64_t test_write(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t version, uint64_t pattern)
{ {
osd_any_op_t op; osd_any_op_t op;

View File

@@ -51,40 +51,6 @@ int write_blocking(int fd, void *write_buf, size_t remaining)
return done; return done;
} }
int readv_blocking(int fd, iovec *iov, int iovcnt)
{
int v = 0;
int done = 0;
while (v < iovcnt)
{
ssize_t r = readv(fd, iov, iovcnt);
if (r < 0)
{
if (errno != EAGAIN && errno != EPIPE)
{
perror("writev");
exit(1);
}
continue;
}
while (v < iovcnt)
{
if (iov[v].iov_len > r)
{
iov[v].iov_len -= r;
iov[v].iov_base += r;
break;
}
else
{
v++;
}
}
done += r;
}
return done;
}
int writev_blocking(int fd, iovec *iov, int iovcnt) int writev_blocking(int fd, iovec *iov, int iovcnt)
{ {
int v = 0; int v = 0;

View File

@@ -5,5 +5,4 @@
int read_blocking(int fd, void *read_buf, size_t remaining); int read_blocking(int fd, void *read_buf, size_t remaining);
int write_blocking(int fd, void *write_buf, size_t remaining); int write_blocking(int fd, void *write_buf, size_t remaining);
int readv_blocking(int fd, iovec *iov, int iovcnt);
int writev_blocking(int fd, iovec *iov, int iovcnt); int writev_blocking(int fd, iovec *iov, int iovcnt);

View File

@@ -25,37 +25,20 @@ int connect_stub(const char *server_address, int server_port);
void run_bench(int peer_fd); void run_bench(int peer_fd);
static uint64_t read_sum = 0, read_count = 0;
static uint64_t write_sum = 0, write_count = 0; static uint64_t write_sum = 0, write_count = 0;
static uint64_t sync_sum = 0, sync_count = 0; static uint64_t sync_sum = 0, sync_count = 0;
void handle_sigint(int sig) void handle_sigint(int sig)
{ {
printf("4k randread: %lu us avg\n", read_count ? read_sum/read_count : 0); printf("4k randwrite: %lu us avg\n", write_sum/write_count);
printf("4k randwrite: %lu us avg\n", write_count ? write_sum/write_count : 0); printf("sync: %lu us avg\n", sync_sum/sync_count);
printf("sync: %lu us avg\n", sync_count ? sync_sum/sync_count : 0);
exit(0); exit(0);
} }
int main(int narg, char *args[]) int main(int narg, char *args[])
{ {
if (narg < 2)
{
printf("USAGE: %s SERVER_IP [PORT]\n", args[0]);
return 1;
}
int port = 11203;
if (narg >= 3)
{
port = atoi(args[2]);
if (port <= 0 || port >= 65536)
{
printf("Bad port number\n");
return 1;
}
}
signal(SIGINT, handle_sigint); signal(SIGINT, handle_sigint);
int peer_fd = connect_stub(args[1], port); int peer_fd = connect_stub("127.0.0.1", 11203);
run_bench(peer_fd); run_bench(peer_fd);
close(peer_fd); close(peer_fd);
return 0; return 0;
@@ -115,37 +98,10 @@ void run_bench(int peer_fd)
osd_any_reply_t reply; osd_any_reply_t reply;
void *buf = NULL; void *buf = NULL;
int r; int r;
iovec iov[2];
timespec tv_begin, tv_end; timespec tv_begin, tv_end;
clock_gettime(CLOCK_REALTIME, &tv_begin); clock_gettime(CLOCK_REALTIME, &tv_begin);
while (1) while (1)
{ {
// read
op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
op.hdr.id = 1;
op.hdr.opcode = OSD_OP_SECONDARY_READ;
op.sec_rw.oid.inode = 3;
op.sec_rw.oid.stripe = (rand() << 17) % (1 << 29); // 512 MB
op.sec_rw.version = 0;
op.sec_rw.len = 4096;
op.sec_rw.offset = (rand() * op.sec_rw.len) % (1 << 17);
r = write_blocking(peer_fd, op.buf, OSD_PACKET_SIZE) == OSD_PACKET_SIZE;
if (!r)
break;
buf = malloc(op.sec_rw.len);
iov[0] = { reply.buf, OSD_PACKET_SIZE };
iov[1] = { buf, op.sec_rw.len };
r = readv_blocking(peer_fd, iov, 2) == (OSD_PACKET_SIZE + op.sec_rw.len);
free(buf);
if (!r || !check_reply(OSD_PACKET_SIZE, op, reply, op.sec_rw.len))
break;
clock_gettime(CLOCK_REALTIME, &tv_end);
read_count++;
read_sum += (
(tv_end.tv_sec - tv_begin.tv_sec)*1000000 +
tv_end.tv_nsec/1000 - tv_begin.tv_nsec/1000
);
tv_begin = tv_end;
// write // write
op.hdr.magic = SECONDARY_OSD_OP_MAGIC; op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
op.hdr.id = 1; op.hdr.id = 1;
@@ -157,9 +113,9 @@ void run_bench(int peer_fd)
op.sec_rw.offset = (rand() * op.sec_rw.len) % (1 << 17); op.sec_rw.offset = (rand() * op.sec_rw.len) % (1 << 17);
buf = malloc(op.sec_rw.len); buf = malloc(op.sec_rw.len);
memset(buf, rand() % 255, op.sec_rw.len); memset(buf, rand() % 255, op.sec_rw.len);
iov[0] = { op.buf, OSD_PACKET_SIZE }; r = write_blocking(peer_fd, op.buf, OSD_PACKET_SIZE) == OSD_PACKET_SIZE;
iov[1] = { buf, op.sec_rw.len }; if (r)
r = writev_blocking(peer_fd, iov, 2) == (OSD_PACKET_SIZE + op.sec_rw.len); r = write_blocking(peer_fd, buf, op.sec_rw.len) == op.sec_rw.len;
free(buf); free(buf);
if (!r) if (!r)
break; break;
@@ -172,7 +128,6 @@ void run_bench(int peer_fd)
(tv_end.tv_sec - tv_begin.tv_sec)*1000000 + (tv_end.tv_sec - tv_begin.tv_sec)*1000000 +
tv_end.tv_nsec/1000 - tv_begin.tv_nsec/1000 tv_end.tv_nsec/1000 - tv_begin.tv_nsec/1000
); );
tv_begin = tv_end;
// sync/stab // sync/stab
op.hdr.magic = SECONDARY_OSD_OP_MAGIC; op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
op.hdr.id = 1; op.hdr.id = 1;
@@ -183,12 +138,11 @@ void run_bench(int peer_fd)
r = read_blocking(peer_fd, reply.buf, OSD_PACKET_SIZE); r = read_blocking(peer_fd, reply.buf, OSD_PACKET_SIZE);
if (!check_reply(r, op, reply, 0)) if (!check_reply(r, op, reply, 0))
break; break;
clock_gettime(CLOCK_REALTIME, &tv_end); clock_gettime(CLOCK_REALTIME, &tv_begin);
sync_count++; sync_count++;
sync_sum += ( sync_sum += (
(tv_end.tv_sec - tv_begin.tv_sec)*1000000 + (tv_begin.tv_sec - tv_end.tv_sec)*1000000 +
tv_end.tv_nsec/1000 - tv_begin.tv_nsec/1000 tv_begin.tv_nsec/1000 - tv_end.tv_nsec/1000
); );
tv_begin = tv_end;
} }
} }

View File

@@ -1,129 +0,0 @@
/**
* Stub "OSD" implemented on top of osd_messenger to test & compare
* network performance with sync read/write and io_uring
*/
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <arpa/inet.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <stdlib.h>
#include <stdexcept>
#include "ringloop.h"
#include "epoll_manager.h"
#include "messenger.h"
int bind_stub(const char *bind_address, int bind_port);
void stub_exec_op(osd_messenger_t *msgr, osd_op_t *op);
int main(int narg, char *args[])
{
ring_consumer_t looper;
ring_loop_t *ringloop = new ring_loop_t(512);
epoll_manager_t *epmgr = new epoll_manager_t(ringloop);
osd_messenger_t *msgr = new osd_messenger_t();
msgr->osd_num = 1351;
msgr->tfd = epmgr->tfd;
msgr->ringloop = ringloop;
msgr->repeer_pgs = [](osd_num_t) {};
msgr->exec_op = [msgr](osd_op_t *op) { stub_exec_op(msgr, op); };
// Accept new connections
int listen_fd = bind_stub("0.0.0.0", 11203);
epmgr->set_fd_handler(listen_fd, [listen_fd, msgr](int fd, int events)
{
msgr->accept_connections(listen_fd);
});
looper.loop = [msgr, ringloop]()
{
msgr->read_requests();
msgr->send_replies();
ringloop->submit();
};
ringloop->register_consumer(&looper);
printf("stub_uring_osd: waiting for clients\n");
while (true)
{
ringloop->loop();
ringloop->wait();
}
delete msgr;
delete epmgr;
delete ringloop;
return 0;
}
int bind_stub(const char *bind_address, int bind_port)
{
int listen_backlog = 128;
int listen_fd = socket(AF_INET, SOCK_STREAM, 0);
if (listen_fd < 0)
{
throw std::runtime_error(std::string("socket: ") + strerror(errno));
}
int enable = 1;
setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable));
sockaddr_in addr;
int r;
if ((r = inet_pton(AF_INET, bind_address, &addr.sin_addr)) != 1)
{
close(listen_fd);
throw std::runtime_error("bind address "+std::string(bind_address)+(r == 0 ? " is not valid" : ": no ipv4 support"));
}
addr.sin_family = AF_INET;
addr.sin_port = htons(bind_port);
if (bind(listen_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
{
close(listen_fd);
throw std::runtime_error(std::string("bind: ") + strerror(errno));
}
if (listen(listen_fd, listen_backlog) < 0)
{
close(listen_fd);
throw std::runtime_error(std::string("listen: ") + strerror(errno));
}
fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
return listen_fd;
}
void stub_exec_op(osd_messenger_t *msgr, osd_op_t *op)
{
op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
op->reply.hdr.id = op->req.hdr.id;
op->reply.hdr.opcode = op->req.hdr.opcode;
op->send_list.push_back(op->reply.buf, OSD_PACKET_SIZE);
if (op->req.hdr.opcode == OSD_OP_SECONDARY_READ)
{
op->reply.hdr.retval = op->req.sec_rw.len;
op->buf = malloc(op->req.sec_rw.len);
op->send_list.push_back(op->buf, op->req.sec_rw.len);
}
else if (op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
{
op->reply.hdr.retval = op->req.sec_rw.len;
}
else if (op->req.hdr.opcode == OSD_OP_TEST_SYNC_STAB_ALL)
{
op->reply.hdr.retval = 0;
}
else
{
printf("client %d: unsupported stub opcode: %lu\n", op->peer_fd, op->req.hdr.opcode);
op->reply.hdr.retval = -EINVAL;
}
msgr->outbox_push(op);
}

View File

@@ -6,7 +6,7 @@
#include <string.h> #include <string.h>
#include "timerfd_manager.h" #include "timerfd_manager.h"
timerfd_manager_t::timerfd_manager_t(std::function<void(int, std::function<void(int, int)>)> set_fd_handler) timerfd_manager_t::timerfd_manager_t(std::function<void(int, bool, std::function<void(int, int)>)> set_fd_handler)
{ {
this->set_fd_handler = set_fd_handler; this->set_fd_handler = set_fd_handler;
wait_state = 0; wait_state = 0;
@@ -15,7 +15,7 @@ timerfd_manager_t::timerfd_manager_t(std::function<void(int, std::function<void(
{ {
throw std::runtime_error(std::string("timerfd_create: ") + strerror(errno)); throw std::runtime_error(std::string("timerfd_create: ") + strerror(errno));
} }
set_fd_handler(timerfd, [this](int fd, int events) set_fd_handler(timerfd, false, [this](int fd, int events)
{ {
handle_readable(); handle_readable();
}); });
@@ -23,7 +23,7 @@ timerfd_manager_t::timerfd_manager_t(std::function<void(int, std::function<void(
timerfd_manager_t::~timerfd_manager_t() timerfd_manager_t::~timerfd_manager_t()
{ {
set_fd_handler(timerfd, NULL); set_fd_handler(timerfd, false, NULL);
close(timerfd); close(timerfd);
} }

View File

@@ -26,9 +26,9 @@ class timerfd_manager_t
void trigger_nearest(); void trigger_nearest();
void handle_readable(); void handle_readable();
public: public:
std::function<void(int, std::function<void(int, int)>)> set_fd_handler; std::function<void(int, bool, std::function<void(int, int)>)> set_fd_handler;
timerfd_manager_t(std::function<void(int, std::function<void(int, int)>)> set_fd_handler); timerfd_manager_t(std::function<void(int, bool, std::function<void(int, int)>)> set_fd_handler);
~timerfd_manager_t(); ~timerfd_manager_t();
int set_timer(uint64_t millis, bool repeat, std::function<void(int)> callback); int set_timer(uint64_t millis, bool repeat, std::function<void(int)> callback);
void clear_timer(int timer_id); void clear_timer(int timer_id);