Compare commits

..

1 Commits

Author SHA1 Message Date
9c33de5e57 Test: use submit_and_wait() 2020-03-03 17:48:47 +03:00
76 changed files with 2545 additions and 10814 deletions

View File

@@ -1,46 +0,0 @@
#!/usr/bin/perl
use strict;
my $deps = {};
for my $line (split /\n/, `grep '^#include "' *.cpp *.h`)
{
if ($line =~ /^([^:]+):\#include "([^"]+)"/s)
{
$deps->{$1}->{$2} = 1;
}
}
my $added;
do
{
$added = 0;
for my $file (keys %$deps)
{
for my $dep (keys %{$deps->{$file}})
{
if ($deps->{$dep})
{
for my $subdep (keys %{$deps->{$dep}})
{
if (!$deps->{$file}->{$subdep})
{
$added = 1;
$deps->{$file}->{$subdep} = 1;
}
}
}
}
}
} while ($added);
for my $file (sort keys %$deps)
{
if ($file =~ /\.cpp$/)
{
my $obj = $file;
$obj =~ s/\.cpp$/.o/s;
print "$obj: $file ".join(" ", sort keys %{$deps->{$file}})."\n";
print "\tg++ \$(CXXFLAGS) -c -o \$\@ \$\<\n";
}
}

189
Makefile
View File

@@ -1,153 +1,66 @@
BLOCKSTORE_OBJS := allocator.o blockstore.o blockstore_impl.o blockstore_init.o blockstore_open.o blockstore_journal.o blockstore_read.o \
blockstore_write.o blockstore_sync.o blockstore_stable.o blockstore_rollback.o blockstore_flush.o crc32c.o ringloop.o
blockstore_write.o blockstore_sync.o blockstore_stable.o blockstore_rollback.o blockstore_flush.o crc32c.o ringloop.o timerfd_interval.o
# -fsanitize=address
CXXFLAGS := -g -O3 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fPIC -fdiagnostics-color=always
all: libfio_blockstore.so osd libfio_sec_osd.so libfio_cluster.so stub_osd stub_uring_osd stub_bench osd_test dump_journal
all: $(BLOCKSTORE_OBJS) libfio_blockstore.so osd libfio_sec_osd.so test_blockstore stub_osd stub_bench osd_test
clean:
rm -f *.o
dump_journal: dump_journal.cpp crc32c.o blockstore_journal.h
g++ $(CXXFLAGS) -o $@ $< crc32c.o
libblockstore.so: $(BLOCKSTORE_OBJS)
g++ $(CXXFLAGS) -o $@ -shared $(BLOCKSTORE_OBJS) -ltcmalloc_minimal -luring
libfio_blockstore.so: ./libblockstore.so fio_engine.o json11.o
g++ $(CXXFLAGS) -shared -o $@ fio_engine.o json11.o ./libblockstore.so -ltcmalloc_minimal -luring
OSD_OBJS := osd.o osd_secondary.o msgr_receive.o msgr_send.o osd_peering.o osd_flush.o osd_peering_pg.o \
osd_primary.o osd_primary_subops.o etcd_state_client.o messenger.o osd_cluster.o http_client.o pg_states.o \
osd_rmw.o json11.o base64.o timerfd_manager.o
osd: ./libblockstore.so osd_main.cpp osd.h osd_ops.h $(OSD_OBJS)
g++ $(CXXFLAGS) -o $@ osd_main.cpp $(OSD_OBJS) ./libblockstore.so -ltcmalloc_minimal -luring
stub_osd: stub_osd.o rw_blocking.o
g++ $(CXXFLAGS) -o $@ stub_osd.o rw_blocking.o -ltcmalloc_minimal
STUB_URING_OSD_OBJS := stub_uring_osd.o epoll_manager.o messenger.o msgr_send.o msgr_receive.o ringloop.o timerfd_manager.o json11.o
stub_uring_osd: $(STUB_URING_OSD_OBJS)
g++ $(CXXFLAGS) -o $@ -ltcmalloc_minimal $(STUB_URING_OSD_OBJS) -luring
stub_bench: stub_bench.cpp osd_ops.h rw_blocking.o
g++ $(CXXFLAGS) -o $@ stub_bench.cpp rw_blocking.o -ltcmalloc_minimal
osd_test: osd_test.cpp osd_ops.h rw_blocking.o
g++ $(CXXFLAGS) -o $@ osd_test.cpp rw_blocking.o -ltcmalloc_minimal
osd_peering_pg_test: osd_peering_pg_test.cpp osd_peering_pg.o
g++ $(CXXFLAGS) -o $@ $< osd_peering_pg.o -ltcmalloc_minimal
libfio_sec_osd.so: fio_sec_osd.o rw_blocking.o
g++ $(CXXFLAGS) -ltcmalloc_minimal -shared -o $@ fio_sec_osd.o rw_blocking.o
FIO_CLUSTER_OBJS := fio_cluster.o cluster_client.o epoll_manager.o etcd_state_client.o \
messenger.o msgr_send.o msgr_receive.o ringloop.o json11.o http_client.o pg_states.o timerfd_manager.o base64.o
libfio_cluster.so: $(FIO_CLUSTER_OBJS)
g++ $(CXXFLAGS) -ltcmalloc_minimal -shared -o $@ $(FIO_CLUSTER_OBJS) -luring
test_blockstore: ./libblockstore.so test_blockstore.cpp timerfd_interval.o
g++ $(CXXFLAGS) -o test_blockstore test_blockstore.cpp timerfd_interval.o ./libblockstore.so -ltcmalloc_minimal -luring
test: test.cpp osd_peering_pg.o
g++ $(CXXFLAGS) -o test test.cpp osd_peering_pg.o -luring -lm
test_allocator: test_allocator.cpp allocator.o
g++ $(CXXFLAGS) -o test_allocator test_allocator.cpp allocator.o
crc32c.o: crc32c.c crc32c.h
crc32c.o: crc32c.c
g++ $(CXXFLAGS) -c -o $@ $<
json11.o: json11/json11.cpp
g++ $(CXXFLAGS) -c -o json11.o json11/json11.cpp
# Autogenerated
allocator.o: allocator.cpp allocator.h
g++ $(CXXFLAGS) -c -o $@ $<
base64.o: base64.cpp base64.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore.o: blockstore.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_flush.o: blockstore_flush.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_impl.o: blockstore_impl.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_init.o: blockstore_init.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_journal.o: blockstore_journal.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_open.o: blockstore_open.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_read.o: blockstore_read.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_rollback.o: blockstore_rollback.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_stable.o: blockstore_stable.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_sync.o: blockstore_sync.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_write.o: blockstore_write.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
cluster_client.o: cluster_client.cpp cluster_client.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
dump_journal.o: dump_journal.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
epoll_manager.o: epoll_manager.cpp epoll_manager.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
etcd_state_client.o: etcd_state_client.cpp base64.h etcd_state_client.h http_client.h json11/json11.hpp object_id.h osd_id.h osd_ops.h pg_states.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
fio_cluster.o: fio_cluster.cpp cluster_client.h epoll_manager.h etcd_state_client.h fio/fio.h fio/optgroup.h http_client.h json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
fio_engine.o: fio_engine.cpp blockstore.h fio/fio.h fio/optgroup.h json11/json11.hpp object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
fio_sec_osd.o: fio_sec_osd.cpp fio/fio.h fio/optgroup.h object_id.h osd_id.h osd_ops.h rw_blocking.h
g++ $(CXXFLAGS) -c -o $@ $<
http_client.o: http_client.cpp http_client.h json11/json11.hpp timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
messenger.o: messenger.cpp json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
msgr_receive.o: msgr_receive.cpp json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
msgr_send.o: msgr_send.cpp json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd.o: osd.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_cluster.o: osd_cluster.cpp base64.h blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_flush.o: osd_flush.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_main.o: osd_main.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_peering.o: osd_peering.cpp base64.h blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_peering_pg.o: osd_peering_pg.cpp cpp-btree/btree_map.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_peering_pg_test.o: osd_peering_pg_test.cpp cpp-btree/btree_map.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_primary.o: osd_primary.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h osd_primary.h osd_rmw.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_primary_subops.o: osd_primary_subops.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h osd_primary.h osd_rmw.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_rmw.o: osd_rmw.cpp object_id.h osd_id.h osd_rmw.h xor.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_rmw_test.o: osd_rmw_test.cpp object_id.h osd_id.h osd_rmw.cpp osd_rmw.h test_pattern.h xor.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_secondary.o: osd_secondary.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_test.o: osd_test.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h test_pattern.h
g++ $(CXXFLAGS) -c -o $@ $<
pg_states.o: pg_states.cpp pg_states.h
g++ $(CXXFLAGS) -c -o $@ $<
ringloop.o: ringloop.cpp ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
timerfd_interval.o: timerfd_interval.cpp timerfd_interval.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
%.o: %.cpp allocator.h blockstore_flush.h blockstore.h blockstore_impl.h blockstore_init.h blockstore_journal.h crc32c.h ringloop.h timerfd_interval.h object_id.h
g++ $(CXXFLAGS) -c -o $@ $<
libblockstore.so: $(BLOCKSTORE_OBJS)
g++ $(CXXFLAGS) -o libblockstore.so -shared $(BLOCKSTORE_OBJS) -ltcmalloc_minimal -luring
libfio_blockstore.so: ./libblockstore.so fio_engine.cpp json11.o
g++ $(CXXFLAGS) -shared -o libfio_blockstore.so fio_engine.cpp json11.o ./libblockstore.so -ltcmalloc_minimal -luring
OSD_OBJS := osd.o osd_secondary.o osd_receive.o osd_send.o osd_peering.o osd_peering_pg.o osd_primary.o osd_rmw.o json11.o timerfd_interval.o
osd_secondary.o: osd_secondary.cpp osd.h osd_ops.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_receive.o: osd_receive.cpp osd.h osd_ops.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_send.o: osd_send.cpp osd.h osd_ops.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_peering.o: osd_peering.cpp osd.h osd_ops.h osd_peering_pg.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_peering_pg.o: osd_peering_pg.cpp object_id.h osd_peering_pg.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_rmw.o: osd_rmw.cpp osd_rmw.h xor.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_rmw_test: osd_rmw_test.cpp osd_rmw.cpp osd_rmw.h xor.h
g++ $(CXXFLAGS) -o $@ $<
osd_primary.o: osd_primary.cpp osd.h osd_ops.h osd_peering_pg.h xor.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
osd.o: osd.cpp osd.h osd_ops.h osd_peering_pg.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
osd: ./libblockstore.so osd_main.cpp osd.h osd_ops.h $(OSD_OBJS)
g++ $(CXXFLAGS) -o osd osd_main.cpp $(OSD_OBJS) ./libblockstore.so -ltcmalloc_minimal -luring
stub_osd: stub_osd.cpp osd_ops.h rw_blocking.o
g++ $(CXXFLAGS) -o stub_osd stub_osd.cpp rw_blocking.o -ltcmalloc_minimal
stub_bench: stub_bench.cpp osd_ops.h rw_blocking.o
g++ $(CXXFLAGS) -o stub_bench stub_bench.cpp rw_blocking.o -ltcmalloc_minimal
rw_blocking.o: rw_blocking.cpp rw_blocking.h
g++ $(CXXFLAGS) -c -o $@ $<
stub_bench.o: stub_bench.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h
g++ $(CXXFLAGS) -c -o $@ $<
stub_osd.o: stub_osd.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h
g++ $(CXXFLAGS) -c -o $@ $<
stub_uring_osd.o: stub_uring_osd.cpp epoll_manager.h json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
test.o: test.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
test_allocator.o: test_allocator.cpp allocator.h
g++ $(CXXFLAGS) -c -o $@ $<
test_blockstore.o: test_blockstore.cpp blockstore.h object_id.h ringloop.h timerfd_interval.h
g++ $(CXXFLAGS) -c -o $@ $<
timerfd_interval.o: timerfd_interval.cpp ringloop.h timerfd_interval.h
g++ $(CXXFLAGS) -c -o $@ $<
timerfd_manager.o: timerfd_manager.cpp timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_test: osd_test.cpp osd_ops.h rw_blocking.o
g++ $(CXXFLAGS) -o osd_test osd_test.cpp rw_blocking.o -ltcmalloc_minimal
libfio_sec_osd.so: fio_sec_osd.cpp osd_ops.h rw_blocking.o
g++ $(CXXFLAGS) -ltcmalloc_minimal -shared -o libfio_sec_osd.so fio_sec_osd.cpp rw_blocking.o -luring
test_blockstore: ./libblockstore.so test_blockstore.cpp
g++ $(CXXFLAGS) -o test_blockstore test_blockstore.cpp ./libblockstore.so -ltcmalloc_minimal -luring
test: test.cpp osd_peering_pg.o
g++ $(CXXFLAGS) -o test test.cpp osd_peering_pg.o -luring
test_allocator: test_allocator.cpp allocator.o
g++ $(CXXFLAGS) -o test_allocator test_allocator.cpp allocator.o

View File

@@ -1,52 +0,0 @@
#include "base64.h"
std::string base64_encode(const std::string &in)
{
std::string out;
unsigned val = 0;
int valb = -6;
for (unsigned char c: in)
{
val = (val << 8) + c;
valb += 8;
while (valb >= 0)
{
out.push_back("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(val>>valb) & 0x3F]);
valb -= 6;
}
}
if (valb > -6)
out.push_back("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[((val<<8)>>(valb+8)) & 0x3F]);
while (out.size() % 4)
out.push_back('=');
return out;
}
static char T[256] = { 0 };
std::string base64_decode(const std::string &in)
{
std::string out;
if (T[0] == 0)
{
for (int i = 0; i < 256; i++)
T[i] = -1;
for (int i = 0; i < 64; i++)
T[(unsigned char)("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[i])] = i;
}
unsigned val = 0;
int valb = -8;
for (unsigned char c: in)
{
if (T[c] == -1)
break;
val = (val<<6) + T[c];
valb += 6;
if (valb >= 0)
{
out.push_back(char((val >> valb) & 0xFF));
valb -= 8;
}
}
return out;
}

View File

@@ -1,5 +0,0 @@
#pragma once
#include <string>
std::string base64_encode(const std::string &in);
std::string base64_decode(const std::string &in);

View File

@@ -55,11 +55,6 @@ uint64_t blockstore_t::get_block_count()
return impl->get_block_count();
}
uint64_t blockstore_t::get_free_block_count()
{
return impl->get_free_block_count();
}
uint32_t blockstore_t::get_disk_alignment()
{
return impl->get_disk_alignment();

View File

@@ -15,9 +15,7 @@
// Memory alignment for direct I/O (usually 512 bytes)
// All other alignments must be a multiple of this one
#ifndef MEM_ALIGNMENT
#define MEM_ALIGNMENT 512
#endif
// Default block size is 128 KB, current allowed range is 4K - 128M
#define DEFAULT_ORDER 17
@@ -52,7 +50,6 @@ Input:
- version == 0: read the last stable version,
- version == UINT64_MAX: read the last version,
- otherwise: read the newest version that is <= the specified version
- reads aren't guaranteed to return data from previous unfinished writes
For writes:
- if version == 0, a new version is assigned automatically
- if version != 0, it is assigned for the new write if possible, otherwise -EINVAL is returned
@@ -95,7 +92,7 @@ Input:
- buf = pre-allocated obj_ver_id array <len> units long
Output:
- retval = 0 or negative error number (-EINVAL, -ENOENT if no such version or -EBUSY if not synced)
- retval = 0 or negative error number (-EINVAL)
## BS_OP_SYNC_STAB_ALL
@@ -178,7 +175,6 @@ public:
// FIXME rename to object_size
uint32_t get_block_size();
uint64_t get_block_count();
uint64_t get_free_block_count();
uint32_t get_disk_alignment();
};

View File

@@ -4,11 +4,9 @@ journal_flusher_t::journal_flusher_t(int flusher_count, blockstore_impl_t *bs)
{
this->bs = bs;
this->flusher_count = flusher_count;
dequeuing = false;
active_flushers = 0;
syncing_flushers = 0;
flusher_start_threshold = bs->journal_block_size / sizeof(journal_entry_stable);
journal_trim_interval = flusher_start_threshold;
sync_threshold = flusher_count == 1 ? 1 : flusher_count/2;
journal_trim_interval = sync_threshold;
journal_trim_counter = 0;
journal_superblock = bs->journal.inmemory ? bs->journal.buffer : memalign(MEM_ALIGNMENT, bs->journal_block_size);
co = new journal_flusher_co[flusher_count];
@@ -57,13 +55,17 @@ journal_flusher_t::~journal_flusher_t()
bool journal_flusher_t::is_active()
{
return active_flushers > 0 || dequeuing;
return active_flushers > 0 || start_forced && flush_queue.size() > 0 || flush_queue.size() >= sync_threshold;
}
void journal_flusher_t::loop()
{
for (int i = 0; (active_flushers > 0 || dequeuing) && i < flusher_count; i++)
for (int i = 0; i < flusher_count; i++)
{
if (!active_flushers && (start_forced ? !flush_queue.size() : (flush_queue.size() < sync_threshold)))
{
return;
}
co[i].loop();
}
}
@@ -81,11 +83,6 @@ void journal_flusher_t::enqueue_flush(obj_ver_id ov)
flush_versions[ov.oid] = ov.version;
flush_queue.push_back(ov.oid);
}
if (!dequeuing && flush_queue.size() >= flusher_start_threshold)
{
dequeuing = true;
bs->ringloop->wakeup();
}
}
void journal_flusher_t::unshift_flush(obj_ver_id ov)
@@ -101,25 +98,14 @@ void journal_flusher_t::unshift_flush(obj_ver_id ov)
flush_versions[ov.oid] = ov.version;
flush_queue.push_front(ov.oid);
}
if (!dequeuing && flush_queue.size() >= flusher_start_threshold)
{
dequeuing = true;
bs->ringloop->wakeup();
}
}
void journal_flusher_t::request_trim()
void journal_flusher_t::force_start()
{
dequeuing = true;
trim_wanted++;
start_forced = true;
bs->ringloop->wakeup();
}
void journal_flusher_t::release_trim()
{
trim_wanted--;
}
#define await_sqe(label) \
resume_##label:\
sqe = bs->get_sqe();\
@@ -130,7 +116,6 @@ void journal_flusher_t::release_trim()
}\
data = ((ring_data_t*)sqe->user_data);
// FIXME: Implement batch flushing
bool journal_flusher_co::loop()
{
// This is much better than implementing the whole function as an FSM
@@ -170,9 +155,10 @@ bool journal_flusher_co::loop()
else if (wait_state == 18)
goto resume_18;
resume_0:
if (!flusher->flush_queue.size() || !flusher->dequeuing)
if (!flusher->flush_queue.size() ||
!flusher->start_forced && !flusher->active_flushers && flusher->flush_queue.size() < flusher->sync_threshold)
{
flusher->dequeuing = false;
flusher->start_forced = false;
wait_state = 0;
return true;
}
@@ -183,76 +169,6 @@ resume_0:
dirty_end = bs->dirty_db.find(cur);
if (dirty_end != bs->dirty_db.end())
{
if (dirty_end->second.journal_sector >= bs->journal.dirty_start &&
(bs->journal.dirty_start >= bs->journal.used_start ||
dirty_end->second.journal_sector < bs->journal.used_start))
{
flusher->enqueue_flush(cur);
// We can't flush journal sectors that are still written to
// However, as we group flushes by oid, current oid may have older writes to flush!
// And it may even block writes if we don't flush the older version
// (if it's in the beginning of the journal)...
// So first try to find an older version of the same object to flush.
bool found = false;
while (dirty_end != bs->dirty_db.begin())
{
dirty_end--;
if (dirty_end->first.oid != cur.oid)
{
break;
}
if (!(dirty_end->second.journal_sector >= bs->journal.dirty_start &&
(bs->journal.dirty_start >= bs->journal.used_start ||
dirty_end->second.journal_sector < bs->journal.used_start)))
{
found = true;
cur.version = dirty_end->first.version;
break;
}
}
if (!found)
{
// Try other objects
int search_left = flusher->flush_queue.size() - 1;
#ifdef BLOCKSTORE_DEBUG
printf("Flusher overran writers (dirty_start=%08lx) - searching for older flushes (%d left)\n", bs->journal.dirty_start, search_left);
#endif
while (search_left > 0)
{
cur.oid = flusher->flush_queue.front();
cur.version = flusher->flush_versions[cur.oid];
flusher->flush_queue.pop_front();
flusher->flush_versions.erase(cur.oid);
dirty_end = bs->dirty_db.find(cur);
if (dirty_end != bs->dirty_db.end())
{
if (dirty_end->second.journal_sector >= bs->journal.dirty_start &&
(bs->journal.dirty_start >= bs->journal.used_start ||
dirty_end->second.journal_sector < bs->journal.used_start))
{
#ifdef BLOCKSTORE_DEBUG
printf("Write %lu:%lu v%lu is too new: offset=%08lx\n", cur.oid.inode, cur.oid.stripe, cur.version, dirty_end->second.journal_sector);
#endif
flusher->enqueue_flush(cur);
}
else
{
break;
}
}
search_left--;
}
if (search_left <= 0)
{
#ifdef BLOCKSTORE_DEBUG
printf("No older flushes, stopping\n");
#endif
flusher->dequeuing = false;
wait_state = 0;
return true;
}
}
}
repeat_it = flusher->sync_to_repeat.find(cur.oid);
if (repeat_it != flusher->sync_to_repeat.end())
{
@@ -275,26 +191,32 @@ resume_0:
#endif
flusher->active_flushers++;
resume_1:
// Find it in clean_db
clean_it = bs->clean_db.find(cur.oid);
old_clean_loc = (clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX);
// Scan dirty versions of the object
if (!scan_dirty(1))
{
wait_state += 1;
return false;
}
// Writes and deletes shouldn't happen at the same time
assert(!(copy_count > 0 || has_writes) || !has_delete);
if (copy_count == 0 && !has_writes && !has_delete || has_delete && old_clean_loc == UINT64_MAX)
if (copy_count == 0 && clean_loc == UINT64_MAX && !has_delete && !has_empty)
{
// Nothing to flush
bs->erase_dirty(dirty_start, std::next(dirty_end), clean_loc);
goto trim_journal;
flusher->active_flushers--;
repeat_it = flusher->sync_to_repeat.find(cur.oid);
if (repeat_it != flusher->sync_to_repeat.end() && repeat_it->second > cur.version)
{
// Requeue version
flusher->unshift_flush({ .oid = cur.oid, .version = repeat_it->second });
}
flusher->sync_to_repeat.erase(repeat_it);
wait_state = 0;
goto resume_0;
}
// Find it in clean_db
clean_it = bs->clean_db.find(cur.oid);
old_clean_loc = (clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX);
if (clean_loc == UINT64_MAX)
{
if (old_clean_loc == UINT64_MAX)
if (copy_count > 0 && has_delete || old_clean_loc == UINT64_MAX)
{
// Object not allocated. This is a bug.
char err[1024];
@@ -409,7 +331,6 @@ resume_1:
else
{
clean_disk_entry *new_entry = (clean_disk_entry*)(meta_new.buf + meta_new.pos*bs->clean_entry_size);
assert(new_entry->oid.inode == 0 || new_entry->oid == cur.oid);
new_entry->oid = cur.oid;
new_entry->version = cur.version;
if (!bs->inmemory_meta)
@@ -465,9 +386,8 @@ resume_1:
}
// Update clean_db and dirty_db, free old data locations
update_clean_db();
trim_journal:
// Clear unused part of the journal every <journal_trim_interval> flushes
if (!((++flusher->journal_trim_counter) % flusher->journal_trim_interval) || flusher->trim_wanted > 0)
if (!((++flusher->journal_trim_counter) % flusher->journal_trim_interval))
{
flusher->journal_trim_counter = 0;
if (bs->journal.trim())
@@ -497,7 +417,7 @@ resume_1:
}
// All done
#ifdef BLOCKSTORE_DEBUG
printf("Flushed %lu:%lu v%lu (%ld left)\n", cur.oid.inode, cur.oid.stripe, cur.version, flusher->flush_queue.size());
printf("Flushed %lu:%lu v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version);
#endif
flusher->active_flushers--;
repeat_it = flusher->sync_to_repeat.find(cur.oid);
@@ -525,7 +445,7 @@ bool journal_flusher_co::scan_dirty(int wait_base)
copy_count = 0;
clean_loc = UINT64_MAX;
has_delete = false;
has_writes = false;
has_empty = false;
skip_copy = false;
clean_init_bitmap = false;
while (1)
@@ -533,8 +453,11 @@ bool journal_flusher_co::scan_dirty(int wait_base)
if (dirty_it->second.state == ST_J_STABLE && !skip_copy)
{
// First we submit all reads
has_writes = true;
if (dirty_it->second.len != 0)
if (dirty_it->second.len == 0)
{
has_empty = true;
}
else
{
offset = dirty_it->second.offset;
end_offset = dirty_it->second.offset + dirty_it->second.len;
@@ -576,7 +499,6 @@ bool journal_flusher_co::scan_dirty(int wait_base)
else if (dirty_it->second.state == ST_D_STABLE && !skip_copy)
{
// There is an unflushed big write. Copy small writes in its position
has_writes = true;
clean_loc = dirty_it->second.location;
clean_init_bitmap = true;
clean_bitmap_offset = dirty_it->second.offset;
@@ -710,8 +632,7 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
});
sync_found:
cur_sync->ready_count++;
flusher->syncing_flushers++;
if (flusher->syncing_flushers >= flusher->flusher_count || !flusher->flush_queue.size())
if (cur_sync->ready_count >= flusher->sync_threshold || !flusher->flush_queue.size())
{
// Sync batch is ready. Do it.
await_sqe(0);
@@ -737,7 +658,6 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
wait_state = 2;
return false;
}
flusher->syncing_flushers--;
cur_sync->ready_count--;
if (cur_sync->ready_count == 0)
{

View File

@@ -45,8 +45,8 @@ class journal_flusher_co
std::map<object_id, uint64_t>::iterator repeat_it;
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;
bool skip_copy, has_delete, has_writes;
blockstore_clean_db_t::iterator clean_it;
bool skip_copy, has_delete, has_empty;
spp::sparse_hash_map<object_id, clean_entry>::iterator clean_it;
std::vector<copy_buffer_t> v;
std::vector<copy_buffer_t>::iterator it;
int copy_count;
@@ -73,10 +73,9 @@ public:
// Journal flusher itself
class journal_flusher_t
{
int trim_wanted = 0;
bool dequeuing;
bool start_forced = false;
int flusher_count;
int flusher_start_threshold;
int sync_threshold;
journal_flusher_co *co;
blockstore_impl_t *bs;
friend class journal_flusher_co;
@@ -85,7 +84,6 @@ class journal_flusher_t
void* journal_superblock;
int active_flushers;
int syncing_flushers;
std::list<flusher_sync_t> syncs;
std::map<object_id, uint64_t> sync_to_repeat;
@@ -97,8 +95,7 @@ public:
~journal_flusher_t();
void loop();
bool is_active();
void request_trim();
void release_trim();
void force_start();
void enqueue_flush(obj_ver_id oid);
void unshift_flush(obj_ver_id oid);
};

View File

@@ -5,7 +5,7 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
assert(sizeof(blockstore_op_private_t) <= BS_OP_PRIVATE_DATA_SIZE);
this->ringloop = ringloop;
ring_consumer.loop = [this]() { loop(); };
ringloop->register_consumer(&ring_consumer);
ringloop->register_consumer(ring_consumer);
initialized = 0;
zero_object = (uint8_t*)memalign(MEM_ALIGNMENT, block_size);
data_fd = meta_fd = journal.fd = -1;
@@ -36,7 +36,7 @@ blockstore_impl_t::~blockstore_impl_t()
delete data_alloc;
delete flusher;
free(zero_object);
ringloop->unregister_consumer(&ring_consumer);
ringloop->unregister_consumer(ring_consumer);
if (data_fd >= 0)
close(data_fd);
if (meta_fd >= 0 && meta_fd != data_fd)
@@ -98,19 +98,10 @@ void blockstore_impl_t::loop()
{
// try to submit ops
unsigned initial_ring_space = ringloop->space_left();
// FIXME: rework this "sync polling"
auto cur_sync = in_progress_syncs.begin();
while (cur_sync != in_progress_syncs.end())
{
if (continue_sync(*cur_sync) != 2)
{
// List is unmodified
cur_sync++;
}
else
{
cur_sync = in_progress_syncs.begin();
}
continue_sync(*cur_sync++);
}
auto cur = submit_queue.begin();
int has_writes = 0;
@@ -124,6 +115,12 @@ void blockstore_impl_t::loop()
if (PRIV(op)->wait_for)
{
check_wait(op);
#ifdef BLOCKSTORE_DEBUG
if (PRIV(op)->wait_for)
{
printf("still waiting for %d\n", PRIV(op)->wait_for);
}
#endif
if (PRIV(op)->wait_for == WAIT_SQE)
{
break;
@@ -139,12 +136,12 @@ void blockstore_impl_t::loop()
}
unsigned ring_space = ringloop->space_left();
unsigned prev_sqe_pos = ringloop->save();
bool dequeue_op = false;
int dequeue_op = 0;
if (op->opcode == BS_OP_READ)
{
dequeue_op = dequeue_read(op);
}
else if (op->opcode == BS_OP_WRITE)
else if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_DELETE)
{
if (has_writes == 2)
{
@@ -154,16 +151,6 @@ void blockstore_impl_t::loop()
dequeue_op = dequeue_write(op);
has_writes = dequeue_op ? 1 : 2;
}
else if (op->opcode == BS_OP_DELETE)
{
if (has_writes == 2)
{
// Some writes could not be submitted
break;
}
dequeue_op = dequeue_del(op);
has_writes = dequeue_op ? 1 : 2;
}
else if (op->opcode == BS_OP_SYNC)
{
// wait for all small writes to be submitted
@@ -179,33 +166,16 @@ void blockstore_impl_t::loop()
}
else if (op->opcode == BS_OP_STABLE)
{
if (has_writes == 2)
{
// Don't submit additional flushes before completing previous LISTs
break;
}
dequeue_op = dequeue_stable(op);
}
else if (op->opcode == BS_OP_ROLLBACK)
{
if (has_writes == 2)
{
// Don't submit additional flushes before completing previous LISTs
break;
}
dequeue_op = dequeue_rollback(op);
}
else if (op->opcode == BS_OP_LIST)
{
// Block LIST operation by previous modifications,
// so it always returns a consistent state snapshot
if (has_writes == 2 || inflight_writes > 0)
has_writes = 2;
else
{
process_list(op);
dequeue_op = true;
}
process_list(op);
dequeue_op = true;
}
if (dequeue_op)
{
@@ -226,16 +196,11 @@ void blockstore_impl_t::loop()
{
flusher->loop();
}
int ret = ringloop->submit();
if (ret < 0)
{
throw std::runtime_error(std::string("io_uring_submit: ") + strerror(-ret));
}
if ((initial_ring_space - ringloop->space_left()) > 0)
{
live = true;
}
queue_stall = !live && !ringloop->has_work();
queue_stall = !live && !ringloop->get_loop_again();
live = false;
}
}
@@ -275,9 +240,19 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
if (ringloop->space_left() < PRIV(op)->wait_detail)
{
// stop submission if there's still no free space
#ifdef BLOCKSTORE_DEBUG
printf("Still waiting for %lu SQE(s)\n", PRIV(op)->wait_detail);
#endif
return;
}
PRIV(op)->wait_for = 0;
}
else if (PRIV(op)->wait_for == WAIT_IN_FLIGHT)
{
auto dirty_it = dirty_db.find((obj_ver_id){
.oid = op->oid,
.version = PRIV(op)->wait_detail,
});
if (dirty_it != dirty_db.end() && IS_IN_FLIGHT(dirty_it->second.state))
{
// do not submit
return;
}
PRIV(op)->wait_for = 0;
@@ -287,12 +262,8 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
if (journal.used_start == PRIV(op)->wait_detail)
{
// do not submit
#ifdef BLOCKSTORE_DEBUG
printf("Still waiting to flush journal offset %08lx\n", PRIV(op)->wait_detail);
#endif
return;
}
flusher->release_trim();
PRIV(op)->wait_for = 0;
}
else if (PRIV(op)->wait_for == WAIT_JOURNAL_BUFFER)
@@ -302,9 +273,6 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
journal.sector_info[next].dirty)
{
// do not submit
#ifdef BLOCKSTORE_DEBUG
printf("Still waiting for a journal buffer\n");
#endif
return;
}
PRIV(op)->wait_for = 0;
@@ -313,9 +281,6 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
{
if (!data_alloc->get_free_count() && !flusher->is_active())
{
#ifdef BLOCKSTORE_DEBUG
printf("Still waiting for free space on the data device\n");
#endif
return;
}
PRIV(op)->wait_for = 0;
@@ -334,12 +299,12 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
op->len > block_size-op->offset ||
(op->len % disk_alignment)
)) ||
readonly && op->opcode != BS_OP_READ && op->opcode != BS_OP_LIST ||
readonly && op->opcode != BS_OP_READ ||
first && op->opcode == BS_OP_WRITE)
{
// Basic verification not passed
op->retval = -EINVAL;
std::function<void (blockstore_op_t*)>(op->callback)(op);
op->callback(op);
return;
}
if (op->opcode == BS_OP_SYNC_STAB_ALL)
@@ -380,21 +345,21 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
}
};
}
if ((op->opcode == BS_OP_WRITE || op->opcode == BS_OP_DELETE) && !enqueue_write(op))
if (op->opcode == BS_OP_WRITE && !enqueue_write(op))
{
std::function<void (blockstore_op_t*)>(op->callback)(op);
op->callback(op);
return;
}
if (op->opcode == BS_OP_SYNC && immediate_commit == IMMEDIATE_ALL)
if (0 && op->opcode == BS_OP_SYNC && immediate_commit)
{
op->retval = 0;
std::function<void (blockstore_op_t*)>(op->callback)(op);
op->callback(op);
return;
}
// Call constructor without allocating memory. We'll call destructor before returning op back
new ((void*)op->private_data) blockstore_op_private_t;
PRIV(op)->wait_for = 0;
PRIV(op)->op_state = 0;
PRIV(op)->sync_state = 0;
PRIV(op)->pending_ops = 0;
if (!first)
{
@@ -407,165 +372,82 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
ringloop->wakeup();
}
static bool replace_stable(object_id oid, uint64_t version, int search_start, int search_end, obj_ver_id* list)
{
while (search_start < search_end)
{
int pos = search_start+(search_end-search_start)/2;
if (oid < list[pos].oid)
{
search_end = pos;
}
else if (list[pos].oid < oid)
{
search_start = pos+1;
}
else
{
list[pos].version = version;
return true;
}
}
return false;
}
void blockstore_impl_t::process_list(blockstore_op_t *op)
{
// Check PG
// Count objects
uint32_t list_pg = op->offset;
uint32_t pg_count = op->len;
uint64_t pg_stripe_size = op->oid.stripe;
if (pg_count != 0 && (pg_stripe_size < MIN_BLOCK_SIZE || list_pg >= pg_count))
uint64_t parity_block_size = op->oid.stripe;
if (pg_count != 0 && (parity_block_size < MIN_BLOCK_SIZE || list_pg >= pg_count))
{
op->retval = -EINVAL;
FINISH_OP(op);
return;
}
// Copy clean_db entries (sorted)
int stable_count = 0, stable_alloc = clean_db.size() / (pg_count ? pg_count : 1);
obj_ver_id *stable = (obj_ver_id*)malloc(sizeof(obj_ver_id) * stable_alloc);
if (!stable)
uint64_t stable_count = 0;
if (pg_count > 0)
{
for (auto it = clean_db.begin(); it != clean_db.end(); it++)
{
uint32_t pg = (it->first.inode + it->first.stripe / parity_block_size) % pg_count;
if (pg == list_pg)
{
stable_count++;
}
}
}
else
{
stable_count = clean_db.size();
}
uint64_t total_count = stable_count;
for (auto it = dirty_db.begin(); it != dirty_db.end(); it++)
{
if (!pg_count || ((it->first.oid.inode + it->first.oid.stripe / parity_block_size) % pg_count) == list_pg)
{
if (IS_STABLE(it->second.state))
{
stable_count++;
}
total_count++;
}
}
// Allocate memory
op->version = stable_count;
op->retval = total_count;
op->buf = malloc(sizeof(obj_ver_id) * total_count);
if (!op->buf)
{
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
obj_ver_id *vers = (obj_ver_id*)op->buf;
int i = 0;
for (auto it = clean_db.begin(); it != clean_db.end(); it++)
{
if (!pg_count || ((it->first.inode + it->first.stripe / pg_stripe_size) % pg_count) == list_pg)
if (!pg_count || ((it->first.inode + it->first.stripe / parity_block_size) % pg_count) == list_pg)
{
if (stable_count >= stable_alloc)
{
stable_alloc += 32768;
stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
if (!stable)
{
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
}
stable[stable_count++] = {
vers[i++] = {
.oid = it->first,
.version = it->second.version,
};
}
}
int clean_stable_count = stable_count;
// Copy dirty_db entries (sorted, too)
int unstable_count = 0, unstable_alloc = 0;
obj_ver_id *unstable = NULL;
int j = stable_count;
for (auto it = dirty_db.begin(); it != dirty_db.end(); it++)
{
if (!pg_count || ((it->first.oid.inode + it->first.oid.stripe / pg_stripe_size) % pg_count) == list_pg)
if (!pg_count || ((it->first.oid.inode + it->first.oid.stripe / parity_block_size) % pg_count) == list_pg)
{
if (IS_DELETE(it->second.state))
if (IS_STABLE(it->second.state))
{
// Deletions are always stable, so try to zero out two possible entries
if (!replace_stable(it->first.oid, 0, 0, clean_stable_count, stable))
{
replace_stable(it->first.oid, 0, clean_stable_count, stable_count, stable);
}
}
else if (IS_STABLE(it->second.state))
{
// First try to replace a clean stable version in the first part of the list
if (!replace_stable(it->first.oid, it->first.version, 0, clean_stable_count, stable))
{
// Then try to replace the last dirty stable version in the second part of the list
if (stable[stable_count-1].oid == it->first.oid)
{
stable[stable_count-1].version = it->first.version;
}
else
{
if (stable_count >= stable_alloc)
{
stable_alloc += 32768;
stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
if (!stable)
{
if (unstable)
free(unstable);
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
}
stable[stable_count++] = it->first;
}
}
vers[i++] = it->first;
}
else
{
if (unstable_count >= unstable_alloc)
{
unstable_alloc += 32768;
unstable = (obj_ver_id*)realloc(unstable, sizeof(obj_ver_id) * unstable_alloc);
if (!unstable)
{
if (stable)
free(stable);
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
}
unstable[unstable_count++] = it->first;
vers[j++] = it->first;
}
}
}
// Remove zeroed out stable entries
int j = 0;
for (int i = 0; i < stable_count; i++)
{
if (stable[i].version != 0)
{
stable[j++] = stable[i];
}
}
stable_count = j;
if (stable_count+unstable_count > stable_alloc)
{
stable_alloc = stable_count+unstable_count;
stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
if (!stable)
{
if (unstable)
free(unstable);
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
}
// Copy unstable entries
for (int i = 0; i < unstable_count; i++)
{
stable[j++] = unstable[i];
}
free(unstable);
op->version = stable_count;
op->retval = stable_count+unstable_count;
op->buf = stable;
FINISH_OP(op);
}

View File

@@ -1,6 +1,7 @@
#pragma once
#include "blockstore.h"
#include "timerfd_interval.h"
#include <sys/types.h>
#include <sys/ioctl.h>
@@ -15,7 +16,7 @@
#include <deque>
#include <new>
#include "cpp-btree/btree_map.h"
#include "sparsepp/sparsepp/spp.h"
#include "allocator.h"
@@ -24,17 +25,17 @@
// States are not stored on disk. Instead, they're deduced from the journal
// FIXME: Rename to BS_ST_*
#define ST_J_WAIT_BIG 1
#define ST_J_IN_FLIGHT 2
#define ST_J_SUBMITTED 3
#define ST_J_WRITTEN 4
#define ST_J_SYNCED 5
#define ST_J_STABLE 6
#define ST_J_IN_FLIGHT 1
#define ST_J_SUBMITTED 2
#define ST_J_WRITTEN 3
#define ST_J_SYNCED 4
#define ST_J_STABLE 5
#define ST_D_IN_FLIGHT 15
#define ST_D_SUBMITTED 16
#define ST_D_WRITTEN 17
#define ST_D_SYNCED 20
#define ST_D_META_WRITTEN 19
#define ST_D_META_SYNCED 20
#define ST_D_STABLE 21
#define ST_DEL_IN_FLIGHT 31
@@ -45,17 +46,13 @@
#define ST_CURRENT 48
#define IMMEDIATE_NONE 0
#define IMMEDIATE_SMALL 1
#define IMMEDIATE_ALL 2
#define IS_IN_FLIGHT(st) (st == ST_J_WAIT_BIG || st == ST_J_IN_FLIGHT || st == ST_D_IN_FLIGHT || st == ST_DEL_IN_FLIGHT || st == ST_J_SUBMITTED || st == ST_D_SUBMITTED || st == ST_DEL_SUBMITTED)
#define IS_IN_FLIGHT(st) (st == ST_J_IN_FLIGHT || st == ST_D_IN_FLIGHT || st == ST_DEL_IN_FLIGHT || st == ST_J_SUBMITTED || st == ST_D_SUBMITTED || st == ST_DEL_SUBMITTED)
#define IS_STABLE(st) (st == ST_J_STABLE || st == ST_D_STABLE || st == ST_DEL_STABLE || st == ST_CURRENT)
#define IS_SYNCED(st) (IS_STABLE(st) || st == ST_J_SYNCED || st == ST_D_SYNCED || st == ST_DEL_SYNCED)
#define IS_JOURNAL(st) (st >= ST_J_WAIT_BIG && st <= ST_J_STABLE)
#define IS_BIG_WRITE(st) (st >= ST_D_IN_FLIGHT && st <= ST_D_STABLE)
#define IS_DELETE(st) (st >= ST_DEL_IN_FLIGHT && st <= ST_DEL_STABLE)
#define IS_UNSYNCED(st) (st >= ST_J_WAIT_BIG && st <= ST_J_WRITTEN || st >= ST_D_IN_FLIGHT && st <= ST_D_WRITTEN|| st >= ST_DEL_IN_FLIGHT && st <= ST_DEL_WRITTEN)
#define IS_SYNCED(st) (IS_STABLE(st) || st == ST_J_SYNCED || st == ST_D_META_SYNCED || st == ST_DEL_SYNCED)
#define IS_JOURNAL(st) (st >= ST_J_SUBMITTED && st <= ST_J_STABLE)
#define IS_BIG_WRITE(st) (st >= ST_D_SUBMITTED && st <= ST_D_STABLE)
#define IS_DELETE(st) (st >= ST_DEL_SUBMITTED && st <= ST_DEL_STABLE)
#define IS_UNSYNCED(st) (st >= ST_J_SUBMITTED && st <= ST_J_WRITTEN || st >= ST_D_SUBMITTED && st <= ST_D_META_WRITTEN || st >= ST_DEL_SUBMITTED && st <= ST_DEL_WRITTEN)
#define BS_SUBMIT_GET_SQE(sqe, data) \
BS_SUBMIT_GET_ONLY_SQE(sqe); \
@@ -127,6 +124,8 @@ struct __attribute__((__packed__)) dirty_entry
// Suspend operation until there are more free SQEs
#define WAIT_SQE 1
// Suspend operation until version <wait_detail> of object <oid> is written
#define WAIT_IN_FLIGHT 2
// Suspend operation until there are <wait_detail> bytes of free space in the journal on disk
#define WAIT_JOURNAL 3
// Suspend operation until the next journal sector buffer is free
@@ -140,7 +139,7 @@ struct fulfill_read_t
};
#define PRIV(op) ((blockstore_op_private_t*)(op)->private_data)
#define FINISH_OP(op) PRIV(op)->~blockstore_op_private_t(); std::function<void (blockstore_op_t*)>(op->callback)(op)
#define FINISH_OP(op) PRIV(op)->~blockstore_op_private_t(); op->callback(op)
struct blockstore_op_private_t
{
@@ -148,13 +147,12 @@ struct blockstore_op_private_t
int wait_for;
uint64_t wait_detail;
int pending_ops;
int op_state;
// Read
std::vector<fulfill_read_t> read_vec;
// Sync, write
uint64_t min_flushed_journal_sector, max_flushed_journal_sector;
uint64_t min_used_journal_sector, max_used_journal_sector;
// Write
struct iovec iov_zerofill[3];
@@ -163,13 +161,9 @@ struct blockstore_op_private_t
std::vector<obj_ver_id> sync_big_writes, sync_small_writes;
int sync_small_checked, sync_big_checked;
std::list<blockstore_op_t*>::iterator in_progress_ptr;
int prev_sync_count;
int sync_state, prev_sync_count;
};
// https://github.com/algorithm-ninja/cpp-btree
// https://github.com/greg7mdp/sparsepp/ was used previously, but it was TERRIBLY slow after resizing
// with sparsepp, random reads dropped to ~700 iops very fast with just as much as ~32k objects in the DB
typedef btree::btree_map<object_id, clean_entry> blockstore_clean_db_t;
typedef std::map<obj_ver_id, dirty_entry> blockstore_dirty_db_t;
#include "blockstore_init.h"
@@ -183,30 +177,29 @@ class blockstore_impl_t
uint32_t block_size;
uint64_t meta_offset;
uint64_t data_offset;
uint64_t cfg_journal_size, cfg_data_size;
uint64_t cfg_journal_size;
// Required write alignment and journal/metadata/data areas' location alignment
uint32_t disk_alignment = 4096;
uint32_t disk_alignment = 512;
// Journal block size - minimum_io_size of the journal device is the best choice
uint64_t journal_block_size = 4096;
uint64_t journal_block_size = 512;
// Metadata block size - minimum_io_size of the metadata device is the best choice
uint64_t meta_block_size = 4096;
uint64_t meta_block_size = 512;
// Sparse write tracking granularity. 4 KB is a good choice. Must be a multiple of disk_alignment
uint64_t bitmap_granularity = 4096;
bool readonly = false;
// By default, Blockstore locks all opened devices exclusively. This option can be used to disable locking
bool disable_flock = false;
// It is safe to disable fsync() if drive write cache is writethrough
bool disable_data_fsync = false, disable_meta_fsync = false, disable_journal_fsync = false;
// Enable if you want every operation to be executed with an "implicit fsync"
// Suitable only for server SSDs with capacitors, requires disabled data and journal fsyncs
int immediate_commit = IMMEDIATE_NONE;
// FIXME Not implemented yet
bool immediate_commit = false;
bool inmemory_meta = false;
int flusher_count;
/******* END OF OPTIONS *******/
struct ring_consumer_t ring_consumer;
blockstore_clean_db_t clean_db;
// Another option is https://github.com/algorithm-ninja/cpp-btree
spp::sparse_hash_map<object_id, clean_entry> clean_db;
uint8_t *clean_bitmap = NULL;
blockstore_dirty_db_t dirty_db;
std::list<blockstore_op_t*> submit_queue; // FIXME: funny thing is that vector is better here
@@ -231,7 +224,6 @@ class blockstore_impl_t
bool live = false, queue_stall = false;
ring_loop_t *ringloop;
int inflight_writes = 0;
bool stop_sync_submitted;
@@ -272,7 +264,7 @@ class blockstore_impl_t
bool enqueue_write(blockstore_op_t *op);
int dequeue_write(blockstore_op_t *op);
int dequeue_del(blockstore_op_t *op);
int continue_write(blockstore_op_t *op);
void ack_write(blockstore_op_t *op);
void release_journal_sectors(blockstore_op_t *op);
void handle_write_event(ring_data_t *data, blockstore_op_t *op);
@@ -285,15 +277,11 @@ class blockstore_impl_t
// Stabilize
int dequeue_stable(blockstore_op_t *op);
int continue_stable(blockstore_op_t *op);
void mark_stable(const obj_ver_id & ov);
void handle_stable_event(ring_data_t *data, blockstore_op_t *op);
void stabilize_object(object_id oid, uint64_t max_ver);
// Rollback
int dequeue_rollback(blockstore_op_t *op);
int continue_rollback(blockstore_op_t *op);
void mark_rolled_back(const obj_ver_id & ov);
void handle_rollback_event(ring_data_t *data, blockstore_op_t *op);
void erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc);
@@ -328,6 +316,5 @@ public:
inline uint32_t get_block_size() { return block_size; }
inline uint64_t get_block_count() { return block_count; }
inline uint64_t get_free_block_count() { return data_alloc->get_free_count(); }
inline uint32_t get_disk_alignment() { return disk_alignment; }
};

View File

@@ -402,9 +402,8 @@ resume_1:
}
// Trim journal on start so we don't stall when all entries are older
bs->journal.trim();
bs->journal.dirty_start = bs->journal.next_free;
printf(
"Journal entries loaded: %lu, free journal space: %lu bytes (%08lx..%08lx is used), free blocks: %lu / %lu\n",
"Journal entries loaded: %lu, free journal space: %lu bytes (%lu..%lu is used), free blocks: %lu / %lu\n",
entries_loaded,
(bs->journal.next_free >= bs->journal.used_start
? bs->journal.len-bs->journal.block_size - (bs->journal.next_free-bs->journal.used_start)
@@ -440,7 +439,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
{
journal_entry *je = (journal_entry*)(buf + proc_pos - done_pos + pos);
if (je->magic != JOURNAL_MAGIC || je_crc32(je) != je->crc32 ||
je->type < JE_MIN || je->type > JE_MAX || started && je->crc32_prev != crc32_last)
je->type < JE_SMALL_WRITE || je->type > JE_DELETE || started && je->crc32_prev != crc32_last)
{
if (pos == 0)
{
@@ -475,7 +474,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
if (location != je->small_write.data_offset)
{
char err[1024];
snprintf(err, 1024, "BUG: calculated journal data offset (%08lx) != stored journal data offset (%08lx)", location, je->small_write.data_offset);
snprintf(err, 1024, "BUG: calculated journal data offset (%lu) != stored journal data offset (%lu)", location, je->small_write.data_offset);
throw std::runtime_error(err);
}
uint32_t data_crc32 = 0;
@@ -510,9 +509,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
if (data_crc32 != je->small_write.crc32_data)
{
// journal entry is corrupt, stop here
// interesting thing is that we must clear the corrupt entry if we're not readonly,
// because we don't write next entries in the same journal block
printf("Journal entry data is corrupt (data crc32 %x != %x)\n", data_crc32, je->small_write.crc32_data);
// interesting thing is that we must clear the corrupt entry if we're not readonly
memset(buf + proc_pos - done_pos + pos, 0, bs->journal.block_size - pos);
bs->journal.next_free = prev_free;
init_write_buf = buf + proc_pos - done_pos;
@@ -521,7 +518,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
}
auto clean_it = bs->clean_db.find(je->small_write.oid);
if (clean_it == bs->clean_db.end() ||
clean_it->second.version < je->small_write.version)
clean_it->second.version < je->big_write.version)
{
obj_ver_id ov = {
.oid = je->small_write.oid,
@@ -537,10 +534,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
});
bs->journal.used_sectors[proc_pos]++;
#ifdef BLOCKSTORE_DEBUG
printf(
"journal offset %08lx is used by %lu:%lu v%lu (%lu refs)\n",
proc_pos, ov.oid.inode, ov.oid.stripe, ov.version, bs->journal.used_sectors[proc_pos]
);
printf("journal offset %lu is used by %lu:%lu v%lu\n", proc_pos, ov.oid.inode, ov.oid.stripe, ov.version);
#endif
auto & unstab = bs->unstable_writes[ov.oid];
unstab = unstab < ov.version ? ov.version : unstab;
@@ -561,7 +555,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
.version = je->big_write.version,
};
bs->dirty_db.emplace(ov, (dirty_entry){
.state = ST_D_SYNCED,
.state = ST_D_META_SYNCED,
.flags = 0,
.location = je->big_write.location,
.offset = je->big_write.offset,
@@ -587,7 +581,33 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
.oid = je->stable.oid,
.version = je->stable.version,
};
bs->mark_stable(ov);
auto it = bs->dirty_db.find(ov);
if (it == bs->dirty_db.end())
{
// journal contains a legitimate STABLE entry for a non-existing dirty write
// this probably means that journal was trimmed between WRITE and STABLE entries
// skip it
}
else
{
while (1)
{
it->second.state = (it->second.state == ST_D_META_SYNCED
? ST_D_STABLE
: (it->second.state == ST_DEL_SYNCED ? ST_DEL_STABLE : ST_J_STABLE));
if (it == bs->dirty_db.begin())
break;
it--;
if (it->first.oid != ov.oid || IS_STABLE(it->second.state))
break;
}
bs->flusher->enqueue_flush(ov);
}
auto unstab_it = bs->unstable_writes.find(ov.oid);
if (unstab_it != bs->unstable_writes.end() && unstab_it->second <= ov.version)
{
bs->unstable_writes.erase(unstab_it);
}
}
else if (je->type == JE_ROLLBACK)
{
@@ -595,39 +615,70 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
printf("je_rollback oid=%lu:%lu ver=%lu\n", je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version);
#endif
// rollback dirty writes of <oid> up to <version>
obj_ver_id ov = {
auto it = bs->dirty_db.lower_bound((obj_ver_id){
.oid = je->rollback.oid,
.version = je->rollback.version,
};
bs->mark_rolled_back(ov);
.version = UINT64_MAX,
});
if (it != bs->dirty_db.begin())
{
uint64_t max_unstable = 0;
auto rm_start = it;
auto rm_end = it;
it--;
while (it->first.oid == je->rollback.oid &&
it->first.version > je->rollback.version &&
!IS_IN_FLIGHT(it->second.state) &&
!IS_STABLE(it->second.state))
{
if (it->first.oid != je->rollback.oid)
break;
else if (it->first.version <= je->rollback.version)
{
if (!IS_STABLE(it->second.state))
max_unstable = it->first.version;
break;
}
else if (IS_STABLE(it->second.state))
break;
// Remove entry
rm_start = it;
if (it == bs->dirty_db.begin())
break;
it--;
}
if (rm_start != rm_end)
{
bs->erase_dirty(rm_start, rm_end, UINT64_MAX);
}
auto unstab_it = bs->unstable_writes.find(je->rollback.oid);
if (unstab_it != bs->unstable_writes.end())
{
if (max_unstable == 0)
bs->unstable_writes.erase(unstab_it);
else
unstab_it->second = max_unstable;
}
}
}
else if (je->type == JE_DELETE)
{
#ifdef BLOCKSTORE_DEBUG
printf("je_delete oid=%lu:%lu ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version);
#endif
auto clean_it = bs->clean_db.find(je->del.oid);
if (clean_it == bs->clean_db.end() ||
clean_it->second.version < je->del.version)
{
// oid, version
obj_ver_id ov = {
.oid = je->del.oid,
.version = je->del.version,
};
bs->dirty_db.emplace(ov, (dirty_entry){
.state = ST_DEL_SYNCED,
.flags = 0,
.location = 0,
.offset = 0,
.len = 0,
.journal_sector = proc_pos,
});
bs->journal.used_sectors[proc_pos]++;
// Deletions are treated as immediately stable, because
// "2-phase commit" (write->stabilize) isn't sufficient for them anyway
bs->mark_stable(ov);
}
// oid, version
obj_ver_id ov = {
.oid = je->del.oid,
.version = je->del.version,
};
bs->dirty_db.emplace(ov, (dirty_entry){
.state = ST_DEL_SYNCED,
.flags = 0,
.location = 0,
.offset = 0,
.len = 0,
.journal_sector = proc_pos,
});
bs->journal.used_sectors[proc_pos]++;
}
started = true;
pos += je->size;

View File

@@ -6,24 +6,18 @@ blockstore_journal_check_t::blockstore_journal_check_t(blockstore_impl_t *bs)
sectors_required = 0;
next_pos = bs->journal.next_free;
next_sector = bs->journal.cur_sector;
first_sector = -1;
next_in_pos = bs->journal.in_sector_pos;
right_dir = next_pos >= bs->journal.used_start;
}
// Check if we can write <required> entries of <size> bytes and <data_after> data bytes after them to the journal
int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries_required, int size, int data_after)
int blockstore_journal_check_t::check_available(blockstore_op_t *op, int required, int size, int data_after)
{
int required = entries_required;
while (1)
{
int fits = (bs->journal.block_size - next_in_pos) / size;
if (fits > 0)
{
if (first_sector == -1)
{
first_sector = next_sector;
}
required -= fits;
next_in_pos += fits * size;
sectors_required++;
@@ -44,40 +38,19 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
right_dir = false;
}
next_in_pos = 0;
next_sector = ((next_sector + 1) % bs->journal.sector_count);
if (next_sector == first_sector)
if (bs->journal.sector_info[next_sector].usage_count > 0 ||
bs->journal.sector_info[next_sector].dirty)
{
// next_sector may wrap when all sectors are flushed and the incoming batch is too big
// This is an error condition, we can't wait for anything in this case
throw std::runtime_error(
"Blockstore journal_sector_buffer_count="+std::to_string(bs->journal.sector_count)+
" is too small for a batch of "+std::to_string(entries_required)+" entries of "+std::to_string(size)+" bytes"
);
next_sector = ((next_sector + 1) % bs->journal.sector_count);
}
if (bs->journal.sector_info[next_sector].usage_count > 0 ||
bs->journal.sector_info[next_sector].dirty)
{
// No memory buffer available. Wait for it.
int used = 0, dirty = 0;
for (int i = 0; i < bs->journal.sector_count; i++)
{
if (bs->journal.sector_info[i].dirty)
{
dirty++;
used++;
}
if (bs->journal.sector_info[i].usage_count > 0)
{
used++;
}
}
// In fact, it's even more rare than "ran out of journal space", so print a warning
printf(
"Ran out of journal sector buffers: %d/%lu buffers used (%d dirty), next buffer (%ld) is %s and flushed %lu times\n",
used, bs->journal.sector_count, dirty, next_sector,
bs->journal.sector_info[next_sector].dirty ? "dirty" : "not dirty",
bs->journal.sector_info[next_sector].usage_count
);
#ifdef BLOCKSTORE_DEBUG
printf("next journal buffer %d is still dirty=%d used=%d\n", next_sector,
bs->journal.sector_info[next_sector].dirty, bs->journal.sector_info[next_sector].usage_count);
#endif
PRIV(op)->wait_for = WAIT_JOURNAL_BUFFER;
return 0;
}
@@ -101,7 +74,7 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
: bs->journal.used_start - bs->journal.next_free)
);
PRIV(op)->wait_for = WAIT_JOURNAL;
bs->flusher->request_trim();
bs->flusher->force_start();
PRIV(op)->wait_detail = bs->journal.used_start;
return 0;
}
@@ -118,11 +91,6 @@ journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type,
{
// Also select next sector buffer in memory
journal.cur_sector = ((journal.cur_sector + 1) % journal.sector_count);
assert(!journal.sector_info[journal.cur_sector].usage_count);
}
else
{
journal.dirty_start = journal.next_free;
}
journal.sector_info[journal.cur_sector].offset = journal.next_free;
journal.in_sector_pos = 0;
@@ -180,8 +148,8 @@ bool journal_t::trim()
auto journal_used_it = used_sectors.lower_bound(used_start);
#ifdef BLOCKSTORE_DEBUG
printf(
"Trimming journal (used_start=%08lx, next_free=%08lx, dirty_start=%08lx, new_start=%08lx, new_refcount=%ld)\n",
used_start, next_free, dirty_start,
"Trimming journal (used_start=%lu, next_free=%lu, first_used=%lu, usage_count=%lu)\n",
used_start, next_free,
journal_used_it == used_sectors.end() ? 0 : journal_used_it->first,
journal_used_it == used_sectors.end() ? 0 : journal_used_it->second
);
@@ -212,7 +180,7 @@ bool journal_t::trim()
return false;
}
#ifdef BLOCKSTORE_DEBUG
printf("Journal trimmed to %08lx (next_free=%08lx)\n", used_start, next_free);
printf("Journal trimmed to %lu (next_free=%lu)\n", used_start, next_free);
#endif
return true;
}

View File

@@ -12,14 +12,12 @@
// Journal entries
// Journal entries are linked to each other by their crc32 value
// The journal is almost a blockchain, because object versions constantly increase
#define JE_MIN 0x01
#define JE_START 0x01
#define JE_SMALL_WRITE 0x02
#define JE_BIG_WRITE 0x03
#define JE_STABLE 0x04
#define JE_DELETE 0x05
#define JE_ROLLBACK 0x06
#define JE_MAX 0x06
// crc32c comes first to ease calculation and is equal to crc32()
struct __attribute__((__packed__)) journal_entry_start
@@ -137,14 +135,10 @@ struct journal_t
bool inmemory = false;
void *buffer = NULL;
uint64_t block_size;
uint64_t block_size = 512;
uint64_t offset, len;
// Next free block offset
uint64_t next_free = 0;
// First occupied block offset
uint64_t used_start = 0;
// End of the last block not used for writing anymore
uint64_t dirty_start = 0;
uint32_t crc32_last = 0;
// Current sector(s) used for writing
@@ -166,7 +160,7 @@ struct blockstore_journal_check_t
{
blockstore_impl_t *bs;
uint64_t next_pos, next_sector, next_in_pos;
int sectors_required, first_sector;
int sectors_required;
bool right_dir; // writing to the end or the beginning of the ring buffer
blockstore_journal_check_t(blockstore_impl_t *bs);

View File

@@ -1,4 +1,3 @@
#include <sys/file.h>
#include "blockstore_impl.h"
static uint32_t is_power_of_two(uint64_t value)
@@ -35,23 +34,10 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
{
disable_journal_fsync = true;
}
if (config["disable_device_lock"] == "true" || config["disable_device_lock"] == "1" || config["disable_device_lock"] == "yes")
{
disable_flock = true;
}
if (config["immediate_commit"] == "all")
{
immediate_commit = IMMEDIATE_ALL;
}
else if (config["immediate_commit"] == "small")
{
immediate_commit = IMMEDIATE_SMALL;
}
metadata_buf_size = strtoull(config["meta_buf_size"].c_str(), NULL, 10);
cfg_journal_size = strtoull(config["journal_size"].c_str(), NULL, 10);
data_device = config["data_device"];
data_offset = strtoull(config["data_offset"].c_str(), NULL, 10);
cfg_data_size = strtoull(config["data_size"].c_str(), NULL, 10);
meta_device = config["meta_device"];
meta_offset = strtoull(config["meta_offset"].c_str(), NULL, 10);
block_size = strtoull(config["block_size"].c_str(), NULL, 10);
@@ -80,7 +66,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
}
if (!disk_alignment)
{
disk_alignment = 4096;
disk_alignment = 512;
}
else if (disk_alignment % MEM_ALIGNMENT)
{
@@ -88,7 +74,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
}
if (!journal_block_size)
{
journal_block_size = 4096;
journal_block_size = 512;
}
else if (journal_block_size % MEM_ALIGNMENT)
{
@@ -96,7 +82,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
}
if (!meta_block_size)
{
meta_block_size = 4096;
meta_block_size = 512;
}
else if (meta_block_size % MEM_ALIGNMENT)
{
@@ -142,22 +128,6 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
{
metadata_buf_size = 4*1024*1024;
}
if (meta_device == "")
{
disable_meta_fsync = disable_data_fsync;
}
if (journal_device == "")
{
disable_journal_fsync = disable_meta_fsync;
}
if (immediate_commit != IMMEDIATE_NONE && !disable_journal_fsync)
{
throw std::runtime_error("immediate_commit requires disable_journal_fsync");
}
if (immediate_commit == IMMEDIATE_ALL && !disable_data_fsync)
{
throw std::runtime_error("immediate_commit=all requires disable_journal_fsync and disable_data_fsync");
}
// init some fields
clean_entry_bitmap_size = block_size / bitmap_granularity / 8;
clean_entry_size = sizeof(clean_disk_entry) + clean_entry_bitmap_size;
@@ -181,15 +151,6 @@ void blockstore_impl_t::calc_lengths()
data_len = data_len < journal.offset-data_offset
? data_len : journal.offset-data_offset;
}
if (cfg_data_size != 0)
{
if (data_len < cfg_data_size)
{
throw std::runtime_error("Data area ("+std::to_string(data_len)+
" bytes) is less than configured size ("+std::to_string(cfg_data_size)+" bytes)");
}
data_len = cfg_data_size;
}
// meta
meta_area = (meta_fd == data_fd ? data_size : meta_size) - meta_offset;
if (meta_fd == data_fd && meta_offset <= data_offset)
@@ -291,10 +252,6 @@ void blockstore_impl_t::open_data()
{
throw std::runtime_error("data_offset exceeds device size = "+std::to_string(data_size));
}
if (!disable_flock && flock(data_fd, LOCK_EX|LOCK_NB) != 0)
{
throw std::runtime_error(std::string("Failed to lock data device: ") + strerror(errno));
}
}
void blockstore_impl_t::open_meta()
@@ -312,14 +269,11 @@ void blockstore_impl_t::open_meta()
{
throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(meta_size));
}
if (!disable_flock && flock(meta_fd, LOCK_EX|LOCK_NB) != 0)
{
throw std::runtime_error(std::string("Failed to lock metadata device: ") + strerror(errno));
}
}
else
{
meta_fd = data_fd;
disable_meta_fsync = disable_data_fsync;
meta_size = 0;
if (meta_offset >= data_size)
{
@@ -337,15 +291,12 @@ void blockstore_impl_t::open_journal()
{
throw std::runtime_error("Failed to open journal device");
}
check_size(journal.fd, &journal.device_size, "journal device");
if (!disable_flock && flock(journal.fd, LOCK_EX|LOCK_NB) != 0)
{
throw std::runtime_error(std::string("Failed to lock journal device: ") + strerror(errno));
}
check_size(journal.fd, &journal.device_size, "metadata device");
}
else
{
journal.fd = meta_fd;
disable_journal_fsync = disable_meta_fsync;
journal.device_size = 0;
if (journal.offset >= data_size)
{

View File

@@ -8,10 +8,12 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_
// Zero-length version - skip
return 1;
}
else if (IS_IN_FLIGHT(item_state))
if (IS_IN_FLIGHT(item_state))
{
// Write not finished yet - skip
return 1;
// Pause until it's written somewhere
PRIV(op)->wait_for = WAIT_IN_FLIGHT;
PRIV(op)->wait_detail = item_version;
return 0;
}
else if (IS_DELETE(item_state))
{
@@ -131,66 +133,63 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
dirty_it--;
}
}
if (clean_it != clean_db.end())
if (clean_it != clean_db.end() && fulfilled < read_op->len)
{
if (!result_version)
{
result_version = clean_it->second.version;
}
if (fulfilled < read_op->len)
if (!clean_entry_bitmap_size)
{
if (!clean_entry_bitmap_size)
if (!fulfill_read(read_op, fulfilled, 0, block_size, ST_CURRENT, 0, clean_it->second.location))
{
if (!fulfill_read(read_op, fulfilled, 0, block_size, ST_CURRENT, 0, clean_it->second.location))
{
// need to wait. undo added requests, don't dequeue op
PRIV(read_op)->read_vec.clear();
return 0;
}
// need to wait. undo added requests, don't dequeue op
PRIV(read_op)->read_vec.clear();
return 0;
}
}
else
{
uint64_t meta_loc = clean_it->second.location >> block_order;
uint8_t *clean_entry_bitmap;
if (inmemory_meta)
{
uint64_t sector = (meta_loc / (meta_block_size / clean_entry_size)) * meta_block_size;
uint64_t pos = (meta_loc % (meta_block_size / clean_entry_size));
clean_entry_bitmap = (uint8_t*)(metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry));
}
else
{
uint64_t meta_loc = clean_it->second.location >> block_order;
uint8_t *clean_entry_bitmap;
if (inmemory_meta)
clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*clean_entry_bitmap_size);
}
uint64_t bmp_start = 0, bmp_end = 0, bmp_size = block_size/bitmap_granularity;
while (bmp_start < bmp_size)
{
while (!(clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7))) && bmp_end < bmp_size)
{
uint64_t sector = (meta_loc / (meta_block_size / clean_entry_size)) * meta_block_size;
uint64_t pos = (meta_loc % (meta_block_size / clean_entry_size));
clean_entry_bitmap = (uint8_t*)(metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry));
bmp_end++;
}
else
if (bmp_end > bmp_start)
{
clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*clean_entry_bitmap_size);
// fill with zeroes
fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
bmp_end * bitmap_granularity, ST_DEL_STABLE, 0, 0);
}
uint64_t bmp_start = 0, bmp_end = 0, bmp_size = block_size/bitmap_granularity;
while (bmp_start < bmp_size)
bmp_start = bmp_end;
while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size)
{
while (!(clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7))) && bmp_end < bmp_size)
bmp_end++;
}
if (bmp_end > bmp_start)
{
if (!fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
bmp_end * bitmap_granularity, ST_CURRENT, 0, clean_it->second.location + bmp_start * bitmap_granularity))
{
bmp_end++;
}
if (bmp_end > bmp_start)
{
// fill with zeroes
fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
bmp_end * bitmap_granularity, ST_DEL_STABLE, 0, 0);
// need to wait. undo added requests, don't dequeue op
PRIV(read_op)->read_vec.clear();
return 0;
}
bmp_start = bmp_end;
while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size)
{
bmp_end++;
}
if (bmp_end > bmp_start)
{
if (!fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
bmp_end * bitmap_granularity, ST_CURRENT, 0, clean_it->second.location + bmp_start * bitmap_granularity))
{
// need to wait. undo added requests, don't dequeue op
PRIV(read_op)->read_vec.clear();
return 0;
}
bmp_start = bmp_end;
}
}
}
}

View File

@@ -2,10 +2,6 @@
int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
{
if (PRIV(op)->op_state)
{
return continue_rollback(op);
}
obj_ver_id* v;
int i, todo = op->len;
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
@@ -18,13 +14,8 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
});
if (dirty_it == dirty_db.begin())
{
if (v->version == 0)
{
// Already rolled back
// FIXME Skip this object version
}
bad_op:
op->retval = -ENOENT;
op->retval = -EINVAL;
FINISH_OP(op);
return 1;
}
@@ -40,9 +31,7 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
if (!IS_SYNCED(dirty_it->second.state) ||
IS_STABLE(dirty_it->second.state))
{
op->retval = -EBUSY;
FINISH_OP(op);
return 1;
goto bad_op;
}
if (dirty_it == dirty_db.begin())
{
@@ -71,12 +60,39 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
journal.sector_info[journal.cur_sector].dirty)
{
if (cur_sector == -1)
PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
cur_sector = journal.cur_sector;
prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
}
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{
// FIXME This is here only for the purpose of tracking unstable_writes. Remove if not required
// FIXME ...aaaand this is similar to blockstore_init.cpp - maybe dedup it?
auto dirty_it = dirty_db.lower_bound((obj_ver_id){
.oid = v->oid,
.version = UINT64_MAX,
});
uint64_t max_unstable = 0;
while (dirty_it != dirty_db.begin())
{
dirty_it--;
if (dirty_it->first.oid != v->oid)
break;
else if (dirty_it->first.version <= v->version)
{
if (!IS_STABLE(dirty_it->second.state))
max_unstable = dirty_it->first.version;
break;
}
}
auto unstab_it = unstable_writes.find(v->oid);
if (unstab_it != unstable_writes.end())
{
if (max_unstable == 0)
unstable_writes.erase(unstab_it);
else
unstab_it->second = max_unstable;
}
journal_entry_rollback *je = (journal_entry_rollback*)
prefill_single_journal_entry(journal, JE_ROLLBACK, sizeof(journal_entry_rollback));
journal.sector_info[journal.cur_sector].dirty = false;
@@ -87,117 +103,21 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
if (cur_sector != journal.cur_sector)
{
if (cur_sector == -1)
PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
cur_sector = journal.cur_sector;
prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
}
}
PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops = s;
PRIV(op)->op_state = 1;
inflight_writes++;
return 1;
}
int blockstore_impl_t::continue_rollback(blockstore_op_t *op)
{
if (PRIV(op)->op_state == 2)
goto resume_2;
else if (PRIV(op)->op_state == 3)
goto resume_3;
else if (PRIV(op)->op_state == 5)
goto resume_5;
else
return 1;
resume_2:
// Release used journal sectors
release_journal_sectors(op);
resume_3:
if (!disable_journal_fsync)
{
io_uring_sqe *sqe = get_sqe();
if (!sqe)
{
return 0;
}
ring_data_t *data = ((ring_data_t*)sqe->user_data);
my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = [this, op](ring_data_t *data) { handle_rollback_event(data, op); };
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
PRIV(op)->pending_ops = 1;
PRIV(op)->op_state = 4;
return 1;
}
resume_5:
obj_ver_id* v;
int i;
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{
mark_rolled_back(*v);
}
journal.trim();
inflight_writes--;
// Acknowledge op
op->retval = 0;
FINISH_OP(op);
return 1;
}
void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
{
auto it = dirty_db.lower_bound((obj_ver_id){
.oid = ov.oid,
.version = UINT64_MAX,
});
if (it != dirty_db.begin())
{
uint64_t max_unstable = 0;
auto rm_start = it;
auto rm_end = it;
it--;
while (it->first.oid == ov.oid &&
it->first.version > ov.version &&
!IS_IN_FLIGHT(it->second.state) &&
!IS_STABLE(it->second.state))
{
if (it->first.oid != ov.oid)
break;
else if (it->first.version <= ov.version)
{
if (!IS_STABLE(it->second.state))
max_unstable = it->first.version;
break;
}
else if (IS_STABLE(it->second.state))
break;
// Remove entry
rm_start = it;
if (it == dirty_db.begin())
break;
it--;
}
if (rm_start != rm_end)
{
erase_dirty(rm_start, rm_end, UINT64_MAX);
}
auto unstab_it = unstable_writes.find(ov.oid);
if (unstab_it != unstable_writes.end())
{
if (max_unstable == 0)
unstable_writes.erase(unstab_it);
else
unstab_it->second = max_unstable;
}
}
}
void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t *op)
{
live = true;
if (data->res != data->iov.iov_len)
{
inflight_writes--;
throw std::runtime_error(
"write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
"). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
@@ -206,11 +126,37 @@ void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t
PRIV(op)->pending_ops--;
if (PRIV(op)->pending_ops == 0)
{
PRIV(op)->op_state++;
if (!continue_rollback(op))
// Release used journal sectors
release_journal_sectors(op);
obj_ver_id* v;
int i;
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{
submit_queue.push_front(op);
// Erase dirty_db entries
auto rm_end = dirty_db.lower_bound((obj_ver_id){
.oid = v->oid,
.version = UINT64_MAX,
});
rm_end--;
auto rm_start = rm_end;
while (1)
{
if (rm_end->first.oid != v->oid)
break;
else if (rm_end->first.version <= v->version)
break;
rm_start = rm_end;
if (rm_end == dirty_db.begin())
break;
rm_end--;
}
if (rm_end != rm_start)
erase_dirty(rm_start, rm_end, UINT64_MAX);
}
journal.trim();
// Acknowledge op
op->retval = 0;
FINISH_OP(op);
}
}
@@ -227,13 +173,11 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
#endif
data_alloc->set(dirty_it->second.location >> block_order, false);
}
int used = --journal.used_sectors[dirty_it->second.journal_sector];
#ifdef BLOCKSTORE_DEBUG
printf(
"remove usage of journal offset %08lx by %lu:%lu v%lu (%d refs)\n", dirty_it->second.journal_sector,
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, used
);
printf("remove usage of journal offset %lu by %lu:%lu v%lu\n", dirty_it->second.journal_sector,
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
#endif
int used = --journal.used_sectors[dirty_it->second.journal_sector];
if (used == 0)
{
journal.used_sectors.erase(dirty_it->second.journal_sector);

View File

@@ -40,10 +40,6 @@
int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
{
if (PRIV(op)->op_state)
{
return continue_stable(op);
}
obj_ver_id* v;
int i, todo = 0;
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
@@ -55,7 +51,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
if (clean_it == clean_db.end() || clean_it->second.version < v->version)
{
// No such object version
op->retval = -ENOENT;
op->retval = -EINVAL;
FINISH_OP(op);
return 1;
}
@@ -67,7 +63,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
else if (IS_UNSYNCED(dirty_it->second.state))
{
// Object not synced yet. Caller must sync it first
op->retval = -EBUSY;
op->retval = EAGAIN;
FINISH_OP(op);
return 1;
}
@@ -102,13 +98,18 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
journal.sector_info[journal.cur_sector].dirty)
{
if (cur_sector == -1)
PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
cur_sector = journal.cur_sector;
prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
}
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{
// FIXME: Only stabilize versions that aren't stable yet
auto unstab_it = unstable_writes.find(v->oid);
if (unstab_it != unstable_writes.end() &&
unstab_it->second <= v->version)
{
unstable_writes.erase(unstab_it);
}
journal_entry_stable *je = (journal_entry_stable*)
prefill_single_journal_entry(journal, JE_STABLE, sizeof(journal_entry_stable));
journal.sector_info[journal.cur_sector].dirty = false;
@@ -119,116 +120,21 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
if (cur_sector != journal.cur_sector)
{
if (cur_sector == -1)
PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
cur_sector = journal.cur_sector;
prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
}
}
PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops = s;
PRIV(op)->op_state = 1;
inflight_writes++;
return 1;
}
int blockstore_impl_t::continue_stable(blockstore_op_t *op)
{
if (PRIV(op)->op_state == 2)
goto resume_2;
else if (PRIV(op)->op_state == 3)
goto resume_3;
else if (PRIV(op)->op_state == 5)
goto resume_5;
else
return 1;
resume_2:
// Release used journal sectors
release_journal_sectors(op);
resume_3:
if (!disable_journal_fsync)
{
io_uring_sqe *sqe = get_sqe();
if (!sqe)
{
return 0;
}
ring_data_t *data = ((ring_data_t*)sqe->user_data);
my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = [this, op](ring_data_t *data) { handle_stable_event(data, op); };
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
PRIV(op)->pending_ops = 1;
PRIV(op)->op_state = 4;
return 1;
}
resume_5:
// Mark dirty_db entries as stable, acknowledge op completion
obj_ver_id* v;
int i;
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{
// Mark all dirty_db entries up to op->version as stable
mark_stable(*v);
}
inflight_writes--;
// Acknowledge op
op->retval = 0;
FINISH_OP(op);
return 1;
}
void blockstore_impl_t::mark_stable(const obj_ver_id & v)
{
auto dirty_it = dirty_db.find(v);
if (dirty_it != dirty_db.end())
{
while (1)
{
if (dirty_it->second.state == ST_J_SYNCED)
{
dirty_it->second.state = ST_J_STABLE;
}
else if (dirty_it->second.state == ST_D_SYNCED)
{
dirty_it->second.state = ST_D_STABLE;
}
else if (dirty_it->second.state == ST_DEL_SYNCED)
{
dirty_it->second.state = ST_DEL_STABLE;
}
else if (IS_STABLE(dirty_it->second.state))
{
break;
}
if (dirty_it == dirty_db.begin())
{
break;
}
dirty_it--;
if (dirty_it->first.oid != v.oid)
{
break;
}
}
#ifdef BLOCKSTORE_DEBUG
printf("enqueue_flush %lu:%lu v%lu\n", v.oid.inode, v.oid.stripe, v.version);
#endif
flusher->enqueue_flush(v);
}
auto unstab_it = unstable_writes.find(v.oid);
if (unstab_it != unstable_writes.end() &&
unstab_it->second <= v.version)
{
unstable_writes.erase(unstab_it);
}
}
void blockstore_impl_t::handle_stable_event(ring_data_t *data, blockstore_op_t *op)
{
live = true;
if (data->res != data->iov.iov_len)
{
inflight_writes--;
throw std::runtime_error(
"write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
"). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
@@ -237,10 +143,53 @@ void blockstore_impl_t::handle_stable_event(ring_data_t *data, blockstore_op_t *
PRIV(op)->pending_ops--;
if (PRIV(op)->pending_ops == 0)
{
PRIV(op)->op_state++;
if (!continue_stable(op))
// Release used journal sectors
release_journal_sectors(op);
// Mark dirty_db entries as stable, acknowledge op completion
obj_ver_id* v;
int i;
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{
submit_queue.push_front(op);
// Mark all dirty_db entries up to op->version as stable
auto dirty_it = dirty_db.find(*v);
if (dirty_it != dirty_db.end())
{
while (1)
{
if (dirty_it->second.state == ST_J_SYNCED)
{
dirty_it->second.state = ST_J_STABLE;
}
else if (dirty_it->second.state == ST_D_META_SYNCED)
{
dirty_it->second.state = ST_D_STABLE;
}
else if (dirty_it->second.state == ST_DEL_SYNCED)
{
dirty_it->second.state = ST_DEL_STABLE;
}
else if (IS_STABLE(dirty_it->second.state))
{
break;
}
if (dirty_it == dirty_db.begin())
{
break;
}
dirty_it--;
if (dirty_it->first.oid != v->oid)
{
break;
}
}
#ifdef BLOCKSTORE_DEBUG
printf("enqueue_flush %lu:%lu v%lu\n", v->oid.inode, v->oid.stripe, v->version);
#endif
flusher->enqueue_flush(*v);
}
}
// Acknowledge op
op->retval = 0;
FINISH_OP(op);
}
}

View File

@@ -11,7 +11,7 @@
int blockstore_impl_t::dequeue_sync(blockstore_op_t *op)
{
if (PRIV(op)->op_state == 0)
if (PRIV(op)->sync_state == 0)
{
stop_sync_submitted = false;
PRIV(op)->sync_big_writes.swap(unsynced_big_writes);
@@ -21,11 +21,11 @@ int blockstore_impl_t::dequeue_sync(blockstore_op_t *op)
unsynced_big_writes.clear();
unsynced_small_writes.clear();
if (PRIV(op)->sync_big_writes.size() > 0)
PRIV(op)->op_state = SYNC_HAS_BIG;
PRIV(op)->sync_state = SYNC_HAS_BIG;
else if (PRIV(op)->sync_small_writes.size() > 0)
PRIV(op)->op_state = SYNC_HAS_SMALL;
PRIV(op)->sync_state = SYNC_HAS_SMALL;
else
PRIV(op)->op_state = SYNC_DONE;
PRIV(op)->sync_state = SYNC_DONE;
// Always add sync to in_progress_syncs because we clear unsynced_big_writes and unsynced_small_writes
PRIV(op)->prev_sync_count = in_progress_syncs.size();
PRIV(op)->in_progress_ptr = in_progress_syncs.insert(in_progress_syncs.end(), op);
@@ -38,7 +38,7 @@ int blockstore_impl_t::dequeue_sync(blockstore_op_t *op)
int blockstore_impl_t::continue_sync(blockstore_op_t *op)
{
auto cb = [this, op](ring_data_t *data) { handle_sync_event(data, op); };
if (PRIV(op)->op_state == SYNC_HAS_SMALL)
if (PRIV(op)->sync_state == SYNC_HAS_SMALL)
{
// No big writes, just fsync the journal
for (; PRIV(op)->sync_small_checked < PRIV(op)->sync_small_writes.size(); PRIV(op)->sync_small_checked++)
@@ -54,17 +54,17 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
// Write out the last journal sector if it happens to be dirty
BS_SUBMIT_GET_ONLY_SQE(sqe);
prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops = 1;
PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT;
PRIV(op)->sync_state = SYNC_JOURNAL_WRITE_SENT;
return 1;
}
else
{
PRIV(op)->op_state = SYNC_JOURNAL_WRITE_DONE;
PRIV(op)->sync_state = SYNC_JOURNAL_WRITE_DONE;
}
}
if (PRIV(op)->op_state == SYNC_HAS_BIG)
if (PRIV(op)->sync_state == SYNC_HAS_BIG)
{
for (; PRIV(op)->sync_big_checked < PRIV(op)->sync_big_writes.size(); PRIV(op)->sync_big_checked++)
{
@@ -81,17 +81,17 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
my_uring_prep_fsync(sqe, data_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = cb;
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
PRIV(op)->pending_ops = 1;
PRIV(op)->op_state = SYNC_DATA_SYNC_SENT;
PRIV(op)->sync_state = SYNC_DATA_SYNC_SENT;
return 1;
}
else
{
PRIV(op)->op_state = SYNC_DATA_SYNC_DONE;
PRIV(op)->sync_state = SYNC_DATA_SYNC_DONE;
}
}
if (PRIV(op)->op_state == SYNC_DATA_SYNC_DONE)
if (PRIV(op)->sync_state == SYNC_DATA_SYNC_DONE)
{
for (; PRIV(op)->sync_small_checked < PRIV(op)->sync_small_writes.size(); PRIV(op)->sync_small_checked++)
{
@@ -121,7 +121,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
journal.sector_info[journal.cur_sector].dirty)
{
if (cur_sector == -1)
PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
cur_sector = journal.cur_sector;
prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
}
@@ -133,11 +133,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
journal.sector_info[journal.cur_sector].dirty = false;
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
#ifdef BLOCKSTORE_DEBUG
printf(
"journal offset %08lx is used by %lu:%lu v%lu (%lu refs)\n",
dirty_db[*it].journal_sector, it->oid.inode, it->oid.stripe, it->version,
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
);
printf("journal offset %lu is used by %lu:%lu v%lu\n", dirty_db[*it].journal_sector, it->oid.inode, it->oid.stripe, it->version);
#endif
je->oid = it->oid;
je->version = it->version;
@@ -150,17 +146,17 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
if (cur_sector != journal.cur_sector)
{
if (cur_sector == -1)
PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
cur_sector = journal.cur_sector;
prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
}
}
PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops = s;
PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT;
PRIV(op)->sync_state = SYNC_JOURNAL_WRITE_SENT;
return 1;
}
if (PRIV(op)->op_state == SYNC_JOURNAL_WRITE_DONE)
if (PRIV(op)->sync_state == SYNC_JOURNAL_WRITE_DONE)
{
if (!disable_journal_fsync)
{
@@ -169,17 +165,17 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
data->iov = { 0 };
data->callback = cb;
PRIV(op)->pending_ops = 1;
PRIV(op)->op_state = SYNC_JOURNAL_SYNC_SENT;
PRIV(op)->sync_state = SYNC_JOURNAL_SYNC_SENT;
return 1;
}
else
{
PRIV(op)->op_state = SYNC_DONE;
PRIV(op)->sync_state = SYNC_DONE;
}
}
if (PRIV(op)->op_state == SYNC_DONE)
if (PRIV(op)->sync_state == SYNC_DONE)
{
return ack_sync(op);
ack_sync(op);
}
return 1;
}
@@ -200,17 +196,17 @@ void blockstore_impl_t::handle_sync_event(ring_data_t *data, blockstore_op_t *op
// Release used journal sectors
release_journal_sectors(op);
// Handle states
if (PRIV(op)->op_state == SYNC_DATA_SYNC_SENT)
if (PRIV(op)->sync_state == SYNC_DATA_SYNC_SENT)
{
PRIV(op)->op_state = SYNC_DATA_SYNC_DONE;
PRIV(op)->sync_state = SYNC_DATA_SYNC_DONE;
}
else if (PRIV(op)->op_state == SYNC_JOURNAL_WRITE_SENT)
else if (PRIV(op)->sync_state == SYNC_JOURNAL_WRITE_SENT)
{
PRIV(op)->op_state = SYNC_JOURNAL_WRITE_DONE;
PRIV(op)->sync_state = SYNC_JOURNAL_WRITE_DONE;
}
else if (PRIV(op)->op_state == SYNC_JOURNAL_SYNC_SENT)
else if (PRIV(op)->sync_state == SYNC_JOURNAL_SYNC_SENT)
{
PRIV(op)->op_state = SYNC_DONE;
PRIV(op)->sync_state = SYNC_DONE;
ack_sync(op);
}
else
@@ -222,7 +218,7 @@ void blockstore_impl_t::handle_sync_event(ring_data_t *data, blockstore_op_t *op
int blockstore_impl_t::ack_sync(blockstore_op_t *op)
{
if (PRIV(op)->op_state == SYNC_DONE && PRIV(op)->prev_sync_count == 0)
if (PRIV(op)->sync_state == SYNC_DONE && PRIV(op)->prev_sync_count == 0)
{
// Remove dependency of subsequent syncs
auto it = PRIV(op)->in_progress_ptr;
@@ -234,14 +230,14 @@ int blockstore_impl_t::ack_sync(blockstore_op_t *op)
{
auto & next_sync = *it++;
PRIV(next_sync)->prev_sync_count -= done_syncs;
if (PRIV(next_sync)->prev_sync_count == 0 && PRIV(next_sync)->op_state == SYNC_DONE)
if (PRIV(next_sync)->prev_sync_count == 0 && PRIV(next_sync)->sync_state == SYNC_DONE)
{
done_syncs++;
// Acknowledge next_sync
ack_one_sync(next_sync);
}
}
return 2;
return 1;
}
return 0;
}
@@ -256,17 +252,7 @@ void blockstore_impl_t::ack_one_sync(blockstore_op_t *op)
#endif
auto & unstab = unstable_writes[it->oid];
unstab = unstab < it->version ? it->version : unstab;
auto dirty_it = dirty_db.find(*it);
dirty_it->second.state = ST_D_SYNCED;
dirty_it++;
while (dirty_it != dirty_db.end() && dirty_it->first.oid == it->oid)
{
if (dirty_it->second.state == ST_J_WAIT_BIG)
{
dirty_it->second.state = ST_J_IN_FLIGHT;
}
dirty_it++;
}
dirty_db[*it].state = ST_D_META_SYNCED;
}
for (auto it = PRIV(op)->sync_small_writes.begin(); it != PRIV(op)->sync_small_writes.end(); it++)
{
@@ -275,16 +261,7 @@ void blockstore_impl_t::ack_one_sync(blockstore_op_t *op)
#endif
auto & unstab = unstable_writes[it->oid];
unstab = unstab < it->version ? it->version : unstab;
if (dirty_db[*it].state == ST_DEL_WRITTEN)
{
dirty_db[*it].state = ST_DEL_SYNCED;
// Deletions are treated as immediately stable
mark_stable(*it);
}
else /* == ST_J_WRITTEN */
{
dirty_db[*it].state = ST_J_SYNCED;
}
dirty_db[*it].state = dirty_db[*it].state == ST_DEL_WRITTEN ? ST_DEL_SYNCED : ST_J_SYNCED;
}
in_progress_syncs.erase(PRIV(op)->in_progress_ptr);
op->retval = 0;

View File

@@ -4,7 +4,6 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
{
// Check or assign version number
bool found = false, deleted = false, is_del = (op->opcode == BS_OP_DELETE);
bool is_inflight_big = false;
uint64_t version = 1;
if (dirty_db.size() > 0)
{
@@ -18,9 +17,6 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
found = true;
version = dirty_it->first.version + 1;
deleted = IS_DELETE(dirty_it->second.state);
is_inflight_big = dirty_it->second.state >= ST_D_IN_FLIGHT &&
dirty_it->second.state < ST_D_SYNCED ||
dirty_it->second.state == ST_J_WAIT_BIG;
}
}
if (!found)
@@ -42,7 +38,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
else if (op->version < version)
{
// Invalid version requested
op->retval = -EEXIST;
op->retval = -EINVAL;
return false;
}
if (deleted && is_del)
@@ -51,26 +47,10 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
op->retval = 0;
return false;
}
if (is_inflight_big && !is_del && !deleted && op->len < block_size &&
immediate_commit != IMMEDIATE_ALL)
{
// Issue an additional sync so that the previous big write can reach the journal
blockstore_op_t *sync_op = new blockstore_op_t;
sync_op->opcode = BS_OP_SYNC;
sync_op->callback = [this, op](blockstore_op_t *sync_op)
{
delete sync_op;
};
enqueue_op(sync_op);
}
// Immediately add the operation into dirty_db, so subsequent reads could see it
#ifdef BLOCKSTORE_DEBUG
if (is_del)
printf("Delete %lu:%lu v%lu\n", op->oid.inode, op->oid.stripe, op->version);
else
printf("Write %lu:%lu v%lu offset=%u len=%u\n", op->oid.inode, op->oid.stripe, op->version, op->offset, op->len);
printf("%s %lu:%lu v%lu\n", is_del ? "Delete" : "Write", op->oid.inode, op->oid.stripe, op->version);
#endif
// No strict need to add it into dirty_db here, it's just left
// from the previous implementation where reads waited for writes
dirty_db.emplace((obj_ver_id){
.oid = op->oid,
.version = op->version,
@@ -78,7 +58,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
.state = (uint32_t)(
is_del
? ST_DEL_IN_FLIGHT
: (op->len == block_size || deleted ? ST_D_IN_FLIGHT : (is_inflight_big ? ST_J_WAIT_BIG : ST_J_IN_FLIGHT))
: (op->len == block_size || deleted ? ST_D_IN_FLIGHT : ST_J_IN_FLIGHT)
),
.flags = 0,
.location = 0,
@@ -92,20 +72,11 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
// First step of the write algorithm: dequeue operation and submit initial write(s)
int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
{
if (PRIV(op)->op_state)
{
return continue_write(op);
}
auto dirty_it = dirty_db.find((obj_ver_id){
.oid = op->oid,
.version = op->version,
});
assert(dirty_it != dirty_db.end());
if (dirty_it->second.state == ST_J_WAIT_BIG)
{
return 0;
}
else if (dirty_it->second.state == ST_D_IN_FLIGHT)
if (dirty_it->second.state == ST_D_IN_FLIGHT)
{
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, unsynced_big_writes.size() + 1, sizeof(journal_entry_big_write), JOURNAL_STABILIZE_RESERVATION))
@@ -154,20 +125,12 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
sqe, data_fd, PRIV(op)->iov_zerofill, vcnt, data_offset + (loc << block_order) + op->offset - stripe_offset
);
PRIV(op)->pending_ops = 1;
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
if (immediate_commit != IMMEDIATE_ALL)
{
// Remember big write as unsynced
unsynced_big_writes.push_back((obj_ver_id){
.oid = op->oid,
.version = op->version,
});
PRIV(op)->op_state = 3;
}
else
{
PRIV(op)->op_state = 1;
}
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
// Remember big write as unsynced
unsynced_big_writes.push_back((obj_ver_id){
.oid = op->oid,
.version = op->version,
});
}
else
{
@@ -181,11 +144,10 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
}
// There is sufficient space. Get SQE(s)
struct io_uring_sqe *sqe1 = NULL;
if (immediate_commit != IMMEDIATE_NONE ||
(journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_small_write) &&
if ((journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_small_write) &&
journal.sector_info[journal.cur_sector].dirty)
{
// Write current journal sector only if it's dirty and full, or in the immediate_commit mode
// Write current journal sector only if it's dirty and full
BS_SUBMIT_GET_SQE_DECL(sqe1);
}
struct io_uring_sqe *sqe2 = NULL;
@@ -195,18 +157,16 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
}
// Got SQEs. Prepare previous journal sector write if required
auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
if (immediate_commit == IMMEDIATE_NONE)
if (sqe1)
{
if (sqe1)
{
prepare_journal_sector_write(journal, journal.cur_sector, sqe1, cb);
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops++;
}
else
{
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
}
prepare_journal_sector_write(journal, journal.cur_sector, sqe1, cb);
// FIXME rename to min/max _flushing
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops++;
}
else
{
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
}
// Then pre-fill journal entry
journal_entry_small_write *je = (journal_entry_small_write*)
@@ -214,11 +174,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
#ifdef BLOCKSTORE_DEBUG
printf(
"journal offset %08lx is used by %lu:%lu v%lu (%lu refs)\n",
dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
);
printf("journal offset %lu is used by %lu:%lu v%lu\n", dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
#endif
// Figure out where data will be
journal.next_free = (journal.next_free + op->len) <= journal.len ? journal.next_free : journal_block_size;
@@ -230,12 +186,6 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
je->crc32_data = crc32c(0, op->buf, op->len);
je->crc32 = je_crc32((journal_entry*)je);
journal.crc32_last = je->crc32;
if (immediate_commit != IMMEDIATE_NONE)
{
prepare_journal_sector_write(journal, journal.cur_sector, sqe1, cb);
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops++;
}
if (op->len > 0)
{
// Prepare journal data write
@@ -263,120 +213,16 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
{
journal.next_free = journal_block_size;
}
if (immediate_commit == IMMEDIATE_NONE)
{
// Remember small write as unsynced
unsynced_small_writes.push_back((obj_ver_id){
.oid = op->oid,
.version = op->version,
});
}
// Remember small write as unsynced
unsynced_small_writes.push_back((obj_ver_id){
.oid = op->oid,
.version = op->version,
});
if (!PRIV(op)->pending_ops)
{
PRIV(op)->op_state = 4;
continue_write(op);
}
else
{
PRIV(op)->op_state = 3;
ack_write(op);
}
}
inflight_writes++;
return 1;
}
int blockstore_impl_t::continue_write(blockstore_op_t *op)
{
io_uring_sqe *sqe = NULL;
journal_entry_big_write *je;
auto dirty_it = dirty_db.find((obj_ver_id){
.oid = op->oid,
.version = op->version,
});
assert(dirty_it != dirty_db.end());
if (PRIV(op)->op_state == 2)
goto resume_2;
else if (PRIV(op)->op_state == 4)
goto resume_4;
else
return 1;
resume_2:
// Only for the immediate_commit mode: prepare and submit big_write journal entry
sqe = get_sqe();
if (!sqe)
{
return 0;
}
je = (journal_entry_big_write*)prefill_single_journal_entry(journal, JE_BIG_WRITE, sizeof(journal_entry_big_write));
dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
journal.sector_info[journal.cur_sector].dirty = false;
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
#ifdef BLOCKSTORE_DEBUG
printf(
"journal offset %08lx is used by %lu:%lu v%lu (%lu refs)\n",
journal.sector_info[journal.cur_sector].offset, op->oid.inode, op->oid.stripe, op->version,
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
);
#endif
je->oid = op->oid;
je->version = op->version;
je->offset = op->offset;
je->len = op->len;
je->location = dirty_it->second.location;
je->crc32 = je_crc32((journal_entry*)je);
journal.crc32_last = je->crc32;
prepare_journal_sector_write(journal, journal.cur_sector, sqe,
[this, op](ring_data_t *data) { handle_write_event(data, op); });
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops = 1;
PRIV(op)->op_state = 3;
return 1;
resume_4:
// Switch object state
#ifdef BLOCKSTORE_DEBUG
printf("Ack write %lu:%lu v%lu = %d\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
#endif
bool imm = dirty_it->second.state == ST_D_SUBMITTED
? (immediate_commit == IMMEDIATE_ALL)
: (immediate_commit != IMMEDIATE_NONE);
if (imm)
{
auto & unstab = unstable_writes[op->oid];
unstab = unstab < op->version ? op->version : unstab;
}
if (dirty_it->second.state == ST_J_SUBMITTED)
{
dirty_it->second.state = imm ? ST_J_SYNCED : ST_J_WRITTEN;
}
else if (dirty_it->second.state == ST_D_SUBMITTED)
{
dirty_it->second.state = imm ? ST_D_SYNCED : ST_D_WRITTEN;
}
else if (dirty_it->second.state == ST_DEL_SUBMITTED)
{
dirty_it->second.state = imm ? ST_DEL_SYNCED : ST_DEL_WRITTEN;
if (imm)
{
// Deletions are treated as immediately stable
mark_stable(dirty_it->first);
}
}
if (immediate_commit == IMMEDIATE_ALL)
{
dirty_it++;
while (dirty_it != dirty_db.end() && dirty_it->first.oid == op->oid)
{
if (dirty_it->second.state == ST_J_WAIT_BIG)
{
dirty_it->second.state = ST_J_IN_FLIGHT;
}
dirty_it++;
}
}
inflight_writes--;
// Acknowledge write
op->retval = op->len;
FINISH_OP(op);
return 1;
}
@@ -385,7 +231,6 @@ void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *o
live = true;
if (data->res != data->iov.iov_len)
{
inflight_writes--;
// FIXME: our state becomes corrupted after a write error. maybe do something better than just die
throw std::runtime_error(
"write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
@@ -396,117 +241,88 @@ void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *o
if (PRIV(op)->pending_ops == 0)
{
release_journal_sectors(op);
PRIV(op)->op_state++;
if (!continue_write(op))
{
submit_queue.push_front(op);
}
ack_write(op);
}
}
void blockstore_impl_t::release_journal_sectors(blockstore_op_t *op)
{
// Release flushed journal sectors
if (PRIV(op)->min_flushed_journal_sector > 0 &&
PRIV(op)->max_flushed_journal_sector > 0)
// Release used journal sectors
if (PRIV(op)->min_used_journal_sector > 0 &&
PRIV(op)->max_used_journal_sector > 0)
{
uint64_t s = PRIV(op)->min_flushed_journal_sector;
uint64_t s = PRIV(op)->min_used_journal_sector;
while (1)
{
journal.sector_info[s-1].usage_count--;
if (s != (1+journal.cur_sector) && journal.sector_info[s-1].usage_count == 0)
{
// We know for sure that we won't write into this sector anymore
uint64_t new_ds = journal.sector_info[s-1].offset + journal.block_size;
if (new_ds >= journal.len)
{
new_ds = journal.block_size;
}
if ((journal.dirty_start + (journal.dirty_start >= journal.used_start ? 0 : journal.len)) <
(new_ds + (new_ds >= journal.used_start ? 0 : journal.len)))
{
journal.dirty_start = new_ds;
}
}
if (s == PRIV(op)->max_flushed_journal_sector)
if (s == PRIV(op)->max_used_journal_sector)
break;
s = 1 + s % journal.sector_count;
}
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
}
}
void blockstore_impl_t::ack_write(blockstore_op_t *op)
{
// Switch object state
auto & dirty_entry = dirty_db[(obj_ver_id){
.oid = op->oid,
.version = op->version,
}];
#ifdef BLOCKSTORE_DEBUG
printf("Ack write %lu:%lu v%lu = %d\n", op->oid.inode, op->oid.stripe, op->version, dirty_entry.state);
#endif
if (dirty_entry.state == ST_J_SUBMITTED)
{
dirty_entry.state = ST_J_WRITTEN;
}
else if (dirty_entry.state == ST_D_SUBMITTED)
{
dirty_entry.state = ST_D_WRITTEN;
}
else if (dirty_entry.state == ST_DEL_SUBMITTED)
{
dirty_entry.state = ST_DEL_WRITTEN;
}
// Acknowledge write without sync
op->retval = op->len;
FINISH_OP(op);
}
int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
{
auto dirty_it = dirty_db.find((obj_ver_id){
.oid = op->oid,
.version = op->version,
});
assert(dirty_it != dirty_db.end());
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, 1, sizeof(journal_entry_del), 0))
{
return 0;
}
io_uring_sqe *sqe = NULL;
if (immediate_commit != IMMEDIATE_NONE ||
(journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del) &&
journal.sector_info[journal.cur_sector].dirty)
{
// Write current journal sector only if it's dirty and full, or in the immediate_commit mode
BS_SUBMIT_GET_SQE_DECL(sqe);
}
auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
BS_SUBMIT_GET_ONLY_SQE(sqe);
// Prepare journal sector write
if (immediate_commit == IMMEDIATE_NONE)
{
if (sqe)
{
prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops++;
}
else
{
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
}
}
// Pre-fill journal entry
journal_entry_del *je = (journal_entry_del*)
prefill_single_journal_entry(journal, JE_DELETE, sizeof(struct journal_entry_del));
dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
#ifdef BLOCKSTORE_DEBUG
printf(
"journal offset %08lx is used by %lu:%lu v%lu (%lu refs)\n",
dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
);
printf("journal offset %lu is used by %lu:%lu v%lu\n", dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
#endif
je->oid = op->oid;
je->version = op->version;
je->crc32 = je_crc32((journal_entry*)je);
journal.crc32_last = je->crc32;
auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops = 1;
dirty_it->second.state = ST_DEL_SUBMITTED;
if (immediate_commit != IMMEDIATE_NONE)
{
prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops++;
// Remember small write as unsynced
unsynced_small_writes.push_back((obj_ver_id){
.oid = op->oid,
.version = op->version,
});
}
if (!PRIV(op)->pending_ops)
{
PRIV(op)->op_state = 4;
continue_write(op);
}
else
{
PRIV(op)->op_state = 3;
}
// Remember small write as unsynced
unsynced_small_writes.push_back((obj_ver_id){
.oid = op->oid,
.version = op->version,
});
return 1;
}

View File

@@ -1,349 +0,0 @@
#include "cluster_client.h"
cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
{
this->ringloop = ringloop;
this->tfd = tfd;
msgr.tfd = tfd;
msgr.ringloop = ringloop;
msgr.repeer_pgs = [this](osd_num_t peer_osd)
{
// peer_osd just connected or dropped connection
if (msgr.osd_peer_fds.find(peer_osd) != msgr.osd_peer_fds.end())
{
// really connected :)
continue_ops();
}
};
st_cli.tfd = tfd;
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
st_cli.on_change_osd_state_hook = [this](uint64_t peer_osd) { on_change_osd_state_hook(peer_osd); };
st_cli.on_change_hook = [this](json11::Json::object & changes) { on_change_hook(changes); };
st_cli.on_load_pgs_hook = [this](bool success) { on_load_pgs_hook(success); };
log_level = config["log_level"].int64_value();
st_cli.parse_config(config);
st_cli.load_global_config();
}
void cluster_client_t::continue_ops()
{
for (auto op_it = unsent_ops.begin(); op_it != unsent_ops.end(); )
{
cluster_op_t *op = *op_it;
if (op->needs_reslice && !op->sent_count)
{
op->parts.clear();
op->done_count = 0;
op->needs_reslice = false;
}
if (!op->parts.size())
{
unsent_ops.erase(op_it++);
execute(op);
continue;
}
if (!op->needs_reslice)
{
for (auto & op_part: op->parts)
{
if (!op_part.sent && !op_part.done)
{
try_send(op, &op_part);
}
}
if (op->sent_count == op->parts.size() - op->done_count)
{
unsent_ops.erase(op_it++);
sent_ops.insert(op);
}
else
op_it++;
}
else
op_it++;
}
}
static uint32_t is_power_of_two(uint64_t value)
{
uint32_t l = 0;
while (value > 1)
{
if (value & 1)
{
return 64;
}
value = value >> 1;
l++;
}
return l;
}
void cluster_client_t::on_load_config_hook(json11::Json::object & config)
{
bs_block_size = config["block_size"].uint64_value();
bs_disk_alignment = config["disk_alignment"].uint64_value();
bs_bitmap_granularity = config["bitmap_granularity"].uint64_value();
if (!bs_block_size)
bs_block_size = DEFAULT_BLOCK_SIZE;
if (!bs_disk_alignment)
bs_disk_alignment = DEFAULT_DISK_ALIGNMENT;
if (!bs_bitmap_granularity)
bs_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
{
uint32_t block_order;
if ((block_order = is_power_of_two(bs_block_size)) >= 64 || bs_block_size < MIN_BLOCK_SIZE || bs_block_size >= MAX_BLOCK_SIZE)
throw std::runtime_error("Bad block size");
}
if (config.find("pg_stripe_size") != config.end())
{
pg_stripe_size = config["pg_stripe_size"].uint64_value();
if (!pg_stripe_size)
pg_stripe_size = DEFAULT_PG_STRIPE_SIZE;
}
if (config["immediate_commit"] == "all")
{
// Cluster-wide immediate_commit mode
immediate_commit = true;
}
msgr.peer_connect_interval = config["peer_connect_interval"].uint64_value();
if (!msgr.peer_connect_interval)
msgr.peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
msgr.peer_connect_timeout = config["peer_connect_timeout"].uint64_value();
if (!msgr.peer_connect_timeout)
msgr.peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
}
void cluster_client_t::on_load_pgs_hook(bool success)
{
if (success)
{
pg_count = st_cli.pg_config.size();
continue_ops();
}
}
void cluster_client_t::on_change_hook(json11::Json::object & changes)
{
if (pg_count != st_cli.pg_config.size())
{
// At this point, all operations should be suspended
// And they need to be resliced!
for (auto op: unsent_ops)
{
op->needs_reslice = true;
}
for (auto op: sent_ops)
{
op->needs_reslice = true;
}
pg_count = st_cli.pg_config.size();
}
continue_ops();
}
void cluster_client_t::on_change_osd_state_hook(uint64_t peer_osd)
{
if (msgr.wanted_peers.find(peer_osd) != msgr.wanted_peers.end())
{
msgr.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
}
}
// FIXME: Implement OSD_OP_SYNC for immediate_commit == false
void cluster_client_t::execute(cluster_op_t *op)
{
if (op->opcode == OSD_OP_SYNC && immediate_commit)
{
// Syncs are not required in the immediate_commit mode
op->retval = 0;
std::function<void(cluster_op_t*)>(op->callback)(op);
return;
}
if (op->opcode != OSD_OP_READ && op->opcode != OSD_OP_OUT || !op->inode || !op->len ||
op->offset % bs_disk_alignment || op->len % bs_disk_alignment)
{
op->retval = -EINVAL;
std::function<void(cluster_op_t*)>(op->callback)(op);
return;
}
if (!pg_stripe_size)
{
// Config is not loaded yet
unsent_ops.insert(op);
return;
}
if (op->opcode == OSD_OP_WRITE && !immediate_commit)
{
// Copy operation
cluster_op_t *op_copy = new cluster_op_t();
op_copy->opcode = op->opcode;
op_copy->inode = op->inode;
op_copy->offset = op->offset;
op_copy->len = op->len;
op_copy->buf = malloc(op->len);
memcpy(op_copy->buf, op->buf, op->len);
unsynced_ops.push_back(op_copy);
unsynced_bytes += op->len;
if (inmemory_commit)
{
// Immediately acknowledge write and continue with the copy
op->retval = op->len;
std::function<void(cluster_op_t*)>(op->callback)(op);
op = op_copy;
}
if (unsynced_bytes >= inmemory_dirty_limit)
{
// Push an extra SYNC operation
}
}
// Slice the request into individual object stripe requests
// Primary OSDs still operate individual stripes, but their size is multiplied by PG minsize in case of EC
uint64_t pg_block_size = bs_block_size * pg_part_count;
uint64_t first_stripe = (op->offset / pg_block_size) * pg_block_size;
uint64_t last_stripe = ((op->offset + op->len + pg_block_size - 1) / pg_block_size - 1) * pg_block_size;
int part_count = 0;
for (uint64_t stripe = first_stripe; stripe <= last_stripe; stripe += pg_block_size)
{
if (op->offset < (stripe+pg_block_size) && (op->offset+op->len) > stripe)
{
part_count++;
}
}
op->parts.resize(part_count);
bool resend = false;
int i = 0;
for (uint64_t stripe = first_stripe; stripe <= last_stripe; stripe += pg_block_size)
{
uint64_t stripe_end = stripe + pg_block_size;
if (op->offset < stripe_end && (op->offset+op->len) > stripe)
{
pg_num_t pg_num = (op->inode + stripe/pg_stripe_size) % pg_count + 1;
op->parts[i] = {
.parent = op,
.offset = op->offset < stripe ? stripe : op->offset,
.len = (uint32_t)((op->offset+op->len) > stripe_end ? pg_block_size : op->offset+op->len-stripe),
.pg_num = pg_num,
.buf = op->buf + (op->offset < stripe ? stripe-op->offset : 0),
.sent = false,
.done = false,
};
if (!try_send(op, &op->parts[i]))
{
// Part needs to be sent later
resend = true;
}
i++;
}
}
if (resend)
{
unsent_ops.insert(op);
}
else
{
sent_ops.insert(op);
}
}
bool cluster_client_t::try_send(cluster_op_t *op, cluster_op_part_t *part)
{
auto pg_it = st_cli.pg_config.find(part->pg_num);
if (pg_it != st_cli.pg_config.end() &&
!pg_it->second.pause && pg_it->second.cur_primary)
{
osd_num_t primary_osd = pg_it->second.cur_primary;
auto peer_it = msgr.osd_peer_fds.find(primary_osd);
if (peer_it != msgr.osd_peer_fds.end())
{
int peer_fd = peer_it->second;
part->osd_num = primary_osd;
part->sent = true;
op->sent_count++;
part->op = {
.op_type = OSD_OP_OUT,
.peer_fd = peer_fd,
.req = { .rw = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = op_id++,
.opcode = op->opcode,
},
.inode = op->inode,
.offset = part->offset,
.len = part->len,
} },
.callback = [this, part](osd_op_t *op_part)
{
handle_op_part(part);
},
};
part->op.send_list.push_back(part->op.req.buf, OSD_PACKET_SIZE);
if (op->opcode == OSD_OP_WRITE)
{
part->op.send_list.push_back(part->buf, part->len);
}
else
{
part->op.buf = part->buf;
}
msgr.outbox_push(&part->op);
return true;
}
else if (msgr.wanted_peers.find(primary_osd) == msgr.wanted_peers.end())
{
msgr.connect_peer(primary_osd, st_cli.peer_states[primary_osd]);
}
}
return false;
}
void cluster_client_t::handle_op_part(cluster_op_part_t *part)
{
cluster_op_t *op = part->parent;
part->sent = false;
op->sent_count--;
part->op.buf = NULL;
if (part->op.reply.hdr.retval != part->op.req.rw.len)
{
// Operation failed, retry
printf(
"Operation part failed on OSD %lu: retval=%ld (expected %u), reconnecting\n",
part->osd_num, part->op.reply.hdr.retval, part->op.req.rw.len
);
msgr.stop_client(part->op.peer_fd);
if (op->sent_count == op->parts.size() - op->done_count - 1)
{
// Resend later when OSDs come up
// FIXME: Check for different types of errors
// FIXME: Repeat operations after a small timeout, for the case when OSD is coming up
sent_ops.erase(op);
unsent_ops.insert(op);
}
if (op->sent_count == 0 && op->needs_reslice)
{
// PG count has changed, reslice the operation
unsent_ops.erase(op);
op->parts.clear();
op->done_count = 0;
op->needs_reslice = false;
execute(op);
}
}
else
{
// OK
part->done = true;
op->done_count++;
if (op->done_count >= op->parts.size())
{
// Finished!
sent_ops.erase(op);
op->retval = op->len;
std::function<void(cluster_op_t*)>(op->callback)(op);
}
}
}

View File

@@ -1,80 +0,0 @@
#pragma once
#include "messenger.h"
#include "etcd_state_client.h"
#define MIN_BLOCK_SIZE 4*1024
#define MAX_BLOCK_SIZE 128*1024*1024
#define DEFAULT_BLOCK_SIZE 128*1024
#define DEFAULT_PG_STRIPE_SIZE 4*1024*1024
#define DEFAULT_DISK_ALIGNMENT 4096
#define DEFAULT_BITMAP_GRANULARITY 4096
struct cluster_op_t;
struct cluster_op_part_t
{
cluster_op_t *parent;
uint64_t offset;
uint32_t len;
pg_num_t pg_num;
osd_num_t osd_num;
void *buf;
bool sent;
bool done;
osd_op_t op;
};
struct cluster_op_t
{
uint64_t opcode; // OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC
uint64_t inode;
uint64_t offset;
uint64_t len;
int retval;
void *buf;
std::function<void(cluster_op_t*)> callback;
protected:
bool needs_reslice = false;
int sent_count = 0, done_count = 0;
std::vector<cluster_op_part_t> parts;
friend class cluster_client_t;
};
class cluster_client_t
{
timerfd_manager_t *tfd;
ring_loop_t *ringloop;
uint64_t pg_part_count = 2;
uint64_t pg_stripe_size = 0;
uint64_t bs_block_size = 0;
uint64_t bs_disk_alignment = 0;
uint64_t bs_bitmap_granularity = 0;
uint64_t pg_count = 0;
bool immediate_commit = false;
bool inmemory_commit = false;
uint64_t inmemory_dirty_limit = 32*1024*1024;
int log_level;
uint64_t op_id = 1;
etcd_state_client_t st_cli;
osd_messenger_t msgr;
std::set<cluster_op_t*> sent_ops, unsent_ops;
// unsynced operations are copied in memory to allow replay when cluster isn't in the immediate_commit mode
std::vector<cluster_op_t*> unsynced_ops;
uint64_t unsynced_bytes = 0;
public:
cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
void execute(cluster_op_t *op);
protected:
void continue_ops();
void on_load_config_hook(json11::Json::object & cfg);
void on_load_pgs_hook(bool success);
void on_change_hook(json11::Json::object & changes);
void on_change_osd_state_hook(uint64_t peer_osd);
bool try_send(cluster_op_t *op, cluster_op_part_t *part);
void handle_op_part(cluster_op_part_t *part);
};

View File

@@ -1,165 +0,0 @@
#define _LARGEFILE64_SOURCE
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdint.h>
#include <malloc.h>
#include <linux/fs.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
#include <stdio.h>
#include "blockstore_impl.h"
#include "crc32c.h"
struct journal_dump_t
{
char *journal_device;
uint32_t journal_block;
uint64_t journal_offset;
uint64_t journal_len;
uint64_t journal_pos;
int fd;
void dump_block(void *buf);
};
int main(int argc, char *argv[])
{
if (argc < 5)
{
printf("USAGE: %s <journal_file> <journal_block_size> <offset> <size>\n", argv[0]);
return 1;
}
journal_dump_t self;
self.journal_device = argv[1];
self.journal_block = strtoul(argv[2], NULL, 10);
self.journal_offset = strtoull(argv[3], NULL, 10);
self.journal_len = strtoull(argv[4], NULL, 10);
if (self.journal_block < MEM_ALIGNMENT || (self.journal_block % MEM_ALIGNMENT) ||
self.journal_block > 128*1024)
{
printf("Invalid journal block size\n");
return 1;
}
self.fd = open(self.journal_device, O_DIRECT|O_RDONLY);
if (self.fd == -1)
{
printf("Failed to open journal\n");
return 1;
}
void *data = memalign(MEM_ALIGNMENT, self.journal_block);
self.journal_pos = 0;
while (self.journal_pos < self.journal_len)
{
int r = pread(self.fd, data, self.journal_block, self.journal_offset+self.journal_pos);
assert(r == self.journal_block);
uint64_t s;
for (s = 0; s < self.journal_block; s += 8)
{
if (*((uint64_t*)(data+s)) != 0)
break;
}
if (s == self.journal_block)
{
printf("offset %08lx: zeroes\n", self.journal_pos);
self.journal_pos += self.journal_block;
}
else if (((journal_entry*)data)->magic == JOURNAL_MAGIC)
{
printf("offset %08lx:\n", self.journal_pos);
self.dump_block(data);
}
else
{
printf("offset %08lx: no magic in the beginning, looks like random data (pattern=%lx)\n", self.journal_pos, *((uint64_t*)data));
self.journal_pos += self.journal_block;
}
}
free(data);
close(self.fd);
return 0;
}
void journal_dump_t::dump_block(void *buf)
{
uint32_t pos = 0;
journal_pos += journal_block;
int entry = 0;
bool wrapped = false;
while (pos < journal_block)
{
journal_entry *je = (journal_entry*)(buf + pos);
if (je->magic != JOURNAL_MAGIC || je->type < JE_START || je->type > JE_DELETE)
{
break;
}
const char *crc32_valid = je_crc32(je) == je->crc32 ? "(valid)" : "(invalid)";
printf("entry % 3d: crc32=%08x %s prev=%08x ", entry, je->crc32, crc32_valid, je->crc32_prev);
if (je->type == JE_START)
{
printf("je_start start=%08lx\n", je->start.journal_start);
}
else if (je->type == JE_SMALL_WRITE)
{
printf(
"je_small_write oid=%lu:%lu ver=%lu offset=%u len=%u loc=%08lx",
je->small_write.oid.inode, je->small_write.oid.stripe,
je->small_write.version, je->small_write.offset, je->small_write.len,
je->small_write.data_offset
);
if (journal_pos + je->small_write.len > journal_len)
{
// data continues from the beginning of the journal
journal_pos = journal_block;
wrapped = true;
}
if (journal_pos != je->small_write.data_offset)
{
printf(" (mismatched, calculated = %lu)", journal_pos);
}
journal_pos += je->small_write.len;
if (journal_pos >= journal_len)
{
journal_pos = journal_block;
wrapped = true;
}
uint32_t data_crc32 = 0;
void *data = memalign(MEM_ALIGNMENT, je->small_write.len);
assert(pread(fd, data, je->small_write.len, journal_offset+je->small_write.data_offset) == je->small_write.len);
data_crc32 = crc32c(0, data, je->small_write.len);
free(data);
printf(
" data_crc32=%08x%s", je->small_write.crc32_data,
(data_crc32 != je->small_write.crc32_data) ? " (invalid)" : " (valid)"
);
printf("\n");
}
else if (je->type == JE_BIG_WRITE)
{
printf("je_big_write oid=%lu:%lu ver=%lu loc=%08lx\n", je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location);
}
else if (je->type == JE_STABLE)
{
printf("je_stable oid=%lu:%lu ver=%lu\n", je->stable.oid.inode, je->stable.oid.stripe, je->stable.version);
}
else if (je->type == JE_ROLLBACK)
{
printf("je_rollback oid=%lu:%lu ver=%lu\n", je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version);
}
else if (je->type == JE_DELETE)
{
printf("je_delete oid=%lu:%lu ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version);
}
pos += je->size;
entry++;
}
if (wrapped)
{
journal_pos = journal_len;
}
}

View File

@@ -1,87 +0,0 @@
#include <sys/epoll.h>
#include <sys/poll.h>
#include <unistd.h>
#include "epoll_manager.h"
#define MAX_EPOLL_EVENTS 64
epoll_manager_t::epoll_manager_t(ring_loop_t *ringloop)
{
this->ringloop = ringloop;
epoll_fd = epoll_create(1);
if (epoll_fd < 0)
{
throw std::runtime_error(std::string("epoll_create: ") + strerror(errno));
}
tfd = new timerfd_manager_t([this](int fd, std::function<void(int, int)> handler) { set_fd_handler(fd, handler); });
handle_epoll_events();
}
epoll_manager_t::~epoll_manager_t()
{
if (tfd)
{
delete tfd;
tfd = NULL;
}
close(epoll_fd);
}
void epoll_manager_t::set_fd_handler(int fd, std::function<void(int, int)> handler)
{
if (handler != NULL)
{
bool exists = epoll_handlers.find(fd) != epoll_handlers.end();
epoll_event ev;
ev.data.fd = fd;
ev.events = EPOLLOUT | EPOLLIN | EPOLLRDHUP | EPOLLET;
if (epoll_ctl(epoll_fd, exists ? EPOLL_CTL_MOD : EPOLL_CTL_ADD, fd, &ev) < 0)
{
throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
}
epoll_handlers[fd] = handler;
}
else
{
if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, NULL) < 0 && errno != ENOENT)
{
throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
}
epoll_handlers.erase(fd);
}
}
void epoll_manager_t::handle_epoll_events()
{
io_uring_sqe *sqe = ringloop->get_sqe();
if (!sqe)
{
throw std::runtime_error("can't get SQE, will fall out of sync with EPOLLET");
}
ring_data_t *data = ((ring_data_t*)sqe->user_data);
my_uring_prep_poll_add(sqe, epoll_fd, POLLIN);
data->callback = [this](ring_data_t *data)
{
if (data->res < 0)
{
throw std::runtime_error(std::string("epoll failed: ") + strerror(-data->res));
}
handle_epoll_events();
};
ringloop->submit();
int nfds;
epoll_event events[MAX_EPOLL_EVENTS];
do
{
nfds = epoll_wait(epoll_fd, events, MAX_EPOLL_EVENTS, 0);
for (int i = 0; i < nfds; i++)
{
auto & cb = epoll_handlers[events[i].data.fd];
cb(events[i].data.fd, events[i].events);
}
} while (nfds == MAX_EPOLL_EVENTS);
}

View File

@@ -1,20 +0,0 @@
#pragma once
#include <map>
#include "ringloop.h"
#include "timerfd_manager.h"
class epoll_manager_t
{
int epoll_fd;
ring_loop_t *ringloop;
std::map<int, std::function<void(int, int)>> epoll_handlers;
public:
epoll_manager_t(ring_loop_t *ringloop);
~epoll_manager_t();
void set_fd_handler(int fd, std::function<void(int, int)> handler);
void handle_epoll_events();
timerfd_manager_t *tfd;
};

View File

@@ -1,424 +0,0 @@
#include "osd_ops.h"
#include "pg_states.h"
#include "etcd_state_client.h"
#include "http_client.h"
#include "base64.h"
json_kv_t etcd_state_client_t::parse_etcd_kv(const json11::Json & kv_json)
{
json_kv_t kv;
kv.key = base64_decode(kv_json["key"].string_value());
std::string json_err, json_text = base64_decode(kv_json["value"].string_value());
kv.value = json_text == "" ? json11::Json() : json11::Json::parse(json_text, json_err);
if (json_err != "")
{
printf("Bad JSON in etcd key %s: %s (value: %s)\n", kv.key.c_str(), json_err.c_str(), json_text.c_str());
kv.key = "";
}
return kv;
}
void etcd_state_client_t::etcd_txn(json11::Json txn, int timeout, std::function<void(std::string, json11::Json)> callback)
{
etcd_call("/kv/txn", txn, timeout, callback);
}
void etcd_state_client_t::etcd_call(std::string api, json11::Json payload, int timeout, std::function<void(std::string, json11::Json)> callback)
{
std::string etcd_address = etcd_addresses[rand() % etcd_addresses.size()];
std::string etcd_api_path;
int pos = etcd_address.find('/');
if (pos >= 0)
{
etcd_api_path = etcd_address.substr(pos);
etcd_address = etcd_address.substr(0, pos);
}
std::string req = payload.dump();
req = "POST "+etcd_api_path+api+" HTTP/1.1\r\n"
"Host: "+etcd_address+"\r\n"
"Content-Type: application/json\r\n"
"Content-Length: "+std::to_string(req.size())+"\r\n"
"Connection: close\r\n"
"\r\n"+req;
http_request_json(tfd, etcd_address, req, timeout, callback);
}
void etcd_state_client_t::parse_config(json11::Json & config)
{
this->etcd_addresses.clear();
if (config["etcd_address"].is_string())
{
std::string ea = config["etcd_address"].string_value();
while (1)
{
int pos = ea.find(',');
std::string addr = pos >= 0 ? ea.substr(0, pos) : ea;
if (addr.length() > 0)
{
if (addr.find('/') < 0)
addr += "/v3";
this->etcd_addresses.push_back(addr);
}
if (pos >= 0)
ea = ea.substr(pos+1);
else
break;
}
}
else if (config["etcd_address"].array_items().size())
{
for (auto & ea: config["etcd_address"].array_items())
{
std::string addr = ea.string_value();
if (addr != "")
{
if (addr.find('/') < 0)
addr += "/v3";
this->etcd_addresses.push_back(addr);
}
}
}
this->etcd_prefix = config["etcd_prefix"].string_value();
if (this->etcd_prefix == "")
{
this->etcd_prefix = "/microceph";
}
else if (this->etcd_prefix[0] != '/')
{
this->etcd_prefix = "/"+this->etcd_prefix;
}
this->log_level = config["log_level"].int64_value();
}
void etcd_state_client_t::start_etcd_watcher()
{
std::string etcd_address = etcd_addresses[rand() % etcd_addresses.size()];
std::string etcd_api_path;
int pos = etcd_address.find('/');
if (pos >= 0)
{
etcd_api_path = etcd_address.substr(pos);
etcd_address = etcd_address.substr(0, pos);
}
etcd_watches_initialised = 0;
etcd_watch_ws = open_websocket(tfd, etcd_address, etcd_api_path+"/watch", ETCD_SLOW_TIMEOUT, [this](const http_response_t *msg)
{
if (msg->body.length())
{
std::string json_err;
json11::Json data = json11::Json::parse(msg->body, json_err);
if (json_err != "")
{
printf("Bad JSON in etcd event: %s, ignoring event\n", json_err.c_str());
}
else
{
if (data["result"]["created"].bool_value())
{
etcd_watches_initialised++;
}
if (etcd_watches_initialised == 4)
{
etcd_watch_revision = data["result"]["header"]["revision"].uint64_value();
}
// First gather all changes into a hash to remove multiple overwrites
json11::Json::object changes;
for (auto & ev: data["result"]["events"].array_items())
{
auto kv = parse_etcd_kv(ev["kv"]);
if (kv.key != "")
{
changes[kv.key] = kv.value;
}
}
for (auto & kv: changes)
{
if (this->log_level > 0)
{
printf("Incoming event: %s -> %s\n", kv.first.c_str(), kv.second.dump().c_str());
}
parse_state(kv.first, kv.second);
}
// React to changes
if (on_change_hook != NULL)
{
on_change_hook(changes);
}
}
}
if (msg->eof)
{
etcd_watch_ws = NULL;
if (etcd_watches_initialised == 0)
{
// Connection not established, retry in <ETCD_SLOW_TIMEOUT>
tfd->set_timer(ETCD_SLOW_TIMEOUT, false, [this](int)
{
start_etcd_watcher();
});
}
else
{
// Connection was live, retry immediately
start_etcd_watcher();
}
}
});
etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
{ "create_request", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/config/") },
{ "range_end", base64_encode(etcd_prefix+"/config0") },
{ "start_revision", etcd_watch_revision+1 },
{ "watch_id", ETCD_CONFIG_WATCH_ID },
} }
}).dump());
etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
{ "create_request", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/osd/state/") },
{ "range_end", base64_encode(etcd_prefix+"/osd/state0") },
{ "start_revision", etcd_watch_revision+1 },
{ "watch_id", ETCD_OSD_STATE_WATCH_ID },
} }
}).dump());
etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
{ "create_request", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/pg/state/") },
{ "range_end", base64_encode(etcd_prefix+"/pg/state0") },
{ "start_revision", etcd_watch_revision+1 },
{ "watch_id", ETCD_PG_STATE_WATCH_ID },
} }
}).dump());
etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
{ "create_request", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/pg/history/") },
{ "range_end", base64_encode(etcd_prefix+"/pg/history0") },
{ "start_revision", etcd_watch_revision+1 },
{ "watch_id", ETCD_PG_HISTORY_WATCH_ID },
} }
}).dump());
}
void etcd_state_client_t::load_global_config()
{
etcd_call("/kv/range", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/config/global") }
}, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json data)
{
if (err != "")
{
printf("Error reading OSD configuration from etcd: %s\n", err.c_str());
tfd->set_timer(ETCD_SLOW_TIMEOUT, false, [this](int timer_id)
{
load_global_config();
});
return;
}
if (!etcd_watch_revision)
{
etcd_watch_revision = data["header"]["revision"].uint64_value();
}
json11::Json::object global_config;
if (data["kvs"].array_items().size() > 0)
{
auto kv = parse_etcd_kv(data["kvs"][0]);
if (kv.value.is_object())
{
global_config = kv.value.object_items();
}
}
on_load_config_hook(global_config);
});
}
void etcd_state_client_t::load_pgs()
{
json11::Json::array txn = {
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/config/pgs") },
} }
},
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/pg/history/") },
{ "range_end", base64_encode(etcd_prefix+"/pg/history0") },
} }
},
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/pg/state/") },
{ "range_end", base64_encode(etcd_prefix+"/pg/state0") },
} }
},
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/osd/state/") },
{ "range_end", base64_encode(etcd_prefix+"/osd/state0") },
} }
},
};
json11::Json::object req = { { "success", txn } };
json11::Json checks = load_pgs_checks_hook != NULL ? load_pgs_checks_hook() : json11::Json();
if (checks.array_items().size() > 0)
{
req["compare"] = checks;
}
etcd_txn(req, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json data)
{
if (err != "")
{
printf("Error loading PGs from etcd: %s\n", err.c_str());
tfd->set_timer(ETCD_SLOW_TIMEOUT, false, [this](int timer_id)
{
load_pgs();
});
return;
}
if (!data["succeeded"].bool_value())
{
on_load_pgs_hook(false);
return;
}
for (auto & res: data["responses"].array_items())
{
for (auto & kv_json: res["response_range"]["kvs"].array_items())
{
auto kv = parse_etcd_kv(kv_json);
parse_state(kv.key, kv.value);
}
}
on_load_pgs_hook(true);
});
}
void etcd_state_client_t::parse_state(const std::string & key, const json11::Json & value)
{
if (key == etcd_prefix+"/config/pgs")
{
for (auto & pg_item: this->pg_config)
{
pg_item.second.exists = false;
}
for (auto & pg_item: value["items"].object_items())
{
pg_num_t pg_num = stoull_full(pg_item.first);
if (!pg_num)
{
printf("Bad key in PG configuration: %s (must be a number), skipped\n", pg_item.first.c_str());
continue;
}
this->pg_config[pg_num].exists = true;
this->pg_config[pg_num].pause = pg_item.second["pause"].bool_value();
this->pg_config[pg_num].primary = pg_item.second["primary"].uint64_value();
this->pg_config[pg_num].target_set.clear();
for (auto pg_osd: pg_item.second["osd_set"].array_items())
{
this->pg_config[pg_num].target_set.push_back(pg_osd.uint64_value());
}
if (this->pg_config[pg_num].target_set.size() != 3)
{
printf("Bad PG %u config format: incorrect osd_set = %s\n", pg_num, pg_item.second["osd_set"].dump().c_str());
this->pg_config[pg_num].target_set.resize(3);
this->pg_config[pg_num].pause = true;
}
}
}
else if (key.substr(0, etcd_prefix.length()+12) == etcd_prefix+"/pg/history/")
{
// <etcd_prefix>/pg/history/%d
pg_num_t pg_num = stoull_full(key.substr(etcd_prefix.length()+12));
if (!pg_num)
{
printf("Bad etcd key %s, ignoring\n", key.c_str());
}
else
{
auto & pg_cfg = this->pg_config[pg_num];
pg_cfg.target_history.clear();
pg_cfg.all_peers.clear();
// Refuse to start PG if any set of the <osd_sets> has no live OSDs
for (auto hist_item: value["osd_sets"].array_items())
{
std::vector<osd_num_t> history_set;
for (auto pg_osd: hist_item.array_items())
{
history_set.push_back(pg_osd.uint64_value());
}
pg_cfg.target_history.push_back(history_set);
}
// Include these additional OSDs when peering the PG
for (auto pg_osd: value["all_peers"].array_items())
{
pg_cfg.all_peers.push_back(pg_osd.uint64_value());
}
}
}
else if (key.substr(0, etcd_prefix.length()+10) == etcd_prefix+"/pg/state/")
{
// <etcd_prefix>/pg/state/%d
pg_num_t pg_num = stoull_full(key.substr(etcd_prefix.length()+10));
if (!pg_num)
{
printf("Bad etcd key %s, ignoring\n", key.c_str());
}
else if (value.is_null())
{
this->pg_config[pg_num].cur_primary = 0;
this->pg_config[pg_num].cur_state = 0;
}
else
{
osd_num_t cur_primary = value["primary"].uint64_value();
int state = 0;
for (auto & e: value["state"].array_items())
{
int i;
for (i = 0; i < pg_state_bit_count; i++)
{
if (e.string_value() == pg_state_names[i])
{
state = state | pg_state_bits[i];
break;
}
}
if (i >= pg_state_bit_count)
{
printf("Unexpected PG %u state keyword in etcd: %s\n", pg_num, e.dump().c_str());
return;
}
}
if (!cur_primary || !value["state"].is_array() || !state ||
(state & PG_OFFLINE) && state != PG_OFFLINE ||
(state & PG_PEERING) && state != PG_PEERING ||
(state & PG_INCOMPLETE) && state != PG_INCOMPLETE)
{
printf("Unexpected PG %u state in etcd: primary=%lu, state=%s\n", pg_num, cur_primary, value["state"].dump().c_str());
return;
}
this->pg_config[pg_num].cur_primary = cur_primary;
this->pg_config[pg_num].cur_state = state;
}
}
else if (key.substr(0, etcd_prefix.length()+11) == etcd_prefix+"/osd/state/")
{
// <etcd_prefix>/osd/state/%d
osd_num_t peer_osd = std::stoull(key.substr(etcd_prefix.length()+11));
if (peer_osd > 0)
{
if (value.is_object() && value["state"] == "up" &&
value["addresses"].is_array() &&
value["port"].int64_value() > 0 && value["port"].int64_value() < 65536)
{
this->peer_states[peer_osd] = value;
}
else
{
this->peer_states.erase(peer_osd);
}
if (on_change_osd_state_hook != NULL)
{
on_change_osd_state_hook(peer_osd);
}
}
}
}

View File

@@ -1,61 +0,0 @@
#pragma once
#include "osd_id.h"
#include "http_client.h"
#include "timerfd_manager.h"
#define ETCD_CONFIG_WATCH_ID 1
#define ETCD_PG_STATE_WATCH_ID 2
#define ETCD_PG_HISTORY_WATCH_ID 3
#define ETCD_OSD_STATE_WATCH_ID 4
#define MAX_ETCD_ATTEMPTS 5
#define ETCD_SLOW_TIMEOUT 5000
#define ETCD_QUICK_TIMEOUT 1000
struct pg_config_t
{
bool exists;
osd_num_t primary;
std::vector<osd_num_t> target_set;
std::vector<std::vector<osd_num_t>> target_history;
std::vector<osd_num_t> all_peers;
bool pause;
osd_num_t cur_primary;
int cur_state;
};
struct json_kv_t
{
std::string key;
json11::Json value;
};
struct etcd_state_client_t
{
std::vector<std::string> etcd_addresses;
std::string etcd_prefix;
int log_level = 0;
timerfd_manager_t *tfd = NULL;
int etcd_watches_initialised = 0;
uint64_t etcd_watch_revision = 0;
websocket_t *etcd_watch_ws = NULL;
std::map<pg_num_t, pg_config_t> pg_config;
std::map<osd_num_t, json11::Json> peer_states;
std::function<void(json11::Json::object &)> on_change_hook;
std::function<void(json11::Json::object &)> on_load_config_hook;
std::function<json11::Json()> load_pgs_checks_hook;
std::function<void(bool)> on_load_pgs_hook;
std::function<void(uint64_t)> on_change_osd_state_hook;
json_kv_t parse_etcd_kv(const json11::Json & kv_json);
void etcd_call(std::string api, json11::Json payload, int timeout, std::function<void(std::string, json11::Json)> callback);
void etcd_txn(json11::Json txn, int timeout, std::function<void(std::string, json11::Json)> callback);
void start_etcd_watcher();
void load_global_config();
void load_pgs();
void parse_state(const std::string & key, const json11::Json & value);
void parse_config(json11::Json & config);
};

View File

@@ -1,298 +0,0 @@
// FIO engine to test cluster I/O
//
// Random write:
//
// fio -thread -ioengine=./libfio_cluster.so -name=test -bs=4k -direct=1 -fsync=16 -iodepth=16 -rw=randwrite \
// -etcd=127.0.0.1:2379 [-etcd_prefix=/microceph] -size=1000M
//
// Linear write:
//
// fio -thread -ioengine=./libfio_cluster.so -name=test -bs=128k -direct=1 -fsync=32 -iodepth=32 -rw=write \
// -etcd=127.0.0.1:2379 [-etcd_prefix=/microceph] -size=1000M
//
// Random read (run with -iodepth=32 or -iodepth=1):
//
// fio -thread -ioengine=./libfio_cluster.so -name=test -bs=4k -direct=1 -iodepth=32 -rw=randread \
// -etcd=127.0.0.1:2379 [-etcd_prefix=/microceph] -size=1000M
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <vector>
#include <unordered_map>
#include "epoll_manager.h"
#include "cluster_client.h"
extern "C" {
#define CONFIG_HAVE_GETTID
#define CONFIG_PWRITEV2
#include "fio/fio.h"
#include "fio/optgroup.h"
}
struct sec_data
{
ring_loop_t *ringloop = NULL;
epoll_manager_t *epmgr = NULL;
cluster_client_t *cli = NULL;
bool last_sync = false;
/* The list of completed io_u structs. */
std::vector<io_u*> completed;
uint64_t op_n = 0, inflight = 0;
bool trace = false;
};
struct sec_options
{
int __pad;
char *etcd_host = NULL;
char *etcd_prefix = NULL;
int inode = 0;
int trace = 0;
};
static struct fio_option options[] = {
{
.name = "etcd",
.lname = "etcd address",
.type = FIO_OPT_STR_STORE,
.off1 = offsetof(struct sec_options, etcd_host),
.help = "etcd address in the form HOST:PORT[/PATH]",
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_FILENAME,
},
{
.name = "etcd",
.lname = "etcd key prefix",
.type = FIO_OPT_STR_STORE,
.off1 = offsetof(struct sec_options, etcd_prefix),
.help = "etcd key prefix, by default /microceph",
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_FILENAME,
},
{
.name = "inode",
.lname = "inode to run tests on",
.type = FIO_OPT_INT,
.off1 = offsetof(struct sec_options, inode),
.help = "inode to run tests on (1 by default)",
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_FILENAME,
},
{
.name = "osd_trace",
.lname = "OSD trace",
.type = FIO_OPT_BOOL,
.off1 = offsetof(struct sec_options, trace),
.help = "Trace OSD operations",
.def = "0",
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_FILENAME,
},
{
.name = NULL,
},
};
static int sec_setup(struct thread_data *td)
{
sec_data *bsd;
bsd = new sec_data;
if (!bsd)
{
td_verror(td, errno, "calloc");
return 1;
}
td->io_ops_data = bsd;
if (!td->files_index)
{
add_file(td, "osd_cluster", 0, 0);
td->o.nr_files = td->o.nr_files ? : 1;
td->o.open_files++;
}
return 0;
}
static void sec_cleanup(struct thread_data *td)
{
sec_data *bsd = (sec_data*)td->io_ops_data;
if (bsd)
{
delete bsd->cli;
delete bsd->epmgr;
delete bsd->ringloop;
bsd->cli = NULL;
bsd->epmgr = NULL;
bsd->ringloop = NULL;
}
}
/* Connect to the server from each thread. */
static int sec_init(struct thread_data *td)
{
sec_options *o = (sec_options*)td->eo;
sec_data *bsd = (sec_data*)td->io_ops_data;
json11::Json cfg = json11::Json::object {
{ "etcd_address", std::string(o->etcd_host) },
{ "etcd_prefix", std::string(o->etcd_prefix ? o->etcd_prefix : "/microceph") },
};
bsd->ringloop = new ring_loop_t(512);
bsd->epmgr = new epoll_manager_t(bsd->ringloop);
bsd->cli = new cluster_client_t(bsd->ringloop, bsd->epmgr->tfd, cfg);
bsd->trace = o->trace ? true : false;
return 0;
}
/* Begin read or write request. */
static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
{
sec_options *opt = (sec_options*)td->eo;
sec_data *bsd = (sec_data*)td->io_ops_data;
int n = bsd->op_n;
fio_ro_check(td, io);
if (io->ddir == DDIR_SYNC && bsd->last_sync)
{
return FIO_Q_COMPLETED;
}
io->engine_data = bsd;
cluster_op_t *op = new cluster_op_t;
switch (io->ddir)
{
case DDIR_READ:
op->opcode = OSD_OP_READ;
op->inode = opt->inode;
op->offset = io->offset;
op->len = io->xfer_buflen;
op->buf = io->xfer_buf;
bsd->last_sync = false;
break;
case DDIR_WRITE:
op->opcode = OSD_OP_WRITE;
op->inode = opt->inode;
op->offset = io->offset;
op->len = io->xfer_buflen;
op->buf = io->xfer_buf;
bsd->last_sync = false;
break;
case DDIR_SYNC:
op->opcode = OSD_OP_SYNC;
bsd->last_sync = true;
break;
default:
io->error = EINVAL;
return FIO_Q_COMPLETED;
}
op->callback = [io, n](cluster_op_t *op)
{
io->error = op->retval < 0 ? -op->retval : 0;
sec_data *bsd = (sec_data*)io->engine_data;
bsd->inflight--;
bsd->completed.push_back(io);
if (bsd->trace)
{
printf("--- %s n=%d retval=%d\n", io->ddir == DDIR_READ ? "READ" :
(io->ddir == DDIR_WRITE ? "WRITE" : "SYNC"), n, op->retval);
}
delete op;
};
if (opt->trace)
{
printf("+++ %s # %d\n", io->ddir == DDIR_READ ? "READ" :
(io->ddir == DDIR_WRITE ? "WRITE" : "SYNC"), n);
}
io->error = 0;
bsd->inflight++;
bsd->op_n++;
bsd->cli->execute(op);
if (io->error != 0)
return FIO_Q_COMPLETED;
return FIO_Q_QUEUED;
}
static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int max, const struct timespec *t)
{
sec_data *bsd = (sec_data*)td->io_ops_data;
while (true)
{
bsd->ringloop->loop();
if (bsd->completed.size() >= min)
break;
bsd->ringloop->wait();
}
return bsd->completed.size();
}
static struct io_u *sec_event(struct thread_data *td, int event)
{
sec_data *bsd = (sec_data*)td->io_ops_data;
if (bsd->completed.size() == 0)
return NULL;
/* FIXME We ignore the event number and assume fio calls us exactly once for [0..nr_events-1] */
struct io_u *ev = bsd->completed.back();
bsd->completed.pop_back();
return ev;
}
static int sec_io_u_init(struct thread_data *td, struct io_u *io)
{
io->engine_data = NULL;
return 0;
}
static void sec_io_u_free(struct thread_data *td, struct io_u *io)
{
}
static int sec_open_file(struct thread_data *td, struct fio_file *f)
{
return 0;
}
static int sec_invalidate(struct thread_data *td, struct fio_file *f)
{
return 0;
}
struct ioengine_ops ioengine = {
.name = "microceph_cluster",
.version = FIO_IOOPS_VERSION,
.flags = FIO_MEMALIGN | FIO_DISKLESSIO | FIO_NOEXTEND,
.setup = sec_setup,
.init = sec_init,
.queue = sec_queue,
.getevents = sec_getevents,
.event = sec_event,
.cleanup = sec_cleanup,
.open_file = sec_open_file,
.invalidate = sec_invalidate,
.io_u_init = sec_io_u_init,
.io_u_free = sec_io_u_free,
.option_struct_size = sizeof(struct sec_options),
.options = options,
};
static void fio_init fio_sec_register(void)
{
register_ioengine(&ioengine);
}
static void fio_exit fio_sec_unregister(void)
{
unregister_ioengine(&ioengine);
}

View File

@@ -23,7 +23,6 @@
#include "blockstore.h"
extern "C" {
#define CONFIG_HAVE_GETTID
#define CONFIG_PWRITEV2
#include "fio/fio.h"
#include "fio/optgroup.h"
@@ -101,7 +100,7 @@ static void bs_cleanup(struct thread_data *td)
bsd->ringloop->loop();
if (bsd->bs->is_safe_to_stop())
goto safe;
} while (bsd->ringloop->has_work());
} while (bsd->ringloop->get_loop_again());
bsd->ringloop->wait();
}
safe:

View File

@@ -5,7 +5,7 @@
// Random write:
//
// fio -thread -ioengine=./libfio_sec_osd.so -name=test -bs=4k -direct=1 -fsync=16 -iodepth=16 -rw=randwrite \
// -host=127.0.0.1 -port=11203 [-block_size_order=17] [-single_primary=1] -size=1000M
// -host=127.0.0.1 -port=11203 [-single_primary=1] -size=1000M
//
// Linear write:
//
@@ -28,7 +28,6 @@
#include "rw_blocking.h"
#include "osd_ops.h"
extern "C" {
#define CONFIG_HAVE_GETTID
#define CONFIG_PWRITEV2
#include "fio/fio.h"
#include "fio/optgroup.h"
@@ -53,7 +52,6 @@ struct sec_options
int port = 0;
int single_primary = 0;
int trace = 0;
int block_order = 17;
};
static struct fio_option options[] = {
@@ -75,15 +73,6 @@ static struct fio_option options[] = {
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_FILENAME,
},
{
.name = "block_size_order",
.lname = "Blockstore block size order",
.type = FIO_OPT_INT,
.off1 = offsetof(struct sec_options, block_order),
.help = "Blockstore block size order (size = 2^order)",
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_FILENAME,
},
{
.name = "single_primary",
.lname = "Single Primary",
@@ -150,8 +139,6 @@ static int sec_init(struct thread_data *td)
{
sec_options *o = (sec_options*)td->eo;
sec_data *bsd = (sec_data*)td->io_ops_data;
bsd->block_order = o->block_order == 0 ? 17 : o->block_order;
bsd->block_size = 1 << o->block_order;
struct sockaddr_in addr;
int r;

View File

@@ -1,680 +0,0 @@
#include <netinet/tcp.h>
#include <sys/epoll.h>
#include <net/if.h>
#include <arpa/inet.h>
#include <ifaddrs.h>
#include <ctype.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include "json11/json11.hpp"
#include "http_client.h"
#include "timerfd_manager.h"
#define READ_BUFFER_SIZE 9000
static int extract_port(std::string & host);
static std::string strtolower(const std::string & in);
static std::string trim(const std::string & in);
static std::string ws_format_frame(int type, uint64_t size);
static bool ws_parse_frame(std::string & buf, int & type, std::string & res);
// FIXME: Use keepalive
struct http_co_t
{
timerfd_manager_t *tfd;
int request_timeout = 0;
std::string host;
std::string request;
std::string ws_outbox;
std::string response;
bool want_streaming;
http_response_t parsed;
uint64_t target_response_size = 0;
int state = 0;
int peer_fd = -1;
int timeout_id = -1;
int epoll_events = 0;
int sent = 0;
std::vector<char> rbuf;
iovec read_iov, send_iov;
msghdr read_msg = { 0 }, send_msg = { 0 };
std::function<void(const http_response_t*)> callback;
websocket_t ws;
int onstack = 0;
bool ended = false;
~http_co_t();
inline void stackin() { onstack++; }
inline void stackout() { onstack--; if (!onstack && ended) end(); }
inline void end() { ended = true; if (!onstack) { delete this; } }
void start_connection();
void handle_events();
void handle_connect_result();
void submit_read();
void submit_send();
bool handle_read();
void post_message(int type, const std::string & msg);
};
#define HTTP_CO_CONNECTING 1
#define HTTP_CO_SENDING_REQUEST 2
#define HTTP_CO_REQUEST_SENT 3
#define HTTP_CO_HEADERS_RECEIVED 4
#define HTTP_CO_WEBSOCKET 5
#define HTTP_CO_CHUNKED 6
#define DEFAULT_TIMEOUT 5000
void http_request(timerfd_manager_t *tfd, const std::string & host, const std::string & request,
const http_options_t & options, std::function<void(const http_response_t *response)> callback)
{
http_co_t *handler = new http_co_t();
handler->request_timeout = options.timeout < 0 ? 0 : (options.timeout == 0 ? DEFAULT_TIMEOUT : options.timeout);
handler->want_streaming = options.want_streaming;
handler->tfd = tfd;
handler->host = host;
handler->request = request;
handler->callback = callback;
handler->ws.co = handler;
handler->start_connection();
}
void http_request_json(timerfd_manager_t *tfd, const std::string & host, const std::string & request,
int timeout, std::function<void(std::string, json11::Json r)> callback)
{
http_request(tfd, host, request, { .timeout = timeout }, [callback](const http_response_t* res)
{
if (res->error_code != 0)
{
callback("Error code: "+std::to_string(res->error_code)+" ("+std::string(strerror(res->error_code))+")", json11::Json());
return;
}
if (res->status_code != 200)
{
callback("HTTP "+std::to_string(res->status_code)+" "+res->status_line+" body: "+trim(res->body), json11::Json());
return;
}
std::string json_err;
json11::Json data = json11::Json::parse(res->body, json_err);
if (json_err != "")
{
callback("Bad JSON: "+json_err+" (response: "+trim(res->body)+")", json11::Json());
return;
}
callback(std::string(), data);
});
}
websocket_t* open_websocket(timerfd_manager_t *tfd, const std::string & host, const std::string & path,
int timeout, std::function<void(const http_response_t *msg)> callback)
{
std::string request = "GET "+path+" HTTP/1.1\r\n"
"Host: "+host+"\r\n"
"Upgrade: websocket\r\n"
"Connection: upgrade\r\n"
"Sec-WebSocket-Key: x3JJHMbDL1EzLkh9GBhXDw==\r\n"
"Sec-WebSocket-Version: 13\r\n"
"\r\n";
http_co_t *handler = new http_co_t();
handler->request_timeout = timeout < 0 ? -1 : (timeout == 0 ? DEFAULT_TIMEOUT : timeout);
handler->want_streaming = false;
handler->tfd = tfd;
handler->host = host;
handler->request = request;
handler->callback = callback;
handler->ws.co = handler;
handler->start_connection();
return &handler->ws;
}
void websocket_t::post_message(int type, const std::string & msg)
{
co->post_message(type, msg);
}
void websocket_t::close()
{
co->end();
}
http_co_t::~http_co_t()
{
if (timeout_id >= 0)
{
tfd->clear_timer(timeout_id);
timeout_id = -1;
}
if (peer_fd >= 0)
{
tfd->set_fd_handler(peer_fd, NULL);
close(peer_fd);
peer_fd = -1;
}
if (parsed.headers["transfer-encoding"] == "chunked")
{
int prev = 0, pos = 0;
while ((pos = response.find("\r\n", prev)) >= prev)
{
uint64_t len = strtoull(response.c_str()+prev, NULL, 16);
parsed.body += response.substr(pos+2, len);
prev = pos+2+len+2;
}
}
else
{
std::swap(parsed.body, response);
}
parsed.eof = true;
callback(&parsed);
}
void http_co_t::start_connection()
{
stackin();
int port = extract_port(host);
struct sockaddr_in addr;
int r;
if ((r = inet_pton(AF_INET, host.c_str(), &addr.sin_addr)) != 1)
{
parsed.error_code = ENXIO;
stackout();
end();
return;
}
addr.sin_family = AF_INET;
addr.sin_port = htons(port ? port : 80);
peer_fd = socket(AF_INET, SOCK_STREAM, 0);
if (peer_fd < 0)
{
parsed.error_code = errno;
stackout();
end();
return;
}
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
if (request_timeout > 0)
{
timeout_id = tfd->set_timer(request_timeout, false, [this](int timer_id)
{
if (response.length() == 0)
{
parsed.error_code = ETIME;
}
end();
});
}
epoll_events = 0;
// Finally call connect
r = ::connect(peer_fd, (sockaddr*)&addr, sizeof(addr));
if (r < 0 && errno != EINPROGRESS)
{
parsed.error_code = errno;
stackout();
end();
return;
}
tfd->set_fd_handler(peer_fd, [this](int peer_fd, int epoll_events)
{
this->epoll_events |= epoll_events;
handle_events();
});
state = HTTP_CO_CONNECTING;
stackout();
}
void http_co_t::handle_events()
{
stackin();
while (epoll_events)
{
if (state == HTTP_CO_CONNECTING)
{
handle_connect_result();
}
else
{
epoll_events &= ~EPOLLOUT;
if (epoll_events & EPOLLIN)
{
submit_read();
}
else if (epoll_events & (EPOLLRDHUP|EPOLLERR))
{
end();
break;
}
}
}
stackout();
}
void http_co_t::handle_connect_result()
{
stackin();
int result = 0;
socklen_t result_len = sizeof(result);
if (getsockopt(peer_fd, SOL_SOCKET, SO_ERROR, &result, &result_len) < 0)
{
result = errno;
}
if (result != 0)
{
parsed.error_code = result;
stackout();
end();
return;
}
int one = 1;
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
state = HTTP_CO_SENDING_REQUEST;
submit_send();
stackout();
}
void http_co_t::submit_read()
{
stackin();
int res;
if (rbuf.size() != READ_BUFFER_SIZE)
{
rbuf.resize(READ_BUFFER_SIZE);
}
read_iov = { .iov_base = rbuf.data(), .iov_len = READ_BUFFER_SIZE };
read_msg.msg_iov = &read_iov;
read_msg.msg_iovlen = 1;
res = recvmsg(peer_fd, &read_msg, 0);
if (res < 0)
{
res = -errno;
}
if (res == -EAGAIN || res == 0)
{
epoll_events = epoll_events & ~EPOLLIN;
}
else if (res < 0)
{
end();
}
else if (res > 0)
{
response += std::string(rbuf.data(), res);
handle_read();
}
stackout();
}
void http_co_t::submit_send()
{
stackin();
int res;
again:
if (sent < request.size())
{
send_iov = (iovec){ .iov_base = (void*)(request.c_str()+sent), .iov_len = request.size()-sent };
send_msg.msg_iov = &send_iov;
send_msg.msg_iovlen = 1;
res = sendmsg(peer_fd, &send_msg, MSG_NOSIGNAL);
if (res < 0)
{
res = -errno;
}
if (res == -EAGAIN)
{
res = 0;
}
else if (res < 0)
{
stackout();
end();
return;
}
sent += res;
if (state == HTTP_CO_SENDING_REQUEST)
{
if (sent >= request.size())
{
state = HTTP_CO_REQUEST_SENT;
}
else
goto again;
}
else if (state == HTTP_CO_WEBSOCKET)
{
request = request.substr(sent);
sent = 0;
goto again;
}
}
stackout();
}
bool http_co_t::handle_read()
{
stackin();
if (state == HTTP_CO_REQUEST_SENT)
{
int pos = response.find("\r\n\r\n");
if (pos >= 0)
{
if (timeout_id >= 0)
{
tfd->clear_timer(timeout_id);
timeout_id = -1;
}
state = HTTP_CO_HEADERS_RECEIVED;
parse_http_headers(response, &parsed);
if (parsed.status_code == 101 &&
parsed.headers.find("sec-websocket-accept") != parsed.headers.end() &&
parsed.headers["upgrade"] == "websocket" &&
parsed.headers["connection"] == "upgrade")
{
// Don't care about validating the key
state = HTTP_CO_WEBSOCKET;
request = ws_outbox;
ws_outbox = "";
sent = 0;
submit_send();
}
else if (parsed.headers["transfer-encoding"] == "chunked")
{
state = HTTP_CO_CHUNKED;
}
else if (parsed.headers["connection"] != "close")
{
target_response_size = stoull_full(parsed.headers["content-length"]);
if (!target_response_size)
{
// Sorry, unsupported response
stackout();
end();
return false;
}
}
}
}
if (state == HTTP_CO_HEADERS_RECEIVED && target_response_size > 0 && response.size() >= target_response_size)
{
stackout();
end();
return false;
}
if (state == HTTP_CO_CHUNKED && response.size() > 0)
{
int prev = 0, pos = 0;
while ((pos = response.find("\r\n", prev)) >= prev)
{
uint64_t len = strtoull(response.c_str()+prev, NULL, 16);
if (!len)
{
// Zero length chunk indicates EOF
parsed.eof = true;
break;
}
if (response.size() < pos+2+len+2)
{
break;
}
parsed.body += response.substr(pos+2, len);
prev = pos+2+len+2;
}
if (prev > 0)
{
response = response.substr(prev);
}
if (parsed.eof)
{
stackout();
end();
return false;
}
if (want_streaming && parsed.body.size() > 0)
{
callback(&parsed);
parsed.body = "";
}
}
if (state == HTTP_CO_WEBSOCKET && response.size() > 0)
{
while (ws_parse_frame(response, parsed.ws_msg_type, parsed.body))
{
callback(&parsed);
parsed.body = "";
}
}
stackout();
return true;
}
void http_co_t::post_message(int type, const std::string & msg)
{
stackin();
if (state == HTTP_CO_WEBSOCKET)
{
request += ws_format_frame(type, msg.size());
request += msg;
submit_send();
}
else
{
ws_outbox += ws_format_frame(type, msg.size());
ws_outbox += msg;
}
stackout();
}
uint64_t stoull_full(const std::string & str, int base)
{
if (isspace(str[0]))
{
return 0;
}
char *end = NULL;
uint64_t r = strtoull(str.c_str(), &end, base);
if (end != str.c_str()+str.length())
{
return 0;
}
return r;
}
void parse_http_headers(std::string & res, http_response_t *parsed)
{
int pos = res.find("\r\n");
pos = pos < 0 ? res.length() : pos+2;
std::string status_line = res.substr(0, pos);
int http_version;
char *status_text = NULL;
sscanf(status_line.c_str(), "HTTP/1.%d %d %ms", &http_version, &parsed->status_code, &status_text);
if (status_text)
{
parsed->status_line = status_text;
// %ms = allocate a buffer
free(status_text);
status_text = NULL;
}
int prev = pos;
while ((pos = res.find("\r\n", prev)) >= prev)
{
if (pos == prev)
{
res = res.substr(pos+2);
break;
}
std::string header = res.substr(prev, pos-prev);
int p2 = header.find(":");
if (p2 >= 0)
{
std::string key = strtolower(header.substr(0, p2));
int p3 = p2+1;
while (p3 < header.length() && isblank(header[p3]))
p3++;
parsed->headers[key] = key == "connection" || key == "upgrade" || key == "transfer-encoding"
? strtolower(header.substr(p3)) : header.substr(p3);
}
prev = pos+2;
}
}
static std::string ws_format_frame(int type, uint64_t size)
{
// Always zero mask
std::string res;
int p = 0;
res.resize(2 + (size >= 126 ? 2 : 0) + (size >= 65536 ? 6 : 0) + /*mask*/4);
res[p++] = 0x80 | type;
if (size < 126)
res[p++] = size | /*mask*/0x80;
else if (size < 65536)
{
res[p++] = 126 | /*mask*/0x80;
res[p++] = (size >> 8) & 0xFF;
res[p++] = (size >> 0) & 0xFF;
}
else
{
res[p++] = 127 | /*mask*/0x80;
res[p++] = (size >> 56) & 0xFF;
res[p++] = (size >> 48) & 0xFF;
res[p++] = (size >> 40) & 0xFF;
res[p++] = (size >> 32) & 0xFF;
res[p++] = (size >> 24) & 0xFF;
res[p++] = (size >> 16) & 0xFF;
res[p++] = (size >> 8) & 0xFF;
res[p++] = (size >> 0) & 0xFF;
}
res[p++] = 0;
res[p++] = 0;
res[p++] = 0;
res[p++] = 0;
return res;
}
static bool ws_parse_frame(std::string & buf, int & type, std::string & res)
{
uint64_t hdr = 2;
if (buf.size() < hdr)
{
return false;
}
type = buf[0] & ~0x80;
bool mask = !!(buf[1] & 0x80);
hdr += mask ? 4 : 0;
uint64_t len = ((uint8_t)buf[1] & ~0x80);
if (len == 126)
{
hdr += 2;
if (buf.size() < hdr)
{
return false;
}
len = ((uint64_t)(uint8_t)buf[2] << 8) | ((uint64_t)(uint8_t)buf[3] << 0);
}
else if (len == 127)
{
hdr += 8;
if (buf.size() < hdr)
{
return false;
}
len = ((uint64_t)(uint8_t)buf[2] << 56) |
((uint64_t)(uint8_t)buf[3] << 48) |
((uint64_t)(uint8_t)buf[4] << 40) |
((uint64_t)(uint8_t)buf[5] << 32) |
((uint64_t)(uint8_t)buf[6] << 24) |
((uint64_t)(uint8_t)buf[7] << 16) |
((uint64_t)(uint8_t)buf[8] << 8) |
((uint64_t)(uint8_t)buf[9] << 0);
}
if (buf.size() < hdr+len)
{
return false;
}
if (mask)
{
for (int i = 0; i < len; i++)
buf[hdr+i] ^= buf[hdr-4+(i & 3)];
}
res += buf.substr(hdr, len);
buf = buf.substr(hdr+len);
return true;
}
std::vector<std::string> getifaddr_list(bool include_v6)
{
std::vector<std::string> addresses;
ifaddrs *list, *ifa;
if (getifaddrs(&list) == -1)
{
throw std::runtime_error(std::string("getifaddrs: ") + strerror(errno));
}
for (ifa = list; ifa != NULL; ifa = ifa->ifa_next)
{
if (!ifa->ifa_addr)
{
continue;
}
int family = ifa->ifa_addr->sa_family;
if ((family == AF_INET || family == AF_INET6 && include_v6) &&
(ifa->ifa_flags & (IFF_UP | IFF_RUNNING | IFF_LOOPBACK)) == (IFF_UP | IFF_RUNNING))
{
void *addr_ptr;
if (family == AF_INET)
addr_ptr = &((sockaddr_in *)ifa->ifa_addr)->sin_addr;
else
addr_ptr = &((sockaddr_in6 *)ifa->ifa_addr)->sin6_addr;
char addr[INET6_ADDRSTRLEN];
if (!inet_ntop(family, addr_ptr, addr, INET6_ADDRSTRLEN))
{
throw std::runtime_error(std::string("inet_ntop: ") + strerror(errno));
}
addresses.push_back(std::string(addr));
}
}
freeifaddrs(list);
return addresses;
}
static int extract_port(std::string & host)
{
int port = 0;
int pos = 0;
if ((pos = host.find(':')) >= 0)
{
port = strtoull(host.c_str() + pos + 1, NULL, 10);
if (port >= 0x10000)
{
port = 0;
}
host = host.substr(0, pos);
}
return port;
}
static std::string strtolower(const std::string & in)
{
std::string s = in;
for (int i = 0; i < s.length(); i++)
{
s[i] = tolower(s[i]);
}
return s;
}
static std::string trim(const std::string & in)
{
int begin = in.find_first_not_of(" \n\r\t");
if (begin == -1)
return "";
int end = in.find_last_not_of(" \n\r\t");
return in.substr(begin, end+1-begin);
}

View File

@@ -1,56 +0,0 @@
#pragma once
#include <string>
#include <vector>
#include <map>
#include <functional>
#include "json11/json11.hpp"
#define WS_CONTINUATION 0
#define WS_TEXT 1
#define WS_BINARY 2
#define WS_CLOSE 8
#define WS_PING 9
#define WS_PONG 10
class timerfd_manager_t;
struct http_options_t
{
int timeout;
bool want_streaming;
};
struct http_response_t
{
bool eof = false;
int error_code = 0;
int status_code = 0;
std::string status_line;
std::map<std::string, std::string> headers;
int ws_msg_type = -1;
std::string body;
};
struct http_co_t;
struct websocket_t
{
http_co_t *co;
void post_message(int type, const std::string & msg);
void close();
};
void parse_http_headers(std::string & res, http_response_t *parsed);
std::vector<std::string> getifaddr_list(bool include_v6 = false);
uint64_t stoull_full(const std::string & str, int base = 10);
void http_request(timerfd_manager_t *tfd, const std::string & host, const std::string & request,
const http_options_t & options, std::function<void(const http_response_t *response)> callback);
void http_request_json(timerfd_manager_t *tfd, const std::string & host, const std::string & request,
int timeout, std::function<void(std::string, json11::Json r)> callback);
websocket_t* open_websocket(timerfd_manager_t *tfd, const std::string & host, const std::string & path,
int timeout, std::function<void(const http_response_t *msg)> callback);

View File

@@ -1,521 +0,0 @@
// Data distribution optimizer using linear programming (lp_solve)
const child_process = require('child_process');
const NO_OSD = 'Z';
async function lp_solve(text)
{
const cp = child_process.spawn('lp_solve');
let stdout = '', stderr = '', finish_cb;
cp.stdout.on('data', buf => stdout += buf.toString());
cp.stderr.on('data', buf => stderr += buf.toString());
cp.on('exit', () => finish_cb && finish_cb());
cp.stdin.write(text);
cp.stdin.end();
if (cp.exitCode == null)
{
await new Promise(ok => finish_cb = ok);
}
if (!stdout.trim())
{
return null;
}
let score = 0;
let vars = {};
for (const line of stdout.split(/\n/))
{
let m = /^(^Value of objective function: ([\d\.]+)|Actual values of the variables:)\s*$/.exec(line);
if (m)
{
if (m[2])
{
score = m[2];
}
continue;
}
else if (/This problem is (infeasible|unbounded)/.exec(line))
{
return null;
}
let [ k, v ] = line.trim().split(/\s+/, 2);
if (v)
{
vars[k] = v;
}
}
return { score, vars };
}
async function optimize_initial(osd_tree, pg_count, max_combinations)
{
max_combinations = max_combinations || 10000;
const all_weights = Object.assign({}, ...Object.values(osd_tree));
const total_weight = Object.values(all_weights).reduce((a, c) => Number(a) + Number(c), 0);
let all_pgs = all_combinations(osd_tree, null, true);
if (all_pgs.length > max_combinations)
{
const prob = max_combinations/all_pgs.length;
all_pgs = all_pgs.filter(pg => Math.random() < prob);
}
const pg_per_osd = {};
for (const pg of all_pgs)
{
for (const osd of pg)
{
pg_per_osd[osd] = pg_per_osd[osd] || [];
pg_per_osd[osd].push("pg_"+pg.join("_"));
}
}
const pg_size = Math.min(Object.keys(osd_tree).length, 3);
let lp = '';
lp += "max: "+all_pgs.map(pg => 'pg_'+pg.join('_')).join(' + ')+";\n";
for (const osd in pg_per_osd)
{
if (osd !== NO_OSD)
{
let osd_pg_count = all_weights[osd]/total_weight*pg_size*pg_count;
lp += pg_per_osd[osd].join(' + ')+' <= '+osd_pg_count+';\n';
}
}
for (const pg of all_pgs)
{
lp += 'pg_'+pg.join('_')+" >= 0;\n";
}
lp += "sec "+all_pgs.map(pg => 'pg_'+pg.join('_')).join(', ')+";\n";
const lp_result = await lp_solve(lp);
if (!lp_result)
{
throw new Error('Problem is infeasible or unbounded - is it a bug?');
}
const int_pgs = make_int_pgs(lp_result.vars, pg_count);
const eff = pg_list_space_efficiency(int_pgs, all_weights);
return { score: lp_result.score, weights: lp_result.vars, int_pgs, space: eff*pg_size, total_space: total_weight };
}
function make_int_pgs(weights, pg_count)
{
const total_weight = Object.values(weights).reduce((a, c) => Number(a) + Number(c), 0);
let int_pgs = [];
let pg_left = pg_count;
let weight_left = total_weight;
for (const pg_name in weights)
{
let n = Math.round(weights[pg_name] / weight_left * pg_left);
for (let i = 0; i < n; i++)
{
int_pgs.push(pg_name.substr(3).split('_'));
}
weight_left -= weights[pg_name];
pg_left -= n;
}
return int_pgs;
}
// Try to minimize data movement
async function optimize_change(prev_int_pgs, osd_tree, max_combinations)
{
max_combinations = max_combinations || 10000;
const pg_size = Math.min(Object.keys(osd_tree).length, 3);
const pg_count = prev_int_pgs.length;
const prev_weights = {};
const prev_pg_per_osd = {};
for (const pg of prev_int_pgs)
{
const pg_name = 'pg_'+pg.join('_');
prev_weights[pg_name] = (prev_weights[pg_name]||0) + 1;
for (const osd of pg)
{
prev_pg_per_osd[osd] = prev_pg_per_osd[osd] || [];
prev_pg_per_osd[osd].push(pg_name);
}
}
// Get all combinations
let all_pgs = all_combinations(osd_tree, null, true);
if (all_pgs.length > max_combinations)
{
const intersecting = all_pgs.filter(pg => prev_weights['pg_'+pg.join('_')]);
if (intersecting.length > max_combinations)
{
const prob = max_combinations/intersecting.length;
all_pgs = intersecting.filter(pg => Math.random() < prob);
}
else
{
const prob = (max_combinations-intersecting.length)/all_pgs.length;
all_pgs = all_pgs.filter(pg => Math.random() < prob || prev_weights['pg_'+pg.join('_')]);
}
}
const pg_per_osd = {};
for (const pg of all_pgs)
{
const pg_name = 'pg_'+pg.join('_');
for (const osd of pg)
{
pg_per_osd[osd] = pg_per_osd[osd] || [];
pg_per_osd[osd].push(pg_name);
}
}
// Penalize PGs based on their similarity to old PGs
const intersect = {};
for (const pg_name in prev_weights)
{
const pg = pg_name.substr(3).split(/_/);
intersect[pg[0]+'::'] = intersect[':'+pg[1]+':'] = intersect['::'+pg[2]] = 2;
intersect[pg[0]+'::'+pg[2]] = intersect[':'+pg[1]+':'+pg[2]] = intersect[pg[0]+':'+pg[1]+':'] = 1;
}
const move_weights = {};
for (const pg of all_pgs)
{
move_weights['pg_'+pg.join('_')] =
intersect[pg[0]+'::'+pg[2]] || intersect[':'+pg[1]+':'+pg[2]] || intersect[pg[0]+':'+pg[1]+':'] ||
intersect[pg[0]+'::'] || intersect[':'+pg[1]+':'] || intersect['::'+pg[2]] ||
3;
}
// Calculate total weight - old PG weights
const all_pg_names = all_pgs.map(pg => 'pg_'+pg.join('_'));
const all_weights = Object.assign({}, ...Object.values(osd_tree));
const total_weight = Object.values(all_weights).reduce((a, c) => Number(a) + Number(c), 0);
// Generate the LP problem
let lp = '';
lp += 'max: '+all_pg_names.map(pg_name => (
prev_weights[pg_name] ? `${4-move_weights[pg_name]}*add_${pg_name} - 4*del_${pg_name}` : `${4-move_weights[pg_name]}*${pg_name}`
)).join(' + ')+';\n';
for (const osd in pg_per_osd)
{
if (osd !== NO_OSD)
{
const osd_sum = (pg_per_osd[osd]||[]).map(pg_name => prev_weights[pg_name] ? `add_${pg_name} - del_${pg_name}` : pg_name).join(' + ');
const rm_osd_pg_count = (prev_pg_per_osd[osd]||[]).filter(old_pg_name => move_weights[old_pg_name]).length;
let osd_pg_count = all_weights[osd]*3/total_weight*pg_count - rm_osd_pg_count;
lp += osd_sum + ' <= ' + osd_pg_count + ';\n';
}
}
let pg_vars = [];
for (const pg_name of all_pg_names)
{
if (prev_weights[pg_name])
{
pg_vars.push(`add_${pg_name}`, `del_${pg_name}`);
// Can't add or remove less than zero
lp += `add_${pg_name} >= 0;\n`;
lp += `del_${pg_name} >= 0;\n`;
// Can't remove more than the PG already has
lp += `add_${pg_name} - del_${pg_name} >= -${prev_weights[pg_name]};\n`;
}
else
{
pg_vars.push(pg_name);
lp += `${pg_name} >= 0;\n`;
}
}
lp += 'sec '+pg_vars.join(', ')+';\n';
// Solve it
const lp_result = await lp_solve(lp);
if (!lp_result)
{
console.log(lp);
throw new Error('Problem is infeasible or unbounded - is it a bug?');
}
// Generate the new distribution
const weights = { ...prev_weights };
for (const k in prev_weights)
{
if (!move_weights[k])
{
delete weights[k];
}
}
for (const k in lp_result.vars)
{
if (k.substr(0, 4) === 'add_')
{
weights[k.substr(4)] = (weights[k.substr(4)] || 0) + Number(lp_result.vars[k]);
}
else if (k.substr(0, 4) === 'del_')
{
weights[k.substr(4)] = (weights[k.substr(4)] || 0) - Number(lp_result.vars[k]);
}
else
{
weights[k] = Number(lp_result.vars[k]);
}
}
for (const k in weights)
{
if (!weights[k])
{
delete weights[k];
}
}
const int_pgs = make_int_pgs(weights, pg_count);
// Align them with most similar previous PGs
const new_pgs = align_pgs(prev_int_pgs, int_pgs);
let differs = 0, osd_differs = 0;
for (let i = 0; i < pg_count; i++)
{
if (new_pgs[i].join('_') != prev_int_pgs[i].join('_'))
{
differs++;
}
for (let j = 0; j < 3; j++)
{
if (new_pgs[i][j] != prev_int_pgs[i][j])
{
osd_differs++;
}
}
}
return {
prev_pgs: prev_int_pgs,
score: lp_result.score,
weights,
int_pgs: new_pgs,
differs,
osd_differs,
space: pg_size * pg_list_space_efficiency(new_pgs, all_weights),
total_space: total_weight,
};
}
function print_change_stats(retval, detailed)
{
const new_pgs = retval.int_pgs;
const prev_int_pgs = retval.prev_pgs;
if (prev_int_pgs)
{
if (detailed)
{
for (let i = 0; i < new_pgs.length; i++)
{
if (new_pgs[i].join('_') != prev_int_pgs[i].join('_'))
{
console.log("pg "+i+": "+prev_int_pgs[i].join(' ')+" -> "+new_pgs[i].join(' '));
}
}
}
console.log(
"Data movement: "+retval.differs+" pgs, "+
retval.osd_differs+" pg*osds = "+Math.round(retval.osd_differs / prev_int_pgs.length / 3 * 10000)/100+" %"
);
}
console.log(
"Total space (raw): "+Math.round(retval.space*100)/100+" TB, space efficiency: "+
Math.round(retval.space/(retval.total_space||1)*10000)/100+" %"
);
}
function align_pgs(prev_int_pgs, int_pgs)
{
const aligned_pgs = [];
put_aligned_pgs(aligned_pgs, int_pgs, prev_int_pgs, (pg) => [ pg.join(':') ]);
put_aligned_pgs(aligned_pgs, int_pgs, prev_int_pgs, (pg) => [ pg[0]+'::'+pg[2], ':'+pg[1]+':'+pg[2], pg[0]+':'+pg[1]+':' ]);
put_aligned_pgs(aligned_pgs, int_pgs, prev_int_pgs, (pg) => [ pg[0]+'::', ':'+pg[1]+':', '::'+pg[2] ]);
const free_slots = prev_int_pgs.map((pg, i) => !aligned_pgs[i] ? i : null).filter(i => i != null);
for (const pg of int_pgs)
{
if (!free_slots.length)
{
throw new Error("Can't place unaligned PG");
}
aligned_pgs[free_slots.shift()] = pg;
}
return aligned_pgs;
}
function put_aligned_pgs(aligned_pgs, int_pgs, prev_int_pgs, keygen)
{
let prev_indexes = {};
for (let i = 0; i < prev_int_pgs.length; i++)
{
for (let k of keygen(prev_int_pgs[i]))
{
prev_indexes[k] = prev_indexes[k] || [];
prev_indexes[k].push(i);
}
}
PG: for (let i = int_pgs.length-1; i >= 0; i--)
{
let pg = int_pgs[i];
let keys = keygen(int_pgs[i]);
for (let k of keys)
{
while (prev_indexes[k] && prev_indexes[k].length)
{
let idx = prev_indexes[k].shift();
if (!aligned_pgs[idx])
{
aligned_pgs[idx] = pg;
int_pgs.splice(i, 1);
continue PG;
}
}
}
}
}
// Convert multi-level osd_tree = { level: number|string, id?: string, size?: number, children?: osd_tree }[]
// levels = { string: number }
// to a two-level osd_tree suitable for all_combinations()
function flatten_tree(osd_tree, levels, failure_domain_level, osd_level, domains = {}, i = { i: 1 })
{
osd_level = levels[osd_level] || osd_level;
failure_domain_level = levels[failure_domain_level] || failure_domain_level;
for (const node of osd_tree)
{
if ((levels[node.level] || node.level) < failure_domain_level)
{
flatten_tree(node.children||[], levels, failure_domain_level, osd_level, domains, i);
}
else
{
domains['dom'+(i.i++)] = extract_osds([ node ], levels, osd_level);
}
}
return domains;
}
function extract_osds(osd_tree, levels, osd_level, osds = {})
{
for (const node of osd_tree)
{
if ((levels[node.level] || node.level) >= osd_level)
{
osds[node.id] = node.size;
}
else
{
extract_osds(node.children||[], levels, osd_level, osds);
}
}
return osds;
}
// FIXME: support different pg_sizes, not just 3
// osd_tree = { failure_domain1: { osd1: size1, ... }, ... }
function all_combinations(osd_tree, count, ordered)
{
const hosts = Object.keys(osd_tree).sort();
const osds = Object.keys(osd_tree).reduce((a, c) => { a[c] = Object.keys(osd_tree[c]).sort(); return a; }, {});
while (hosts.length < 3)
{
osds[NO_OSD] = [ NO_OSD ];
hosts.push(NO_OSD);
}
let host_idx = [ 0, 1, 2 ];
let osd_idx = [ 0, 0, 0 ];
const r = [];
while (!count || count < 0 || r.length < count)
{
let inc;
if (host_idx[2] != host_idx[1] && host_idx[2] != host_idx[0] && host_idx[1] != host_idx[0])
{
r.push(host_idx.map((hi, i) => osds[hosts[hi]][osd_idx[i]]));
inc = 2;
while (inc >= 0)
{
osd_idx[inc]++;
if (osd_idx[inc] >= osds[hosts[host_idx[inc]]].length)
{
osd_idx[inc] = 0;
inc--;
}
else
{
break;
}
}
}
else
{
inc = -1;
}
if (inc < 0)
{
// no osds left in current host combination, select the next one
osd_idx = [ 0, 0, 0 ];
host_idx[2]++;
if (host_idx[2] >= hosts.length)
{
host_idx[1]++;
host_idx[2] = ordered ? host_idx[1]+1 : 0;
if ((ordered ? host_idx[2] : host_idx[1]) >= hosts.length)
{
host_idx[0]++;
host_idx[1] = ordered ? host_idx[0]+1 : 0;
host_idx[2] = ordered ? host_idx[1]+1 : 0;
if ((ordered ? host_idx[2] : host_idx[0]) >= hosts.length)
{
break;
}
}
}
}
}
return r;
}
function pg_weights_space_efficiency(weights, pg_count, osd_sizes)
{
const per_osd = {};
for (const pg_name in weights)
{
for (const osd of pg_name.substr(3).split(/_/))
{
per_osd[osd] = (per_osd[osd]||0) + weights[pg_name];
}
}
return pg_per_osd_space_efficiency(per_osd, pg_count, osd_sizes);
}
function pg_list_space_efficiency(pgs, osd_sizes)
{
const per_osd = {};
for (const pg of pgs)
{
for (const osd of pg)
{
per_osd[osd] = (per_osd[osd]||0) + 1;
}
}
return pg_per_osd_space_efficiency(per_osd, pgs.length, osd_sizes);
}
function pg_per_osd_space_efficiency(per_osd, pg_count, osd_sizes)
{
// each PG gets randomly selected in 1/N cases
// & there are x PGs per OSD
// => an OSD is selected in x/N cases
// => total space * x/N <= OSD size
// => total space <= OSD size * N/x
let space;
for (let osd in per_osd)
{
if (osd in osd_sizes)
{
const space_estimate = osd_sizes[osd] * pg_count / per_osd[osd];
if (space == null || space > space_estimate)
{
space = space_estimate;
}
}
}
return space == null ? 0 : space;
}
module.exports = {
NO_OSD,
optimize_initial,
optimize_change,
print_change_stats,
pg_weights_space_efficiency,
pg_list_space_efficiency,
pg_per_osd_space_efficiency,
flatten_tree,
lp_solve,
make_int_pgs,
align_pgs,
all_combinations,
};

View File

@@ -1,22 +0,0 @@
#!/usr/bin/node
const Mon = require('./mon.js');
const options = {};
for (let i = 2; i < process.argv.length; i++)
{
if (process.argv[i].substr(0, 2) == '--')
{
options[process.argv[i].substr(2)] = process.argv[i+1];
i++;
}
}
if (!options.etcd_url)
{
console.error('USAGE: '+process.argv[0]+' '+process.argv[1]+' --etcd_url "http://127.0.0.1:2379,..." --etcd_prefix "/rage" --etcd_start_timeout 5');
process.exit();
}
new Mon(options).start();

858
lp/mon.js
View File

@@ -1,858 +0,0 @@
const http = require('http');
const os = require('os');
const WebSocket = require('ws');
const LPOptimizer = require('./lp-optimizer.js');
const stableStringify = require('./stable-stringify.js');
class Mon
{
static etcd_tree = {
config: {
global: null,
/* placement_tree = {
levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... },
nodes: { host1: { level: 'host', parent: 'rack1' }, ... },
failure_domain: 'host',
} */
placement_tree: null,
osd: {},
pgs: {},
},
osd: {
state: {},
stats: {},
},
mon: {
master: null,
},
pg: {
change_stamp: null,
state: {},
stats: {},
history: {},
},
}
constructor(config)
{
// FIXME: Maybe prefer local etcd
this.etcd_urls = [];
for (let url of config.etcd_url.split(/,/))
{
let scheme = 'http';
url = url.trim().replace(/^(https?):\/\//, (m, m1) => { scheme = m1; return ''; });
if (!/\/[^\/]/.exec(url))
url += '/v3';
this.etcd_urls.push(scheme+'://'+url);
}
this.etcd_prefix = config.etcd_prefix || '/rage';
this.etcd_prefix = this.etcd_prefix.replace(/\/\/+/g, '/').replace(/^\/?(.*[^\/])\/?$/, '/$1');
this.etcd_start_timeout = (config.etcd_start_timeout || 5) * 1000;
this.state = JSON.parse(JSON.stringify(Mon.etcd_tree));
}
async start()
{
await this.load_config();
await this.get_lease();
await this.become_master();
await this.load_cluster_state();
await this.start_watcher();
await this.recheck_pgs();
}
async load_config()
{
const res = await this.etcd_call('/txn', { success: [
{ requestRange: { key: b64(this.etcd_prefix+'/config/global') } }
] }, this.etcd_start_timeout, -1);
this.parse_kv(res.responses[0].response_range.kvs[0]);
this.check_config();
}
check_config()
{
this.config.etcd_mon_timeout = Number(this.config.etcd_mon_timeout) || 0;
if (this.config.etcd_mon_timeout <= 0)
{
this.config.etcd_mon_timeout = 1000;
}
this.config.etcd_mon_retries = Number(this.config.etcd_mon_retries) || 5;
if (this.config.etcd_mon_retries < 0)
{
this.config.etcd_mon_retries = 0;
}
this.config.mon_change_timeout = Number(this.config.mon_change_timeout) || 1000;
if (this.config.mon_change_timeout < 100)
{
this.config.mon_change_timeout = 100;
}
this.config.mon_stats_timeout = Number(this.config.mon_stats_timeout) || 1000;
if (this.config.mon_stats_timeout < 100)
{
this.config.mon_stats_timeout = 100;
}
// After this number of seconds, a dead OSD will be removed from PG distribution
this.config.osd_out_time = Number(this.config.osd_out_time) || 0;
if (!this.config.osd_out_time)
{
this.config.osd_out_time = 30*60; // 30 minutes by default
}
this.config.max_osd_combinations = Number(this.config.max_osd_combinations) || 10000;
if (this.config.max_osd_combinations < 100)
{
this.config.max_osd_combinations = 100;
}
}
async start_watcher(retries)
{
let retry = 0;
if (retries >= 0 && retries < 1)
{
retries = 1;
}
while (retries < 0 || retry < retries)
{
const base = 'ws'+this.etcd_urls[Math.floor(Math.random()*this.etcd_urls.length)].substr(4);
const ok = await new Promise((ok, no) =>
{
const timer_id = setTimeout(() =>
{
this.ws.close();
ok(false);
}, timeout);
this.ws = new WebSocket(base+'/watch');
this.ws.on('open', () =>
{
if (timer_id)
clearTimeout(timer_id);
ok(true);
});
});
if (!ok)
{
this.ws = null;
}
retry++;
}
if (!this.ws)
{
this.die('Failed to open etcd watch websocket');
}
this.ws.send(JSON.stringify({
create_request: {
key: b64(this.etcd_prefix+'/'),
range_end: b64(this.etcd_prefix+'0'),
start_revision: ''+this.etcd_watch_revision,
watch_id: 1,
},
}));
this.ws.on('message', (msg) =>
{
let data;
try
{
data = JSON.parse(msg);
}
catch (e)
{
}
if (!data || !data.result || !data.result.events)
{
console.error('Garbage received from watch websocket: '+msg);
}
else
{
let stats_changed = false, changed = false;
console.log('Revision '+data.result.header.revision+' events: ');
for (const e of data.result.events)
{
this.parse_kv(e.kv);
const key = e.kv.key.substr(this.etcd_prefix.length);
if (key.substr(0, 11) == '/osd/stats/' || key.substr(0, 10) == '/pg/stats/')
{
stats_changed = true;
}
else if (key != '/stats')
{
changed = true;
}
console.log(e);
}
if (stats_changed)
{
this.schedule_update_stats();
}
if (changed)
{
this.schedule_recheck();
}
}
});
}
async get_lease()
{
const max_ttl = this.config.etcd_mon_ttl + this.config.etcd_mon_timeout/1000*this.config.etcd_mon_retries;
const res = await this.etcd_call('/lease/grant', { TTL: max_ttl }, this.config.etcd_mon_timeout, this.config.etcd_mon_retries);
this.etcd_lease_id = res.ID;
setInterval(async () =>
{
const res = await this.etcd_call('/lease/keepalive', { ID: this.etcd_lease_id }, this.config.etcd_mon_timeout, this.config.etcd_mon_retries);
if (!res.result.TTL)
{
this.die('Lease expired');
}
}, config.etcd_mon_timeout);
}
async become_master()
{
const state = { ip: this.local_ips() };
while (1)
{
const res = await this.etcd_call('/txn', {
compare: [ { target: 'CREATE', create_revision: 0, key: b64(this.etcd_prefix+'/mon/master') } ],
success: [ { key: b64(this.etcd_prefix+'/mon/master'), value: b64(JSON.stringify(state)), lease: ''+this.etcd_lease_id } ],
}, this.etcd_start_timeout, 0);
if (!res.succeeded)
{
await new Promise(ok => setTimeout(ok, this.etcd_start_timeout));
}
}
}
async load_cluster_state()
{
const res = await this.etcd_call('/txn', { success: [
{ requestRange: { key: b64(this.etcd_prefix+'/'), range_end: b64(this.etcd_prefix+'0') } },
] }, this.etcd_start_timeout, -1);
this.etcd_watch_revision = BigInt(res.header.revision)+BigInt(1);
const data = JSON.parse(JSON.stringify(Mon.etcd_tree));
for (const response of res.responses)
{
for (const kv of response.response_range.kvs)
{
this.parse_kv(kv);
}
}
this.state = data;
}
all_osds()
{
return Object.keys(this.state.osd.stats);
}
get_osd_tree()
{
this.state.config.placement_tree = this.state.config.placement_tree||{};
const levels = this.state.config.placement_tree.levels||{};
levels.host = levels.host || 100;
levels.osd = levels.osd || 101;
const tree = { '': { children: [] } };
for (const node_id in this.state.config.placement_tree.nodes||{})
{
const node_cfg = this.state.config.placement_tree.nodes[node_id];
if (!node_id || /^\d/.exec(node_id) ||
!node_cfg.level || !levels[node_cfg.level])
{
// All nodes must have non-empty non-numeric IDs and valid levels
continue;
}
tree[node_id] = { id: node_id, level: node_cfg.level, parent: node_cfg.parent, children: [] };
}
// This requires monitor system time to be in sync with OSD system times (at least to some extent)
const down_time = Date.now()/1000 - this.config.osd_out_time;
for (const osd_num of this.all_osds().sort((a, b) => a - b))
{
const stat = this.state.osd.stats[osd_num];
if (stat.size && (this.state.osd.state[osd_num] || Number(stat.time) >= down_time))
{
// Numeric IDs are reserved for OSDs
const reweight = this.state.config.osd[osd_num] && Number(this.state.config.osd[osd_num].reweight) || 1;
tree[osd_num] = tree[osd_num] || { id: osd_num, parent: stat.host };
tree[osd_num].level = 'osd';
tree[osd_num].size = reweight * stat.size / 1024 / 1024 / 1024 / 1024; // terabytes
delete tree[osd_num].children;
}
}
for (const node_id in tree)
{
if (node_id === '')
{
continue;
}
const node_cfg = tree[node_id];
const node_level = levels[node_cfg.level] || node_cfg.level;
let parent_level = node_cfg.parent && tree[node_cfg.parent] && tree[node_cfg.parent].children
&& tree[node_cfg.parent].level;
parent_level = parent_level ? (levels[parent_level] || parent_level) : null;
// Parent's level must be less than child's; OSDs must be leaves
const parent = parent_level && parent_level < node_level ? tree[node_cfg.parent] : '';
tree[parent].children.push(tree[node_id]);
delete node_cfg.parent;
}
return LPOptimizer.flatten_tree(tree[''].children, levels, this.state.config.failure_domain, 'osd');
}
async stop_all_pgs()
{
let has_online = false, paused = true;
for (const pg in this.state.config.pgs.items||{})
{
const cur_state = ((this.state.pg.state[pg]||{}).state||[]).join(',');
if (cur_state != '' && cur_state != 'offline')
{
has_online = true;
}
if (!this.state.config.pgs.items[pg].pause)
{
paused = false;
}
}
if (!paused)
{
console.log('Stopping all PGs before changing PG count');
const new_cfg = JSON.parse(JSON.stringify(this.state.config.pgs));
for (const pg in new_cfg.items)
{
new_cfg.items[pg].pause = true;
}
// Check that no OSDs change their state before we pause PGs
// Doing this we make sure that OSDs don't wake up in the middle of our "transaction"
// and can't see the old PG configuration
const checks = [];
for (const osd_num of this.all_osds())
{
const key = b64(this.etcd_prefix+'/osd/state/'+osd_num);
checks.push({ key, target: 'MOD', result: 'LESS', mod_revision: ''+this.etcd_watch_revision });
}
const res = await this.etcd_call('/txn', {
compare: [
{ key: b64(this.etcd_prefix+'/mon/master'), target: 'LEASE', lease: ''+this.etcd_lease_id },
{ key: b64(this.etcd_prefix+'/config/pgs'), target: 'MOD', mod_revision: ''+this.etcd_watch_revision, result: 'LESS' },
...checks,
],
success: [
{ requestPut: { key: b64(this.etcd_prefix+'/config/pgs'), value: b64(JSON.stringify(new_cfg)) } },
],
}, this.config.etcd_mon_timeout, 0);
if (!res.succeeded)
{
return false;
}
this.state.config.pgs = new_cfg;
}
return !has_online;
}
scale_pg_count(prev_pgs, pg_history, new_pg_count)
{
const old_pg_count = prev_pgs.length;
// Add all possibly intersecting PGs into the history of new PGs
if (!(new_pg_count % old_pg_count))
{
// New PG count is a multiple of the old PG count
const mul = (new_pg_count / old_pg_count);
for (let i = 0; i < new_pg_count; i++)
{
const old_i = Math.floor(new_pg_count / mul);
pg_history[i] = JSON.parse(JSON.stringify(this.state.pg.history[1+old_i]));
}
}
else if (!(old_pg_count % new_pg_count))
{
// Old PG count is a multiple of the new PG count
const mul = (old_pg_count / new_pg_count);
for (let i = 0; i < new_pg_count; i++)
{
pg_history[i] = {
osd_sets: [],
all_peers: [],
};
for (let j = 0; j < mul; j++)
{
pg_history[i].osd_sets.push(prev_pgs[i*mul]);
const hist = this.state.pg.history[1+i*mul+j];
if (hist && hist.osd_sets && hist.osd_sets.length)
{
Array.prototype.push.apply(pg_history[i].osd_sets, hist.osd_sets);
}
if (hist && hist.all_peers && hist.all_peers.length)
{
Array.prototype.push.apply(pg_history[i].all_peers, hist.all_peers);
}
}
}
}
else
{
// Any PG may intersect with any PG after non-multiple PG count change
// So, merge ALL PGs history
let all_sets = {};
let all_peers = {};
for (const pg of prev_pgs)
{
all_sets[pg.join(' ')] = pg;
}
for (const pg in this.state.pg.history)
{
const hist = this.state.pg.history[pg];
if (hist && hist.osd_sets)
{
for (const pg of hist.osd_sets)
{
all_sets[pg.join(' ')] = pg;
}
}
if (hist && hist.all_peers)
{
for (const osd_num of hist.all_peers)
{
all_peers[osd_num] = Number(osd_num);
}
}
}
all_sets = Object.values(all_sets);
all_peers = Object.values(all_peers);
for (let i = 0; i < new_pg_count; i++)
{
pg_history[i] = { osd_sets: all_sets, all_peers };
}
}
// Mark history keys for removed PGs as removed
for (let i = new_pg_count; i < old_pg_count; i++)
{
pg_history[i] = null;
}
if (old_pg_count < new_pg_count)
{
for (let i = new_pg_count-1; i >= 0; i--)
{
prev_pgs[i] = prev_pgs[Math.floor(i/new_pg_count*old_pg_count)];
}
}
else if (old_pg_count > new_pg_count)
{
for (let i = 0; i < new_pg_count; i++)
{
prev_pgs[i] = prev_pgs[Math.round(i/new_pg_count*old_pg_count)];
}
prev_pgs.splice(new_pg_count, old_pg_count-new_pg_count);
}
}
async save_new_pgs(prev_pgs, new_pgs, pg_history, tree_hash)
{
const txn = [], checks = [];
const pg_items = {};
new_pgs.map((osd_set, i) =>
{
osd_set = osd_set.map(osd_num => osd_num === LPOptimizer.NO_OSD ? 0 : osd_num);
const alive_set = osd_set.filter(osd_num => osd_num);
pg_items[i+1] = {
osd_set,
primary: alive_set.length ? alive_set[Math.floor(Math.random()*alive_set.length)] : 0,
};
if (prev_pgs[i] && prev_pgs[i].join(' ') != osd_set.join(' '))
{
pg_history[i] = pg_history[i] || {};
pg_history[i].osd_sets = pg_history[i].osd_sets || [];
pg_history[i].osd_sets.push(prev_pgs[i]);
}
});
for (let i = 0; i < new_pgs.length || i < prev_pgs.length; i++)
{
checks.push({
key: b64(this.etcd_prefix+'/pg/history/'+(i+1)),
target: 'MOD',
mod_revision: ''+this.etcd_watch_revision,
result: 'LESS',
});
if (pg_history[i])
{
txn.push({
requestPut: {
key: b64(this.etcd_prefix+'/pg/history/'+(i+1)),
value: b64(JSON.stringify(pg_history[i])),
},
});
}
else
{
txn.push({
requestDeleteRange: {
key: b64(this.etcd_prefix+'/pg/history/'+(i+1)),
},
});
}
}
this.state.config.pgs = {
hash: tree_hash,
items: pg_items,
};
const res = await this.etcd_call('/txn', {
compare: [
{ key: b64(this.etcd_prefix+'/mon/master'), target: 'LEASE', lease: ''+this.etcd_lease_id },
{ key: b64(this.etcd_prefix+'/config/pgs'), target: 'MOD', mod_revision: ''+this.etcd_watch_revision, result: 'LESS' },
...checks,
],
success: [
{ requestPut: { key: b64(this.etcd_prefix+'/config/pgs'), value: b64(JSON.stringify(this.state.config.pgs)) } },
...txn,
],
}, this.config.etcd_mon_timeout, 0);
return res.succeeded;
}
async recheck_pgs()
{
// Take configuration and state, check it against the stored configuration hash
// Recalculate PGs and save them to etcd if the configuration is changed
const tree_cfg = {
osd_tree: this.get_osd_tree(),
pg_count: this.config.pg_count || Object.keys(this.state.config.pgs.items||{}).length || 128,
max_osd_combinations: this.config.max_osd_combinations,
};
const tree_hash = sha1hex(stableStringify(tree_cfg));
if (this.state.config.pgs.hash != tree_hash)
{
// Something has changed
const prev_pgs = [];
for (const pg in this.state.config.pgs.items||{})
{
prev_pgs[pg-1] = this.state.config.pgs.items[pg].osd_set;
}
const pg_history = [];
const old_pg_count = prev_pgs.length;
let optimize_result;
if (old_pg_count > 0)
{
if (old_pg_count != tree_cfg.pg_count)
{
// PG count changed. Need to bring all PGs down.
if (!await this.stop_all_pgs())
{
this.schedule_recheck();
return;
}
this.scale_pg_count(prev_pgs, pg_history, new_pg_count);
}
optimize_result = await LPOptimizer.optimize_change(prev_pgs, tree_cfg.osd_tree, tree_cfg.max_osd_combinations);
}
else
{
optimize_result = await LPOptimizer.optimize_initial(tree_cfg.osd_tree, tree_cfg.pg_count, tree_cfg.max_osd_combinations);
}
if (!await this.save_new_pgs(prev_pgs, optimize_result.int_pgs, pg_history, tree_hash))
{
console.log('Someone changed PG configuration while we also tried to change it. Retrying in '+this.config.mon_change_timeout+' ms');
this.schedule_recheck();
return;
}
console.log('PG configuration successfully changed');
if (old_pg_count != optimize_result.int_pgs.length)
{
console.log(`PG count changed from: ${old_pg_count} to ${optimize_result.int_pgs.length}`);
}
LPOptimizer.print_change_stats(optimize_result);
}
}
schedule_recheck()
{
if (this.recheck_timer)
{
clearTimeout(this.recheck_timer);
this.recheck_timer = null;
}
this.recheck_timer = setTimeout(() =>
{
this.recheck_timer = null;
this.recheck_pgs().catch(console.error);
}, this.config.mon_change_timeout || 1000);
}
sum_stats()
{
let overflow = false;
this.prev_stats = this.prev_stats || { op_stats: {}, subop_stats: {}, recovery_stats: {} };
const op_stats = {}, subop_stats = {}, recovery_stats = {};
for (const osd in this.state.osd.stats)
{
const st = this.state.osd.stats[osd];
for (const op in st.op_stats||{})
{
op_stats[op] = op_stats[op] || { count: 0n, usec: 0n, bytes: 0n };
op_stats[op].count += BigInt(st.op_stats.count||0);
op_stats[op].usec += BigInt(st.op_stats.usec||0);
op_stats[op].bytes += BigInt(st.op_stats.bytes||0);
}
for (const op in st.subop_stats||{})
{
subop_stats[op] = subop_stats[op] || { count: 0n, usec: 0n };
subop_stats[op].count += BigInt(st.subop_stats.count||0);
subop_stats[op].usec += BigInt(st.subop_stats.usec||0);
}
for (const op in st.recovery_stats||{})
{
recovery_stats[op] = recovery_stats[op] || { count: 0n, bytes: 0n };
recovery_stats[op].count += BigInt(st.recovery_stats.count||0);
recovery_stats[op].bytes += BigInt(st.recovery_stats.bytes||0);
}
}
for (const op in op_stats)
{
if (op_stats[op].count >= 0x10000000000000000n)
{
if (!this.prev_stats.op_stats[op])
{
overflow = true;
}
else
{
op_stats[op].count -= this.prev_stats.op_stats[op].count;
op_stats[op].usec -= this.prev_stats.op_stats[op].usec;
op_stats[op].bytes -= this.prev_stats.op_stats[op].bytes;
}
}
}
for (const op in subop_stats)
{
if (subop_stats[op].count >= 0x10000000000000000n)
{
if (!this.prev_stats.subop_stats[op])
{
overflow = true;
}
else
{
subop_stats[op].count -= this.prev_stats.subop_stats[op].count;
subop_stats[op].usec -= this.prev_stats.subop_stats[op].usec;
}
}
}
for (const op in recovery_stats)
{
if (recovery_stats[op].count >= 0x10000000000000000n)
{
if (!this.prev_stats.recovery_stats[op])
{
overflow = true;
}
else
{
recovery_stats[op].count -= this.prev_stats.recovery_stats[op].count;
recovery_stats[op].bytes -= this.prev_stats.recovery_stats[op].bytes;
}
}
}
const object_counts = { object: 0n, clean: 0n, misplaced: 0n, degraded: 0n, incomplete: 0n };
for (const pg_num in this.state.pg.stats)
{
const st = this.state.pg.stats[pg_num];
for (const k in object_counts)
{
if (st[k+'_count'])
{
object_counts[k] += BigInt(st[k+'_count']);
}
}
}
return (this.prev_stats = { overflow, op_stats, subop_stats, recovery_stats, object_counts });
}
async update_total_stats()
{
const stats = this.sum_stats();
if (!stats.overflow)
{
// Convert to strings, serialize and save
const ser = {};
for (const st of [ 'op_stats', 'subop_stats', 'recovery_stats' ])
{
ser[st] = {};
for (const op in stats[st])
{
ser[st][op] = {};
for (const k in stats[st][op])
{
ser[st][op][k] = ''+stats[st][op][k];
}
}
}
ser.object_counts = {};
for (const k in stats.object_counts)
{
ser.object_counts[k] = ''+stats.object_counts[k];
}
await this.etcd_call('/txn', {
success: [ { requestPut: { key: b64(this.etcd_prefix+'/stats'), value: b64(JSON.stringify(ser)) } } ],
}, this.config.etcd_mon_timeout, 0);
}
}
schedule_update_stats()
{
if (this.stats_timer)
{
clearTimeout(this.stats_timer);
this.stats_timer = null;
}
this.stats_timer = setTimeout(() =>
{
this.stats_timer = null;
this.update_total_stats().catch(console.error);
}, this.config.mon_stats_timeout || 1000);
}
parse_kv(kv)
{
if (!kv || !kv.key)
{
return;
}
kv.key = de64(kv.key);
kv.value = kv.value ? JSON.parse(de64(kv.value)) : null;
const key = kv.key.substr(this.etcd_prefix.length).replace(/^\/+/, '').split('/');
const cur = this.state, orig = Mon.etcd_tree;
for (let i = 0; i < key.length-1; i++)
{
if (!orig[key[i]])
{
console.log('Bad key in etcd: '+kv.key+' = '+kv.value);
return;
}
orig = orig[key[i]];
cur = (cur[key[i]] = cur[key[i]] || {});
}
if (orig[key.length-1])
{
console.log('Bad key in etcd: '+kv.key+' = '+kv.value);
return;
}
cur[key[key.length-1]] = kv.value;
if (key.join('/') === 'config/global')
{
this.state.config.global = this.state.config.global || {};
this.config = this.state.config.global;
this.check_config();
}
}
async etcd_call(path, body, timeout, retries)
{
let retry = 0;
if (retries >= 0 && retries < 1)
{
retries = 1;
}
while (retries < 0 || retry < retries)
{
const base = this.etcd_urls[Math.floor(Math.random()*this.etcd_urls.length)];
const res = await POST(base+path, body, timeout);
if (res.json)
{
if (res.json.error)
{
console.log('etcd returned error: '+res.json.error);
break;
}
return res.json;
}
retry++;
}
this.die();
}
die(err)
{
// In fact we can just try to rejoin
console.fatal(err || 'Cluster connection failed');
process.exit(1);
}
local_ips()
{
const ips = [];
const ifaces = os.networkInterfaces();
for (const ifname in ifaces)
{
for (const iface of ifaces[ifname])
{
if (iface.family == 'IPv4' && !iface.internal)
{
ips.push(iface.address);
}
}
}
return ips;
}
}
function POST(url, body, timeout)
{
return new Promise((ok, no) =>
{
const body_text = Buffer.from(JSON.stringify(body));
let timer_id = timeout > 0 ? setTimeout(() =>
{
if (req)
req.abort();
req = null;
ok({ error: 'timeout' });
}, timeout) : null;
let req = http.request(url, { method: 'POST', headers: {
'Content-Type': 'application/json',
'Content-Length': body_text,
} }, (res) =>
{
if (!req)
{
return;
}
clearTimeout(timer_id);
if (res.statusCode != 200)
{
ok({ error: res.statusCode, response: res });
return;
}
let res_body = '';
res.setEncoding('utf8');
res.on('data', chunk => { res_body += chunk });
res.on('end', () =>
{
try
{
res_body = JSON.parse(res_body);
ok({ response: res, json: res_body });
}
catch (e)
{
ok({ error: e, response: res, body: res_body });
}
});
});
req.write(body_text);
req.end();
});
}
function b64(str)
{
return Buffer.from(str).toString('base64');
}
function de64(str)
{
return Buffer.from(str, 'base64').toString();
}
function sha1hex(str)
{
const hash = crypto.createHash('sha1');
hash.update(str);
return hash.digest('hex');
}

View File

@@ -1,14 +0,0 @@
{
"name": "rage-mon",
"version": "1.0.0",
"description": "RAGE storage monitor service",
"main": "mon.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "Vitaliy Filippov",
"license": "UNLICENSED",
"dependencies": {
"ws": "^7.2.5"
}
}

View File

@@ -1,71 +0,0 @@
const LPOptimizer = require('./lp-optimizer.js');
const crush_tree = [
{ level: 1, children: [
{ level: 2, children: [
{ level: 3, id: 1, size: 3 },
{ level: 3, id: 2, size: 3 },
] },
{ level: 2, children: [
{ level: 3, id: 3, size: 3 },
{ level: 3, id: 4, size: 3 },
] },
] },
{ level: 1, children: [
{ level: 2, children: [
{ level: 3, id: 5, size: 3 },
{ level: 3, id: 6, size: 3 },
] },
{ level: 2, children: [
{ level: 3, id: 7, size: 3 },
{ level: 3, id: 8, size: 3 },
] },
] },
{ level: 1, children: [
{ level: 2, children: [
{ level: 3, id: 9, size: 3 },
{ level: 3, id: 10, size: 3 },
] },
{ level: 2, children: [
{ level: 3, id: 11, size: 3 },
{ level: 3, id: 12, size: 3 },
] },
] },
];
const osd_tree = LPOptimizer.flatten_tree(crush_tree, {}, 1, 3);
console.log(osd_tree);
async function run()
{
const cur_tree = {};
console.log('Empty tree:');
let res = await LPOptimizer.optimize_initial(cur_tree, 256);
LPOptimizer.print_change_stats(res, false);
console.log('\nAdding 1st failure domain:');
cur_tree['dom1'] = osd_tree['dom1'];
res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree);
LPOptimizer.print_change_stats(res, false);
console.log('\nAdding 2nd failure domain:');
cur_tree['dom2'] = osd_tree['dom2'];
res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree);
LPOptimizer.print_change_stats(res, false);
console.log('\nAdding 3rd failure domain:');
cur_tree['dom3'] = osd_tree['dom3'];
res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree);
LPOptimizer.print_change_stats(res, false);
console.log('\nRemoving 3rd failure domain:');
delete cur_tree['dom3'];
res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree);
LPOptimizer.print_change_stats(res, false);
console.log('\nRemoving 2nd failure domain:');
delete cur_tree['dom2'];
res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree);
LPOptimizer.print_change_stats(res, false);
console.log('\nRemoving 1st failure domain:');
delete cur_tree['dom1'];
res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree);
LPOptimizer.print_change_stats(res, false);
}
run().catch(console.error);

View File

@@ -1,94 +0,0 @@
const LPOptimizer = require('./lp-optimizer.js');
const osd_tree = {
100: {
7: 3.63869,
},
300: {
10: 3.46089,
11: 3.46089,
12: 3.46089,
},
400: {
1: 3.49309,
2: 3.49309,
3: 3.49309,
},
500: {
4: 3.58498,
// 8: 3.58589,
9: 3.63869,
},
600: {
5: 3.63869,
6: 3.63869,
},
/* 100: {
1: 2.72800,
},
200: {
2: 2.72900,
},
300: {
3: 1.87000,
},
400: {
4: 1.87000,
},
500: {
5: 3.63869,
},*/
};
const crush_tree = [
{ level: 1, children: [
{ level: 2, children: [
{ level: 3, id: 1, size: 3 },
{ level: 3, id: 2, size: 2 },
] },
{ level: 2, children: [
{ level: 3, id: 3, size: 4 },
{ level: 3, id: 4, size: 4 },
] },
] },
{ level: 1, children: [
{ level: 2, children: [
{ level: 3, id: 5, size: 4 },
{ level: 3, id: 6, size: 1 },
] },
{ level: 2, children: [
{ level: 3, id: 7, size: 3 },
{ level: 3, id: 8, size: 5 },
] },
] },
{ level: 1, children: [
{ level: 2, children: [
{ level: 3, id: 9, size: 5 },
{ level: 3, id: 10, size: 2 },
] },
{ level: 2, children: [
{ level: 3, id: 11, size: 3 },
{ level: 3, id: 12, size: 3 },
] },
] },
];
async function run()
{
// Test: add 1 OSD of almost the same size. Ideal data movement could be 1/12 = 8.33%. Actual is ~13%
// Space efficiency is ~99.5% in both cases.
let res = await LPOptimizer.optimize_initial(osd_tree, 256);
LPOptimizer.print_change_stats(res, false);
console.log('adding osd.8');
osd_tree[500][8] = 3.58589;
res = await LPOptimizer.optimize_change(res.int_pgs, osd_tree);
LPOptimizer.print_change_stats(res, false);
console.log('removing osd.8');
delete osd_tree[500][8];
res = await LPOptimizer.optimize_change(res.int_pgs, osd_tree);
LPOptimizer.print_change_stats(res, false);
res = await LPOptimizer.optimize_initial(LPOptimizer.flatten_tree(crush_tree, {}, 1, 3), 256);
LPOptimizer.print_change_stats(res, false);
}
run().catch(console.error);

View File

@@ -1,398 +0,0 @@
#include <unistd.h>
#include <fcntl.h>
#include <sys/socket.h>
#include <sys/epoll.h>
#include <netinet/tcp.h>
#include "messenger.h"
osd_op_t::~osd_op_t()
{
assert(!bs_op);
assert(!op_data);
if (rmw_buf)
{
free(rmw_buf);
}
if (buf)
{
// Note: reusing osd_op_t WILL currently lead to memory leaks
// So we don't reuse it, but free it every time
free(buf);
}
}
void osd_messenger_t::connect_peer(uint64_t peer_osd, json11::Json peer_state)
{
if (wanted_peers.find(peer_osd) == wanted_peers.end())
{
wanted_peers[peer_osd] = (osd_wanted_peer_t){
.address_list = peer_state["addresses"],
.port = (int)peer_state["port"].int64_value(),
};
}
else
{
wanted_peers[peer_osd].address_list = peer_state["addresses"];
wanted_peers[peer_osd].port = (int)peer_state["port"].int64_value();
}
wanted_peers[peer_osd].address_changed = true;
if (!wanted_peers[peer_osd].connecting &&
(time(NULL) - wanted_peers[peer_osd].last_connect_attempt) >= peer_connect_interval)
{
try_connect_peer(peer_osd);
}
}
void osd_messenger_t::try_connect_peer(uint64_t peer_osd)
{
auto wp_it = wanted_peers.find(peer_osd);
if (wp_it == wanted_peers.end())
{
return;
}
if (osd_peer_fds.find(peer_osd) != osd_peer_fds.end())
{
wanted_peers.erase(peer_osd);
return;
}
auto & wp = wp_it->second;
if (wp.address_index >= wp.address_list.array_items().size())
{
return;
}
wp.cur_addr = wp.address_list[wp.address_index].string_value();
wp.cur_port = wp.port;
try_connect_peer_addr(peer_osd, wp.cur_addr.c_str(), wp.cur_port);
}
void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port)
{
struct sockaddr_in addr;
int r;
if ((r = inet_pton(AF_INET, peer_host, &addr.sin_addr)) != 1)
{
on_connect_peer(peer_osd, -EINVAL);
return;
}
addr.sin_family = AF_INET;
addr.sin_port = htons(peer_port ? peer_port : 11203);
int peer_fd = socket(AF_INET, SOCK_STREAM, 0);
if (peer_fd < 0)
{
on_connect_peer(peer_osd, -errno);
return;
}
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
int timeout_id = -1;
if (peer_connect_timeout > 0)
{
timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, peer_fd](int timer_id)
{
osd_num_t peer_osd = clients[peer_fd].osd_num;
stop_client(peer_fd);
on_connect_peer(peer_osd, -EIO);
return;
});
}
r = connect(peer_fd, (sockaddr*)&addr, sizeof(addr));
if (r < 0 && errno != EINPROGRESS)
{
close(peer_fd);
on_connect_peer(peer_osd, -errno);
return;
}
assert(peer_osd != this->osd_num);
clients[peer_fd] = (osd_client_t){
.peer_addr = addr,
.peer_port = peer_port,
.peer_fd = peer_fd,
.peer_state = PEER_CONNECTING,
.connect_timeout_id = timeout_id,
.osd_num = peer_osd,
.in_buf = malloc(receive_buffer_size),
};
tfd->set_fd_handler(peer_fd, [this](int peer_fd, int epoll_events)
{
// Either OUT (connected) or HUP
handle_connect_epoll(peer_fd);
});
}
void osd_messenger_t::handle_connect_epoll(int peer_fd)
{
auto & cl = clients[peer_fd];
if (cl.connect_timeout_id >= 0)
{
tfd->clear_timer(cl.connect_timeout_id);
cl.connect_timeout_id = -1;
}
osd_num_t peer_osd = cl.osd_num;
int result = 0;
socklen_t result_len = sizeof(result);
if (getsockopt(peer_fd, SOL_SOCKET, SO_ERROR, &result, &result_len) < 0)
{
result = errno;
}
if (result != 0)
{
stop_client(peer_fd);
on_connect_peer(peer_osd, -result);
return;
}
int one = 1;
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
cl.peer_state = PEER_CONNECTED;
// FIXME Disable EPOLLOUT on this fd
tfd->set_fd_handler(peer_fd, [this](int peer_fd, int epoll_events)
{
handle_peer_epoll(peer_fd, epoll_events);
});
// Check OSD number
check_peer_config(cl);
}
void osd_messenger_t::handle_peer_epoll(int peer_fd, int epoll_events)
{
// Mark client as ready (i.e. some data is available)
if (epoll_events & EPOLLRDHUP)
{
// Stop client
printf("[OSD %lu] client %d disconnected\n", this->osd_num, peer_fd);
stop_client(peer_fd);
}
else if (epoll_events & EPOLLIN)
{
// Mark client as ready (i.e. some data is available)
auto & cl = clients[peer_fd];
cl.read_ready++;
if (cl.read_ready == 1)
{
read_ready_clients.push_back(cl.peer_fd);
ringloop->wakeup();
}
}
}
void osd_messenger_t::on_connect_peer(osd_num_t peer_osd, int peer_fd)
{
auto & wp = wanted_peers.at(peer_osd);
wp.connecting = false;
if (peer_fd < 0)
{
printf("Failed to connect to peer OSD %lu address %s port %d: %s\n", peer_osd, wp.cur_addr.c_str(), wp.cur_port, strerror(-peer_fd));
if (wp.address_changed)
{
wp.address_changed = false;
wp.address_index = 0;
try_connect_peer(peer_osd);
}
else if (wp.address_index < wp.address_list.array_items().size()-1)
{
// Try other addresses
wp.address_index++;
try_connect_peer(peer_osd);
}
else
{
// Retry again in <peer_connect_interval> seconds
wp.last_connect_attempt = time(NULL);
wp.address_index = 0;
tfd->set_timer(1000*peer_connect_interval, false, [this, peer_osd](int)
{
try_connect_peer(peer_osd);
});
}
return;
}
printf("Connected with peer OSD %lu (fd %d)\n", peer_osd, peer_fd);
wanted_peers.erase(peer_osd);
repeer_pgs(peer_osd);
}
void osd_messenger_t::check_peer_config(osd_client_t & cl)
{
osd_op_t *op = new osd_op_t();
op->op_type = OSD_OP_OUT;
op->send_list.push_back(op->req.buf, OSD_PACKET_SIZE);
op->peer_fd = cl.peer_fd;
op->req = {
.show_conf = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = this->next_subop_id++,
.opcode = OSD_OP_SHOW_CONFIG,
},
},
};
op->callback = [this](osd_op_t *op)
{
osd_client_t & cl = clients[op->peer_fd];
std::string json_err;
json11::Json config;
bool err = false;
if (op->reply.hdr.retval < 0)
{
err = true;
printf("Failed to get config from OSD %lu (retval=%ld), disconnecting peer\n", cl.osd_num, op->reply.hdr.retval);
}
else
{
config = json11::Json::parse(std::string((char*)op->buf), json_err);
if (json_err != "")
{
err = true;
printf("Failed to get config from OSD %lu: bad JSON: %s, disconnecting peer\n", cl.osd_num, json_err.c_str());
}
else if (config["osd_num"].uint64_value() != cl.osd_num)
{
err = true;
printf("Connected to OSD %lu instead of OSD %lu, peer state is outdated, disconnecting peer\n", config["osd_num"].uint64_value(), cl.osd_num);
on_connect_peer(cl.osd_num, -1);
}
}
if (err)
{
stop_client(op->peer_fd);
delete op;
return;
}
osd_peer_fds[cl.osd_num] = cl.peer_fd;
on_connect_peer(cl.osd_num, cl.peer_fd);
delete op;
};
outbox_push(op);
}
void osd_messenger_t::cancel_osd_ops(osd_client_t & cl)
{
for (auto p: cl.sent_ops)
{
cancel_op(p.second);
}
cl.sent_ops.clear();
for (auto op: cl.outbox)
{
cancel_op(op);
}
cl.outbox.clear();
if (cl.write_op)
{
cancel_op(cl.write_op);
cl.write_op = NULL;
}
}
void osd_messenger_t::cancel_op(osd_op_t *op)
{
if (op->op_type == OSD_OP_OUT)
{
op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
op->reply.hdr.id = op->req.hdr.id;
op->reply.hdr.opcode = op->req.hdr.opcode;
op->reply.hdr.retval = -EPIPE;
// Copy lambda to be unaffected by `delete op`
std::function<void(osd_op_t*)>(op->callback)(op);
}
else
{
// This function is only called in stop_client(), so it's fine to destroy the operation
delete op;
}
}
void osd_messenger_t::stop_client(int peer_fd)
{
assert(peer_fd != 0);
auto it = clients.find(peer_fd);
if (it == clients.end())
{
return;
}
uint64_t repeer_osd = 0;
osd_client_t cl = it->second;
if (cl.peer_state == PEER_CONNECTED)
{
if (cl.osd_num)
{
// Reload configuration from etcd when the connection is dropped
printf("[OSD %lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl.osd_num);
repeer_osd = cl.osd_num;
}
else
{
printf("[OSD %lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
}
}
clients.erase(it);
tfd->set_fd_handler(peer_fd, NULL);
if (cl.osd_num)
{
osd_peer_fds.erase(cl.osd_num);
// Cancel outbound operations
cancel_osd_ops(cl);
}
if (cl.read_op)
{
delete cl.read_op;
cl.read_op = NULL;
}
for (auto rit = read_ready_clients.begin(); rit != read_ready_clients.end(); rit++)
{
if (*rit == peer_fd)
{
read_ready_clients.erase(rit);
break;
}
}
for (auto wit = write_ready_clients.begin(); wit != write_ready_clients.end(); wit++)
{
if (*wit == peer_fd)
{
write_ready_clients.erase(wit);
break;
}
}
free(cl.in_buf);
close(peer_fd);
if (repeer_osd)
{
repeer_pgs(repeer_osd);
}
}
void osd_messenger_t::accept_connections(int listen_fd)
{
// Accept new connections
sockaddr_in addr;
socklen_t peer_addr_size = sizeof(addr);
int peer_fd;
while ((peer_fd = accept(listen_fd, (sockaddr*)&addr, &peer_addr_size)) >= 0)
{
assert(peer_fd != 0);
char peer_str[256];
printf("[OSD %lu] new client %d: connection from %s port %d\n", this->osd_num, peer_fd,
inet_ntop(AF_INET, &addr.sin_addr, peer_str, 256), ntohs(addr.sin_port));
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
int one = 1;
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
clients[peer_fd] = {
.peer_addr = addr,
.peer_port = ntohs(addr.sin_port),
.peer_fd = peer_fd,
.peer_state = PEER_CONNECTED,
.in_buf = malloc(receive_buffer_size),
};
// Add FD to epoll
tfd->set_fd_handler(peer_fd, [this](int peer_fd, int epoll_events)
{
handle_peer_epoll(peer_fd, epoll_events);
});
// Try to accept next connection
peer_addr_size = sizeof(addr);
}
if (peer_fd == -1 && errno != EAGAIN)
{
throw std::runtime_error(std::string("accept: ") + strerror(errno));
}
}

View File

@@ -1,213 +0,0 @@
#pragma once
#include <sys/types.h>
#include <stdint.h>
#include <arpa/inet.h>
#include <malloc.h>
#include <set>
#include <map>
#include <deque>
#include <vector>
#include "json11/json11.hpp"
#include "osd_ops.h"
#include "timerfd_manager.h"
#include "ringloop.h"
#define OSD_OP_IN 0
#define OSD_OP_OUT 1
#define CL_READ_HDR 1
#define CL_READ_DATA 2
#define CL_READ_REPLY_DATA 3
#define CL_WRITE_READY 1
#define CL_WRITE_REPLY 2
#define OSD_OP_INLINE_BUF_COUNT 16
#define PEER_CONNECTING 1
#define PEER_CONNECTED 2
#define DEFAULT_PEER_CONNECT_INTERVAL 5
#define DEFAULT_PEER_CONNECT_TIMEOUT 5
struct osd_op_buf_list_t
{
int count = 0, alloc = 0, sent = 0;
iovec *buf = NULL;
iovec inline_buf[OSD_OP_INLINE_BUF_COUNT];
~osd_op_buf_list_t()
{
if (buf && buf != inline_buf)
{
free(buf);
}
}
inline iovec* get_iovec()
{
return (buf ? buf : inline_buf) + sent;
}
inline int get_size()
{
return count - sent;
}
inline void push_back(void *nbuf, size_t len)
{
if (count >= alloc)
{
if (!alloc)
{
alloc = OSD_OP_INLINE_BUF_COUNT;
buf = inline_buf;
}
else if (buf == inline_buf)
{
int old = alloc;
alloc = ((alloc/16)*16 + 1);
buf = (iovec*)malloc(sizeof(iovec) * alloc);
memcpy(buf, inline_buf, sizeof(iovec)*old);
}
else
{
alloc = ((alloc/16)*16 + 1);
buf = (iovec*)realloc(buf, sizeof(iovec) * alloc);
}
}
buf[count++] = { .iov_base = nbuf, .iov_len = len };
}
};
struct blockstore_op_t;
struct osd_primary_op_data_t;
struct osd_op_t
{
timespec tv_begin;
uint64_t op_type = OSD_OP_IN;
int peer_fd;
osd_any_op_t req;
osd_any_reply_t reply;
blockstore_op_t *bs_op = NULL;
void *buf = NULL;
void *rmw_buf = NULL;
osd_primary_op_data_t* op_data = NULL;
std::function<void(osd_op_t*)> callback;
osd_op_buf_list_t send_list;
~osd_op_t();
};
struct osd_client_t
{
sockaddr_in peer_addr;
int peer_port;
int peer_fd;
int peer_state;
int connect_timeout_id = -1;
osd_num_t osd_num = 0;
void *in_buf = NULL;
// Read state
int read_ready = 0;
osd_op_t *read_op = NULL;
int read_reply_id = 0;
iovec read_iov;
msghdr read_msg;
void *read_buf = NULL;
int read_remaining = 0;
int read_state = 0;
// Incoming operations
std::vector<osd_op_t*> received_ops;
// Outbound operations
std::deque<osd_op_t*> outbox;
std::map<int, osd_op_t*> sent_ops;
// PGs dirtied by this client's primary-writes (FIXME to drop the connection)
std::set<pg_num_t> dirty_pgs;
// Write state
osd_op_t *write_op = NULL;
msghdr write_msg;
int write_state = 0;
};
struct osd_wanted_peer_t
{
json11::Json address_list;
int port;
time_t last_connect_attempt;
bool connecting, address_changed;
int address_index;
std::string cur_addr;
int cur_port;
};
struct osd_op_stats_t
{
uint64_t op_stat_sum[OSD_OP_MAX+1] = { 0 };
uint64_t op_stat_count[OSD_OP_MAX+1] = { 0 };
uint64_t op_stat_bytes[OSD_OP_MAX+1] = { 0 };
uint64_t subop_stat_sum[OSD_OP_MAX+1] = { 0 };
uint64_t subop_stat_count[OSD_OP_MAX+1] = { 0 };
};
struct osd_messenger_t
{
timerfd_manager_t *tfd;
ring_loop_t *ringloop;
// osd_num_t is only for logging and asserts
osd_num_t osd_num;
int receive_buffer_size = 9000;
int peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
int peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
int log_level = 0;
std::map<osd_num_t, osd_wanted_peer_t> wanted_peers;
std::map<uint64_t, int> osd_peer_fds;
uint64_t next_subop_id = 1;
std::map<int, osd_client_t> clients;
std::vector<int> read_ready_clients;
std::vector<int> write_ready_clients;
// op statistics
osd_op_stats_t stats;
public:
void connect_peer(uint64_t osd_num, json11::Json peer_state);
void stop_client(int peer_fd);
void outbox_push(osd_op_t *cur_op);
std::function<void(osd_op_t*)> exec_op;
std::function<void(osd_num_t)> repeer_pgs;
void handle_peer_epoll(int peer_fd, int epoll_events);
void read_requests();
void send_replies();
void accept_connections(int listen_fd);
protected:
void try_connect_peer(uint64_t osd_num);
void try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port);
void handle_connect_epoll(int peer_fd);
void on_connect_peer(osd_num_t peer_osd, int peer_fd);
void check_peer_config(osd_client_t & cl);
void cancel_osd_ops(osd_client_t & cl);
void cancel_op(osd_op_t *op);
bool try_send(osd_client_t & cl);
void handle_send(int result, int peer_fd);
bool handle_read(int result, int peer_fd);
void handle_finished_read(osd_client_t & cl);
void handle_op_hdr(osd_client_t *cl);
void handle_reply_hdr(osd_client_t *cl);
};

View File

@@ -1,270 +0,0 @@
#include "messenger.h"
void osd_messenger_t::read_requests()
{
while (read_ready_clients.size() > 0)
{
int peer_fd = read_ready_clients[0];
auto & cl = clients[peer_fd];
if (!cl.read_op || cl.read_remaining < receive_buffer_size)
{
cl.read_iov.iov_base = cl.in_buf;
cl.read_iov.iov_len = receive_buffer_size;
}
else
{
cl.read_iov.iov_base = cl.read_buf;
cl.read_iov.iov_len = cl.read_remaining;
}
cl.read_msg.msg_iov = &cl.read_iov;
cl.read_msg.msg_iovlen = 1;
read_ready_clients.erase(read_ready_clients.begin(), read_ready_clients.begin() + 1);
int result = recvmsg(peer_fd, &cl.read_msg, 0);
if (result < 0)
{
result = -errno;
}
handle_read(result, peer_fd);
}
}
bool osd_messenger_t::handle_read(int result, int peer_fd)
{
auto cl_it = clients.find(peer_fd);
if (cl_it != clients.end())
{
auto & cl = cl_it->second;
if (result < 0 && result != -EAGAIN)
{
// this is a client socket, so don't panic. just disconnect it
printf("Client %d socket read error: %d (%s). Disconnecting client\n", peer_fd, -result, strerror(-result));
stop_client(peer_fd);
return false;
}
if (result == -EAGAIN || result < cl.read_iov.iov_len)
{
cl.read_ready--;
if (cl.read_ready > 0)
read_ready_clients.push_back(peer_fd);
}
else
{
read_ready_clients.push_back(peer_fd);
}
if (result > 0)
{
if (cl.read_iov.iov_base == cl.in_buf)
{
// Compose operation(s) from the buffer
int remain = result;
void *curbuf = cl.in_buf;
while (remain > 0)
{
if (!cl.read_op)
{
cl.read_op = new osd_op_t;
cl.read_op->peer_fd = peer_fd;
cl.read_op->op_type = OSD_OP_IN;
cl.read_buf = cl.read_op->req.buf;
cl.read_remaining = OSD_PACKET_SIZE;
cl.read_state = CL_READ_HDR;
}
if (cl.read_remaining > remain)
{
memcpy(cl.read_buf, curbuf, remain);
cl.read_remaining -= remain;
cl.read_buf += remain;
remain = 0;
if (cl.read_remaining <= 0)
handle_finished_read(cl);
}
else
{
memcpy(cl.read_buf, curbuf, cl.read_remaining);
curbuf += cl.read_remaining;
remain -= cl.read_remaining;
cl.read_remaining = 0;
cl.read_buf = NULL;
handle_finished_read(cl);
}
}
}
else
{
// Long data
cl.read_remaining -= result;
cl.read_buf += result;
if (cl.read_remaining <= 0)
{
handle_finished_read(cl);
}
}
if (result >= cl.read_iov.iov_len)
{
return true;
}
}
}
return false;
}
void osd_messenger_t::handle_finished_read(osd_client_t & cl)
{
if (cl.read_state == CL_READ_HDR)
{
if (cl.read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC)
handle_reply_hdr(&cl);
else
handle_op_hdr(&cl);
}
else if (cl.read_state == CL_READ_DATA)
{
// Operation is ready
cl.received_ops.push_back(cl.read_op);
exec_op(cl.read_op);
cl.read_op = NULL;
cl.read_state = 0;
}
else if (cl.read_state == CL_READ_REPLY_DATA)
{
// Reply is ready
auto req_it = cl.sent_ops.find(cl.read_reply_id);
osd_op_t *request = req_it->second;
cl.sent_ops.erase(req_it);
cl.read_reply_id = 0;
delete cl.read_op;
cl.read_op = NULL;
cl.read_state = 0;
// Measure subop latency
timespec tv_end;
clock_gettime(CLOCK_REALTIME, &tv_end);
stats.subop_stat_count[request->req.hdr.opcode]++;
if (!stats.subop_stat_count[request->req.hdr.opcode])
{
stats.subop_stat_count[request->req.hdr.opcode]++;
stats.subop_stat_sum[request->req.hdr.opcode] = 0;
}
stats.subop_stat_sum[request->req.hdr.opcode] += (
(tv_end.tv_sec - request->tv_begin.tv_sec)*1000000 +
(tv_end.tv_nsec - request->tv_begin.tv_nsec)/1000
);
request->callback(request);
}
else
{
assert(0);
}
}
void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
{
osd_op_t *cur_op = cl->read_op;
if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_READ)
{
if (cur_op->req.sec_rw.len > 0)
cur_op->buf = memalign(MEM_ALIGNMENT, cur_op->req.sec_rw.len);
cl->read_remaining = 0;
}
else if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
{
if (cur_op->req.sec_rw.len > 0)
cur_op->buf = memalign(MEM_ALIGNMENT, cur_op->req.sec_rw.len);
cl->read_remaining = cur_op->req.sec_rw.len;
}
else if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_STABILIZE ||
cur_op->req.hdr.opcode == OSD_OP_SECONDARY_ROLLBACK)
{
if (cur_op->req.sec_stab.len > 0)
cur_op->buf = memalign(MEM_ALIGNMENT, cur_op->req.sec_stab.len);
cl->read_remaining = cur_op->req.sec_stab.len;
}
else if (cur_op->req.hdr.opcode == OSD_OP_READ)
{
if (cur_op->req.rw.len > 0)
cur_op->buf = memalign(MEM_ALIGNMENT, cur_op->req.rw.len);
cl->read_remaining = 0;
}
else if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
{
if (cur_op->req.rw.len > 0)
cur_op->buf = memalign(MEM_ALIGNMENT, cur_op->req.rw.len);
cl->read_remaining = cur_op->req.rw.len;
}
if (cl->read_remaining > 0)
{
// Read data
cl->read_buf = cur_op->buf;
cl->read_state = CL_READ_DATA;
}
else
{
// Operation is ready
cl->read_op = NULL;
cl->read_state = 0;
cl->received_ops.push_back(cur_op);
exec_op(cur_op);
}
}
void osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
{
osd_op_t *cur_op = cl->read_op;
auto req_it = cl->sent_ops.find(cur_op->req.hdr.id);
if (req_it == cl->sent_ops.end())
{
// Command out of sync. Drop connection
printf("Client %d command out of sync: id %lu\n", cl->peer_fd, cur_op->req.hdr.id);
stop_client(cl->peer_fd);
return;
}
osd_op_t *op = req_it->second;
memcpy(op->reply.buf, cur_op->req.buf, OSD_PACKET_SIZE);
if ((op->reply.hdr.opcode == OSD_OP_SECONDARY_READ || op->reply.hdr.opcode == OSD_OP_READ) &&
op->reply.hdr.retval > 0)
{
// Read data. In this case we assume that the buffer is preallocated by the caller (!)
assert(op->buf);
cl->read_state = CL_READ_REPLY_DATA;
cl->read_reply_id = op->req.hdr.id;
cl->read_buf = op->buf;
cl->read_remaining = op->reply.hdr.retval;
}
else if (op->reply.hdr.opcode == OSD_OP_SECONDARY_LIST && op->reply.hdr.retval > 0)
{
op->buf = memalign(MEM_ALIGNMENT, sizeof(obj_ver_id) * op->reply.hdr.retval);
cl->read_state = CL_READ_REPLY_DATA;
cl->read_reply_id = op->req.hdr.id;
cl->read_buf = op->buf;
cl->read_remaining = sizeof(obj_ver_id) * op->reply.hdr.retval;
}
else if (op->reply.hdr.opcode == OSD_OP_SHOW_CONFIG && op->reply.hdr.retval > 0)
{
op->buf = malloc(op->reply.hdr.retval);
cl->read_state = CL_READ_REPLY_DATA;
cl->read_reply_id = op->req.hdr.id;
cl->read_buf = op->buf;
cl->read_remaining = op->reply.hdr.retval;
}
else
{
delete cl->read_op;
cl->read_state = 0;
cl->read_op = NULL;
cl->sent_ops.erase(req_it);
// Measure subop latency
timespec tv_end;
clock_gettime(CLOCK_REALTIME, &tv_end);
stats.subop_stat_count[op->req.hdr.opcode]++;
if (!stats.subop_stat_count[op->req.hdr.opcode])
{
stats.subop_stat_count[op->req.hdr.opcode]++;
stats.subop_stat_sum[op->req.hdr.opcode] = 0;
}
stats.subop_stat_sum[op->req.hdr.opcode] += (
(tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
(tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000
);
// Copy lambda to be unaffected by `delete op`
std::function<void(osd_op_t*)>(op->callback)(op);
}
}

View File

@@ -1,149 +0,0 @@
#include "messenger.h"
void osd_messenger_t::outbox_push(osd_op_t *cur_op)
{
assert(cur_op->peer_fd);
auto & cl = clients.at(cur_op->peer_fd);
if (cur_op->op_type == OSD_OP_OUT)
{
clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
}
else
{
// Check that operation actually belongs to this client
bool found = false;
for (auto it = cl.received_ops.begin(); it != cl.received_ops.end(); it++)
{
if (*it == cur_op)
{
found = true;
cl.received_ops.erase(it, it+1);
break;
}
}
if (!found)
{
delete cur_op;
return;
}
}
cl.outbox.push_back(cur_op);
if (cl.write_op || cl.outbox.size() > 1 || !try_send(cl))
{
if (cl.write_state == 0)
{
cl.write_state = CL_WRITE_READY;
write_ready_clients.push_back(cur_op->peer_fd);
}
ringloop->wakeup();
}
}
bool osd_messenger_t::try_send(osd_client_t & cl)
{
int peer_fd = cl.peer_fd;
if (!cl.write_op)
{
// pick next command
cl.write_op = cl.outbox.front();
cl.outbox.pop_front();
cl.write_state = CL_WRITE_REPLY;
if (cl.write_op->op_type == OSD_OP_IN)
{
// Measure execution latency
timespec tv_end;
clock_gettime(CLOCK_REALTIME, &tv_end);
stats.op_stat_count[cl.write_op->req.hdr.opcode]++;
if (!stats.op_stat_count[cl.write_op->req.hdr.opcode])
{
stats.op_stat_count[cl.write_op->req.hdr.opcode]++;
stats.op_stat_sum[cl.write_op->req.hdr.opcode] = 0;
stats.op_stat_bytes[cl.write_op->req.hdr.opcode] = 0;
}
stats.op_stat_sum[cl.write_op->req.hdr.opcode] += (
(tv_end.tv_sec - cl.write_op->tv_begin.tv_sec)*1000000 +
(tv_end.tv_nsec - cl.write_op->tv_begin.tv_nsec)/1000
);
if (cl.write_op->req.hdr.opcode == OSD_OP_READ ||
cl.write_op->req.hdr.opcode == OSD_OP_WRITE)
{
stats.op_stat_bytes[cl.write_op->req.hdr.opcode] += cl.write_op->req.rw.len;
}
else if (cl.write_op->req.hdr.opcode == OSD_OP_SECONDARY_READ ||
cl.write_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
{
stats.op_stat_bytes[cl.write_op->req.hdr.opcode] += cl.write_op->req.sec_rw.len;
}
}
}
cl.write_msg.msg_iov = cl.write_op->send_list.get_iovec();
cl.write_msg.msg_iovlen = cl.write_op->send_list.get_size();
int result = sendmsg(peer_fd, &cl.write_msg, MSG_NOSIGNAL);
if (result < 0)
result = -errno;
handle_send(result, peer_fd);
return true;
}
void osd_messenger_t::send_replies()
{
while (write_ready_clients.size() > 0)
{
auto & cl = clients[write_ready_clients[0]];
write_ready_clients.erase(write_ready_clients.begin(), write_ready_clients.begin() + 1);
try_send(cl);
}
}
void osd_messenger_t::handle_send(int result, int peer_fd)
{
auto cl_it = clients.find(peer_fd);
if (cl_it != clients.end())
{
auto & cl = cl_it->second;
if (result < 0 && result != -EAGAIN)
{
// this is a client socket, so don't panic. just disconnect it
printf("Client %d socket write error: %d (%s). Disconnecting client\n", peer_fd, -result, strerror(-result));
stop_client(peer_fd);
return;
}
if (result >= 0)
{
osd_op_t *cur_op = cl.write_op;
while (result > 0 && cur_op->send_list.sent < cur_op->send_list.count)
{
iovec & iov = cur_op->send_list.buf[cur_op->send_list.sent];
if (iov.iov_len <= result)
{
result -= iov.iov_len;
cur_op->send_list.sent++;
}
else
{
iov.iov_len -= result;
iov.iov_base += result;
break;
}
}
if (cur_op->send_list.sent >= cur_op->send_list.count)
{
// Done
if (cur_op->op_type == OSD_OP_IN)
{
delete cur_op;
}
else
{
cl.sent_ops[cl.write_op->req.hdr.id] = cl.write_op;
}
cl.write_op = NULL;
cl.write_state = cl.outbox.size() > 0 ? CL_WRITE_READY : 0;
}
}
if (cl.write_state != 0)
{
write_ready_clients.push_back(peer_fd);
}
}
}

458
osd.cpp
View File

@@ -7,9 +7,7 @@
#include "osd.h"
#define MAX_EPOLL_EVENTS 64
const char* osd_op_names[] = {
static const char* osd_op_names[] = {
"",
"read",
"write",
@@ -23,7 +21,6 @@ const char* osd_op_names[] = {
"primary_read",
"primary_write",
"primary_sync",
"primary_delete",
};
osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop)
@@ -31,110 +28,50 @@ osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringlo
this->config = config;
this->bs = bs;
this->ringloop = ringloop;
this->tick_tfd = new timerfd_interval(ringloop, 3, [this]()
{
for (int i = 0; i <= OSD_OP_MAX; i++)
{
if (op_stat_count[i] != 0)
{
printf("avg latency for op %d (%s): %ld us\n", i, osd_op_names[i], op_stat_sum[i]/op_stat_count[i]);
op_stat_count[i] = 0;
op_stat_sum[i] = 0;
}
}
for (int i = 0; i <= OSD_OP_MAX; i++)
{
if (subop_stat_count[i] != 0)
{
printf("avg latency for subop %d (%s): %ld us\n", i, osd_op_names[i], subop_stat_sum[i]/subop_stat_count[i]);
subop_stat_count[i] = 0;
subop_stat_sum[i] = 0;
}
}
if (send_stat_count != 0)
{
printf("avg latency to send stabilize subop: %ld us\n", send_stat_sum/send_stat_count);
send_stat_count = 0;
send_stat_sum = 0;
}
});
this->bs_block_size = bs->get_block_size();
// FIXME: use bitmap granularity instead
this->bs_disk_alignment = bs->get_disk_alignment();
parse_config(config);
epoll_fd = epoll_create(1);
if (epoll_fd < 0)
{
throw std::runtime_error(std::string("epoll_create: ") + strerror(errno));
}
this->tfd = new timerfd_manager_t([this](int fd, std::function<void(int, int)> handler) { set_fd_handler(fd, handler); });
this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
{
print_stats();
});
c_cli.tfd = this->tfd;
c_cli.ringloop = this->ringloop;
c_cli.exec_op = [this](osd_op_t *op) { exec_op(op); };
c_cli.repeer_pgs = [this](osd_num_t peer_osd) { repeer_pgs(peer_osd); };
init_cluster();
consumer.loop = [this]() { loop(); };
ringloop->register_consumer(&consumer);
}
osd_t::~osd_t()
{
if (tfd)
{
delete tfd;
tfd = NULL;
}
ringloop->unregister_consumer(&consumer);
close(epoll_fd);
close(listen_fd);
}
void osd_t::parse_config(blockstore_config_t & config)
{
// Initial startup configuration
json11::Json json_config = json11::Json(config);
st_cli.parse_config(json_config);
etcd_report_interval = strtoull(config["etcd_report_interval"].c_str(), NULL, 10);
if (etcd_report_interval <= 0)
etcd_report_interval = 30;
osd_num = strtoull(config["osd_num"].c_str(), NULL, 10);
if (!osd_num)
throw std::runtime_error("osd_num is required in the configuration");
c_cli.osd_num = osd_num;
run_primary = config["run_primary"] != "false" && config["run_primary"] != "0" && config["run_primary"] != "no";
// Cluster configuration
bind_address = config["bind_address"];
if (bind_address == "")
bind_address = "0.0.0.0";
bind_port = stoull_full(config["bind_port"]);
if (bind_port <= 0 || bind_port > 65535)
bind_port = 0;
if (config["immediate_commit"] == "all")
immediate_commit = IMMEDIATE_ALL;
else if (config["immediate_commit"] == "small")
immediate_commit = IMMEDIATE_SMALL;
if (config.find("autosync_interval") != config.end())
{
autosync_interval = strtoull(config["autosync_interval"].c_str(), NULL, 10);
if (autosync_interval > MAX_AUTOSYNC_INTERVAL)
autosync_interval = DEFAULT_AUTOSYNC_INTERVAL;
}
if (config.find("client_queue_depth") != config.end())
{
client_queue_depth = strtoull(config["client_queue_depth"].c_str(), NULL, 10);
if (client_queue_depth < 128)
client_queue_depth = 128;
}
if (config.find("pg_stripe_size") != config.end())
{
pg_stripe_size = strtoull(config["pg_stripe_size"].c_str(), NULL, 10);
if (!pg_stripe_size || !bs_block_size || pg_stripe_size < bs_block_size || (pg_stripe_size % bs_block_size) != 0)
pg_stripe_size = DEFAULT_PG_STRIPE_SIZE;
}
recovery_queue_depth = strtoull(config["recovery_queue_depth"].c_str(), NULL, 10);
if (recovery_queue_depth < 1 || recovery_queue_depth > MAX_RECOVERY_QUEUE)
recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
if (config["readonly"] == "true" || config["readonly"] == "1" || config["readonly"] == "yes")
readonly = true;
print_stats_interval = strtoull(config["print_stats_interval"].c_str(), NULL, 10);
if (!print_stats_interval)
print_stats_interval = 3;
c_cli.peer_connect_interval = strtoull(config["peer_connect_interval"].c_str(), NULL, 10);
if (!c_cli.peer_connect_interval)
c_cli.peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
c_cli.peer_connect_timeout = strtoull(config["peer_connect_timeout"].c_str(), NULL, 10);
if (!c_cli.peer_connect_timeout)
c_cli.peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
log_level = strtoull(config["log_level"].c_str(), NULL, 10);
c_cli.log_level = log_level;
}
bind_port = strtoull(config["bind_port"].c_str(), NULL, 10);
if (!bind_port || bind_port > 65535)
bind_port = 11203;
osd_num = strtoull(config["osd_num"].c_str(), NULL, 10);
if (!osd_num)
throw std::runtime_error("osd_num is required in the configuration");
run_primary = config["run_primary"] == "true" || config["run_primary"] == "1" || config["run_primary"] == "yes";
if (run_primary)
init_primary();
void osd_t::bind_socket()
{
listen_fd = socket(AF_INET, SOCK_STREAM, 0);
if (listen_fd < 0)
{
@@ -151,27 +88,13 @@ void osd_t::bind_socket()
throw std::runtime_error("bind address "+bind_address+(r == 0 ? " is not valid" : ": no ipv4 support"));
}
addr.sin_family = AF_INET;
addr.sin_port = htons(bind_port);
if (bind(listen_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
{
close(listen_fd);
throw std::runtime_error(std::string("bind: ") + strerror(errno));
}
if (bind_port == 0)
{
socklen_t len = sizeof(addr);
if (getsockname(listen_fd, (sockaddr *)&addr, &len) == -1)
{
close(listen_fd);
throw std::runtime_error(std::string("getsockname: ") + strerror(errno));
}
listening_port = ntohs(addr.sin_port);
}
else
{
listening_port = bind_port;
}
if (listen(listen_fd, listen_backlog) < 0)
{
@@ -181,6 +104,13 @@ void osd_t::bind_socket()
fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
epoll_fd = epoll_create(1);
if (epoll_fd < 0)
{
close(listen_fd);
throw std::runtime_error(std::string("epoll_create: ") + strerror(errno));
}
epoll_event ev;
ev.data.fd = listen_fd;
ev.events = EPOLLIN | EPOLLET;
@@ -190,6 +120,39 @@ void osd_t::bind_socket()
close(epoll_fd);
throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
}
consumer.loop = [this]() { loop(); };
ringloop->register_consumer(consumer);
}
osd_t::~osd_t()
{
delete tick_tfd;
ringloop->unregister_consumer(consumer);
close(epoll_fd);
close(listen_fd);
}
osd_op_t::~osd_op_t()
{
if (bs_op)
{
delete bs_op;
}
if (op_data)
{
free(op_data);
}
if (rmw_buf)
{
free(rmw_buf);
}
if (buf)
{
// Note: reusing osd_op_t WILL currently lead to memory leaks
// So we don't reuse it, but free it every time
free(buf);
}
}
bool osd_t::shutdown()
@@ -210,33 +173,8 @@ void osd_t::loop()
wait_state = 1;
}
handle_peers();
c_cli.read_requests();
c_cli.send_replies();
ringloop->submit();
}
void osd_t::set_fd_handler(int fd, std::function<void(int, int)> handler)
{
if (handler != NULL)
{
bool exists = epoll_handlers.find(fd) != epoll_handlers.end();
epoll_event ev;
ev.data.fd = fd;
ev.events = EPOLLOUT | EPOLLIN | EPOLLRDHUP | EPOLLET;
if (epoll_ctl(epoll_fd, exists ? EPOLL_CTL_MOD : EPOLL_CTL_ADD, fd, &ev) < 0)
{
throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
}
epoll_handlers[fd] = handler;
}
else
{
if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, NULL) < 0 && errno != ENOENT)
{
throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
}
epoll_handlers.erase(fd);
}
read_requests();
send_replies();
}
void osd_t::handle_epoll_events()
@@ -265,12 +203,63 @@ restart:
{
if (events[i].data.fd == listen_fd)
{
c_cli.accept_connections(listen_fd);
// Accept new connections
sockaddr_in addr;
socklen_t peer_addr_size = sizeof(addr);
int peer_fd;
while ((peer_fd = accept(listen_fd, (sockaddr*)&addr, &peer_addr_size)) >= 0)
{
char peer_str[256];
printf("osd: new client %d: connection from %s port %d\n", peer_fd, inet_ntop(AF_INET, &addr.sin_addr, peer_str, 256), ntohs(addr.sin_port));
fcntl(peer_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
int one = 1;
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
clients[peer_fd] = {
.peer_addr = addr,
.peer_port = ntohs(addr.sin_port),
.peer_fd = peer_fd,
.peer_state = PEER_CONNECTED,
};
// Add FD to epoll
epoll_event ev;
ev.data.fd = peer_fd;
ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, peer_fd, &ev) < 0)
{
throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
}
// Try to accept next connection
peer_addr_size = sizeof(addr);
}
if (peer_fd == -1 && errno != EAGAIN)
{
throw std::runtime_error(std::string("accept: ") + strerror(errno));
}
}
else
{
auto & cb = epoll_handlers[events[i].data.fd];
cb(events[i].data.fd, events[i].events);
auto & cl = clients[events[i].data.fd];
if (cl.peer_state == PEER_CONNECTING)
{
// Either OUT (connected) or HUP
handle_connect_result(cl.peer_fd);
}
else if (events[i].events & EPOLLRDHUP)
{
// Stop client
printf("osd: client %d disconnected\n", cl.peer_fd);
stop_client(cl.peer_fd);
}
else
{
// Mark client as ready (i.e. some data is available)
cl.read_ready++;
if (cl.read_ready == 1)
{
read_ready_clients.push_back(cl.peer_fd);
ringloop->wakeup();
}
}
}
}
if (nfds == MAX_EPOLL_EVENTS)
@@ -279,6 +268,85 @@ restart:
}
}
void osd_t::cancel_osd_ops(osd_client_t & cl)
{
for (auto p: cl.sent_ops)
{
cancel_op(p.second);
}
cl.sent_ops.clear();
for (auto op: cl.outbox)
{
cancel_op(op);
}
cl.outbox.clear();
if (cl.write_op)
{
cancel_op(cl.write_op);
cl.write_op = NULL;
}
}
void osd_t::cancel_op(osd_op_t *op)
{
if (op->op_type == OSD_OP_OUT)
{
op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
op->reply.hdr.id = op->req.hdr.id;
op->reply.hdr.opcode = op->req.hdr.opcode;
op->reply.hdr.retval = -EPIPE;
op->callback(op);
}
else
{
delete op;
}
}
void osd_t::stop_client(int peer_fd)
{
auto it = clients.find(peer_fd);
if (it == clients.end())
{
return;
}
auto & cl = it->second;
if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, peer_fd, NULL) < 0)
{
throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
}
if (cl.osd_num)
{
// Cancel outbound operations
cancel_osd_ops(cl);
osd_peer_fds.erase(cl.osd_num);
repeer_pgs(cl.osd_num, false);
peering_state |= OSD_PEERING_PEERS;
}
if (cl.read_op)
{
delete cl.read_op;
}
for (auto rit = read_ready_clients.begin(); rit != read_ready_clients.end(); rit++)
{
if (*rit == peer_fd)
{
read_ready_clients.erase(rit);
break;
}
}
for (auto wit = write_ready_clients.begin(); wit != write_ready_clients.end(); wit++)
{
if (*wit == peer_fd)
{
write_ready_clients.erase(wit);
break;
}
}
clients.erase(it);
close(peer_fd);
}
void osd_t::exec_op(osd_op_t *cur_op)
{
clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
@@ -288,29 +356,23 @@ void osd_t::exec_op(osd_op_t *cur_op)
delete cur_op;
return;
}
inflight_ops++;
cur_op->send_list.push_back(cur_op->reply.buf, OSD_PACKET_SIZE);
if (cur_op->req.hdr.magic != SECONDARY_OSD_OP_MAGIC ||
cur_op->req.hdr.opcode < OSD_OP_MIN || cur_op->req.hdr.opcode > OSD_OP_MAX ||
(cur_op->req.hdr.opcode == OSD_OP_SECONDARY_READ || cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE) &&
(cur_op->req.sec_rw.len > OSD_RW_MAX || cur_op->req.sec_rw.len % bs_disk_alignment || cur_op->req.sec_rw.offset % bs_disk_alignment) ||
(cur_op->req.hdr.opcode == OSD_OP_READ || cur_op->req.hdr.opcode == OSD_OP_WRITE || cur_op->req.hdr.opcode == OSD_OP_DELETE) &&
(cur_op->req.rw.len > OSD_RW_MAX || cur_op->req.rw.len % bs_disk_alignment || cur_op->req.rw.offset % bs_disk_alignment))
(cur_op->req.sec_rw.len > OSD_RW_MAX || cur_op->req.sec_rw.len % OSD_RW_ALIGN || cur_op->req.sec_rw.offset % OSD_RW_ALIGN) ||
(cur_op->req.hdr.opcode == OSD_OP_READ || cur_op->req.hdr.opcode == OSD_OP_WRITE) &&
(cur_op->req.rw.len > OSD_RW_MAX || cur_op->req.rw.len % OSD_RW_ALIGN || cur_op->req.rw.offset % OSD_RW_ALIGN))
{
// Bad command
finish_op(cur_op, -EINVAL);
return;
}
if (readonly &&
cur_op->req.hdr.opcode != OSD_OP_SECONDARY_READ &&
cur_op->req.hdr.opcode != OSD_OP_SECONDARY_LIST &&
cur_op->req.hdr.opcode != OSD_OP_READ &&
cur_op->req.hdr.opcode != OSD_OP_SHOW_CONFIG)
{
// Readonly mode
finish_op(cur_op, -EROFS);
cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
cur_op->reply.hdr.id = cur_op->req.hdr.id;
cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
cur_op->reply.hdr.retval = -EINVAL;
outbox_push(this->clients[cur_op->peer_fd], cur_op);
return;
}
inflight_ops++;
if (cur_op->req.hdr.opcode == OSD_OP_TEST_SYNC_STAB_ALL)
{
exec_sync_stab_all(cur_op);
@@ -331,84 +393,8 @@ void osd_t::exec_op(osd_op_t *cur_op)
{
continue_primary_sync(cur_op);
}
else if (cur_op->req.hdr.opcode == OSD_OP_DELETE)
{
continue_primary_del(cur_op);
}
else
{
exec_secondary(cur_op);
}
}
void osd_t::reset_stats()
{
c_cli.stats = { 0 };
prev_stats = { 0 };
memset(recovery_stat_count, 0, sizeof(recovery_stat_count));
memset(recovery_stat_bytes, 0, sizeof(recovery_stat_bytes));
}
void osd_t::print_stats()
{
for (int i = 0; i <= OSD_OP_MAX; i++)
{
if (c_cli.stats.op_stat_count[i] != prev_stats.op_stat_count[i])
{
uint64_t avg = (c_cli.stats.op_stat_sum[i] - prev_stats.op_stat_sum[i])/(c_cli.stats.op_stat_count[i] - prev_stats.op_stat_count[i]);
uint64_t bw = (c_cli.stats.op_stat_bytes[i] - prev_stats.op_stat_bytes[i]) / print_stats_interval;
if (c_cli.stats.op_stat_bytes[i] != 0)
{
printf(
"[OSD %lu] avg latency for op %d (%s): %lu us, B/W: %.2f %s\n", osd_num, i, osd_op_names[i], avg,
(bw > 1024*1024*1024 ? bw/1024.0/1024/1024 : (bw > 1024*1024 ? bw/1024.0/1024 : bw/1024.0)),
(bw > 1024*1024*1024 ? "GB/s" : (bw > 1024*1024 ? "MB/s" : "KB/s"))
);
}
else
{
printf("[OSD %lu] avg latency for op %d (%s): %lu us\n", osd_num, i, osd_op_names[i], avg);
}
prev_stats.op_stat_count[i] = c_cli.stats.op_stat_count[i];
prev_stats.op_stat_sum[i] = c_cli.stats.op_stat_sum[i];
prev_stats.op_stat_bytes[i] = c_cli.stats.op_stat_bytes[i];
}
}
for (int i = 0; i <= OSD_OP_MAX; i++)
{
if (c_cli.stats.subop_stat_count[i] != prev_stats.subop_stat_count[i])
{
uint64_t avg = (c_cli.stats.subop_stat_sum[i] - prev_stats.subop_stat_sum[i])/(c_cli.stats.subop_stat_count[i] - prev_stats.subop_stat_count[i]);
printf("[OSD %lu] avg latency for subop %d (%s): %ld us\n", osd_num, i, osd_op_names[i], avg);
prev_stats.subop_stat_count[i] = c_cli.stats.subop_stat_count[i];
prev_stats.subop_stat_sum[i] = c_cli.stats.subop_stat_sum[i];
}
}
for (int i = 0; i < 2; i++)
{
if (recovery_stat_count[0][i] != recovery_stat_count[1][i])
{
uint64_t bw = (recovery_stat_bytes[0][i] - recovery_stat_bytes[1][i]) / print_stats_interval;
printf(
"[OSD %lu] %s recovery: %.1f op/s, B/W: %.2f %s\n", osd_num, recovery_stat_names[i],
(recovery_stat_count[0][i] - recovery_stat_count[1][i]) * 1.0 / print_stats_interval,
(bw > 1024*1024*1024 ? bw/1024.0/1024/1024 : (bw > 1024*1024 ? bw/1024.0/1024 : bw/1024.0)),
(bw > 1024*1024*1024 ? "GB/s" : (bw > 1024*1024 ? "MB/s" : "KB/s"))
);
recovery_stat_count[1][i] = recovery_stat_count[0][i];
recovery_stat_bytes[1][i] = recovery_stat_bytes[0][i];
}
}
if (incomplete_objects > 0)
{
printf("[OSD %lu] %lu object(s) incomplete\n", osd_num, incomplete_objects);
}
if (degraded_objects > 0)
{
printf("[OSD %lu] %lu object(s) degraded\n", osd_num, degraded_objects);
}
if (misplaced_objects > 0)
{
printf("[OSD %lu] %lu object(s) misplaced\n", osd_num, misplaced_objects);
}
}

289
osd.h
View File

@@ -15,29 +15,144 @@
#include "blockstore.h"
#include "ringloop.h"
#include "timerfd_manager.h"
#include "timerfd_interval.h"
#include "osd_ops.h"
#include "osd_peering_pg.h"
#include "messenger.h"
#include "etcd_state_client.h"
#define OSD_LOADING_PGS 0x01
#define OSD_PEERING_PGS 0x04
#define OSD_FLUSHING_PGS 0x08
#define OSD_RECOVERING 0x10
#include "sparsepp/sparsepp/spp.h"
#define IMMEDIATE_NONE 0
#define IMMEDIATE_SMALL 1
#define IMMEDIATE_ALL 2
#define OSD_OP_IN 0
#define OSD_OP_OUT 1
#define MAX_AUTOSYNC_INTERVAL 3600
#define DEFAULT_AUTOSYNC_INTERVAL 5
#define MAX_RECOVERY_QUEUE 2048
#define DEFAULT_RECOVERY_QUEUE 4
#define DEFAULT_PG_STRIPE_SIZE 4*1024*1024 // 4 MB by default
#define CL_READ_OP 1
#define CL_READ_DATA 2
#define CL_READ_REPLY_DATA 3
#define CL_WRITE_READY 1
#define CL_WRITE_REPLY 2
#define MAX_EPOLL_EVENTS 64
#define OSD_OP_INLINE_BUF_COUNT 16
#define PEER_CONNECTING 1
#define PEER_CONNECTED 2
#define OSD_PEERING_PEERS 1
#define OSD_PEERING_PGS 2
//#define OSD_STUB
extern const char* osd_op_names[];
struct osd_op_buf_list_t
{
int count = 0, alloc = 0, sent = 0;
iovec *buf = NULL;
iovec inline_buf[OSD_OP_INLINE_BUF_COUNT];
~osd_op_buf_list_t()
{
if (buf && buf != inline_buf)
{
free(buf);
}
}
inline iovec* get_iovec()
{
return (buf ? buf : inline_buf) + sent;
}
inline int get_size()
{
return count - sent;
}
inline void push_back(void *nbuf, size_t len)
{
if (count >= alloc)
{
if (!alloc)
{
alloc = OSD_OP_INLINE_BUF_COUNT;
buf = inline_buf;
}
else if (buf == inline_buf)
{
int old = alloc;
alloc = ((alloc/16)*16 + 1);
buf = (iovec*)malloc(sizeof(iovec) * alloc);
memcpy(buf, inline_buf, sizeof(iovec)*old);
}
else
{
alloc = ((alloc/16)*16 + 1);
buf = (iovec*)realloc(buf, sizeof(iovec) * alloc);
}
}
buf[count++] = { .iov_base = nbuf, .iov_len = len };
}
};
struct osd_primary_op_data_t;
struct osd_op_t
{
timespec tv_begin;
timespec tv_send;
int op_type = OSD_OP_IN;
int peer_fd;
osd_any_op_t req;
osd_any_reply_t reply;
blockstore_op_t *bs_op = NULL;
void *buf = NULL;
void *rmw_buf = NULL;
osd_primary_op_data_t* op_data = NULL;
std::function<void(osd_op_t*)> callback;
osd_op_buf_list_t send_list;
~osd_op_t();
};
struct osd_peer_def_t
{
osd_num_t osd_num = 0;
std::string addr;
int port = 0;
time_t last_connect_attempt = 0;
};
struct osd_client_t
{
sockaddr_in peer_addr;
int peer_port;
int peer_fd;
int peer_state;
std::function<void(osd_num_t, int)> connect_callback;
osd_num_t osd_num = 0;
// Read state
int read_ready = 0;
osd_op_t *read_op = NULL;
int read_reply_id = 0;
iovec read_iov;
msghdr read_msg;
void *read_buf = NULL;
int read_remaining = 0;
int read_state = 0;
// Outbound operations sent to this client (which is probably an OSD peer)
std::map<int, osd_op_t*> sent_ops;
// Outbound messages (replies or requests)
std::deque<osd_op_t*> outbox;
// PGs dirtied by this client's primary-writes
std::set<pg_num_t> dirty_pgs;
// Write state
osd_op_t *write_op = NULL;
msghdr write_msg;
int write_state = 0;
};
struct osd_rmw_stripe_t;
struct osd_object_id_t
{
@@ -45,58 +160,26 @@ struct osd_object_id_t
object_id oid;
};
struct osd_recovery_op_t
{
int st = 0;
bool degraded = false;
pg_num_t pg_num = 0;
object_id oid = { 0 };
osd_op_t *osd_op = NULL;
};
class osd_t
{
// config
blockstore_config_t config;
int etcd_report_interval = 30;
bool readonly = false;
osd_num_t osd_num = 1; // OSD numbers start with 1
bool run_primary = false;
std::vector<osd_peer_def_t> peers;
blockstore_config_t config;
std::string bind_address;
int bind_port, listen_backlog;
// FIXME: Implement client queue depth limit
int client_queue_depth = 128;
bool allow_test_ops = true;
int print_stats_interval = 3;
int immediate_commit = IMMEDIATE_NONE;
int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // sync every 5 seconds
int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
int log_level = 0;
// cluster state
// peer OSDs
etcd_state_client_t st_cli;
osd_messenger_t c_cli;
int etcd_failed_attempts = 0;
std::string etcd_lease_id;
json11::Json self_state;
bool loading_peer_config = false;
std::set<pg_num_t> pg_state_dirty;
bool pg_config_applied = false;
bool etcd_reporting_pg_state = false;
bool etcd_reporting_stats = false;
// peers and PGs
std::map<pg_num_t, pg_t> pgs;
std::set<pg_num_t> dirty_pgs;
uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0;
std::map<uint64_t, int> osd_peer_fds;
std::vector<pg_t> pgs;
int peering_state = 0;
unsigned pg_count = 0;
std::map<object_id, osd_recovery_op_t> recovery_ops;
osd_op_t *autosync_op = NULL;
uint64_t next_subop_id = 1;
// Unstable writes
std::map<osd_object_id_t, uint64_t> unstable_writes;
@@ -108,73 +191,53 @@ class osd_t
int inflight_ops = 0;
blockstore_t *bs;
uint32_t bs_block_size, bs_disk_alignment;
uint64_t pg_stripe_size = DEFAULT_PG_STRIPE_SIZE;
uint64_t parity_block_size = 4*1024*1024; // 4 MB by default
ring_loop_t *ringloop;
timerfd_manager_t *tfd = NULL;
timerfd_interval *tick_tfd;
int wait_state = 0;
int epoll_fd = 0;
int listening_port = 0;
int listen_fd = 0;
ring_consumer_t consumer;
std::map<int, std::function<void(int, int)>> epoll_handlers;
// op statistics
osd_op_stats_t prev_stats;
const char* recovery_stat_names[2] = { "degraded", "misplaced" };
uint64_t recovery_stat_count[2][2] = { 0 };
uint64_t recovery_stat_bytes[2][2] = { 0 };
std::unordered_map<int,osd_client_t> clients;
std::vector<int> read_ready_clients;
std::vector<int> write_ready_clients;
uint64_t op_stat_sum[OSD_OP_MAX+1] = { 0 };
uint64_t op_stat_count[OSD_OP_MAX+1] = { 0 };
uint64_t subop_stat_sum[OSD_OP_MAX+1] = { 0 };
uint64_t subop_stat_count[OSD_OP_MAX+1] = { 0 };
uint64_t send_stat_sum = 0;
uint64_t send_stat_count = 0;
// cluster connection
void parse_config(blockstore_config_t & config);
void init_cluster();
void on_change_osd_state_hook(uint64_t osd_num);
void on_change_etcd_state_hook(json11::Json::object & changes);
void on_load_config_hook(json11::Json::object & changes);
json11::Json on_load_pgs_checks_hook();
void on_load_pgs_hook(bool success);
void bind_socket();
void acquire_lease();
json11::Json get_osd_state();
void create_osd_state();
void renew_lease();
void print_stats();
void reset_stats();
json11::Json get_statistics();
void report_statistics();
void report_pg_state(pg_t & pg);
void report_pg_states();
void apply_pg_count();
void apply_pg_config();
// methods
// event loop, socket read/write
void loop();
void set_fd_handler(int fd, std::function<void(int, int)> handler);
void handle_epoll_events();
void read_requests();
void handle_read(ring_data_t *data, int peer_fd);
void handle_op_hdr(osd_client_t *cl);
void handle_reply_hdr(osd_client_t *cl);
bool try_send(osd_client_t & cl);
void send_replies();
void handle_send(ring_data_t *data, int peer_fd);
void outbox_push(osd_client_t & cl, osd_op_t *op);
// peer handling (primary OSD logic)
void parse_test_peer(std::string peer);
void connect_peer(osd_num_t osd_num, const char *peer_host, int peer_port, std::function<void(osd_num_t, int)> callback);
void handle_connect_result(int peer_fd);
void cancel_osd_ops(osd_client_t & cl);
void cancel_op(osd_op_t *op);
void stop_client(int peer_fd);
osd_peer_def_t parse_peer(std::string peer);
void init_primary();
void handle_peers();
void repeer_pgs(osd_num_t osd_num);
void start_pg_peering(pg_num_t pg_num);
void submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *ps);
void submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps);
void discard_list_subop(osd_op_t *list_op);
bool stop_pg(pg_num_t pg_num);
void finish_stop_pg(pg_t & pg);
// flushing, recovery and backfill
void submit_pg_flush_ops(pg_num_t pg_num);
void handle_flush_op(bool rollback, pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t peer_osd, int retval);
void submit_flush_op(pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data);
bool pick_next_recovery(osd_recovery_op_t &op);
void submit_recovery_op(osd_recovery_op_t *op);
bool continue_recovery();
pg_osd_set_state_t* change_osd_set(pg_osd_set_state_t *st, pg_t *pg);
void repeer_pgs(osd_num_t osd_num, bool is_connected);
void start_pg_peering(int i);
// op execution
void exec_op(osd_op_t *cur_op);
void finish_op(osd_op_t *cur_op, int retval);
// secondary ops
void exec_sync_stab_all(osd_op_t *cur_op);
@@ -183,34 +246,18 @@ class osd_t
void secondary_op_callback(osd_op_t *cur_op);
// primary ops
void autosync();
bool prepare_primary_rw(osd_op_t *cur_op);
void continue_primary_read(osd_op_t *cur_op);
void continue_primary_write(osd_op_t *cur_op);
void cancel_primary_write(osd_op_t *cur_op);
void continue_primary_sync(osd_op_t *cur_op);
void continue_primary_del(osd_op_t *cur_op);
bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
void remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg);
bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);
void handle_primary_bs_subop(osd_op_t *subop);
void add_bs_subop_stats(osd_op_t *subop);
void pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid, int retval);
void finish_primary_op(osd_op_t *cur_op, int retval);
void handle_primary_subop(osd_op_t *cur_op, int ok, uint64_t version);
void submit_primary_subops(int submit_type, int read_pg_size, const uint64_t* osd_set, osd_op_t *cur_op);
void submit_primary_del_subops(osd_op_t *cur_op, uint64_t *cur_set, pg_osd_set_t & loc_set);
void submit_primary_sync_subops(osd_op_t *cur_op);
void submit_primary_stab_subops(osd_op_t *cur_op);
inline pg_num_t map_to_pg(object_id oid)
{
return (oid.inode + oid.stripe / pg_stripe_size) % pg_count + 1;
}
public:
osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop);
~osd_t();
void force_stop(int exitcode);
bool shutdown();
};

40
osd_client.cpp Normal file
View File

@@ -0,0 +1,40 @@
void slice()
{
// Slice the request into blockstore requests to individual objects
// Primary OSD still operates individual stripes, except they're twice the size of the blockstore's stripe.
std::vector read_parts;
int block = bs->get_block_size();
uint64_t stripe1 = cur_op->req.rw.offset / block / 2;
uint64_t stripe2 = (cur_op->req.rw.offset + cur_op->req.rw.len + block*2 - 1) / block / 2 - 1;
for (uint64_t s = stripe1; s <= stripe2; s++)
{
uint64_t start = s == stripe1 ? cur_op->req.rw.offset - stripe1*block*2 : 0;
uint64_t end = s == stripe2 ? cur_op->req.rw.offset + cur_op->req.rw.len - stripe2*block*2 : block*2;
if (start < block)
{
read_parts.push_back({
.role = 1,
.oid = {
.inode = cur_op->req.rw.inode,
.stripe = (s << STRIPE_ROLE_BITS) | 1,
},
.version = UINT64_MAX,
.offset = start,
.len = (block < end ? block : end) - start,
});
}
if (end > block)
{
read_parts.push_back({
.role = 2,
.oid = {
.inode = cur_op->req.rw.inode,
.stripe = (s << STRIPE_ROLE_BITS) | 2,
},
.version = UINT64_MAX,
.offset = (start > block ? start-block : 0),
.len = end - (start > block ? start-block : 0),
});
}
}
}

View File

@@ -1,746 +0,0 @@
#include "osd.h"
#include "base64.h"
#include "etcd_state_client.h"
// Startup sequence:
// Start etcd watcher -> Load global OSD configuration -> Bind socket -> Acquire lease -> Report&lock OSD state
// -> Load PG config -> Report&lock PG states -> Load peers -> Connect to peers -> Peer PGs
// Event handling
// Wait for PG changes -> Start/Stop PGs when requested
// Peer connection is lost -> Reload connection data -> Try to reconnect
void osd_t::init_cluster()
{
if (!st_cli.etcd_addresses.size())
{
if (run_primary)
{
// Test version of clustering code with 1 PG and 2 peers
// Example: peers = 2:127.0.0.1:11204,3:127.0.0.1:11205
std::string peerstr = config["peers"];
while (peerstr.size())
{
int pos = peerstr.find(',');
parse_test_peer(pos < 0 ? peerstr : peerstr.substr(0, pos));
peerstr = pos < 0 ? std::string("") : peerstr.substr(pos+1);
}
if (st_cli.peer_states.size() < 2)
{
throw std::runtime_error("run_primary requires at least 2 peers");
}
pgs[1] = (pg_t){
.state = PG_PEERING,
.pg_cursize = 0,
.pg_num = 1,
.target_set = { 1, 2, 3 },
.cur_set = { 0, 0, 0 },
};
report_pg_state(pgs[1]);
pg_count = 1;
}
bind_socket();
}
else
{
st_cli.tfd = tfd;
st_cli.log_level = log_level;
st_cli.on_change_osd_state_hook = [this](uint64_t peer_osd) { on_change_osd_state_hook(peer_osd); };
st_cli.on_change_hook = [this](json11::Json::object & changes) { on_change_etcd_state_hook(changes); };
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
st_cli.load_pgs_checks_hook = [this]() { return on_load_pgs_checks_hook(); };
st_cli.on_load_pgs_hook = [this](bool success) { on_load_pgs_hook(success); };
peering_state = OSD_LOADING_PGS;
st_cli.load_global_config();
}
if (run_primary && autosync_interval > 0)
{
this->tfd->set_timer(autosync_interval*1000, true, [this](int timer_id)
{
autosync();
});
}
}
void osd_t::parse_test_peer(std::string peer)
{
// OSD_NUM:IP:PORT
int pos1 = peer.find(':');
int pos2 = peer.find(':', pos1+1);
if (pos1 < 0 || pos2 < 0)
throw new std::runtime_error("OSD peer string must be in the form OSD_NUM:IP:PORT");
std::string addr = peer.substr(pos1+1, pos2-pos1-1);
std::string osd_num_str = peer.substr(0, pos1);
std::string port_str = peer.substr(pos2+1);
osd_num_t peer_osd = strtoull(osd_num_str.c_str(), NULL, 10);
if (!peer_osd)
throw new std::runtime_error("Could not parse OSD peer osd_num");
else if (st_cli.peer_states.find(peer_osd) != st_cli.peer_states.end())
throw std::runtime_error("Same osd number "+std::to_string(peer_osd)+" specified twice in peers");
int port = strtoull(port_str.c_str(), NULL, 10);
if (!port)
throw new std::runtime_error("Could not parse OSD peer port");
st_cli.peer_states[peer_osd] = json11::Json::object {
{ "state", "up" },
{ "addresses", json11::Json::array { addr } },
{ "port", port },
};
c_cli.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
}
json11::Json osd_t::get_osd_state()
{
std::vector<char> hostname;
hostname.resize(1024);
while (gethostname(hostname.data(), hostname.size()) < 0 && errno == ENAMETOOLONG)
hostname.resize(hostname.size()+1024);
hostname.resize(strnlen(hostname.data(), hostname.size()));
json11::Json::object st;
st["state"] = "up";
if (bind_address != "0.0.0.0")
st["addresses"] = json11::Json::array { bind_address };
else
st["addresses"] = getifaddr_list();
st["host"] = std::string(hostname.data(), hostname.size());
st["port"] = listening_port;
st["primary_enabled"] = run_primary;
st["blockstore_enabled"] = bs ? true : false;
return st;
}
json11::Json osd_t::get_statistics()
{
json11::Json::object st;
timespec ts;
clock_gettime(CLOCK_REALTIME, &ts);
char time_str[50] = { 0 };
sprintf(time_str, "%ld.%03ld", ts.tv_sec, ts.tv_nsec/1000000);
st["time"] = time_str;
st["blockstore_ready"] = bs->is_started();
if (bs)
{
st["size"] = bs->get_block_count() * bs->get_block_size();
st["free"] = bs->get_free_block_count() * bs->get_block_size();
}
st["host"] = self_state["host"];
json11::Json::object op_stats, subop_stats;
for (int i = 0; i <= OSD_OP_MAX; i++)
{
op_stats[osd_op_names[i]] = json11::Json::object {
{ "count", c_cli.stats.op_stat_count[i] },
{ "usec", c_cli.stats.op_stat_sum[i] },
{ "bytes", c_cli.stats.op_stat_bytes[i] },
};
}
for (int i = 0; i <= OSD_OP_MAX; i++)
{
subop_stats[osd_op_names[i]] = json11::Json::object {
{ "count", c_cli.stats.subop_stat_count[i] },
{ "usec", c_cli.stats.subop_stat_sum[i] },
};
}
st["op_stats"] = op_stats;
st["subop_stats"] = subop_stats;
st["recovery_stats"] = json11::Json::object {
{ recovery_stat_names[0], json11::Json::object {
{ "count", recovery_stat_count[0][0] },
{ "bytes", recovery_stat_bytes[0][0] },
} },
{ recovery_stat_names[1], json11::Json::object {
{ "count", recovery_stat_count[0][1] },
{ "bytes", recovery_stat_bytes[0][1] },
} },
};
return st;
}
void osd_t::report_statistics()
{
if (etcd_reporting_stats)
{
return;
}
etcd_reporting_stats = true;
json11::Json::array txn = { json11::Json::object {
{ "request_put", json11::Json::object {
{ "key", base64_encode(st_cli.etcd_prefix+"/osd/stats/"+std::to_string(osd_num)) },
{ "value", base64_encode(get_statistics().dump()) },
} }
} };
for (auto & p: pgs)
{
auto & pg = p.second;
if (pg.state & (PG_OFFLINE | PG_STARTING))
{
// Don't report statistics for offline PGs
continue;
}
json11::Json::object pg_stats;
pg_stats["object_count"] = pg.total_count;
pg_stats["clean_count"] = pg.clean_count;
pg_stats["misplaced_count"] = pg.misplaced_objects.size();
pg_stats["degraded_count"] = pg.degraded_objects.size();
pg_stats["incomplete_count"] = pg.incomplete_objects.size();
pg_stats["write_osd_set"] = pg.cur_set;
txn.push_back(json11::Json::object {
{ "request_put", json11::Json::object {
{ "key", base64_encode(st_cli.etcd_prefix+"/pg/stats/"+std::to_string(pg.pg_num)) },
{ "value", base64_encode(json11::Json(pg_stats).dump()) },
} }
});
}
st_cli.etcd_txn(json11::Json::object { { "success", txn } }, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json res)
{
etcd_reporting_stats = false;
if (err != "")
{
printf("[OSD %lu] Error reporting state to etcd: %s\n", this->osd_num, err.c_str());
// Retry indefinitely
tfd->set_timer(ETCD_SLOW_TIMEOUT, false, [this](int timer_id)
{
report_statistics();
});
}
else if (res["error"].string_value() != "")
{
printf("[OSD %lu] Error reporting state to etcd: %s\n", this->osd_num, res["error"].string_value().c_str());
force_stop(1);
}
});
}
void osd_t::on_change_osd_state_hook(uint64_t peer_osd)
{
if (c_cli.wanted_peers.find(peer_osd) != c_cli.wanted_peers.end())
{
c_cli.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
}
}
void osd_t::on_change_etcd_state_hook(json11::Json::object & changes)
{
// FIXME apply config changes in runtime (maybe, some)
apply_pg_count();
apply_pg_config();
}
void osd_t::on_load_config_hook(json11::Json::object & global_config)
{
blockstore_config_t osd_config = this->config;
for (auto & cfg_var: global_config)
{
if (this->config.find(cfg_var.first) == this->config.end())
{
if (cfg_var.second.is_string())
{
osd_config[cfg_var.first] = cfg_var.second.string_value();
}
else
{
osd_config[cfg_var.first] = cfg_var.second.dump();
}
}
}
parse_config(osd_config);
bind_socket();
st_cli.start_etcd_watcher();
acquire_lease();
}
// Acquire lease
void osd_t::acquire_lease()
{
// Maximum lease TTL is (report interval) + retries * (timeout + repeat interval)
st_cli.etcd_call("/lease/grant", json11::Json::object {
{ "TTL", etcd_report_interval+(MAX_ETCD_ATTEMPTS*(2*ETCD_QUICK_TIMEOUT)+999)/1000 }
}, ETCD_QUICK_TIMEOUT, [this](std::string err, json11::Json data)
{
if (err != "" || data["ID"].string_value() == "")
{
printf("Error acquiring a lease from etcd: %s\n", err.c_str());
tfd->set_timer(ETCD_QUICK_TIMEOUT, false, [this](int timer_id)
{
acquire_lease();
});
return;
}
etcd_lease_id = data["ID"].string_value();
create_osd_state();
});
printf("[OSD %lu] reporting to etcd at %s every %d seconds\n", this->osd_num, config["etcd_address"].c_str(), etcd_report_interval);
tfd->set_timer(etcd_report_interval*1000, true, [this](int timer_id)
{
renew_lease();
});
}
// Report "up" state once, then keep it alive using the lease
// Do it first to allow "monitors" check it when moving PGs
void osd_t::create_osd_state()
{
std::string state_key = base64_encode(st_cli.etcd_prefix+"/osd/state/"+std::to_string(osd_num));
self_state = get_osd_state();
st_cli.etcd_txn(json11::Json::object {
// Check that the state key does not exist
{ "compare", json11::Json::array {
json11::Json::object {
{ "target", "CREATE" },
{ "create_revision", 0 },
{ "key", state_key },
}
} },
{ "success", json11::Json::array {
json11::Json::object {
{ "request_put", json11::Json::object {
{ "key", state_key },
{ "value", base64_encode(self_state.dump()) },
{ "lease", etcd_lease_id },
} }
},
} },
{ "failure", json11::Json::array {
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", state_key },
} }
},
} },
}, ETCD_QUICK_TIMEOUT, [this](std::string err, json11::Json data)
{
if (err != "")
{
etcd_failed_attempts++;
printf("Error creating OSD state key: %s\n", err.c_str());
if (etcd_failed_attempts > MAX_ETCD_ATTEMPTS)
{
// Die
throw std::runtime_error("Cluster connection failed");
}
// Retry
tfd->set_timer(ETCD_QUICK_TIMEOUT, false, [this](int timer_id)
{
create_osd_state();
});
return;
}
if (!data["succeeded"].bool_value())
{
// OSD is already up
auto kv = st_cli.parse_etcd_kv(data["responses"][0]["response_range"]["kvs"][0]);
printf("Key %s already exists in etcd, OSD %lu is still up\n", kv.key.c_str(), this->osd_num);
int64_t port = kv.value["port"].int64_value();
for (auto & addr: kv.value["addresses"].array_items())
{
printf(" listening at: %s:%ld\n", addr.string_value().c_str(), port);
}
force_stop(0);
return;
}
if (run_primary)
{
st_cli.load_pgs();
}
});
}
// Renew lease
void osd_t::renew_lease()
{
st_cli.etcd_call("/lease/keepalive", json11::Json::object {
{ "ID", etcd_lease_id }
}, ETCD_QUICK_TIMEOUT, [this](std::string err, json11::Json data)
{
if (err == "" && data["result"]["TTL"].string_value() == "")
{
// Die
throw std::runtime_error("etcd lease has expired");
}
if (err != "")
{
etcd_failed_attempts++;
printf("Error renewing etcd lease: %s\n", err.c_str());
if (etcd_failed_attempts > MAX_ETCD_ATTEMPTS)
{
// Die
throw std::runtime_error("Cluster connection failed");
}
// Retry
tfd->set_timer(ETCD_QUICK_TIMEOUT, false, [this](int timer_id)
{
renew_lease();
});
}
else
{
etcd_failed_attempts = 0;
report_statistics();
}
});
}
void osd_t::force_stop(int exitcode)
{
if (etcd_lease_id != "")
{
st_cli.etcd_call("/kv/lease/revoke", json11::Json::object {
{ "ID", etcd_lease_id }
}, ETCD_QUICK_TIMEOUT, [this, exitcode](std::string err, json11::Json data)
{
if (err != "")
{
printf("Error revoking etcd lease: %s\n", err.c_str());
}
printf("[OSD %lu] Force stopping\n", this->osd_num);
exit(exitcode);
});
}
else
{
printf("[OSD %lu] Force stopping\n", this->osd_num);
exit(exitcode);
}
}
json11::Json osd_t::on_load_pgs_checks_hook()
{
assert(this->pgs.size() == 0);
json11::Json::array checks = {
json11::Json::object {
{ "target", "LEASE" },
{ "lease", etcd_lease_id },
{ "key", base64_encode(st_cli.etcd_prefix+"/osd/state/"+std::to_string(osd_num)) },
}
};
return checks;
}
void osd_t::on_load_pgs_hook(bool success)
{
if (!success)
{
printf("Error loading PGs from etcd: lease expired\n");
force_stop(1);
}
else
{
peering_state &= ~OSD_LOADING_PGS;
apply_pg_count();
apply_pg_config();
}
}
void osd_t::apply_pg_count()
{
pg_num_t pg_count = st_cli.pg_config.size();
if (pg_count > 0 && (st_cli.pg_config.begin()->first != 1 || std::prev(st_cli.pg_config.end())->first != pg_count))
{
printf("Invalid PG configuration: PG numbers don't cover the whole 1..%d range\n", pg_count);
force_stop(1);
return;
}
if (this->pg_count != 0 && this->pg_count != pg_count)
{
// Check that all PGs are offline. It is not allowed to change PG count when any PGs are online
// The external tool must wait for all PGs to come down before changing PG count
// If it doesn't wait, a restarted OSD may apply the new count immediately which will lead to bugs
// So an OSD just dies if it detects PG count change while there are active PGs
int still_active = 0;
for (auto & kv: pgs)
{
if (kv.second.state & PG_ACTIVE)
{
still_active++;
}
}
if (still_active > 0)
{
printf("[OSD %lu] PG count change detected, but %d PG(s) are still active. This is not allowed. Exiting\n", this->osd_num, still_active);
force_stop(1);
return;
}
}
this->pg_count = pg_count;
}
void osd_t::apply_pg_config()
{
bool all_applied = true;
for (auto & kv: st_cli.pg_config)
{
pg_num_t pg_num = kv.first;
auto & pg_cfg = kv.second;
bool take = pg_cfg.exists && pg_cfg.primary == this->osd_num &&
!pg_cfg.pause && (!pg_cfg.cur_primary || pg_cfg.cur_primary == this->osd_num);
bool currently_taken = this->pgs.find(pg_num) != this->pgs.end() &&
this->pgs[pg_num].state != PG_OFFLINE;
if (currently_taken && !take)
{
// Stop this PG
stop_pg(pg_num);
}
else if (take)
{
// Take this PG
std::set<osd_num_t> all_peers;
for (osd_num_t pg_osd: pg_cfg.target_set)
{
if (pg_osd != 0)
{
all_peers.insert(pg_osd);
}
}
for (osd_num_t pg_osd: pg_cfg.all_peers)
{
if (pg_osd != 0)
{
all_peers.insert(pg_osd);
}
}
for (auto & hist_item: pg_cfg.target_history)
{
for (auto pg_osd: hist_item)
{
if (pg_osd != 0)
{
all_peers.insert(pg_osd);
}
}
}
if (currently_taken)
{
if (this->pgs[pg_num].state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING))
{
if (this->pgs[pg_num].target_set == pg_cfg.target_set)
{
// No change in osd_set; history changes are ignored
continue;
}
else
{
// Stop PG, reapply change after stopping
stop_pg(pg_num);
all_applied = false;
continue;
}
}
else if (this->pgs[pg_num].state & PG_STOPPING)
{
// Reapply change after stopping
all_applied = false;
continue;
}
else if (this->pgs[pg_num].state & PG_STARTING)
{
if (pg_cfg.cur_primary == this->osd_num)
{
// PG locked, continue
}
else
{
// Reapply change after locking the PG
all_applied = false;
continue;
}
}
else
{
throw std::runtime_error("Unexpected PG "+std::to_string(pg_num)+" state: "+std::to_string(this->pgs[pg_num].state));
}
}
this->pgs[pg_num] = (pg_t){
.state = pg_cfg.cur_primary == this->osd_num ? PG_PEERING : PG_STARTING,
.pg_cursize = 0,
.pg_num = pg_num,
.target_history = pg_cfg.target_history,
.all_peers = std::vector<osd_num_t>(all_peers.begin(), all_peers.end()),
.target_set = pg_cfg.target_set,
};
this->pg_state_dirty.insert(pg_num);
this->pgs[pg_num].print_state();
if (pg_cfg.cur_primary == this->osd_num)
{
// Add peers
for (auto pg_osd: all_peers)
{
if (pg_osd != this->osd_num && c_cli.osd_peer_fds.find(pg_osd) == c_cli.osd_peer_fds.end())
{
c_cli.connect_peer(pg_osd, st_cli.peer_states[pg_osd]);
}
}
start_pg_peering(pg_num);
}
else
{
// Reapply change after locking the PG
all_applied = false;
}
}
}
report_pg_states();
this->pg_config_applied = all_applied;
}
void osd_t::report_pg_states()
{
if (etcd_reporting_pg_state || !this->pg_state_dirty.size() || !st_cli.etcd_addresses.size())
{
return;
}
etcd_reporting_pg_state = true;
std::vector<std::pair<pg_num_t,bool>> reporting_pgs;
json11::Json::array checks;
json11::Json::array success;
json11::Json::array failure;
for (auto it = pg_state_dirty.begin(); it != pg_state_dirty.end(); it++)
{
auto pg_it = this->pgs.find(*it);
if (pg_it == this->pgs.end())
{
continue;
}
auto & pg = pg_it->second;
reporting_pgs.push_back({ pg.pg_num, pg.history_changed });
std::string state_key_base64 = base64_encode(st_cli.etcd_prefix+"/pg/state/"+std::to_string(pg.pg_num));
if (pg.state == PG_STARTING)
{
// Check that the PG key does not exist
// Failed check indicates an unsuccessful PG lock attempt in this case
checks.push_back(json11::Json::object {
{ "target", "VERSION" },
{ "version", 0 },
{ "key", state_key_base64 },
});
}
else
{
// Check that the key is ours
// Failed check indicates success for OFFLINE pgs (PG lock is already deleted)
// and an unexpected race condition for started pgs (PG lock is held by someone else)
checks.push_back(json11::Json::object {
{ "target", "LEASE" },
{ "lease", etcd_lease_id },
{ "key", state_key_base64 },
});
}
if (pg.state == PG_OFFLINE)
{
success.push_back(json11::Json::object {
{ "request_delete_range", json11::Json::object {
{ "key", state_key_base64 },
} }
});
}
else
{
json11::Json::array pg_state_keywords;
for (int i = 0; i < pg_state_bit_count; i++)
{
if (pg.state & pg_state_bits[i])
{
pg_state_keywords.push_back(pg_state_names[i]);
}
}
success.push_back(json11::Json::object {
{ "request_put", json11::Json::object {
{ "key", base64_encode(st_cli.etcd_prefix+"/pg/state/"+std::to_string(pg.pg_num)) },
{ "value", base64_encode(json11::Json(json11::Json::object {
{ "primary", this->osd_num },
{ "state", pg_state_keywords },
{ "peers", pg.cur_peers },
}).dump()) },
{ "lease", etcd_lease_id },
} }
});
if (pg.history_changed)
{
pg.history_changed = false;
if (pg.state == PG_ACTIVE)
{
success.push_back(json11::Json::object {
{ "request_delete_range", json11::Json::object {
{ "key", base64_encode(st_cli.etcd_prefix+"/pg/history/"+std::to_string(pg.pg_num)) },
} }
});
}
else if (pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD))
{
success.push_back(json11::Json::object {
{ "request_put", json11::Json::object {
{ "key", base64_encode(st_cli.etcd_prefix+"/pg/history/"+std::to_string(pg.pg_num)) },
{ "value", base64_encode(json11::Json(json11::Json::object {
{ "all_peers", pg.all_peers },
}).dump()) },
} }
});
}
}
}
failure.push_back(json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", state_key_base64 },
} }
});
}
pg_state_dirty.clear();
st_cli.etcd_txn(json11::Json::object {
{ "compare", checks }, { "success", success }, { "failure", failure }
}, ETCD_QUICK_TIMEOUT, [this, reporting_pgs](std::string err, json11::Json data)
{
etcd_reporting_pg_state = false;
if (!data["succeeded"].bool_value())
{
// One of PG state updates failed, put dirty flags back
for (auto pp: reporting_pgs)
{
this->pg_state_dirty.insert(pp.first);
if (pp.second)
{
auto pg_it = this->pgs.find(pp.first);
if (pg_it != this->pgs.end())
{
pg_it->second.history_changed = true;
}
}
}
for (auto & res: data["responses"].array_items())
{
if (res["kvs"].array_items().size())
{
auto kv = st_cli.parse_etcd_kv(res["kvs"][0]);
pg_num_t pg_num = stoull_full(kv.key.substr(st_cli.etcd_prefix.length()+10));
auto pg_it = pgs.find(pg_num);
if (pg_it != pgs.end() && pg_it->second.state != PG_OFFLINE && pg_it->second.state != PG_STARTING)
{
// Live PG state update failed
printf("Failed to report state of PG %u which is live. Race condition detected, exiting\n", pg_num);
force_stop(1);
return;
}
}
}
// Retry after a short pause (hope we'll get some updates and update PG states accordingly)
tfd->set_timer(500, false, [this](int) { report_pg_states(); });
}
else
{
// Success. We'll get our changes back via the watcher and react to them
for (auto pp: reporting_pgs)
{
auto pg_it = this->pgs.find(pp.first);
if (pg_it != this->pgs.end())
{
if (pg_it->second.state == PG_OFFLINE)
{
// Remove offline PGs after reporting their state
this->pgs.erase(pg_it);
}
}
}
// Push other PG state updates, if any
report_pg_states();
if (!this->pg_state_dirty.size())
{
// Update statistics
report_statistics();
}
}
});
}

View File

@@ -1,300 +0,0 @@
#include "osd.h"
#define FLUSH_BATCH 512
void osd_t::submit_pg_flush_ops(pg_num_t pg_num)
{
pg_t & pg = pgs[pg_num];
pg_flush_batch_t *fb = new pg_flush_batch_t();
pg.flush_batch = fb;
auto it = pg.flush_actions.begin(), prev_it = pg.flush_actions.begin();
bool first = true;
while (it != pg.flush_actions.end())
{
if (!first && (it->first.oid.inode != prev_it->first.oid.inode ||
(it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK)) &&
fb->rollback_lists[it->first.osd_num].size() >= FLUSH_BATCH ||
fb->stable_lists[it->first.osd_num].size() >= FLUSH_BATCH)
{
// Stop only at the object boundary
break;
}
it->second.submitted = true;
if (it->second.rollback)
{
fb->flush_objects++;
fb->rollback_lists[it->first.osd_num].push_back((obj_ver_id){
.oid = it->first.oid,
.version = it->second.rollback_to,
});
}
if (it->second.make_stable)
{
fb->flush_objects++;
fb->stable_lists[it->first.osd_num].push_back((obj_ver_id){
.oid = it->first.oid,
.version = it->second.stable_to,
});
}
prev_it = it;
first = false;
it++;
}
for (auto & l: fb->rollback_lists)
{
if (l.second.size() > 0)
{
fb->flush_ops++;
submit_flush_op(pg.pg_num, fb, true, l.first, l.second.size(), l.second.data());
}
}
for (auto & l: fb->stable_lists)
{
if (l.second.size() > 0)
{
fb->flush_ops++;
submit_flush_op(pg.pg_num, fb, false, l.first, l.second.size(), l.second.data());
}
}
}
void osd_t::handle_flush_op(bool rollback, pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t peer_osd, int retval)
{
if (pgs.find(pg_num) == pgs.end() || pgs[pg_num].flush_batch != fb)
{
// Throw the result away
return;
}
if (retval != 0)
{
if (peer_osd == this->osd_num)
{
throw std::runtime_error(
std::string(rollback
? "Error while doing local rollback operation: "
: "Error while doing local stabilize operation: "
) + strerror(-retval)
);
}
else
{
printf("Error while doing flush on OSD %lu: %d (%s)\n", osd_num, retval, strerror(-retval));
auto fd_it = c_cli.osd_peer_fds.find(peer_osd);
if (fd_it != c_cli.osd_peer_fds.end())
{
c_cli.stop_client(fd_it->second);
}
return;
}
}
fb->flush_done++;
if (fb->flush_done == fb->flush_ops)
{
// This flush batch is done
std::vector<osd_op_t*> continue_ops;
auto & pg = pgs[pg_num];
auto it = pg.flush_actions.begin(), prev_it = it;
auto erase_start = it;
while (1)
{
if (it == pg.flush_actions.end() ||
it->first.oid.inode != prev_it->first.oid.inode ||
(it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK))
{
pg.ver_override.erase((object_id){
.inode = prev_it->first.oid.inode,
.stripe = (prev_it->first.oid.stripe & ~STRIPE_MASK),
});
auto wr_it = pg.write_queue.find((object_id){
.inode = prev_it->first.oid.inode,
.stripe = (prev_it->first.oid.stripe & ~STRIPE_MASK),
});
if (wr_it != pg.write_queue.end())
{
continue_ops.push_back(wr_it->second);
pg.write_queue.erase(wr_it);
}
}
if ((it == pg.flush_actions.end() || !it->second.submitted) &&
erase_start != it)
{
pg.flush_actions.erase(erase_start, it);
}
if (it == pg.flush_actions.end())
{
break;
}
prev_it = it;
if (!it->second.submitted)
{
it++;
erase_start = it;
}
else
{
it++;
}
}
delete fb;
pg.flush_batch = NULL;
if (!pg.flush_actions.size())
{
pg.state = pg.state & ~PG_HAS_UNCLEAN;
report_pg_state(pg);
}
for (osd_op_t *op: continue_ops)
{
continue_primary_write(op);
}
if (pg.inflight == 0 && (pg.state & PG_STOPPING))
{
finish_stop_pg(pg);
}
}
}
void osd_t::submit_flush_op(pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data)
{
osd_op_t *op = new osd_op_t();
// Copy buffer so it gets freed along with the operation
op->buf = malloc(sizeof(obj_ver_id) * count);
memcpy(op->buf, data, sizeof(obj_ver_id) * count);
if (peer_osd == this->osd_num)
{
// local
clock_gettime(CLOCK_REALTIME, &op->tv_begin);
op->bs_op = new blockstore_op_t({
.opcode = (uint64_t)(rollback ? BS_OP_ROLLBACK : BS_OP_STABLE),
.callback = [this, op, pg_num, fb](blockstore_op_t *bs_op)
{
add_bs_subop_stats(op);
handle_flush_op(bs_op->opcode == BS_OP_ROLLBACK, pg_num, fb, this->osd_num, bs_op->retval);
delete op->bs_op;
op->bs_op = NULL;
delete op;
},
.len = (uint32_t)count,
.buf = op->buf,
});
bs->enqueue_op(op->bs_op);
}
else
{
// Peer
int peer_fd = c_cli.osd_peer_fds[peer_osd];
op->op_type = OSD_OP_OUT;
op->send_list.push_back(op->req.buf, OSD_PACKET_SIZE);
op->send_list.push_back(op->buf, count * sizeof(obj_ver_id));
op->peer_fd = peer_fd;
op->req = {
.sec_stab = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = c_cli.next_subop_id++,
.opcode = (uint64_t)(rollback ? OSD_OP_SECONDARY_ROLLBACK : OSD_OP_SECONDARY_STABILIZE),
},
.len = count * sizeof(obj_ver_id),
},
};
op->callback = [this, pg_num, fb, peer_osd](osd_op_t *op)
{
handle_flush_op(op->req.hdr.opcode == OSD_OP_SECONDARY_ROLLBACK, pg_num, fb, peer_osd, op->reply.hdr.retval);
delete op;
};
c_cli.outbox_push(op);
}
}
bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
{
for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
{
if ((pg_it->second.state & (PG_ACTIVE | PG_HAS_DEGRADED)) == (PG_ACTIVE | PG_HAS_DEGRADED))
{
for (auto obj_it = pg_it->second.degraded_objects.begin(); obj_it != pg_it->second.degraded_objects.end(); obj_it++)
{
if (recovery_ops.find(obj_it->first) == recovery_ops.end())
{
op.degraded = true;
op.pg_num = pg_it->first;
op.oid = obj_it->first;
return true;
}
}
}
}
for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
{
if ((pg_it->second.state & (PG_ACTIVE | PG_HAS_MISPLACED)) == (PG_ACTIVE | PG_HAS_MISPLACED))
{
for (auto obj_it = pg_it->second.misplaced_objects.begin(); obj_it != pg_it->second.misplaced_objects.end(); obj_it++)
{
if (recovery_ops.find(obj_it->first) == recovery_ops.end())
{
op.degraded = false;
op.pg_num = pg_it->first;
op.oid = obj_it->first;
return true;
}
}
}
}
return false;
}
void osd_t::submit_recovery_op(osd_recovery_op_t *op)
{
op->osd_op = new osd_op_t();
op->osd_op->op_type = OSD_OP_OUT;
op->osd_op->req = {
.rw = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = 1,
.opcode = OSD_OP_WRITE,
},
.inode = op->oid.inode,
.offset = op->oid.stripe,
.len = 0,
},
};
op->osd_op->callback = [this, op](osd_op_t *osd_op)
{
// Don't sync the write, it will be synced by our regular sync coroutine
if (osd_op->reply.hdr.retval < 0)
{
// Error recovering object
if (osd_op->reply.hdr.retval == -EPIPE)
{
// PG is stopped or one of the OSDs is gone, error is harmless
}
else
{
throw std::runtime_error("Failed to recover an object");
}
}
// CAREFUL! op = &recovery_ops[op->oid]. Don't access op->* after recovery_ops.erase()
op->osd_op = NULL;
recovery_ops.erase(op->oid);
delete osd_op;
continue_recovery();
};
exec_op(op->osd_op);
}
// Just trigger write requests for degraded objects. They'll be recovered during writing
bool osd_t::continue_recovery()
{
while (recovery_ops.size() < recovery_queue_depth)
{
osd_recovery_op_t op;
if (pick_next_recovery(op))
{
recovery_ops[op.oid] = op;
submit_recovery_op(&recovery_ops[op.oid]);
}
else
return false;
}
return true;
}

View File

@@ -2,17 +2,8 @@
#include <signal.h>
static osd_t *osd = NULL;
static bool force_stopping = false;
static void handle_sigint(int sig)
void handle_sigint(int sig)
{
if (osd && !force_stopping)
{
force_stopping = true;
osd->force_stop(0);
return;
}
exit(0);
}
@@ -34,11 +25,9 @@ int main(int narg, char *args[])
}
}
signal(SIGINT, handle_sigint);
signal(SIGTERM, handle_sigint);
ring_loop_t *ringloop = new ring_loop_t(512);
// FIXME: Create Blockstore from on-disk superblock config and check it against the OSD cluster config
blockstore_t *bs = new blockstore_t(config, ringloop);
osd = new osd_t(config, bs, ringloop);
osd_t *osd = new osd_t(config, bs, ringloop);
while (1)
{
ringloop->loop();

View File

@@ -22,12 +22,9 @@
#define OSD_OP_READ 10
#define OSD_OP_WRITE 11
#define OSD_OP_SYNC 12
#define OSD_OP_DELETE 13
#define OSD_OP_MAX 13
#define OSD_OP_MAX 12
// Alignment & limit for read/write operations
#ifndef MEM_ALIGNMENT
#define MEM_ALIGNMENT 512
#endif
#define OSD_RW_ALIGN 512
#define OSD_RW_MAX 64*1024*1024
// common request and reply headers
@@ -60,7 +57,6 @@ struct __attribute__((__packed__)) osd_op_secondary_rw_t
// object
object_id oid;
// read/write version (automatic or specific)
// FIXME deny values close to UINT64_MAX
uint64_t version;
// offset
uint32_t offset;
@@ -134,7 +130,7 @@ struct __attribute__((__packed__)) osd_op_secondary_list_t
osd_op_header_t header;
// placement group total number and total count
pg_num_t list_pg, pg_count;
uint64_t pg_stripe_size;
uint64_t parity_block_size;
};
struct __attribute__((__packed__)) osd_reply_secondary_list_t
@@ -146,7 +142,6 @@ struct __attribute__((__packed__)) osd_reply_secondary_list_t
};
// read or write to the primary OSD (must be within individual stripe)
// FIXME: allow to return used block bitmap (required for snapshots)
struct __attribute__((__packed__)) osd_op_rw_t
{
osd_op_header_t header;
@@ -174,7 +169,6 @@ struct __attribute__((__packed__)) osd_reply_sync_t
osd_reply_header_t header;
};
// FIXME it would be interesting to try to unify blockstore_op and osd_op formats
union osd_any_op_t
{
osd_op_header_t hdr;

View File

@@ -3,212 +3,286 @@
#include <algorithm>
#include "base64.h"
#include "osd.h"
void osd_t::init_primary()
{
// Initial test version of clustering code requires exactly 2 peers
// FIXME Hardcode
std::string peerstr = config["peers"];
while (peerstr.size())
{
int pos = peerstr.find(',');
peers.push_back(parse_peer(pos < 0 ? peerstr : peerstr.substr(0, pos)));
peerstr = pos < 0 ? std::string("") : peerstr.substr(pos+1);
for (int i = 0; i < peers.size()-1; i++)
if (peers[i].osd_num == peers[peers.size()-1].osd_num)
throw std::runtime_error("same osd number "+std::to_string(peers[i].osd_num)+" specified twice in peers");
}
if (peers.size() < 2)
throw std::runtime_error("run_primary requires at least 2 peers");
pgs.push_back((pg_t){
.state = PG_OFFLINE,
.pg_cursize = 0,
.pg_num = 1,
.target_set = { 1, 2, 3 },
.cur_set = { 1, 0, 0 },
});
pg_count = 1;
peering_state = OSD_PEERING_PEERS;
}
osd_peer_def_t osd_t::parse_peer(std::string peer)
{
// OSD_NUM:IP:PORT
int pos1 = peer.find(':');
int pos2 = peer.find(':', pos1+1);
if (pos1 < 0 || pos2 < 0)
throw new std::runtime_error("OSD peer string must be in the form OSD_NUM:IP:PORT");
osd_peer_def_t r;
r.addr = peer.substr(pos1+1, pos2-pos1-1);
std::string osd_num_str = peer.substr(0, pos1);
std::string port_str = peer.substr(pos2+1);
r.osd_num = strtoull(osd_num_str.c_str(), NULL, 10);
if (!r.osd_num)
throw new std::runtime_error("Could not parse OSD peer osd_num");
r.port = strtoull(port_str.c_str(), NULL, 10);
if (!r.port)
throw new std::runtime_error("Could not parse OSD peer port");
return r;
}
void osd_t::connect_peer(osd_num_t osd_num, const char *peer_host, int peer_port, std::function<void(osd_num_t, int)> callback)
{
struct sockaddr_in addr;
int r;
if ((r = inet_pton(AF_INET, peer_host, &addr.sin_addr)) != 1)
{
callback(osd_num, -EINVAL);
return;
}
addr.sin_family = AF_INET;
addr.sin_port = htons(peer_port ? peer_port : 11203);
int peer_fd = socket(AF_INET, SOCK_STREAM, 0);
if (peer_fd < 0)
{
callback(osd_num, -errno);
return;
}
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
r = connect(peer_fd, (sockaddr*)&addr, sizeof(addr));
if (r < 0 && errno != EINPROGRESS)
{
close(peer_fd);
callback(osd_num, -errno);
return;
}
clients[peer_fd] = (osd_client_t){
.peer_addr = addr,
.peer_port = peer_port,
.peer_fd = peer_fd,
.peer_state = PEER_CONNECTING,
.connect_callback = callback,
.osd_num = osd_num,
};
osd_peer_fds[osd_num] = peer_fd;
// Add FD to epoll (EPOLLOUT for tracking connect() result)
epoll_event ev;
ev.data.fd = peer_fd;
ev.events = EPOLLOUT | EPOLLIN | EPOLLRDHUP | EPOLLET;
if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, peer_fd, &ev) < 0)
{
throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
}
}
void osd_t::handle_connect_result(int peer_fd)
{
auto & cl = clients[peer_fd];
osd_num_t osd_num = cl.osd_num;
auto callback = cl.connect_callback;
int result = 0;
socklen_t result_len = sizeof(result);
if (getsockopt(peer_fd, SOL_SOCKET, SO_ERROR, &result, &result_len) < 0)
{
result = errno;
}
if (result != 0)
{
stop_client(peer_fd);
callback(osd_num, -result);
return;
}
int one = 1;
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
// Disable EPOLLOUT on this fd
cl.connect_callback = NULL;
cl.peer_state = PEER_CONNECTED;
epoll_event ev;
ev.data.fd = peer_fd;
ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET;
if (epoll_ctl(epoll_fd, EPOLL_CTL_MOD, peer_fd, &ev) < 0)
{
throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
}
callback(osd_num, peer_fd);
}
// Peering loop
void osd_t::handle_peers()
{
if (peering_state & OSD_PEERING_PEERS)
{
for (int i = 0; i < peers.size(); i++)
{
if (osd_peer_fds.find(peers[i].osd_num) == osd_peer_fds.end() &&
time(NULL) - peers[i].last_connect_attempt > 5) // FIXME hardcode 5
{
peers[i].last_connect_attempt = time(NULL);
connect_peer(peers[i].osd_num, peers[i].addr.c_str(), peers[i].port, [this](osd_num_t osd_num, int peer_fd)
{
// FIXME: Check peer config after connecting
if (peer_fd < 0)
{
printf("Failed to connect to peer OSD %lu: %s\n", osd_num, strerror(-peer_fd));
return;
}
printf("Connected with peer OSD %lu (fd %d)\n", clients[peer_fd].osd_num, peer_fd);
int i;
for (i = 0; i < peers.size(); i++)
{
if (osd_peer_fds.find(peers[i].osd_num) == osd_peer_fds.end())
break;
}
if (i >= peers.size())
{
// Connected to all peers
peering_state = peering_state & ~OSD_PEERING_PEERS;
}
repeer_pgs(osd_num, true);
});
}
}
}
if (peering_state & OSD_PEERING_PGS)
{
bool still = false;
for (auto & p: pgs)
bool still_doing_pgs = false;
for (int i = 0; i < pgs.size(); i++)
{
if (p.second.state == PG_PEERING)
if (pgs[i].state == PG_PEERING)
{
if (!p.second.peering_state->list_ops.size())
if (!pgs[i].peering_state->list_ops.size())
{
p.second.calc_object_states(log_level);
report_pg_state(p.second);
incomplete_objects += p.second.incomplete_objects.size();
misplaced_objects += p.second.misplaced_objects.size();
// FIXME: degraded objects may currently include misplaced, too! Report them separately?
degraded_objects += p.second.degraded_objects.size();
if ((p.second.state & (PG_ACTIVE | PG_HAS_UNCLEAN)) == (PG_ACTIVE | PG_HAS_UNCLEAN))
peering_state = peering_state | OSD_FLUSHING_PGS;
else
peering_state = peering_state | OSD_RECOVERING;
pgs[i].calc_object_states();
}
else
{
still = true;
still_doing_pgs = true;
}
}
}
if (!still)
if (!still_doing_pgs)
{
// Done all PGs
peering_state = peering_state & ~OSD_PEERING_PGS;
}
}
if ((peering_state & OSD_FLUSHING_PGS) && !readonly)
{
bool still = false;
for (auto & p: pgs)
{
if ((p.second.state & (PG_ACTIVE | PG_HAS_UNCLEAN)) == (PG_ACTIVE | PG_HAS_UNCLEAN))
{
if (!p.second.flush_batch)
{
submit_pg_flush_ops(p.first);
}
still = true;
}
}
if (!still)
{
peering_state = peering_state & ~OSD_FLUSHING_PGS | OSD_RECOVERING;
}
}
if ((peering_state & OSD_RECOVERING) && !readonly)
{
if (!continue_recovery())
{
peering_state = peering_state & ~OSD_RECOVERING;
}
}
}
void osd_t::repeer_pgs(osd_num_t peer_osd)
void osd_t::repeer_pgs(osd_num_t osd_num, bool is_connected)
{
// Re-peer affected PGs
for (auto & p: pgs)
// FIXME: We shouldn't rely just on target_set. Other OSDs may also contain PG data.
osd_num_t real_osd = (is_connected ? osd_num : 0);
for (int i = 0; i < pgs.size(); i++)
{
bool repeer = false;
if (p.second.state & (PG_PEERING | PG_ACTIVE | PG_INCOMPLETE))
for (int r = 0; r < pgs[i].target_set.size(); r++)
{
for (osd_num_t pg_osd: p.second.all_peers)
if (pgs[i].target_set[r] == osd_num &&
pgs[i].cur_set[r] != real_osd)
{
if (pg_osd == peer_osd)
{
repeer = true;
break;
}
}
if (repeer)
{
// Repeer this pg
printf("[PG %u] Repeer because of OSD %lu\n", p.second.pg_num, peer_osd);
start_pg_peering(p.second.pg_num);
pgs[i].cur_set[r] = real_osd;
repeer = true;
break;
}
}
if (repeer)
{
// Repeer this pg
printf("Repeer PG %d because of OSD %lu\n", i, osd_num);
start_pg_peering(i);
peering_state |= OSD_PEERING_PGS;
}
}
}
// Repeer on each connect/disconnect peer event
void osd_t::start_pg_peering(pg_num_t pg_num)
void osd_t::start_pg_peering(int pg_idx)
{
auto & pg = pgs[pg_num];
auto & pg = pgs[pg_idx];
pg.state = PG_PEERING;
this->peering_state |= OSD_PEERING_PGS;
report_pg_state(pg);
// Reset PG state
pg.cur_peers.clear();
pg.state_dict.clear();
incomplete_objects -= pg.incomplete_objects.size();
misplaced_objects -= pg.misplaced_objects.size();
degraded_objects -= pg.degraded_objects.size();
pg.incomplete_objects.clear();
pg.misplaced_objects.clear();
pg.degraded_objects.clear();
pg.flush_actions.clear();
pg.obj_states.clear();
pg.ver_override.clear();
if (pg.flush_batch)
{
delete pg.flush_batch;
}
pg.flush_batch = NULL;
for (auto p: pg.write_queue)
{
cancel_primary_write(p.second);
}
pg.write_queue.clear();
for (auto it = unstable_writes.begin(); it != unstable_writes.end(); )
{
// Forget this PG's unstable writes
pg_num_t n = (it->first.oid.inode + it->first.oid.stripe / pg_stripe_size) % pg_count + 1;
if (n == pg.pg_num)
unstable_writes.erase(it++);
else
it++;
}
dirty_pgs.erase(pg.pg_num);
// Calculate current write OSD set
pg.pg_cursize = 0;
pg.cur_set.resize(pg.target_set.size());
pg.cur_loc_set.clear();
for (int role = 0; role < pg.target_set.size(); role++)
for (int role = 0; role < pg.cur_set.size(); role++)
{
pg.cur_set[role] = pg.target_set[role] == this->osd_num ||
c_cli.osd_peer_fds.find(pg.target_set[role]) != c_cli.osd_peer_fds.end() ? pg.target_set[role] : 0;
if (pg.cur_set[role] != 0)
{
pg.pg_cursize++;
pg.cur_loc_set.push_back({
.role = (uint64_t)role,
.osd_num = pg.cur_set[role],
.outdated = false,
});
}
}
if (pg.target_history.size())
{
// Refuse to start PG if no peers are available from any of the historical OSD sets
// (PG history is kept up to the latest active+clean state)
for (auto & history_set: pg.target_history)
{
bool found = false;
for (auto history_osd: history_set)
{
if (history_osd != 0 && c_cli.osd_peer_fds.find(history_osd) != c_cli.osd_peer_fds.end())
{
found = true;
break;
}
}
if (!found)
{
pg.state = PG_INCOMPLETE;
report_pg_state(pg);
}
}
}
if (pg.pg_cursize < pg.pg_minsize)
{
pg.state = PG_INCOMPLETE;
report_pg_state(pg);
}
std::set<osd_num_t> cur_peers;
for (auto pg_osd: pg.all_peers)
{
if (pg_osd == this->osd_num || c_cli.osd_peer_fds.find(pg_osd) != c_cli.osd_peer_fds.end())
{
cur_peers.insert(pg_osd);
}
else if (c_cli.wanted_peers.find(pg_osd) == c_cli.wanted_peers.end())
{
c_cli.connect_peer(pg_osd, st_cli.peer_states[pg_osd]);
}
}
pg.cur_peers.insert(pg.cur_peers.begin(), cur_peers.begin(), cur_peers.end());
if (pg.peering_state)
{
// Adjust the peering operation that's still in progress - discard unneeded results
for (auto it = pg.peering_state->list_ops.begin(); it != pg.peering_state->list_ops.end();)
// Adjust the peering operation that's still in progress
for (auto it = pg.peering_state->list_ops.begin(); it != pg.peering_state->list_ops.end(); it++)
{
if (pg.state == PG_INCOMPLETE || cur_peers.find(it->first) == cur_peers.end())
int role;
for (role = 0; role < pg.cur_set.size(); role++)
{
if (pg.cur_set[role] == it->first)
break;
}
if (pg.state == PG_INCOMPLETE || role >= pg.cur_set.size())
{
// Discard the result after completion, which, chances are, will be unsuccessful
discard_list_subop(it->second);
auto list_op = it->second;
if (list_op->peer_fd == 0)
{
// Self
list_op->bs_op->callback = [list_op](blockstore_op_t *bs_op)
{
if (list_op->bs_op->buf)
free(list_op->bs_op->buf);
delete list_op;
};
}
else
{
// Peer
list_op->callback = [](osd_op_t *list_op)
{
delete list_op;
};
}
pg.peering_state->list_ops.erase(it);
it = pg.peering_state->list_ops.begin();
}
else
it++;
}
for (auto it = pg.peering_state->list_results.begin(); it != pg.peering_state->list_results.end();)
for (auto it = pg.peering_state->list_results.begin(); it != pg.peering_state->list_results.end(); it++)
{
if (pg.state == PG_INCOMPLETE || cur_peers.find(it->first) == cur_peers.end())
int role;
for (role = 0; role < pg.cur_set.size(); role++)
{
if (pg.cur_set[role] == it->first)
break;
}
if (pg.state == PG_INCOMPLETE || role >= pg.cur_set.size())
{
if (it->second.buf)
{
@@ -217,8 +291,6 @@ void osd_t::start_pg_peering(pg_num_t pg_num)
pg.peering_state->list_results.erase(it);
it = pg.peering_state->list_results.begin();
}
else
it++;
}
}
if (pg.state == PG_INCOMPLETE)
@@ -228,300 +300,107 @@ void osd_t::start_pg_peering(pg_num_t pg_num)
delete pg.peering_state;
pg.peering_state = NULL;
}
printf("PG %d is incomplete\n", pg.pg_num);
return;
}
if (!pg.peering_state)
{
pg.peering_state = new pg_peering_state_t();
pg.peering_state->pg_num = pg.pg_num;
}
for (osd_num_t peer_osd: cur_peers)
auto ps = pg.peering_state;
for (int role = 0; role < pg.cur_set.size(); role++)
{
if (pg.peering_state->list_ops.find(peer_osd) != pg.peering_state->list_ops.end() ||
pg.peering_state->list_results.find(peer_osd) != pg.peering_state->list_results.end())
osd_num_t role_osd = pg.cur_set[role];
if (!role_osd)
{
continue;
}
submit_sync_and_list_subop(peer_osd, pg.peering_state);
if (ps->list_ops.find(role_osd) != ps->list_ops.end() ||
ps->list_results.find(role_osd) != ps->list_results.end())
{
continue;
}
if (role_osd == this->osd_num)
{
// Self
osd_op_t *op = new osd_op_t();
op->op_type = 0;
op->peer_fd = 0;
op->bs_op = new blockstore_op_t();
op->bs_op->opcode = BS_OP_LIST;
op->bs_op->oid.stripe = parity_block_size;
op->bs_op->len = pg_count,
op->bs_op->offset = pg.pg_num-1,
op->bs_op->callback = [ps, op, role_osd](blockstore_op_t *bs_op)
{
if (op->bs_op->retval < 0)
{
throw std::runtime_error("local OP_LIST failed");
}
printf(
"Got object list from OSD %lu (local): %d object versions (%lu of them stable)\n",
role_osd, bs_op->retval, bs_op->version
);
ps->list_results[role_osd] = {
.buf = (obj_ver_id*)op->bs_op->buf,
.total_count = (uint64_t)op->bs_op->retval,
.stable_count = op->bs_op->version,
};
ps->list_done++;
ps->list_ops.erase(role_osd);
delete op;
};
bs->enqueue_op(op->bs_op);
ps->list_ops[role_osd] = op;
}
else
{
// Peer
auto & cl = clients[osd_peer_fds[role_osd]];
osd_op_t *op = new osd_op_t();
op->op_type = OSD_OP_OUT;
op->send_list.push_back(op->req.buf, OSD_PACKET_SIZE);
op->peer_fd = cl.peer_fd;
op->req = {
.sec_list = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = this->next_subop_id++,
.opcode = OSD_OP_SECONDARY_LIST,
},
.list_pg = pg.pg_num,
.pg_count = pg_count,
.parity_block_size = parity_block_size,
},
};
op->callback = [this, ps, role_osd](osd_op_t *op)
{
if (op->reply.hdr.retval < 0)
{
printf("Failed to get object list from OSD %lu (retval=%ld), disconnecting peer\n", role_osd, op->reply.hdr.retval);
ps->list_ops.erase(role_osd);
stop_client(op->peer_fd);
delete op;
return;
}
printf(
"Got object list from OSD %lu: %ld object versions (%lu of them stable)\n",
role_osd, op->reply.hdr.retval, op->reply.sec_list.stable_count
);
ps->list_results[role_osd] = {
.buf = (obj_ver_id*)op->buf,
.total_count = (uint64_t)op->reply.hdr.retval,
.stable_count = op->reply.sec_list.stable_count,
};
// set op->buf to NULL so it doesn't get freed
op->buf = NULL;
ps->list_done++;
ps->list_ops.erase(role_osd);
delete op;
};
outbox_push(cl, op);
ps->list_ops[role_osd] = op;
}
}
ringloop->wakeup();
}
void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
{
// Sync before listing, if not readonly
if (readonly)
{
submit_list_subop(role_osd, ps);
}
else if (role_osd == this->osd_num)
{
// Self
osd_op_t *op = new osd_op_t();
op->op_type = 0;
op->peer_fd = 0;
clock_gettime(CLOCK_REALTIME, &op->tv_begin);
op->bs_op = new blockstore_op_t();
op->bs_op->opcode = BS_OP_SYNC;
op->bs_op->callback = [this, ps, op, role_osd](blockstore_op_t *bs_op)
{
if (bs_op->retval < 0)
{
printf("Local OP_SYNC failed: %d (%s)\n", bs_op->retval, strerror(-bs_op->retval));
force_stop(1);
return;
}
add_bs_subop_stats(op);
delete op->bs_op;
op->bs_op = NULL;
delete op;
ps->list_ops.erase(role_osd);
submit_list_subop(role_osd, ps);
};
bs->enqueue_op(op->bs_op);
ps->list_ops[role_osd] = op;
}
else
{
// Peer
auto & cl = c_cli.clients.at(c_cli.osd_peer_fds[role_osd]);
osd_op_t *op = new osd_op_t();
op->op_type = OSD_OP_OUT;
op->send_list.push_back(op->req.buf, OSD_PACKET_SIZE);
op->peer_fd = cl.peer_fd;
op->req = {
.sec_sync = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = c_cli.next_subop_id++,
.opcode = OSD_OP_SECONDARY_SYNC,
},
},
};
op->callback = [this, ps, role_osd](osd_op_t *op)
{
if (op->reply.hdr.retval < 0)
{
// FIXME: Mark peer as failed and don't reconnect immediately after dropping the connection
printf("Failed to sync OSD %lu: %ld (%s), disconnecting peer\n", role_osd, op->reply.hdr.retval, strerror(-op->reply.hdr.retval));
ps->list_ops.erase(role_osd);
c_cli.stop_client(op->peer_fd);
delete op;
return;
}
delete op;
ps->list_ops.erase(role_osd);
submit_list_subop(role_osd, ps);
};
c_cli.outbox_push(op);
ps->list_ops[role_osd] = op;
}
}
void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
{
if (role_osd == this->osd_num)
{
// Self
osd_op_t *op = new osd_op_t();
op->op_type = 0;
op->peer_fd = 0;
clock_gettime(CLOCK_REALTIME, &op->tv_begin);
op->bs_op = new blockstore_op_t();
op->bs_op->opcode = BS_OP_LIST;
op->bs_op->oid.stripe = pg_stripe_size;
op->bs_op->len = pg_count;
op->bs_op->offset = ps->pg_num-1;
op->bs_op->callback = [this, ps, op, role_osd](blockstore_op_t *bs_op)
{
if (op->bs_op->retval < 0)
{
throw std::runtime_error("local OP_LIST failed");
}
add_bs_subop_stats(op);
printf(
"[PG %u] Got object list from OSD %lu (local): %d object versions (%lu of them stable)\n",
ps->pg_num, role_osd, bs_op->retval, bs_op->version
);
ps->list_results[role_osd] = {
.buf = (obj_ver_id*)op->bs_op->buf,
.total_count = (uint64_t)op->bs_op->retval,
.stable_count = op->bs_op->version,
};
ps->list_ops.erase(role_osd);
delete op->bs_op;
op->bs_op = NULL;
delete op;
};
bs->enqueue_op(op->bs_op);
ps->list_ops[role_osd] = op;
}
else
{
// Peer
osd_op_t *op = new osd_op_t();
op->op_type = OSD_OP_OUT;
op->send_list.push_back(op->req.buf, OSD_PACKET_SIZE);
op->peer_fd = c_cli.osd_peer_fds[role_osd];
op->req = {
.sec_list = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = c_cli.next_subop_id++,
.opcode = OSD_OP_SECONDARY_LIST,
},
.list_pg = ps->pg_num,
.pg_count = pg_count,
.pg_stripe_size = pg_stripe_size,
},
};
op->callback = [this, ps, role_osd](osd_op_t *op)
{
if (op->reply.hdr.retval < 0)
{
printf("Failed to get object list from OSD %lu (retval=%ld), disconnecting peer\n", role_osd, op->reply.hdr.retval);
ps->list_ops.erase(role_osd);
c_cli.stop_client(op->peer_fd);
delete op;
return;
}
printf(
"[PG %u] Got object list from OSD %lu: %ld object versions (%lu of them stable)\n",
ps->pg_num, role_osd, op->reply.hdr.retval, op->reply.sec_list.stable_count
);
ps->list_results[role_osd] = {
.buf = (obj_ver_id*)op->buf,
.total_count = (uint64_t)op->reply.hdr.retval,
.stable_count = op->reply.sec_list.stable_count,
};
// set op->buf to NULL so it doesn't get freed
op->buf = NULL;
ps->list_ops.erase(role_osd);
delete op;
};
c_cli.outbox_push(op);
ps->list_ops[role_osd] = op;
}
}
void osd_t::discard_list_subop(osd_op_t *list_op)
{
if (list_op->peer_fd == 0)
{
// Self
list_op->bs_op->callback = [list_op](blockstore_op_t *bs_op)
{
if (list_op->bs_op->buf)
free(list_op->bs_op->buf);
delete list_op->bs_op;
list_op->bs_op = NULL;
delete list_op;
};
}
else
{
// Peer
list_op->callback = [](osd_op_t *list_op)
{
delete list_op;
};
}
}
bool osd_t::stop_pg(pg_num_t pg_num)
{
auto pg_it = pgs.find(pg_num);
if (pg_it == pgs.end())
{
return false;
}
auto & pg = pg_it->second;
if (pg.peering_state)
{
// Stop peering
for (auto it = pg.peering_state->list_ops.begin(); it != pg.peering_state->list_ops.end();)
{
discard_list_subop(it->second);
}
for (auto it = pg.peering_state->list_results.begin(); it != pg.peering_state->list_results.end();)
{
if (it->second.buf)
{
free(it->second.buf);
}
}
delete pg.peering_state;
pg.peering_state = NULL;
}
if (!(pg.state & PG_ACTIVE))
{
return false;
}
pg.state = pg.state & ~PG_ACTIVE | PG_STOPPING;
if (pg.inflight == 0 && !pg.flush_batch)
{
finish_stop_pg(pg);
}
else
{
report_pg_state(pg);
}
return true;
}
void osd_t::finish_stop_pg(pg_t & pg)
{
pg.state = PG_OFFLINE;
report_pg_state(pg);
}
void osd_t::report_pg_state(pg_t & pg)
{
pg.print_state();
this->pg_state_dirty.insert(pg.pg_num);
if (pg.state == PG_ACTIVE && (pg.target_history.size() > 0 || pg.all_peers.size() > pg.target_set.size()))
{
// Clear history of active+clean PGs
pg.history_changed = true;
pg.target_history.clear();
pg.all_peers = pg.target_set;
pg.cur_peers = pg.target_set;
}
else if (pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD))
{
// Clear history of active+left_on_dead PGs, but leave dead OSDs in all_peers
pg.history_changed = true;
pg.target_history.clear();
std::set<osd_num_t> dead_peers;
for (auto pg_osd: pg.all_peers)
{
dead_peers.insert(pg_osd);
}
for (auto pg_osd: pg.cur_peers)
{
dead_peers.erase(pg_osd);
}
for (auto pg_osd: pg.target_set)
{
if (pg_osd)
{
dead_peers.insert(pg_osd);
}
}
pg.all_peers.clear();
pg.all_peers.insert(pg.all_peers.begin(), dead_peers.begin(), dead_peers.end());
pg.cur_peers.clear();
for (auto pg_osd: pg.target_set)
{
if (pg_osd)
{
pg.cur_peers.push_back(pg_osd);
}
}
}
if (pg.state == PG_OFFLINE && !this->pg_config_applied)
{
apply_pg_config();
}
report_pg_states();
}

View File

@@ -1,360 +1,159 @@
#include "osd_peering_pg.h"
struct obj_ver_role
void pg_t::remember_object(pg_obj_state_check_t &st, std::vector<obj_ver_role> &all)
{
object_id oid;
uint64_t version;
uint64_t osd_num;
bool is_stable;
};
inline bool operator < (const obj_ver_role & a, const obj_ver_role & b)
{
// ORDER BY inode ASC, stripe & ~STRIPE_MASK ASC, version DESC, role ASC, osd_num ASC
return a.oid.inode < b.oid.inode || a.oid.inode == b.oid.inode && (
(a.oid.stripe & ~STRIPE_MASK) < (b.oid.stripe & ~STRIPE_MASK) ||
(a.oid.stripe & ~STRIPE_MASK) == (b.oid.stripe & ~STRIPE_MASK) && (
a.version > b.version ||
a.version == b.version && (
a.oid.stripe < b.oid.stripe ||
a.oid.stripe == b.oid.stripe && a.osd_num < b.osd_num
)
)
);
}
struct obj_piece_ver_t
{
uint64_t max_ver = 0;
uint64_t stable_ver = 0;
uint64_t max_target = 0;
};
struct pg_obj_state_check_t
{
pg_t *pg;
std::vector<obj_ver_role> list;
int list_pos;
int obj_start = 0, obj_end = 0, ver_start = 0, ver_end = 0;
object_id oid = { 0 };
uint64_t max_ver = 0;
uint64_t last_ver = 0;
uint64_t target_ver = 0;
uint64_t n_copies = 0, has_roles = 0, n_roles = 0, n_stable = 0, n_mismatched = 0;
uint64_t n_unstable = 0, n_buggy = 0;
pg_osd_set_t osd_set;
int log_level;
void walk();
void start_object();
void handle_version();
void finish_object();
};
void pg_obj_state_check_t::walk()
{
pg->clean_count = 0;
pg->total_count = 0;
pg->state = 0;
for (list_pos = 0; list_pos < list.size(); list_pos++)
{
if (oid.inode != list[list_pos].oid.inode ||
oid.stripe != (list[list_pos].oid.stripe & ~STRIPE_MASK))
{
if (oid.inode != 0)
{
finish_object();
}
start_object();
}
handle_version();
}
if (oid.inode != 0)
{
finish_object();
}
if (pg->pg_cursize < pg->pg_size)
{
pg->state |= PG_DEGRADED;
}
pg->state |= PG_ACTIVE;
if (pg->state == PG_ACTIVE && pg->cur_peers.size() < pg->all_peers.size())
{
pg->state |= PG_LEFT_ON_DEAD;
}
}
void pg_obj_state_check_t::start_object()
{
obj_start = list_pos;
oid = { .inode = list[list_pos].oid.inode, .stripe = list[list_pos].oid.stripe & ~STRIPE_MASK };
last_ver = max_ver = list[list_pos].version;
target_ver = 0;
ver_start = list_pos;
has_roles = n_copies = n_roles = n_stable = n_mismatched = 0;
n_unstable = n_buggy = 0;
}
void pg_obj_state_check_t::handle_version()
{
if (!target_ver && last_ver != list[list_pos].version && (n_stable > 0 || n_roles >= pg->pg_minsize))
{
// Version is either stable or recoverable
target_ver = last_ver;
ver_end = list_pos;
}
if (!target_ver)
{
if (last_ver != list[list_pos].version)
{
ver_start = list_pos;
has_roles = n_copies = n_roles = n_stable = n_mismatched = 0;
last_ver = list[list_pos].version;
}
int replica = (list[list_pos].oid.stripe & STRIPE_MASK);
n_copies++;
if (replica >= pg->pg_size)
{
n_buggy++;
}
else
{
if (list[list_pos].is_stable)
{
n_stable++;
}
if (pg->cur_set[replica] != list[list_pos].osd_num)
{
n_mismatched++;
}
if (!(has_roles & (1 << replica)))
{
has_roles = has_roles | (1 << replica);
n_roles++;
}
}
}
if (!list[list_pos].is_stable)
{
n_unstable++;
}
}
void pg_obj_state_check_t::finish_object()
{
if (!target_ver && (n_stable > 0 || n_roles >= pg->pg_minsize))
{
// Version is either stable or recoverable
target_ver = last_ver;
ver_end = list_pos;
}
obj_end = list_pos;
auto & pg = *this;
// Remember the decision
uint64_t state = 0;
if (n_buggy > 0)
if (st.n_roles == pg.pg_cursize)
{
state = OBJ_BUGGY;
// FIXME: bring pg offline
throw std::runtime_error("buggy object state");
}
if (n_unstable > 0)
{
pg->state |= PG_HAS_UNCLEAN;
std::unordered_map<obj_piece_id_t, obj_piece_ver_t> pieces;
for (int i = obj_start; i < obj_end; i++)
{
auto & pcs = pieces[(obj_piece_id_t){ .oid = list[i].oid, .osd_num = list[i].osd_num }];
if (!pcs.max_ver)
{
pcs.max_ver = list[i].version;
}
if (list[i].is_stable && !pcs.stable_ver)
{
pcs.stable_ver = list[i].version;
}
if (list[i].version <= target_ver && !pcs.max_target)
{
pcs.max_target = list[i].version;
}
}
for (auto pp: pieces)
{
auto & pcs = pp.second;
if (pcs.stable_ver < pcs.max_ver)
{
auto & act = pg->flush_actions[pp.first];
// osd_set doesn't include rollback/stable states, so don't include them in the state code either
if (pcs.max_ver > target_ver)
{
act.rollback = true;
act.rollback_to = pcs.max_target;
}
if (pcs.stable_ver < (pcs.max_ver > target_ver ? pcs.max_target : pcs.max_ver))
{
act.make_stable = true;
act.stable_to = pcs.max_ver > target_ver ? pcs.max_target : pcs.max_ver;
}
}
}
}
if (!target_ver)
{
return;
}
if (n_roles < pg->pg_minsize)
{
if (log_level > 1)
{
printf("Object is incomplete: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
}
state = OBJ_INCOMPLETE;
pg->state = pg->state | PG_HAS_INCOMPLETE;
}
else if (n_roles < pg->pg_cursize)
{
if (log_level > 1)
{
printf("Object is degraded: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
}
state = OBJ_DEGRADED;
pg->state = pg->state | PG_HAS_DEGRADED;
}
if (n_mismatched > 0)
{
if (n_roles >= pg->pg_cursize && log_level > 1)
{
printf("Object is misplaced: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
}
state |= OBJ_MISPLACED;
pg->state = pg->state | PG_HAS_MISPLACED;
}
if (log_level > 1 && (n_roles < pg->pg_cursize || n_mismatched > 0))
{
if (log_level > 2)
{
for (int i = obj_start; i < obj_end; i++)
{
printf("v%lu present on: osd %lu, role %ld%s\n", list[i].version, list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
}
}
if (st.n_matched == pg.pg_cursize)
state = OBJ_CLEAN;
else
{
for (int i = ver_start; i < ver_end; i++)
{
printf("Target version present on: osd %lu, role %ld%s\n", list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
}
state = OBJ_MISPLACED;
pg.state = pg.state | PG_HAS_MISPLACED;
}
}
pg->total_count++;
if (state != 0 || ver_end < obj_end)
else if (st.n_roles < pg.pg_minsize)
{
osd_set.clear();
for (int i = ver_start; i < ver_end; i++)
{
osd_set.push_back((pg_obj_loc_t){
.role = (list[i].oid.stripe & STRIPE_MASK),
.osd_num = list[i].osd_num,
.outdated = false,
});
}
}
if (ver_end < obj_end)
{
// Check for outdated versions not present in the current target OSD set
for (int i = ver_end; i < obj_end; i++)
{
int j;
for (j = 0; j < osd_set.size(); j++)
{
if (osd_set[j].osd_num == list[i].osd_num)
{
break;
}
}
if (j >= osd_set.size() && pg->cur_set[list[i].oid.stripe & STRIPE_MASK] != list[i].osd_num)
{
osd_set.push_back((pg_obj_loc_t){
.role = (list[i].oid.stripe & STRIPE_MASK),
.osd_num = list[i].osd_num,
.outdated = true,
});
state |= OBJ_MISPLACED;
pg->state = pg->state | PG_HAS_MISPLACED;
}
}
}
if (target_ver < max_ver)
{
pg->ver_override[oid] = target_ver;
}
if (state == 0)
{
pg->clean_count++;
printf("Object is unfound: inode=%lu stripe=%lu version=%lu/%lu\n", st.oid.inode, st.oid.stripe, st.target_ver, st.max_ver);
state = OBJ_INCOMPLETE;
pg.state = pg.state | PG_HAS_UNFOUND;
}
else
{
auto it = pg->state_dict.find(osd_set);
if (it == pg->state_dict.end())
printf("Object is degraded: inode=%lu stripe=%lu version=%lu/%lu\n", st.oid.inode, st.oid.stripe, st.target_ver, st.max_ver);
state = OBJ_DEGRADED;
pg.state = pg.state | PG_HAS_DEGRADED;
}
if (st.n_copies > pg.pg_size)
{
state |= OBJ_OVERCOPIED;
pg.state = pg.state | PG_HAS_UNCLEAN;
}
if (st.n_stable < st.n_copies)
{
state |= OBJ_NEEDS_STABLE;
pg.state = pg.state | PG_HAS_UNCLEAN;
}
if (st.target_ver < st.max_ver || st.has_old_unstable)
{
state |= OBJ_NEEDS_ROLLBACK;
pg.state = pg.state | PG_HAS_UNCLEAN;
pg.ver_override[st.oid] = st.target_ver;
}
if (st.is_buggy)
{
state |= OBJ_BUGGY;
// FIXME: bring pg offline
throw std::runtime_error("buggy object state");
}
if (state != OBJ_CLEAN)
{
st.osd_set.clear();
for (int i = st.ver_start; i < st.ver_end; i++)
{
st.osd_set.push_back((pg_obj_loc_t){
.role = (all[i].oid.stripe & STRIPE_MASK),
.osd_num = all[i].osd_num,
.stable = all[i].is_stable,
});
}
std::sort(st.osd_set.begin(), st.osd_set.end());
auto it = pg.state_dict.find(st.osd_set);
if (it == pg.state_dict.end())
{
std::vector<uint64_t> read_target;
read_target.resize(pg->pg_size);
for (int i = 0; i < pg->pg_size; i++)
read_target.resize(pg.pg_size);
for (int i = 0; i < pg.pg_size; i++)
{
read_target[i] = 0;
}
for (auto & o: osd_set)
for (auto & o: st.osd_set)
{
if (!o.outdated)
{
read_target[o.role] = o.osd_num;
}
read_target[o.role] = o.osd_num;
}
pg->state_dict[osd_set] = {
pg.state_dict[st.osd_set] = {
.read_target = read_target,
.osd_set = osd_set,
.osd_set = st.osd_set,
.state = state,
.object_count = 1,
};
it = pg->state_dict.find(osd_set);
it = pg.state_dict.find(st.osd_set);
}
else
{
it->second.object_count++;
}
if (state & OBJ_INCOMPLETE)
pg.obj_states[st.oid] = &it->second;
if (st.target_ver < st.max_ver)
{
pg->incomplete_objects[oid] = &it->second;
pg.ver_override[st.oid] = st.target_ver;
}
else if (state & OBJ_DEGRADED)
if (state & (OBJ_NEEDS_ROLLBACK | OBJ_NEEDS_STABLE))
{
pg->degraded_objects[oid] = &it->second;
}
else
{
pg->misplaced_objects[oid] = &it->second;
spp::sparse_hash_map<obj_piece_id_t, obj_piece_ver_t> pieces;
for (int i = st.obj_start; i < st.obj_end; i++)
{
auto & pcs = pieces[(obj_piece_id_t){ .oid = all[i].oid, .osd_num = all[i].osd_num }];
if (!pcs.max_ver)
{
pcs.max_ver = all[i].version;
}
if (all[i].is_stable && !pcs.stable_ver)
{
pcs.stable_ver = all[i].version;
}
}
for (auto pp: pieces)
{
auto & pcs = pp.second;
if (pcs.stable_ver < pcs.max_ver)
{
auto & act = obj_stab_actions[pp.first];
if (pcs.max_ver > st.target_ver)
{
act.rollback = true;
act.rollback_to = st.target_ver;
}
else if (pcs.max_ver < st.target_ver && pcs.stable_ver < pcs.max_ver)
{
act.rollback = true;
act.rollback_to = pcs.stable_ver;
}
if (pcs.max_ver >= st.target_ver && pcs.stable_ver < st.target_ver)
{
act.make_stable = true;
act.stable_to = st.target_ver;
}
}
}
}
}
else
pg.clean_count++;
pg.total_count++;
}
// FIXME: Write at least some tests for this function
void pg_t::calc_object_states(int log_level)
void pg_t::calc_object_states()
{
auto & pg = *this;
// Copy all object lists into one array
pg_obj_state_check_t st;
st.log_level = log_level;
st.pg = this;
auto ps = peering_state;
std::vector<obj_ver_role> all;
auto ps = pg.peering_state;
for (auto it: ps->list_results)
{
auto nstab = it.second.stable_count;
auto n = it.second.total_count;
auto osd_num = it.first;
uint64_t start = st.list.size();
st.list.resize(start + n);
uint64_t start = all.size();
all.resize(start + n);
obj_ver_id *ov = it.second.buf;
for (uint64_t i = 0; i < n; i++, ov++)
{
st.list[start+i] = {
all[start+i] = {
.oid = ov->oid,
.version = ov->version,
.osd_num = osd_num,
@@ -366,26 +165,101 @@ void pg_t::calc_object_states(int log_level)
}
ps->list_results.clear();
// Sort
std::sort(st.list.begin(), st.list.end());
std::sort(all.begin(), all.end());
// Walk over it and check object states
st.walk();
}
void pg_t::print_state()
{
pg.clean_count = 0;
pg.total_count = 0;
pg.state = 0;
int replica = 0;
pg_obj_state_check_t st;
for (int i = 0; i < all.size(); i++)
{
if (st.oid.inode != all[i].oid.inode ||
st.oid.stripe != (all[i].oid.stripe & ~STRIPE_MASK))
{
if (st.oid.inode != 0)
{
// Remember object state
st.obj_end = st.ver_end = i;
remember_object(st, all);
}
st.obj_start = st.ver_start = i;
st.oid = { .inode = all[i].oid.inode, .stripe = all[i].oid.stripe & ~STRIPE_MASK };
st.max_ver = st.target_ver = all[i].version;
st.has_roles = st.n_copies = st.n_roles = st.n_stable = st.n_matched = 0;
st.is_buggy = st.has_old_unstable = false;
}
else if (st.target_ver != all[i].version)
{
if (st.n_stable > 0 || st.n_roles >= pg.pg_minsize)
{
// Last processed version is either recoverable or stable, choose it as target and skip previous versions
st.ver_end = i;
i++;
while (i < all.size() && st.oid.inode == all[i].oid.inode &&
st.oid.stripe == (all[i].oid.stripe & ~STRIPE_MASK))
{
if (!all[i].is_stable)
{
st.has_old_unstable = true;
}
i++;
}
st.obj_end = i;
i--;
continue;
}
else
{
// Last processed version is unstable and unrecoverable
// We'll know that because target_ver < max_ver
st.ver_start = i;
st.target_ver = all[i].version;
st.has_roles = st.n_copies = st.n_roles = st.n_stable = st.n_matched = 0;
}
}
replica = (all[i].oid.stripe & STRIPE_MASK);
st.n_copies++;
if (replica >= pg.pg_size)
{
// FIXME In the future, check it against the PG epoch number to handle replication factor/scheme changes
st.is_buggy = true;
}
else
{
if (all[i].is_stable)
{
st.n_stable++;
}
if (pg.cur_set[replica] == all[i].osd_num)
{
st.n_matched++;
}
if (!(st.has_roles & (1 << replica)))
{
st.has_roles = st.has_roles | (1 << replica);
st.n_roles++;
}
}
}
if (st.oid.inode != 0)
{
// Remember object state
st.obj_end = st.ver_end = all.size();
remember_object(st, all);
}
if (pg.pg_cursize < pg.pg_size)
{
pg.state = pg.state | PG_DEGRADED;
}
printf(
"[PG %u] is %s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pg_num,
(state & PG_STARTING) ? "starting" : "",
(state & PG_OFFLINE) ? "offline" : "",
(state & PG_PEERING) ? "peering" : "",
(state & PG_INCOMPLETE) ? "incomplete" : "",
(state & PG_ACTIVE) ? "active" : "",
(state & PG_STOPPING) ? "stopping" : "",
(state & PG_DEGRADED) ? " + degraded" : "",
(state & PG_HAS_INCOMPLETE) ? " + has_incomplete" : "",
(state & PG_HAS_DEGRADED) ? " + has_degraded" : "",
(state & PG_HAS_MISPLACED) ? " + has_misplaced" : "",
(state & PG_HAS_UNCLEAN) ? " + has_unclean" : "",
total_count
"PG %u is active%s%s%s%s%s (%lu objects)\n", pg.pg_num,
(pg.state & PG_DEGRADED) ? " + degraded" : "",
(pg.state & PG_HAS_UNFOUND) ? " + has_unfound" : "",
(pg.state & PG_HAS_DEGRADED) ? " + has_degraded" : "",
(pg.state & PG_HAS_MISPLACED) ? " + has_misplaced" : "",
(pg.state & PG_HAS_UNCLEAN) ? " + has_unclean" : "",
pg.total_count
);
pg.state = pg.state | PG_ACTIVE;
}

View File

@@ -1,19 +1,43 @@
#include <map>
#include <unordered_map>
#include <vector>
#include <algorithm>
#include "cpp-btree/btree_map.h"
#include "object_id.h"
#include "osd_ops.h"
#include "pg_states.h"
#include "sparsepp/sparsepp/spp.h"
// Placement group states
// Exactly one of these:
#define PG_OFFLINE (1<<0)
#define PG_PEERING (1<<1)
#define PG_INCOMPLETE (1<<2)
#define PG_ACTIVE (1<<3)
// Plus any of these:
#define PG_DEGRADED (1<<4)
#define PG_HAS_UNFOUND (1<<5)
#define PG_HAS_DEGRADED (1<<6)
#define PG_HAS_MISPLACED (1<<7)
#define PG_HAS_UNCLEAN (1<<8)
// FIXME: Safe default that doesn't depend on parity_block_size of pg_parity_size
#define STRIPE_MASK ((uint64_t)4096 - 1)
// OSD object states
#define OBJ_CLEAN 0x01
#define OBJ_MISPLACED 0x02
#define OBJ_DEGRADED 0x03
#define OBJ_INCOMPLETE 0x04
#define OBJ_NEEDS_STABLE 0x10000
#define OBJ_NEEDS_ROLLBACK 0x20000
#define OBJ_OVERCOPIED 0x40000
#define OBJ_BUGGY 0x80000
struct pg_obj_loc_t
{
uint64_t role;
osd_num_t osd_num;
bool outdated;
bool stable;
};
typedef std::vector<pg_obj_loc_t> pg_osd_set_t;
@@ -40,9 +64,28 @@ struct osd_op_t;
struct pg_peering_state_t
{
// osd_num -> list result
std::unordered_map<osd_num_t, osd_op_t*> list_ops;
std::unordered_map<osd_num_t, pg_list_result_t> list_results;
pg_num_t pg_num = 0;
spp::sparse_hash_map<osd_num_t, osd_op_t*> list_ops;
spp::sparse_hash_map<osd_num_t, pg_list_result_t> list_results;
int list_done = 0;
};
struct pg_obj_state_check_t
{
int obj_start = 0, obj_end = 0, ver_start = 0, ver_end = 0;
object_id oid = { 0 };
uint64_t max_ver = 0;
uint64_t target_ver = 0;
uint64_t n_copies = 0, has_roles = 0, n_roles = 0, n_stable = 0, n_matched = 0;
bool is_buggy = false, has_old_unstable = false;
pg_osd_set_t osd_set;
};
struct obj_ver_role
{
object_id oid;
uint64_t version;
uint64_t osd_num;
bool is_stable;
};
struct obj_piece_id_t
@@ -51,63 +94,60 @@ struct obj_piece_id_t
uint64_t osd_num;
};
struct flush_action_t
struct obj_piece_ver_t
{
uint64_t max_ver = 0;
uint64_t stable_ver = 0;
};
struct obj_stab_action_t
{
bool rollback = false, make_stable = false;
uint64_t stable_to = 0, rollback_to = 0;
bool submitted = false;
};
struct pg_flush_batch_t
{
std::map<osd_num_t, std::vector<obj_ver_id>> rollback_lists;
std::map<osd_num_t, std::vector<obj_ver_id>> stable_lists;
int flush_ops = 0, flush_done = 0;
int flush_objects = 0;
};
struct pg_t
{
int state = 0;
int state;
uint64_t pg_cursize = 3, pg_size = 3, pg_minsize = 2;
pg_num_t pg_num;
uint64_t clean_count = 0, total_count = 0;
// target history and all potential peers
std::vector<std::vector<osd_num_t>> target_history;
std::vector<osd_num_t> all_peers;
bool history_changed = false;
// peer list from the last peering event
std::vector<osd_num_t> cur_peers;
// target_set is the "correct" peer OSD set for this PG
std::vector<osd_num_t> target_set;
// cur_set is the current set of connected peer OSDs for this PG
// cur_set = (role => osd_num or UINT64_MAX if missing). role numbers begin with zero
std::vector<osd_num_t> cur_set;
// same thing in state_dict-like format
pg_osd_set_t cur_loc_set;
// moved object map. by default, each object is considered to reside on the cur_set.
// this map stores all objects that differ.
// it may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario
// which is up to ~192 MB per 1 TB in the worst case scenario
std::map<pg_osd_set_t, pg_osd_set_state_t> state_dict;
btree::btree_map<object_id, pg_osd_set_state_t*> incomplete_objects, misplaced_objects, degraded_objects;
std::map<obj_piece_id_t, flush_action_t> flush_actions;
btree::btree_map<object_id, uint64_t> ver_override;
spp::sparse_hash_map<object_id, pg_osd_set_state_t*> obj_states;
std::map<obj_piece_id_t, obj_stab_action_t> obj_stab_actions;
spp::sparse_hash_map<object_id, uint64_t> ver_override;
pg_peering_state_t *peering_state = NULL;
pg_flush_batch_t *flush_batch = NULL;
int inflight = 0; // including write_queue
std::multimap<object_id, osd_op_t*> write_queue;
void calc_object_states(int log_level);
void print_state();
void calc_object_states();
void remember_object(pg_obj_state_check_t &st, std::vector<obj_ver_role> &all);
};
inline bool operator < (const pg_obj_loc_t &a, const pg_obj_loc_t &b)
{
return a.outdated < b.outdated ||
a.outdated == b.outdated && a.role < b.role ||
a.outdated == b.outdated && a.role == b.role && a.osd_num < b.osd_num;
return a.role < b.role || a.role == b.role && a.osd_num < b.osd_num ||
a.role == b.role && a.osd_num == b.osd_num && a.stable < b.stable;
}
inline bool operator < (const obj_ver_role & a, const obj_ver_role & b)
{
// ORDER BY inode ASC, stripe & ~STRIPE_MASK ASC, version DESC, osd_num ASC
return a.oid.inode < b.oid.inode || a.oid.inode == b.oid.inode && (
(a.oid.stripe & ~STRIPE_MASK) < (b.oid.stripe & ~STRIPE_MASK) ||
(a.oid.stripe & ~STRIPE_MASK) == (b.oid.stripe & ~STRIPE_MASK) && (
a.version > b.version || a.version == b.version && a.osd_num < b.osd_num
)
);
}
inline bool operator == (const obj_piece_id_t & a, const obj_piece_id_t & b)
@@ -132,6 +172,7 @@ namespace std
// Copy-pasted from spp::hash_combine()
seed ^= (e.role + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
seed ^= (e.osd_num + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
seed ^= ((e.stable ? 1 : 0) + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
}
return seed;
}

View File

@@ -1,54 +0,0 @@
#define _LARGEFILE64_SOURCE
#include "osd_peering_pg.h"
#define STRIPE_SHIFT 12
/**
* TODO tests for object & pg state calculation.
*
* 1) pg=1,2,3. objects:
* v1=1s,2s,3s -> clean
* v1=1s,2s,3 v2=1s,2s,_ -> degraded + needs_rollback
* v1=1s,2s,_ -> degraded
* v1=1s,2s,3s v2=1,6,_ -> degraded + needs_stabilize
* v1=2s,1s,3s -> misplaced
* v1=4,5,6 -> misplaced + needs_stabilize
* v1=1s,2s,6s -> misplaced
* 2) ...
*/
int main(int argc, char *argv[])
{
pg_t pg = {
.state = PG_PEERING,
.pg_num = 1,
.target_set = { 1, 2, 3 },
.cur_set = { 1, 2, 3 },
.peering_state = new pg_peering_state_t(),
};
for (uint64_t osd_num = 1; osd_num <= 3; osd_num++)
{
pg_list_result_t r = {
.buf = (obj_ver_id*)malloc(sizeof(obj_ver_id) * 1024*1024*8),
.total_count = 1024*1024*8,
.stable_count = (uint64_t)(1024*1024*8 - (osd_num == 1 ? 10 : 0)),
};
for (uint64_t i = 0; i < r.total_count; i++)
{
r.buf[i] = {
.oid = {
.inode = 1,
.stripe = (i << STRIPE_SHIFT) | (osd_num-1),
},
.version = (uint64_t)(osd_num == 1 && i >= r.total_count - 10 ? 2 : 1),
};
}
pg.peering_state->list_results[osd_num] = r;
}
pg.calc_object_states(0);
printf("deviation variants=%ld clean=%lu\n", pg.state_dict.size(), pg.clean_count);
for (auto it: pg.state_dict)
{
printf("dev: state=%lx\n", it.second.state);
}
return 0;
}

View File

@@ -1,81 +1,98 @@
#include "osd_primary.h"
#include "osd.h"
#include "osd_rmw.h"
#define SUBMIT_READ 0
#define SUBMIT_RMW_READ 1
#define SUBMIT_WRITE 2
// read: read directly or read paired stripe(s), reconstruct, return
// write: read paired stripe(s), reconstruct, modify, calculate parity, write
// write: read paired stripe(s), modify, write
//
// nuance: take care to read the same version from paired stripes!
// to do so, we remember "last readable" version until a write request completes
// and we postpone other write requests to the same stripe until completion of previous ones
//
// sync: sync peers, get unstable versions, stabilize them
// sync: sync peers, get unstable versions from somewhere, stabilize them
struct unstable_osd_num_t
{
osd_num_t osd_num;
int start, len;
};
struct osd_primary_op_data_t
{
int st = 0;
pg_num_t pg_num;
object_id oid;
uint64_t target_ver;
uint64_t fact_ver = 0;
int n_subops = 0, done = 0, errors = 0;
int degraded = 0, pg_size, pg_minsize;
osd_rmw_stripe_t *stripes;
osd_op_t *subops = NULL;
// for sync. oops, requires freeing
std::vector<unstable_osd_num_t> *unstable_write_osds = NULL;
obj_ver_id *unstable_writes = NULL;
};
void osd_t::finish_primary_op(osd_op_t *cur_op, int retval)
{
// FIXME add separate magic number
auto cl_it = clients.find(cur_op->peer_fd);
if (cl_it != clients.end())
{
cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
cur_op->reply.hdr.id = cur_op->req.hdr.id;
cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
cur_op->reply.hdr.retval = retval;
outbox_push(cl_it->second, cur_op);
}
else
{
delete cur_op;
}
}
bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
{
// PG number is calculated from the offset
// Our EC scheme stores data in fixed chunks equal to (K*block size)
// K = pg_minsize and will be a property of the inode. Not it's hardcoded (FIXME)
uint64_t pg_block_size = bs_block_size * 2;
object_id oid = {
.inode = cur_op->req.rw.inode,
// oid.stripe = starting offset of the parity stripe
.stripe = (cur_op->req.rw.offset/pg_block_size)*pg_block_size,
};
pg_num_t pg_num = (cur_op->req.rw.inode + oid.stripe/pg_stripe_size) % pg_count + 1;
auto pg_it = pgs.find(pg_num);
if (pg_it == pgs.end() || !(pg_it->second.state & PG_ACTIVE))
// But we must not use K in the process of calculating the PG number
// So we calculate the PG number using a separate setting which should be per-inode (FIXME)
// FIXME Real pg_num should equal the below expression + 1
pg_num_t pg_num = (cur_op->req.rw.inode + cur_op->req.rw.offset / parity_block_size) % pg_count;
// FIXME: Postpone operations in inactive PGs
if (pg_num > pgs.size() || !(pgs[pg_num].state & PG_ACTIVE))
{
// This OSD is not primary for this PG or the PG is inactive
finish_op(cur_op, -EPIPE);
finish_primary_op(cur_op, -EINVAL);
return false;
}
if ((cur_op->req.rw.offset + cur_op->req.rw.len) > (oid.stripe + pg_block_size) ||
uint64_t pg_parity_size = bs_block_size * pgs[pg_num].pg_minsize;
object_id oid = {
.inode = cur_op->req.rw.inode,
// oid.stripe = starting offset of the parity stripe, so it can be mapped back to the PG
.stripe = (cur_op->req.rw.offset / parity_block_size) * parity_block_size +
((cur_op->req.rw.offset % parity_block_size) / pg_parity_size) * pg_parity_size
};
if ((cur_op->req.rw.offset + cur_op->req.rw.len) > (oid.stripe + pg_parity_size) ||
(cur_op->req.rw.offset % bs_disk_alignment) != 0 ||
(cur_op->req.rw.len % bs_disk_alignment) != 0)
{
finish_op(cur_op, -EINVAL);
finish_primary_op(cur_op, -EINVAL);
return false;
}
osd_primary_op_data_t *op_data = (osd_primary_op_data_t*)calloc(
sizeof(osd_primary_op_data_t) + sizeof(osd_rmw_stripe_t) * pg_it->second.pg_size, 1
sizeof(osd_primary_op_data_t) + sizeof(osd_rmw_stripe_t) * pgs[pg_num].pg_size, 1
);
op_data->pg_num = pg_num;
op_data->oid = oid;
op_data->stripes = ((osd_rmw_stripe_t*)(op_data+1));
cur_op->op_data = op_data;
split_stripes(pg_it->second.pg_minsize, bs_block_size, (uint32_t)(cur_op->req.rw.offset - oid.stripe), cur_op->req.rw.len, op_data->stripes);
pg_it->second.inflight++;
split_stripes(pgs[pg_num].pg_minsize, bs_block_size, (uint32_t)(cur_op->req.rw.offset - oid.stripe), cur_op->req.rw.len, op_data->stripes);
return true;
}
static uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state)
{
if (!(pg.state & (PG_HAS_INCOMPLETE | PG_HAS_DEGRADED | PG_HAS_MISPLACED)))
{
*object_state = NULL;
return def;
}
auto st_it = pg.incomplete_objects.find(oid);
if (st_it != pg.incomplete_objects.end())
{
*object_state = st_it->second;
return st_it->second->read_target.data();
}
st_it = pg.degraded_objects.find(oid);
if (st_it != pg.degraded_objects.end())
{
*object_state = st_it->second;
return st_it->second->read_target.data();
}
st_it = pg.misplaced_objects.find(oid);
if (st_it != pg.misplaced_objects.end())
{
*object_state = st_it->second;
return st_it->second->read_target.data();
}
*object_state = NULL;
return def;
}
void osd_t::continue_primary_read(osd_op_t *cur_op)
{
if (!cur_op->op_data && !prepare_primary_rw(cur_op))
@@ -106,10 +123,14 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
else
{
// PG may be degraded or have misplaced objects
uint64_t* cur_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
auto st_it = pg.obj_states.find(op_data->oid);
uint64_t* cur_set = (st_it != pg.obj_states.end()
? st_it->second->read_target.data()
: pg.cur_set.data());
if (extend_missing_stripes(op_data->stripes, cur_set, pg.pg_minsize, pg.pg_size) < 0)
{
finish_op(cur_op, -EIO);
free(op_data);
finish_primary_op(cur_op, -EIO);
return;
}
// Submit reads
@@ -126,7 +147,9 @@ resume_1:
resume_2:
if (op_data->errors > 0)
{
finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
free(op_data);
cur_op->op_data = NULL;
finish_primary_op(cur_op, -EIO);
return;
}
if (op_data->degraded)
@@ -150,34 +173,143 @@ resume_2:
}
}
}
finish_op(cur_op, cur_op->req.rw.len);
free(op_data);
cur_op->op_data = NULL;
finish_primary_op(cur_op, cur_op->req.rw.len);
}
bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t* osd_set, osd_op_t *cur_op)
{
bool w = submit_type == SUBMIT_WRITE;
osd_primary_op_data_t *op_data = cur_op->op_data;
osd_rmw_stripe_t *stripes = op_data->stripes;
// Allocate subops
int n_subops = 0, zero_read = -1;
for (int role = 0; role < pg_size; role++)
{
if (osd_set[role] == this->osd_num || osd_set[role] != 0 && zero_read == -1)
{
zero_read = role;
}
if (osd_set[role] != 0 && (w || stripes[role].read_end != 0))
{
n_subops++;
}
}
if (!n_subops && submit_type == SUBMIT_RMW_READ)
{
n_subops = 1;
}
else
{
zero_read = -1;
}
osd_op_t *subops = new osd_op_t[n_subops];
op_data->done = op_data->errors = 0;
op_data->n_subops = n_subops;
op_data->subops = subops;
int subop = 0;
for (int role = 0; role < pg_size; role++)
{
// We always submit zero-length writes to all replicas, even if the stripe is not modified
if (!(w || stripes[role].read_end != 0 || zero_read == role))
{
continue;
}
osd_num_t role_osd_num = osd_set[role];
if (role_osd_num != 0)
{
if (role_osd_num == this->osd_num)
{
subops[subop].bs_op = new blockstore_op_t({
.opcode = (uint64_t)(w ? BS_OP_WRITE : BS_OP_READ),
.callback = [cur_op, this](blockstore_op_t *subop)
{
handle_primary_subop(cur_op, subop->retval == subop->len, subop->version);
},
.oid = {
.inode = op_data->oid.inode,
.stripe = op_data->oid.stripe | role,
},
.version = w ? 0 : (submit_type == SUBMIT_RMW_READ ? UINT64_MAX : op_data->target_ver),
.offset = w ? stripes[role].write_start : stripes[role].read_start,
.len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
.buf = w ? stripes[role].write_buf : stripes[role].read_buf,
});
bs->enqueue_op(subops[subop].bs_op);
}
else
{
subops[subop].op_type = OSD_OP_OUT;
subops[subop].send_list.push_back(subops[subop].req.buf, OSD_PACKET_SIZE);
subops[subop].peer_fd = this->osd_peer_fds.at(role_osd_num);
subops[subop].req.sec_rw = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = this->next_subop_id++,
.opcode = (uint64_t)(w ? OSD_OP_SECONDARY_WRITE : OSD_OP_SECONDARY_READ),
},
.oid = {
.inode = op_data->oid.inode,
.stripe = op_data->oid.stripe | role,
},
.version = w ? 0 : (submit_type == SUBMIT_RMW_READ ? UINT64_MAX : op_data->target_ver),
.offset = w ? stripes[role].write_start : stripes[role].read_start,
.len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
};
subops[subop].buf = w ? stripes[role].write_buf : stripes[role].read_buf;
if (w && stripes[role].write_end > 0)
{
subops[subop].send_list.push_back(stripes[role].write_buf, stripes[role].write_end - stripes[role].write_start);
}
subops[subop].callback = [cur_op, this](osd_op_t *subop)
{
// so it doesn't get freed
subop->buf = NULL;
handle_primary_subop(cur_op, subop->reply.hdr.retval == subop->req.sec_rw.len, subop->reply.sec_rw.version);
};
outbox_push(clients[subops[subop].peer_fd], &subops[subop]);
}
subop++;
}
}
}
void osd_t::handle_primary_subop(osd_op_t *cur_op, int ok, uint64_t version)
{
osd_primary_op_data_t *op_data = cur_op->op_data;
// Check if actions are pending for this object
auto act_it = pg.flush_actions.lower_bound((obj_piece_id_t){
.oid = op_data->oid,
.osd_num = 0,
});
if (act_it != pg.flush_actions.end() &&
act_it->first.oid.inode == op_data->oid.inode &&
(act_it->first.oid.stripe & ~STRIPE_MASK) == op_data->oid.stripe)
op_data->fact_ver = version;
if (!ok)
{
pg.write_queue.emplace(op_data->oid, cur_op);
return false;
// FIXME: Handle errors
op_data->errors++;
}
// Check if there are other write requests to the same object
auto vo_it = pg.write_queue.find(op_data->oid);
if (vo_it != pg.write_queue.end())
else
{
op_data->st = 1;
pg.write_queue.emplace(op_data->oid, cur_op);
return false;
op_data->done++;
}
if ((op_data->errors + op_data->done) >= op_data->n_subops)
{
delete[] op_data->subops;
op_data->subops = NULL;
op_data->st++;
if (cur_op->req.hdr.opcode == OSD_OP_READ)
{
continue_primary_read(cur_op);
}
else if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
{
continue_primary_write(cur_op);
}
else if (cur_op->req.hdr.opcode == OSD_OP_SYNC)
{
continue_primary_sync(cur_op);
}
else
{
throw std::runtime_error("BUG: unknown opcode");
}
}
pg.write_queue.emplace(op_data->oid, cur_op);
return true;
}
void osd_t::continue_primary_write(osd_op_t *cur_op)
@@ -187,115 +319,89 @@ void osd_t::continue_primary_write(osd_op_t *cur_op)
return;
}
osd_primary_op_data_t *op_data = cur_op->op_data;
// FIXME: Handle operation cancel
auto & pg = pgs[op_data->pg_num];
if (op_data->st == 1) goto resume_1;
else if (op_data->st == 2) goto resume_2;
else if (op_data->st == 3) goto resume_3;
else if (op_data->st == 4) goto resume_4;
else if (op_data->st == 5) goto resume_5;
else if (op_data->st == 6) goto resume_6;
else if (op_data->st == 7) goto resume_7;
else if (op_data->st == 8) goto resume_8;
else if (op_data->st == 9) goto resume_9;
assert(op_data->st == 0);
if (!check_write_queue(cur_op, pg))
// Check if actions are pending for this object
{
return;
auto act_it = pg.obj_stab_actions.lower_bound((obj_piece_id_t){
.oid = op_data->oid,
.osd_num = 0,
});
if (act_it != pg.obj_stab_actions.end() &&
act_it->first.oid.inode == op_data->oid.inode &&
(act_it->first.oid.stripe & ~STRIPE_MASK) == op_data->oid.stripe)
{
// FIXME postpone the request until actions are done
free(op_data);
finish_primary_op(cur_op, -EIO);
return;
}
}
// Check if there are other write requests to the same object
{
auto vo_it = pg.write_queue.find(op_data->oid);
if (vo_it != pg.write_queue.end())
{
op_data->st = 1;
pg.write_queue.emplace(op_data->oid, cur_op);
return;
}
pg.write_queue.emplace(op_data->oid, cur_op);
}
resume_1:
// Determine blocks to read and write
// Missing chunks are allowed to be overwritten even in incomplete objects
// FIXME: Allow to do small writes to the old (degraded/misplaced) OSD set for the lower performance impact
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
cur_op->rmw_buf = calc_rmw(cur_op->buf, op_data->stripes, op_data->prev_set,
pg.pg_size, pg.pg_minsize, pg.pg_cursize, pg.cur_set.data(), bs_block_size);
// Determine blocks to read
cur_op->rmw_buf = calc_rmw_reads(cur_op->buf, op_data->stripes, pg.cur_set.data(), pg.pg_size, pg.pg_minsize, pg.pg_cursize);
// Read required blocks
submit_primary_subops(SUBMIT_RMW_READ, pg.pg_size, op_data->prev_set, cur_op);
submit_primary_subops(SUBMIT_RMW_READ, pg.pg_size, pg.cur_set.data(), cur_op);
resume_2:
op_data->st = 2;
return;
resume_3:
if (op_data->errors > 0)
{
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
return;
}
// Save version override for parallel reads
pg.ver_override[op_data->oid] = op_data->fact_ver;
// Recover missing stripes, calculate parity
calc_rmw_parity(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size);
// Calculate parity
calc_rmw_parity(op_data->stripes, pg.pg_size);
// Send writes
submit_primary_subops(SUBMIT_WRITE, pg.pg_size, pg.cur_set.data(), cur_op);
resume_4:
op_data->st = 4;
return;
resume_5:
if (op_data->errors > 0)
{
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
return;
}
if (op_data->fact_ver == 1)
{
// Object is created
pg.clean_count++;
pg.total_count++;
}
if (op_data->object_state)
// Remember version as unstable
osd_num_t *osd_set = pg.cur_set.data();
for (int role = 0; role < pg.pg_size; role++)
{
if (osd_set[role] != 0)
{
int recovery_type = op_data->object_state->state & (OBJ_DEGRADED|OBJ_INCOMPLETE) ? 0 : 1;
recovery_stat_count[0][recovery_type]++;
if (!recovery_stat_count[0][recovery_type])
{
recovery_stat_count[0][recovery_type]++;
recovery_stat_bytes[0][recovery_type] = 0;
}
for (int role = 0; role < pg.pg_size; role++)
{
recovery_stat_bytes[0][recovery_type] += op_data->stripes[role].write_end - op_data->stripes[role].write_start;
}
this->unstable_writes[(osd_object_id_t){
.osd_num = osd_set[role],
.oid = {
.inode = op_data->oid.inode,
.stripe = op_data->oid.stripe | role,
},
}] = op_data->fact_ver;
}
if (op_data->object_state->state & OBJ_MISPLACED)
{
// Remove extra chunks
submit_primary_del_subops(cur_op, pg.cur_set.data(), op_data->object_state->osd_set);
if (op_data->n_subops > 0)
{
resume_8:
op_data->st = 8;
return;
resume_9:
if (op_data->errors > 0)
{
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
return;
}
}
}
// Clear object state
remove_object_from_state(op_data->oid, op_data->object_state, pg);
pg.clean_count++;
}
// Remember PG as dirty to drop the connection when PG goes offline
// (this is required because of the "lazy sync")
this->clients[cur_op->peer_fd].dirty_pgs.insert(op_data->pg_num);
// Remove version override
pg.ver_override.erase(op_data->oid);
// FIXME: Check for immediate_commit == IMMEDIATE_SMALL
resume_6:
resume_7:
if (!remember_unstable_write(cur_op, pg, pg.cur_loc_set, 6))
{
return;
}
object_id oid = op_data->oid;
finish_op(cur_op, cur_op->req.rw.len);
finish_primary_op(cur_op, cur_op->req.rw.len);
// Continue other write operations to the same object
auto next_it = pg.write_queue.find(oid);
auto this_it = next_it;
if (this_it != pg.write_queue.end() && this_it->second == cur_op)
{
auto next_it = pg.write_queue.find(op_data->oid);
auto this_it = next_it;
next_it++;
pg.write_queue.erase(this_it);
if (next_it != pg.write_queue.end() && next_it->first == oid)
if (next_it != pg.write_queue.end() &&
next_it->first == op_data->oid)
{
osd_op_t *next_op = next_it->second;
continue_primary_write(next_op);
@@ -303,98 +409,27 @@ resume_7:
}
}
bool osd_t::remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state)
{
osd_primary_op_data_t *op_data = cur_op->op_data;
if (op_data->st == base_state)
{
goto resume_6;
}
else if (op_data->st == base_state+1)
{
goto resume_7;
}
if (immediate_commit == IMMEDIATE_ALL)
{
op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
op_data->unstable_writes = new obj_ver_id[loc_set.size()];
{
int last_start = 0;
for (auto & chunk: loc_set)
{
op_data->unstable_writes[last_start] = (obj_ver_id){
.oid = {
.inode = op_data->oid.inode,
.stripe = op_data->oid.stripe | chunk.role,
},
.version = op_data->fact_ver,
};
op_data->unstable_write_osds->push_back((unstable_osd_num_t){
.osd_num = chunk.osd_num,
.start = last_start,
.len = 1,
});
last_start++;
}
}
submit_primary_stab_subops(cur_op);
resume_6:
op_data->st = 6;
return false;
resume_7:
// FIXME: Free those in the destructor?
delete op_data->unstable_write_osds;
delete[] op_data->unstable_writes;
op_data->unstable_writes = NULL;
op_data->unstable_write_osds = NULL;
if (op_data->errors > 0)
{
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
return false;
}
}
else
{
// Remember version as unstable
for (auto & chunk: loc_set)
{
this->unstable_writes[(osd_object_id_t){
.osd_num = chunk.osd_num,
.oid = {
.inode = op_data->oid.inode,
.stripe = op_data->oid.stripe | chunk.role,
},
}] = op_data->fact_ver;
}
// Remember PG as dirty to drop the connection when PG goes offline
// (this is required because of the "lazy sync")
c_cli.clients[cur_op->peer_fd].dirty_pgs.insert(op_data->pg_num);
dirty_pgs.insert(op_data->pg_num);
}
return true;
}
// Save and clear unstable_writes -> SYNC all -> STABLE all
// FIXME: Run regular automatic syncs based on the number of unstable writes and/or system time
void osd_t::continue_primary_sync(osd_op_t *cur_op)
{
if (!cur_op->op_data)
{
cur_op->op_data = (osd_primary_op_data_t*)calloc(sizeof(osd_primary_op_data_t), 1);
}
osd_primary_op_data_t *op_data = cur_op->op_data;
if (op_data->st == 1) goto resume_1;
else if (op_data->st == 2) goto resume_2;
else if (op_data->st == 3) goto resume_3;
else if (op_data->st == 4) goto resume_4;
else if (op_data->st == 5) goto resume_5;
else if (op_data->st == 6) goto resume_6;
assert(op_data->st == 0);
if (cur_op->op_data->st == 1) goto resume_1;
else if (cur_op->op_data->st == 2) goto resume_2;
else if (cur_op->op_data->st == 3) goto resume_3;
else if (cur_op->op_data->st == 4) goto resume_4;
else if (cur_op->op_data->st == 5) goto resume_5;
else if (cur_op->op_data->st == 6) goto resume_6;
assert(cur_op->op_data->st == 0);
if (syncs_in_progress.size() > 0)
{
// Wait for previous syncs, if any
// FIXME: We may try to execute the current one in parallel, like in Blockstore, but I'm not sure if it matters at all
syncs_in_progress.push_back(cur_op);
op_data->st = 1;
cur_op->op_data->st = 1;
resume_1:
return;
}
@@ -403,28 +438,27 @@ resume_1:
syncs_in_progress.push_back(cur_op);
}
resume_2:
// FIXME: Handle operation cancel
if (unstable_writes.size() == 0)
{
// Nothing to sync
goto finish;
}
// Save and clear unstable_writes
// In theory it is possible to do in on a per-client basis, but this seems to be an unnecessary complication
// It would be cool not to copy these here at all, but someone has to deduplicate them by object IDs anyway
// FIXME: This is possible to do it on a per-client basis
// It would be cool not to copy them here at all, but someone has to deduplicate them by object IDs anyway
cur_op->op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
cur_op->op_data->unstable_writes = new obj_ver_id[unstable_writes.size()];
{
op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
op_data->unstable_writes = new obj_ver_id[this->unstable_writes.size()];
op_data->dirty_pgs = new pg_num_t[dirty_pgs.size()];
op_data->dirty_pg_count = dirty_pgs.size();
osd_num_t last_osd = 0;
int last_start = 0, last_end = 0;
for (auto it = this->unstable_writes.begin(); it != this->unstable_writes.end(); it++)
for (auto it = unstable_writes.begin(); it != unstable_writes.end(); it++)
{
if (last_osd != it->first.osd_num)
{
if (last_osd != 0)
{
op_data->unstable_write_osds->push_back((unstable_osd_num_t){
cur_op->op_data->unstable_write_osds->push_back((unstable_osd_num_t){
.osd_num = last_osd,
.start = last_start,
.len = last_end - last_start,
@@ -433,7 +467,7 @@ resume_2:
last_osd = it->first.osd_num;
last_start = last_end;
}
op_data->unstable_writes[last_end] = (obj_ver_id){
cur_op->op_data->unstable_writes[last_end] = (obj_ver_id){
.oid = it->first.oid,
.version = it->second,
};
@@ -441,226 +475,129 @@ resume_2:
}
if (last_osd != 0)
{
op_data->unstable_write_osds->push_back((unstable_osd_num_t){
cur_op->op_data->unstable_write_osds->push_back((unstable_osd_num_t){
.osd_num = last_osd,
.start = last_start,
.len = last_end - last_start,
});
}
int dpg = 0;
for (auto dirty_pg_num: dirty_pgs)
{
pgs[dirty_pg_num].inflight++;
op_data->dirty_pgs[dpg++] = dirty_pg_num;
}
dirty_pgs.clear();
this->unstable_writes.clear();
}
if (immediate_commit != IMMEDIATE_ALL)
{
// SYNC
submit_primary_sync_subops(cur_op);
unstable_writes.clear();
// SYNC
submit_primary_sync_subops(cur_op);
resume_3:
op_data->st = 3;
return;
cur_op->op_data->st = 3;
return;
resume_4:
if (op_data->errors > 0)
{
goto resume_6;
}
}
// Stabilize version sets
submit_primary_stab_subops(cur_op);
resume_5:
op_data->st = 5;
cur_op->op_data->st = 5;
return;
resume_6:
if (op_data->errors > 0)
{
// Return objects back into the unstable write set
for (auto unstable_osd: *(op_data->unstable_write_osds))
{
for (int i = 0; i < unstable_osd.len; i++)
{
// Except those from peered PGs
auto & w = op_data->unstable_writes[i];
pg_num_t wpg = map_to_pg(w.oid);
if (pgs[wpg].state & PG_ACTIVE)
{
uint64_t & dest = this->unstable_writes[(osd_object_id_t){
.osd_num = unstable_osd.osd_num,
.oid = w.oid,
}];
dest = dest < w.version ? w.version : dest;
dirty_pgs.insert(wpg);
}
}
}
}
for (int i = 0; i < op_data->dirty_pg_count; i++)
{
auto & pg = pgs.at(op_data->dirty_pgs[i]);
pg.inflight--;
if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
{
finish_stop_pg(pg);
}
}
// FIXME: Free those in the destructor?
delete op_data->dirty_pgs;
delete op_data->unstable_write_osds;
delete[] op_data->unstable_writes;
op_data->unstable_writes = NULL;
op_data->unstable_write_osds = NULL;
if (op_data->errors > 0)
{
finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
}
else
{
// FIXME: Free them correctly (via a destructor or so)
delete cur_op->op_data->unstable_write_osds;
delete[] cur_op->op_data->unstable_writes;
cur_op->op_data->unstable_writes = NULL;
cur_op->op_data->unstable_write_osds = NULL;
finish:
if (cur_op->peer_fd)
{
auto it = c_cli.clients.find(cur_op->peer_fd);
if (it != c_cli.clients.end())
it->second.dirty_pgs.clear();
}
finish_op(cur_op, 0);
}
assert(syncs_in_progress.front() == cur_op);
syncs_in_progress.pop_front();
finish_primary_op(cur_op, 0);
if (syncs_in_progress.size() > 0)
{
cur_op = syncs_in_progress.front();
op_data = cur_op->op_data;
op_data->st++;
cur_op->op_data->st++;
goto resume_2;
}
}
// Decrement pg_osd_set_state_t's object_count and change PG state accordingly
void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t & pg)
void osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
{
if (object_state->state & OBJ_INCOMPLETE)
osd_primary_op_data_t *op_data = cur_op->op_data;
int n_osds = op_data->unstable_write_osds->size();
osd_op_t *subops = new osd_op_t[n_osds];
op_data->done = op_data->errors = 0;
op_data->n_subops = n_osds;
op_data->subops = subops;
for (int i = 0; i < n_osds; i++)
{
// Successful write means that object is not incomplete anymore
this->incomplete_objects--;
pg.incomplete_objects.erase(oid);
if (!pg.incomplete_objects.size())
osd_num_t sync_osd = (*(op_data->unstable_write_osds))[i].osd_num;
if (sync_osd == this->osd_num)
{
pg.state = pg.state & ~PG_HAS_INCOMPLETE;
report_pg_state(pg);
subops[i].bs_op = new blockstore_op_t({
.opcode = BS_OP_SYNC,
.callback = [cur_op, this](blockstore_op_t *subop)
{
handle_primary_subop(cur_op, subop->retval == 0, 0);
},
});
bs->enqueue_op(subops[i].bs_op);
}
}
else if (object_state->state & OBJ_DEGRADED)
{
this->degraded_objects--;
pg.degraded_objects.erase(oid);
if (!pg.degraded_objects.size())
else
{
pg.state = pg.state & ~PG_HAS_DEGRADED;
report_pg_state(pg);
subops[i].op_type = OSD_OP_OUT;
subops[i].send_list.push_back(subops[i].req.buf, OSD_PACKET_SIZE);
subops[i].peer_fd = osd_peer_fds.at(sync_osd);
subops[i].req.sec_sync = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = this->next_subop_id++,
.opcode = OSD_OP_SECONDARY_SYNC,
},
};
subops[i].callback = [cur_op, this](osd_op_t *subop)
{
handle_primary_subop(cur_op, subop->reply.hdr.retval == 0, 0);
};
outbox_push(clients[subops[i].peer_fd], &subops[i]);
}
}
else if (object_state->state & OBJ_MISPLACED)
{
this->misplaced_objects--;
pg.misplaced_objects.erase(oid);
if (!pg.misplaced_objects.size())
{
pg.state = pg.state & ~PG_HAS_MISPLACED;
report_pg_state(pg);
}
}
else
{
throw std::runtime_error("BUG: Invalid object state: "+std::to_string(object_state->state));
}
object_state->object_count--;
if (!object_state->object_count)
{
pg.state_dict.erase(object_state->osd_set);
}
}
void osd_t::continue_primary_del(osd_op_t *cur_op)
void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
{
if (!cur_op->op_data && !prepare_primary_rw(cur_op))
{
return;
}
osd_primary_op_data_t *op_data = cur_op->op_data;
auto & pg = pgs[op_data->pg_num];
if (op_data->st == 1) goto resume_1;
else if (op_data->st == 2) goto resume_2;
else if (op_data->st == 3) goto resume_3;
else if (op_data->st == 4) goto resume_4;
else if (op_data->st == 5) goto resume_5;
assert(op_data->st == 0);
// Delete is forbidden even in active PGs if they're also degraded or have previous dead OSDs
if (pg.state & (PG_DEGRADED | PG_LEFT_ON_DEAD))
int n_osds = op_data->unstable_write_osds->size();
osd_op_t *subops = new osd_op_t[n_osds];
op_data->done = op_data->errors = 0;
op_data->n_subops = n_osds;
op_data->subops = subops;
for (int i = 0; i < n_osds; i++)
{
finish_op(cur_op, -EBUSY);
return;
}
if (!check_write_queue(cur_op, pg))
{
return;
}
resume_1:
// Determine which OSDs contain this object and delete it
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
// Submit 1 read to determine the actual version number
submit_primary_subops(SUBMIT_RMW_READ, pg.pg_size, op_data->prev_set, cur_op);
resume_2:
op_data->st = 2;
return;
resume_3:
if (op_data->errors > 0)
{
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
return;
}
// Save version override for parallel reads
pg.ver_override[op_data->oid] = op_data->fact_ver;
// Submit deletes
op_data->fact_ver++;
submit_primary_del_subops(cur_op, NULL, op_data->object_state ? op_data->object_state->osd_set : pg.cur_loc_set);
resume_4:
op_data->st = 4;
return;
resume_5:
if (op_data->errors > 0)
{
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
return;
}
// Remove version override
pg.ver_override.erase(op_data->oid);
// Adjust PG stats after "instant stabilize", because we need object_state above
if (!op_data->object_state)
{
pg.clean_count--;
}
else
{
remove_object_from_state(op_data->oid, op_data->object_state, pg);
}
pg.total_count--;
object_id oid = op_data->oid;
finish_op(cur_op, cur_op->req.rw.len);
// Continue other write operations to the same object
auto next_it = pg.write_queue.find(oid);
auto this_it = next_it;
if (this_it != pg.write_queue.end() && this_it->second == cur_op)
{
next_it++;
pg.write_queue.erase(this_it);
if (next_it != pg.write_queue.end() &&
next_it->first == oid)
auto & stab_osd = (*(op_data->unstable_write_osds))[i];
if (stab_osd.osd_num == this->osd_num)
{
osd_op_t *next_op = next_it->second;
continue_primary_write(next_op);
subops[i].bs_op = new blockstore_op_t({
.opcode = BS_OP_STABLE,
.callback = [cur_op, this](blockstore_op_t *subop)
{
handle_primary_subop(cur_op, subop->retval == 0, 0);
},
.len = (uint32_t)stab_osd.len,
.buf = (void*)(op_data->unstable_writes + stab_osd.start),
});
bs->enqueue_op(subops[i].bs_op);
}
else
{
subops[i].op_type = OSD_OP_OUT;
subops[i].send_list.push_back(subops[i].req.buf, OSD_PACKET_SIZE);
subops[i].peer_fd = osd_peer_fds.at(stab_osd.osd_num);
subops[i].req.sec_stab = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = this->next_subop_id++,
.opcode = OSD_OP_SECONDARY_STABILIZE,
},
.len = (uint64_t)(stab_osd.len * sizeof(obj_ver_id)),
};
subops[i].send_list.push_back(op_data->unstable_writes + stab_osd.start, stab_osd.len * sizeof(obj_ver_id));
subops[i].callback = [cur_op, this](osd_op_t *subop)
{
handle_primary_subop(cur_op, subop->reply.hdr.retval == 0, 0);
};
outbox_push(clients[subops[i].peer_fd], &subops[i]);
}
}
}

View File

@@ -1,35 +0,0 @@
#pragma once
#include "osd.h"
#include "osd_rmw.h"
#define SUBMIT_READ 0
#define SUBMIT_RMW_READ 1
#define SUBMIT_WRITE 2
struct unstable_osd_num_t
{
osd_num_t osd_num;
int start, len;
};
struct osd_primary_op_data_t
{
int st = 0;
pg_num_t pg_num;
object_id oid;
uint64_t target_ver;
uint64_t fact_ver = 0;
int n_subops = 0, done = 0, errors = 0, epipe = 0;
int degraded = 0, pg_size, pg_minsize;
osd_rmw_stripe_t *stripes;
osd_op_t *subops = NULL;
uint64_t *prev_set = NULL;
pg_osd_set_state_t *object_state = NULL;
// for sync. oops, requires freeing
std::vector<unstable_osd_num_t> *unstable_write_osds = NULL;
pg_num_t *dirty_pgs = NULL;
int dirty_pg_count = 0;
obj_ver_id *unstable_writes = NULL;
};

View File

@@ -1,551 +0,0 @@
#include "osd_primary.h"
void osd_t::autosync()
{
// FIXME Autosync based on the number of unstable writes to prevent
// "journal_sector_buffer_count is too low for this batch" errors
if (immediate_commit != IMMEDIATE_ALL && !autosync_op)
{
autosync_op = new osd_op_t();
autosync_op->op_type = OSD_OP_IN;
autosync_op->req = {
.sync = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = 1,
.opcode = OSD_OP_SYNC,
},
},
};
autosync_op->callback = [this](osd_op_t *op)
{
if (op->reply.hdr.retval < 0)
{
printf("Warning: automatic sync resulted in an error: %ld (%s)\n", -op->reply.hdr.retval, strerror(-op->reply.hdr.retval));
}
delete autosync_op;
autosync_op = NULL;
};
exec_op(autosync_op);
}
}
void osd_t::finish_op(osd_op_t *cur_op, int retval)
{
inflight_ops--;
if (cur_op->op_data)
{
if (cur_op->op_data->pg_num > 0)
{
auto & pg = pgs[cur_op->op_data->pg_num];
pg.inflight--;
assert(pg.inflight >= 0);
if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
{
finish_stop_pg(pg);
}
}
assert(!cur_op->op_data->subops);
assert(!cur_op->op_data->unstable_write_osds);
assert(!cur_op->op_data->unstable_writes);
assert(!cur_op->op_data->dirty_pgs);
free(cur_op->op_data);
cur_op->op_data = NULL;
}
if (!cur_op->peer_fd)
{
// Copy lambda to be unaffected by `delete op`
std::function<void(osd_op_t*)>(cur_op->callback)(cur_op);
}
else
{
// FIXME add separate magic number
auto cl_it = c_cli.clients.find(cur_op->peer_fd);
if (cl_it != c_cli.clients.end())
{
cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
cur_op->reply.hdr.id = cur_op->req.hdr.id;
cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
cur_op->reply.hdr.retval = retval;
c_cli.outbox_push(cur_op);
}
else
{
delete cur_op;
}
}
}
void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t* osd_set, osd_op_t *cur_op)
{
bool w = submit_type == SUBMIT_WRITE;
osd_primary_op_data_t *op_data = cur_op->op_data;
osd_rmw_stripe_t *stripes = op_data->stripes;
// Allocate subops
int n_subops = 0, zero_read = -1;
for (int role = 0; role < pg_size; role++)
{
if (osd_set[role] == this->osd_num || osd_set[role] != 0 && zero_read == -1)
{
zero_read = role;
}
if (osd_set[role] != 0 && (w || stripes[role].read_end != 0))
{
n_subops++;
}
}
if (!n_subops && submit_type == SUBMIT_RMW_READ)
{
n_subops = 1;
}
else
{
zero_read = -1;
}
uint64_t op_version = w ? op_data->fact_ver+1 : (submit_type == SUBMIT_RMW_READ ? UINT64_MAX : op_data->target_ver);
osd_op_t *subops = new osd_op_t[n_subops];
op_data->fact_ver = 0;
op_data->done = op_data->errors = 0;
op_data->n_subops = n_subops;
op_data->subops = subops;
int i = 0;
for (int role = 0; role < pg_size; role++)
{
// We always submit zero-length writes to all replicas, even if the stripe is not modified
if (!(w || stripes[role].read_end != 0 || zero_read == role))
{
continue;
}
osd_num_t role_osd_num = osd_set[role];
if (role_osd_num != 0)
{
if (role_osd_num == this->osd_num)
{
clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
subops[i].op_type = (uint64_t)cur_op;
subops[i].bs_op = new blockstore_op_t({
.opcode = (uint64_t)(w ? BS_OP_WRITE : BS_OP_READ),
.callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
{
handle_primary_bs_subop(subop);
},
.oid = {
.inode = op_data->oid.inode,
.stripe = op_data->oid.stripe | role,
},
.version = op_version,
.offset = w ? stripes[role].write_start : stripes[role].read_start,
.len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
.buf = w ? stripes[role].write_buf : stripes[role].read_buf,
});
#ifdef OSD_DEBUG
printf(
"Submit %s to local: %lu:%lu v%lu %u-%u\n", w ? "write" : "read",
op_data->oid.inode, op_data->oid.stripe | role, op_version,
subops[i].bs_op->offset, subops[i].bs_op->len
);
#endif
bs->enqueue_op(subops[i].bs_op);
}
else
{
subops[i].op_type = OSD_OP_OUT;
subops[i].send_list.push_back(subops[i].req.buf, OSD_PACKET_SIZE);
subops[i].peer_fd = c_cli.osd_peer_fds.at(role_osd_num);
subops[i].req.sec_rw = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = c_cli.next_subop_id++,
.opcode = (uint64_t)(w ? OSD_OP_SECONDARY_WRITE : OSD_OP_SECONDARY_READ),
},
.oid = {
.inode = op_data->oid.inode,
.stripe = op_data->oid.stripe | role,
},
.version = op_version,
.offset = w ? stripes[role].write_start : stripes[role].read_start,
.len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
};
#ifdef OSD_DEBUG
printf(
"Submit %s to osd %lu: %lu:%lu v%lu %u-%u\n", w ? "write" : "read", role_osd_num,
op_data->oid.inode, op_data->oid.stripe | role, op_version,
subops[i].req.sec_rw.offset, subops[i].req.sec_rw.len
);
#endif
subops[i].buf = w ? stripes[role].write_buf : stripes[role].read_buf;
if (w && stripes[role].write_end > 0)
{
subops[i].send_list.push_back(stripes[role].write_buf, stripes[role].write_end - stripes[role].write_start);
}
subops[i].callback = [cur_op, this](osd_op_t *subop)
{
int fail_fd = subop->req.hdr.opcode == OSD_OP_SECONDARY_WRITE &&
subop->reply.hdr.retval != subop->req.sec_rw.len ? subop->peer_fd : -1;
// so it doesn't get freed
subop->buf = NULL;
handle_primary_subop(subop, cur_op);
if (fail_fd >= 0)
{
// write operation failed, drop the connection
c_cli.stop_client(fail_fd);
}
};
c_cli.outbox_push(&subops[i]);
}
i++;
}
}
}
static uint64_t bs_op_to_osd_op[] = {
0,
OSD_OP_SECONDARY_READ, // BS_OP_READ
OSD_OP_SECONDARY_WRITE, // BS_OP_WRITE
OSD_OP_SECONDARY_SYNC, // BS_OP_SYNC
OSD_OP_SECONDARY_STABILIZE, // BS_OP_STABLE
OSD_OP_SECONDARY_DELETE, // BS_OP_DELETE
OSD_OP_SECONDARY_LIST, // BS_OP_LIST
OSD_OP_SECONDARY_ROLLBACK, // BS_OP_ROLLBACK
OSD_OP_TEST_SYNC_STAB_ALL, // BS_OP_SYNC_STAB_ALL
};
void osd_t::handle_primary_bs_subop(osd_op_t *subop)
{
osd_op_t *cur_op = (osd_op_t*)subop->op_type;
blockstore_op_t *bs_op = subop->bs_op;
int expected = bs_op->opcode == BS_OP_READ || bs_op->opcode == BS_OP_WRITE ? bs_op->len : 0;
if (bs_op->retval != expected && bs_op->opcode != BS_OP_READ)
{
// die
throw std::runtime_error(
"local blockstore modification failed (opcode = "+std::to_string(bs_op->opcode)+
" retval = "+std::to_string(bs_op->retval)+")"
);
}
add_bs_subop_stats(subop);
subop->req.hdr.opcode = bs_op_to_osd_op[bs_op->opcode];
subop->reply.hdr.retval = bs_op->retval;
if (bs_op->opcode == BS_OP_READ || bs_op->opcode == BS_OP_WRITE)
{
subop->req.sec_rw.len = bs_op->len;
subop->reply.sec_rw.version = bs_op->version;
}
delete bs_op;
subop->bs_op = NULL;
handle_primary_subop(subop, cur_op);
}
void osd_t::add_bs_subop_stats(osd_op_t *subop)
{
// Include local blockstore ops in statistics
uint64_t opcode = bs_op_to_osd_op[subop->bs_op->opcode];
timespec tv_end;
clock_gettime(CLOCK_REALTIME, &tv_end);
c_cli.stats.op_stat_count[opcode]++;
if (!c_cli.stats.op_stat_count[opcode])
{
c_cli.stats.op_stat_count[opcode] = 1;
c_cli.stats.op_stat_sum[opcode] = 0;
c_cli.stats.op_stat_bytes[opcode] = 0;
}
c_cli.stats.op_stat_sum[opcode] += (
(tv_end.tv_sec - subop->tv_begin.tv_sec)*1000000 +
(tv_end.tv_nsec - subop->tv_begin.tv_nsec)/1000
);
if (opcode == OSD_OP_SECONDARY_READ || opcode == OSD_OP_SECONDARY_WRITE)
{
c_cli.stats.op_stat_bytes[opcode] += subop->bs_op->len;
}
}
void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
{
uint64_t opcode = subop->req.hdr.opcode;
int retval = subop->reply.hdr.retval;
int expected = opcode == OSD_OP_SECONDARY_READ || opcode == OSD_OP_SECONDARY_WRITE
? subop->req.sec_rw.len : 0;
osd_primary_op_data_t *op_data = cur_op->op_data;
if (retval != expected)
{
printf("%s subop failed: retval = %d (expected %d)\n", osd_op_names[opcode], retval, expected);
if (retval == -EPIPE)
{
op_data->epipe++;
}
op_data->errors++;
}
else
{
op_data->done++;
if (opcode == OSD_OP_SECONDARY_READ || opcode == OSD_OP_SECONDARY_WRITE)
{
uint64_t version = subop->reply.sec_rw.version;
#ifdef OSD_DEBUG
uint64_t peer_osd = c_cli.clients.find(subop->peer_fd) != c_cli.clients.end()
? c_cli.clients[subop->peer_fd].osd_num : osd_num;
printf("subop %lu from osd %lu: version = %lu\n", opcode, peer_osd, version);
#endif
if (op_data->fact_ver != 0 && op_data->fact_ver != version)
{
throw std::runtime_error(
"different fact_versions returned from "+std::string(osd_op_names[opcode])+
" subops: "+std::to_string(version)+" vs "+std::to_string(op_data->fact_ver)
);
}
op_data->fact_ver = version;
}
}
if ((op_data->errors + op_data->done) >= op_data->n_subops)
{
delete[] op_data->subops;
op_data->subops = NULL;
op_data->st++;
if (cur_op->req.hdr.opcode == OSD_OP_READ)
{
continue_primary_read(cur_op);
}
else if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
{
continue_primary_write(cur_op);
}
else if (cur_op->req.hdr.opcode == OSD_OP_SYNC)
{
continue_primary_sync(cur_op);
}
else if (cur_op->req.hdr.opcode == OSD_OP_DELETE)
{
continue_primary_del(cur_op);
}
else
{
throw std::runtime_error("BUG: unknown opcode");
}
}
}
void osd_t::cancel_primary_write(osd_op_t *cur_op)
{
if (cur_op->op_data && cur_op->op_data->subops)
{
// Primary-write operation is waiting for subops, subops
// are sent to peer OSDs, so we can't just throw them away.
// Mark them with an extra EPIPE.
cur_op->op_data->errors++;
cur_op->op_data->epipe++;
cur_op->op_data->done--; // Caution: `done` must be signed because may become -1 here
}
else
{
finish_op(cur_op, -EPIPE);
}
}
void osd_t::submit_primary_del_subops(osd_op_t *cur_op, uint64_t *cur_set, pg_osd_set_t & loc_set)
{
osd_primary_op_data_t *op_data = cur_op->op_data;
int extra_chunks = 0;
for (auto & chunk: loc_set)
{
if (!cur_set || chunk.osd_num != cur_set[chunk.role])
{
extra_chunks++;
}
}
op_data->n_subops = extra_chunks;
op_data->done = op_data->errors = 0;
if (!extra_chunks)
{
return;
}
osd_op_t *subops = new osd_op_t[extra_chunks];
op_data->subops = subops;
int i = 0;
for (auto & chunk: loc_set)
{
if (!cur_set || chunk.osd_num != cur_set[chunk.role])
{
if (chunk.osd_num == this->osd_num)
{
clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
subops[i].op_type = (uint64_t)cur_op;
subops[i].bs_op = new blockstore_op_t({
.opcode = BS_OP_DELETE,
.callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
{
handle_primary_bs_subop(subop);
},
.oid = {
.inode = op_data->oid.inode,
.stripe = op_data->oid.stripe | chunk.role,
},
// Same version as write
.version = op_data->fact_ver,
});
bs->enqueue_op(subops[i].bs_op);
}
else
{
subops[i].op_type = OSD_OP_OUT;
subops[i].send_list.push_back(subops[i].req.buf, OSD_PACKET_SIZE);
subops[i].peer_fd = c_cli.osd_peer_fds.at(chunk.osd_num);
subops[i].req.sec_del = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = c_cli.next_subop_id++,
.opcode = OSD_OP_SECONDARY_DELETE,
},
.oid = {
.inode = op_data->oid.inode,
.stripe = op_data->oid.stripe | chunk.role,
},
// Same version as write
.version = op_data->fact_ver,
};
subops[i].callback = [cur_op, this](osd_op_t *subop)
{
int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
handle_primary_subop(subop, cur_op);
if (fail_fd >= 0)
{
// delete operation failed, drop the connection
c_cli.stop_client(fail_fd);
}
};
c_cli.outbox_push(&subops[i]);
}
i++;
}
}
}
void osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
{
osd_primary_op_data_t *op_data = cur_op->op_data;
int n_osds = op_data->unstable_write_osds->size();
osd_op_t *subops = new osd_op_t[n_osds];
op_data->done = op_data->errors = 0;
op_data->n_subops = n_osds;
op_data->subops = subops;
for (int i = 0; i < n_osds; i++)
{
osd_num_t sync_osd = (*(op_data->unstable_write_osds))[i].osd_num;
if (sync_osd == this->osd_num)
{
clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
subops[i].op_type = (uint64_t)cur_op;
subops[i].bs_op = new blockstore_op_t({
.opcode = BS_OP_SYNC,
.callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
{
handle_primary_bs_subop(subop);
},
});
bs->enqueue_op(subops[i].bs_op);
}
else
{
subops[i].op_type = OSD_OP_OUT;
subops[i].send_list.push_back(subops[i].req.buf, OSD_PACKET_SIZE);
subops[i].peer_fd = c_cli.osd_peer_fds.at(sync_osd);
subops[i].req.sec_sync = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = c_cli.next_subop_id++,
.opcode = OSD_OP_SECONDARY_SYNC,
},
};
subops[i].callback = [cur_op, this](osd_op_t *subop)
{
int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
handle_primary_subop(subop, cur_op);
if (fail_fd >= 0)
{
// sync operation failed, drop the connection
c_cli.stop_client(fail_fd);
}
};
c_cli.outbox_push(&subops[i]);
}
}
}
void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
{
osd_primary_op_data_t *op_data = cur_op->op_data;
int n_osds = op_data->unstable_write_osds->size();
osd_op_t *subops = new osd_op_t[n_osds];
op_data->done = op_data->errors = 0;
op_data->n_subops = n_osds;
op_data->subops = subops;
for (int i = 0; i < n_osds; i++)
{
auto & stab_osd = (*(op_data->unstable_write_osds))[i];
if (stab_osd.osd_num == this->osd_num)
{
clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
subops[i].op_type = (uint64_t)cur_op;
subops[i].bs_op = new blockstore_op_t({
.opcode = BS_OP_STABLE,
.callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
{
handle_primary_bs_subop(subop);
},
.len = (uint32_t)stab_osd.len,
.buf = (void*)(op_data->unstable_writes + stab_osd.start),
});
bs->enqueue_op(subops[i].bs_op);
}
else
{
subops[i].op_type = OSD_OP_OUT;
subops[i].send_list.push_back(subops[i].req.buf, OSD_PACKET_SIZE);
subops[i].peer_fd = c_cli.osd_peer_fds.at(stab_osd.osd_num);
subops[i].req.sec_stab = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = c_cli.next_subop_id++,
.opcode = OSD_OP_SECONDARY_STABILIZE,
},
.len = (uint64_t)(stab_osd.len * sizeof(obj_ver_id)),
};
subops[i].send_list.push_back(op_data->unstable_writes + stab_osd.start, stab_osd.len * sizeof(obj_ver_id));
subops[i].callback = [cur_op, this](osd_op_t *subop)
{
int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
handle_primary_subop(subop, cur_op);
if (fail_fd >= 0)
{
// sync operation failed, drop the connection
c_cli.stop_client(fail_fd);
}
};
c_cli.outbox_push(&subops[i]);
}
}
}
void osd_t::pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid, int retval)
{
auto st_it = pg.write_queue.find(oid), it = st_it;
finish_op(first_op, retval);
if (it != pg.write_queue.end() && it->second == first_op)
{
it++;
}
else
{
// Write queue doesn't match the first operation.
// first_op is a leftover operation from the previous peering of the same PG.
return;
}
while (it != pg.write_queue.end() && it->first == oid)
{
finish_op(it->second, retval);
it++;
}
if (st_it != it)
{
pg.write_queue.erase(st_it, it);
}
}

204
osd_receive.cpp Normal file
View File

@@ -0,0 +1,204 @@
#include "osd.h"
void osd_t::read_requests()
{
for (int i = 0; i < read_ready_clients.size(); i++)
{
int peer_fd = read_ready_clients[i];
auto & cl = clients[peer_fd];
io_uring_sqe* sqe = ringloop->get_sqe();
if (!sqe)
{
read_ready_clients.erase(read_ready_clients.begin(), read_ready_clients.begin() + i);
return;
}
ring_data_t* data = ((ring_data_t*)sqe->user_data);
if (!cl.read_buf)
{
// no reads in progress
// so this is either a new command or a reply to a previously sent command
if (!cl.read_op)
{
cl.read_op = new osd_op_t;
cl.read_op->peer_fd = peer_fd;
}
cl.read_op->op_type = OSD_OP_IN;
cl.read_buf = &cl.read_op->req.buf;
cl.read_remaining = OSD_PACKET_SIZE;
cl.read_state = CL_READ_OP;
}
cl.read_iov.iov_base = cl.read_buf;
cl.read_iov.iov_len = cl.read_remaining;
cl.read_msg.msg_iov = &cl.read_iov;
cl.read_msg.msg_iovlen = 1;
data->callback = [this, peer_fd](ring_data_t *data) { handle_read(data, peer_fd); };
my_uring_prep_recvmsg(sqe, peer_fd, &cl.read_msg, 0);
}
read_ready_clients.clear();
}
void osd_t::handle_read(ring_data_t *data, int peer_fd)
{
auto cl_it = clients.find(peer_fd);
if (cl_it != clients.end())
{
auto & cl = cl_it->second;
if (data->res == -EAGAIN)
{
cl.read_ready--;
if (cl.read_ready > 0)
read_ready_clients.push_back(peer_fd);
return;
}
else if (data->res < 0)
{
// this is a client socket, so don't panic. just disconnect it
printf("Client %d socket read error: %d (%s). Disconnecting client\n", peer_fd, -data->res, strerror(-data->res));
stop_client(peer_fd);
return;
}
read_ready_clients.push_back(peer_fd);
if (data->res > 0)
{
cl.read_remaining -= data->res;
cl.read_buf += data->res;
if (cl.read_remaining <= 0)
{
cl.read_buf = NULL;
if (cl.read_state == CL_READ_OP)
{
if (cl.read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC)
{
handle_reply_hdr(&cl);
}
else
{
handle_op_hdr(&cl);
}
}
else if (cl.read_state == CL_READ_DATA)
{
// Operation is ready
exec_op(cl.read_op);
cl.read_op = NULL;
cl.read_state = 0;
}
else if (cl.read_state == CL_READ_REPLY_DATA)
{
// Reply is ready
auto req_it = cl.sent_ops.find(cl.read_reply_id);
osd_op_t *request = req_it->second;
cl.sent_ops.erase(req_it);
cl.read_reply_id = 0;
cl.read_state = 0;
// Measure subop latency
timespec tv_end;
clock_gettime(CLOCK_REALTIME, &tv_end);
subop_stat_count[request->req.hdr.opcode]++;
subop_stat_sum[request->req.hdr.opcode] += (
(tv_end.tv_sec - request->tv_begin.tv_sec)*1000000 +
(tv_end.tv_nsec - request->tv_begin.tv_nsec)/1000
);
request->callback(request);
}
}
}
}
}
void osd_t::handle_op_hdr(osd_client_t *cl)
{
osd_op_t *cur_op = cl->read_op;
if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_READ)
{
if (cur_op->req.sec_rw.len > 0)
cur_op->buf = memalign(512, cur_op->req.sec_rw.len);
cl->read_remaining = 0;
}
else if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
{
if (cur_op->req.sec_rw.len > 0)
cur_op->buf = memalign(512, cur_op->req.sec_rw.len);
cl->read_remaining = cur_op->req.sec_rw.len;
}
else if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_STABILIZE ||
cur_op->req.hdr.opcode == OSD_OP_SECONDARY_ROLLBACK)
{
if (cur_op->req.sec_stab.len > 0)
cur_op->buf = memalign(512, cur_op->req.sec_stab.len);
cl->read_remaining = cur_op->req.sec_stab.len;
}
else if (cur_op->req.hdr.opcode == OSD_OP_READ)
{
if (cur_op->req.rw.len > 0)
cur_op->buf = memalign(512, cur_op->req.rw.len);
cl->read_remaining = 0;
}
else if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
{
if (cur_op->req.rw.len > 0)
cur_op->buf = memalign(512, cur_op->req.rw.len);
cl->read_remaining = cur_op->req.rw.len;
}
if (cl->read_remaining > 0)
{
// Read data
cl->read_buf = cur_op->buf;
cl->read_state = CL_READ_DATA;
}
else
{
// Operation is ready
cl->read_op = NULL;
cl->read_state = 0;
exec_op(cur_op);
}
}
void osd_t::handle_reply_hdr(osd_client_t *cl)
{
osd_op_t *cur_op = cl->read_op;
auto req_it = cl->sent_ops.find(cur_op->req.hdr.id);
if (req_it == cl->sent_ops.end())
{
// Command out of sync. Drop connection
printf("Client %d command out of sync: id %lu\n", cl->peer_fd, cur_op->req.hdr.id);
stop_client(cl->peer_fd);
return;
}
osd_op_t *op = req_it->second;
memcpy(op->reply.buf, cur_op->req.buf, OSD_PACKET_SIZE);
if (op->reply.hdr.opcode == OSD_OP_SECONDARY_READ &&
op->reply.hdr.retval > 0)
{
// Read data. In this case we assume that the buffer is preallocated by the caller (!)
assert(op->buf);
cl->read_state = CL_READ_REPLY_DATA;
cl->read_reply_id = op->req.hdr.id;
cl->read_buf = op->buf;
cl->read_remaining = op->reply.hdr.retval;
}
else if (op->reply.hdr.opcode == OSD_OP_SECONDARY_LIST &&
op->reply.hdr.retval > 0)
{
op->buf = memalign(512, sizeof(obj_ver_id) * op->reply.hdr.retval);
cl->read_state = CL_READ_REPLY_DATA;
cl->read_reply_id = op->req.hdr.id;
cl->read_buf = op->buf;
cl->read_remaining = sizeof(obj_ver_id) * op->reply.hdr.retval;
}
else
{
cl->read_state = 0;
cl->sent_ops.erase(req_it);
// Measure subop latency
timespec tv_end;
clock_gettime(CLOCK_REALTIME, &tv_end);
subop_stat_count[op->req.hdr.opcode]++;
subop_stat_sum[op->req.hdr.opcode] += (
(tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
(tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000
);
op->callback(op);
}
}

View File

@@ -1,5 +1,4 @@
#include <malloc.h>
#include <string.h>
#include <assert.h>
#include "xor.h"
#include "osd_rmw.h"
@@ -56,11 +55,6 @@ static inline void cover_read(uint32_t start, uint32_t end, osd_rmw_stripe_t & s
void split_stripes(uint64_t pg_minsize, uint32_t bs_block_size, uint32_t start, uint32_t end, osd_rmw_stripe_t *stripes)
{
if (end == 0)
{
// Zero length request - offset doesn't matter
return;
}
end = start+end;
for (int role = 0; role < pg_minsize; role++)
{
@@ -85,21 +79,18 @@ void reconstruct_stripe(osd_rmw_stripe_t *stripes, int pg_size, int role)
}
else if (prev >= 0)
{
assert(stripes[role].read_start >= stripes[prev].read_start &&
stripes[role].read_start >= stripes[other].read_start);
memxor(
stripes[prev].read_buf + (stripes[role].read_start - stripes[prev].read_start),
stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start),
stripes[prev].read_buf + (stripes[prev].read_start - stripes[role].read_start),
stripes[other].read_buf + (stripes[other].read_start - stripes[other].read_start),
stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start
);
prev = -1;
}
else
{
assert(stripes[role].read_start >= stripes[other].read_start);
memxor(
stripes[role].read_buf,
stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start),
stripes[other].read_buf + (stripes[other].read_start - stripes[role].read_start),
stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start
);
}
@@ -165,11 +156,10 @@ void* alloc_read_buffer(osd_rmw_stripe_t *stripes, int read_pg_size, uint64_t ad
return buf;
}
void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_set,
uint64_t pg_size, uint64_t pg_minsize, uint64_t pg_cursize, uint64_t *write_osd_set, uint64_t chunk_size)
void* calc_rmw_reads(void *write_buf, osd_rmw_stripe_t *stripes, uint64_t *osd_set, uint64_t pg_size, uint64_t pg_minsize, uint64_t pg_cursize)
{
// Generic parity modification (read-modify-write) algorithm
// Read -> Reconstruct missing chunks -> Calc parity chunks -> Write
// Reconstruct -> Read -> Calc parity -> Write
// Now we always read continuous ranges. This means that an update of the beginning
// of one data stripe and the end of another will lead to a read of full paired stripes.
// FIXME: (Maybe) read small individual ranges in that case instead.
@@ -184,90 +174,64 @@ void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_
stripes[role].write_end = stripes[role].req_end;
}
}
int write_parity = 0;
for (int role = 0; role < pg_minsize; role++)
{
cover_read(start, end, stripes[role]);
}
int has_parity = 0;
for (int role = pg_minsize; role < pg_size; role++)
{
if (write_osd_set[role] != 0)
if (osd_set[role] != 0)
{
write_parity = 1;
has_parity++;
stripes[role].write_start = start;
stripes[role].write_end = end;
}
}
if (write_parity)
{
for (int role = 0; role < pg_minsize; role++)
{
cover_read(start, end, stripes[role]);
}
}
if (write_osd_set != read_osd_set)
{
pg_cursize = 0;
// Object is degraded/misplaced and will be moved to <write_osd_set>
for (int role = 0; role < pg_size; role++)
{
if (write_osd_set[role] != read_osd_set[role])
{
// FIXME: For EC more than 2+1: handle case when write_osd_set == 0 and read_osd_set != 0
// We need to get data for any moved / recovered chunk
// And we need a continuous write buffer so we'll only optimize
// for the case when the whole chunk is ovewritten in the request
if (stripes[role].req_start != 0 ||
stripes[role].req_end != chunk_size)
{
stripes[role].read_start = 0;
stripes[role].read_end = chunk_size;
// Warning: We don't modify write_start/write_end here, we do it in calc_rmw_parity()
}
}
if (read_osd_set[role] != 0)
{
pg_cursize++;
}
}
else
stripes[role].missing = true;
}
if (pg_cursize < pg_size)
{
// Some stripe(s) are missing, so we need to read parity
for (int role = 0; role < pg_size; role++)
if (has_parity == 0)
{
if (read_osd_set[role] == 0)
// Parity is missing, we don't need to read anything
for (int role = 0; role < pg_minsize; role++)
{
stripes[role].missing = true;
if (stripes[role].read_end != 0)
stripes[role].read_end = 0;
}
}
else
{
// Other stripe(s) are missing
for (int role = 0; role < pg_minsize; role++)
{
if (osd_set[role] == 0 && stripes[role].read_end != 0)
{
int found = 0;
for (int r2 = 0; r2 < pg_size && found < pg_minsize; r2++)
stripes[role].missing = true;
for (int r2 = 0; r2 < pg_size; r2++)
{
// Read the non-covered range of <role> from at least <minsize> other stripes to reconstruct it
if (read_osd_set[r2] != 0)
// Read the non-covered range of <role> from all other stripes to reconstruct it
if (r2 != role && osd_set[r2] != 0)
{
extend_read(stripes[role].read_start, stripes[role].read_end, stripes[r2]);
found++;
}
}
if (found < pg_minsize)
{
// FIXME Object is incomplete - refuse partial overwrite
assert(0);
}
}
}
}
}
// Allocate read buffers
void *rmw_buf = alloc_read_buffer(stripes, pg_size, (write_parity ? pg_size-pg_minsize : 0) * (end - start));
// Position write buffers
void *rmw_buf = alloc_read_buffer(stripes, pg_size, has_parity * (end - start));
// Position parity & write buffers
uint64_t buf_pos = 0, in_pos = 0;
for (int role = 0; role < pg_size; role++)
{
if (stripes[role].req_end != 0)
{
stripes[role].write_buf = request_buf + in_pos;
stripes[role].write_buf = write_buf + in_pos;
in_pos += stripes[role].req_end - stripes[role].req_start;
}
else if (role >= pg_minsize && write_osd_set[role] != 0 && end != 0)
else if (role >= pg_minsize && osd_set[role] != 0)
{
stripes[role].write_buf = rmw_buf + buf_pos;
buf_pos += end - start;
@@ -357,9 +321,13 @@ static void xor_multiple_buffers(buf_len_t *xor1, int n1, buf_len_t *xor2, int n
}
}
void calc_rmw_parity(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size)
void calc_rmw_parity(osd_rmw_stripe_t *stripes, int pg_size)
{
int pg_minsize = pg_size-1;
if (stripes[pg_size-1].missing)
{
// Parity OSD is unavailable
return;
}
for (int role = 0; role < pg_size; role++)
{
if (stripes[role].read_end != 0 && stripes[role].missing)
@@ -369,82 +337,31 @@ void calc_rmw_parity(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_osd_
break;
}
}
uint32_t start = 0, end = 0;
if (!stripes[pg_minsize].missing || write_osd_set != read_osd_set)
// Calculate new parity (EC k+1)
int parity = pg_size-1, prev = -2;
auto wr_end = stripes[parity].write_end;
auto wr_start = stripes[parity].write_start;
for (int other = 0; other < pg_size-1; other++)
{
for (int role = 0; role < pg_minsize; role++)
if (prev == -2)
{
if (stripes[role].req_end != 0)
{
start = !end || stripes[role].req_start < start ? stripes[role].req_start : start;
end = std::max(stripes[role].req_end, end);
}
prev = other;
}
}
if (write_osd_set != read_osd_set)
{
for (int role = 0; role < pg_minsize; role++)
else
{
if (write_osd_set[role] != read_osd_set[role] &&
(stripes[role].req_start != 0 || stripes[role].req_end != chunk_size))
int n1 = 0, n2 = 0;
buf_len_t xor1[3], xor2[3];
if (prev == -1)
{
// FIXME again, handle case when write_osd_set[role] is 0
// Copy modified chunk into the read buffer to write it back
memcpy(
stripes[role].read_buf + stripes[role].req_start,
stripes[role].write_buf,
stripes[role].req_end - stripes[role].req_start
);
stripes[role].write_buf = stripes[role].read_buf;
stripes[role].write_start = 0;
stripes[role].write_end = chunk_size;
}
}
}
if (!stripes[pg_minsize].missing && end != 0)
{
// Calculate new parity (EC k+1)
int parity = pg_minsize, prev = -2;
for (int other = 0; other < pg_minsize; other++)
{
if (prev == -2)
{
prev = other;
xor1[n1++] = { .buf = stripes[parity].write_buf, .len = wr_end-wr_start };
}
else
{
int n1 = 0, n2 = 0;
buf_len_t xor1[3], xor2[3];
if (prev == -1)
{
xor1[n1++] = { .buf = stripes[parity].write_buf, .len = end-start };
}
else
{
get_old_new_buffers(stripes[prev], start, end, xor1, n1);
prev = -1;
}
get_old_new_buffers(stripes[other], start, end, xor2, n2);
xor_multiple_buffers(xor1, n1, xor2, n2, stripes[parity].write_buf, end-start);
}
}
}
if (write_osd_set != read_osd_set)
{
for (int role = pg_minsize; role < pg_size; role++)
{
if (write_osd_set[role] != read_osd_set[role] && (start != 0 || end != chunk_size))
{
// Copy new parity into the read buffer to write it back
memcpy(
stripes[role].read_buf + start,
stripes[role].write_buf,
end - start
);
stripes[role].write_buf = stripes[role].read_buf;
stripes[role].write_start = 0;
stripes[role].write_end = chunk_size;
get_old_new_buffers(stripes[prev], wr_start, wr_end, xor1, n1);
prev = -1;
}
get_old_new_buffers(stripes[other], wr_start, wr_end, xor2, n2);
xor_multiple_buffers(xor1, n1, xor2, n2, stripes[parity].write_buf, wr_end-wr_start);
}
}
}

View File

@@ -31,7 +31,6 @@ int extend_missing_stripes(osd_rmw_stripe_t *stripes, osd_num_t *osd_set, int mi
void* alloc_read_buffer(osd_rmw_stripe_t *stripes, int read_pg_size, uint64_t add_size);
void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_set,
uint64_t pg_size, uint64_t pg_minsize, uint64_t pg_cursize, uint64_t *write_osd_set, uint64_t chunk_size);
void* calc_rmw_reads(void *write_buf, osd_rmw_stripe_t *stripes, uint64_t *osd_set, uint64_t pg_size, uint64_t pg_minsize, uint64_t pg_cursize);
void calc_rmw_parity(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size);
void calc_rmw_parity(osd_rmw_stripe_t *stripes, int pg_size);

View File

@@ -2,147 +2,16 @@
#include "osd_rmw.cpp"
#include "test_pattern.h"
void dump_stripes(osd_rmw_stripe_t *stripes, int pg_size);
void test1();
void test4();
void test5();
void test6();
void test7();
void test8();
void test9();
/***
Cases:
1. split(offset=128K-4K, len=8K)
= [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 0 ] ]
2. read(offset=128K-4K, len=8K, osd_set=[1,0,3])
= { read: [ [ 0, 128K ], [ 0, 4K ], [ 0, 4K ] ] }
3. cover_read(0, 128K, { req: [ 128K-4K, 4K ] })
= { read: [ 0, 128K-4K ] }
4. write(offset=128K-4K, len=8K, osd_set=[1,0,3])
= {
read: [ [ 0, 128K ], [ 4K, 128K ], [ 4K, 128K ] ],
write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
input buffer: [ write0, write1 ],
rmw buffer: [ write2, read0, read1, read2 ],
}
+ check write2 buffer
5. write(offset=0, len=128K+64K, osd_set=[1,0,3])
= {
req: [ [ 0, 128K ], [ 0, 64K ], [ 0, 0 ] ],
read: [ [ 64K, 128K ], [ 64K, 128K ], [ 64K, 128K ] ],
write: [ [ 0, 128K ], [ 0, 64K ], [ 0, 128K ] ],
input buffer: [ write0, write1 ],
rmw buffer: [ write2, read0, read1, read2 ],
}
6. write(offset=0, len=128K+64K, osd_set=[1,2,3])
= {
req: [ [ 0, 128K ], [ 0, 64K ], [ 0, 0 ] ],
read: [ [ 0, 0 ], [ 64K, 128K ], [ 0, 0 ] ],
write: [ [ 0, 128K ], [ 0, 64K ], [ 0, 128K ] ],
input buffer: [ write0, write1 ],
rmw buffer: [ write2, read1 ],
}
7. calc_rmw(offset=128K-4K, len=8K, osd_set=[1,0,3], write_set=[1,2,3])
= {
read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
input buffer: [ write0, write1 ],
rmw buffer: [ write2, read0, read1, read2 ],
}
then, after calc_rmw_parity(): {
write: [ [ 128K-4K, 128K ], [ 0, 128K ], [ 0, 128K ] ],
write1==read1,
}
+ check write1 buffer
+ check write2 buffer
8. calc_rmw(offset=0, len=128K+4K, osd_set=[0,2,3], write_set=[1,2,3])
= {
read: [ [ 0, 0 ], [ 4K, 128K ], [ 0, 0 ] ],
write: [ [ 0, 128K ], [ 0, 4K ], [ 0, 128K ] ],
input buffer: [ write0, write1 ],
rmw buffer: [ write2, read1 ],
}
+ check write2 buffer
9. object recovery case:
calc_rmw(offset=0, len=0, read_osd_set=[0,2,3], write_osd_set=[1,2,3])
= {
read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
write: [ [ 0, 0 ], [ 0, 0 ], [ 0, 0 ] ],
input buffer: NULL,
rmw buffer: [ read0, read1, read2 ],
}
then, after calc_rmw_parity(): {
write: [ [ 0, 128K ], [ 0, 0 ], [ 0, 0 ] ],
write0==read0,
}
+ check write0 buffer
***/
int main(int narg, char *args[])
{
// Test 1
test1();
// Test 4
test4();
// Test 5
test5();
// Test 6
test6();
// Test 7
test7();
// Test 8
test8();
// Test 9
test9();
// End
printf("all ok\n");
return 0;
}
void dump_stripes(osd_rmw_stripe_t *stripes, int pg_size)
{
printf("request");
for (int i = 0; i < pg_size; i++)
{
printf(" {%uK-%uK}", stripes[i].req_start/1024, stripes[i].req_end/1024);
}
printf("\n");
printf("read");
for (int i = 0; i < pg_size; i++)
{
printf(" {%uK-%uK}", stripes[i].read_start/1024, stripes[i].read_end/1024);
}
printf("\n");
printf("write");
for (int i = 0; i < pg_size; i++)
{
printf(" {%uK-%uK}", stripes[i].write_start/1024, stripes[i].write_end/1024);
}
printf("\n");
}
void test1()
{
osd_num_t osd_set[3] = { 1, 0, 3 };
osd_rmw_stripe_t stripes[3] = { 0 };
// Test 1.1
// Test 1
split_stripes(2, 128*1024, 128*1024-4096, 8192, stripes);
assert(stripes[0].req_start == 128*1024-4096 && stripes[0].req_end == 128*1024);
assert(stripes[1].req_start == 0 && stripes[1].req_end == 4096);
assert(stripes[2].req_end == 0);
// Test 1.2
// Test 2
for (int i = 0; i < 3; i++)
{
stripes[i].read_start = stripes[i].req_start;
@@ -151,26 +20,18 @@ void test1()
assert(extend_missing_stripes(stripes, osd_set, 2, 3) == 0);
assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
assert(stripes[2].read_start == 0 && stripes[2].read_end == 4096);
// Test 1.3
// Test 3
stripes[0] = { .req_start = 128*1024-4096, .req_end = 128*1024 };
cover_read(0, 128*1024, stripes[0]);
assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024-4096);
}
void test4()
{
osd_num_t osd_set[3] = { 1, 0, 3 };
osd_rmw_stripe_t stripes[3] = { 0 };
// Test 4.1
memset(stripes, 0, sizeof(stripes));
split_stripes(2, 128*1024, 128*1024-4096, 8192, stripes);
void* write_buf = malloc(8192);
void* rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 2, osd_set, 128*1024);
void* rmw_buf = calc_rmw_reads(write_buf, stripes, osd_set, 3, 2, 2);
assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
assert(stripes[1].read_start == 4096 && stripes[1].read_end == 128*1024);
assert(stripes[2].read_start == 4096 && stripes[2].read_end == 128*1024);
assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
assert(stripes[0].read_buf == rmw_buf+128*1024);
assert(stripes[1].read_buf == rmw_buf+128*1024*2);
assert(stripes[2].read_buf == rmw_buf+128*1024*3-4096);
@@ -182,32 +43,24 @@ void test4()
set_pattern(stripes[0].read_buf, 128*1024, PATTERN1); // old data
set_pattern(stripes[1].read_buf, 128*1024-4096, UINT64_MAX); // didn't read it, it's missing
set_pattern(stripes[2].read_buf, 128*1024-4096, 0); // old parity = 0
calc_rmw_parity(stripes, 3, osd_set, osd_set, 128*1024);
calc_rmw_parity(stripes, 3);
check_pattern(stripes[2].write_buf, 4096, PATTERN0^PATTERN1); // new parity
check_pattern(stripes[2].write_buf+4096, 128*1024-4096*2, 0); // new parity
check_pattern(stripes[2].write_buf+128*1024-4096, 4096, PATTERN0^PATTERN1); // new parity
free(rmw_buf);
free(write_buf);
}
void test5()
{
osd_num_t osd_set[3] = { 1, 0, 3 };
osd_rmw_stripe_t stripes[3] = { 0 };
// Test 5.1
memset(stripes, 0, sizeof(stripes));
split_stripes(2, 128*1024, 0, 64*1024*3, stripes);
assert(stripes[0].req_start == 0 && stripes[0].req_end == 128*1024);
assert(stripes[1].req_start == 0 && stripes[1].req_end == 64*1024);
assert(stripes[2].req_end == 0);
// Test 5.2
void *write_buf = malloc(64*1024*3);
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 2, osd_set, 128*1024);
write_buf = malloc(64*1024*3);
rmw_buf = calc_rmw_reads(write_buf, stripes, osd_set, 3, 2, 2);
assert(stripes[0].read_start == 64*1024 && stripes[0].read_end == 128*1024);
assert(stripes[1].read_start == 64*1024 && stripes[1].read_end == 128*1024);
assert(stripes[2].read_start == 64*1024 && stripes[2].read_end == 128*1024);
assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
assert(stripes[1].write_start == 0 && stripes[1].write_end == 64*1024);
assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
assert(stripes[0].read_buf == rmw_buf+128*1024);
assert(stripes[1].read_buf == rmw_buf+64*3*1024);
assert(stripes[2].read_buf == rmw_buf+64*4*1024);
@@ -216,22 +69,15 @@ void test5()
assert(stripes[2].write_buf == rmw_buf);
free(rmw_buf);
free(write_buf);
}
void test6()
{
osd_num_t osd_set[3] = { 1, 2, 3 };
osd_rmw_stripe_t stripes[3] = { 0 };
// Test 6.1
memset(stripes, 0, sizeof(stripes));
split_stripes(2, 128*1024, 0, 64*1024*3, stripes);
void *write_buf = malloc(64*1024*3);
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, osd_set, 128*1024);
osd_set[1] = 2;
write_buf = malloc(64*1024*3);
rmw_buf = calc_rmw_reads(write_buf, stripes, osd_set, 3, 2, 3);
assert(stripes[0].read_end == 0);
assert(stripes[1].read_start == 64*1024 && stripes[1].read_end == 128*1024);
assert(stripes[2].read_end == 0);
assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
assert(stripes[1].write_start == 0 && stripes[1].write_end == 64*1024);
assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
assert(stripes[0].read_buf == 0);
assert(stripes[1].read_buf == rmw_buf+128*1024);
assert(stripes[2].read_buf == 0);
@@ -240,121 +86,8 @@ void test6()
assert(stripes[2].write_buf == rmw_buf);
free(rmw_buf);
free(write_buf);
}
void test7()
{
osd_num_t osd_set[3] = { 1, 0, 3 };
osd_num_t write_osd_set[3] = { 1, 2, 3 };
osd_rmw_stripe_t stripes[3] = { 0 };
// Test 7.1
split_stripes(2, 128*1024, 128*1024-4096, 8192, stripes);
void *write_buf = malloc(8192);
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 2, write_osd_set, 128*1024);
assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
assert(stripes[0].read_buf == rmw_buf+128*1024);
assert(stripes[1].read_buf == rmw_buf+128*1024*2);
assert(stripes[2].read_buf == rmw_buf+128*1024*3);
assert(stripes[0].write_buf == write_buf);
assert(stripes[1].write_buf == write_buf+4096);
assert(stripes[2].write_buf == rmw_buf);
// Test 7.2
set_pattern(write_buf, 8192, PATTERN0);
set_pattern(stripes[0].read_buf, 128*1024, PATTERN1); // old data
set_pattern(stripes[1].read_buf, 128*1024, UINT64_MAX); // didn't read it, it's missing
set_pattern(stripes[2].read_buf, 128*1024, 0); // old parity = 0
calc_rmw_parity(stripes, 3, osd_set, write_osd_set, 128*1024);
assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
assert(stripes[1].write_start == 0 && stripes[1].write_end == 128*1024);
assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
assert(stripes[1].write_buf == stripes[1].read_buf);
check_pattern(stripes[1].write_buf, 4096, PATTERN0);
check_pattern(stripes[1].write_buf+4096, 128*1024-4096, PATTERN1);
check_pattern(stripes[2].write_buf, 4096, PATTERN0^PATTERN1); // new parity
check_pattern(stripes[2].write_buf+4096, 128*1024-4096*2, 0); // new parity
check_pattern(stripes[2].write_buf+128*1024-4096, 4096, PATTERN0^PATTERN1); // new parity
free(rmw_buf);
free(write_buf);
}
void test8()
{
osd_num_t osd_set[3] = { 0, 2, 3 };
osd_num_t write_osd_set[3] = { 1, 2, 3 };
osd_rmw_stripe_t stripes[3] = { 0 };
// Test 8.1
split_stripes(2, 128*1024, 0, 128*1024+4096, stripes);
void *write_buf = malloc(128*1024+4096);
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 2, write_osd_set, 128*1024);
assert(stripes[0].read_start == 0 && stripes[0].read_end == 0);
assert(stripes[1].read_start == 4096 && stripes[1].read_end == 128*1024);
assert(stripes[2].read_start == 0 && stripes[2].read_end == 0);
assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
assert(stripes[0].read_buf == NULL);
assert(stripes[1].read_buf == rmw_buf+128*1024);
assert(stripes[2].read_buf == NULL);
assert(stripes[0].write_buf == write_buf);
assert(stripes[1].write_buf == write_buf+128*1024);
assert(stripes[2].write_buf == rmw_buf);
// Test 8.2
set_pattern(write_buf, 128*1024+4096, PATTERN0);
set_pattern(stripes[1].read_buf, 128*1024-4096, PATTERN1);
calc_rmw_parity(stripes, 3, osd_set, write_osd_set, 128*1024);
assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024); // recheck again
assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096); // recheck again
assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024); // recheck again
assert(stripes[0].write_buf == write_buf); // recheck again
assert(stripes[1].write_buf == write_buf+128*1024); // recheck again
assert(stripes[2].write_buf == rmw_buf); // recheck again
check_pattern(stripes[2].write_buf, 4096, 0); // new parity
check_pattern(stripes[2].write_buf+4096, 128*1024-4096, PATTERN0^PATTERN1); // new parity
free(rmw_buf);
free(write_buf);
}
void test9()
{
osd_num_t osd_set[3] = { 0, 2, 3 };
osd_num_t write_osd_set[3] = { 1, 2, 3 };
osd_rmw_stripe_t stripes[3] = { 0 };
// Test 9.0
split_stripes(2, 128*1024, 64*1024, 0, stripes);
assert(stripes[0].req_start == 0 && stripes[0].req_end == 0);
assert(stripes[1].req_start == 0 && stripes[1].req_end == 0);
assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
// Test 9.1
void *write_buf = NULL;
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024);
assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
assert(stripes[2].write_start == 0 && stripes[2].write_end == 0);
assert(stripes[0].read_buf == rmw_buf);
assert(stripes[1].read_buf == rmw_buf+128*1024);
assert(stripes[2].read_buf == rmw_buf+128*1024*2);
assert(stripes[0].write_buf == NULL);
assert(stripes[1].write_buf == NULL);
assert(stripes[2].write_buf == NULL);
// Test 8.2
set_pattern(stripes[1].read_buf, 128*1024, 0);
set_pattern(stripes[2].read_buf, 128*1024, PATTERN1);
calc_rmw_parity(stripes, 3, osd_set, write_osd_set, 128*1024);
assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
assert(stripes[2].write_start == 0 && stripes[2].write_end == 0);
assert(stripes[0].write_buf == rmw_buf);
assert(stripes[1].write_buf == NULL);
assert(stripes[2].write_buf == NULL);
check_pattern(stripes[0].read_buf, 128*1024, PATTERN1);
check_pattern(stripes[0].write_buf, 128*1024, PATTERN1);
free(rmw_buf);
osd_set[1] = 0;
// End
printf("all ok\n");
return 0;
}

View File

@@ -4,34 +4,45 @@
void osd_t::secondary_op_callback(osd_op_t *op)
{
if (op->req.hdr.opcode == OSD_OP_SECONDARY_READ ||
op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
inflight_ops--;
auto cl_it = clients.find(op->peer_fd);
if (cl_it != clients.end())
{
op->reply.sec_rw.version = op->bs_op->version;
}
else if (op->req.hdr.opcode == OSD_OP_SECONDARY_DELETE)
{
op->reply.sec_del.version = op->bs_op->version;
}
if (op->req.hdr.opcode == OSD_OP_SECONDARY_READ &&
op->bs_op->retval > 0)
{
op->send_list.push_back(op->buf, op->bs_op->retval);
}
else if (op->req.hdr.opcode == OSD_OP_SECONDARY_LIST)
{
// allocated by blockstore
op->buf = op->bs_op->buf;
if (op->bs_op->retval > 0)
op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
op->reply.hdr.id = op->req.hdr.id;
op->reply.hdr.opcode = op->req.hdr.opcode;
op->reply.hdr.retval = op->bs_op->retval;
if (op->req.hdr.opcode == OSD_OP_SECONDARY_READ ||
op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
{
op->send_list.push_back(op->buf, op->bs_op->retval * sizeof(obj_ver_id));
op->reply.sec_rw.version = op->bs_op->version;
}
op->reply.sec_list.stable_count = op->bs_op->version;
else if (op->req.hdr.opcode == OSD_OP_SECONDARY_DELETE)
{
op->reply.sec_del.version = op->bs_op->version;
}
if (op->req.hdr.opcode == OSD_OP_SECONDARY_READ &&
op->reply.hdr.retval > 0)
{
op->send_list.push_back(op->buf, op->reply.hdr.retval);
}
else if (op->req.hdr.opcode == OSD_OP_SECONDARY_LIST)
{
// allocated by blockstore
op->buf = op->bs_op->buf;
if (op->reply.hdr.retval > 0)
{
op->send_list.push_back(op->buf, op->reply.hdr.retval * sizeof(obj_ver_id));
}
op->reply.sec_list.stable_count = op->bs_op->version;
}
auto & cl = cl_it->second;
outbox_push(cl, op);
}
else
{
delete op;
}
int retval = op->bs_op->retval;
delete op->bs_op;
op->bs_op = NULL;
finish_op(op, retval);
}
void osd_t::exec_secondary(osd_op_t *cur_op)
@@ -84,7 +95,7 @@ void osd_t::exec_secondary(osd_op_t *cur_op)
secondary_op_callback(cur_op);
return;
}
cur_op->bs_op->oid.stripe = cur_op->req.sec_list.pg_stripe_size;
cur_op->bs_op->oid.stripe = cur_op->req.sec_list.parity_block_size;
cur_op->bs_op->len = cur_op->req.sec_list.pg_count;
cur_op->bs_op->offset = cur_op->req.sec_list.list_pg - 1;
#ifdef OSD_STUB
@@ -103,10 +114,15 @@ void osd_t::exec_show_config(osd_op_t *cur_op)
{
// FIXME: Send the real config, not its source
std::string cfg_str = json11::Json(config).dump();
cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
cur_op->reply.hdr.id = cur_op->req.hdr.id;
cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
cur_op->reply.hdr.retval = cfg_str.size()+1;
cur_op->buf = malloc(cfg_str.size()+1);
memcpy(cur_op->buf, cfg_str.c_str(), cfg_str.size()+1);
cur_op->send_list.push_back(cur_op->buf, cfg_str.size()+1);
finish_op(cur_op, cfg_str.size()+1);
auto & cl = clients[cur_op->peer_fd];
cur_op->send_list.push_back(cur_op->buf, cur_op->reply.hdr.retval);
outbox_push(cl, cur_op);
}
void osd_t::exec_sync_stab_all(osd_op_t *cur_op)

131
osd_send.cpp Normal file
View File

@@ -0,0 +1,131 @@
#include "osd.h"
void osd_t::outbox_push(osd_client_t & cl, osd_op_t *cur_op)
{
assert(cur_op->peer_fd);
if (cur_op->op_type == OSD_OP_OUT)
{
clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
}
cl.outbox.push_back(cur_op);
if (cl.write_op || cl.outbox.size() > 1 || !try_send(cl))
{
if (cl.write_state == 0)
{
cl.write_state = CL_WRITE_READY;
write_ready_clients.push_back(cur_op->peer_fd);
}
ringloop->wakeup();
}
}
bool osd_t::try_send(osd_client_t & cl)
{
int peer_fd = cl.peer_fd;
io_uring_sqe* sqe = ringloop->get_sqe();
if (!sqe)
{
return false;
}
ring_data_t* data = ((ring_data_t*)sqe->user_data);
if (!cl.write_op)
{
// pick next command
cl.write_op = cl.outbox.front();
cl.outbox.pop_front();
cl.write_state = CL_WRITE_REPLY;
clock_gettime(CLOCK_REALTIME, &cl.write_op->tv_send);
if (cl.write_op->op_type == OSD_OP_IN)
{
// Measure execution latency
timespec tv_end = cl.write_op->tv_send;
op_stat_count[cl.write_op->req.hdr.opcode]++;
op_stat_sum[cl.write_op->req.hdr.opcode] += (
(tv_end.tv_sec - cl.write_op->tv_begin.tv_sec)*1000000 +
(tv_end.tv_nsec - cl.write_op->tv_begin.tv_nsec)/1000
);
}
}
cl.write_msg.msg_iov = cl.write_op->send_list.get_iovec();
cl.write_msg.msg_iovlen = cl.write_op->send_list.get_size();
data->callback = [this, peer_fd](ring_data_t *data) { handle_send(data, peer_fd); };
my_uring_prep_sendmsg(sqe, peer_fd, &cl.write_msg, 0);
return true;
}
void osd_t::send_replies()
{
for (int i = 0; i < write_ready_clients.size(); i++)
{
int peer_fd = write_ready_clients[i];
if (!try_send(clients[peer_fd]))
{
write_ready_clients.erase(write_ready_clients.begin(), write_ready_clients.begin() + i);
return;
}
}
write_ready_clients.clear();
}
void osd_t::handle_send(ring_data_t *data, int peer_fd)
{
auto cl_it = clients.find(peer_fd);
if (cl_it != clients.end())
{
auto & cl = cl_it->second;
if (data->res < 0 && data->res != -EAGAIN)
{
// this is a client socket, so don't panic. just disconnect it
printf("Client %d socket write error: %d (%s). Disconnecting client\n", peer_fd, -data->res, strerror(-data->res));
stop_client(peer_fd);
return;
}
if (data->res >= 0)
{
osd_op_t *cur_op = cl.write_op;
while (data->res > 0 && cur_op->send_list.sent < cur_op->send_list.count)
{
iovec & iov = cur_op->send_list.buf[cur_op->send_list.sent];
if (iov.iov_len <= data->res)
{
data->res -= iov.iov_len;
cur_op->send_list.sent++;
}
else
{
iov.iov_len -= data->res;
iov.iov_base += data->res;
break;
}
}
if (cur_op->send_list.sent >= cur_op->send_list.count)
{
// Done
if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_STABILIZE)
{
timespec tv_end;
clock_gettime(CLOCK_REALTIME, &tv_end);
send_stat_count++;
send_stat_sum += (
(tv_end.tv_sec - cl.write_op->tv_send.tv_sec)*1000000 +
(tv_end.tv_nsec - cl.write_op->tv_send.tv_nsec)/1000
);
}
if (cur_op->op_type == OSD_OP_IN)
{
delete cur_op;
}
else
{
cl.sent_ops[cl.write_op->req.hdr.id] = cl.write_op;
}
cl.write_op = NULL;
cl.write_state = cl.outbox.size() > 0 ? CL_WRITE_READY : 0;
}
}
if (cl.write_state != 0)
{
write_ready_clients.push_back(peer_fd);
}
}
}

View File

@@ -19,8 +19,6 @@
int connect_osd(const char *osd_address, int osd_port);
uint64_t test_read(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t version, uint64_t offset, uint64_t len);
uint64_t test_write(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t version, uint64_t pattern);
void* test_primary_read(int connect_fd, uint64_t inode, uint64_t offset, uint64_t len);
@@ -31,8 +29,6 @@ void test_primary_sync(int connect_fd);
void test_sync_stab_all(int connect_fd);
void test_list_stab(int connect_fd);
int main0(int narg, char *args[])
{
int connect_fd;
@@ -98,16 +94,7 @@ int main2(int narg, char *args[])
return 0;
}
int main3(int narg, char *args[])
{
int connect_fd;
connect_fd = connect_osd("127.0.0.1", 11203);
test_list_stab(connect_fd);
close(connect_fd);
return 0;
}
int main4(int narg, char *args[])
int main(int narg, char *args[])
{
int connect_fd;
// Cluster write (sync not implemented yet)
@@ -119,15 +106,6 @@ int main4(int narg, char *args[])
return 0;
}
int main(int narg, char *args[])
{
int connect_fd;
connect_fd = connect_osd("192.168.7.2", 43051);
test_read(connect_fd, 1, 1039663104, UINT64_MAX, 0, 128*1024);
close(connect_fd);
return 0;
}
int connect_osd(const char *osd_address, int osd_port)
{
struct sockaddr_in addr;
@@ -170,7 +148,7 @@ bool check_reply(int r, osd_any_op_t & op, osd_any_reply_t & reply, int expected
printf("bad reply: magic, id or opcode does not match request\n");
return false;
}
if (expected >= 0 && reply.hdr.retval != expected)
if (reply.hdr.retval != expected)
{
printf("operation failed, retval=%ld\n", reply.hdr.retval);
return false;
@@ -178,66 +156,6 @@ bool check_reply(int r, osd_any_op_t & op, osd_any_reply_t & reply, int expected
return true;
}
uint64_t test_read(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t version, uint64_t offset, uint64_t len)
{
osd_any_op_t op;
osd_any_reply_t reply;
op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
op.hdr.id = 1;
op.hdr.opcode = OSD_OP_SECONDARY_READ;
op.sec_rw.oid = {
.inode = inode,
.stripe = stripe,
};
op.sec_rw.version = version;
op.sec_rw.offset = offset;
op.sec_rw.len = len;
void *data = memalign(MEM_ALIGNMENT, op.sec_rw.len);
write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
int r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
if (!check_reply(r, op, reply, op.sec_rw.len))
{
free(data);
return 0;
}
r = read_blocking(connect_fd, data, len);
if (r != len)
{
free(data);
perror("read data");
return 0;
}
free(data);
printf("Read %lu:%lu v%lu = v%lu\n", inode, stripe, version, reply.sec_rw.version);
op.hdr.opcode = OSD_OP_SECONDARY_LIST;
op.sec_list.list_pg = 1;
op.sec_list.pg_count = 1;
op.sec_list.pg_stripe_size = 4*1024*1024;
write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
if (reply.hdr.retval < 0 || !check_reply(r, op, reply, reply.hdr.retval))
{
return 0;
}
data = memalign(MEM_ALIGNMENT, sizeof(obj_ver_id)*reply.hdr.retval);
r = read_blocking(connect_fd, data, sizeof(obj_ver_id)*reply.hdr.retval);
if (r != sizeof(obj_ver_id)*reply.hdr.retval)
{
free(data);
perror("read data");
return 0;
}
obj_ver_id *ov = (obj_ver_id*)data;
for (int i = 0; i < reply.hdr.retval; i++)
{
if (ov[i].oid.inode == inode && (ov[i].oid.stripe & ~(4096-1)) == (stripe & ~(4096-1)))
{
printf("list: %lu:%lu v%lu stable=%d\n", ov[i].oid.inode, ov[i].oid.stripe, ov[i].version, i < reply.sec_list.stable_count ? 1 : 0);
}
}
return 0;
}
uint64_t test_write(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t version, uint64_t pattern)
{
osd_any_op_t op;
@@ -252,7 +170,7 @@ uint64_t test_write(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t ve
op.sec_rw.version = version;
op.sec_rw.offset = 0;
op.sec_rw.len = 128*1024;
void *data = memalign(MEM_ALIGNMENT, op.sec_rw.len);
void *data = memalign(512, op.sec_rw.len);
for (int i = 0; i < (op.sec_rw.len)/sizeof(uint64_t); i++)
((uint64_t*)data)[i] = pattern;
write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
@@ -287,7 +205,7 @@ void* test_primary_read(int connect_fd, uint64_t inode, uint64_t offset, uint64_
op.rw.inode = inode;
op.rw.offset = offset;
op.rw.len = len;
void *data = memalign(MEM_ALIGNMENT, len);
void *data = memalign(512, len);
write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
int r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
if (!check_reply(r, op, reply, len))
@@ -315,7 +233,7 @@ void test_primary_write(int connect_fd, uint64_t inode, uint64_t offset, uint64_
op.rw.inode = inode;
op.rw.offset = offset;
op.rw.len = len;
void *data = memalign(MEM_ALIGNMENT, len);
void *data = memalign(512, len);
set_pattern(data, len, pattern);
write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
write_blocking(connect_fd, data, len);
@@ -347,40 +265,3 @@ void test_sync_stab_all(int connect_fd)
int r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
assert(check_reply(r, op, reply, 0));
}
void test_list_stab(int connect_fd)
{
osd_any_op_t op;
osd_any_reply_t reply;
op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
op.hdr.id = 1;
op.hdr.opcode = OSD_OP_SECONDARY_LIST;
op.sec_list.pg_count = 0;
assert(write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE) == OSD_PACKET_SIZE);
int r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
assert(check_reply(r, op, reply, -1));
int total_count = reply.hdr.retval;
int stable_count = reply.sec_list.stable_count;
obj_ver_id *data = (obj_ver_id*)malloc(total_count * sizeof(obj_ver_id));
assert(data);
assert(read_blocking(connect_fd, data, total_count * sizeof(obj_ver_id)) == (total_count * sizeof(obj_ver_id)));
int last_start = stable_count;
for (int i = stable_count; i <= total_count; i++)
{
// Stabilize in portions of 32 entries
if (i - last_start >= 32 || i == total_count)
{
op.hdr.opcode = OSD_OP_SECONDARY_STABILIZE;
op.sec_stab.len = sizeof(obj_ver_id) * (i - last_start);
assert(write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE) == OSD_PACKET_SIZE);
assert(write_blocking(connect_fd, data + last_start, op.sec_stab.len) == op.sec_stab.len);
r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
assert(check_reply(r, op, reply, 0));
last_start = i;
}
}
obj_ver_id *data2 = (obj_ver_id*)malloc(sizeof(obj_ver_id) * 32);
assert(data2);
free(data2);
free(data);
}

View File

@@ -1,33 +0,0 @@
#include "pg_states.h"
const int pg_state_bit_count = 13;
const int pg_state_bits[13] = {
PG_STARTING,
PG_PEERING,
PG_INCOMPLETE,
PG_ACTIVE,
PG_STOPPING,
PG_OFFLINE,
PG_DEGRADED,
PG_HAS_INCOMPLETE,
PG_HAS_DEGRADED,
PG_HAS_MISPLACED,
PG_HAS_UNCLEAN,
PG_LEFT_ON_DEAD,
};
const char *pg_state_names[13] = {
"starting",
"peering",
"incomplete",
"active",
"stopping",
"offline",
"degraded",
"has_incomplete",
"has_degraded",
"has_misplaced",
"has_unclean",
"left_on_dead",
};

View File

@@ -1,33 +0,0 @@
#pragma once
// Placement group states
// STARTING -> [acquire lock] -> PEERING -> INCOMPLETE|ACTIVE -> STOPPING -> OFFLINE -> [release lock]
// Exactly one of these:
#define PG_STARTING (1<<0)
#define PG_PEERING (1<<1)
#define PG_INCOMPLETE (1<<2)
#define PG_ACTIVE (1<<3)
#define PG_STOPPING (1<<4)
#define PG_OFFLINE (1<<5)
// Plus any of these:
#define PG_DEGRADED (1<<6)
#define PG_HAS_INCOMPLETE (1<<7)
#define PG_HAS_DEGRADED (1<<8)
#define PG_HAS_MISPLACED (1<<9)
#define PG_HAS_UNCLEAN (1<<10)
#define PG_LEFT_ON_DEAD (1<<11)
// FIXME: Safe default that doesn't depend on pg_stripe_size or pg_block_size
#define STRIPE_MASK ((uint64_t)4096 - 1)
// OSD object states
#define OBJ_DEGRADED 0x02
#define OBJ_INCOMPLETE 0x04
#define OBJ_MISPLACED 0x08
#define OBJ_NEEDS_STABLE 0x10000
#define OBJ_NEEDS_ROLLBACK 0x20000
#define OBJ_BUGGY 0x80000
extern const int pg_state_bits[];
extern const char *pg_state_names[];
extern const int pg_state_bit_count;

View File

@@ -18,7 +18,6 @@ ring_loop_t::ring_loop_t(int qd)
{
free_ring_data[i] = i;
}
wait_sqe_id = 1;
}
ring_loop_t::~ring_loop_t()
@@ -28,10 +27,11 @@ ring_loop_t::~ring_loop_t()
io_uring_queue_exit(&ring);
}
void ring_loop_t::register_consumer(ring_consumer_t *consumer)
int ring_loop_t::register_consumer(ring_consumer_t & consumer)
{
unregister_consumer(consumer);
consumer.number = consumers.size();
consumers.push_back(consumer);
return consumer.number;
}
void ring_loop_t::wakeup()
@@ -39,15 +39,12 @@ void ring_loop_t::wakeup()
loop_again = true;
}
void ring_loop_t::unregister_consumer(ring_consumer_t *consumer)
void ring_loop_t::unregister_consumer(ring_consumer_t & consumer)
{
for (int i = 0; i < consumers.size(); i++)
if (consumer.number >= 0 && consumer.number < consumers.size())
{
if (consumers[i] == consumer)
{
consumers.erase(consumers.begin()+i, consumers.begin()+i+1);
break;
}
consumers[consumer.number].loop = NULL;
consumer.number = -1;
}
}
@@ -65,17 +62,12 @@ void ring_loop_t::loop()
free_ring_data[free_ring_data_ptr++] = d - ring_datas;
io_uring_cqe_seen(&ring, cqe);
}
while (get_sqe_queue.size() > 0)
{
(get_sqe_queue[0].second)();
get_sqe_queue.erase(get_sqe_queue.begin());
}
do
{
loop_again = false;
for (int i = 0; i < consumers.size(); i++)
{
consumers[i]->loop();
consumers[i].loop();
}
} while (loop_again);
}

View File

@@ -113,24 +113,23 @@ struct ring_data_t
struct ring_consumer_t
{
int number;
std::function<void(void)> loop;
};
class ring_loop_t
{
std::vector<std::pair<int,std::function<void()>>> get_sqe_queue;
std::vector<ring_consumer_t*> consumers;
std::vector<ring_consumer_t> consumers;
struct ring_data_t *ring_datas;
int *free_ring_data;
int wait_sqe_id;
unsigned free_ring_data_ptr;
bool loop_again;
struct io_uring ring;
public:
ring_loop_t(int qd);
~ring_loop_t();
void register_consumer(ring_consumer_t *consumer);
void unregister_consumer(ring_consumer_t *consumer);
int register_consumer(ring_consumer_t & consumer);
void unregister_consumer(ring_consumer_t & consumer);
inline struct io_uring_sqe* get_sqe()
{
@@ -141,35 +140,19 @@ public:
io_uring_sqe_set_data(sqe, ring_datas + free_ring_data[--free_ring_data_ptr]);
return sqe;
}
inline int wait_sqe(std::function<void()> cb)
{
get_sqe_queue.push_back({ wait_sqe_id, cb });
return wait_sqe_id++;
}
inline void cancel_wait_sqe(int wait_id)
{
for (int i = 0; i < get_sqe_queue.size(); i++)
{
if (get_sqe_queue[i].first == wait_id)
{
get_sqe_queue.erase(get_sqe_queue.begin()+i, get_sqe_queue.begin()+i+1);
}
}
}
inline int submit()
{
return io_uring_submit(&ring);
}
inline int wait()
{
struct io_uring_cqe *cqe;
return io_uring_wait_cqe(&ring, &cqe);
return io_uring_submit_and_wait(&ring, 1);
}
inline unsigned space_left()
{
return free_ring_data_ptr;
}
inline bool has_work()
inline bool get_loop_again()
{
return loop_again;
}

View File

@@ -51,40 +51,6 @@ int write_blocking(int fd, void *write_buf, size_t remaining)
return done;
}
int readv_blocking(int fd, iovec *iov, int iovcnt)
{
int v = 0;
int done = 0;
while (v < iovcnt)
{
ssize_t r = readv(fd, iov, iovcnt);
if (r < 0)
{
if (errno != EAGAIN && errno != EPIPE)
{
perror("writev");
exit(1);
}
continue;
}
while (v < iovcnt)
{
if (iov[v].iov_len > r)
{
iov[v].iov_len -= r;
iov[v].iov_base += r;
break;
}
else
{
v++;
}
}
done += r;
}
return done;
}
int writev_blocking(int fd, iovec *iov, int iovcnt)
{
int v = 0;

View File

@@ -5,5 +5,4 @@
int read_blocking(int fd, void *read_buf, size_t remaining);
int write_blocking(int fd, void *write_buf, size_t remaining);
int readv_blocking(int fd, iovec *iov, int iovcnt);
int writev_blocking(int fd, iovec *iov, int iovcnt);

View File

@@ -25,37 +25,20 @@ int connect_stub(const char *server_address, int server_port);
void run_bench(int peer_fd);
static uint64_t read_sum = 0, read_count = 0;
static uint64_t write_sum = 0, write_count = 0;
static uint64_t sync_sum = 0, sync_count = 0;
void handle_sigint(int sig)
{
printf("4k randread: %lu us avg\n", read_count ? read_sum/read_count : 0);
printf("4k randwrite: %lu us avg\n", write_count ? write_sum/write_count : 0);
printf("sync: %lu us avg\n", sync_count ? sync_sum/sync_count : 0);
printf("4k randwrite: %lu us avg\n", write_sum/write_count);
printf("sync: %lu us avg\n", sync_sum/sync_count);
exit(0);
}
int main(int narg, char *args[])
{
if (narg < 2)
{
printf("USAGE: %s SERVER_IP [PORT]\n", args[0]);
return 1;
}
int port = 11203;
if (narg >= 3)
{
port = atoi(args[2]);
if (port <= 0 || port >= 65536)
{
printf("Bad port number\n");
return 1;
}
}
signal(SIGINT, handle_sigint);
int peer_fd = connect_stub(args[1], port);
int peer_fd = connect_stub("127.0.0.1", 11203);
run_bench(peer_fd);
close(peer_fd);
return 0;
@@ -115,37 +98,10 @@ void run_bench(int peer_fd)
osd_any_reply_t reply;
void *buf = NULL;
int r;
iovec iov[2];
timespec tv_begin, tv_end;
clock_gettime(CLOCK_REALTIME, &tv_begin);
while (1)
{
// read
op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
op.hdr.id = 1;
op.hdr.opcode = OSD_OP_SECONDARY_READ;
op.sec_rw.oid.inode = 3;
op.sec_rw.oid.stripe = (rand() << 17) % (1 << 29); // 512 MB
op.sec_rw.version = 0;
op.sec_rw.len = 4096;
op.sec_rw.offset = (rand() * op.sec_rw.len) % (1 << 17);
r = write_blocking(peer_fd, op.buf, OSD_PACKET_SIZE) == OSD_PACKET_SIZE;
if (!r)
break;
buf = malloc(op.sec_rw.len);
iov[0] = { reply.buf, OSD_PACKET_SIZE };
iov[1] = { buf, op.sec_rw.len };
r = readv_blocking(peer_fd, iov, 2) == (OSD_PACKET_SIZE + op.sec_rw.len);
free(buf);
if (!r || !check_reply(OSD_PACKET_SIZE, op, reply, op.sec_rw.len))
break;
clock_gettime(CLOCK_REALTIME, &tv_end);
read_count++;
read_sum += (
(tv_end.tv_sec - tv_begin.tv_sec)*1000000 +
tv_end.tv_nsec/1000 - tv_begin.tv_nsec/1000
);
tv_begin = tv_end;
// write
op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
op.hdr.id = 1;
@@ -157,9 +113,9 @@ void run_bench(int peer_fd)
op.sec_rw.offset = (rand() * op.sec_rw.len) % (1 << 17);
buf = malloc(op.sec_rw.len);
memset(buf, rand() % 255, op.sec_rw.len);
iov[0] = { op.buf, OSD_PACKET_SIZE };
iov[1] = { buf, op.sec_rw.len };
r = writev_blocking(peer_fd, iov, 2) == (OSD_PACKET_SIZE + op.sec_rw.len);
r = write_blocking(peer_fd, op.buf, OSD_PACKET_SIZE) == OSD_PACKET_SIZE;
if (r)
r = write_blocking(peer_fd, buf, op.sec_rw.len) == op.sec_rw.len;
free(buf);
if (!r)
break;
@@ -172,7 +128,6 @@ void run_bench(int peer_fd)
(tv_end.tv_sec - tv_begin.tv_sec)*1000000 +
tv_end.tv_nsec/1000 - tv_begin.tv_nsec/1000
);
tv_begin = tv_end;
// sync/stab
op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
op.hdr.id = 1;
@@ -183,12 +138,11 @@ void run_bench(int peer_fd)
r = read_blocking(peer_fd, reply.buf, OSD_PACKET_SIZE);
if (!check_reply(r, op, reply, 0))
break;
clock_gettime(CLOCK_REALTIME, &tv_end);
clock_gettime(CLOCK_REALTIME, &tv_begin);
sync_count++;
sync_sum += (
(tv_end.tv_sec - tv_begin.tv_sec)*1000000 +
tv_end.tv_nsec/1000 - tv_begin.tv_nsec/1000
(tv_begin.tv_sec - tv_end.tv_sec)*1000000 +
tv_begin.tv_nsec/1000 - tv_end.tv_nsec/1000
);
tv_begin = tv_end;
}
}

View File

@@ -1,129 +0,0 @@
/**
* Stub "OSD" implemented on top of osd_messenger to test & compare
* network performance with sync read/write and io_uring
*/
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <arpa/inet.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <stdlib.h>
#include <stdexcept>
#include "ringloop.h"
#include "epoll_manager.h"
#include "messenger.h"
int bind_stub(const char *bind_address, int bind_port);
void stub_exec_op(osd_messenger_t *msgr, osd_op_t *op);
int main(int narg, char *args[])
{
ring_consumer_t looper;
ring_loop_t *ringloop = new ring_loop_t(512);
epoll_manager_t *epmgr = new epoll_manager_t(ringloop);
osd_messenger_t *msgr = new osd_messenger_t();
msgr->osd_num = 1351;
msgr->tfd = epmgr->tfd;
msgr->ringloop = ringloop;
msgr->repeer_pgs = [](osd_num_t) {};
msgr->exec_op = [msgr](osd_op_t *op) { stub_exec_op(msgr, op); };
// Accept new connections
int listen_fd = bind_stub("0.0.0.0", 11203);
epmgr->set_fd_handler(listen_fd, [listen_fd, msgr](int fd, int events)
{
msgr->accept_connections(listen_fd);
});
looper.loop = [msgr, ringloop]()
{
msgr->read_requests();
msgr->send_replies();
ringloop->submit();
};
ringloop->register_consumer(&looper);
printf("stub_uring_osd: waiting for clients\n");
while (true)
{
ringloop->loop();
ringloop->wait();
}
delete msgr;
delete epmgr;
delete ringloop;
return 0;
}
int bind_stub(const char *bind_address, int bind_port)
{
int listen_backlog = 128;
int listen_fd = socket(AF_INET, SOCK_STREAM, 0);
if (listen_fd < 0)
{
throw std::runtime_error(std::string("socket: ") + strerror(errno));
}
int enable = 1;
setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable));
sockaddr_in addr;
int r;
if ((r = inet_pton(AF_INET, bind_address, &addr.sin_addr)) != 1)
{
close(listen_fd);
throw std::runtime_error("bind address "+std::string(bind_address)+(r == 0 ? " is not valid" : ": no ipv4 support"));
}
addr.sin_family = AF_INET;
addr.sin_port = htons(bind_port);
if (bind(listen_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
{
close(listen_fd);
throw std::runtime_error(std::string("bind: ") + strerror(errno));
}
if (listen(listen_fd, listen_backlog) < 0)
{
close(listen_fd);
throw std::runtime_error(std::string("listen: ") + strerror(errno));
}
fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
return listen_fd;
}
void stub_exec_op(osd_messenger_t *msgr, osd_op_t *op)
{
op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
op->reply.hdr.id = op->req.hdr.id;
op->reply.hdr.opcode = op->req.hdr.opcode;
op->send_list.push_back(op->reply.buf, OSD_PACKET_SIZE);
if (op->req.hdr.opcode == OSD_OP_SECONDARY_READ)
{
op->reply.hdr.retval = op->req.sec_rw.len;
op->buf = malloc(op->req.sec_rw.len);
op->send_list.push_back(op->buf, op->req.sec_rw.len);
}
else if (op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
{
op->reply.hdr.retval = op->req.sec_rw.len;
}
else if (op->req.hdr.opcode == OSD_OP_TEST_SYNC_STAB_ALL)
{
op->reply.hdr.retval = 0;
}
else
{
printf("client %d: unsupported stub opcode: %lu\n", op->peer_fd, op->req.hdr.opcode);
op->reply.hdr.retval = -EINVAL;
}
msgr->outbox_push(op);
}

325
test.cpp
View File

@@ -13,7 +13,6 @@
#include <assert.h>
#include <stdio.h>
#include <liburing.h>
#include <math.h>
#include <sys/socket.h>
#include <sys/epoll.h>
@@ -62,6 +61,24 @@ static void test_write(struct io_uring *ring, int fd)
free(buf);
}
class obj_ver_hash
{
public:
size_t operator()(const obj_ver_id &s) const
{
size_t seed = 0;
spp::hash_combine(seed, s.oid.inode);
spp::hash_combine(seed, s.oid.stripe);
spp::hash_combine(seed, s.version);
return seed;
}
};
inline bool operator == (const obj_ver_id & a, const obj_ver_id & b)
{
return a.oid == b.oid && a.version == b.version;
}
int main00(int argc, char *argv[])
{
// queue with random removal: vector is best :D
@@ -153,9 +170,9 @@ int main0(int argc, char *argv[])
// btree_map 5M entries monotone -> 0.458s, random -> 5.429s
// absl::btree_map 5M entries random -> 5.09s
// sparse_hash_map 5M entries -> 2.193s, random -> 2.586s
btree::btree_map<obj_ver_id, dirty_entry> dirty_db;
//btree::btree_map<obj_ver_id, dirty_entry> dirty_db;
//std::map<obj_ver_id, dirty_entry> dirty_db;
//spp::sparse_hash_map<obj_ver_id, dirty_entry, obj_ver_hash> dirty_db;
spp::sparse_hash_map<obj_ver_id, dirty_entry, obj_ver_hash> dirty_db;
for (int i = 0; i < 5000000; i++)
{
dirty_db[(obj_ver_id){
@@ -165,7 +182,7 @@ int main0(int argc, char *argv[])
},
.version = 1,
}] = (dirty_entry){
.state = ST_D_SYNCED,
.state = ST_D_META_SYNCED,
.flags = 0,
.location = (uint64_t)i << 17,
.offset = 0,
@@ -320,253 +337,87 @@ int main04(int argc, char *argv[])
return 0;
}
uint64_t jumphash(uint64_t key, int count)
int main05(int argc, char *argv[])
{
uint64_t b = 0;
uint64_t seed = key;
for (int j = 1; j < count; j++)
// FIXME extract this into a test
pg_t pg = {
.state = PG_PEERING,
.pg_num = 1,
.target_set = { 1, 2, 3 },
.cur_set = { 1, 2, 3 },
.peering_state = new pg_peering_state_t(),
};
for (uint64_t osd_num = 1; osd_num <= 3; osd_num++)
{
seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
if (seed < (UINT64_MAX / (j+1)))
pg_list_result_t r = {
.buf = (obj_ver_id*)malloc(sizeof(obj_ver_id) * 1024*1024*8),
.total_count = 1024*1024*8,
.stable_count = (uint64_t)(1024*1024*8 - (osd_num == 1 ? 10 : 0)),
};
for (uint64_t i = 0; i < r.total_count; i++)
{
b = j;
r.buf[i] = {
.oid = {
.inode = 1,
.stripe = (i << STRIPE_SHIFT) | (osd_num-1),
},
.version = (uint64_t)(osd_num == 1 && i >= r.total_count - 10 ? 2 : 1),
};
}
pg.peering_state->list_results[osd_num] = r;
}
return b;
}
void jumphash_prepare(int count, uint64_t *out_weights, uint64_t *in_weights)
{
if (count <= 0)
pg.calc_object_states();
printf("deviation variants=%ld clean=%lu\n", pg.state_dict.size(), pg.clean_count);
for (auto it: pg.state_dict)
{
return;
}
uint64_t total_weight = in_weights[0];
out_weights[0] = UINT64_MAX;
for (int j = 1; j < count; j++)
{
total_weight += in_weights[j];
out_weights[j] = UINT64_MAX / total_weight * in_weights[j];
}
}
uint64_t jumphash_weights(uint64_t key, int count, uint64_t *prepared_weights)
{
uint64_t b = 0;
uint64_t seed = key;
for (int j = 1; j < count; j++)
{
seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
if (seed < prepared_weights[j])
{
b = j;
}
}
return b;
}
void jumphash3(uint64_t key, int count, uint64_t *weights, uint64_t *r)
{
r[0] = 0;
r[1] = 1;
r[2] = 2;
uint64_t total_weight = weights[0]+weights[1]+weights[2];
uint64_t seed = key;
for (int j = 3; j < count; j++)
{
seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
total_weight += weights[j];
if (seed < UINT64_MAX*1.0*weights[j]/total_weight)
r[0] = j;
else
{
seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
if (seed < UINT64_MAX*1.0*weights[j]/total_weight)
r[1] = j;
else
{
seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
if (seed < UINT64_MAX*1.0*weights[j]/total_weight)
r[2] = j;
}
}
}
}
uint64_t crush(uint64_t key, int count, uint64_t *weights)
{
uint64_t b = 0;
uint64_t seed = 0;
uint64_t max = 0;
for (int j = 0; j < count; j++)
{
seed = (key + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
seed ^= (j + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
seed = -log(((double)seed) / (1ul << 32) / (1ul << 32)) * weights[j];
if (seed > max)
{
max = seed;
b = j;
}
}
return b;
}
void crush3(uint64_t key, int count, uint64_t *weights, uint64_t *r, uint64_t total_weight)
{
uint64_t seed = 0;
uint64_t max = 0;
for (int k1 = 0; k1 < count; k1++)
{
for (int k2 = k1+1; k2 < count; k2++)
{
if (k2 == k1)
{
continue;
}
for (int k3 = k2+1; k3 < count; k3++)
{
if (k3 == k1 || k3 == k2)
{
continue;
}
seed = (key + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
seed ^= (k1 + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
seed ^= (k2 + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
seed ^= (k3 + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
//seed = ((double)seed) / (1ul << 32) / (1ul << 32) * (weights[k1] + weights[k2] + weights[k3]);
seed = ((double)seed) / (1ul << 32) / (1ul << 32) * (1 -
(1 - 1.0*weights[k1]/total_weight)*
(1 - 1.0*weights[k2]/total_weight)*
(1 - 1.0*weights[k3]/total_weight)
) * UINT64_MAX;
if (seed > max)
{
r[0] = k1;
r[1] = k2;
r[2] = k3;
max = seed;
}
}
}
printf("dev: state=%lx\n", it.second.state);
}
return 0;
}
int main(int argc, char *argv[])
{
int host_count = 6;
uint64_t host_weights[] = {
34609*3,
34931*3,
35850+36387+35859,
36387,
36387*2,
36387,
};
/*int osd_count[] = { 3, 3, 3, 1, 2 };
uint64_t osd_weights[][3] = {
{ 34609, 34609, 34609 },
{ 34931, 34931, 34931 },
{ 35850, 36387, 35859 },
{ 36387 },
{ 36387, 36387 },
};*/
uint64_t total_weight = 0;
for (int i = 0; i < host_count; i++)
timeval fill_start, fill_end, filter_end;
spp::sparse_hash_map<object_id, clean_entry> clean_db;
//std::map<object_id, clean_entry> clean_db;
//btree::btree_map<object_id, clean_entry> clean_db;
gettimeofday(&fill_start, NULL);
printf("filling\n");
uint64_t total = 1024*1024*8*4;
clean_db.resize(total);
for (uint64_t i = 0; i < total; i++)
{
total_weight += host_weights[i];
clean_db[(object_id){
.inode = 1,
//.stripe = (i << STRIPE_SHIFT),
.stripe = (((367*i) % total) << STRIPE_SHIFT),
}] = (clean_entry){
.version = 1,
.location = i << DEFAULT_ORDER,
};
}
uint64_t host_weights_prepared[host_count];
jumphash_prepare(host_count, host_weights_prepared, host_weights);
uint64_t total_pgs[host_count] = { 0 };
int pg_count = 256;
double uniformity[pg_count] = { 0 };
for (uint64_t pg = 1; pg <= pg_count; pg++)
gettimeofday(&fill_end, NULL);
// no resize():
// spp = 17.87s (seq), 41.81s (rand), 3.29s (seq+resize), 8.3s (rand+resize), ~1.3G RAM in all cases
// std::unordered_map = 6.14 sec, ~2.3G RAM
// std::map = 13 sec (seq), 5.54 sec (rand), ~2.5G RAM
// cpp-btree = 2.47 sec (seq) ~1.2G RAM, 20.6 sec (pseudo-random 367*i % total) ~1.5G RAM
printf("filled %.2f sec\n", (fill_end.tv_sec - fill_start.tv_sec) + (fill_end.tv_usec - fill_start.tv_usec) / 1000000.0);
for (int pg = 0; pg < 100; pg++)
{
uint64_t r[3];
/*
// Select first host
//r[0] = jumphash_weights(pg, host_count, host_weights_prepared);
r[0] = crush(pg, host_count, host_weights);
// Select second host
uint64_t seed = pg;
r[1] = r[0];
while (r[1] == r[0])
{
seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
//r[1] = jumphash_weights(seed, host_count, host_weights_prepared);
r[1] = crush(seed, host_count, host_weights);
}
// Select third host
seed = pg;
r[2] = r[0];
while (r[2] == r[0] || r[2] == r[1])
{
seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
//r[2] = jumphash_weights(seed, host_count, host_weights_prepared);
r[2] = crush(seed, host_count, host_weights);
}
*/
/*
// Select second host
uint64_t host_weights1[host_count];
for (int i = 0; i < r[0]; i++)
host_weights1[i] = host_weights[i];
for (int i = r[0]+1; i < host_count; i++)
host_weights1[i-1] = host_weights[i];
r[1] = crush(pg, host_count-1, host_weights1);
// Select third host
for (int i = r[1]+1; i < host_count-1; i++)
host_weights1[i-1] = host_weights[i];
r[2] = crush(pg, host_count-2, host_weights1);
// Transform numbers
r[2] = r[2] >= r[1] ? 1+r[2] : r[2];
r[2] = r[2] >= r[0] ? 1+r[2] : r[2];
r[1] = r[1] >= r[0] ? 1+r[1] : r[1];
*/
crush3(pg, host_count, host_weights, r, total_weight);
uint64_t shift = (2862933555777941757ull*pg + 3037000493ull) % host_count;
if (shift == 1)
{
uint64_t tmp;
tmp = r[0];
r[0] = r[1];
r[1] = r[2];
r[2] = tmp;
}
else if (shift == 2)
{
uint64_t tmp;
tmp = r[0];
r[0] = r[2];
r[2] = r[1];
r[1] = tmp;
}
total_pgs[r[0]]++;
total_pgs[r[1]]++;
total_pgs[r[2]]++;
double u = 0;
for (int i = 0; i < host_count; i++)
{
double d = abs(1 - total_pgs[i]/3.0/pg * total_weight/host_weights[i]);
u += d;
}
uniformity[pg-1] = u/host_count;
printf("pg %lu: hosts %lu, %lu, %lu ; avg deviation = %.2f\n", pg, r[0], r[1], r[2], u/host_count);
obj_ver_id* buf1 = (obj_ver_id*)malloc(sizeof(obj_ver_id) * ((total+99)/100));
int j = 0;
for (auto it: clean_db)
if ((it.first % 100) == pg)
buf1[j++] = { .oid = it.first, .version = it.second.version };
free(buf1);
printf("filtered %d\n", j);
}
printf("total PGs: ");
for (int i = 0; i < host_count; i++)
{
printf(i > 0 ? ", %lu (%.2f)" : "%lu (%.2f)", total_pgs[i], total_pgs[i]/3.0/pg_count * total_weight/host_weights[i]);
}
printf("\n");
gettimeofday(&filter_end, NULL);
// spp = 42.15 sec / 60 sec (rand)
// std::unordered_map = 43.7 sec
// std::map = 156.13 sec
// cpp-btree = 21.87 sec (seq), 44.33 sec (rand)
printf("100 times filter %.2f sec\n", (filter_end.tv_sec - fill_end.tv_sec) + (filter_end.tv_usec - fill_end.tv_usec) / 1000000.0);
return 0;
}

View File

@@ -115,7 +115,7 @@ int main(int narg, char *args[])
}
};
ringloop->register_consumer(&main_cons);
ringloop->register_consumer(main_cons);
while (1)
{
ringloop->loop();

View File

@@ -20,14 +20,14 @@ timerfd_interval::timerfd_interval(ring_loop_t *ringloop, int seconds, std::func
throw std::runtime_error(std::string("timerfd_settime: ") + strerror(errno));
}
consumer.loop = [this]() { loop(); };
ringloop->register_consumer(&consumer);
ringloop->register_consumer(consumer);
this->ringloop = ringloop;
this->callback = cb;
}
timerfd_interval::~timerfd_interval()
{
ringloop->unregister_consumer(&consumer);
ringloop->unregister_consumer(consumer);
close(timerfd);
}

View File

@@ -6,6 +6,7 @@ class timerfd_interval
{
int wait_state;
int timerfd;
int status;
ring_loop_t *ringloop;
ring_consumer_t consumer;
std::function<void(void)> callback;

View File

@@ -1,159 +0,0 @@
#include <sys/timerfd.h>
#include <sys/poll.h>
#include <sys/epoll.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include "timerfd_manager.h"
timerfd_manager_t::timerfd_manager_t(std::function<void(int, std::function<void(int, int)>)> set_fd_handler)
{
this->set_fd_handler = set_fd_handler;
wait_state = 0;
timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
if (timerfd < 0)
{
throw std::runtime_error(std::string("timerfd_create: ") + strerror(errno));
}
set_fd_handler(timerfd, [this](int fd, int events)
{
handle_readable();
});
}
timerfd_manager_t::~timerfd_manager_t()
{
set_fd_handler(timerfd, NULL);
close(timerfd);
}
void timerfd_manager_t::inc_timer(timerfd_timer_t & t)
{
t.next.tv_sec += t.millis/1000;
t.next.tv_nsec += (t.millis%1000)*1000000;
if (t.next.tv_nsec > 1000000000)
{
t.next.tv_sec++;
t.next.tv_nsec -= 1000000000;
}
}
int timerfd_manager_t::set_timer(uint64_t millis, bool repeat, std::function<void(int)> callback)
{
int timer_id = id++;
timespec start;
clock_gettime(CLOCK_MONOTONIC, &start);
timers.push_back({
.id = timer_id,
.millis = millis,
.start = start,
.next = start,
.repeat = repeat,
.callback = callback,
});
inc_timer(timers[timers.size()-1]);
set_nearest();
return timer_id;
}
void timerfd_manager_t::clear_timer(int timer_id)
{
for (int i = 0; i < timers.size(); i++)
{
if (timers[i].id == timer_id)
{
timers.erase(timers.begin()+i, timers.begin()+i+1);
if (nearest == i)
{
nearest = -1;
wait_state = wait_state & ~1;
}
else if (nearest > i)
{
nearest--;
}
set_nearest();
break;
}
}
}
void timerfd_manager_t::set_nearest()
{
again:
if (!timers.size())
{
nearest = -1;
itimerspec exp = { 0 };
if (timerfd_settime(timerfd, 0, &exp, NULL))
{
throw std::runtime_error(std::string("timerfd_settime: ") + strerror(errno));
}
wait_state = wait_state & ~1;
}
else
{
nearest = 0;
for (int i = 1; i < timers.size(); i++)
{
if (timers[i].next.tv_sec < timers[nearest].next.tv_sec ||
timers[i].next.tv_sec == timers[nearest].next.tv_sec &&
timers[i].next.tv_nsec < timers[nearest].next.tv_nsec)
{
nearest = i;
}
}
timespec now;
clock_gettime(CLOCK_MONOTONIC, &now);
itimerspec exp = {
.it_interval = { 0 },
.it_value = timers[nearest].next,
};
exp.it_value.tv_sec -= now.tv_sec;
exp.it_value.tv_nsec -= now.tv_nsec;
if (exp.it_value.tv_nsec < 0)
{
exp.it_value.tv_sec--;
exp.it_value.tv_nsec += 1000000000;
}
if (exp.it_value.tv_sec < 0 || !exp.it_value.tv_sec && !exp.it_value.tv_nsec)
{
// It already happened
trigger_nearest();
goto again;
}
if (timerfd_settime(timerfd, 0, &exp, NULL))
{
throw std::runtime_error(std::string("timerfd_settime: ") + strerror(errno));
}
wait_state = wait_state | 1;
}
}
void timerfd_manager_t::handle_readable()
{
uint64_t n;
size_t res = read(timerfd, &n, 8);
if (res == 8 && nearest >= 0)
{
trigger_nearest();
}
wait_state = 0;
set_nearest();
}
void timerfd_manager_t::trigger_nearest()
{
int nearest_id = timers[nearest].id;
auto cb = timers[nearest].callback;
if (timers[nearest].repeat)
{
inc_timer(timers[nearest]);
}
else
{
timers.erase(timers.begin()+nearest, timers.begin()+nearest+1);
}
cb(nearest_id);
nearest = -1;
}

View File

@@ -1,35 +0,0 @@
#pragma once
#include <time.h>
#include <vector>
#include <functional>
struct timerfd_timer_t
{
int id;
uint64_t millis;
timespec start, next;
bool repeat;
std::function<void(int)> callback;
};
class timerfd_manager_t
{
int wait_state = 0;
int timerfd;
int nearest = -1;
int id = 1;
std::vector<timerfd_timer_t> timers;
void inc_timer(timerfd_timer_t & t);
void set_nearest();
void trigger_nearest();
void handle_readable();
public:
std::function<void(int, std::function<void(int, int)>)> set_fd_handler;
timerfd_manager_t(std::function<void(int, std::function<void(int, int)>)> set_fd_handler);
~timerfd_manager_t();
int set_timer(uint64_t millis, bool repeat, std::function<void(int)> callback);
void clear_timer(int timer_id);
};