Compare commits

...

151 Commits

SHA1 Message Date
6909807068 Allow to start the OSD just to flush the journal completely 2021-04-10 17:44:12 +03:00
ec90fe6ec1 Release 0.5.13
Another followup to 0.5.11
2021-04-09 12:10:16 +03:00
18c72f4835 Correct reenterability fix (now verified with a test)
It's rather funny but 0.5.12 has to be re-published again
2021-04-09 12:10:16 +03:00
59fbcef734 Release 0.5.12
Fix qemu driver broken in 0.5.11 :)
2021-04-08 15:47:18 +03:00
40b7c21fb1 Followup to 307c1731c1 - fix mark_stable 2021-04-08 15:47:18 +03:00
efb3678606 Fix qemu-img broken in 0.5.11
Caused by the lack of reenterability of the main cluster_client function
2021-04-08 14:59:20 +03:00
462650134e Release 0.5.11
Another bunch of fixes, including important ones. Now OSDs are stable in SSD+HDD
configurations and everything is mostly ready for the merge of the master branch.

Features:

- Add min_flusher_count configuration (good for HDDs)
- Shuffle PGs for better data device utilisation
- Make OSDs benefit from the immediate_commit=small setting if it's applicable

Bug fixes:

- Rework client code to fix write ordering during operation replay
- Rework error handling code so OSDs don't crash in reaction to a crash of their peer OSDs
- Fix several block layer problems related to the journal, some of which
  were leading to double allocations of the same block during journal replay
- Fix monitors crashing during the removal of OSD keys from etcd
- Fix data fsyncs being incorrectly disabled when only disable_journal_fsync was set
- Always zero out unused part of request/reply headers
- Fix some theoretically possible read/write ordering issues
- Don't try to "recover" misplaced objects if it would make them degraded
- Fix heartbeats sometimes preventing OSDs from establishing connections
2021-04-08 01:18:46 +03:00
8d87e32175 Fix msgr_op.h includes 2021-04-08 01:18:46 +03:00
b0b2e7df3c Fix use-after-free in keepalive_timer and rework stop_client()
The bug was reproducible when fio was temporarily stopped with SIGSTOP
during a write test and then resumed after 10 seconds. In this case
pings failed for all clients and the fio process crashed with a
use-after-free in keepalive_timer, because stop_client() was called
while a live iterator into the client map was still held.
2021-04-07 11:06:31 +03:00
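For illustration, the failure mode is the classic map-iterator invalidation pattern: stop_client() erases the map element the timer loop is currently standing on. A minimal sketch of the bug and the usual fix, with hypothetical names rather than the actual messenger code:

```cpp
#include <map>
#include <vector>

struct client_t { bool ping_failed = false; };
static std::map<int, client_t> clients; // peer_fd -> client state

static void stop_client(int peer_fd)
{
    clients.erase(peer_fd); // invalidates iterators pointing at this element
}

static void keepalive_timer_buggy()
{
    for (auto it = clients.begin(); it != clients.end(); it++)
        if (it->second.ping_failed)
            stop_client(it->first); // 'it' now dangles, it++ is a use-after-free
}

static void keepalive_timer_fixed()
{
    std::vector<int> to_stop; // collect first, erase after the iteration is over
    for (auto & kv: clients)
        if (kv.second.ping_failed)
            to_stop.push_back(kv.first);
    for (int fd: to_stop)
        stop_client(fd);
}
```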
97efb9e299 Do not crash on PG re-peering events when operations are in progress 2021-04-07 11:06:31 +03:00
f6d705383a Fix client connection recovery bugs, add dirty_ops limit 2021-04-07 11:06:31 +03:00
68567c0e1f Fix messenger possibly trying to connect to the same OSD twice 2021-04-07 01:30:38 +03:00
04b00003e9 Log ping failures 2021-04-07 01:30:38 +03:00
307c1731c1 Forget all dirty_entries before stable big_write or delete during initialisation
This fixes a 'double_alloc' assertion in the following case:
- big_write object #1 v1 to block #100
- big_write object #1 v2 to block #101
- big_write object #2 v1 to block #100
2021-04-07 01:30:38 +03:00
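A self-contained toy model of why stale dirty entries must be forgotten (simplified structures, not the real blockstore): replaying the stable big_write of object #1 v2 releases block #100, so the later big_write of object #2 v1 into block #100 is no longer a double allocation.

```cpp
#include <cassert>
#include <map>
#include <set>

// (object, version) -> data block, in journal order; toy stand-in for dirty_entries
static std::map<std::pair<int,int>, int> dirty_entries;
static std::set<int> allocated_blocks;

static void replay_big_write(int object, int version, int block, bool stable)
{
    if (stable)
    {
        // The fix: forget older dirty entries of this object first,
        // releasing the data blocks they referenced
        for (auto it = dirty_entries.lower_bound({object, 0});
            it != dirty_entries.end() && it->first.first == object; )
        {
            allocated_blocks.erase(it->second);
            it = dirty_entries.erase(it);
        }
    }
    bool inserted = allocated_blocks.insert(block).second;
    assert(inserted && "double_alloc");
    dirty_entries[{object, version}] = block;
}

int main()
{
    replay_big_write(1, 1, 100, true); // big_write object #1 v1 to block #100
    replay_big_write(1, 2, 101, true); // object #1 v2 moves to #101, #100 is freed
    replay_big_write(2, 1, 100, true); // object #2 v1 reuses #100 - no assertion now
}
```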
75a6a556b5 Shuffle PGs for better data device utilisation 2021-04-07 01:30:38 +03:00
a48e2bbf18 Fix write replay ordering when immediate_commit != all
The previous implementation didn't respect write ordering and could lead
to corrupted data when restarting writes after an OSD outage

Also rework cluster_client queueing logic and add tests for it to verify the correct behaviour
2021-04-03 14:51:52 +03:00
688821665a Remove stoull_full() from etcd_state_client.cpp 2021-04-03 14:36:04 +03:00
3e162d95a0 Remove http_client.h include from etcd_state_client.h 2021-04-03 14:36:04 +03:00
829381b335 Extract some definitions to msgr_op.{cpp,h} 2021-04-03 14:36:04 +03:00
54f2353f24 Use bitmap granularity for alignment checks 2021-04-03 14:36:04 +03:00
e47f6fba60 Remove cluster_client_t::stop() 2021-04-03 14:35:42 +03:00
883bf84a16 Fix build 2021-04-03 01:47:15 +03:00
52097c4856 Stop flushing when less than min_flusher_count operations are available (unless a trim is forced) 2021-04-03 00:53:28 +03:00
e1355cbc74 Report failed operation name in cluster_client 2021-04-03 00:53:28 +03:00
8f8b90be7a Add min_flusher_count configuration 2021-04-03 00:53:28 +03:00
ad9f619370 Skip double allocs when reading journal 2021-04-03 00:53:28 +03:00
f4769ba7c7 Collapse create+delete journal entry pairs if they're already flushed
Old journal replay mechanism could lead to a double allocation of the same
block and a "Fatal error: tried to overwrite non-zero metadata entry"
2021-04-03 00:53:28 +03:00
843b7052d2 Add an assertion when clearing deleted metadata entries, add debug details when freeing blocks 2021-04-03 00:53:28 +03:00
df99e232ee Deduplicate osd_sets in pg history + raise request size limit for etcd 2021-04-03 00:53:28 +03:00
3a40fa4127 Fix monitor errors in case of OSD removal 2021-03-27 01:15:18 +03:00
4095bcc558 Do not ignore object deletion journal entries when they are preceded by a big write 2021-03-25 11:00:10 +03:00
564d64e271 Add some details for debug prints 2021-03-25 11:00:10 +03:00
cf54741c95 Followup to 05db1308aa
Don't do anything with the object state after errors because
it's freed by PG re-peer in this case
2021-03-25 11:00:10 +03:00
18a5fafa2a Fix rollback 2021-03-25 02:41:58 +03:00
06f4978085 Fix fsync check in blockstore_flush (data fsyncs were disabled instead of journal fsyncs) 2021-03-25 02:41:58 +03:00
7ebf1588c5 Check for immediate_commit==small in the OSD code 2021-03-25 02:41:58 +03:00
b0ad1e1e6d Remember writes as "unsynced" only after completing them
Previously BS_OP_SYNC could take unfinished writes and add them into the journal before
they were actually completed. This was leading to crashes with the message
"BUG: Unexpected dirty_entry 2000000000001:9f2a0000 v3 unstable state during flush: 338"
2021-03-25 02:41:58 +03:00
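In other words, the set of "unsynced" writes that BS_OP_SYNC commits to the journal must only ever contain completed writes. A hedged sketch of the ordering change (hypothetical structures, not the actual blockstore code):

```cpp
#include <functional>
#include <vector>

struct write_op_t
{
    bool completed = false;
    std::function<void(write_op_t*)> callback;
};

static std::vector<write_op_t*> unsynced_writes; // what a sync will journal

static void submit_write(write_op_t *op)
{
    // Before the fix (buggy): remembering the write as unsynced right away
    // let a concurrent sync journal it before the data had hit the disk:
    // unsynced_writes.push_back(op);

    op->callback = [](write_op_t *op)
    {
        op->completed = true;
        unsynced_writes.push_back(op); // fixed: only completed writes are "unsynced"
    };
    // ... submit the actual I/O here; op->callback(op) runs on completion ...
}
```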
0949f08407 Extract osd_primary write and sync code into separate files 2021-03-24 14:20:56 +03:00
04a1f18fa5 Assign .req as a whole to always zero out the remaining part
Also clear .reply before processing the operation
2021-03-24 14:20:56 +03:00
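The rationale: assigning a freshly value-initialized header over the reused buffer zeroes every byte of it, whereas setting individual fields leaves stale data from the previous request in the rest of the header. A minimal illustration with a hypothetical header layout:

```cpp
#include <cstdint>

struct osd_op_header_t
{
    uint64_t id;
    uint16_t opcode;
    uint8_t payload[102]; // op-specific part; unused bytes must be zero on the wire
};

static void prepare_reused_buffer(osd_op_header_t & req, uint64_t id, uint16_t opcode)
{
    // Buggy pattern: touches two fields, leaves stale payload bytes from the last op:
    // req.id = id; req.opcode = opcode;

    // Fix: assign the header as a whole so the remaining part is zeroed out
    osd_op_header_t fresh = {};
    fresh.id = id;
    fresh.opcode = opcode;
    req = fresh;
}
```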
cf9a641d66 Skip disconnected OSDs during sync 2021-03-24 14:20:56 +03:00
05db1308aa Fix two potential read/write ordering problems (even though not yet seen in tests)
- Write operations could be 'stabilized' and previous versions could be
  purged from OSDs before the removal of version_override and following
  reads could potentially hit a different version in EC pools
- Object was marked clean after completing the delete during recovery, so
  reads could in theory hit a deleted version and return nothing
2021-03-24 14:20:56 +03:00
98b54ca948 Don't try to "recover" misplaced objects if it would make them degraded 2021-03-21 01:37:23 +03:00
23225c5e62 Do not run ping on clients that are not yet connected 2021-03-21 01:37:23 +03:00
7e6e1a5a82 Release 0.5.10
The version seems to be stable after this bunch of fixes :)

- Fix delete & write operation ordering during rebalance to not lose objects in the immediate_commit=off mode
- Fix a possible crash caused by very high iodepths
- Re-distribute PG primaries over OSDs that come up after a short downtime
- Allow to specify etcd URLs for OSDs with http://, do not die with a strange error if -etcd option is missing for fio
- Fix a journal flushing deadlock which sometimes occurred in the immediate_commit=off mode
- Fix a bug where OSDs could hang if the data device filled up
- Fix an allocator bug where it was unable to allocate up to last (n%64) data device blocks
- Fix monitor crash that occurred on removal of some etcd keys
- Fix a bug where PGs could remain incomplete due to incorrect PG history with just zeroes in osd_sets
2021-03-16 12:48:26 +03:00
435045751d Delete objects only after a SYNC during rebalance in the non-immediate_commit mode
Previously OSDs could commit deletes before writes during recovery or rebalance
in the "lazy fsync" (immediate_commit=off) mode which could result in lost objects
2021-03-16 12:48:26 +03:00
c5fb1d5987 Do not duplicate blockstore operations when io_uring fills up
This bug was leading to OSDs dying with "Assertion `fulfilled == read_op->len' failed"
when testing fio -rw=randread -numjobs=8 -iodepth=128
2021-03-16 12:48:26 +03:00
9f59381bea Re-distribute PG primaries over OSDs that come up after a short downtime 2021-03-16 12:48:26 +03:00
9ac7e75178 Allow to specify etcd URLs for OSDs with http://, do not die with a strange error if -etcd option is missing for fio 2021-03-16 12:48:26 +03:00
88671cf745 Fix a bug causing all flushers to wait for an fsync without actually trying to do it
This happened because flusher_count became dynamic and fsync_batch() was comparing the number
of flushers currently ready to do an fsync with the maximum number of flushers. Also the number
wasn't rechecked on every loop iteration, which was also incorrect.

Now the interrupted_rebalance test passes even without IMMEDIATE_COMMIT=1.
2021-03-13 17:27:29 +03:00
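A hedged sketch of the corrected condition (hypothetical variable names): the fsync barrier must compare against the number of currently active flushers and be re-evaluated on every loop iteration, since that number now changes at runtime.

```cpp
#include <cstddef>

static size_t max_flushers = 256;   // static upper bound; flusher_count is dynamic now
static size_t active_flushers = 4;  // how many flushers are actually running
static size_t ready_for_fsync = 0;  // flushers waiting on the fsync barrier

static bool fsync_batch_ready()
{
    // Buggy: comparing against the static maximum may never become true:
    // return ready_for_fsync >= max_flushers;

    // Fixed: compare against the *current* number of active flushers;
    // the caller must also recheck this on every loop iteration
    return ready_for_fsync >= active_flushers;
}
```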
fe1749c427 Fix the multiple_interrupted_rebalance test 2021-03-13 17:19:45 +03:00
ceb9c28de7 Set default log_level before passing config to etcd_state_client 2021-03-13 17:19:45 +03:00
299d7d7c95 Use common macro for get_sqe 2021-03-13 17:19:45 +03:00
d1526b415f Correctly resume writes when the OSD is full so that they return an error 2021-03-13 17:19:45 +03:00
f49fd53d55 Fix a bug where allocator was unable to allocate up to last (n%64) blocks, add tests for it 2021-03-13 02:19:02 +03:00
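The bug class: a bitmap allocator whose size is not a multiple of 64 must treat the trailing bits of the last word specially, otherwise up to n%64 blocks at the end are never handed out. A self-contained sketch of correct last-word handling (toy allocator, not Vitastor's):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

struct bitmap_allocator_t
{
    uint64_t total;               // total block count, not necessarily a multiple of 64
    std::vector<uint64_t> words;  // bit = 1 means the block is used

    bitmap_allocator_t(uint64_t n): total(n), words((n + 63) / 64) {}

    int64_t allocate()
    {
        for (uint64_t w = 0; w < words.size(); w++)
        {
            // The bug class would be treating every word as holding 64 valid
            // blocks; the last word only holds (total % 64) valid bits
            uint64_t valid = (w == words.size() - 1 && total % 64) ? total % 64 : 64;
            for (uint64_t b = 0; b < valid; b++)
            {
                if (!(words[w] & (1ull << b)))
                {
                    words[w] |= 1ull << b;
                    return w * 64 + b;
                }
            }
        }
        return -1;
    }
};

int main()
{
    bitmap_allocator_t a(70); // 64 + 6 blocks -> the last word is only partially valid
    for (int64_t i = 0; i < 70; i++)
    {
        int64_t blk = a.allocate();
        assert(blk == i); // the last 70 % 64 = 6 blocks must be allocatable too
    }
    int64_t none = a.allocate();
    (void)none;
    assert(none == -1);
}
```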
dd76eda5e5 Test multiple interrupted rebalancings
Currently only passes with immediate_commit=all configuration
(env variable IMMEDIATE_COMMIT=1 for the bash script)
2021-03-12 12:55:44 +03:00
87dbd8fa57 Use empty hash as the default value for some etcd keys in the monitor 2021-03-12 12:40:15 +03:00
b44f49aab2 Ignore zero OSDs in history osd_sets 2021-03-12 12:40:15 +03:00
036555638e Release 0.5.9
- Fix two monitor bugs which led to objects being "logically lost" (physically
  present on some secondary OSDs while primary doesn't know about it) after multiple
  interrupted rebalancings
- Implement "no_recovery" and "no_rebalance" flags
2021-03-11 00:39:10 +03:00
af5155fcd9 Implement "no_recovery" and "no_rebalance" flags 2021-03-11 00:36:31 +03:00
0d2efbecc9 Preserve previous PG history when changing PG distribution
Fixes incorrect PG history in the case when a new rebalance is started
before the previous one finishes, which could make primary OSDs unable
to locate some objects on some secondaries.
2021-03-11 00:16:10 +03:00
e62e8b6bae Use real pg configuration instead of the "last clean" one for generating PG history
Basically fixes the bug introduced in 0.5.7 where a rebalance interrupted
by the monitor could result in forgetting objects moved to the new place
2021-03-10 02:01:44 +03:00
c4ba24c305 Do not print ping op latency 2021-03-10 02:01:44 +03:00
19e47a0279 Release 0.5.8
- Add heartbeats (fixes failover in case of network issues or offline nodes)
- Fix a bug where a PG could incorrectly become listed as 'incomplete' if historical osd_sets
  included a set with the PG's primary OSD as the only alive one
- Use osd_out_time = 10 minutes by default instead of 30 minutes
- Make monitors stick to a single selected etcd URL on start and not try to select random ones
  on every request - this was leading to etcd interaction errors when some etcds were unavailable
2021-03-09 02:38:17 +03:00
bd178ac20f Fix history osd_set check - local OSD is always available! 2021-03-09 02:18:18 +03:00
7006875a24 Make monitor stick to one etcd until the restart 2021-03-09 02:15:38 +03:00
ad577c4aac Add PING operation and timeouts to detect OSD failures when a host goes down 2021-03-09 02:15:38 +03:00
836635c518 Use osd_out_time = 10 minutes by default 2021-03-09 02:15:38 +03:00
88a03f4e98 Release 0.5.7
- Fix multiple bugs leading to OSDs sometimes being unable to correctly activate PGs
  when a lot of PG peering events occurred in a small amount of time
- Fix a bug where OSDs could list incomplete object versions during peering. The bug
  manifested with "local rollback operation failed" messages in OSD logs
- Fix a bug where misplaced chunks for degraded and incomplete objects were not removed
  from extra OSDs during recovery
- Fix incorrect PG history configuration resulting in OSDs being unable to find some
  of the objects after a PG count change
- Simplify block layer write ordering logic
- Avoid an extra data move when a lot of OSDs are first stopped for a long time and then restarted
- Fix incorrect degraded & misplaced object statistics after a completed rebalance
- Fix incorrect usage of pg_minsize instead of the minimal possible object chunk count in EC pools
2021-03-08 23:37:02 +03:00
2a5036669d Fix PG count change procedure
In previous versions PG histories were calculated incorrectly during
PG count changes, which led to objects being lost on OSDs not in the PG's osd set.
2021-03-08 23:15:58 +03:00
2e0c853180 Make test_change_pg_count check if any objects are lost during the test 2021-03-08 23:15:07 +03:00
e91ff2a9ec Only forget offline PGs if their state is not changed during reporting 2021-03-08 17:04:10 +03:00
086667f568 Do not check PG state key ownership if it doesn't exist yet
This fixes the bug where OSDs were sometimes trying to report updated PG states
infinitely without luck when PGs transitioned from 'starting' to 'peering' too fast
2021-03-08 17:04:10 +03:00
73ce20e246 Add a test for the "reappear after move" case 2021-03-08 17:04:10 +03:00
1be94da437 Check & remove extra chunks for degraded / incomplete objects, too 2021-03-08 17:04:10 +03:00
80e12358a2 Use pg_data_size instead of pg_minsize for object state calculation 2021-03-08 17:04:10 +03:00
36c935ace6 Use std::vector for the blockstore submission queue 2021-03-08 17:04:10 +03:00
0d8b5e2ef9 Remove unused enqueue_op_first() 2021-03-08 17:04:10 +03:00
98f1e2c277 Rework write/sync ordering
Make syncs wait for all previous writes because it's the only way
to make sure that OSDs do not receive incomplete writes in LIST results
during peering when some writes are still in progress.

Also simplify blockstore submission queue logic.
2021-03-08 17:04:10 +03:00
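A hedged sketch of the barrier semantics described above (simplified queue, not the actual blockstore): a sync does not start while any earlier write is still in flight, so LIST results seen by peers can never contain half-complete writes.

```cpp
#include <functional>
#include <queue>

struct op_t { bool is_sync; std::function<void()> exec; };

static int inflight_writes = 0;
static std::queue<op_t*> submit_queue;

static void continue_queue()
{
    while (!submit_queue.empty())
    {
        op_t *op = submit_queue.front();
        if (op->is_sync && inflight_writes > 0)
            break; // sync is a barrier: wait for every previously submitted write
        submit_queue.pop();
        if (!op->is_sync)
            inflight_writes++; // decremented in the write completion handler
        op->exec();
    }
}
```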
21e7686037 Fix possible "assertion failed: pg.inflight >= 0" error during PG stop 2021-03-08 17:04:10 +03:00
ab21a1908b Check for the dirty PG flag when trying to continue to stop it after sync 2021-03-08 17:04:10 +03:00
30d1ccd43e Fix an infinite loop when discarding list operations during stop_pg() 2021-03-08 17:04:10 +03:00
8bdd6d8d78 Reset PG state when stopping them 2021-03-08 17:04:10 +03:00
09b3e4e789 Fix OSDs being unable to stop PGs that are 'peering', not 'active'
This was sometimes leading to incorrect misplaced and degraded object count statistics
2021-03-08 17:04:10 +03:00
07912fd670 Use history/last_clean_pgs to avoid extra data move when observing a series of changes in the cluster 2021-03-08 17:04:10 +03:00
bc742ccf8c Fix a small memory leak in etcd_state_client 2021-03-08 17:04:10 +03:00
314b20437b Do not break subsequent small writes badly when a big write is canceled 2021-03-08 17:04:10 +03:00
29bac892ad Add .gitignore 2021-03-08 17:04:10 +03:00
cf7547faf3 Fix *.sh build scripts 2021-03-02 02:17:11 +03:00
ab90ed747f Release 0.5.6
- Fix operation statistics
- Fix a rebalance hang introduced in 0.5.5
- Test PG count changes with actual data moving
- Fix a possible 'unexpected pg state: 0' error during PG count change
2021-03-01 16:26:04 +03:00
29d8ac8b1b Do not report statistics for the empty operation 2021-03-01 16:20:57 +03:00
97795ea1b1 Use pg_minsize=2 in the pg_count change test
Also don't check for has_degraded because it's not a bug that objects
are _temporarily_ listed as degraded during PG peering as it's not
required for the new primary to connect to _all_ older peers to start
peering. The test may be improved in the future by temporarily disabling
degraded recovery during it and returning the has_degraded check back.
2021-03-01 16:18:08 +03:00
24e7075f08 Fix monitor's statistics aggregation 2021-02-28 19:51:16 +03:00
6155b23a7e Replace pgs[id] with pgs.at(id) to prevent accidental auto-vivification 2021-02-28 19:36:59 +03:00
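For reference, `std::map::operator[]` default-constructs a value for a missing key, silently inserting an empty PG into the map; `at()` throws `std::out_of_range` instead, turning the lookup bug into a loud error:

```cpp
#include <iostream>
#include <map>
#include <stdexcept>

struct pg_t { int state = 0; };

int main()
{
    std::map<int, pg_t> pgs;
    pgs[1].state = 1;
    // pgs[2].state would silently insert an empty pg_t ("auto-vivification")
    std::cout << pgs.size() << "\n"; // prints 1
    try { pgs.at(2); }
    catch (std::out_of_range &) { std::cout << "no such PG\n"; } // loud failure instead
}
```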
7d49706c07 Improve the pg_count change test: add more OSDs and actually move data between them 2021-02-28 19:36:59 +03:00
46e79f3306 Wait for PGs to become clean before stopping them 2021-02-28 19:36:59 +03:00
41fd14e024 Fix deletes not increasing write_iodepth 2021-02-28 19:36:59 +03:00
bb2d9a3afe Release 0.5.5
- Transition to CMake build system
- Fix Monitor being unable to change PG sizes
- Fix PG optimizer not using some OSDs in some cases
- Fix inability to change PG count online
- Improve journal flusher performance
- Add a little better systemd unit generator
- Use w=8 with jerasure (breaking change for EC pools)
2021-02-26 01:59:18 +03:00
e899ed2c25 Make OSDs use 256 flushers (as flusher counts are now dynamic) 2021-02-26 01:59:18 +03:00
e21b14b72c Fix rpm specs for building with CMake 2021-02-26 01:59:18 +03:00
5af8eddaa9 Add the remaining build script for Debian 2021-02-26 01:59:18 +03:00
4f5a94c07a Modify instructions for the CMake build 2021-02-26 00:28:57 +03:00
e16b87ecc8 Rename random_combinations() parameter from "unordered" to "ordered" as it's more correct 2021-02-25 23:59:34 +03:00
fcb4aa0a11 Fix Monitor being unable to change PG sizes 2021-02-25 23:59:34 +03:00
12adfa470c Add a test for changing PG size 2021-02-25 23:59:33 +03:00
7f15e0c084 Add a simple test for the PG optimizer 2021-02-25 23:59:33 +03:00
08d4bef419 Fix PG optimizer removing PGs without adding new ones
This happened when the distribution was already valid for the current OSD tree,
but didn't use all OSDs. For example, with OSDs 1 2 3 and all PGs equal to [ 1, 2 ],
the distribution remained unchanged.
2021-02-25 23:59:33 +03:00
2d73b19a6c Fix online PG count change bugs 2021-02-25 23:59:33 +03:00
69c87009e9 Add a test for changing PG count 2021-02-25 23:59:33 +03:00
c974cb539c Make flusher_count adaptive and limit write iodepth 2021-02-25 23:59:33 +03:00
00e98f64f3 A little better systemd unit generator 2021-02-25 23:59:33 +03:00
91a70dfb1b Add a test for the no_same_sector_overwrites mode 2021-02-25 23:59:33 +03:00
178388ac8c Use packages/ subdir instead of build/ for Docker package builds 2021-02-25 23:59:04 +03:00
bf9a175efc Move C/C++ sources to src subdirectory 2021-02-25 23:59:03 +03:00
08aed962de Use CMake 2021-02-25 23:58:08 +03:00
8c65e890b9 Slightly clean up the build script 2021-02-25 23:56:54 +03:00
8cda70b889 Allow to enable AddressSanitizer with "ASAN=1 make" 2021-02-25 23:55:33 +03:00
61ab22403a Use w=8 with jerasure 2021-02-25 23:55:33 +03:00
16da663a66 Add another test for failure domains 2021-02-25 23:55:33 +03:00
4a2dcf7b6b Update the license to VNPL 1.1
VNPL 1.1 is slightly reworded to make it clear that proprietary software
interacting with Vitastor and providing some kind of service to end users isn't
a "Proxy Program" if it's not specially designed to be used with Vitastor.

For example, Windows OS running in a virtual machine stored in a Vitastor
cluster clearly isn't.
2021-02-25 23:55:33 +03:00
8d48cc56b0 Generate randomly permutated OSD combinations when optimizing for compressed chunks 2021-02-25 23:55:33 +03:00
9f58f01425 Mirror afr.js from /vitalif/ceph-afr-calc 2021-02-25 23:55:33 +03:00
b9e7d31aa1 Release v0.5.4
- Fix a rare hang, more or less reproducible with very slow drives
- Fix a hang with the no_same_sector_overwrites mode
2021-02-24 01:40:30 +03:00
2d9f09dcb6 Attempt forced trim when stopping an overrun flusher
Fixes a rare hang happening in the event of journal space running out without
new work to do for flushers except the current sector.
The hang could be reproduced more or less consistently with very slow drives.
2021-02-24 01:33:01 +03:00
7cc59260c5 Fix no_same_sector_overwrites related bug 2021-02-23 18:50:51 +03:00
ca0a11ec85 Release 0.5.3 2021-02-03 00:38:57 +03:00
51c0b5afee Whitelist more leaks 2021-02-02 02:05:41 +03:00
e1e01d042e Rename sector_info.usage_count to flush_count 2021-02-02 01:32:23 +03:00
534a4a657e Rename space_check.sectors_required to sectors_to_write 2021-02-02 01:30:23 +03:00
9b5d8b9ad4 Fix multiple-sector journal writes, add assertions to not miss any SQEs 2021-02-02 01:29:11 +03:00
e66ed47515 Clear SQEs before returning them to the caller to prevent erroneous double submissions 2021-02-02 01:26:54 +03:00
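If an SQE is handed out still carrying fields from a previous submission and the caller fills it only partially, the ring can end up submitting leftover state as a second request. A hedged sketch of the defensive wrapper using plain liburing (the project's own ring_loop_t wrapper may differ):

```cpp
#include <cstring>
#include <liburing.h>

// Hand out SQEs only after wiping them, so a partially filled entry
// cannot carry stale fields from an earlier submission.
static io_uring_sqe *get_clean_sqe(io_uring *ring)
{
    io_uring_sqe *sqe = io_uring_get_sqe(ring); // nullptr if the SQ ring is full
    if (sqe)
        memset(sqe, 0, sizeof(*sqe)); // opcode 0 == IORING_OP_NOP, harmless if unfilled
    return sqe;
}
```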
036c6d4c42 Add a simple test case 2021-02-01 19:43:10 +03:00
4cb79a3bf8 Allow to calculate simple-offsets for files 2021-02-01 19:43:10 +03:00
3bf53754c2 Fix several I/O bugs 2021-02-01 19:43:10 +03:00
6023cac361 Do not stop clients before they are connected 2021-02-01 19:31:10 +03:00
915d04c446 Allow empty global configuration, report OSD statistics faster 2021-02-01 19:31:10 +03:00
21e06ea40d Fix memory leaks in fio engines 2021-02-01 19:31:10 +03:00
9ef7f865b0 Fix incorrect calls to prepare_journal_sector_write() when flushing multiple sectors 2021-02-01 19:31:10 +03:00
9dd20a31aa Do not use pg_minsize in the client code! 2021-02-01 19:31:10 +03:00
28be049909 Dump only actual part of the journal by default 2021-01-01 23:04:30 +03:00
78fbaacf1f Externalize jerasure's w into defines
In fact, w=8 looks better than w=32, so it may be changed in the future
2020-12-31 19:15:22 +03:00
1526c5a213 Add lp_solve into dependencies 2020-12-31 01:32:31 +03:00
c7cc414c90 Skip removed descriptors in epoll (this is possible in real clusters) 2020-12-30 17:04:18 +03:00
f4ea313707 Fix cl->read_op being freed without calling the completion callback 2020-12-30 16:55:54 +03:00
b88b76f316 Parallel usage of multiple network interfaces was a sick fantasy 2020-12-30 00:05:17 +03:00
4a17a61d1f Make rm_inode work with incomplete and degraded objects, allow to wait before deleting objects 2020-12-28 16:38:08 +03:00
ccabbbfbcb For reference: include a spec patch for building QEMU 4.2 on CentOS 7 2020-12-06 15:43:38 +03:00
26dac57083 State that jerasure is now supported 2020-12-06 15:25:48 +03:00
44a53d8352 Huh. Fix rpath for packages 2020-12-05 20:16:39 +03:00
9d80bd2d98 Build with jerasure, split some build scripts 2020-12-05 19:02:23 +03:00
322a38a144 Fix non-preserved real_pg_count leading to inability to change pools online 2020-12-04 23:46:48 +03:00
1018764c91 Fix write->delete->write bugs, add & fix some debugging output 2020-12-04 23:21:58 +03:00
147 changed files with 6052 additions and 3775 deletions


@@ -1,5 +1,6 @@
 .git
 build
+packages
 mon/node_modules
 *.o
 *.so
@@ -15,3 +16,4 @@ fio
 qemu
 rpm/*.Dockerfile
 debian/*.Dockerfile
+Dockerfile

.gitignore (new file, 18 lines)

@@ -0,0 +1,18 @@
*.o
*.so
package-lock.json
fio
qemu
osd
stub_osd
stub_uring_osd
stub_bench
osd_test
osd_peering_pg_test
dump_journal
nbd_proxy
rm_inode
test_allocator
test_blockstore
test_shit
osd_rmw_test

CMakeLists.txt (new file, 5 lines)

@@ -0,0 +1,5 @@
cmake_minimum_required(VERSION 2.8)
project(vitastor)
add_subdirectory(src)


@@ -1,46 +0,0 @@
#!/usr/bin/perl
use strict;
my $deps = {};
for my $line (split /\n/, `grep '^#include "' *.cpp *.h`)
{
if ($line =~ /^([^:]+):\#include "([^"]+)"/s)
{
$deps->{$1}->{$2} = 1;
}
}
my $added;
do
{
$added = 0;
for my $file (keys %$deps)
{
for my $dep (keys %{$deps->{$file}})
{
if ($deps->{$dep})
{
for my $subdep (keys %{$deps->{$dep}})
{
if (!$deps->{$file}->{$subdep})
{
$added = 1;
$deps->{$file}->{$subdep} = 1;
}
}
}
}
}
} while ($added);
for my $file (sort keys %$deps)
{
if ($file =~ /\.cpp$/)
{
my $obj = $file;
$obj =~ s/\.cpp$/.o/s;
print "$obj: $file ".join(" ", sort keys %{$deps->{$file}})."\n";
print "\tg++ \$(CXXFLAGS) -c -o \$\@ \$\<\n";
}
}

Makefile (deleted, 195 lines)

@@ -1,195 +0,0 @@
BINDIR ?= /usr/bin
LIBDIR ?= /usr/lib/x86_64-linux-gnu
QEMU_PLUGINDIR ?= /usr/lib/x86_64-linux-gnu/qemu
BLOCKSTORE_OBJS := allocator.o blockstore.o blockstore_impl.o blockstore_init.o blockstore_open.o blockstore_journal.o blockstore_read.o \
blockstore_write.o blockstore_sync.o blockstore_stable.o blockstore_rollback.o blockstore_flush.o crc32c.o ringloop.o
# -fsanitize=address
CXXFLAGS := -g -O3 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fPIC -fdiagnostics-color=always -I/usr/include/jerasure
all: libfio_blockstore.so osd libfio_sec_osd.so libfio_cluster.so stub_osd stub_uring_osd stub_bench osd_test dump_journal qemu_driver.so nbd_proxy rm_inode
clean:
rm -f *.o libblockstore.so libfio_blockstore.so osd libfio_sec_osd.so libfio_cluster.so stub_osd stub_uring_osd stub_bench osd_test dump_journal qemu_driver.so nbd_proxy rm_inode
install: all
mkdir -p $(DESTDIR)$(LIBDIR)/vitastor
install -m 0755 libfio_sec_osd.so $(DESTDIR)$(LIBDIR)/vitastor/
install -m 0755 libfio_cluster.so $(DESTDIR)$(LIBDIR)/vitastor/
install -m 0755 libfio_blockstore.so $(DESTDIR)$(LIBDIR)/vitastor/
install -m 0755 libblockstore.so $(DESTDIR)$(LIBDIR)/vitastor/
mkdir -p $(DESTDIR)$(BINDIR)
install -m 0755 osd $(DESTDIR)$(BINDIR)/vitastor-osd
install -m 0755 dump_journal $(DESTDIR)$(BINDIR)/vitastor-dump-journal
install -m 0755 nbd_proxy $(DESTDIR)$(BINDIR)/vitastor-nbd
install -m 0755 rm_inode $(DESTDIR)$(BINDIR)/vitastor-rm
mkdir -p $(DESTDIR)$(QEMU_PLUGINDIR)
install -m 0755 qemu_driver.so $(DESTDIR)$(QEMU_PLUGINDIR)/block-vitastor.so
dump_journal: dump_journal.cpp crc32c.o blockstore_journal.h
g++ $(CXXFLAGS) -o $@ $< crc32c.o
libblockstore.so: $(BLOCKSTORE_OBJS)
g++ $(CXXFLAGS) -o $@ -shared $(BLOCKSTORE_OBJS) -ltcmalloc_minimal -luring
libfio_blockstore.so: ./libblockstore.so fio_engine.o json11.o
g++ $(CXXFLAGS) -Wl,-rpath,'$(LIBDIR)/vitastor' -shared -o $@ fio_engine.o json11.o ./libblockstore.so -ltcmalloc_minimal -luring
OSD_OBJS := osd.o osd_secondary.o msgr_receive.o msgr_send.o osd_peering.o osd_flush.o osd_peering_pg.o \
osd_primary.o osd_primary_subops.o etcd_state_client.o messenger.o osd_cluster.o http_client.o osd_ops.o pg_states.o \
osd_rmw.o json11.o base64.o timerfd_manager.o epoll_manager.o
osd: ./libblockstore.so osd_main.cpp osd.h osd_ops.h $(OSD_OBJS)
g++ $(CXXFLAGS) -Wl,-rpath,'$(LIBDIR)/vitastor' -o $@ osd_main.cpp $(OSD_OBJS) ./libblockstore.so -ltcmalloc_minimal -luring -lJerasure
stub_osd: stub_osd.o rw_blocking.o
g++ $(CXXFLAGS) -o $@ stub_osd.o rw_blocking.o -ltcmalloc_minimal
osd_rmw_test: osd_rmw_test.o
g++ $(CXXFLAGS) -o $@ osd_rmw_test.o -lJerasure -fsanitize=address
STUB_URING_OSD_OBJS := stub_uring_osd.o epoll_manager.o messenger.o msgr_send.o msgr_receive.o ringloop.o timerfd_manager.o json11.o
stub_uring_osd: $(STUB_URING_OSD_OBJS)
g++ $(CXXFLAGS) -o $@ -ltcmalloc_minimal $(STUB_URING_OSD_OBJS) -luring
stub_bench: stub_bench.cpp osd_ops.h rw_blocking.o
g++ $(CXXFLAGS) -o $@ stub_bench.cpp rw_blocking.o -ltcmalloc_minimal
osd_test: osd_test.cpp osd_ops.h rw_blocking.o
g++ $(CXXFLAGS) -o $@ osd_test.cpp rw_blocking.o -ltcmalloc_minimal
osd_peering_pg_test: osd_peering_pg_test.cpp osd_peering_pg.o
g++ $(CXXFLAGS) -o $@ $< osd_peering_pg.o -ltcmalloc_minimal
libfio_sec_osd.so: fio_sec_osd.o rw_blocking.o
g++ $(CXXFLAGS) -ltcmalloc_minimal -shared -o $@ fio_sec_osd.o rw_blocking.o
FIO_CLUSTER_OBJS := cluster_client.o epoll_manager.o etcd_state_client.o \
messenger.o msgr_send.o msgr_receive.o ringloop.o json11.o http_client.o osd_ops.o pg_states.o timerfd_manager.o base64.o
libfio_cluster.so: fio_cluster.o $(FIO_CLUSTER_OBJS)
g++ $(CXXFLAGS) -ltcmalloc_minimal -shared -o $@ $< $(FIO_CLUSTER_OBJS) -luring
nbd_proxy: nbd_proxy.o $(FIO_CLUSTER_OBJS)
g++ $(CXXFLAGS) -ltcmalloc_minimal -o $@ $< $(FIO_CLUSTER_OBJS) -luring
rm_inode: rm_inode.o $(FIO_CLUSTER_OBJS)
g++ $(CXXFLAGS) -ltcmalloc_minimal -o $@ $< $(FIO_CLUSTER_OBJS) -luring
qemu_driver.o: qemu_driver.c qemu_proxy.h
gcc -I qemu/b/qemu `pkg-config glib-2.0 --cflags` \
-I qemu/include $(CXXFLAGS) -c -o $@ $<
qemu_driver.so: qemu_driver.o qemu_proxy.o $(FIO_CLUSTER_OBJS)
g++ $(CXXFLAGS) -ltcmalloc_minimal -shared -o $@ $(FIO_CLUSTER_OBJS) qemu_driver.o qemu_proxy.o -luring
test_blockstore: ./libblockstore.so test_blockstore.cpp timerfd_interval.o
g++ $(CXXFLAGS) -Wl,-rpath,'$(LIBDIR)/vitastor' -o test_blockstore test_blockstore.cpp timerfd_interval.o ./libblockstore.so -ltcmalloc_minimal -luring
test_shit: test_shit.cpp osd_peering_pg.o
g++ $(CXXFLAGS) -o test_shit test_shit.cpp -luring -lm
test_allocator: test_allocator.cpp allocator.o
g++ $(CXXFLAGS) -o test_allocator test_allocator.cpp allocator.o
crc32c.o: crc32c.c crc32c.h
g++ $(CXXFLAGS) -c -o $@ $<
json11.o: json11/json11.cpp
g++ $(CXXFLAGS) -c -o json11.o json11/json11.cpp
# Autogenerated
allocator.o: allocator.cpp allocator.h
g++ $(CXXFLAGS) -c -o $@ $<
base64.o: base64.cpp base64.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore.o: blockstore.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_flush.o: blockstore_flush.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_impl.o: blockstore_impl.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_init.o: blockstore_init.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_journal.o: blockstore_journal.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_open.o: blockstore_open.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_read.o: blockstore_read.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_rollback.o: blockstore_rollback.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_stable.o: blockstore_stable.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_sync.o: blockstore_sync.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
blockstore_write.o: blockstore_write.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
cluster_client.o: cluster_client.cpp cluster_client.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
dump_journal.o: dump_journal.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
epoll_manager.o: epoll_manager.cpp epoll_manager.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
etcd_state_client.o: etcd_state_client.cpp base64.h etcd_state_client.h http_client.h json11/json11.hpp object_id.h osd_id.h osd_ops.h pg_states.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
fio_cluster.o: fio_cluster.cpp cluster_client.h epoll_manager.h etcd_state_client.h fio/arch/arch.h fio/fio.h fio/optgroup.h fio_headers.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
fio_engine.o: fio_engine.cpp blockstore.h fio/arch/arch.h fio/fio.h fio/optgroup.h fio_headers.h json11/json11.hpp object_id.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
fio_sec_osd.o: fio_sec_osd.cpp fio/arch/arch.h fio/fio.h fio/optgroup.h fio_headers.h object_id.h osd_id.h osd_ops.h rw_blocking.h
g++ $(CXXFLAGS) -c -o $@ $<
http_client.o: http_client.cpp http_client.h json11/json11.hpp timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
messenger.o: messenger.cpp json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
msgr_receive.o: msgr_receive.cpp json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
msgr_send.o: msgr_send.cpp json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
nbd_proxy.o: nbd_proxy.cpp cluster_client.h epoll_manager.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd.o: osd.cpp blockstore.h cpp-btree/btree_map.h epoll_manager.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_cluster.o: osd_cluster.cpp base64.h blockstore.h cpp-btree/btree_map.h epoll_manager.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_flush.o: osd_flush.cpp blockstore.h cpp-btree/btree_map.h epoll_manager.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_main.o: osd_main.cpp blockstore.h cpp-btree/btree_map.h epoll_manager.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_ops.o: osd_ops.cpp object_id.h osd_id.h osd_ops.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_peering.o: osd_peering.cpp base64.h blockstore.h cpp-btree/btree_map.h epoll_manager.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_peering_pg.o: osd_peering_pg.cpp cpp-btree/btree_map.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_peering_pg_test.o: osd_peering_pg_test.cpp cpp-btree/btree_map.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_primary.o: osd_primary.cpp blockstore.h cpp-btree/btree_map.h epoll_manager.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h osd_primary.h osd_rmw.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_primary_subops.o: osd_primary_subops.cpp blockstore.h cpp-btree/btree_map.h epoll_manager.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h osd_primary.h osd_rmw.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_rmw.o: osd_rmw.cpp malloc_or_die.h object_id.h osd_id.h osd_rmw.h xor.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_rmw_test.o: osd_rmw_test.cpp malloc_or_die.h object_id.h osd_id.h osd_rmw.cpp osd_rmw.h test_pattern.h xor.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_secondary.o: osd_secondary.cpp blockstore.h cpp-btree/btree_map.h epoll_manager.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_test.o: osd_test.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h test_pattern.h
g++ $(CXXFLAGS) -c -o $@ $<
pg_states.o: pg_states.cpp pg_states.h
g++ $(CXXFLAGS) -c -o $@ $<
qemu_proxy.o: qemu_proxy.cpp cluster_client.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h qemu_proxy.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
ringloop.o: ringloop.cpp ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
rm_inode.o: rm_inode.cpp cluster_client.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
rw_blocking.o: rw_blocking.cpp rw_blocking.h
g++ $(CXXFLAGS) -c -o $@ $<
stub_bench.o: stub_bench.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h
g++ $(CXXFLAGS) -c -o $@ $<
stub_osd.o: stub_osd.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h
g++ $(CXXFLAGS) -c -o $@ $<
stub_uring_osd.o: stub_uring_osd.cpp epoll_manager.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
test_allocator.o: test_allocator.cpp allocator.h
g++ $(CXXFLAGS) -c -o $@ $<
test_blockstore.o: test_blockstore.cpp blockstore.h object_id.h ringloop.h timerfd_interval.h
g++ $(CXXFLAGS) -c -o $@ $<
test_shit.o: test_shit.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
timerfd_interval.o: timerfd_interval.cpp ringloop.h timerfd_interval.h
g++ $(CXXFLAGS) -c -o $@ $<
timerfd_manager.o: timerfd_manager.cpp timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<


@@ -16,7 +16,8 @@ breaking changes in the future. However, the following is implemented:
 - Basic part: highly-available block storage with symmetric clustering and no SPOF
 - Performance ;-D
-- Two redundancy schemes: Replication and XOR n+1 (simplest case of EC)
+- Multiple redundancy schemes: Replication, XOR n+1, Reed-Solomon erasure codes
+  based on jerasure library with any number of data and parity drives in a group
 - Configuration via simple JSON data structures in etcd
 - Automatic data distribution over OSDs, with support for:
   - Mathematical optimization for better uniformity and less data movement
@@ -39,8 +40,6 @@ breaking changes in the future. However, the following is implemented:
 - OSD creation tool (OSDs currently have to be created by hand)
 - Other administrative tools
 - Per-inode I/O and space usage statistics
-- jerasure EC support with any number of data and parity drives in a group
-- Parallel usage of multiple network interfaces
 - Proxmox and OpenNebula plugins
 - iSCSI proxy
 - Inode metadata storage in etcd
@@ -50,6 +49,7 @@ breaking changes in the future. However, the following is implemented:
 - Checksums
 - SSD+HDD optimizations, possibly including tiered storage and soft journal flushes
 - RDMA and NVDIMM support
+- Web GUI
 - Compression (possibly)
 - Read caching using system page cache (possibly)
@@ -336,9 +336,8 @@ Vitastor with single-thread NBD on the same hardware:
 - You can also rebuild QEMU with a patch that makes LD_PRELOAD unnecessary to load vitastor driver.
   See `qemu-*.*-vitastor.patch`.
 - Install fio 3.7 or later, get its source and symlink it into `<vitastor>/fio`.
-- Build Vitastor with `make -j8`.
-- Run `make install` (optionally with `LIBDIR=/usr/lib64 QEMU_PLUGINDIR=/usr/lib64/qemu-kvm`
-  if you're using an RPM-based distro).
+- Build & install Vitastor with `mkdir build && cd build && cmake .. && make -j8 && make install`.
+  Pay attention to the `QEMU_PLUGINDIR` cmake option - it must be set to `qemu-kvm` on RHEL.

 ## Running

@@ -349,20 +348,16 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
   with lazy fsync, but prepare for inferior single-thread latency.
 - Get a fast network (at least 10 Gbit/s).
 - Disable CPU powersaving: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`.
-- Start etcd with `--max-txn-ops=100000 --auto-compaction-retention=10 --auto-compaction-mode=revision` options.
-- Create global configuration in etcd: `etcdctl --endpoints=... put /vitastor/config/global '{"immediate_commit":"all"}'`
-  (if all your drives have capacitors).
-- Create pool configuration in etcd: `etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":256,"failure_domain":"host"}}'`.
-- Calculate offsets for your drives with `node /usr/lib/vitastor/mon/simple-offsets.js --device /dev/sdX`.
-- Make systemd units for your OSDs. Look at `/usr/lib/vitastor/mon/make-units.sh` for example.
-  Notable configuration variables from the example:
+- Check `/usr/lib/vitastor/mon/make-units.sh` and `/usr/lib/vitastor/mon/make-osd.sh` and
+  put desired values into the variables at the top of these files.
+- Create systemd units for the monitor and etcd: `/usr/lib/vitastor/mon/make-units.sh`
+- Create systemd units for your OSDs: `/usr/lib/vitastor/mon/make-osd.sh /dev/disk/by-partuuid/XXX [/dev/disk/by-partuuid/YYY ...]`
+- You can edit the units and change OSD configuration. Notable configuration variables:
   - `disable_data_fsync 1` - only safe with server-grade drives with capacitors.
   - `immediate_commit all` - use this if all your drives are server-grade.
   - `disable_device_lock 1` - only required if you run multiple OSDs on one block device.
-  - `flusher_count 16` - flusher is a micro-thread that removes old data from the journal.
-    More flushers mean more aggressive journal flushing which allows for more throughput
-    but slightly hurts latency under less load. Flushing will probably be improved in the future
-    because currently high queue depths sometimes lead to performance degradation.
+  - `flusher_count 256` - flusher is a micro-thread that removes old data from the journal.
+    You don't have to worry about this parameter anymore, 256 is enough.
   - `disk_alignment`, `journal_block_size`, `meta_block_size` should be set to the internal
     block size of your SSDs which is 4096 on most drives.
   - `journal_no_same_sector_overwrites true` prevents multiple overwrites of the same journal sector.
@@ -373,18 +368,22 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
   setting is set, it is also required to raise `journal_sector_buffer_count` setting, which is the
   number of dirty journal sectors that may be written to at the same time.
 - `systemctl start vitastor.target` everywhere.
-- Start any number of monitors: `node /usr/lib/vitastor/mon/mon-main.js --etcd_url 'http://10.115.0.10:2379,http://10.115.0.11:2379,http://10.115.0.12:2379,http://10.115.0.13:2379' --etcd_prefix '/vitastor' --etcd_start_timeout 5`.
+- Create global configuration in etcd: `etcdctl --endpoints=... put /vitastor/config/global '{"immediate_commit":"all"}'`
+  (if all your drives have capacitors).
+- Create pool configuration in etcd: `etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":256,"failure_domain":"host"}}'`.
+  For jerasure pools the configuration should look like the following: `2:{"name":"ecpool","scheme":"jerasure","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}`.
 - At this point, one of the monitors will configure PGs and OSDs will start them.
 - You can check PG states with `etcdctl --endpoints=... get --prefix /vitastor/pg/state`. All PGs should become 'active'.
-- Run tests with (for example): `fio -thread -ioengine=/usr/lib/x86_64-linux-gnu/vitastor/libfio_cluster.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -etcd=10.115.0.10:2379/v3 -pool=1 -inode=1 -size=400G`.
+- Run tests with (for example): `fio -thread -ioengine=libfio_vitastor.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -etcd=10.115.0.10:2379/v3 -pool=1 -inode=1 -size=400G`.
 - Upload VM disk image with qemu-img (for example):
   ```
-  LD_PRELOAD=/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so qemu-img convert -f qcow2 debian10.qcow2 -p
-    -O raw 'vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648'
+  qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648'
   ```
+  Note that the command requires to be run with `LD_PRELOAD=/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so qemu-img ...`
+  if you use unmodified QEMU.
 - Run QEMU with (for example):
   ```
-  LD_PRELOAD=/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so qemu-system-x86_64 -enable-kvm -m 1024
+  qemu-system-x86_64 -enable-kvm -m 1024
   -drive 'file=vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648',format=raw,if=none,id=drive-virtio-disk0,cache=none
   -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1,write-cache=off,physical_block_size=4096,logical_block_size=512
   -vnc 0.0.0.0:0
@@ -398,10 +397,7 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
 - Object deletion requests may currently lead to 'incomplete' objects if your OSDs crash during
   deletion because proper handling of object cleanup in a cluster should be "three-phase"
-  and it's currently not implemented. Inode removal tool currently can't handle unclean
-  objects, so incomplete objects become undeletable. This will be fixed in near future
-  by allowing the inode removal tool to delete unclean objects. With this problem fixed
-  you'll be able just to repeat the removal again.
+  and it's currently not implemented. Just to repeat the removal again in this case.

 ## Implementation Principles

@@ -423,22 +419,27 @@ Copyright (c) Vitaliy Filippov (vitalif [at] yourcmc.ru), 2019+
 You can also find me in the Russian Telegram Ceph chat: https://t.me/ceph_ru

 All server-side code (OSD, Monitor and so on) is licensed under the terms of
-Vitastor Network Public License 1.0 (VNPL 1.0), a copyleft license based on
+Vitastor Network Public License 1.1 (VNPL 1.1), a copyleft license based on
 GNU GPLv3.0 with the additional "Network Interaction" clause which requires
 opensourcing all programs directly or indirectly interacting with Vitastor
-through a computer network ("Proxy Programs"). Proxy Programs may be made public
-not only under the terms of the same license, but also under the terms of any
-GPL-Compatible Free Software License, as listed by the Free Software Foundation.
+through a computer network and expressly designed to be used in conjunction
+with it ("Proxy Programs"). Proxy Programs may be made public not only under
+the terms of the same license, but also under the terms of any GPL-Compatible
+Free Software License, as listed by the Free Software Foundation.
 This is a stricter copyleft license than the Affero GPL.

+Please note that VNPL doesn't require you to open the code of proprietary
+software running inside a VM if it's not specially designed to be used with
+Vitastor.
+
 Basically, you can't use the software in a proprietary environment to provide
 its functionality to users without opensourcing all intermediary components
 standing between the user and Vitastor or purchasing a commercial license
 from the author 😀.

 Client libraries (cluster_client and so on) are dual-licensed under the same
-VNPL 1.0 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
+VNPL 1.1 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
 software like QEMU and fio.

-You can find the full text of VNPL-1.0 in the file [VNPL-1.0.txt](VNPL-1.0.txt).
+You can find the full text of VNPL-1.1 in the file [VNPL-1.1.txt](VNPL-1.1.txt).
 GPL 2.0 is also included in this repository as [GPL-2.0.txt](GPL-2.0.txt).


@@ -1,7 +1,7 @@
 VITASTOR NETWORK PUBLIC LICENSE
-Version 1, 17 September 2020
+Version 1.1, 6 February 2021

-Copyright (C) 2020 Vitaliy Filippov <vitalif@yourcmc.ru>
+Copyright (C) 2021 Vitaliy Filippov <vitalif@yourcmc.ru>

 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.
@@ -540,12 +540,15 @@ License would be to refrain entirely from conveying the Program.
 13. Remote Network Interaction.

-Notwithstanding any other provision of this License, if you provide
-any user an opportunity to interact with the covered work directly
-or indirectly through a computer network, an imitation of such network,
-or an additional program (hereinafter referred to as a "Proxy Program")
-that, in turn, interacts with the covered work through a computer network,
-an imitation of such network, or another Proxy Program itself,
+A "Proxy Program" means a separate program which is specially designed to
+be used in conjunction with the covered work and interacts with it directly
+or indirectly through any kind of API (application programming interfaces),
+a computer network, an imitation of such network, or another Proxy Program
+itself.
+
+Notwithstanding any other provision of this License, if you provide any user
+with an opportunity to interact with the covered work through a computer
+network, an imitation of such network, or any number of "Proxy Programs",
 you must prominently offer that user an opportunity to receive the
 Corresponding Source of the covered work and all Proxy Programs from a
 network server at no charge, through some standard or customary means of


@@ -1,765 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)
#include <stdexcept>
#include "cluster_client.h"
cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
{
this->ringloop = ringloop;
this->tfd = tfd;
log_level = config["log_level"].int64_value();
msgr.osd_num = 0;
msgr.tfd = tfd;
msgr.ringloop = ringloop;
msgr.log_level = log_level;
msgr.repeer_pgs = [this](osd_num_t peer_osd)
{
if (msgr.osd_peer_fds.find(peer_osd) != msgr.osd_peer_fds.end())
{
// peer_osd just connected
continue_ops();
}
else if (unsynced_writes.size())
{
// peer_osd just dropped connection
for (auto op: syncing_writes)
{
for (auto & part: op->parts)
{
if (part.osd_num == peer_osd && part.done)
{
// repeat this operation
part.osd_num = 0;
part.done = false;
assert(!part.sent);
op->done_count--;
}
}
}
for (auto op: unsynced_writes)
{
for (auto & part: op->parts)
{
if (part.osd_num == peer_osd && part.done)
{
// repeat this operation
part.osd_num = 0;
part.done = false;
assert(!part.sent);
op->done_count--;
}
}
if (op->done_count < op->parts.size())
{
cur_ops.insert(op);
}
}
continue_ops();
}
};
msgr.exec_op = [this](osd_op_t *op)
{
// Garbage in
printf("Incoming garbage from peer %d\n", op->peer_fd);
msgr.stop_client(op->peer_fd);
delete op;
};
msgr.use_sync_send_recv = config["use_sync_send_recv"].bool_value() ||
config["use_sync_send_recv"].uint64_value();
st_cli.tfd = tfd;
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
st_cli.on_change_osd_state_hook = [this](uint64_t peer_osd) { on_change_osd_state_hook(peer_osd); };
st_cli.on_change_hook = [this](json11::Json::object & changes) { on_change_hook(changes); };
st_cli.on_load_pgs_hook = [this](bool success) { on_load_pgs_hook(success); };
st_cli.parse_config(config);
st_cli.load_global_config();
if (ringloop)
{
consumer.loop = [this]()
{
msgr.read_requests();
msgr.send_replies();
this->ringloop->submit();
};
ringloop->register_consumer(&consumer);
}
}
cluster_client_t::~cluster_client_t()
{
if (ringloop)
{
ringloop->unregister_consumer(&consumer);
}
}
void cluster_client_t::stop()
{
while (msgr.clients.size() > 0)
{
msgr.stop_client(msgr.clients.begin()->first);
}
}
void cluster_client_t::continue_ops(bool up_retry)
{
for (auto op_it = cur_ops.begin(); op_it != cur_ops.end(); )
{
if ((*op_it)->up_wait)
{
if (up_retry)
{
(*op_it)->up_wait = false;
continue_rw(*op_it++);
}
else
op_it++;
}
else
continue_rw(*op_it++);
}
}
static uint32_t is_power_of_two(uint64_t value)
{
uint32_t l = 0;
while (value > 1)
{
if (value & 1)
{
return 64;
}
value = value >> 1;
l++;
}
return l;
}
void cluster_client_t::on_load_config_hook(json11::Json::object & config)
{
bs_block_size = config["block_size"].uint64_value();
bs_disk_alignment = config["disk_alignment"].uint64_value();
bs_bitmap_granularity = config["bitmap_granularity"].uint64_value();
if (!bs_block_size)
{
bs_block_size = DEFAULT_BLOCK_SIZE;
}
if (!bs_disk_alignment)
{
bs_disk_alignment = DEFAULT_DISK_ALIGNMENT;
}
if (!bs_bitmap_granularity)
{
bs_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
}
uint32_t block_order;
if ((block_order = is_power_of_two(bs_block_size)) >= 64 || bs_block_size < MIN_BLOCK_SIZE || bs_block_size >= MAX_BLOCK_SIZE)
{
throw std::runtime_error("Bad block size");
}
if (config["immediate_commit"] == "all")
{
// Cluster-wide immediate_commit mode
immediate_commit = true;
}
else if (config.find("client_dirty_limit") != config.end())
{
client_dirty_limit = config["client_dirty_limit"].uint64_value();
}
if (!client_dirty_limit)
{
client_dirty_limit = DEFAULT_CLIENT_DIRTY_LIMIT;
}
up_wait_retry_interval = config["up_wait_retry_interval"].uint64_value();
if (!up_wait_retry_interval)
{
up_wait_retry_interval = 500;
}
else if (up_wait_retry_interval < 50)
{
up_wait_retry_interval = 50;
}
msgr.peer_connect_interval = config["peer_connect_interval"].uint64_value();
if (!msgr.peer_connect_interval)
{
msgr.peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
}
msgr.peer_connect_timeout = config["peer_connect_timeout"].uint64_value();
if (!msgr.peer_connect_timeout)
{
msgr.peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
}
st_cli.load_pgs();
}
void cluster_client_t::on_load_pgs_hook(bool success)
{
for (auto pool_item: st_cli.pool_config)
{
pg_counts[pool_item.first] = pool_item.second.real_pg_count;
}
pgs_loaded = true;
for (auto fn: on_ready_hooks)
{
fn();
}
on_ready_hooks.clear();
for (auto op: offline_ops)
{
execute(op);
}
offline_ops.clear();
continue_ops();
}
void cluster_client_t::on_change_hook(json11::Json::object & changes)
{
for (auto pool_item: st_cli.pool_config)
{
if (pg_counts[pool_item.first] != pool_item.second.real_pg_count)
{
// At this point, all pool operations should have been suspended
// And now they have to be resliced!
for (auto op: cur_ops)
{
if (INODE_POOL(op->inode) == pool_item.first)
{
op->needs_reslice = true;
}
}
for (auto op: unsynced_writes)
{
if (INODE_POOL(op->inode) == pool_item.first)
{
op->needs_reslice = true;
}
}
for (auto op: syncing_writes)
{
if (INODE_POOL(op->inode) == pool_item.first)
{
op->needs_reslice = true;
}
}
pg_counts[pool_item.first] = pool_item.second.real_pg_count;
}
}
continue_ops();
}
void cluster_client_t::on_change_osd_state_hook(uint64_t peer_osd)
{
if (msgr.wanted_peers.find(peer_osd) != msgr.wanted_peers.end())
{
msgr.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
}
}
void cluster_client_t::on_ready(std::function<void(void)> fn)
{
if (pgs_loaded)
{
fn();
}
else
{
on_ready_hooks.push_back(fn);
}
}
/**
* How writes are synced when immediate_commit is false
*
* 1) accept up to <client_dirty_limit> write operations for execution,
* queue all subsequent writes into <next_writes>
* 2) accept exactly one SYNC, queue all subsequent SYNCs into <next_writes>, too
* 3) "continue" all accepted writes
*
* "Continue" WRITE:
* 1) if the operation is not a copy yet - copy it (required for replay)
* 2) if the operation is not sliced yet - slice it
* 3) if the operation doesn't require reslice - try to connect & send all remaining parts
* 4) if any of them fail due to disconnected peers or PGs not up, repeat after reconnecting or small timeout
* 5) if any of them fail due to other errors, fail the operation and forget it from the current "unsynced batch"
* 6) if PG count changes before all parts are done, wait for all in-progress parts to finish,
* throw all results away, reslice and resubmit op
* 7) when all parts are done, try to "continue" the current SYNC
* 8) if the operation succeeds, but then some OSDs drop their connections, repeat
* parts from the current "unsynced batch" previously sent to those OSDs in any order
*
* "Continue" current SYNC:
* 1) take all unsynced operations from the current batch
* 2) check if all affected OSDs are still alive
* 3) if yes, send all SYNCs. otherwise, leave current SYNC as is.
* 4) if any of them fail due to disconnected peers, repeat SYNC after repeating all writes
* 5) if any of them fail due to other errors, fail the SYNC operation
*/
void cluster_client_t::execute(cluster_op_t *op)
{
if (!pgs_loaded)
{
// We're offline
offline_ops.push_back(op);
return;
}
op->retval = 0;
if (op->opcode != OSD_OP_SYNC && op->opcode != OSD_OP_READ && op->opcode != OSD_OP_WRITE ||
(op->opcode == OSD_OP_READ || op->opcode == OSD_OP_WRITE) && (!op->inode || !op->len ||
op->offset % bs_disk_alignment || op->len % bs_disk_alignment))
{
op->retval = -EINVAL;
std::function<void(cluster_op_t*)>(op->callback)(op);
return;
}
if (op->opcode == OSD_OP_SYNC)
{
execute_sync(op);
return;
}
if (op->opcode == OSD_OP_WRITE && !immediate_commit)
{
if (next_writes.size() > 0)
{
assert(cur_sync);
next_writes.push_back(op);
return;
}
if (queued_bytes >= client_dirty_limit)
{
// Push an extra SYNC operation to flush previous writes
next_writes.push_back(op);
cluster_op_t *sync_op = new cluster_op_t;
sync_op->is_internal = true;
sync_op->opcode = OSD_OP_SYNC;
sync_op->callback = [](cluster_op_t* sync_op) {};
execute_sync(sync_op);
return;
}
queued_bytes += op->len;
}
cur_ops.insert(op);
continue_rw(op);
}
void cluster_client_t::continue_rw(cluster_op_t *op)
{
pool_id_t pool_id = INODE_POOL(op->inode);
if (!pool_id)
{
op->retval = -EINVAL;
std::function<void(cluster_op_t*)>(op->callback)(op);
return;
}
if (st_cli.pool_config.find(pool_id) == st_cli.pool_config.end() ||
st_cli.pool_config[pool_id].real_pg_count == 0)
{
// Postpone operations to unknown pools
return;
}
if (op->opcode == OSD_OP_WRITE && !immediate_commit && !op->is_internal)
{
// Save operation for replay when PG goes out of sync
// (primary OSD drops our connection in this case)
cluster_op_t *op_copy = new cluster_op_t();
op_copy->is_internal = true;
op_copy->orig_op = op;
op_copy->opcode = op->opcode;
op_copy->inode = op->inode;
op_copy->offset = op->offset;
op_copy->len = op->len;
op_copy->buf = malloc_or_die(op->len);
op_copy->iov.push_back(op_copy->buf, op->len);
op_copy->callback = [](cluster_op_t* op_copy)
{
if (op_copy->orig_op)
{
// Acknowledge write and forget the original pointer
op_copy->orig_op->retval = op_copy->retval;
std::function<void(cluster_op_t*)>(op_copy->orig_op->callback)(op_copy->orig_op);
op_copy->orig_op = NULL;
}
};
void *cur_buf = op_copy->buf;
for (int i = 0; i < op->iov.count; i++)
{
memcpy(cur_buf, op->iov.buf[i].iov_base, op->iov.buf[i].iov_len);
cur_buf += op->iov.buf[i].iov_len;
}
unsynced_writes.push_back(op_copy);
cur_ops.erase(op);
cur_ops.insert(op_copy);
op = op_copy;
}
if (!op->parts.size())
{
// Slice the operation into parts
slice_rw(op);
}
if (!op->needs_reslice)
{
// Send unsent parts, if they're not subject to change
for (auto & op_part: op->parts)
{
if (!op_part.sent && !op_part.done)
{
try_send(op, &op_part);
}
}
}
if (!op->sent_count)
{
if (op->done_count >= op->parts.size())
{
// Finished successfully
// Even if the PG count has changed in meanwhile we treat it as success
// because if some operations were invalid for the new PG count we'd get errors
cur_ops.erase(op);
op->retval = op->len;
std::function<void(cluster_op_t*)>(op->callback)(op);
continue_sync();
return;
}
else if (op->retval != 0 && op->retval != -EPIPE)
{
// Fatal error (not -EPIPE)
cur_ops.erase(op);
if (!immediate_commit && op->opcode == OSD_OP_WRITE)
{
for (int i = 0; i < unsynced_writes.size(); i++)
{
if (unsynced_writes[i] == op)
{
unsynced_writes.erase(unsynced_writes.begin()+i, unsynced_writes.begin()+i+1);
break;
}
}
}
bool del = op->is_internal;
std::function<void(cluster_op_t*)>(op->callback)(op);
if (del)
{
if (op->buf)
free(op->buf);
delete op;
}
continue_sync();
return;
}
else
{
// -EPIPE or no error - clear the error
op->retval = 0;
if (op->needs_reslice)
{
op->parts.clear();
op->done_count = 0;
op->needs_reslice = false;
continue_rw(op);
}
}
}
}
void cluster_client_t::slice_rw(cluster_op_t *op)
{
// Slice the request into individual object stripe requests
// Primary OSDs still operate individual stripes, but their size is multiplied by PG minsize in case of EC
auto & pool_cfg = st_cli.pool_config[INODE_POOL(op->inode)];
uint64_t pg_block_size = bs_block_size * (
pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_minsize
);
uint64_t first_stripe = (op->offset / pg_block_size) * pg_block_size;
uint64_t last_stripe = ((op->offset + op->len + pg_block_size - 1) / pg_block_size - 1) * pg_block_size;
op->retval = 0;
op->parts.resize((last_stripe - first_stripe) / pg_block_size + 1);
int iov_idx = 0;
size_t iov_pos = 0;
int i = 0;
for (uint64_t stripe = first_stripe; stripe <= last_stripe; stripe += pg_block_size)
{
pg_num_t pg_num = (op->inode + stripe/pool_cfg.pg_stripe_size) % pool_cfg.real_pg_count + 1;
uint64_t begin = (op->offset < stripe ? stripe : op->offset);
uint64_t end = (op->offset + op->len) > (stripe + pg_block_size)
? (stripe + pg_block_size) : (op->offset + op->len);
op->parts[i] = (cluster_op_part_t){
.parent = op,
.offset = begin,
.len = (uint32_t)(end - begin),
.pg_num = pg_num,
.sent = false,
.done = false,
};
int left = end-begin;
while (left > 0 && iov_idx < op->iov.count)
{
if (op->iov.buf[iov_idx].iov_len - iov_pos < left)
{
op->parts[i].iov.push_back(op->iov.buf[iov_idx].iov_base + iov_pos, op->iov.buf[iov_idx].iov_len - iov_pos);
left -= (op->iov.buf[iov_idx].iov_len - iov_pos);
iov_pos = 0;
iov_idx++;
}
else
{
op->parts[i].iov.push_back(op->iov.buf[iov_idx].iov_base + iov_pos, left);
iov_pos += left;
left = 0;
}
}
assert(left == 0);
i++;
}
}
bool cluster_client_t::try_send(cluster_op_t *op, cluster_op_part_t *part)
{
auto & pool_cfg = st_cli.pool_config[INODE_POOL(op->inode)];
auto pg_it = pool_cfg.pg_config.find(part->pg_num);
if (pg_it != pool_cfg.pg_config.end() &&
!pg_it->second.pause && pg_it->second.cur_primary)
{
osd_num_t primary_osd = pg_it->second.cur_primary;
auto peer_it = msgr.osd_peer_fds.find(primary_osd);
if (peer_it != msgr.osd_peer_fds.end())
{
int peer_fd = peer_it->second;
part->osd_num = primary_osd;
part->sent = true;
op->sent_count++;
part->op = (osd_op_t){
.op_type = OSD_OP_OUT,
.peer_fd = peer_fd,
.req = { .rw = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = op_id++,
.opcode = op->opcode,
},
.inode = op->inode,
.offset = part->offset,
.len = part->len,
} },
.callback = [this, part](osd_op_t *op_part)
{
handle_op_part(part);
},
};
part->op.iov = part->iov;
msgr.outbox_push(&part->op);
return true;
}
else if (msgr.wanted_peers.find(primary_osd) == msgr.wanted_peers.end())
{
msgr.connect_peer(primary_osd, st_cli.peer_states[primary_osd]);
}
}
return false;
}
void cluster_client_t::execute_sync(cluster_op_t *op)
{
if (immediate_commit)
{
// Syncs are not required in the immediate_commit mode
op->retval = 0;
std::function<void(cluster_op_t*)>(op->callback)(op);
}
else if (cur_sync != NULL)
{
next_writes.push_back(op);
}
else
{
cur_sync = op;
continue_sync();
}
}
void cluster_client_t::continue_sync()
{
if (!cur_sync || cur_sync->parts.size() > 0)
{
// Already submitted
return;
}
cur_sync->retval = 0;
std::set<osd_num_t> sync_osds;
for (auto prev_op: unsynced_writes)
{
if (prev_op->done_count < prev_op->parts.size())
{
// Writes not finished yet
return;
}
for (auto & part: prev_op->parts)
{
if (part.osd_num)
{
sync_osds.insert(part.osd_num);
}
}
}
if (!sync_osds.size())
{
// No dirty writes
finish_sync();
return;
}
// Check that all OSD connections are still alive
for (auto sync_osd: sync_osds)
{
auto peer_it = msgr.osd_peer_fds.find(sync_osd);
if (peer_it == msgr.osd_peer_fds.end())
{
// SYNC is pointless to send to a non connected OSD
return;
}
}
syncing_writes.swap(unsynced_writes);
// Post sync to affected OSDs
cur_sync->parts.resize(sync_osds.size());
int i = 0;
for (auto sync_osd: sync_osds)
{
cur_sync->parts[i] = {
.parent = cur_sync,
.osd_num = sync_osd,
.sent = false,
.done = false,
};
send_sync(cur_sync, &cur_sync->parts[i]);
i++;
}
}
void cluster_client_t::finish_sync()
{
int retval = cur_sync->retval;
if (retval != 0)
{
for (auto op: syncing_writes)
{
if (op->done_count < op->parts.size())
{
cur_ops.insert(op);
}
}
unsynced_writes.insert(unsynced_writes.begin(), syncing_writes.begin(), syncing_writes.end());
syncing_writes.clear();
}
if (retval == -EPIPE)
{
// Retry later
cur_sync->parts.clear();
cur_sync->retval = 0;
cur_sync->sent_count = 0;
cur_sync->done_count = 0;
return;
}
std::function<void(cluster_op_t*)>(cur_sync->callback)(cur_sync);
if (!retval)
{
for (auto op: syncing_writes)
{
assert(op->sent_count == 0);
if (op->is_internal)
{
if (op->buf)
free(op->buf);
delete op;
}
}
syncing_writes.clear();
}
cur_sync = NULL;
queued_bytes = 0;
std::vector<cluster_op_t*> next_wr_copy;
next_wr_copy.swap(next_writes);
for (auto next_op: next_wr_copy)
{
execute(next_op);
}
}
void cluster_client_t::send_sync(cluster_op_t *op, cluster_op_part_t *part)
{
auto peer_it = msgr.osd_peer_fds.find(part->osd_num);
assert(peer_it != msgr.osd_peer_fds.end());
part->sent = true;
op->sent_count++;
part->op = (osd_op_t){
.op_type = OSD_OP_OUT,
.peer_fd = peer_it->second,
.req = {
.hdr = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = op_id++,
.opcode = OSD_OP_SYNC,
},
},
.callback = [this, part](osd_op_t *op_part)
{
handle_op_part(part);
},
};
msgr.outbox_push(&part->op);
}
void cluster_client_t::handle_op_part(cluster_op_part_t *part)
{
cluster_op_t *op = part->parent;
part->sent = false;
op->sent_count--;
int expected = part->op.req.hdr.opcode == OSD_OP_SYNC ? 0 : part->op.req.rw.len;
if (part->op.reply.hdr.retval != expected)
{
// Operation failed, retry
printf(
"Operation failed on OSD %lu: retval=%ld (expected %d), dropping connection\n",
part->osd_num, part->op.reply.hdr.retval, expected
);
msgr.stop_client(part->op.peer_fd);
if (part->op.reply.hdr.retval == -EPIPE)
{
op->up_wait = true;
if (!retry_timeout_id)
{
retry_timeout_id = tfd->set_timer(up_wait_retry_interval, false, [this](int)
{
retry_timeout_id = 0;
continue_ops(true);
});
}
}
if (!op->retval || op->retval == -EPIPE)
{
// Don't overwrite other errors with -EPIPE
op->retval = part->op.reply.hdr.retval;
}
}
else
{
// OK
part->done = true;
op->done_count++;
}
if (op->sent_count == 0)
{
if (op->opcode == OSD_OP_SYNC)
{
assert(op == cur_sync);
finish_sync();
}
else if (!op->up_wait)
{
continue_rw(op);
}
}
}
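The batching scheme described in the header comment of this deleted client boils down to a small state machine: accept writes until client_dirty_limit bytes are dirty, inject an internal SYNC, park subsequent operations behind it, and resubmit them once the flush completes. A condensed model of that accounting, sketched in JavaScript; ToyClient, dirtyLimit and flushDone are illustrative names that do not exist in the real client:

// Toy model of the "unsynced batch" flow described above: writes accumulate
// until the dirty limit is hit, then an internal SYNC flushes the batch and
// the writes queued behind it are resubmitted.
class ToyClient
{
    constructor(dirtyLimit)
    {
        this.dirtyLimit = dirtyLimit; // analogue of client_dirty_limit
        this.queuedBytes = 0;
        this.unsynced = [];           // current "unsynced batch"
        this.nextWrites = [];         // writes queued behind an in-flight SYNC
        this.syncing = false;
    }
    write(op)
    {
        if (this.syncing || this.queuedBytes >= this.dirtyLimit)
        {
            this.nextWrites.push(op);
            if (!this.syncing)
                this.sync(); // push an extra SYNC to flush previous writes
            return;
        }
        this.queuedBytes += op.len;
        this.unsynced.push(op);
        // ... here the real client slices the write into PG-sized parts
        // and sends them to the primary OSDs ...
    }
    sync()
    {
        this.syncing = true;
        // ... the real client sends SYNC to every OSD with dirty parts;
        // on -EPIPE it repeats the affected writes first, then retries SYNC ...
        this.flushDone();
    }
    flushDone()
    {
        this.syncing = false;
        this.unsynced = [];
        this.queuedBytes = 0;
        const next = this.nextWrites.splice(0);
        for (const op of next)
            this.write(op); // restart operations queued behind the SYNC
    }
}

const c = new ToyClient(33554432); // hypothetical 32 MB dirty limit
c.write({ len: 4096 });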

copy-fio-includes.sh

@@ -1,6 +1,6 @@
 #!/bin/bash
-gcc -E -o fio_headers.i fio_headers.h
+gcc -I. -E -o fio_headers.i src/fio_headers.h
 rm -rf fio-copy
 for i in `grep -Po 'fio/[^"]+' fio_headers.i | sort | uniq`; do

copy-qemu-includes.sh

@@ -5,7 +5,7 @@
 #cd b/qemu; make qapi
 gcc -I qemu/b/qemu `pkg-config glib-2.0 --cflags` \
-    -I qemu/include -E -o qemu_driver.i qemu_driver.c
+    -I qemu/include -E -o qemu_driver.i src/qemu_driver.c
 rm -rf qemu-copy
 for i in `grep -Po 'qemu/[^"]+' qemu_driver.i | sort | uniq`; do

debian/build-vitastor-bullseye.sh vendored Executable file

@@ -0,0 +1,7 @@
#!/bin/bash
sed 's/$REL/bullseye/g' < vitastor.Dockerfile > ../Dockerfile
cd ..
mkdir -p packages
sudo podman build -v `pwd`/packages:/root/packages -f Dockerfile .
rm Dockerfile

debian/build-vitastor-buster.sh vendored Executable file

@@ -0,0 +1,7 @@
#!/bin/bash
sed 's/$REL/buster/g' < vitastor.Dockerfile > ../Dockerfile
cd ..
mkdir -p packages
sudo podman build -v `pwd`/packages:/root/packages -f Dockerfile .
rm Dockerfile

debian/changelog vendored

@@ -1,3 +1,15 @@
+vitastor (0.5.13-1) unstable; urgency=medium
+
+  * Bugfixes
+
+ -- Vitaliy Filippov <vitalif@yourcmc.ru>  Tue, 02 Feb 2021 23:01:24 +0300
+
+vitastor (0.5.1-1) unstable; urgency=medium
+
+  * Add jerasure support
+
+ -- Vitaliy Filippov <vitalif@yourcmc.ru>  Sat, 05 Dec 2020 17:02:26 +0300
+
 vitastor (0.5-1) unstable; urgency=medium
 
   * First packaging for Debian

debian/control vendored

@@ -2,14 +2,14 @@ Source: vitastor
 Section: admin
 Priority: optional
 Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
-Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev
+Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev
 Standards-Version: 4.5.0
 Homepage: https://vitastor.io/
 Rules-Requires-Root: no
 
 Package: vitastor
-Architecture: any
+Architecture: amd64
-Depends: ${shlibs:Depends}, ${misc:Depends}, fio (= ${dep:fio}), qemu (= ${dep:qemu}), nodejs (>= 12), node-sprintf-js, node-ws (>= 7)
+Depends: ${shlibs:Depends}, ${misc:Depends}, fio (= ${dep:fio}), qemu (= ${dep:qemu}), nodejs (>= 10), node-sprintf-js, node-ws (>= 7), libjerasure2, lp-solve
 Description: Vitastor, a fast software-defined clustered block storage
  Vitastor is a small, simple and fast clustered block storage (storage for VM drives),
  architecturally similar to Ceph which means strong consistency, primary-replication,

debian/copyright vendored

@@ -5,16 +5,17 @@ Source: https://vitastor.io
 Files: *
 Copyright: 2019+ Vitaliy Filippov <vitalif@yourcmc.ru>
-License: Multiple licenses VNPL-1.0 and/or GPL-2.0+
+License: Multiple licenses VNPL-1.1 and/or GPL-2.0+
  All server-side code (OSD, Monitor and so on) is licensed under the terms of
- Vitastor Network Public License 1.0 (VNPL 1.0), a copyleft license based on
+ Vitastor Network Public License 1.1 (VNPL 1.1), a copyleft license based on
  GNU GPLv3.0 with the additional "Network Interaction" clause which requires
  opensourcing all programs directly or indirectly interacting with Vitastor
- through a computer network ("Proxy Programs"). Proxy Programs may be made public
- not only under the terms of the same license, but also under the terms of any
- GPL-Compatible Free Software License, as listed by the Free Software Foundation.
+ through a computer network and expressly designed to be used in conjunction
+ with it ("Proxy Programs"). Proxy Programs may be made public not only under
+ the terms of the same license, but also under the terms of any GPL-Compatible
+ Free Software License, as listed by the Free Software Foundation.
 This is a stricter copyleft license than the Affero GPL.
 .
 Client libraries (cluster_client and so on) are dual-licensed under the same
-VNPL 1.0 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
+VNPL 1.1 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
 software like QEMU and fio.

debian/install vendored

@@ -1,3 +1,3 @@
-VNPL-1.0.txt usr/share/doc/vitastor
+VNPL-1.1.txt usr/share/doc/vitastor
 GPL-2.0.txt usr/share/doc/vitastor
 mon usr/lib/vitastor

debian/patched-qemu.Dockerfile vendored Normal file

@@ -0,0 +1,44 @@
# Build patched QEMU for Debian Buster or Bullseye/Sid inside a container
# cd ..; podman build --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f debian/patched-qemu.Dockerfile .
ARG REL=bullseye
FROM debian:$REL
# an ARG declared before FROM is only visible to FROM itself, so re-declare it for the RUN commands below
ARG REL
WORKDIR /root
RUN if [ "$REL" = "buster" ]; then \
echo 'deb http://deb.debian.org/debian buster-backports main' >> /etc/apt/sources.list; \
echo >> /etc/apt/preferences; \
echo 'Package: *' >> /etc/apt/preferences; \
echo 'Pin: release a=buster-backports' >> /etc/apt/preferences; \
echo 'Pin-Priority: 500' >> /etc/apt/preferences; \
fi; \
grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf; \
echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
RUN apt-get update
RUN apt-get -y install qemu fio liburing1 liburing-dev libgoogle-perftools-dev devscripts
RUN apt-get -y build-dep qemu
RUN apt-get -y build-dep fio
RUN apt-get --download-only source qemu
RUN apt-get --download-only source fio
ADD qemu-5.0-vitastor.patch qemu-5.1-vitastor.patch /root/vitastor/
RUN set -e; \
mkdir -p /root/packages/qemu-$REL; \
rm -rf /root/packages/qemu-$REL/*; \
cd /root/packages/qemu-$REL; \
dpkg-source -x /root/qemu*.dsc; \
if [ -d /root/packages/qemu-$REL/qemu-5.0 ]; then \
cp /root/vitastor/qemu-5.0-vitastor.patch /root/packages/qemu-$REL/qemu-5.0/debian/patches; \
echo qemu-5.0-vitastor.patch >> /root/packages/qemu-$REL/qemu-5.0/debian/patches/series; \
else \
cp /root/vitastor/qemu-5.1-vitastor.patch /root/packages/qemu-$REL/qemu-*/debian/patches; \
P=`ls -d /root/packages/qemu-$REL/qemu-*/debian/patches`; \
echo qemu-5.1-vitastor.patch >> $P/series; \
fi; \
cd /root/packages/qemu-$REL/qemu-*/; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor1; \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v $V 'Plug Vitastor block driver'; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
rm -rf /root/packages/qemu-$REL/qemu-*/

debian/vitastor-bullseye.Dockerfile

@@ -1,86 +0,0 @@
# Build packages for Debian Bullseye/Sid inside a container
# cd ..; podman build -t vitastor-bullseye -v `pwd`/build:/root/build -f debian/vitastor-bullseye.Dockerfile .
ARG REL=bullseye
FROM debian:$REL
# again, it doesn't work otherwise
ARG REL=bullseye
WORKDIR /root
RUN grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
echo 'APT::Install-Recommends false;' > /etc/apt/apt.conf
RUN apt-get update
RUN apt-get -y install qemu fio liburing1 liburing-dev libgoogle-perftools-dev devscripts
RUN apt-get -y build-dep qemu
RUN apt-get -y build-dep fio
RUN apt-get --download-only source qemu
RUN apt-get --download-only source fio
ADD qemu-5.0-vitastor.patch qemu-5.1-vitastor.patch /root/vitastor/
RUN set -e; \
mkdir -p /root/build/qemu-$REL; \
rm -rf /root/build/qemu-$REL/*; \
cd /root/build/qemu-$REL; \
dpkg-source -x /root/qemu*.dsc; \
if [ -d /root/build/qemu-$REL/qemu-5.0 ]; then \
cp /root/vitastor/qemu-5.0-vitastor.patch /root/build/qemu-$REL/qemu-5.0/debian/patches; \
echo qemu-5.0-vitastor.patch >> /root/build/qemu-$REL/qemu-5.0/debian/patches/series; \
else \
cp /root/vitastor/qemu-5.1-vitastor.patch /root/build/qemu-$REL/qemu-*/debian/patches; \
P=`ls -d /root/build/qemu-$REL/qemu-*/debian/patches`; \
echo qemu-5.1-vitastor.patch >> $P/series; \
fi; \
cd /root/build/qemu-$REL/qemu-*/; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor1; \
echo ">>> VERSION: $V"; \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v $V 'Plug Vitastor block driver'; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
rm -rf /root/build/qemu-$REL/qemu-*/
RUN cd /root/build/qemu-$REL && apt-get -y install ./qemu-system-data*.deb ./qemu-system-common_*.deb ./qemu-system-x86_*.deb ./qemu_*.deb
ADD . /root/vitastor
RUN set -e -x; \
mkdir -p /root/fio-build/; \
cd /root/fio-build/; \
rm -rf /root/fio-build/*; \
dpkg-source -x /root/fio*.dsc; \
cd /root/build/qemu-$REL/; \
rm -rf qemu*/; \
dpkg-source -x qemu*.dsc; \
cd /root/build/qemu-$REL/qemu*/; \
debian/rules b/configure-stamp; \
cd b/qemu; \
make -j8 qapi; \
mkdir -p /root/build/vitastor-$REL; \
rm -rf /root/build/vitastor-$REL/*; \
cd /root/build/vitastor-$REL; \
cp -r /root/vitastor vitastor-0.5; \
ln -s /root/build/qemu-$REL/qemu-*/ vitastor-0.5/qemu; \
ln -s /root/fio-build/fio-*/ vitastor-0.5/fio; \
cd vitastor-0.5; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
QEMU=$(head -n1 qemu/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
sh copy-qemu-includes.sh; \
sh copy-fio-includes.sh; \
rm qemu fio; \
mkdir -p a b debian/patches; \
mv qemu-copy b/qemu; \
mv fio-copy b/fio; \
diff -NaurpbB a b > debian/patches/qemu-fio-headers.patch || true; \
echo qemu-fio-headers.patch >> debian/patches/series; \
rm -rf a b; \
rm -rf /root/build/qemu-$REL/qemu*/; \
echo "dep:fio=$FIO" > debian/substvars; \
echo "dep:qemu=$QEMU" >> debian/substvars; \
cd /root/build/vitastor-$REL; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.5.orig.tar.xz vitastor-0.5; \
cd vitastor-0.5; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
rm -rf /root/build/vitastor-$REL/vitastor-*/

debian/vitastor-buster.Dockerfile

@@ -1,80 +0,0 @@
# Build packages for Debian 10 inside a container
# cd ..; podman build -t vitastor-buster -v `pwd`/build:/root/build -f debian/vitastor-buster.Dockerfile .
FROM debian:buster
WORKDIR /root
RUN echo 'deb http://deb.debian.org/debian buster-backports main' >> /etc/apt/sources.list; \
grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
echo 'APT::Install-Recommends false;' > /etc/apt/apt.conf
RUN apt-get update
RUN apt-get -t buster-backports -y install qemu fio liburing1 liburing-dev libgoogle-perftools-dev devscripts
RUN apt-get -t buster-backports -y build-dep qemu
RUN apt-get -y build-dep fio
RUN apt-get -t buster-backports --download-only source qemu-kvm
RUN apt-get --download-only source fio
ADD qemu-5.0-vitastor.patch qemu-5.1-vitastor.patch /root/vitastor/
RUN set -e; \
mkdir -p /root/build/qemu-buster; \
rm -rf /root/build/qemu-buster/*; \
cd /root/build/qemu-buster; \
dpkg-source -x /root/qemu*.dsc; \
if [ -d /root/build/qemu-buster/qemu-5.0 ]; then \
cp /root/vitastor/qemu-5.0-vitastor.patch /root/build/qemu-buster/qemu-5.0/debian/patches; \
echo qemu-5.0-vitastor.patch >> /root/build/qemu-buster/qemu-5.0/debian/patches/series; \
else \
cp /root/vitastor/qemu-5.1-vitastor.patch /root/build/qemu-buster/qemu-*/debian/patches; \
echo qemu-5.1-vitastor.patch >> /root/build/qemu-buster/qemu-*/debian/patches/series; \
fi; \
cd /root/build/qemu-buster/qemu-*/; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)(~bpo[\d\+]*)\).*$/$1/')+vitastor1; \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D buster -v $V 'Plug Vitastor block driver'; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
rm -rf /root/build/qemu-buster/qemu-*/
RUN cd /root/build/qemu-buster && apt-get -y -t buster-backports install ./qemu-system-data*.deb ./qemu-system-common_*.deb ./qemu-system-x86_*.deb ./qemu_*.deb
ADD . /root/vitastor
RUN set -e -x; \
mkdir -p /root/fio-build/; \
cd /root/fio-build/; \
rm -rf /root/fio-build/*; \
dpkg-source -x /root/fio*.dsc; \
cd /root/build/qemu-buster/; \
rm -rf qemu*/; \
dpkg-source -x qemu*.dsc; \
cd /root/build/qemu-buster/qemu*/; \
debian/rules b/configure-stamp; \
cd b/qemu; \
make -j8 qapi; \
mkdir -p /root/build/vitastor-buster; \
rm -rf /root/build/vitastor-buster/*; \
cd /root/build/vitastor-buster; \
cp -r /root/vitastor vitastor-0.5; \
ln -s /root/build/qemu-buster/qemu-*/ vitastor-0.5/qemu; \
ln -s /root/fio-build/fio-*/ vitastor-0.5/fio; \
cd vitastor-0.5; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
QEMU=$(head -n1 qemu/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
sh copy-qemu-includes.sh; \
sh copy-fio-includes.sh; \
rm qemu fio; \
mkdir -p a b debian/patches; \
mv qemu-copy b/qemu; \
mv fio-copy b/fio; \
diff -NaurpbB a b > debian/patches/qemu-fio-headers.patch || true; \
echo qemu-fio-headers.patch >> debian/patches/series; \
rm -rf a b; \
rm -rf /root/build/qemu-buster/qemu*/; \
echo "dep:fio=$FIO" > debian/substvars; \
echo "dep:qemu=$QEMU" >> debian/substvars; \
cd /root/build/vitastor-buster; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.5.orig.tar.xz vitastor-0.5; \
cd vitastor-0.5; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D buster -v "$V""buster" "Rebuild for buster"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
rm -rf /root/build/vitastor-buster/vitastor-*/

debian/vitastor.Dockerfile vendored Normal file

@@ -0,0 +1,67 @@
# Build Vitastor packages for Debian Buster or Bullseye/Sid inside a container
# cd ..; podman build --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f debian/vitastor.Dockerfile .
ARG REL=bullseye
FROM debian:$REL
# an ARG declared before FROM is only visible to FROM itself, so re-declare it for the RUN commands below
ARG REL
WORKDIR /root
RUN if [ "$REL" = "buster" ]; then \
echo 'deb http://deb.debian.org/debian buster-backports main' >> /etc/apt/sources.list; \
echo >> /etc/apt/preferences; \
echo 'Package: *' >> /etc/apt/preferences; \
echo 'Pin: release a=buster-backports' >> /etc/apt/preferences; \
echo 'Pin-Priority: 500' >> /etc/apt/preferences; \
fi; \
grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf; \
echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
RUN apt-get update
RUN apt-get -y install qemu fio liburing1 liburing-dev libgoogle-perftools-dev devscripts
RUN apt-get -y build-dep qemu
RUN apt-get -y build-dep fio
RUN apt-get --download-only source qemu
RUN apt-get --download-only source fio
RUN apt-get -y install libjerasure-dev cmake
ADD . /root/vitastor
RUN set -e -x; \
mkdir -p /root/fio-build/; \
cd /root/fio-build/; \
rm -rf /root/fio-build/*; \
dpkg-source -x /root/fio*.dsc; \
cd /root/packages/qemu-$REL/; \
rm -rf qemu*/; \
dpkg-source -x qemu*.dsc; \
cd /root/packages/qemu-$REL/qemu*/; \
debian/rules b/configure-stamp; \
cd b/qemu; \
make -j8 qapi/qapi-builtin-types.h; \
mkdir -p /root/packages/vitastor-$REL; \
rm -rf /root/packages/vitastor-$REL/*; \
cd /root/packages/vitastor-$REL; \
cp -r /root/vitastor vitastor-0.5.13; \
ln -s /root/packages/qemu-$REL/qemu-*/ vitastor-0.5.13/qemu; \
ln -s /root/fio-build/fio-*/ vitastor-0.5.13/fio; \
cd vitastor-0.5.13; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
QEMU=$(head -n1 qemu/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
sh copy-qemu-includes.sh; \
sh copy-fio-includes.sh; \
rm qemu fio; \
mkdir -p a b debian/patches; \
mv qemu-copy b/qemu; \
mv fio-copy b/fio; \
diff -NaurpbB a b > debian/patches/qemu-fio-headers.patch || true; \
echo qemu-fio-headers.patch >> debian/patches/series; \
rm -rf a b; \
rm -rf /root/packages/qemu-$REL/qemu*/; \
echo "dep:fio=$FIO" > debian/substvars; \
echo "dep:qemu=$QEMU" >> debian/substvars; \
cd /root/packages/vitastor-$REL; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.5.13.orig.tar.xz vitastor-0.5.13; \
cd vitastor-0.5.13; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
rm -rf /root/packages/vitastor-$REL/vitastor-*/


@@ -1,51 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.0 (see README.md for details)
#include <iostream>
#include <functional>
#include <array>
#include <cstdlib> // for malloc() and free()
using namespace std;
// replace operator new and delete to log allocations
void* operator new(std::size_t n)
{
cout << "Allocating " << n << " bytes" << endl;
return malloc(n);
}
void operator delete(void* p) throw()
{
free(p);
}
class test
{
public:
std::string s;
void a(std::function<void()> & f, const char *str)
{
auto l = [this, str]() { cout << str << " ? " << s << " from this\n"; };
cout << "Assigning lambda3 of size " << sizeof(l) << endl;
f = l;
}
};
int main()
{
std::array<char, 16> arr1;
auto lambda1 = [arr1](){};
cout << "Assigning lambda1 of size " << sizeof(lambda1) << endl;
std::function<void()> f1 = lambda1;
std::array<char, 17> arr2;
auto lambda2 = [arr2](){};
cout << "Assigning lambda2 of size " << sizeof(lambda2) << endl;
std::function<void()> f2 = lambda2;
test t;
std::function<void()> f3;
t.s = "str";
t.a(f3, "huyambda");
f3();
}


@@ -1,22 +1,59 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 (see README.md for details)
+// License: VNPL-1.1 (see README.md for details)
 
 module.exports = {
     scale_pg_count,
 };
 
+function add_pg_history(new_pg_history, new_pg, prev_pgs, prev_pg_history, old_pg)
+{
+    if (!new_pg_history[new_pg])
+    {
+        new_pg_history[new_pg] = {
+            osd_sets: {},
+            all_peers: {},
+            epoch: 0,
+        };
+    }
+    const nh = new_pg_history[new_pg], oh = prev_pg_history[old_pg];
+    nh.osd_sets[prev_pgs[old_pg].join(' ')] = prev_pgs[old_pg];
+    if (oh && oh.osd_sets && oh.osd_sets.length)
+    {
+        for (const pg of oh.osd_sets)
+        {
+            nh.osd_sets[pg.join(' ')] = pg;
+        }
+    }
+    if (oh && oh.all_peers && oh.all_peers.length)
+    {
+        for (const osd_num of oh.all_peers)
+        {
+            nh.all_peers[osd_num] = Number(osd_num);
+        }
+    }
+    if (oh && oh.epoch)
+    {
+        nh.epoch = nh.epoch < oh.epoch ? oh.epoch : nh.epoch;
+    }
+}
+
+function finish_pg_history(merged_history)
+{
+    merged_history.osd_sets = Object.values(merged_history.osd_sets);
+    merged_history.all_peers = Object.values(merged_history.all_peers);
+}
+
 function scale_pg_count(prev_pgs, prev_pg_history, new_pg_history, new_pg_count)
 {
     const old_pg_count = prev_pgs.length;
-    // Add all possibly intersecting PGs into the history of new PGs
+    // Add all possibly intersecting PGs to the history of new PGs
     if (!(new_pg_count % old_pg_count))
     {
-        // New PG count is a multiple of the old PG count
-        const mul = (new_pg_count / old_pg_count);
+        // New PG count is a multiple of old PG count
         for (let i = 0; i < new_pg_count; i++)
         {
-            const old_i = Math.floor(new_pg_count / mul);
-            new_pg_history[i] = JSON.parse(JSON.stringify(prev_pg_history[1+old_i]));
+            add_pg_history(new_pg_history, i, prev_pgs, prev_pg_history, i % old_pg_count);
+            finish_pg_history(new_pg_history[i]);
         }
     }
     else if (!(old_pg_count % new_pg_count))
@@ -25,68 +62,26 @@ function scale_pg_count(prev_pgs, prev_pg_history, new_pg_history, new_pg_count)
         const mul = (old_pg_count / new_pg_count);
         for (let i = 0; i < new_pg_count; i++)
         {
-            new_pg_history[i] = {
-                osd_sets: [],
-                all_peers: [],
-                epoch: 0,
-            };
             for (let j = 0; j < mul; j++)
             {
-                new_pg_history[i].osd_sets.push(prev_pgs[i*mul]);
-                const hist = prev_pg_history[1+i*mul+j];
-                if (hist && hist.osd_sets && hist.osd_sets.length)
-                {
-                    Array.prototype.push.apply(new_pg_history[i].osd_sets, hist.osd_sets);
-                }
-                if (hist && hist.all_peers && hist.all_peers.length)
-                {
-                    Array.prototype.push.apply(new_pg_history[i].all_peers, hist.all_peers);
-                }
-                if (hist && hist.epoch)
-                {
-                    new_pg_history[i].epoch = new_pg_history[i].epoch < hist.epoch ? hist.epoch : new_pg_history[i].epoch;
-                }
+                add_pg_history(new_pg_history, i, prev_pgs, prev_pg_history, i+j*new_pg_count);
             }
+            finish_pg_history(new_pg_history[i]);
         }
     }
     else
     {
         // Any PG may intersect with any PG after non-multiple PG count change
         // So, merge ALL PGs history
-        let all_sets = {};
-        let all_peers = {};
-        let max_epoch = 0;
-        for (const pg of prev_pgs)
+        let merged_history = {};
+        for (let i = 0; i < old_pg_count; i++)
         {
-            all_sets[pg.join(' ')] = pg;
+            add_pg_history(merged_history, 1, prev_pgs, prev_pg_history, i);
         }
-        for (const pg in prev_pg_history)
-        {
-            const hist = prev_pg_history[pg];
-            if (hist && hist.osd_sets)
-            {
-                for (const pg of hist.osd_sets)
-                {
-                    all_sets[pg.join(' ')] = pg;
-                }
-            }
-            if (hist && hist.all_peers)
-            {
-                for (const osd_num of hist.all_peers)
-                {
-                    all_peers[osd_num] = Number(osd_num);
-                }
-            }
-            if (hist && hist.epoch)
-            {
-                max_epoch = max_epoch < hist.epoch ? hist.epoch : max_epoch;
-            }
-        }
-        all_sets = Object.values(all_sets);
-        all_peers = Object.values(all_peers);
+        finish_pg_history(merged_history[1]);
         for (let i = 0; i < new_pg_count; i++)
         {
-            new_pg_history[i] = { osd_sets: all_sets, all_peers, epoch: max_epoch };
+            new_pg_history[i] = { ...merged_history[1] };
         }
     }
     // Mark history keys for removed PGs as removed
@@ -94,19 +89,16 @@ function scale_pg_count(prev_pgs, prev_pg_history, new_pg_history, new_pg_count)
     {
         new_pg_history[i] = null;
     }
+    // Just for the lp_solve optimizer - pick a "previous" PG for each "new" one
     if (old_pg_count < new_pg_count)
     {
-        for (let i = new_pg_count-1; i >= 0; i--)
+        for (let i = old_pg_count; i < new_pg_count; i++)
         {
-            prev_pgs[i] = prev_pgs[Math.floor(i/new_pg_count*old_pg_count)];
+            prev_pgs[i] = prev_pgs[i % old_pg_count];
        }
     }
     else if (old_pg_count > new_pg_count)
     {
-        for (let i = 0; i < new_pg_count; i++)
-        {
-            prev_pgs[i] = prev_pgs[Math.round(i/new_pg_count*old_pg_count)];
-        }
         prev_pgs.splice(new_pg_count, old_pg_count-new_pg_count);
     }
 }
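The effect of the rework is easiest to see on a toy input: when the PG count is doubled, every new PG inherits the OSD set of old PG (i % old_pg_count), and prev_pgs is padded so the lp_solve optimizer sees a "previous" PG for each new one. A small usage sketch, with a made-up module path and PG sets chosen only for illustration:

// Scale 2 PGs up to 4: new PGs 0 and 2 inherit [ 1, 2 ], PGs 1 and 3 inherit [ 2, 3 ].
const { scale_pg_count } = require('./PGUtil.js'); // hypothetical module path
const prev_pgs = [ [ 1, 2 ], [ 2, 3 ] ]; // OSD sets of the old PGs
const prev_pg_history = {};              // no saved history in this example
const new_pg_history = {};
scale_pg_count(prev_pgs, prev_pg_history, new_pg_history, 4);
console.log(new_pg_history[0]); // { osd_sets: [ [ 1, 2 ] ], all_peers: [], epoch: 0 }
console.log(prev_pgs);          // [ [ 1, 2 ], [ 2, 3 ], [ 1, 2 ], [ 2, 3 ] ]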

mon/afr.js

@@ -1,31 +1,16 @@
 // Functions to calculate Annualized Failure Rate of your cluster
 // if you know AFR of your drives, number of drives, expected rebalance time
 // and replication factor
-// License: VNPL-1.0 (see README.md for details)
+// License: VNPL-1.1 (see https://yourcmc.ru/git/vitalif/vitastor/src/branch/master/README.md for details) or AGPL-3.0
+// Author: Vitaliy Filippov, 2020+
 
-const { sprintf } = require('sprintf-js');
-
 module.exports = {
     cluster_afr_fullmesh,
     failure_rate_fullmesh,
     cluster_afr,
-    print_cluster_afr,
     c_n_k,
 };
 
-print_cluster_afr({ n_hosts: 4, n_drives: 6, afr_drive: 0.03, afr_host: 0.05, capacity: 4000, speed: 0.1, replicas: 2 });
-print_cluster_afr({ n_hosts: 4, n_drives: 3, afr_drive: 0.03, capacity: 4000, speed: 0.1, replicas: 2 });
-print_cluster_afr({ n_hosts: 4, n_drives: 3, afr_drive: 0.03, afr_host: 0.05, capacity: 4000, speed: 0.1, replicas: 2 });
-print_cluster_afr({ n_hosts: 4, n_drives: 3, afr_drive: 0.03, capacity: 4000, speed: 0.1, ec: [ 2, 1 ] });
-print_cluster_afr({ n_hosts: 4, n_drives: 3, afr_drive: 0.03, afr_host: 0.05, capacity: 4000, speed: 0.1, ec: [ 2, 1 ] });
-print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, capacity: 8000, speed: 0.02, replicas: 2 });
-print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0.05, capacity: 8000, speed: 0.02, replicas: 2 });
-print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, capacity: 8000, speed: 0.02, replicas: 3 });
-print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0.05, capacity: 8000, speed: 0.02, replicas: 3 });
-print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, capacity: 8000, speed: 0.02, replicas: 3, pgs: 100 });
-print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0.05, capacity: 8000, speed: 0.02, replicas: 3, pgs: 100 });
-print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0.05, capacity: 8000, speed: 0.02, replicas: 3, pgs: 100, degraded_replacement: 1 });
-
 /******** "FULL MESH": ASSUME EACH OSD COMMUNICATES WITH ALL OTHER OSDS ********/
 
 // Estimate AFR of the cluster
@@ -56,93 +41,38 @@ function failure_rate_fullmesh(n, a, f)
 /******** PGS: EACH OSD ONLY COMMUNICATES WITH <pgs> OTHER OSDs ********/
 
 // <n> hosts of <m> drives of <capacity> GB, each able to backfill at <speed> GB/s,
-// <k> replicas, <pgs> unique peer PGs per OSD
+// <k> replicas, <pgs> unique peer PGs per OSD (~50 for 100 PG-per-OSD in a big cluster)
 //
 // For each of n*m drives: P(drive fails in a year) * P(any of its peers fail in <l*365> next days).
 // More peers per OSD increase rebalance speed (more drives work together to resilver) if you
-// let them finish rebalance BEFORE replacing the failed drive.
+// let them finish rebalance BEFORE replacing the failed drive (degraded_replacement=false).
 // At the same time, more peers per OSD increase probability of any of them to fail!
+// osd_rm=true means that failed OSDs' data is rebalanced over all other hosts,
+// not over the same host as it's in Ceph by default (dead OSDs are marked 'out').
 //
 // Probability of all except one drives in a replica group to fail is (AFR^(k-1)).
 // So with <x> PGs it becomes ~ (x * (AFR*L/365)^(k-1)). Interesting but reasonable consequence
 // is that, with k=2, total failure rate doesn't depend on number of peers per OSD,
 // because it gets increased linearly by increased number of peers to fail
 // and decreased linearly by reduced rebalance time.
-function cluster_afr_pgs({ n_hosts, n_drives, afr_drive, capacity, speed, replicas, pgs = 1, degraded_replacement })
-{
-    pgs = Math.min(pgs, (n_hosts-1)*n_drives/(replicas-1));
-    const l = capacity/(degraded_replacement ? 1 : pgs)/speed/86400/365;
-    return 1 - (1 - afr_drive * (1-(1-(afr_drive*l)**(replicas-1))**pgs)) ** (n_hosts*n_drives);
-}
-
-function cluster_afr_pgs_ec({ n_hosts, n_drives, afr_drive, capacity, speed, ec: [ ec_data, ec_parity ], pgs = 1, degraded_replacement })
-{
-    const ec_total = ec_data+ec_parity;
-    pgs = Math.min(pgs, (n_hosts-1)*n_drives/(ec_total-1));
-    const l = capacity/(degraded_replacement ? 1 : pgs)/speed/86400/365;
-    return 1 - (1 - afr_drive * (1-(1-failure_rate_fullmesh(ec_total-1, afr_drive*l, ec_parity))**pgs)) ** (n_hosts*n_drives);
-}
-
-// Same as above, but also take server failures into account
-function cluster_afr_pgs_hosts({ n_hosts, n_drives, afr_drive, afr_host, capacity, speed, replicas, pgs = 1, degraded_replacement })
-{
-    let otherhosts = Math.min(pgs, (n_hosts-1)/(replicas-1));
-    pgs = Math.min(pgs, (n_hosts-1)*n_drives/(replicas-1));
-    let pgh = Math.min(pgs*n_drives, (n_hosts-1)*n_drives/(replicas-1));
-    const ld = capacity/(degraded_replacement ? 1 : pgs)/speed/86400/365;
-    const lh = n_drives*capacity/pgs/speed/86400/365;
-    const p1 = ((afr_drive+afr_host*pgs/otherhosts)*lh);
-    const p2 = ((afr_drive+afr_host*pgs/otherhosts)*ld);
-    return 1 - ((1 - afr_host * (1-(1-p1**(replicas-1))**pgh)) ** n_hosts) *
-        ((1 - afr_drive * (1-(1-p2**(replicas-1))**pgs)) ** (n_hosts*n_drives));
-}
-
-function cluster_afr_pgs_ec_hosts({ n_hosts, n_drives, afr_drive, afr_host, capacity, speed, ec: [ ec_data, ec_parity ], pgs = 1, degraded_replacement })
-{
-    const ec_total = ec_data+ec_parity;
-    const otherhosts = Math.min(pgs, (n_hosts-1)/(ec_total-1));
-    pgs = Math.min(pgs, (n_hosts-1)*n_drives/(ec_total-1));
-    const pgh = Math.min(pgs*n_drives, (n_hosts-1)*n_drives/(ec_total-1));
-    const ld = capacity/(degraded_replacement ? 1 : pgs)/speed/86400/365;
-    const lh = n_drives*capacity/pgs/speed/86400/365;
-    const p1 = ((afr_drive+afr_host*pgs/otherhosts)*lh);
-    const p2 = ((afr_drive+afr_host*pgs/otherhosts)*ld);
-    return 1 - ((1 - afr_host * (1-(1-failure_rate_fullmesh(ec_total-1, p1, ec_parity))**pgh)) ** n_hosts) *
-        ((1 - afr_drive * (1-(1-failure_rate_fullmesh(ec_total-1, p2, ec_parity))**pgs)) ** (n_hosts*n_drives));
-}
-
-// Wrapper for 4 above functions
-function cluster_afr(config)
-{
-    if (config.ec && config.afr_host)
-    {
-        return cluster_afr_pgs_ec_hosts(config);
-    }
-    else if (config.ec)
-    {
-        return cluster_afr_pgs_ec(config);
-    }
-    else if (config.afr_host)
-    {
-        return cluster_afr_pgs_hosts(config);
-    }
-    else
-    {
-        return cluster_afr_pgs(config);
-    }
-}
-
-function print_cluster_afr(config)
-{
-    console.log(
-        `${config.n_hosts} nodes with ${config.n_drives} ${sprintf("%.1f", config.capacity/1000)}TB drives`+
-        `, capable to backfill at ${sprintf("%.1f", config.speed*1000)} MB/s, drive AFR ${sprintf("%.1f", config.afr_drive*100)}%`+
-        (config.afr_host ? `, host AFR ${sprintf("%.1f", config.afr_host*100)}%` : '')+
-        (config.ec ? `, EC ${config.ec[0]}+${config.ec[1]}` : `, ${config.replicas} replicas`)+
-        `, ${config.pgs||1} PG per OSD`+
-        (config.degraded_replacement ? `\n...and you don't let the rebalance finish before replacing drives` : '')
-    );
-    console.log('-> '+sprintf("%.7f%%", 100*cluster_afr(config))+'\n');
+function cluster_afr({ n_hosts, n_drives, afr_drive, afr_host, capacity, speed, ec, ec_data, ec_parity, replicas, pgs = 1, osd_rm, degraded_replacement, down_out_interval = 600 })
+{
+    const pg_size = (ec ? ec_data+ec_parity : replicas);
+    pgs = Math.min(pgs, (n_hosts-1)*n_drives/(pg_size-1));
+    const host_pgs = Math.min(pgs*n_drives, (n_hosts-1)*n_drives/(pg_size-1));
+    const resilver_disk = n_drives == 1 || osd_rm ? pgs : (n_drives-1);
+    const disk_heal_time = (down_out_interval + capacity/(degraded_replacement ? 1 : resilver_disk)/speed)/86400/365;
+    const host_heal_time = (down_out_interval + n_drives*capacity/pgs/speed)/86400/365;
+    const disk_heal_fail = ((afr_drive+afr_host/n_drives)*disk_heal_time);
+    const host_heal_fail = ((afr_drive+afr_host/n_drives)*host_heal_time);
+    const disk_pg_fail = ec
+        ? failure_rate_fullmesh(ec_data+ec_parity-1, disk_heal_fail, ec_parity)
+        : disk_heal_fail**(replicas-1);
+    const host_pg_fail = ec
+        ? failure_rate_fullmesh(ec_data+ec_parity-1, host_heal_fail, ec_parity)
+        : host_heal_fail**(replicas-1);
+    return 1 - ((1 - afr_drive * (1-(1-disk_pg_fail)**pgs)) ** (n_hosts*n_drives))
+        * ((1 - afr_host * (1-(1-host_pg_fail)**host_pgs)) ** n_hosts);
 }
 
 /******** UTILITY ********/
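One hand-traced case of the merged cluster_afr() shows how the pieces combine; afr_host is set to 0 here so only the drive term contributes, and the numbers are rounded:

const { cluster_afr } = require('./afr.js');
// 3 hosts with 1 x 4000 GB drive each, healing at 0.1 GB/s, 2 replicas, 3% drive AFR:
// pg_size = 2, pgs = min(1, (3-1)*1/1) = 1, resilver_disk = pgs = 1
// disk_heal_time = (600 + 4000/1/0.1)/86400/365 ≈ 0.00129 years
// disk_heal_fail = 0.03 * 0.00129 ≈ 3.9e-5 (the peer dies while healing)
// AFR ≈ 1 - (1 - 0.03 * 3.9e-5)^3 ≈ 3.5e-6, i.e. about 0.00035% per year
console.log(cluster_afr({ n_hosts: 3, n_drives: 1, afr_drive: 0.03, afr_host: 0,
    capacity: 4000, speed: 0.1, replicas: 2 }));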

mon/afr_test.js Normal file

@@ -0,0 +1,28 @@
const { sprintf } = require('sprintf-js');
const { cluster_afr } = require('./afr.js');
print_cluster_afr({ n_hosts: 4, n_drives: 6, afr_drive: 0.03, afr_host: 0.05, capacity: 4000, speed: 0.1, replicas: 2 });
print_cluster_afr({ n_hosts: 4, n_drives: 3, afr_drive: 0.03, afr_host: 0, capacity: 4000, speed: 0.1, replicas: 2 });
print_cluster_afr({ n_hosts: 4, n_drives: 3, afr_drive: 0.03, afr_host: 0.05, capacity: 4000, speed: 0.1, replicas: 2 });
print_cluster_afr({ n_hosts: 4, n_drives: 3, afr_drive: 0.03, afr_host: 0, capacity: 4000, speed: 0.1, ec: true, ec_data: 2, ec_parity: 1 });
print_cluster_afr({ n_hosts: 4, n_drives: 3, afr_drive: 0.03, afr_host: 0.05, capacity: 4000, speed: 0.1, ec: true, ec_data: 2, ec_parity: 1 });
print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0, capacity: 8000, speed: 0.02, replicas: 2 });
print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0.05, capacity: 8000, speed: 0.02, replicas: 2 });
print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0, capacity: 8000, speed: 0.02, replicas: 3 });
print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0.05, capacity: 8000, speed: 0.02, replicas: 3 });
print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0, capacity: 8000, speed: 0.02, replicas: 3, pgs: 100 });
print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0.05, capacity: 8000, speed: 0.02, replicas: 3, pgs: 100 });
print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0.05, capacity: 8000, speed: 0.02, replicas: 3, pgs: 100, degraded_replacement: 1 });
function print_cluster_afr(config)
{
console.log(
`${config.n_hosts} nodes with ${config.n_drives} ${sprintf("%.1f", config.capacity/1000)}TB drives`+
`, capable to backfill at ${sprintf("%.1f", config.speed*1000)} MB/s, drive AFR ${sprintf("%.1f", config.afr_drive*100)}%`+
(config.afr_host ? `, host AFR ${sprintf("%.1f", config.afr_host*100)}%` : '')+
(config.ec ? `, EC ${config.ec_data}+${config.ec_parity}` : `, ${config.replicas} replicas`)+
`, ${config.pgs||1} PG per OSD`+
(config.degraded_replacement ? `\n...and you don't let the rebalance finish before replacing drives` : '')
);
console.log('-> '+sprintf("%.7f%%", 100*cluster_afr(config))+'\n');
}


@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 (see README.md for details)
+// License: VNPL-1.1 (see README.md for details)
 
 // Data distribution optimizer using linear programming (lp_solve)
 
@@ -58,7 +58,7 @@ async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize =
     }
     const all_weights = Object.assign({}, ...Object.values(osd_tree));
     const total_weight = Object.values(all_weights).reduce((a, c) => Number(a) + Number(c), 0);
-    const all_pgs = Object.values(random_combinations(osd_tree, pg_size, max_combinations));
+    const all_pgs = Object.values(random_combinations(osd_tree, pg_size, max_combinations, parity_space > 1));
     const pg_per_osd = {};
     for (const pg of all_pgs)
     {
@@ -104,6 +104,17 @@ async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize =
     return res;
 }
 
+function shuffle(array)
+{
+    for (let i = array.length - 1, j, x; i > 0; i--)
+    {
+        j = Math.floor(Math.random() * (i + 1));
+        x = array[i];
+        array[i] = array[j];
+        array[j] = x;
+    }
+}
+
 function make_int_pgs(weights, pg_count)
 {
     const total_weight = Object.values(weights).reduce((a, c) => Number(a) + Number(c), 0);
@@ -120,6 +131,7 @@ function make_int_pgs(weights, pg_count)
         weight_left -= weights[pg_name];
         pg_left -= n;
     }
+    shuffle(int_pgs);
     return int_pgs;
 }
 
@@ -249,7 +261,7 @@ async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3,
         }
     }
     // Get all combinations
-    let all_pgs = random_combinations(osd_tree, pg_size, max_combinations);
+    let all_pgs = random_combinations(osd_tree, pg_size, max_combinations, parity_space > 1);
     add_valid_previous(osd_tree, prev_weights, all_pgs);
     all_pgs = Object.values(all_pgs);
     const pg_per_osd = {};
@@ -275,6 +287,11 @@ async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3,
     lp += 'max: '+all_pg_names.map(pg_name => (
         prev_weights[pg_name] ? `${pg_size+1}*add_${pg_name} - ${pg_size+1}*del_${pg_name}` : `${pg_size+1-move_weights[pg_name]}*${pg_name}`
     )).join(' + ')+';\n';
+    lp += all_pg_names
+        .map(pg_name => (prev_weights[pg_name] ? `add_${pg_name} - del_${pg_name}` : `${pg_name}`))
+        .join(' + ')+' = '+(pg_count
+        - Object.keys(prev_weights).reduce((a, old_pg_name) => (a + (all_pgs_hash[old_pg_name] ? prev_weights[old_pg_name] : 0)), 0)
+    )+';\n';
     for (const osd in pg_per_osd)
     {
         if (osd !== NO_OSD)
@@ -488,7 +505,8 @@ function extract_osds(osd_tree, levels, osd_level, osds = {})
     return osds;
 }
 
-function random_combinations(osd_tree, pg_size, count)
+// ordered = don't treat (x,y) and (y,x) as equal
+function random_combinations(osd_tree, pg_size, count, ordered)
 {
     let seed = 0x5f020e43;
     let rng = () =>
@@ -516,25 +534,47 @@ function random_combinations(osd_tree, pg_size, count)
             pg.push(osds[cur_hosts[next_host]][next_osd]);
             cur_hosts.splice(next_host, 1);
         }
-        while (pg.length < pg_size)
+        const cyclic_pgs = [ pg ];
+        if (ordered)
         {
-            pg.push(NO_OSD);
+            for (let i = 1; i < pg.size; i++)
+            {
+                cyclic_pgs.push([ ...pg.slice(i), ...pg.slice(0, i) ]);
+            }
+        }
+        for (const pg of cyclic_pgs)
+        {
+            while (pg.length < pg_size)
+            {
+                pg.push(NO_OSD);
+            }
+            r['pg_'+pg.join('_')] = pg;
         }
-        r['pg_'+pg.join('_')] = pg;
     }
     // Generate purely random combinations
-    restart: while (count > 0)
+    while (count > 0)
     {
         let host_idx = [];
-        for (let i = 0; i < pg_size && i < hosts.length; i++)
+        const cur_hosts = [ ...hosts.map((h, i) => i) ];
+        const max_hosts = pg_size < hosts.length ? pg_size : hosts.length;
+        if (ordered)
         {
-            let start = i > 0 ? host_idx[i-1]+1 : 0;
-            if (start >= hosts.length)
+            for (let i = 0; i < max_hosts; i++)
             {
-                continue restart;
+                const r = rng() % cur_hosts.length;
+                host_idx[i] = cur_hosts[r];
+                cur_hosts.splice(r, 1);
+            }
+        }
+        else
+        {
+            for (let i = 0; i < max_hosts; i++)
+            {
+                const r = rng() % (cur_hosts.length - (max_hosts - i - 1));
+                host_idx[i] = cur_hosts[r];
+                cur_hosts.splice(0, r+1);
             }
-            host_idx[i] = start + rng() % (hosts.length-start);
         }
         let pg = host_idx.map(h => osds[hosts[h]][rng() % osds[hosts[h]].length]);
         while (pg.length < pg_size)

mon/make-osd.sh Executable file

@@ -0,0 +1,75 @@
#!/bin/bash
# Very simple systemd unit generator for vitastor-osd services
# Not the final solution yet, mostly for tests
# Copyright (c) Vitaliy Filippov, 2019+
# License: MIT
# USAGE: ./make-osd.sh /dev/disk/by-partuuid/xxx [ /dev/disk/by-partuuid/yyy]...
IP_SUBSTR="10.200.1."
ETCD_HOSTS="etcd0=http://10.200.1.10:2380,etcd1=http://10.200.1.11:2380,etcd2=http://10.200.1.12:2380"
set -e -x
IP=`ip -json a s | jq -r '.[].addr_info[] | select(.local | startswith("'$IP_SUBSTR'")) | .local'`
[ "$IP" != "" ] || exit 1
ETCD_MON=$(echo $ETCD_HOSTS | perl -pe 's/:2380/:2379/g; s/etcd\d*=//g;')
D=`dirname $0`
# Create OSDs on all passed devices
OSD_NUM=1
for DEV in $*; do
# Ugly :) -> node.js rework pending
while true; do
ST=$(etcdctl --endpoints="$ETCD_MON" get --print-value-only /vitastor/osd/stats/$OSD_NUM)
if [ "$ST" = "" ]; then
break
fi
OSD_NUM=$((OSD_NUM+1))
done
etcdctl --endpoints="$ETCD_MON" put /vitastor/osd/stats/$OSD_NUM '{}'
echo Creating OSD $OSD_NUM on $DEV
OPT=`node $D/simple-offsets.js --device $DEV --format options | tr '\n' ' '`
META=`echo $OPT | grep -Po '(?<=data_offset )\d+'`
dd if=/dev/zero of=$DEV bs=1048576 count=$(((META+1048575)/1048576)) oflag=direct
cat >/etc/systemd/system/vitastor-osd$OSD_NUM.service <<EOF
[Unit]
Description=Vitastor object storage daemon osd.$OSD_NUM
After=network-online.target local-fs.target time-sync.target
Wants=network-online.target local-fs.target time-sync.target
PartOf=vitastor.target
[Service]
LimitNOFILE=1048576
LimitNPROC=1048576
LimitMEMLOCK=infinity
ExecStart=/usr/bin/vitastor-osd \\
--etcd_address $IP:2379/v3 \\
--bind_address $IP \\
--osd_num $OSD_NUM \\
--disable_data_fsync 1 \\
--immediate_commit all \\
--disk_alignment 4096 --journal_block_size 4096 --meta_block_size 4096 \\
--journal_no_same_sector_overwrites true \\
--journal_sector_buffer_count 1024 \\
$OPT
WorkingDirectory=/
ExecStartPre=+chown vitastor:vitastor $DEV
User=vitastor
PrivateTmp=false
TasksMax=infinity
Restart=always
StartLimitInterval=0
RestartSec=10
[Install]
WantedBy=vitastor.target
EOF
systemctl enable vitastor-osd$OSD_NUM
done

mon/make-units.sh Normal file → Executable file

@@ -1,19 +1,25 @@
 #!/bin/bash
-# Example startup script generator
-# Of course this isn't a production solution yet, this is just for tests
+# Very simple systemd unit generator for etcd & vitastor-mon services
+# Not the final solution yet, mostly for tests
 # Copyright (c) Vitaliy Filippov, 2019+
 # License: MIT
+# USAGE: ./make-units.sh
 
-IP=`ip -json a s | jq -r '.[].addr_info[] | select(.broadcast == "10.115.0.255") | .local'`
+IP_SUBSTR="10.200.1."
+ETCD_HOSTS="etcd0=http://10.200.1.10:2380,etcd1=http://10.200.1.11:2380,etcd2=http://10.200.1.12:2380"
+
+# determine IP
+IP=`ip -json a s | jq -r '.[].addr_info[] | select(.local | startswith("'$IP_SUBSTR'")) | .local'`
 [ "$IP" != "" ] || exit 1
+ETCD_NUM=${ETCD_HOSTS/$IP*/}
+[ "$ETCD_NUM" != "$ETCD_HOSTS" ] || exit 1
+ETCD_NUM=$(echo $ETCD_NUM | tr -d -c , | wc -c)
 
-BASE=${IP/*./}
-BASE=$((BASE-10))
-
+# etcd
 useradd etcd
-mkdir -p /var/lib/etcd$BASE.etcd
+mkdir -p /var/lib/etcd$ETCD_NUM.etcd
 cat >/etc/systemd/system/etcd.service <<EOF
 [Unit]
 Description=etcd for vitastor
@@ -22,19 +28,19 @@ Wants=network-online.target local-fs.target time-sync.target
 [Service]
 Restart=always
-ExecStart=/usr/local/bin/etcd -name etcd$BASE --data-dir /var/lib/etcd$BASE.etcd \\
+ExecStart=/usr/local/bin/etcd -name etcd$ETCD_NUM --data-dir /var/lib/etcd$ETCD_NUM.etcd \\
     --advertise-client-urls http://$IP:2379 --listen-client-urls http://$IP:2379 \\
     --initial-advertise-peer-urls http://$IP:2380 --listen-peer-urls http://$IP:2380 \\
-    --initial-cluster-token vitastor-etcd-1 --initial-cluster etcd0=http://10.115.0.10:2380,etcd1=http://10.115.0.11:2380,etcd2=http://10.115.0.12:2380,etcd3=http://10.115.0.13:2380 \\
-    --initial-cluster-state new --max-txn-ops=100000 --auto-compaction-retention=10 --auto-compaction-mode=revision
-WorkingDirectory=/var/lib/etcd$BASE.etcd
-ExecStartPre=+chown -R etcd /var/lib/etcd$BASE.etcd
+    --initial-cluster-token vitastor-etcd-1 --initial-cluster $ETCD_HOSTS \\
+    --initial-cluster-state new --max-txn-ops=100000 --max-request-bytes=104857600 \\
+    --auto-compaction-retention=10 --auto-compaction-mode=revision
+WorkingDirectory=/var/lib/etcd$ETCD_NUM.etcd
+ExecStartPre=+chown -R etcd /var/lib/etcd$ETCD_NUM.etcd
 User=etcd
 PrivateTmp=false
 TasksMax=infinity
 Restart=always
 StartLimitInterval=0
+StartLimitIntervalSec=0
 RestartSec=10
 
 [Install]
@@ -48,9 +54,7 @@ systemctl start etcd
 useradd vitastor
 chmod 755 /root
 
-BASE=${IP/*./}
-BASE=$(((BASE-10)*12))
-
+# Vitastor target
 cat >/etc/systemd/system/vitastor.target <<EOF
 [Unit]
 Description=vitastor target
@@ -58,116 +62,25 @@ Description=vitastor target
 WantedBy=multi-user.target
 EOF
 
-i=1
-for DEV in `ls /dev/disk/by-id/ | grep ata-INTEL_SSDSC2KB`; do
-dd if=/dev/zero of=/dev/disk/by-id/$DEV bs=1048576 count=$(((427814912+1048575)/1048576+2))
-dd if=/dev/zero of=/dev/disk/by-id/$DEV bs=1048576 count=$(((427814912+1048575)/1048576+2)) seek=$((1920377991168/1048576))
-cat >/etc/systemd/system/vitastor-osd$((BASE+i)).service <<EOF
+# Monitor unit
+ETCD_MON=$(echo $ETCD_HOSTS | perl -pe 's/:2380/:2379/g; s/etcd\d*=//g;')
+cat >/etc/systemd/system/vitastor-mon.service <<EOF
 [Unit]
-Description=Vitastor object storage daemon osd.$((BASE+i))
+Description=Vitastor monitor
 After=network-online.target local-fs.target time-sync.target
 Wants=network-online.target local-fs.target time-sync.target
-PartOf=vitastor.target
[Service] [Service]
LimitNOFILE=1048576 Restart=always
LimitNPROC=1048576 ExecStart=node /usr/lib/vitastor/mon/mon-main.js --etcd_url '$ETCD_MON' --etcd_prefix '/vitastor' --etcd_start_timeout 5
LimitMEMLOCK=infinity WorkingDirectory=/
ExecStart=/root/vitastor/osd \\
--etcd_address $IP:2379/v3 \\
--bind_address $IP \\
--osd_num $((BASE+i)) \\
--disable_data_fsync 1 \\
--disable_device_lock 1 \\
--immediate_commit all \\
--flusher_count 8 \\
--disk_alignment 4096 --journal_block_size 4096 --meta_block_size 4096 \\
--journal_no_same_sector_overwrites true \\
--journal_sector_buffer_count 1024 \\
--journal_offset 0 \\
--meta_offset 16777216 \\
--data_offset 427814912 \\
--data_size $((1920377991168-427814912)) \\
--data_device /dev/disk/by-id/$DEV
WorkingDirectory=/root/vitastor
ExecStartPre=+chown vitastor:vitastor /dev/disk/by-id/$DEV
User=vitastor User=vitastor
PrivateTmp=false PrivateTmp=false
TasksMax=infinity TasksMax=infinity
Restart=always Restart=always
StartLimitInterval=0 StartLimitInterval=0
StartLimitIntervalSec=0
RestartSec=10 RestartSec=10
[Install] [Install]
WantedBy=vitastor.target WantedBy=vitastor.target
EOF EOF
systemctl enable vitastor-osd$((BASE+i))
i=$((i+1))
cat >/etc/systemd/system/vitastor-osd$((BASE+i)).service <<EOF
[Unit]
Description=Vitastor object storage daemon osd.$((BASE+i))
After=network-online.target local-fs.target time-sync.target
Wants=network-online.target local-fs.target time-sync.target
PartOf=vitastor.target
[Service]
LimitNOFILE=1048576
LimitNPROC=1048576
LimitMEMLOCK=infinity
ExecStart=/root/vitastor/osd \\
--etcd_address $IP:2379/v3 \\
--bind_address $IP \\
--osd_num $((BASE+i)) \\
--disable_data_fsync 1 \\
--immediate_commit all \\
--flusher_count 8 \\
--disk_alignment 4096 --journal_block_size 4096 --meta_block_size 4096 \\
--journal_no_same_sector_overwrites true \\
--journal_sector_buffer_count 1024 \\
--journal_offset 1920377991168 \\
--meta_offset $((1920377991168+16777216)) \\
--data_offset $((1920377991168+427814912)) \\
--data_size $((1920377991168-427814912)) \\
--data_device /dev/disk/by-id/$DEV
WorkingDirectory=/root/vitastor
ExecStartPre=+chown vitastor:vitastor /dev/disk/by-id/$DEV
User=vitastor
PrivateTmp=false
TasksMax=infinity
Restart=always
StartLimitInterval=0
StartLimitIntervalSec=0
RestartSec=10
[Install]
WantedBy=vitastor.target
EOF
systemctl enable vitastor-osd$((BASE+i))
i=$((i+1))
done
exit
node mon-main.js --etcd_url 'http://10.115.0.10:2379,http://10.115.0.11:2379,http://10.115.0.12:2379,http://10.115.0.13:2379' --etcd_prefix '/vitastor' --etcd_start_timeout 5
podman run -d --network host --restart always -v /var/lib/etcd0.etcd:/etcd0.etcd --name etcd quay.io/coreos/etcd:v3.4.13 etcd -name etcd0 \
-advertise-client-urls http://10.115.0.10:2379 -listen-client-urls http://10.115.0.10:2379 \
-initial-advertise-peer-urls http://10.115.0.10:2380 -listen-peer-urls http://10.115.0.10:2380 \
-initial-cluster-token vitastor-etcd-1 -initial-cluster etcd0=http://10.115.0.10:2380,etcd1=http://10.115.0.11:2380,etcd2=http://10.115.0.12:2380,etcd3=http://10.115.0.13:2380 \
-initial-cluster-state new --max-txn-ops=100000 --auto-compaction-retention=10 --auto-compaction-mode=revision
etcdctl --endpoints http://10.115.0.10:2379 put /vitastor/config/global '{"immediate_commit":"all"}'
etcdctl --endpoints http://10.115.0.10:2379 put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":48,"failure_domain":"host"}}'
#let pgs = {};
#for (let n = 0; n < 48; n++) { let i = n/2 | 0; pgs[1+n] = { osd_set: [ (1+i%12+(i/12 | 0)*24), (1+12+i%12+(i/12 | 0)*24) ], primary: (1+(n%2)*12+i%12+(i/12 | 0)*24) }; };
#console.log(JSON.stringify({ items: { 1: pgs } }));
#etcdctl --endpoints http://10.115.0.10:2379 put /vitastor/config/pgs ...
# --disk_alignment 4096 --journal_block_size 4096 --meta_block_size 4096 \\
# --data_offset 427814912 \\
# --disk_alignment 4096 --journal_block_size 512 --meta_block_size 512 \\
# --data_offset 433434624 \\
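One non-obvious step in the new make-units.sh above: ETCD_NUM=${ETCD_HOSTS/$IP*/} truncates the host list at the first occurrence of this machine's IP, and the tr -d -c , | wc -c pipeline counts the commas in that prefix, which equals the node's zero-based index in the cluster. A rough JavaScript restatement, reusing the sample values from the script (illustrative only, not part of the repo):

// Re-statement of the bash index derivation; not part of the repo
const ETCD_HOSTS = 'etcd0=http://10.200.1.10:2380,etcd1=http://10.200.1.11:2380,etcd2=http://10.200.1.12:2380';
const IP = '10.200.1.11';
const prefix = ETCD_HOSTS.split(IP)[0];              // everything before this host's IP
const etcd_num = (prefix.match(/,/g) || []).length;  // commas before it = node index
console.log(etcd_num); // 1 -> this machine runs etcd1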

mon/mon-main.js

@@ -1,7 +1,7 @@
#!/usr/bin/node #!/usr/bin/node
// Copyright (c) Vitaliy Filippov, 2019+ // Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.0 (see README.md for details) // License: VNPL-1.1 (see README.md for details)
const Mon = require('./mon.js'); const Mon = require('./mon.js');

mon/mon.js

@@ -1,5 +1,5 @@
// Copyright (c) Vitaliy Filippov, 2019+ // Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.0 (see README.md for details) // License: VNPL-1.1 (see README.md for details)
const http = require('http'); const http = require('http');
const crypto = require('crypto'); const crypto = require('crypto');
@@ -10,6 +10,14 @@ const stableStringify = require('./stable-stringify.js');
const PGUtil = require('./PGUtil.js'); const PGUtil = require('./PGUtil.js');
// FIXME document all etcd keys and config variables in the form of JSON schema or similar // FIXME document all etcd keys and config variables in the form of JSON schema or similar
const etcd_nonempty_keys = {
'config/global': 1,
'config/node_placement': 1,
'config/pools': 1,
'config/pgs': 1,
'history/last_clean_pgs': 1,
'stats': 1,
};
const etcd_allow = new RegExp('^'+[ const etcd_allow = new RegExp('^'+[
'config/global', 'config/global',
'config/node_placement', 'config/node_placement',
@@ -22,6 +30,7 @@ const etcd_allow = new RegExp('^'+[
'pg/state/[1-9]\\d*/[1-9]\\d*', 'pg/state/[1-9]\\d*/[1-9]\\d*',
'pg/stats/[1-9]\\d*/[1-9]\\d*', 'pg/stats/[1-9]\\d*/[1-9]\\d*',
'pg/history/[1-9]\\d*/[1-9]\\d*', 'pg/history/[1-9]\\d*/[1-9]\\d*',
'history/last_clean_pgs',
'stats', 'stats',
].join('$|^')+'$'); ].join('$|^')+'$');
@@ -34,7 +43,7 @@ const etcd_tree = {
etcd_mon_retries: 5, // min: 0 etcd_mon_retries: 5, // min: 0
mon_change_timeout: 1000, // ms. min: 100 mon_change_timeout: 1000, // ms. min: 100
mon_stats_timeout: 1000, // ms. min: 100 mon_stats_timeout: 1000, // ms. min: 100
osd_out_time: 1800, // seconds. min: 0 osd_out_time: 600, // seconds. min: 0
placement_levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... }, placement_levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... },
// client and osd // client and osd
use_sync_send_recv: false, use_sync_send_recv: false,
@@ -46,6 +55,8 @@ const etcd_tree = {
client_dirty_limit: 33554432, client_dirty_limit: 33554432,
peer_connect_interval: 5, // seconds. min: 1 peer_connect_interval: 5, // seconds. min: 1
peer_connect_timeout: 5, // seconds. min: 1 peer_connect_timeout: 5, // seconds. min: 1
osd_idle_timeout: 5, // seconds. min: 1
osd_ping_timeout: 5, // seconds. min: 1
up_wait_retry_interval: 500, // ms. min: 50 up_wait_retry_interval: 500, // ms. min: 50
// osd // osd
etcd_report_interval: 30, // min: 10 etcd_report_interval: 30, // min: 10
@@ -55,8 +66,12 @@ const etcd_tree = {
autosync_interval: 5, autosync_interval: 5,
client_queue_depth: 128, // unused client_queue_depth: 128, // unused
recovery_queue_depth: 4, recovery_queue_depth: 4,
recovery_sync_batch: 16,
readonly: false, readonly: false,
no_recovery: false,
no_rebalance: false,
print_stats_interval: 3, print_stats_interval: 3,
slow_log_interval: 10,
// blockstore - fixed in superblock // blockstore - fixed in superblock
block_size, block_size,
disk_alignment, disk_alignment,
@@ -76,7 +91,9 @@ const etcd_tree = {
disable_meta_fsync, disable_meta_fsync,
disable_device_lock, disable_device_lock,
// blockstore - configurable // blockstore - configurable
flusher_count, max_write_iodepth,
min_flusher_count: 1,
max_flusher_count: 256,
inmemory_metadata, inmemory_metadata,
inmemory_journal, inmemory_journal,
journal_sector_buffer_count, journal_sector_buffer_count,
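The etcd_tree comment gains several tunables in this range: min_flusher_count and max_flusher_count in place of a single flusher_count, osd_idle_timeout and osd_ping_timeout for peer health checks, recovery_sync_batch, no_recovery, no_rebalance and slow_log_interval. As a sketch only (the key names come from the tree above, and the values simply repeat its listed defaults), a /vitastor/config/global document using them could look like:

{
    "osd_out_time": 600,
    "min_flusher_count": 1,
    "max_flusher_count": 256,
    "osd_idle_timeout": 5,
    "osd_ping_timeout": 5,
    "recovery_sync_batch": 16,
    "no_recovery": false,
    "no_rebalance": false,
    "slow_log_interval": 10
}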
@@ -166,7 +183,7 @@ const etcd_tree = {
/* <pool_id>: { /* <pool_id>: {
<pg_id>: { <pg_id>: {
primary: osd_num_t, primary: osd_num_t,
state: ("starting"|"peering"|"incomplete"|"active"|"stopping"|"offline"| state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
"degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"| "degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
"has_invalid"|"left_on_dead")[], "has_invalid"|"left_on_dead")[],
} }
@@ -213,6 +230,9 @@ const etcd_tree = {
incomplete: uint64_t, incomplete: uint64_t,
}, */ }, */
}, },
history: {
last_clean_pgs: {},
},
}; };
// FIXME Split into several files // FIXME Split into several files
@@ -253,7 +273,10 @@ class Mon
const res = await this.etcd_call('/kv/txn', { success: [ const res = await this.etcd_call('/kv/txn', { success: [
{ requestRange: { key: b64(this.etcd_prefix+'/config/global') } } { requestRange: { key: b64(this.etcd_prefix+'/config/global') } }
] }, this.etcd_start_timeout, -1); ] }, this.etcd_start_timeout, -1);
this.parse_kv(res.responses[0].response_range.kvs[0]); if (res.responses[0].response_range.kvs)
{
this.parse_kv(res.responses[0].response_range.kvs[0]);
}
this.check_config(); this.check_config();
} }
@@ -288,7 +311,7 @@ class Mon
this.config.osd_out_time = Number(this.config.osd_out_time) || 0; this.config.osd_out_time = Number(this.config.osd_out_time) || 0;
if (!this.config.osd_out_time) if (!this.config.osd_out_time)
{ {
this.config.osd_out_time = 30*60; // 30 minutes by default this.config.osd_out_time = 600; // 10 minutes by default
} }
} }
@@ -310,8 +333,14 @@ class Mon
ok(false); ok(false);
}, this.config.etcd_mon_timeout); }, this.config.etcd_mon_timeout);
this.ws = new WebSocket(base+'/watch'); this.ws = new WebSocket(base+'/watch');
const fail = () =>
{
ok(false);
};
this.ws.on('error', fail);
this.ws.on('open', () => this.ws.on('open', () =>
{ {
this.ws.removeListener('error', fail);
if (timer_id) if (timer_id)
clearTimeout(timer_id); clearTimeout(timer_id);
ok(true); ok(true);
@@ -356,7 +385,7 @@ class Mon
} }
else else
{ {
let stats_changed = false, changed = false; let stats_changed = false, changed = false, pg_states_changed = false;
if (this.verbose) if (this.verbose)
{ {
console.log('Revision '+data.result.header.revision+' events: '); console.log('Revision '+data.result.header.revision+' events: ');
@@ -370,15 +399,23 @@ class Mon
{ {
stats_changed = true; stats_changed = true;
} }
else if (key.substr(0, 10) == '/pg/state/')
{
pg_states_changed = true;
}
else if (key != '/stats') else if (key != '/stats')
{ {
changed = true; changed = true;
} }
if (this.verbose) if (this.verbose)
{ {
console.log(e); console.log(JSON.stringify(e));
} }
} }
if (pg_states_changed)
{
this.save_last_clean().catch(console.error);
}
if (stats_changed) if (stats_changed)
{ {
this.schedule_update_stats(); this.schedule_update_stats();
@@ -391,10 +428,46 @@ class Mon
}); });
} }
async save_last_clean()
{
// last_clean_pgs is used to avoid extra data move when observing a series of changes in the cluster
for (const pool_id in this.state.config.pools)
{
const pool_cfg = this.state.config.pools[pool_id];
if (!this.validate_pool_cfg(pool_id, pool_cfg, false))
{
continue;
}
for (let pg_num = 1; pg_num <= pool_cfg.pg_count; pg_num++)
{
if (!this.state.pg.state[pool_id] ||
!this.state.pg.state[pool_id][pg_num] ||
!(this.state.pg.state[pool_id][pg_num].state instanceof Array))
{
// Unclean
return;
}
let st = this.state.pg.state[pool_id][pg_num].state.join(',');
if (st != 'active' && st != 'active,left_on_dead' && st != 'left_on_dead,active')
{
// Unclean
return;
}
}
}
this.state.history.last_clean_pgs = JSON.parse(JSON.stringify(this.state.config.pgs));
await this.etcd_call('/kv/txn', {
success: [ { requestPut: {
key: b64(this.etcd_prefix+'/history/last_clean_pgs'),
value: b64(JSON.stringify(this.state.history.last_clean_pgs))
} } ],
}, this.etcd_start_timeout, 0);
}
async get_lease() async get_lease()
{ {
const max_ttl = this.config.etcd_mon_ttl + this.config.etcd_mon_timeout/1000*this.config.etcd_mon_retries; const max_ttl = this.config.etcd_mon_ttl + this.config.etcd_mon_timeout/1000*this.config.etcd_mon_retries;
const res = await this.etcd_call('/lease/grant', { TTL: max_ttl }, this.config.etcd_mon_timeout, this.config.etcd_mon_retries); const res = await this.etcd_call('/lease/grant', { TTL: max_ttl }, this.config.etcd_mon_timeout, -1);
this.etcd_lease_id = res.ID; this.etcd_lease_id = res.ID;
setInterval(async () => setInterval(async () =>
{ {
@@ -469,7 +542,7 @@ class Mon
for (const osd_num of this.all_osds().sort((a, b) => a - b)) for (const osd_num of this.all_osds().sort((a, b) => a - b))
{ {
const stat = this.state.osd.stats[osd_num]; const stat = this.state.osd.stats[osd_num];
if (stat.size && (this.state.osd.state[osd_num] || Number(stat.time) >= down_time)) if (stat && stat.size && (this.state.osd.state[osd_num] || Number(stat.time) >= down_time))
{ {
// Numeric IDs are reserved for OSDs // Numeric IDs are reserved for OSDs
const osd_cfg = this.state.config.osd[osd_num]; const osd_cfg = this.state.config.osd[osd_num];
@@ -570,34 +643,61 @@ class Mon
return !has_online; return !has_online;
} }
reset_rng()
{
this.seed = 0x5f020e43;
}
rng()
{
this.seed ^= this.seed << 13;
this.seed ^= this.seed >> 17;
this.seed ^= this.seed << 5;
return this.seed + 2147483648;
}
pick_primary(pool_id, osd_set, up_osds)
{
let alive_set;
if (this.state.config.pools[pool_id].scheme === 'replicated')
alive_set = osd_set.filter(osd_num => osd_num && up_osds[osd_num]);
else
{
// Prefer data OSDs for EC because they can actually read something without an additional network hop
const pg_data_size = (this.state.config.pools[pool_id].pg_size||0) -
(this.state.config.pools[pool_id].parity_chunks||0);
alive_set = osd_set.slice(0, pg_data_size).filter(osd_num => osd_num && up_osds[osd_num]);
if (!alive_set.length)
alive_set = osd_set.filter(osd_num => osd_num && up_osds[osd_num]);
}
if (!alive_set.length)
return 0;
return alive_set[this.rng() % alive_set.length];
}
save_new_pgs_txn(request, pool_id, up_osds, prev_pgs, new_pgs, pg_history) save_new_pgs_txn(request, pool_id, up_osds, prev_pgs, new_pgs, pg_history)
{ {
const replicated = new_pgs.length && this.state.config.pools[pool_id].scheme === 'replicated';
const pg_minsize = new_pgs.length && this.state.config.pools[pool_id].pg_minsize;
const pg_items = {}; const pg_items = {};
this.reset_rng();
new_pgs.map((osd_set, i) => new_pgs.map((osd_set, i) =>
{ {
osd_set = osd_set.map(osd_num => osd_num === LPOptimizer.NO_OSD ? 0 : osd_num); osd_set = osd_set.map(osd_num => osd_num === LPOptimizer.NO_OSD ? 0 : osd_num);
let alive_set;
if (replicated)
alive_set = osd_set.filter(osd_num => osd_num && up_osds[osd_num]);
else
{
// Prefer data OSDs for EC because they can actually read something without an additional network hop
alive_set = osd_set.slice(0, pg_minsize).filter(osd_num => osd_num && up_osds[osd_num]);
if (!alive_set.length)
alive_set = osd_set.filter(osd_num => osd_num && up_osds[osd_num]);
}
pg_items[i+1] = { pg_items[i+1] = {
osd_set, osd_set,
primary: alive_set.length ? alive_set[Math.floor(Math.random()*alive_set.length)] : 0, primary: this.pick_primary(pool_id, osd_set, up_osds),
}; };
if (prev_pgs[i] && prev_pgs[i].join(' ') != osd_set.join(' ')) if (prev_pgs[i] && prev_pgs[i].join(' ') != osd_set.join(' ') &&
prev_pgs[i].filter(osd_num => osd_num).length > 0)
{ {
pg_history[i] = pg_history[i] || {}; pg_history[i] = pg_history[i] || {};
pg_history[i].osd_sets = pg_history[i].osd_sets || []; pg_history[i].osd_sets = pg_history[i].osd_sets || [];
pg_history[i].osd_sets.push(prev_pgs[i]); pg_history[i].osd_sets.push(prev_pgs[i]);
} }
if (pg_history[i] && pg_history[i].osd_sets)
{
pg_history[i].osd_sets = Object.values(pg_history[i].osd_sets
.reduce((a, c) => { a[c.join(' ')] = c; return a; }, {}));
}
}); });
for (let i = 0; i < new_pgs.length || i < prev_pgs.length; i++) for (let i = 0; i < new_pgs.length || i < prev_pgs.length; i++)
{ {
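A note on the reset_rng()/rng() pair introduced above: primary selection previously used Math.random(), so every recheck could pick different primaries; the new code reseeds a 32-bit xorshift generator with a fixed constant before each pass over the PGs, so the chosen primaries are a pure function of the cluster state and stay stable across recalculations. A minimal self-contained sketch of the idea (make_rng is an illustrative name, not from the repo):

// Deterministic xorshift32, reseeded per pass so the same input state
// always yields the same "random" choices
function make_rng(seed = 0x5f020e43)
{
    return () =>
    {
        seed ^= seed << 13;
        seed ^= seed >> 17;
        seed ^= seed << 5;
        return seed + 2147483648;
    };
}

const a = make_rng(), b = make_rng();
console.log(a() === b()); // true: two passes make identical choices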
@@ -748,7 +848,7 @@ class Mon
{ {
// Take configuration and state, check it against the stored configuration hash // Take configuration and state, check it against the stored configuration hash
// Recalculate PGs and save them to etcd if the configuration is changed // Recalculate PGs and save them to etcd if the configuration is changed
// FIXME: Also do not change anything if the distribution is good enough and no PGs are degraded // FIXME: Do not change anything if the distribution is good and random enough and no PGs are degraded
const { up_osds, levels, osd_tree } = this.get_osd_tree(); const { up_osds, levels, osd_tree } = this.get_osd_tree();
const tree_cfg = { const tree_cfg = {
osd_tree, osd_tree,
@@ -788,13 +888,33 @@ class Mon
pool_tree = pool_tree ? pool_tree.children : []; pool_tree = pool_tree ? pool_tree.children : [];
pool_tree = LPOptimizer.flatten_tree(pool_tree, levels, pool_cfg.failure_domain, 'osd'); pool_tree = LPOptimizer.flatten_tree(pool_tree, levels, pool_cfg.failure_domain, 'osd');
this.filter_osds_by_tags(osd_tree, pool_tree, pool_cfg.osd_tags); this.filter_osds_by_tags(osd_tree, pool_tree, pool_cfg.osd_tags);
const prev_pgs = []; // These are for the purpose of building history.osd_sets
for (const pg in ((this.state.config.pgs.items||{})[pool_id]||{})||{}) const real_prev_pgs = [];
let pg_history = [];
for (const pg in ((this.state.config.pgs.items||{})[pool_id]||{}))
{ {
prev_pgs[pg-1] = this.state.config.pgs.items[pool_id][pg].osd_set; real_prev_pgs[pg-1] = this.state.config.pgs.items[pool_id][pg].osd_set;
if (this.state.pg.history[pool_id] &&
this.state.pg.history[pool_id][pg])
{
pg_history[pg-1] = this.state.pg.history[pool_id][pg];
}
} }
const pg_history = []; // And these are for the purpose of minimizing data movement
const old_pg_count = prev_pgs.length; let prev_pgs = [];
for (const pg in ((this.state.history.last_clean_pgs.items||{})[pool_id]||{}))
{
prev_pgs[pg-1] = this.state.history.last_clean_pgs.items[pool_id][pg].osd_set;
}
prev_pgs = JSON.parse(JSON.stringify(prev_pgs.length ? prev_pgs : real_prev_pgs));
const old_pg_count = real_prev_pgs.length;
const optimize_cfg = {
osd_tree: pool_tree,
pg_count: pool_cfg.pg_count,
pg_size: pool_cfg.pg_size,
pg_minsize: pool_cfg.pg_minsize,
max_combinations: pool_cfg.max_osd_combinations,
};
let optimize_result; let optimize_result;
if (old_pg_count > 0) if (old_pg_count > 0)
{ {
@@ -806,25 +926,37 @@ class Mon
this.schedule_recheck(); this.schedule_recheck();
return; return;
} }
PGUtil.scale_pg_count(prev_pgs, this.state.pg.history[pool_id]||{}, pg_history, pool_cfg.pg_count); const new_pg_history = [];
PGUtil.scale_pg_count(prev_pgs, pg_history, new_pg_history, pool_cfg.pg_count);
pg_history = new_pg_history;
}
for (const pg of prev_pgs)
{
while (pg.length < pool_cfg.pg_size)
{
pg.push(0);
}
while (pg.length > pool_cfg.pg_size)
{
pg.pop();
}
}
if (!this.state.config.pgs.hash)
{
// Re-shuffle PGs
optimize_result = await LPOptimizer.optimize_initial(optimize_cfg);
}
else
{
optimize_result = await LPOptimizer.optimize_change({
prev_pgs,
...optimize_cfg,
});
} }
optimize_result = await LPOptimizer.optimize_change({
prev_pgs,
osd_tree: pool_tree,
pg_size: pool_cfg.pg_size,
pg_minsize: pool_cfg.pg_minsize,
max_combinations: pool_cfg.max_osd_combinations,
});
} }
else else
{ {
optimize_result = await LPOptimizer.optimize_initial({ optimize_result = await LPOptimizer.optimize_initial(optimize_cfg);
osd_tree: pool_tree,
pg_count: pool_cfg.pg_count,
pg_size: pool_cfg.pg_size,
pg_minsize: pool_cfg.pg_minsize,
max_combinations: pool_cfg.max_osd_combinations,
});
} }
if (old_pg_count != optimize_result.int_pgs.length) if (old_pg_count != optimize_result.int_pgs.length)
{ {
@@ -832,16 +964,21 @@ class Mon
`PG count for pool ${pool_id} (${pool_cfg.name || 'unnamed'})`+ `PG count for pool ${pool_id} (${pool_cfg.name || 'unnamed'})`+
` changed from: ${old_pg_count} to ${optimize_result.int_pgs.length}` ` changed from: ${old_pg_count} to ${optimize_result.int_pgs.length}`
); );
// Drop stats
etcd_request.success.push({ requestDeleteRange: {
key: b64(this.etcd_prefix+'/pg/stats/'+pool_id+'/'),
range_end: b64(this.etcd_prefix+'/pg/stats/'+pool_id+'0'),
} });
} }
LPOptimizer.print_change_stats(optimize_result); LPOptimizer.print_change_stats(optimize_result);
this.save_new_pgs_txn(etcd_request, pool_id, up_osds, prev_pgs, optimize_result.int_pgs, pg_history); this.save_new_pgs_txn(etcd_request, pool_id, up_osds, real_prev_pgs, optimize_result.int_pgs, pg_history);
} }
this.state.config.pgs.hash = tree_hash; this.state.config.pgs.hash = tree_hash;
await this.save_pg_config(); await this.save_pg_config(etcd_request);
} }
else else
{ {
// Nothing changed, but we still want to check for down OSDs // Nothing changed, but we still want to recheck the distribution of primaries
let changed = false; let changed = false;
for (const pool_id in this.state.config.pools) for (const pool_id in this.state.config.pools)
{ {
@@ -851,22 +988,13 @@ class Mon
continue; continue;
} }
const replicated = pool_cfg.scheme === 'replicated'; const replicated = pool_cfg.scheme === 'replicated';
for (const pg_num in ((this.state.config.pgs.items||{})[pool_id]||{})||{}) this.reset_rng();
for (let pg_num = 1; pg_num <= pool_cfg.pg_count; pg_num++)
{ {
const pg_cfg = this.state.config.pgs.items[pool_id][pg_num]; const pg_cfg = this.state.config.pgs.items[pool_id][pg_num];
if (!Number(pg_cfg.primary) || !up_osds[pg_cfg.primary]) if (pg_cfg)
{ {
let alive_set; const new_primary = this.pick_primary(pool_id, pg_cfg.osd_set, up_osds);
if (replicated)
alive_set = pg_cfg.osd_set.filter(osd_num => osd_num && up_osds[osd_num]);
else
{
// Prefer data OSDs for EC because they can actually read something without an additional network hop
alive_set = pg_cfg.osd_set.slice(0, pool_cfg.pg_minsize).filter(osd_num => osd_num && up_osds[osd_num]);
if (!alive_set.length)
alive_set = pg_cfg.osd_set.filter(osd_num => osd_num && up_osds[osd_num]);
}
const new_primary = alive_set.length ? alive_set[Math.floor(Math.random()*alive_set.length)] : 0;
if (pg_cfg.primary != new_primary) if (pg_cfg.primary != new_primary)
{ {
console.log( console.log(
@@ -956,25 +1084,25 @@ class Mon
const op_stats = {}, subop_stats = {}, recovery_stats = {}; const op_stats = {}, subop_stats = {}, recovery_stats = {};
for (const osd in this.state.osd.stats) for (const osd in this.state.osd.stats)
{ {
const st = this.state.osd.stats[osd]; const st = this.state.osd.stats[osd]||{};
for (const op in st.op_stats||{}) for (const op in st.op_stats||{})
{ {
op_stats[op] = op_stats[op] || { count: 0n, usec: 0n, bytes: 0n }; op_stats[op] = op_stats[op] || { count: 0n, usec: 0n, bytes: 0n };
op_stats[op].count += BigInt(st.op_stats.count||0); op_stats[op].count += BigInt(st.op_stats[op].count||0);
op_stats[op].usec += BigInt(st.op_stats.usec||0); op_stats[op].usec += BigInt(st.op_stats[op].usec||0);
op_stats[op].bytes += BigInt(st.op_stats.bytes||0); op_stats[op].bytes += BigInt(st.op_stats[op].bytes||0);
} }
for (const op in st.subop_stats||{}) for (const op in st.subop_stats||{})
{ {
subop_stats[op] = subop_stats[op] || { count: 0n, usec: 0n }; subop_stats[op] = subop_stats[op] || { count: 0n, usec: 0n };
subop_stats[op].count += BigInt(st.subop_stats.count||0); subop_stats[op].count += BigInt(st.subop_stats[op].count||0);
subop_stats[op].usec += BigInt(st.subop_stats.usec||0); subop_stats[op].usec += BigInt(st.subop_stats[op].usec||0);
} }
for (const op in st.recovery_stats||{}) for (const op in st.recovery_stats||{})
{ {
recovery_stats[op] = recovery_stats[op] || { count: 0n, bytes: 0n }; recovery_stats[op] = recovery_stats[op] || { count: 0n, bytes: 0n };
recovery_stats[op].count += BigInt(st.recovery_stats.count||0); recovery_stats[op].count += BigInt(st.recovery_stats[op].count||0);
recovery_stats[op].bytes += BigInt(st.recovery_stats.bytes||0); recovery_stats[op].bytes += BigInt(st.recovery_stats[op].bytes||0);
} }
} }
for (const op in op_stats) for (const op in op_stats)
@@ -1029,11 +1157,14 @@ class Mon
for (const pg_num in this.state.pg.stats[pool_id]) for (const pg_num in this.state.pg.stats[pool_id])
{ {
const st = this.state.pg.stats[pool_id][pg_num]; const st = this.state.pg.stats[pool_id][pg_num];
for (const k in object_counts) if (st)
{ {
if (st[k+'_count']) for (const k in object_counts)
{ {
object_counts[k] += BigInt(st[k+'_count']); if (st[k+'_count'])
{
object_counts[k] += BigInt(st[k+'_count']);
}
} }
} }
} }
@@ -1108,16 +1239,20 @@ class Mon
console.log('Bad value in etcd: '+kv.key+' = '+kv.value); console.log('Bad value in etcd: '+kv.key+' = '+kv.value);
return; return;
} }
key = key.split('/'); let key_parts = key.split('/');
let cur = this.state; let cur = this.state;
for (let i = 0; i < key.length-1; i++) for (let i = 0; i < key_parts.length-1; i++)
{ {
cur = (cur[key[i]] = cur[key[i]] || {}); cur = (cur[key_parts[i]] = cur[key_parts[i]] || {});
} }
cur[key[key.length-1]] = kv.value; if (etcd_nonempty_keys[key])
if (key.join('/') === 'config/global') {
// Do not clear these to null
kv.value = kv.value || {};
}
cur[key_parts[key_parts.length-1]] = kv.value;
if (key === 'config/global')
{ {
this.state.config.global = this.state.config.global || {};
this.config = this.state.config.global; this.config = this.state.config.global;
this.check_config(); this.check_config();
for (const osd_num in this.state.osd.stats) for (const osd_num in this.state.osd.stats)
@@ -1128,7 +1263,7 @@ class Mon
); );
} }
} }
else if (key.join('/') === 'config/pools') else if (key === 'config/pools')
{ {
for (const pool_id in this.state.config.pools) for (const pool_id in this.state.config.pools)
{ {
@@ -1137,7 +1272,7 @@ class Mon
this.validate_pool_cfg(pool_id, pool_cfg, true); this.validate_pool_cfg(pool_id, pool_cfg, true);
} }
} }
else if (key[0] === 'osd' && key[1] === 'stats') else if (key_parts[0] === 'osd' && key_parts[1] === 'stats')
{ {
// Recheck PGs <osd_out_time> later // Recheck PGs <osd_out_time> later
this.schedule_next_recheck_at( this.schedule_next_recheck_at(
@@ -1169,6 +1304,11 @@ class Mon
console.error('etcd returned error: '+res.json.error); console.error('etcd returned error: '+res.json.error);
break; break;
} }
if (this.etcd_urls.length > 1)
{
// Stick to the same etcd for the rest of calls
this.etcd_urls = [ base ];
}
return res.json; return res.json;
} }
retry++; retry++;

mon/simple-offsets.js

@@ -4,6 +4,7 @@
// Simple tool to calculate journal and metadata offsets for a single device // Simple tool to calculate journal and metadata offsets for a single device
// Will be replaced by smarter tools in the future // Will be replaced by smarter tools in the future
const fs = require('fs').promises;
const child_process = require('child_process'); const child_process = require('child_process');
async function run() async function run()
@@ -15,6 +16,7 @@ async function run()
device_block_size: 4096, device_block_size: 4096,
journal_offset: 0, journal_offset: 0,
device_size: 0, device_size: 0,
format: 'text',
}; };
for (let i = 2; i < process.argv.length; i++) for (let i = 2; i < process.argv.length; i++)
{ {
@@ -24,7 +26,22 @@ async function run()
i++; i++;
} }
} }
const device_size = Number(options.device_size || await system("blockdev --getsize64 "+options.device)); if (!options.device)
{
process.stderr.write('USAGE: nodejs '+process.argv[1]+' --device /dev/sdXXX\n');
process.exit(1);
}
options.device_size = Number(options.device_size);
let device_size = options.device_size;
if (!device_size)
{
const st = await fs.stat(options.device);
options.device_block_size = st.blksize;
if (st.isBlockDevice())
device_size = Number(await system("/sbin/blockdev --getsize64 "+options.device))
else
device_size = st.size;
}
if (!device_size) if (!device_size)
{ {
process.stderr.write('Failed to get device size\n'); process.stderr.write('Failed to get device size\n');
@@ -32,25 +49,45 @@ async function run()
} }
options.journal_offset = Math.ceil(options.journal_offset/options.device_block_size)*options.device_block_size; options.journal_offset = Math.ceil(options.journal_offset/options.device_block_size)*options.device_block_size;
const meta_offset = options.journal_offset + Math.ceil(options.journal_size/options.device_block_size)*options.device_block_size; const meta_offset = options.journal_offset + Math.ceil(options.journal_size/options.device_block_size)*options.device_block_size;
const entries_per_block = Math.floor(options.device_block_size / (24 + options.object_size/options.bitmap_granularity/8)); const entries_per_block = Math.floor(options.device_block_size / (24 + 2*options.object_size/options.bitmap_granularity/8));
const object_count = Math.floor((device_size-meta_offset)/options.object_size); const object_count = Math.floor((device_size-meta_offset)/options.object_size);
const meta_size = Math.ceil(object_count / entries_per_block) * options.device_block_size; const meta_size = Math.ceil(object_count / entries_per_block) * options.device_block_size;
const data_offset = meta_offset + meta_size; const data_offset = meta_offset + meta_size;
const meta_size_fmt = (meta_size > 1024*1024*1024 ? Math.round(meta_size/1024/1024/1024*100)/100+" GB" const meta_size_fmt = (meta_size > 1024*1024*1024 ? Math.round(meta_size/1024/1024/1024*100)/100+" GB"
: Math.round(meta_size/1024/1024*100)/100+" MB"); : Math.round(meta_size/1024/1024*100)/100+" MB");
process.stdout.write( if (options.format == 'text' || options.format == 'options')
`Metadata size: ${meta_size_fmt}\n`+ {
`Options for the OSD:\n`+ if (options.format == 'text')
` --journal_offset ${options.journal_offset}\n`+ {
` --meta_offset ${meta_offset}\n`+ process.stderr.write(
` --data_offset ${data_offset}\n`+ `Metadata size: ${meta_size_fmt}\n`+
(options.device_size ? ` --data_size ${device_size-data_offset}\n` : '') `Options for the OSD:\n`
); );
}
process.stdout.write(
` --data_device ${options.device}\n`+
` --journal_offset ${options.journal_offset}\n`+
` --meta_offset ${meta_offset}\n`+
` --data_offset ${data_offset}\n`+
(options.device_size ? ` --data_size ${device_size-data_offset}\n` : '')
);
}
else if (options.format == 'env')
{
process.stdout.write(
`journal_offset=${options.journal_offset}\n`+
`meta_offset=${meta_offset}\n`+
`data_offset=${data_offset}\n`+
`data_size=${device_size-data_offset}\n`
);
}
else
process.stdout.write('Unknown format: '+options.format);
} }
function system(cmd) function system(cmd)
{ {
return new Promise((ok, no) => child_process.exec(cmd, { maxBuffer: 64*1024*1024 }, (err, stdout, stderr) => (err ? no(err) : ok(stdout)))); return new Promise((ok, no) => child_process.exec(cmd, { maxBuffer: 64*1024*1024 }, (err, stdout, stderr) => (err ? no(err.message) : ok(stdout))));
} }
run().catch(console.error); run().catch(err => { console.error(err); process.exit(1); });
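To make the arithmetic above concrete, here is a worked example in the script's own terms. Only device_block_size = 4096 is visible in this excerpt; object_size = 131072, bitmap_granularity = 4096 and journal_size = 16777216 are assumed defaults, and the 1 TiB device is hypothetical. Note that make-osd.sh above consumes the --format options output of this script.

// Worked example under assumed defaults; only device_block_size = 4096
// is visible in the hunk above, the rest are assumptions
const device_size = 1024 ** 4; // hypothetical 1 TiB data device
const device_block_size = 4096;
const object_size = 131072, bitmap_granularity = 4096, journal_size = 16777216;
const journal_offset = 0;
const meta_offset = journal_offset + Math.ceil(journal_size/device_block_size)*device_block_size; // 16777216
// 24-byte entry header plus 2 bitmaps per object => 32 bytes, 128 entries per 4 KiB block
const entries_per_block = Math.floor(device_block_size / (24 + 2*object_size/bitmap_granularity/8)); // 128
const object_count = Math.floor((device_size - meta_offset)/object_size); // 8388480
const meta_size = Math.ceil(object_count/entries_per_block)*device_block_size; // 268431360
const data_offset = meta_offset + meta_size; // 285208576
console.log({ meta_offset, data_offset, data_size: device_size - data_offset });

With these inputs entries_per_block is 128, so metadata costs one 4 KiB block per 128 objects, roughly 256 MiB per TiB at 128 KiB objects.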


@@ -1,5 +1,5 @@
// Copyright (c) Vitaliy Filippov, 2019+ // Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.0 (see README.md for details) // License: VNPL-1.1 (see README.md for details)
// Interesting real-world example coming from Ceph with EC and compression enabled. // Interesting real-world example coming from Ceph with EC and compression enabled.
// EC parity chunks can't be compressed as efficiently as data chunks, // EC parity chunks can't be compressed as efficiently as data chunks,


@@ -0,0 +1,25 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
const LPOptimizer = require('./lp-optimizer.js');
async function run()
{
const osd_tree = { a: { 1: 1 }, b: { 2: 1 }, c: { 3: 1 } };
let res;
console.log('16 PGs, size=3');
res = await LPOptimizer.optimize_initial({ osd_tree, pg_size: 3, pg_count: 16 });
LPOptimizer.print_change_stats(res, false);
console.log('\nReduce PG size to 2');
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs.map(pg => pg.slice(0, 2)), osd_tree, pg_size: 2 });
LPOptimizer.print_change_stats(res, false);
console.log('\nRemove OSD 3');
delete osd_tree['c'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2 });
LPOptimizer.print_change_stats(res, false);
}
run().catch(console.error);


@@ -1,5 +1,5 @@
// Copyright (c) Vitaliy Filippov, 2019+ // Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.0 (see README.md for details) // License: VNPL-1.1 (see README.md for details)
const LPOptimizer = require('./lp-optimizer.js'); const LPOptimizer = require('./lp-optimizer.js');


@@ -1,5 +1,5 @@
// Copyright (c) Vitaliy Filippov, 2019+ // Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.0 (see README.md for details) // License: VNPL-1.1 (see README.md for details)
const LPOptimizer = require('./lp-optimizer.js'); const LPOptimizer = require('./lp-optimizer.js');

osd_primary.cpp

@@ -1,822 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.0 (see README.md for details)
#include "osd_primary.h"
// read: read directly or read paired stripe(s), reconstruct, return
// write: read paired stripe(s), reconstruct, modify, calculate parity, write
//
// nuance: take care to read the same version from paired stripes!
// to do so, we remember "last readable" version until a write request completes
// and we postpone other write requests to the same stripe until completion of previous ones
//
// sync: sync peers, get unstable versions, stabilize them
bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
{
// PG number is calculated from the offset
// Our EC scheme stores data in fixed chunks equal to (K*block size)
// K = (pg_size-parity_chunks) in case of EC/XOR, or 1 for replicated pools
pool_id_t pool_id = INODE_POOL(cur_op->req.rw.inode);
// FIXME: We have to access pool config here, so make sure that it doesn't change while its PGs are active...
auto pool_cfg_it = st_cli.pool_config.find(pool_id);
if (pool_cfg_it == st_cli.pool_config.end())
{
// Pool config is not loaded yet
finish_op(cur_op, -EPIPE);
return false;
}
auto & pool_cfg = pool_cfg_it->second;
uint64_t pg_data_size = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
uint64_t pg_block_size = bs_block_size * pg_data_size;
object_id oid = {
.inode = cur_op->req.rw.inode,
// oid.stripe = starting offset of the parity stripe
.stripe = (cur_op->req.rw.offset/pg_block_size)*pg_block_size,
};
pg_num_t pg_num = (cur_op->req.rw.inode + oid.stripe/pool_cfg.pg_stripe_size) % pg_counts[pool_id] + 1;
auto pg_it = pgs.find({ .pool_id = pool_id, .pg_num = pg_num });
if (pg_it == pgs.end() || !(pg_it->second.state & PG_ACTIVE))
{
// This OSD is not primary for this PG or the PG is inactive
// FIXME: Allow reads from PGs degraded under pg_minsize, but don't allow writes
finish_op(cur_op, -EPIPE);
return false;
}
if ((cur_op->req.rw.offset + cur_op->req.rw.len) > (oid.stripe + pg_block_size) ||
(cur_op->req.rw.offset % bs_disk_alignment) != 0 ||
(cur_op->req.rw.len % bs_disk_alignment) != 0)
{
finish_op(cur_op, -EINVAL);
return false;
}
osd_primary_op_data_t *op_data = (osd_primary_op_data_t*)calloc_or_die(
1, sizeof(osd_primary_op_data_t) + sizeof(osd_rmw_stripe_t) * (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg_it->second.pg_size)
);
op_data->pg_num = pg_num;
op_data->oid = oid;
op_data->stripes = ((osd_rmw_stripe_t*)(op_data+1));
op_data->scheme = pool_cfg.scheme;
op_data->pg_data_size = pg_data_size;
cur_op->op_data = op_data;
split_stripes(pg_data_size, bs_block_size, (uint32_t)(cur_op->req.rw.offset - oid.stripe), cur_op->req.rw.len, op_data->stripes);
pg_it->second.inflight++;
return true;
}
static uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state)
{
if (!(pg.state & (PG_HAS_INCOMPLETE | PG_HAS_DEGRADED | PG_HAS_MISPLACED)))
{
*object_state = NULL;
return def;
}
auto st_it = pg.incomplete_objects.find(oid);
if (st_it != pg.incomplete_objects.end())
{
*object_state = st_it->second;
return st_it->second->read_target.data();
}
st_it = pg.degraded_objects.find(oid);
if (st_it != pg.degraded_objects.end())
{
*object_state = st_it->second;
return st_it->second->read_target.data();
}
st_it = pg.misplaced_objects.find(oid);
if (st_it != pg.misplaced_objects.end())
{
*object_state = st_it->second;
return st_it->second->read_target.data();
}
*object_state = NULL;
return def;
}
void osd_t::continue_primary_read(osd_op_t *cur_op)
{
if (!cur_op->op_data && !prepare_primary_rw(cur_op))
{
return;
}
osd_primary_op_data_t *op_data = cur_op->op_data;
if (op_data->st == 1) goto resume_1;
else if (op_data->st == 2) goto resume_2;
{
auto & pg = pgs[{ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num }];
for (int role = 0; role < op_data->pg_data_size; role++)
{
op_data->stripes[role].read_start = op_data->stripes[role].req_start;
op_data->stripes[role].read_end = op_data->stripes[role].req_end;
}
// Determine version
auto vo_it = pg.ver_override.find(op_data->oid);
op_data->target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED)
{
// Fast happy-path
cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_data_size, 0);
submit_primary_subops(SUBMIT_READ, op_data->target_ver,
(op_data->scheme == POOL_SCHEME_REPLICATED ? pg.pg_size : op_data->pg_data_size), pg.cur_set.data(), cur_op);
op_data->st = 1;
}
else
{
// PG may be degraded or have misplaced objects
uint64_t* cur_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
if (extend_missing_stripes(op_data->stripes, cur_set, op_data->pg_data_size, pg.pg_size) < 0)
{
finish_op(cur_op, -EIO);
return;
}
// Submit reads
op_data->pg_size = pg.pg_size;
op_data->scheme = pg.scheme;
op_data->degraded = 1;
cur_op->buf = alloc_read_buffer(op_data->stripes, pg.pg_size, 0);
submit_primary_subops(SUBMIT_READ, op_data->target_ver, pg.pg_size, cur_set, cur_op);
op_data->st = 1;
}
}
resume_1:
return;
resume_2:
if (op_data->errors > 0)
{
finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
return;
}
if (op_data->degraded)
{
// Reconstruct missing stripes
osd_rmw_stripe_t *stripes = op_data->stripes;
if (op_data->scheme == POOL_SCHEME_XOR)
{
reconstruct_stripes_xor(stripes, op_data->pg_size);
}
else if (op_data->scheme == POOL_SCHEME_JERASURE)
{
reconstruct_stripes_jerasure(stripes, op_data->pg_size, op_data->pg_data_size);
}
for (int role = 0; role < op_data->pg_size; role++)
{
if (stripes[role].req_end != 0)
{
// Send buffer in parts to avoid copying
cur_op->iov.push_back(
stripes[role].read_buf + (stripes[role].req_start - stripes[role].read_start),
stripes[role].req_end - stripes[role].req_start
);
}
}
}
else
{
cur_op->iov.push_back(cur_op->buf, cur_op->req.rw.len);
}
finish_op(cur_op, cur_op->req.rw.len);
}
bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
{
osd_primary_op_data_t *op_data = cur_op->op_data;
// Check if actions are pending for this object
auto act_it = pg.flush_actions.lower_bound((obj_piece_id_t){
.oid = op_data->oid,
.osd_num = 0,
});
if (act_it != pg.flush_actions.end() &&
act_it->first.oid.inode == op_data->oid.inode &&
(act_it->first.oid.stripe & ~STRIPE_MASK) == op_data->oid.stripe)
{
pg.write_queue.emplace(op_data->oid, cur_op);
return false;
}
// Check if there are other write requests to the same object
auto vo_it = pg.write_queue.find(op_data->oid);
if (vo_it != pg.write_queue.end())
{
op_data->st = 1;
pg.write_queue.emplace(op_data->oid, cur_op);
return false;
}
pg.write_queue.emplace(op_data->oid, cur_op);
return true;
}
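// [annotation, not in the original file] The continue_primary_* functions
// below are hand-rolled coroutines: op_data->st records the resume point,
// each asynchronous submit_* step sets st and returns to the event loop,
// and when the subops complete the function is re-entered and jumps to
// the matching resume_<N> label.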
void osd_t::continue_primary_write(osd_op_t *cur_op)
{
if (!cur_op->op_data && !prepare_primary_rw(cur_op))
{
return;
}
osd_primary_op_data_t *op_data = cur_op->op_data;
auto & pg = pgs[{ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num }];
if (op_data->st == 1) goto resume_1;
else if (op_data->st == 2) goto resume_2;
else if (op_data->st == 3) goto resume_3;
else if (op_data->st == 4) goto resume_4;
else if (op_data->st == 5) goto resume_5;
else if (op_data->st == 6) goto resume_6;
else if (op_data->st == 7) goto resume_7;
else if (op_data->st == 8) goto resume_8;
else if (op_data->st == 9) goto resume_9;
else if (op_data->st == 10) goto resume_10;
assert(op_data->st == 0);
if (!check_write_queue(cur_op, pg))
{
return;
}
resume_1:
// Determine blocks to read and write
// Missing chunks are allowed to be overwritten even in incomplete objects
// FIXME: Allow to do small writes to the old (degraded/misplaced) OSD set for lower performance impact
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
if (op_data->scheme == POOL_SCHEME_REPLICATED)
{
// Simplified algorithm
op_data->stripes[0].write_start = op_data->stripes[0].req_start;
op_data->stripes[0].write_end = op_data->stripes[0].req_end;
op_data->stripes[0].write_buf = cur_op->buf;
if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
op_data->stripes[0].write_end != bs_block_size))
{
// Object is degraded/misplaced and will be moved to <write_osd_set>
op_data->stripes[0].read_start = 0;
op_data->stripes[0].read_end = bs_block_size;
cur_op->rmw_buf = op_data->stripes[0].read_buf = memalign_or_die(MEM_ALIGNMENT, bs_block_size);
}
}
else
{
cur_op->rmw_buf = calc_rmw(cur_op->buf, op_data->stripes, op_data->prev_set,
pg.pg_size, op_data->pg_data_size, pg.pg_cursize, pg.cur_set.data(), bs_block_size);
if (!cur_op->rmw_buf)
{
// Refuse partial overwrite of an incomplete object
cur_op->reply.hdr.retval = -EINVAL;
goto continue_others;
}
}
// Read required blocks
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, pg.pg_size, op_data->prev_set, cur_op);
resume_2:
op_data->st = 2;
return;
resume_3:
if (op_data->errors > 0)
{
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
return;
}
// Save version override for parallel reads
pg.ver_override[op_data->oid] = op_data->fact_ver;
if (op_data->scheme == POOL_SCHEME_REPLICATED)
{
// Only (possibly) copy new data from the request into the recovery buffer
if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
op_data->stripes[0].write_end != bs_block_size))
{
memcpy(
op_data->stripes[0].read_buf + op_data->stripes[0].req_start,
op_data->stripes[0].write_buf,
op_data->stripes[0].req_end - op_data->stripes[0].req_start
);
op_data->stripes[0].write_buf = op_data->stripes[0].read_buf;
op_data->stripes[0].write_start = 0;
op_data->stripes[0].write_end = bs_block_size;
}
}
else
{
// Recover missing stripes, calculate parity
if (pg.scheme == POOL_SCHEME_XOR)
{
calc_rmw_parity_xor(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size);
}
else if (pg.scheme == POOL_SCHEME_JERASURE)
{
calc_rmw_parity_jerasure(op_data->stripes, pg.pg_size, op_data->pg_data_size, op_data->prev_set, pg.cur_set.data(), bs_block_size);
}
}
// Send writes
if ((op_data->fact_ver >> (64-PG_EPOCH_BITS)) < pg.epoch)
{
op_data->target_ver = ((uint64_t)pg.epoch << (64-PG_EPOCH_BITS)) | 1;
}
else
{
if ((op_data->fact_ver & (1ul<<(64-PG_EPOCH_BITS) - 1)) == (1ul<<(64-PG_EPOCH_BITS) - 1))
{
assert(pg.epoch != ((1ul << PG_EPOCH_BITS)-1));
pg.epoch++;
}
op_data->target_ver = op_data->fact_ver + 1;
}
if (pg.epoch > pg.reported_epoch)
{
// Report newer epoch before writing
// FIXME: We may report only one PG state here...
this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
pg.history_changed = true;
report_pg_states();
resume_10:
if (pg.epoch > pg.reported_epoch)
{
op_data->st = 10;
return;
}
}
submit_primary_subops(SUBMIT_WRITE, op_data->target_ver, pg.pg_size, pg.cur_set.data(), cur_op);
resume_4:
op_data->st = 4;
return;
resume_5:
if (op_data->errors > 0)
{
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
return;
}
resume_6:
resume_7:
if (!remember_unstable_write(cur_op, pg, pg.cur_loc_set, 6))
{
// FIXME: Check for immediate_commit == IMMEDIATE_SMALL
return;
}
if (op_data->fact_ver == 1)
{
// Object is created
pg.clean_count++;
pg.total_count++;
}
if (op_data->object_state)
{
{
int recovery_type = op_data->object_state->state & (OBJ_DEGRADED|OBJ_INCOMPLETE) ? 0 : 1;
recovery_stat_count[0][recovery_type]++;
if (!recovery_stat_count[0][recovery_type])
{
recovery_stat_count[0][recovery_type]++;
recovery_stat_bytes[0][recovery_type] = 0;
}
for (int role = 0; role < (op_data->scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size); role++)
{
recovery_stat_bytes[0][recovery_type] += op_data->stripes[role].write_end - op_data->stripes[role].write_start;
}
}
if (op_data->object_state->state & OBJ_MISPLACED)
{
// Remove extra chunks
submit_primary_del_subops(cur_op, pg.cur_set.data(), pg.pg_size, op_data->object_state->osd_set);
if (op_data->n_subops > 0)
{
resume_8:
op_data->st = 8;
return;
resume_9:
if (op_data->errors > 0)
{
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
return;
}
}
}
// Clear object state
remove_object_from_state(op_data->oid, op_data->object_state, pg);
pg.clean_count++;
}
cur_op->reply.hdr.retval = cur_op->req.rw.len;
continue_others:
// Remove version override
pg.ver_override.erase(op_data->oid);
object_id oid = op_data->oid;
finish_op(cur_op, cur_op->reply.hdr.retval);
// Continue other write operations to the same object
auto next_it = pg.write_queue.find(oid);
auto this_it = next_it;
if (this_it != pg.write_queue.end() && this_it->second == cur_op)
{
next_it++;
pg.write_queue.erase(this_it);
if (next_it != pg.write_queue.end() && next_it->first == oid)
{
osd_op_t *next_op = next_it->second;
continue_primary_write(next_op);
}
}
}
bool osd_t::remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state)
{
osd_primary_op_data_t *op_data = cur_op->op_data;
if (op_data->st == base_state)
{
goto resume_6;
}
else if (op_data->st == base_state+1)
{
goto resume_7;
}
// FIXME: Check for immediate_commit == IMMEDIATE_SMALL
if (immediate_commit == IMMEDIATE_ALL)
{
if (op_data->scheme != POOL_SCHEME_REPLICATED)
{
// Send STABILIZE ops immediately
op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
op_data->unstable_writes = new obj_ver_id[loc_set.size()];
{
int last_start = 0;
for (auto & chunk: loc_set)
{
op_data->unstable_writes[last_start] = (obj_ver_id){
.oid = {
.inode = op_data->oid.inode,
.stripe = op_data->oid.stripe | chunk.role,
},
.version = op_data->fact_ver,
};
op_data->unstable_write_osds->push_back((unstable_osd_num_t){
.osd_num = chunk.osd_num,
.start = last_start,
.len = 1,
});
last_start++;
}
}
submit_primary_stab_subops(cur_op);
resume_6:
op_data->st = 6;
return false;
resume_7:
// FIXME: Free those in the destructor?
delete op_data->unstable_write_osds;
delete[] op_data->unstable_writes;
op_data->unstable_writes = NULL;
op_data->unstable_write_osds = NULL;
if (op_data->errors > 0)
{
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
return false;
}
}
}
else
{
if (op_data->scheme != POOL_SCHEME_REPLICATED)
{
// Remember version as unstable for EC/XOR
for (auto & chunk: loc_set)
{
this->dirty_osds.insert(chunk.osd_num);
this->unstable_writes[(osd_object_id_t){
.osd_num = chunk.osd_num,
.oid = {
.inode = op_data->oid.inode,
.stripe = op_data->oid.stripe | chunk.role,
},
}] = op_data->fact_ver;
}
}
else
{
// Only remember to sync OSDs for replicated pools
for (auto & chunk: loc_set)
{
this->dirty_osds.insert(chunk.osd_num);
}
}
// Remember PG as dirty to drop the connection when PG goes offline
// (this is required because of the "lazy sync")
c_cli.clients[cur_op->peer_fd]->dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
}
return true;
}
// Save and clear unstable_writes -> SYNC all -> STABLE all
void osd_t::continue_primary_sync(osd_op_t *cur_op)
{
if (!cur_op->op_data)
{
cur_op->op_data = (osd_primary_op_data_t*)calloc_or_die(1, sizeof(osd_primary_op_data_t));
}
osd_primary_op_data_t *op_data = cur_op->op_data;
if (op_data->st == 1) goto resume_1;
else if (op_data->st == 2) goto resume_2;
else if (op_data->st == 3) goto resume_3;
else if (op_data->st == 4) goto resume_4;
else if (op_data->st == 5) goto resume_5;
else if (op_data->st == 6) goto resume_6;
assert(op_data->st == 0);
if (syncs_in_progress.size() > 0)
{
// Wait for previous syncs, if any
// FIXME: We may try to execute the current one in parallel, like in Blockstore, but I'm not sure if it matters at all
syncs_in_progress.push_back(cur_op);
op_data->st = 1;
resume_1:
return;
}
else
{
syncs_in_progress.push_back(cur_op);
}
resume_2:
if (dirty_osds.size() == 0)
{
// Nothing to sync
goto finish;
}
// Save and clear unstable_writes
// In theory it is possible to do it on a per-client basis, but this seems to be an unnecessary complication
// It would be cool not to copy these here at all, but someone has to deduplicate them by object IDs anyway
if (unstable_writes.size() > 0)
{
op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
op_data->unstable_writes = new obj_ver_id[this->unstable_writes.size()];
osd_num_t last_osd = 0;
int last_start = 0, last_end = 0;
for (auto it = this->unstable_writes.begin(); it != this->unstable_writes.end(); it++)
{
if (last_osd != it->first.osd_num)
{
if (last_osd != 0)
{
op_data->unstable_write_osds->push_back((unstable_osd_num_t){
.osd_num = last_osd,
.start = last_start,
.len = last_end - last_start,
});
}
last_osd = it->first.osd_num;
last_start = last_end;
}
op_data->unstable_writes[last_end] = (obj_ver_id){
.oid = it->first.oid,
.version = it->second,
};
last_end++;
}
if (last_osd != 0)
{
op_data->unstable_write_osds->push_back((unstable_osd_num_t){
.osd_num = last_osd,
.start = last_start,
.len = last_end - last_start,
});
}
this->unstable_writes.clear();
}
{
void *dirty_buf = malloc_or_die(sizeof(pool_pg_num_t)*dirty_pgs.size() + sizeof(osd_num_t)*dirty_osds.size());
op_data->dirty_pgs = (pool_pg_num_t*)dirty_buf;
op_data->dirty_osds = (osd_num_t*)(dirty_buf + sizeof(pool_pg_num_t)*dirty_pgs.size());
op_data->dirty_pg_count = dirty_pgs.size();
op_data->dirty_osd_count = dirty_osds.size();
int dpg = 0;
for (auto dirty_pg_num: dirty_pgs)
{
pgs[dirty_pg_num].inflight++;
op_data->dirty_pgs[dpg++] = dirty_pg_num;
}
dirty_pgs.clear();
dpg = 0;
for (auto osd_num: dirty_osds)
{
op_data->dirty_osds[dpg++] = osd_num;
}
dirty_osds.clear();
}
if (immediate_commit != IMMEDIATE_ALL)
{
// SYNC
submit_primary_sync_subops(cur_op);
resume_3:
op_data->st = 3;
return;
resume_4:
if (op_data->errors > 0)
{
goto resume_6;
}
}
if (op_data->unstable_writes)
{
// Stabilize version sets, if any
submit_primary_stab_subops(cur_op);
resume_5:
op_data->st = 5;
return;
}
resume_6:
if (op_data->errors > 0)
{
// Return PGs and OSDs back into their dirty sets
for (int i = 0; i < op_data->dirty_pg_count; i++)
{
dirty_pgs.insert(op_data->dirty_pgs[i]);
}
for (int i = 0; i < op_data->dirty_osd_count; i++)
{
dirty_osds.insert(op_data->dirty_osds[i]);
}
if (op_data->unstable_writes)
{
// Return objects back into the unstable write set
for (auto unstable_osd: *(op_data->unstable_write_osds))
{
for (int i = 0; i < unstable_osd.len; i++)
{
// Except those from peered PGs
auto & w = op_data->unstable_writes[i];
pool_pg_num_t wpg = {
.pool_id = INODE_POOL(w.oid.inode),
.pg_num = map_to_pg(w.oid, st_cli.pool_config.at(INODE_POOL(w.oid.inode)).pg_stripe_size),
};
if (pgs[wpg].state & PG_ACTIVE)
{
uint64_t & dest = this->unstable_writes[(osd_object_id_t){
.osd_num = unstable_osd.osd_num,
.oid = w.oid,
}];
dest = dest < w.version ? w.version : dest;
dirty_pgs.insert(wpg);
}
}
}
}
}
for (int i = 0; i < op_data->dirty_pg_count; i++)
{
auto & pg = pgs.at(op_data->dirty_pgs[i]);
pg.inflight--;
if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
{
finish_stop_pg(pg);
}
}
// FIXME: Free those in the destructor?
free(op_data->dirty_pgs);
op_data->dirty_pgs = NULL;
op_data->dirty_osds = NULL;
if (op_data->unstable_writes)
{
delete op_data->unstable_write_osds;
delete[] op_data->unstable_writes;
op_data->unstable_writes = NULL;
op_data->unstable_write_osds = NULL;
}
if (op_data->errors > 0)
{
finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
}
else
{
finish:
if (cur_op->peer_fd)
{
auto it = c_cli.clients.find(cur_op->peer_fd);
if (it != c_cli.clients.end())
it->second->dirty_pgs.clear();
}
finish_op(cur_op, 0);
}
assert(syncs_in_progress.front() == cur_op);
syncs_in_progress.pop_front();
if (syncs_in_progress.size() > 0)
{
cur_op = syncs_in_progress.front();
op_data = cur_op->op_data;
op_data->st++;
goto resume_2;
}
}
// Decrement pg_osd_set_state_t's object_count and change PG state accordingly
void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t & pg)
{
    if (object_state->state & OBJ_INCOMPLETE)
    {
        // Successful write means that object is not incomplete anymore
        this->incomplete_objects--;
        pg.incomplete_objects.erase(oid);
        if (!pg.incomplete_objects.size())
        {
            pg.state = pg.state & ~PG_HAS_INCOMPLETE;
            report_pg_state(pg);
        }
    }
    else if (object_state->state & OBJ_DEGRADED)
    {
        this->degraded_objects--;
        pg.degraded_objects.erase(oid);
        if (!pg.degraded_objects.size())
        {
            pg.state = pg.state & ~PG_HAS_DEGRADED;
            report_pg_state(pg);
        }
    }
    else if (object_state->state & OBJ_MISPLACED)
    {
        this->misplaced_objects--;
        pg.misplaced_objects.erase(oid);
        if (!pg.misplaced_objects.size())
        {
            pg.state = pg.state & ~PG_HAS_MISPLACED;
            report_pg_state(pg);
        }
    }
    else
    {
        throw std::runtime_error("BUG: Invalid object state: "+std::to_string(object_state->state));
    }
    object_state->object_count--;
    if (!object_state->object_count)
    {
        pg.state_dict.erase(object_state->osd_set);
    }
}
void osd_t::continue_primary_del(osd_op_t *cur_op)
{
    if (!cur_op->op_data && !prepare_primary_rw(cur_op))
    {
        return;
    }
    osd_primary_op_data_t *op_data = cur_op->op_data;
    auto & pg = pgs[{ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num }];
    if (op_data->st == 1) goto resume_1;
    else if (op_data->st == 2) goto resume_2;
    else if (op_data->st == 3) goto resume_3;
    else if (op_data->st == 4) goto resume_4;
    else if (op_data->st == 5) goto resume_5;
    assert(op_data->st == 0);
    // Delete is forbidden even in active PGs if they're also degraded or have previous dead OSDs
    if (pg.state & (PG_DEGRADED | PG_LEFT_ON_DEAD))
    {
        finish_op(cur_op, -EBUSY);
        return;
    }
    if (!check_write_queue(cur_op, pg))
    {
        return;
    }
resume_1:
    // Determine which OSDs contain this object and delete it
    op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
    // Submit 1 read to determine the actual version number
    submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, pg.pg_size, op_data->prev_set, cur_op);
resume_2:
    op_data->st = 2;
    return;
resume_3:
    if (op_data->errors > 0)
    {
        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
        return;
    }
    // Save version override for parallel reads
    pg.ver_override[op_data->oid] = op_data->fact_ver;
    // Submit deletes
    op_data->fact_ver++;
    submit_primary_del_subops(cur_op, NULL, 0, op_data->object_state ? op_data->object_state->osd_set : pg.cur_loc_set);
resume_4:
    op_data->st = 4;
    return;
resume_5:
    if (op_data->errors > 0)
    {
        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
        return;
    }
    // Remove version override
    pg.ver_override.erase(op_data->oid);
    // Adjust PG stats after "instant stabilize", because we need object_state above
    if (!op_data->object_state)
    {
        pg.clean_count--;
    }
    else
    {
        remove_object_from_state(op_data->oid, op_data->object_state, pg);
    }
    pg.total_count--;
    object_id oid = op_data->oid;
    finish_op(cur_op, cur_op->req.rw.len);
    // Continue other write operations to the same object
    auto next_it = pg.write_queue.find(oid);
    auto this_it = next_it;
    if (this_it != pg.write_queue.end() && this_it->second == cur_op)
    {
        next_it++;
        pg.write_queue.erase(this_it);
        if (next_it != pg.write_queue.end() &&
            next_it->first == oid)
        {
            osd_op_t *next_op = next_it->second;
            continue_primary_write(next_op);
        }
    }
}
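The tail of continue_primary_del() shows the hand-off pattern used throughout the primary write path: a finished op erases its own entry from the PG's per-object write queue and resumes the next op queued on the same object, which is what keeps writes to one object strictly ordered. A toy model of that hand-off, purely for illustration — std::multimap and plain ints stand in for pg.write_queue and osd_op_t, none of these are the OSD's actual types:

#include <cstdio>
#include <map>
#include <string>

// Toy model of the hand-off at the end of continue_primary_del():
// erase this op's queue entry, then wake the next op queued on the same key.
using write_queue_t = std::multimap<std::string, int>;

static void finish_and_continue(write_queue_t & wq, const std::string & oid, int cur_op)
{
    auto next_it = wq.find(oid);
    auto this_it = next_it;
    if (this_it != wq.end() && this_it->second == cur_op)
    {
        next_it++;
        wq.erase(this_it);
        if (next_it != wq.end() && next_it->first == oid)
            printf("resuming op %d on %s\n", next_it->second, oid.c_str());
    }
}

int main()
{
    write_queue_t wq;
    wq.emplace("obj1", 1); // currently executing
    wq.emplace("obj1", 2); // queued behind it
    finish_and_continue(wq, "obj1", 1); // prints: resuming op 2 on obj1
    return 0;
}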


@@ -48,4 +48,4 @@ FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Ve
QEMU=`rpm -qi qemu qemu-kvm | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
perl -i -pe 's/(Requires:\s*qemu(?:-kvm)?)([^\n]+)?/$1 = '$QEMU'/' $VITASTOR/rpm/vitastor-el$EL.spec
-tar --transform 's#^#vitastor-0.5/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.5$(rpm --eval '%dist').tar.gz *
+tar --transform 's#^#vitastor-0.5.13/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.5.13$(rpm --eval '%dist').tar.gz *

rpm/qemu-el8.Dockerfile Normal file

@@ -0,0 +1,31 @@
# Build packages for CentOS 8 inside a container
# cd ..; podman build -t qemu-el8 -v `pwd`/packages:/root/packages -f rpm/qemu-el8.Dockerfile .
FROM centos:8
WORKDIR /root
RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
RUN dnf -y install centos-release-advanced-virtualization epel-release dnf-plugins-core rpm-build
RUN rm -rf /var/lib/dnf/*; dnf download --disablerepo='*' --enablerepo='centos-advanced-virtualization-source' --source qemu-kvm
RUN rpm --nomd5 -i qemu*.src.rpm
RUN cd ~/rpmbuild/SPECS && dnf builddep -y --enablerepo=PowerTools --spec qemu-kvm.spec
ADD qemu-*-vitastor.patch /root/vitastor/
RUN set -e; \
mkdir -p /root/packages/qemu-el8; \
rm -rf /root/packages/qemu-el8/*; \
rpm --nomd5 -i /root/qemu*.src.rpm; \
cd ~/rpmbuild/SPECS; \
PN=$(grep ^Patch qemu-kvm.spec | tail -n1 | perl -pe 's/Patch(\d+).*/$1/'); \
csplit qemu-kvm.spec "/^Patch$PN/"; \
cat xx00 > qemu-kvm.spec; \
head -n 1 xx01 >> qemu-kvm.spec; \
echo "Patch$((PN+1)): qemu-4.2-vitastor.patch" >> qemu-kvm.spec; \
tail -n +2 xx01 >> qemu-kvm.spec; \
perl -i -pe 's/(^Release:\s*\d+)/$1.vitastor/' qemu-kvm.spec; \
cp /root/vitastor/qemu-4.2-vitastor.patch ~/rpmbuild/SOURCES; \
rpmbuild --nocheck -ba qemu-kvm.spec; \
cp ~/rpmbuild/RPMS/*/*qemu* /root/packages/qemu-el8/; \
cp ~/rpmbuild/SRPMS/*qemu* /root/packages/qemu-el8/

rpm/qemu-kvm-el7.spec.patch Normal file

@@ -0,0 +1,257 @@
--- qemu-kvm.spec.orig 2020-11-09 23:41:03.000000000 +0000
+++ qemu-kvm.spec 2020-12-06 10:44:24.207640963 +0000
@@ -2,7 +2,7 @@
%global SLOF_gittagcommit 899d9883
%global have_usbredir 1
-%global have_spice 1
+%global have_spice 0
%global have_opengl 1
%global have_fdt 0
%global have_gluster 1
@@ -56,7 +56,7 @@ Requires: %{name}-block-curl = %{epoch}:
Requires: %{name}-block-gluster = %{epoch}:%{version}-%{release} \
%endif \
Requires: %{name}-block-iscsi = %{epoch}:%{version}-%{release} \
-Requires: %{name}-block-rbd = %{epoch}:%{version}-%{release} \
+#Requires: %{name}-block-rbd = %{epoch}:%{version}-%{release} \
Requires: %{name}-block-ssh = %{epoch}:%{version}-%{release}
# Macro to properly setup RHEL/RHEV conflict handling
@@ -67,7 +67,7 @@ Obsoletes: %1-rhev
Summary: QEMU is a machine emulator and virtualizer
Name: qemu-kvm
Version: 4.2.0
-Release: 29.vitastor%{?dist}.6
+Release: 30.vitastor%{?dist}.6
# Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
Epoch: 15
License: GPLv2 and GPLv2+ and CC-BY
@@ -99,8 +99,8 @@ Source30: kvm-s390x.conf
Source31: kvm-x86.conf
Source32: qemu-pr-helper.service
Source33: qemu-pr-helper.socket
-Source34: 81-kvm-rhel.rules
-Source35: udev-kvm-check.c
+#Source34: 81-kvm-rhel.rules
+#Source35: udev-kvm-check.c
Source36: README.tests
@@ -825,7 +825,9 @@ Patch331: kvm-Drop-bogus-IPv6-messages.p
Patch333: kvm-virtiofsd-Whitelist-fchmod.patch
# For bz#1883869 - virtiofsd core dump in KATA Container [rhel-8.2.1.z]
Patch334: kvm-virtiofsd-avoid-proc-self-fd-tempdir.patch
-Patch335: qemu-4.2-vitastor.patch
+Patch335: qemu-use-sphinx-1.2.patch
+Patch336: qemu-config-tcmalloc-warning.patch
+Patch337: qemu-4.2-vitastor.patch
BuildRequires: wget
BuildRequires: rpm-build
@@ -842,7 +844,8 @@ BuildRequires: pciutils-devel
BuildRequires: libiscsi-devel
BuildRequires: ncurses-devel
BuildRequires: libattr-devel
-BuildRequires: libusbx-devel >= 1.0.22
+BuildRequires: gperftools-devel
+BuildRequires: libusbx-devel >= 1.0.21
%if %{have_usbredir}
BuildRequires: usbredir-devel >= 0.7.1
%endif
@@ -856,12 +859,12 @@ BuildRequires: virglrenderer-devel
# For smartcard NSS support
BuildRequires: nss-devel
%endif
-BuildRequires: libseccomp-devel >= 2.4.0
+#Requires: libseccomp >= 2.4.0
# For network block driver
BuildRequires: libcurl-devel
BuildRequires: libssh-devel
-BuildRequires: librados-devel
-BuildRequires: librbd-devel
+#BuildRequires: librados-devel
+#BuildRequires: librbd-devel
%if %{have_gluster}
# For gluster block driver
BuildRequires: glusterfs-api-devel
@@ -955,25 +958,25 @@ hardware for a full system such as a PC
%package -n qemu-kvm-core
Summary: qemu-kvm core components
+Requires: gperftools-libs
Requires: qemu-img = %{epoch}:%{version}-%{release}
%ifarch %{ix86} x86_64
Requires: seabios-bin >= 1.10.2-1
Requires: sgabios-bin
-Requires: edk2-ovmf
%endif
%ifarch aarch64
Requires: edk2-aarch64
%endif
%ifnarch aarch64 s390x
-Requires: seavgabios-bin >= 1.12.0-3
-Requires: ipxe-roms-qemu >= 20170123-1
+Requires: seavgabios-bin >= 1.11.0-1
+Requires: ipxe-roms-qemu >= 20181214-1
+Requires: /usr/share/ipxe.efi
%endif
%ifarch %{power64}
Requires: SLOF >= %{SLOF_gittagdate}-1.git%{SLOF_gittagcommit}
%endif
Requires: %{name}-common = %{epoch}:%{version}-%{release}
-Requires: libseccomp >= 2.4.0
# For compressed guest memory dumps
Requires: lzo snappy
%if %{have_kvm_setup}
@@ -1085,15 +1088,15 @@ This package provides the additional iSC
Install this package if you want to access iSCSI volumes.
-%package block-rbd
-Summary: QEMU Ceph/RBD block driver
-Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
-
-%description block-rbd
-This package provides the additional Ceph/RBD block driver for QEMU.
-
-Install this package if you want to access remote Ceph volumes
-using the rbd protocol.
+#%package block-rbd
+#Summary: QEMU Ceph/RBD block driver
+#Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
+#
+#%description block-rbd
+#This package provides the additional Ceph/RBD block driver for QEMU.
+#
+#Install this package if you want to access remote Ceph volumes
+#using the rbd protocol.
%package block-ssh
@@ -1117,12 +1120,14 @@ the Secure Shell (SSH) protocol.
# --build-id option is used for giving info to the debug packages.
buildldflags="VL_LDFLAGS=-Wl,--build-id"
-%global block_drivers_list qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle
+#%global block_drivers_list qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle
+%global block_drivers_list qcow2,raw,file,host_device,nbd,iscsi,blkdebug,luks,null-co,nvme,copy-on-read,throttle
%if 0%{have_gluster}
%global block_drivers_list %{block_drivers_list},gluster
%endif
+[ -e /usr/bin/sphinx-build ] || ln -s sphinx-build-3 /usr/bin/sphinx-build
./configure \
--prefix="%{_prefix}" \
--libdir="%{_libdir}" \
@@ -1152,15 +1157,15 @@ buildldflags="VL_LDFLAGS=-Wl,--build-id"
%else
--disable-numa \
%endif
- --enable-rbd \
+ --disable-rbd \
%if 0%{have_librdma}
--enable-rdma \
%else
--disable-rdma \
%endif
--disable-pvrdma \
- --enable-seccomp \
-%if 0%{have_spice}
+ --disable-seccomp \
+%if %{have_spice}
--enable-spice \
--enable-smartcard \
--enable-virglrenderer \
@@ -1179,7 +1184,7 @@ buildldflags="VL_LDFLAGS=-Wl,--build-id"
%else
--disable-usb-redir \
%endif
- --disable-tcmalloc \
+ --enable-tcmalloc \
%ifarch x86_64
--enable-libpmem \
%else
@@ -1193,9 +1198,7 @@ buildldflags="VL_LDFLAGS=-Wl,--build-id"
%endif
--python=%{__python3} \
--target-list="%{buildarch}" \
- --block-drv-rw-whitelist=%{block_drivers_list} \
--audio-drv-list= \
- --block-drv-ro-whitelist=vmdk,vhdx,vpc,https,ssh \
--with-coroutine=ucontext \
--tls-priority=NORMAL \
--disable-bluez \
@@ -1262,7 +1265,7 @@ buildldflags="VL_LDFLAGS=-Wl,--build-id"
--disable-sanitizers \
--disable-hvf \
--disable-whpx \
- --enable-malloc-trim \
+ --disable-malloc-trim \
--disable-membarrier \
--disable-vhost-crypto \
--disable-libxml2 \
@@ -1308,7 +1311,7 @@ make V=1 %{?_smp_mflags} $buildldflags
cp -a %{kvm_target}-softmmu/qemu-system-%{kvm_target} qemu-kvm
gcc %{SOURCE6} $RPM_OPT_FLAGS $RPM_LD_FLAGS -o ksmctl
-gcc %{SOURCE35} $RPM_OPT_FLAGS $RPM_LD_FLAGS -o udev-kvm-check
+#gcc %{SOURCE35} $RPM_OPT_FLAGS $RPM_LD_FLAGS -o udev-kvm-check
%install
%define _udevdir %(pkg-config --variable=udevdir udev)
@@ -1343,8 +1346,8 @@ mkdir -p $RPM_BUILD_ROOT%{testsdir}/test
mkdir -p $RPM_BUILD_ROOT%{testsdir}/tests/qemu-iotests
mkdir -p $RPM_BUILD_ROOT%{testsdir}/scripts/qmp
-install -p -m 0755 udev-kvm-check $RPM_BUILD_ROOT%{_udevdir}
-install -p -m 0644 %{SOURCE34} $RPM_BUILD_ROOT%{_udevrulesdir}
+#install -p -m 0755 udev-kvm-check $RPM_BUILD_ROOT%{_udevdir}
+#install -p -m 0644 %{SOURCE34} $RPM_BUILD_ROOT%{_udevrulesdir}
install -m 0644 scripts/dump-guest-memory.py \
$RPM_BUILD_ROOT%{_datadir}/%{name}
@@ -1562,6 +1565,8 @@ rm -rf $RPM_BUILD_ROOT%{qemudocdir}/inte
# Remove spec
rm -rf $RPM_BUILD_ROOT%{qemudocdir}/specs
+%global __os_install_post %(echo '%{__os_install_post}' | sed -e 's!/usr/lib[^[:space:]]*/brp-python-bytecompile[[:space:]].*$!!g')
+
%check
export DIFF=diff; make check V=1
@@ -1645,8 +1650,8 @@ useradd -r -u 107 -g qemu -G kvm -d / -s
%config(noreplace) %{_sysconfdir}/sysconfig/ksm
%{_unitdir}/ksmtuned.service
%{_sbindir}/ksmtuned
-%{_udevdir}/udev-kvm-check
-%{_udevrulesdir}/81-kvm-rhel.rules
+#%{_udevdir}/udev-kvm-check
+#%{_udevrulesdir}/81-kvm-rhel.rules
%ghost %{_sysconfdir}/kvm
%config(noreplace) %{_sysconfdir}/ksmtuned.conf
%dir %{_sysconfdir}/%{name}
@@ -1711,8 +1716,8 @@ useradd -r -u 107 -g qemu -G kvm -d / -s
%{_libexecdir}/vhost-user-gpu
%{_datadir}/%{name}/vhost-user/50-qemu-gpu.json
%endif
-%{_libexecdir}/virtiofsd
-%{_datadir}/%{name}/vhost-user/50-qemu-virtiofsd.json
+#%{_libexecdir}/virtiofsd
+#%{_datadir}/%{name}/vhost-user/50-qemu-virtiofsd.json
%files -n qemu-img
%defattr(-,root,root)
@@ -1748,8 +1753,8 @@ useradd -r -u 107 -g qemu -G kvm -d / -s
%files block-iscsi
%{_libdir}/qemu-kvm/block-iscsi.so
-%files block-rbd
-%{_libdir}/qemu-kvm/block-rbd.so
+#%files block-rbd
+#%{_libdir}/qemu-kvm/block-rbd.so
%files block-ssh
%{_libdir}/qemu-kvm/block-ssh.so


@@ -1,5 +1,5 @@
---- qemu-kvm.spec	2020-11-07 22:48:46.312124920 +0000
+--- qemu-kvm.spec	2020-12-05 13:13:54.388623517 +0000
-+++ qemu-kvm.spec	2020-11-07 23:04:06.246772766 +0000
++++ qemu-kvm.spec	2020-12-05 13:13:58.728696598 +0000
@@ -67,7 +67,7 @@ Obsoletes: %1-rhev
Summary: QEMU is a machine emulator and virtualizer
Name: qemu-kvm
@@ -9,12 +9,21 @@
# Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
Epoch: 15
License: GPLv2 and GPLv2+ and CC-BY
-@@ -825,6 +825,8 @@ Patch331: kvm-Drop-bogus-IPv6-messages.p
+@@ -825,6 +825,7 @@ Patch331: kvm-Drop-bogus-IPv6-messages.p
Patch333: kvm-virtiofsd-Whitelist-fchmod.patch
# For bz#1883869 - virtiofsd core dump in KATA Container [rhel-8.2.1.z]
Patch334: kvm-virtiofsd-avoid-proc-self-fd-tempdir.patch
-+# Vitastor
+Patch335: qemu-4.2-vitastor.patch
BuildRequires: wget
BuildRequires: rpm-build
+@@ -1192,9 +1193,7 @@ buildldflags="VL_LDFLAGS=-Wl,--build-id"
+%endif
+--python=%{__python3} \
+--target-list="%{buildarch}" \
+- --block-drv-rw-whitelist=%{block_drivers_list} \
+--audio-drv-list= \
+- --block-drv-ro-whitelist=vmdk,vhdx,vpc,https,ssh \
+--with-coroutine=ucontext \
+--tls-priority=NORMAL \
+--disable-bluez \


@@ -1,14 +1,15 @@
# Build packages for CentOS 7 inside a container
-# cd ..; podman build -t vitastor-el7 -v `pwd`/build:/root/build -f rpm/vitastor-el7.Dockerfile .
+# cd ..; podman build -t vitastor-el7 -v `pwd`/packages:/root/packages -f rpm/vitastor-el7.Dockerfile .
# localedef -i ru_RU -f UTF-8 ru_RU.UTF-8
FROM centos:7
WORKDIR /root
+RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
RUN yum -y --enablerepo=extras install centos-release-scl epel-release yum-utils rpm-build
RUN yum -y install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm
-RUN yum -y install devtoolset-9-gcc-c++ devtoolset-9-libatomic-devel gperftools-devel qemu-kvm fio rh-nodejs12
+RUN yum -y install devtoolset-9-gcc-c++ devtoolset-9-libatomic-devel gperftools-devel qemu-kvm fio rh-nodejs12 jerasure-devel gf-complete-devel
RUN yumdownloader --disablerepo=centos-sclo-rh --source qemu-kvm
RUN yumdownloader --disablerepo=centos-sclo-rh --source fio
RUN rpm --nomd5 -i qemu*.src.rpm
@@ -24,23 +25,23 @@ RUN set -e; \
cd ~/rpmbuild/SPECS/; \
. /opt/rh/devtoolset-9/enable; \
rpmbuild -ba liburing.spec; \
-mkdir -p /root/build/liburing-el7; \
-rm -rf /root/build/liburing-el7/*; \
-cp ~/rpmbuild/RPMS/*/liburing* /root/build/liburing-el7/; \
-cp ~/rpmbuild/SRPMS/liburing* /root/build/liburing-el7/
-RUN rpm -i `ls /root/build/liburing-el7/liburing-*.x86_64.rpm | grep -v debug`
+mkdir -p /root/packages/liburing-el7; \
+rm -rf /root/packages/liburing-el7/*; \
+cp ~/rpmbuild/RPMS/*/liburing* /root/packages/liburing-el7/; \
+cp ~/rpmbuild/SRPMS/liburing* /root/packages/liburing-el7/
+RUN rpm -i `ls /root/packages/liburing-el7/liburing-*.x86_64.rpm | grep -v debug`
ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
-cp /root/vitastor-0.5.el7.tar.gz ~/rpmbuild/SOURCES; \
+cp /root/vitastor-0.5.13.el7.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \
-mkdir -p /root/build/vitastor-el7; \
-rm -rf /root/build/vitastor-el7/*; \
-cp ~/rpmbuild/RPMS/*/vitastor* /root/build/vitastor-el7/; \
-cp ~/rpmbuild/SRPMS/vitastor* /root/build/vitastor-el7/
+mkdir -p /root/packages/vitastor-el7; \
+rm -rf /root/packages/vitastor-el7/*; \
+cp ~/rpmbuild/RPMS/*/vitastor* /root/packages/vitastor-el7/; \
+cp ~/rpmbuild/SRPMS/vitastor* /root/packages/vitastor-el7/


@@ -1,22 +1,27 @@
Name: vitastor
-Version: 0.5
-Release: 2%{?dist}
+Version: 0.5.13
+Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
-License: Vitastor Network Public License 1.0
+License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
-Source0: vitastor-0.5.el7.tar.gz
+Source0: vitastor-0.5.13.el7.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel
BuildRequires: devtoolset-9-gcc-c++
BuildRequires: rh-nodejs12
BuildRequires: rh-nodejs12-npm
+BuildRequires: jerasure-devel
+BuildRequires: gf-complete-devel
+BuildRequires: cmake
Requires: fio = 3.7-1.el7
Requires: qemu-kvm = 2.0.0-1.el7.6
Requires: rh-nodejs12
Requires: rh-nodejs12-npm
Requires: liburing >= 0.6
+Requires: libJerasure2
+Requires: lpsolve
%description
Vitastor is a small, simple and fast clustered block storage (storage for VM drives),
@@ -31,12 +36,13 @@ size with configurable redundancy (replication or erasure codes/XOR).
%build
. /opt/rh/devtoolset-9/enable
-make %{?_smp_mflags} BINDIR=%_bindir LIBDIR=%_libdir QEMU_PLUGINDIR=%_libdir/qemu-kvm
+%cmake . -DQEMU_PLUGINDIR=qemu-kvm
+%make_build
%install
rm -rf $RPM_BUILD_ROOT
-%make_install BINDIR=%_bindir LIBDIR=%_libdir QEMU_PLUGINDIR=%_libdir/qemu-kvm
+%make_install
. /opt/rh/rh-nodejs12/enable
cd mon
npm install
@@ -52,7 +58,11 @@ cp -r mon %buildroot/usr/lib/vitastor/mon
%_bindir/vitastor-osd
%_bindir/vitastor-rm
%_libdir/qemu-kvm/block-vitastor.so
-%_libdir/vitastor
+%_libdir/libfio_vitastor.so
+%_libdir/libfio_vitastor_blk.so
+%_libdir/libfio_vitastor_sec.so
+%_libdir/libvitastor_blk.so
+%_libdir/libvitastor_client.so
/usr/lib/vitastor


@@ -1,5 +1,5 @@
# Build packages for CentOS 8 inside a container
-# cd ..; podman build -t vitastor-el8 -v `pwd`/build:/root/build -f rpm/vitastor-el8.Dockerfile .
+# cd ..; podman build -t vitastor-el8 -v `pwd`/packages:/root/packages -f rpm/vitastor-el8.Dockerfile .
FROM centos:8
@@ -7,13 +7,14 @@ WORKDIR /root
RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
RUN dnf -y install centos-release-advanced-virtualization epel-release dnf-plugins-core
-RUN dnf --enablerepo='centos-advanced-virtualization' -y install gcc-toolset-9 gcc-toolset-9-gcc-c++ gperftools-devel qemu-kvm fio nodejs rpm-build
-RUN rm -rf /var/lib/dnf/*; dnf download --disablerepo='*' --enablerepo='centos-advanced-virtualization-source' --source qemu-kvm
+RUN yum -y install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm
+RUN dnf --enablerepo='centos-advanced-virtualization' -y install gcc-toolset-9 gcc-toolset-9-gcc-c++ gperftools-devel qemu-kvm fio nodejs rpm-build jerasure-devel gf-complete-devel
+RUN rm -rf /var/lib/dnf/*; dnf download --disablerepo='*' --enablerepo='vitastor' --source qemu-kvm
RUN dnf download --source fio
RUN rpm --nomd5 -i qemu*.src.rpm
RUN rpm --nomd5 -i fio*.src.rpm
-RUN cd ~/rpmbuild/SPECS && dnf builddep -y --enablerepo='*' --spec qemu-kvm.spec
-RUN cd ~/rpmbuild/SPECS && dnf builddep -y --enablerepo='*' --spec fio.spec
+RUN cd ~/rpmbuild/SPECS && dnf builddep -y --enablerepo=powertools --spec qemu-kvm.spec
+RUN cd ~/rpmbuild/SPECS && dnf builddep -y --enablerepo=powertools --spec fio.spec && dnf install -y cmake
ADD https://vitastor.io/rpms/liburing-el7/liburing-0.7-2.el7.src.rpm /root
@@ -22,44 +23,23 @@ RUN set -e; \
cd ~/rpmbuild/SPECS/; \
. /opt/rh/gcc-toolset-9/enable; \
rpmbuild -ba liburing.spec; \
-mkdir -p /root/build/liburing-el8; \
-rm -rf /root/build/liburing-el8/*; \
-cp ~/rpmbuild/RPMS/*/liburing* /root/build/liburing-el8/; \
-cp ~/rpmbuild/SRPMS/liburing* /root/build/liburing-el8/
-RUN rpm -i `ls /root/build/liburing-el7/liburing-*.x86_64.rpm | grep -v debug`
+mkdir -p /root/packages/liburing-el8; \
+rm -rf /root/packages/liburing-el8/*; \
+cp ~/rpmbuild/RPMS/*/liburing* /root/packages/liburing-el8/; \
+cp ~/rpmbuild/SRPMS/liburing* /root/packages/liburing-el8/
+RUN rpm -i `ls /root/packages/liburing-el7/liburing-*.x86_64.rpm | grep -v debug`
-ADD qemu-*-vitastor.patch /root/vitastor/
-RUN set -e; \
-mkdir -p /root/build/qemu-el8; \
-rm -rf /root/build/qemu-el8/*; \
-rpm --nomd5 -i /root/qemu*.src.rpm; \
-cd ~/rpmbuild/SPECS; \
-PN=$(grep ^Patch qemu-kvm.spec | tail -n1 | perl -pe 's/Patch(\d+).*/$1/'); \
-csplit qemu-kvm.spec "/^Patch$PN/"; \
-cat xx00 > qemu-kvm.spec; \
-head -n 1 xx01 >> qemu-kvm.spec; \
-echo "Patch$((PN+1)): qemu-4.2-vitastor.patch" >> qemu-kvm.spec; \
-tail -n +2 xx01 >> qemu-kvm.spec; \
-perl -i -pe 's/(^Release:\s*\d+)/$1.vitastor/' qemu-kvm.spec; \
-cp /root/vitastor/qemu-4.2-vitastor.patch ~/rpmbuild/SOURCES; \
-rpmbuild --nocheck -ba qemu-kvm.spec; \
-cp ~/rpmbuild/RPMS/*/*qemu* /root/build/qemu-el8/; \
-cp ~/rpmbuild/SRPMS/*qemu* /root/build/qemu-el8/
-RUN cd /root/build/qemu-el8; dnf -y install `ls qemu*.rpm | grep -vP 'debug|guest|tests|src'`
ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
-cp /root/vitastor-0.5.el8.tar.gz ~/rpmbuild/SOURCES; \
+cp /root/vitastor-0.5.13.el8.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \
-mkdir -p /root/build/vitastor-el8; \
-rm -rf /root/build/vitastor-el8/*; \
-cp ~/rpmbuild/RPMS/*/vitastor* /root/build/vitastor-el8/; \
-cp ~/rpmbuild/SRPMS/vitastor* /root/build/vitastor-el8/
+mkdir -p /root/packages/vitastor-el8; \
+rm -rf /root/packages/vitastor-el8/*; \
+cp ~/rpmbuild/RPMS/*/vitastor* /root/packages/vitastor-el8/; \
+cp ~/rpmbuild/SRPMS/vitastor* /root/packages/vitastor-el8/


@@ -1,20 +1,25 @@
Name: vitastor
-Version: 0.5
-Release: 2%{?dist}
+Version: 0.5.13
+Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
-License: Vitastor Network Public License 1.0
+License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
-Source0: vitastor-0.5.el8.tar.gz
+Source0: vitastor-0.5.13.el8.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel
BuildRequires: gcc-toolset-9-gcc-c++
BuildRequires: nodejs >= 10
+BuildRequires: jerasure-devel
+BuildRequires: gf-complete-devel
+BuildRequires: cmake
Requires: fio = 3.7-3.el8
Requires: qemu-kvm = 4.2.0-29.el8.6
Requires: nodejs >= 10
Requires: liburing >= 0.6
+Requires: libJerasure2
+Requires: lpsolve
%description
Vitastor is a small, simple and fast clustered block storage (storage for VM drives),
@@ -29,12 +34,13 @@ size with configurable redundancy (replication or erasure codes/XOR).
%build
. /opt/rh/gcc-toolset-9/enable
-make %{?_smp_mflags} BINDIR=%_bindir LIBDIR=%_libdir QEMU_PLUGINDIR=%_libdir/qemu-kvm
+%cmake . -DQEMU_PLUGINDIR=qemu-kvm
+%make_build
%install
rm -rf $RPM_BUILD_ROOT
-%make_install BINDIR=%_bindir LIBDIR=%_libdir QEMU_PLUGINDIR=%_libdir/qemu-kvm
+%make_install
cd mon
npm install
cd ..
@@ -49,7 +55,11 @@ cp -r mon %buildroot/usr/lib/vitastor
%_bindir/vitastor-osd
%_bindir/vitastor-rm
%_libdir/qemu-kvm/block-vitastor.so
-%_libdir/vitastor
+%_libdir/libfio_vitastor.so
+%_libdir/libfio_vitastor_blk.so
+%_libdir/libfio_vitastor_sec.so
+%_libdir/libvitastor_blk.so
+%_libdir/libvitastor_client.so
/usr/lib/vitastor

src/CMakeLists.txt Normal file

@@ -0,0 +1,199 @@
cmake_minimum_required(VERSION 2.8)
project(vitastor)
include(GNUInstallDirs)
set(QEMU_PLUGINDIR qemu CACHE STRING "QEMU plugin directory suffix (qemu-kvm on RHEL)")
set(WITH_ASAN false CACHE BOOL "Build with AddressSanitizer")
if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
if(EXISTS "/etc/debian_version")
set(CMAKE_INSTALL_LIBDIR "lib/${CMAKE_LIBRARY_ARCHITECTURE}")
endif()
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()
add_definitions(-DVERSION="0.6-dev")
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -I ${CMAKE_SOURCE_DIR}/src)
if (${WITH_ASAN})
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
add_link_options(-fsanitize=address -fno-omit-frame-pointer)
endif (${WITH_ASAN})
set(CMAKE_BUILD_TYPE RelWithDebInfo)
string(REGEX REPLACE "([\\/\\-]O)[12]?" "\\13" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REGEX REPLACE "([\\/\\-]O)[12]?" "\\13" CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL}")
string(REGEX REPLACE "([\\/\\-]O)[12]?" "\\13" CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}")
string(REGEX REPLACE "([\\/\\-]D) *NDEBUG" "" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REGEX REPLACE "([\\/\\-]D) *NDEBUG" "" CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL}")
string(REGEX REPLACE "([\\/\\-]D) *NDEBUG" "" CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}")
string(REGEX REPLACE "([\\/\\-]O)[12]?" "\\13" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
string(REGEX REPLACE "([\\/\\-]O)[12]?" "\\13" CMAKE_C_FLAGS_MINSIZEREL "${CMAKE_C_FLAGS_MINSIZEREL}")
string(REGEX REPLACE "([\\/\\-]O)[12]?" "\\13" CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO}")
string(REGEX REPLACE "([\\/\\-]D) *NDEBUG" "" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
string(REGEX REPLACE "([\\/\\-]D) *NDEBUG" "" CMAKE_C_FLAGS_MINSIZEREL "${CMAKE_C_FLAGS_MINSIZEREL}")
string(REGEX REPLACE "([\\/\\-]D) *NDEBUG" "" CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO}")
find_package(PkgConfig)
pkg_check_modules(LIBURING REQUIRED liburing)
pkg_check_modules(GLIB REQUIRED glib-2.0)
include_directories(
../
/usr/include/jerasure
${LIBURING_INCLUDE_DIRS}
)
# libvitastor_blk.so
add_library(vitastor_blk SHARED
allocator.cpp blockstore.cpp blockstore_impl.cpp blockstore_init.cpp blockstore_open.cpp blockstore_journal.cpp blockstore_read.cpp
blockstore_write.cpp blockstore_sync.cpp blockstore_stable.cpp blockstore_rollback.cpp blockstore_flush.cpp crc32c.c ringloop.cpp
)
target_link_libraries(vitastor_blk
${LIBURING_LIBRARIES}
tcmalloc_minimal
)
# libfio_vitastor_blk.so
add_library(fio_vitastor_blk SHARED
fio_engine.cpp
../json11/json11.cpp
)
target_link_libraries(fio_vitastor_blk
vitastor_blk
)
# vitastor-osd
add_executable(vitastor-osd
osd_main.cpp osd.cpp osd_secondary.cpp msgr_receive.cpp msgr_send.cpp osd_peering.cpp osd_flush.cpp osd_peering_pg.cpp
osd_primary.cpp osd_primary_sync.cpp osd_primary_write.cpp osd_primary_subops.cpp
etcd_state_client.cpp messenger.cpp msgr_stop.cpp msgr_op.cpp osd_cluster.cpp http_client.cpp osd_ops.cpp pg_states.cpp
osd_rmw.cpp base64.cpp timerfd_manager.cpp epoll_manager.cpp ../json11/json11.cpp
)
target_link_libraries(vitastor-osd
vitastor_blk
Jerasure
)
# libfio_vitastor_sec.so
add_library(fio_vitastor_sec SHARED
fio_sec_osd.cpp
rw_blocking.cpp
)
target_link_libraries(fio_vitastor_sec
tcmalloc_minimal
)
# libvitastor_client.so
add_library(vitastor_client SHARED
cluster_client.cpp epoll_manager.cpp etcd_state_client.cpp
messenger.cpp msgr_stop.cpp msgr_op.cpp msgr_send.cpp msgr_receive.cpp ringloop.cpp ../json11/json11.cpp
http_client.cpp osd_ops.cpp pg_states.cpp timerfd_manager.cpp base64.cpp
)
target_link_libraries(vitastor_client
tcmalloc_minimal
${LIBURING_LIBRARIES}
)
# libfio_vitastor.so
add_library(fio_vitastor SHARED
fio_cluster.cpp
)
target_link_libraries(fio_vitastor
vitastor_client
)
# vitastor-nbd
add_executable(vitastor-nbd
nbd_proxy.cpp
)
target_link_libraries(vitastor-nbd
vitastor_client
)
# vitastor-rm
add_executable(vitastor-rm
rm_inode.cpp
)
target_link_libraries(vitastor-rm
vitastor_client
)
# vitastor-dump-journal
add_executable(vitastor-dump-journal
dump_journal.cpp crc32c.c
)
# qemu_driver.so
add_library(qemu_proxy STATIC qemu_proxy.cpp)
target_compile_options(qemu_proxy PUBLIC -fPIC)
target_include_directories(qemu_proxy PUBLIC
../qemu/b/qemu
../qemu/include
${GLIB_INCLUDE_DIRS}
)
target_link_libraries(qemu_proxy
vitastor_client
)
add_library(qemu_vitastor SHARED
qemu_driver.c
)
target_link_libraries(qemu_vitastor
qemu_proxy
)
set_target_properties(qemu_vitastor PROPERTIES
PREFIX ""
OUTPUT_NAME "block-vitastor"
)
### Test stubs
# stub_osd, stub_bench, osd_test
add_executable(stub_osd stub_osd.cpp rw_blocking.cpp)
target_link_libraries(stub_osd tcmalloc_minimal)
add_executable(stub_bench stub_bench.cpp rw_blocking.cpp)
target_link_libraries(stub_bench tcmalloc_minimal)
add_executable(osd_test osd_test.cpp rw_blocking.cpp)
target_link_libraries(osd_test tcmalloc_minimal)
# osd_rmw_test
add_executable(osd_rmw_test osd_rmw_test.cpp allocator.cpp)
target_link_libraries(osd_rmw_test Jerasure tcmalloc_minimal)
# stub_uring_osd
add_executable(stub_uring_osd
stub_uring_osd.cpp epoll_manager.cpp messenger.cpp msgr_stop.cpp msgr_op.cpp
msgr_send.cpp msgr_receive.cpp ringloop.cpp timerfd_manager.cpp ../json11/json11.cpp
)
target_link_libraries(stub_uring_osd
${LIBURING_LIBRARIES}
tcmalloc_minimal
)
# osd_peering_pg_test
add_executable(osd_peering_pg_test osd_peering_pg_test.cpp osd_peering_pg.cpp)
target_link_libraries(osd_peering_pg_test tcmalloc_minimal)
# test_allocator
add_executable(test_allocator test_allocator.cpp allocator.cpp)
# test_cluster_client
add_executable(test_cluster_client
test_cluster_client.cpp
pg_states.cpp osd_ops.cpp cluster_client.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
etcd_state_client.cpp timerfd_manager.cpp ../json11/json11.cpp
)
target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)
target_include_directories(test_cluster_client PUBLIC ${CMAKE_SOURCE_DIR}/src/mock)
## test_blockstore, test_shit
#add_executable(test_blockstore test_blockstore.cpp timerfd_interval.cpp)
#target_link_libraries(test_blockstore blockstore)
#add_executable(test_shit test_shit.cpp osd_peering_pg.cpp)
#target_link_libraries(test_shit ${LIBURING_LIBRARIES} m)
### Install
install(TARGETS vitastor-osd vitastor-dump-journal vitastor-nbd vitastor-rm RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
install(TARGETS fio_vitastor fio_vitastor_blk fio_vitastor_sec vitastor_blk vitastor_client LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR})
install(TARGETS qemu_vitastor LIBRARY DESTINATION /usr/${CMAKE_INSTALL_LIBDIR}/${QEMU_PLUGINDIR})


@@ -1,5 +1,5 @@
// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 (see README.md for details)
+// License: VNPL-1.1 (see README.md for details)
#include <stdexcept>
#include "allocator.h"
@@ -13,19 +13,19 @@ allocator::allocator(uint64_t blocks)
{
throw std::invalid_argument("blocks");
}
-uint64_t p2 = 1, total = 1;
+uint64_t p2 = 1;
+total = 0;
while (p2 * 64 < blocks)
{
-p2 = p2 * 64;
total += p2;
+p2 = p2 * 64;
}
-total -= p2;
total += (blocks+63) / 64;
-mask = new uint64_t[2 + total];
+mask = new uint64_t[total];
size = free = blocks;
last_one_mask = (blocks % 64) == 0
? UINT64_MAX
-: ~(UINT64_MAX << (64 - blocks % 64));
+: ((1l << (blocks % 64)) - 1);
for (uint64_t i = 0; i < total; i++)
{
mask[i] = 0;
@@ -37,6 +37,21 @@ allocator::~allocator()
delete[] mask;
}
+bool allocator::get(uint64_t addr)
+{
+if (addr >= size)
+{
+return false;
+}
+uint64_t p2 = 1, offset = 0;
+while (p2 * 64 < size)
+{
+offset += p2;
+p2 = p2 * 64;
+}
+return ((mask[offset + addr/64] >> (addr % 64)) & 1);
+}
void allocator::set(uint64_t addr, bool value)
{
if (addr >= size)
@@ -99,6 +114,10 @@ uint64_t allocator::find_free()
uint64_t p2 = 1, offset = 0, addr = 0, f, i;
while (p2 < size)
{
+if (offset+addr >= total)
+{
+return UINT64_MAX;
+}
uint64_t m = mask[offset + addr];
for (i = 0, f = 1; i < 64; i++, f <<= 1)
{
@@ -113,11 +132,6 @@ uint64_t allocator::find_free()
return UINT64_MAX;
}
addr = (addr * 64) | i;
-if (addr >= size)
-{
-// No space
-return UINT64_MAX;
-}
offset += p2;
p2 = p2 * 64;
}
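The constructor hunk above recomputes total — the number of 64-bit words backing the hierarchical bitmap — and replaces the last_one_mask expression so that it keeps the low blocks % 64 valid bits of the final word. A standalone sketch of just that arithmetic, assuming nothing beyond what the hunk shows (1ul is used here instead of the hunk's 1l to keep the shift unsigned):

#include <cstdint>
#include <cstdio>

// Words needed for a 64-ary hierarchical bitmap over <blocks> bits,
// mirroring the corrected loop: sum the sizes of all upper levels
// (1, 64, 4096, ... words), then add the leaf level of ceil(blocks/64) words.
static uint64_t bitmap_words(uint64_t blocks)
{
    uint64_t p2 = 1, total = 0;
    while (p2 * 64 < blocks)
    {
        total += p2; // one word at this level covers 64 words of the level below
        p2 = p2 * 64;
    }
    return total + (blocks + 63) / 64;
}

int main()
{
    // For 100 blocks: one root word + ceil(100/64) = 2 leaf words = 3 words.
    printf("words(100) = %lu\n", bitmap_words(100));
    // The fixed tail mask keeps only the low (blocks % 64) valid bits:
    uint64_t blocks = 100;
    uint64_t last_one_mask = (blocks % 64) == 0 ? UINT64_MAX : ((1ul << (blocks % 64)) - 1);
    printf("last_one_mask = %016lx\n", last_one_mask); // 100 % 64 = 36 low bits set
    return 0;
}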


@@ -1,5 +1,5 @@
// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 (see README.md for details)
+// License: VNPL-1.1 (see README.md for details)
#pragma once
@@ -8,6 +8,7 @@
// Hierarchical bitmap allocator
class allocator
{
+uint64_t total;
uint64_t size;
uint64_t free;
uint64_t last_one_mask;
@@ -15,6 +16,7 @@ class allocator
public:
allocator(uint64_t blocks);
~allocator();
+bool get(uint64_t addr);
void set(uint64_t addr, bool value);
uint64_t find_free();
uint64_t get_free_count();


@@ -1,5 +1,5 @@
// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 (see README.md for details)
+// License: VNPL-1.1 (see README.md for details)
#include "base64.h"


@@ -1,5 +1,5 @@
// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 (see README.md for details)
+// License: VNPL-1.1 (see README.md for details)
#pragma once
#include <string>


@@ -1,5 +1,5 @@
// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 (see README.md for details)
+// License: VNPL-1.1 (see README.md for details)
#include "blockstore_impl.h"
@@ -35,12 +35,7 @@ bool blockstore_t::is_safe_to_stop()
void blockstore_t::enqueue_op(blockstore_op_t *op)
{
-impl->enqueue_op(op, false);
-}
-void blockstore_t::enqueue_op_first(blockstore_op_t *op)
-{
-impl->enqueue_op(op, true);
+impl->enqueue_op(op);
}
std::unordered_map<object_id, uint64_t> & blockstore_t::get_unstable_writes()
@@ -63,7 +58,7 @@ uint64_t blockstore_t::get_free_block_count()
return impl->get_free_block_count();
}
-uint32_t blockstore_t::get_disk_alignment()
+uint32_t blockstore_t::get_bitmap_granularity()
{
-return impl->get_disk_alignment();
+return impl->get_bitmap_granularity();
}


@@ -1,5 +1,5 @@
// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 (see README.md for details)
+// License: VNPL-1.1 (see README.md for details)
#pragma once
@@ -175,10 +175,6 @@
// Submission
void enqueue_op(blockstore_op_t *op);
-// Insert operation into the beginning of the queue
-// Intended for the OSD syncer "thread" to be able to stabilize something when the journal is full
-void enqueue_op_first(blockstore_op_t *op);
// Unstable writes are added here (map of object_id -> version)
std::unordered_map<object_id, uint64_t> & get_unstable_writes();
@@ -187,5 +183,5 @@
uint64_t get_block_count();
uint64_t get_free_block_count();
-uint32_t get_disk_alignment();
+uint32_t get_bitmap_granularity();
};


@@ -1,12 +1,15 @@
// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 (see README.md for details)
+// License: VNPL-1.1 (see README.md for details)
#include "blockstore_impl.h"
-journal_flusher_t::journal_flusher_t(int flusher_count, blockstore_impl_t *bs)
+journal_flusher_t::journal_flusher_t(blockstore_impl_t *bs)
{
this->bs = bs;
-this->flusher_count = flusher_count;
+this->max_flusher_count = bs->max_flusher_count;
+this->min_flusher_count = bs->min_flusher_count;
+this->cur_flusher_count = bs->min_flusher_count;
+this->target_flusher_count = bs->min_flusher_count;
dequeuing = false;
trimming = false;
active_flushers = 0;
@@ -14,11 +17,11 @@ journal_flusher_t::journal_flusher_t(int flusher_count, blockstore_impl_t *bs)
// FIXME: allow to configure flusher_start_threshold and journal_trim_interval
flusher_start_threshold = bs->journal_block_size / sizeof(journal_entry_stable);
journal_trim_interval = 512;
-journal_trim_counter = 0;
-trim_wanted = 0;
+journal_trim_counter = bs->journal.flush_journal ? 1 : 0;
+trim_wanted = bs->journal.flush_journal ? 1 : 0;
journal_superblock = bs->journal.inmemory ? bs->journal.buffer : memalign_or_die(MEM_ALIGNMENT, bs->journal_block_size);
-co = new journal_flusher_co[flusher_count];
-for (int i = 0; i < flusher_count; i++)
+co = new journal_flusher_co[max_flusher_count];
+for (int i = 0; i < max_flusher_count; i++)
{
co[i].bs = bs;
co[i].flusher = this;
@@ -68,14 +71,31 @@ bool journal_flusher_t::is_active()
void journal_flusher_t::loop()
{
-for (int i = 0; (active_flushers > 0 || dequeuing) && i < flusher_count; i++)
+target_flusher_count = bs->write_iodepth*2;
+if (target_flusher_count < min_flusher_count)
+target_flusher_count = min_flusher_count;
+else if (target_flusher_count > max_flusher_count)
+target_flusher_count = max_flusher_count;
+if (target_flusher_count > cur_flusher_count)
+cur_flusher_count = target_flusher_count;
+else if (target_flusher_count < cur_flusher_count)
{
-co[i].loop();
+while (target_flusher_count < cur_flusher_count)
+{
+if (co[cur_flusher_count-1].wait_state)
+break;
+cur_flusher_count--;
+}
}
+for (int i = 0; (active_flushers > 0 || dequeuing) && i < cur_flusher_count; i++)
+co[i].loop();
}
void journal_flusher_t::enqueue_flush(obj_ver_id ov)
{
+#ifdef BLOCKSTORE_DEBUG
+printf("enqueue_flush %lx:%lx v%lu\n", ov.oid.inode, ov.oid.stripe, ov.version);
+#endif
auto it = flush_versions.find(ov.oid);
if (it != flush_versions.end())
{
@@ -94,8 +114,11 @@ void journal_flusher_t::enqueue_flush(obj_ver_id ov)
}
}
-void journal_flusher_t::unshift_flush(obj_ver_id ov)
+void journal_flusher_t::unshift_flush(obj_ver_id ov, bool force)
{
+#ifdef BLOCKSTORE_DEBUG
+printf("unshift_flush %lx:%lx v%lu\n", ov.oid.inode, ov.oid.stripe, ov.version);
+#endif
auto it = flush_versions.find(ov.oid);
if (it != flush_versions.end())
{
@@ -105,15 +128,38 @@ void journal_flusher_t::unshift_flush(obj_ver_id ov)
else
{
flush_versions[ov.oid] = ov.version;
+if (!force)
+flush_queue.push_front(ov.oid);
}
-flush_queue.push_front(ov.oid);
-if (!dequeuing && (flush_queue.size() >= flusher_start_threshold || trim_wanted > 0))
+if (force)
+flush_queue.push_front(ov.oid);
+if (force || !dequeuing && (flush_queue.size() >= flusher_start_threshold || trim_wanted > 0))
{
dequeuing = true;
bs->ringloop->wakeup();
}
}
+void journal_flusher_t::remove_flush(object_id oid)
+{
+#ifdef BLOCKSTORE_DEBUG
+printf("undo_flush %lx:%lx\n", oid.inode, oid.stripe);
+#endif
+auto v_it = flush_versions.find(oid);
+if (v_it != flush_versions.end())
+{
+flush_versions.erase(v_it);
+for (auto q_it = flush_queue.begin(); q_it != flush_queue.end(); q_it++)
+{
+if (*q_it == oid)
+{
+flush_queue.erase(q_it);
+break;
+}
+}
+}
+}
void journal_flusher_t::request_trim()
{
dequeuing = true;
@@ -192,8 +238,10 @@ bool journal_flusher_co::loop()
else if (wait_state == 21)
goto resume_21;
resume_0:
-if (!flusher->flush_queue.size() || !flusher->dequeuing)
+if (flusher->flush_queue.size() < flusher->min_flusher_count && !flusher->trim_wanted ||
+!flusher->flush_queue.size() || !flusher->dequeuing)
{
+stop_flusher:
if (flusher->trim_wanted > 0 && flusher->journal_trim_counter > 0)
{
// Attempt forced trim
@@ -298,9 +346,7 @@ resume_0:
#ifdef BLOCKSTORE_DEBUG
printf("No older flushes, stopping\n");
#endif
-flusher->dequeuing = false;
-wait_state = 0;
-return true;
+goto stop_flusher;
}
}
}
@@ -319,8 +365,8 @@ resume_1:
return false;
}
// Writes and deletes shouldn't happen at the same time
-assert(!(copy_count > 0 || has_writes) || !has_delete);
-if (copy_count == 0 && !has_writes && !has_delete || has_delete && old_clean_loc == UINT64_MAX)
+assert(!has_writes || !has_delete);
+if (!has_writes && !has_delete || has_delete && old_clean_loc == UINT64_MAX)
{
// Nothing to flush
bs->erase_dirty(dirty_start, std::next(dirty_end), clean_loc);
@@ -438,6 +484,14 @@ resume_1:
}
if (has_delete)
{
+clean_disk_entry *new_entry = (clean_disk_entry*)(meta_new.buf + meta_new.pos*bs->clean_entry_size);
+if (new_entry->oid.inode != 0 && new_entry->oid != cur.oid)
+{
+printf("Fatal error (metadata corruption or bug): tried to delete metadata entry %lu (%lx:%lx) while deleting %lx:%lx\n",
+clean_loc >> bs->block_order, new_entry->oid.inode, new_entry->oid.stripe, cur.oid.inode, cur.oid.stripe);
+exit(1);
+}
+// zero out new metadata entry
memset(meta_new.buf + meta_new.pos*bs->clean_entry_size, 0, bs->clean_entry_size);
}
else
@@ -445,8 +499,8 @@ resume_1:
clean_disk_entry *new_entry = (clean_disk_entry*)(meta_new.buf + meta_new.pos*bs->clean_entry_size);
if (new_entry->oid.inode != 0 && new_entry->oid != cur.oid)
{
-printf("Fatal error (metadata corruption or bug): tried to overwrite non-zero metadata entry %lx (%lx:%lx) with %lx:%lx\n",
-clean_loc, new_entry->oid.inode, new_entry->oid.stripe, cur.oid.inode, cur.oid.stripe);
+printf("Fatal error (metadata corruption or bug): tried to overwrite non-zero metadata entry %lu (%lx:%lx) with %lx:%lx\n",
+clean_loc >> bs->block_order, new_entry->oid.inode, new_entry->oid.stripe, cur.oid.inode, cur.oid.stripe);
exit(1);
}
new_entry->oid = cur.oid;
@@ -513,7 +567,7 @@ resume_1:
if (repeat_it != flusher->sync_to_repeat.end() && repeat_it->second > cur.version)
{
// Requeue version
-flusher->unshift_flush({ .oid = cur.oid, .version = repeat_it->second });
+flusher->unshift_flush({ .oid = cur.oid, .version = repeat_it->second }, false);
}
flusher->sync_to_repeat.erase(repeat_it);
trim_journal:
@@ -572,6 +626,12 @@ resume_1:
#endif
flusher->trimming = false;
}
+if (bs->journal.flush_journal && !flusher->flush_queue.size())
+{
+assert(bs->journal.used_start == bs->journal.next_free);
+printf("Journal flushed\n");
+exit(0);
+}
}
// All done
flusher->active_flushers--;
@@ -602,7 +662,7 @@ bool journal_flusher_co::scan_dirty(int wait_base)
{
char err[1024];
snprintf(
-err, 1024, "BUG: Unexpected dirty_entry %lx:%lx v%lu state during flush: %d",
+err, 1024, "BUG: Unexpected dirty_entry %lx:%lx v%lu unstable state during flush: 0x%x",
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, dirty_it->second.state
);
throw std::runtime_error(err);
@@ -731,7 +791,10 @@ void journal_flusher_co::update_clean_db()
if (old_clean_loc != UINT64_MAX && old_clean_loc != clean_loc)
{
#ifdef BLOCKSTORE_DEBUG
-printf("Free block %lu (new location is %lu)\n", old_clean_loc >> bs->block_order, clean_loc >> bs->block_order);
+printf("Free block %lu from %lx:%lx v%lu (new location is %lu)\n",
+old_clean_loc >> bs->block_order,
+cur.oid.inode, cur.oid.stripe, cur.version,
+clean_loc >> bs->block_order);
#endif
bs->data_alloc->set(old_clean_loc >> bs->block_order, false);
}
@@ -739,6 +802,11 @@ void journal_flusher_co::update_clean_db()
{
auto clean_it = bs->clean_db.find(cur.oid);
bs->clean_db.erase(clean_it);
+#ifdef BLOCKSTORE_DEBUG
+printf("Free block %lu from %lx:%lx v%lu (delete)\n",
+clean_loc >> bs->block_order,
+cur.oid.inode, cur.oid.stripe, cur.version);
+#endif
bs->data_alloc->set(clean_loc >> bs->block_order, false);
clean_loc = UINT64_MAX;
}
@@ -760,7 +828,7 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
goto resume_1;
else if (wait_state == wait_base+2)
goto resume_2;
-if (!(fsync_meta ? bs->disable_meta_fsync : bs->disable_journal_fsync))
+if (!(fsync_meta ? bs->disable_meta_fsync : bs->disable_data_fsync))
{
cur_sync = flusher->syncs.end();
while (cur_sync != flusher->syncs.begin())
@@ -779,31 +847,34 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
sync_found:
cur_sync->ready_count++;
flusher->syncing_flushers++;
-if (flusher->syncing_flushers >= flusher->flusher_count || !flusher->flush_queue.size())
-{
-// Sync batch is ready. Do it.
-await_sqe(0);
-data->iov = { 0 };
-data->callback = simple_callback_w;
-my_uring_prep_fsync(sqe, fsync_meta ? bs->meta_fd : bs->data_fd, IORING_FSYNC_DATASYNC);
-cur_sync->state = 1;
-wait_count++;
-resume_1:
-if (wait_count > 0)
-{
-wait_state = 1;
-return false;
-}
-// Sync completed. All previous coroutines waiting for it must be resumed
-cur_sync->state = 2;
-bs->ringloop->wakeup();
-}
-// Wait until someone else sends and completes a sync.
-resume_2:
-if (!cur_sync->state)
-{
-wait_state = 2;
-return false;
-}
+resume_1:
+if (!cur_sync->state)
+{
+if (flusher->syncing_flushers >= flusher->cur_flusher_count || !flusher->flush_queue.size())
+{
+// Sync batch is ready. Do it.
+await_sqe(0);
+data->iov = { 0 };
+data->callback = simple_callback_w;
+my_uring_prep_fsync(sqe, fsync_meta ? bs->meta_fd : bs->data_fd, IORING_FSYNC_DATASYNC);
+cur_sync->state = 1;
+wait_count++;
+resume_2:
+if (wait_count > 0)
+{
+wait_state = 2;
+return false;
+}
+// Sync completed. All previous coroutines waiting for it must be resumed
+cur_sync->state = 2;
+bs->ringloop->wakeup();
+}
+else
+{
+// Wait until someone else sends and completes a sync.
+wait_state = 1;
+return false;
+}
+}
flusher->syncing_flushers--;
cur_sync->ready_count--;
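The new loop() above derives a target coroutine count from write_iodepth, clamps it to [min_flusher_count, max_flusher_count], grows to the target at once, and shrinks only while the highest-numbered coroutine is idle. A minimal sketch of that ramp rule under those assumptions — the function and the wait_state array here are illustrative stand-ins, not the flusher's real interface (wait_state != 0 means "coroutine busy"):

#include <algorithm>
#include <cstdio>

// Mirrors the ramp logic from journal_flusher_t::loop() in the hunk above:
// grow to the clamped target immediately, shrink one idle coroutine at a time.
static int adjust_flushers(int cur, int min_count, int max_count,
    int write_iodepth, const int *wait_state /* per-coroutine busy flags */)
{
    int target = std::max(min_count, std::min(max_count, write_iodepth * 2));
    if (target > cur)
        cur = target;
    else
        while (target < cur && !wait_state[cur - 1])
            cur--;
    return cur;
}

int main()
{
    int busy[8] = { 1, 1, 1, 0, 0, 1, 0, 0 };
    // With min=1, max=8 and iodepth falling to 1 the target is 2, but
    // coroutine #5 is still mid-flush, so this pass only shrinks to 6.
    printf("cur = %d\n", adjust_flushers(8, 1, 8, 1, busy));
    return 0;
}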


@@ -1,5 +1,5 @@
// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 (see README.md for details)
+// License: VNPL-1.1 (see README.md for details)
struct copy_buffer_t
{
@@ -80,7 +80,7 @@ class journal_flusher_t
{
int trim_wanted = 0;
bool dequeuing;
-int flusher_count;
+int min_flusher_count, max_flusher_count, cur_flusher_count, target_flusher_count;
int flusher_start_threshold;
journal_flusher_co *co;
blockstore_impl_t *bs;
@@ -99,7 +99,7 @@ class journal_flusher_t
std::deque<object_id> flush_queue;
std::map<object_id, uint64_t> flush_versions;
public:
-journal_flusher_t(int flusher_count, blockstore_impl_t *bs);
+journal_flusher_t(blockstore_impl_t *bs);
~journal_flusher_t();
void loop();
bool is_active();
@@ -107,5 +107,6 @@ public:
void request_trim();
void release_trim();
void enqueue_flush(obj_ver_id oid);
-void unshift_flush(obj_ver_id oid);
+void unshift_flush(obj_ver_id oid, bool force);
+void remove_flush(object_id oid);
};


@@ -1,5 +1,5 @@
// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 (see README.md for details)
+// License: VNPL-1.1 (see README.md for details)
#include "blockstore_impl.h"
@@ -10,9 +10,9 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
ring_consumer.loop = [this]() { loop(); };
ringloop->register_consumer(&ring_consumer);
initialized = 0;
-zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, block_size);
data_fd = meta_fd = journal.fd = -1;
parse_config(config);
+zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, block_size);
try
{
open_data();
@@ -31,7 +31,7 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
close(journal.fd);
throw;
}
-flusher = new journal_flusher_t(flusher_count, this);
+flusher = new journal_flusher_t(this);
}
blockstore_impl_t::~blockstore_impl_t()
@@ -92,35 +92,36 @@ void blockstore_impl_t::loop()
{
delete journal_init_reader;
journal_init_reader = NULL;
-initialized = 10;
+if (journal.flush_journal)
+initialized = 3;
+else
+initialized = 10;
ringloop->wakeup();
}
}
+if (initialized == 3)
+{
+if (readonly)
+{
+printf("Can't flush the journal in readonly mode\n");
+exit(1);
+}
+flusher->loop();
+ringloop->submit();
+}
}
else
{
// try to submit ops
unsigned initial_ring_space = ringloop->space_left();
-// FIXME: rework this "sync polling"
-auto cur_sync = in_progress_syncs.begin();
-while (cur_sync != in_progress_syncs.end())
-{
-if (continue_sync(*cur_sync) != 2)
-{
-// List is unmodified
-cur_sync++;
-}
-else
-{
-cur_sync = in_progress_syncs.begin();
-}
-}
-auto cur = submit_queue.begin();
-int has_writes = 0;
-while (cur != submit_queue.end())
+// has_writes == 0 - no writes before the current queue item
+// has_writes == 1 - some writes in progress
+// has_writes == 2 - tried to submit some writes, but failed
+int has_writes = 0, op_idx = 0, new_idx = 0;
+for (; op_idx < submit_queue.size(); op_idx++, new_idx++)
{
-auto op_ptr = cur;
-auto op = *(cur++);
+auto op = submit_queue[op_idx];
+submit_queue[new_idx] = op;
// FIXME: This needs some simplification
// Writes should not block reads if the ring is not full and reads don't depend on them
// In all other cases we should stop submission
@@ -142,10 +143,13 @@ void blockstore_impl_t::loop()
}
unsigned ring_space = ringloop->space_left();
unsigned prev_sqe_pos = ringloop->save();
-bool dequeue_op = false;
+// 0 = can't submit
+// 1 = in progress
+// 2 = can be removed from queue
+int wr_st = 0;
if (op->opcode == BS_OP_READ)
{
-dequeue_op = dequeue_read(op);
+wr_st = dequeue_read(op);
}
else if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE)
{
@@ -154,8 +158,8 @@ void blockstore_impl_t::loop()
// Some writes already could not be submitted
continue;
}
-dequeue_op = dequeue_write(op);
-has_writes = dequeue_op ? 1 : 2;
+wr_st = dequeue_write(op);
+has_writes = wr_st > 0 ? 1 : 2;
}
else if (op->opcode == BS_OP_DELETE)
{
@@ -164,8 +168,8 @@ void blockstore_impl_t::loop()
// Some writes already could not be submitted
continue;
}
-dequeue_op = dequeue_del(op);
-has_writes = dequeue_op ? 1 : 2;
+wr_st = dequeue_del(op);
+has_writes = wr_st > 0 ? 1 : 2;
}
else if (op->opcode == BS_OP_SYNC)
{
@@ -178,29 +182,31 @@ void blockstore_impl_t::loop()
// Can't submit SYNC before previous writes
continue;
}
-dequeue_op = dequeue_sync(op);
+wr_st = continue_sync(op, false);
+if (wr_st != 2)
+{
+has_writes = wr_st > 0 ? 1 : 2;
+}
}
else if (op->opcode == BS_OP_STABLE)
{
-dequeue_op = dequeue_stable(op);
+wr_st = dequeue_stable(op);
}
else if (op->opcode == BS_OP_ROLLBACK)
{
-dequeue_op = dequeue_rollback(op);
+wr_st = dequeue_rollback(op);
}
else if (op->opcode == BS_OP_LIST)
{
-// LIST doesn't need to be blocked by previous modifications,
-// it only needs to include all in-progress writes as they're guaranteed
-// to be readable and stabilizable/rollbackable by subsequent operations
+// LIST doesn't need to be blocked by previous modifications
process_list(op);
-dequeue_op = true;
+wr_st = 2;
}
-if (dequeue_op)
+if (wr_st == 2)
{
-submit_queue.erase(op_ptr);
+new_idx--;
}
-else
+else if (wr_st == 0)
{
ringloop->restore(prev_sqe_pos);
if (PRIV(op)->wait_for == WAIT_SQE)
@@ -211,6 +217,14 @@ void blockstore_impl_t::loop()
}
}
}
+if (op_idx != new_idx)
+{
+while (op_idx < submit_queue.size())
+{
+submit_queue[new_idx++] = submit_queue[op_idx++];
+}
+submit_queue.resize(new_idx);
+}
if (!readonly)
{
flusher->loop();
@@ -233,7 +247,7 @@ bool blockstore_impl_t::is_safe_to_stop()
{
// It's safe to stop blockstore when there are no in-flight operations,
// no in-progress syncs and flusher isn't doing anything
-if (submit_queue.size() > 0 || in_progress_syncs.size() > 0 || !readonly && flusher->is_active())
+if (submit_queue.size() > 0 || !readonly && flusher->is_active())
{
return false;
}
@@ -287,7 +301,7 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
else if (PRIV(op)->wait_for == WAIT_JOURNAL_BUFFER)
{
int next = ((journal.cur_sector + 1) % journal.sector_count);
-if (journal.sector_info[next].usage_count > 0 ||
+if (journal.sector_info[next].flush_count > 0 ||
journal.sector_info[next].dirty)
{
// do not submit
@@ -300,7 +314,7 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
}
else if (PRIV(op)->wait_for == WAIT_FREE)
{
-if (!data_alloc->get_free_count() && !flusher->is_active())
+if (!data_alloc->get_free_count() && flusher->is_active())
{
#ifdef BLOCKSTORE_DEBUG
printf("Still waiting for free space on the data device\n");
@@ -315,7 +329,7 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
}
}
-void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
+void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
{
if (op->opcode < BS_OP_MIN || op->opcode > BS_OP_MAX ||
((op->opcode == BS_OP_READ || op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE) && ( ((op->opcode == BS_OP_READ || op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE) && (
@@ -323,8 +337,7 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
op->len > block_size-op->offset || op->len > block_size-op->offset ||
(op->len % disk_alignment) (op->len % disk_alignment)
)) || )) ||
readonly && op->opcode != BS_OP_READ && op->opcode != BS_OP_LIST || readonly && op->opcode != BS_OP_READ && op->opcode != BS_OP_LIST)
first && (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE))
{ {
// Basic verification not passed // Basic verification not passed
op->retval = -EINVAL; op->retval = -EINVAL;
@@ -374,25 +387,12 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
std::function<void (blockstore_op_t*)>(op->callback)(op); std::function<void (blockstore_op_t*)>(op->callback)(op);
return; return;
} }
if (op->opcode == BS_OP_SYNC && immediate_commit == IMMEDIATE_ALL)
{
op->retval = 0;
std::function<void (blockstore_op_t*)>(op->callback)(op);
return;
}
// Call constructor without allocating memory. We'll call destructor before returning op back // Call constructor without allocating memory. We'll call destructor before returning op back
new ((void*)op->private_data) blockstore_op_private_t; new ((void*)op->private_data) blockstore_op_private_t;
PRIV(op)->wait_for = 0; PRIV(op)->wait_for = 0;
PRIV(op)->op_state = 0; PRIV(op)->op_state = 0;
PRIV(op)->pending_ops = 0; PRIV(op)->pending_ops = 0;
if (!first) submit_queue.push_back(op);
{
submit_queue.push_back(op);
}
else
{
submit_queue.push_front(op);
}
ringloop->wakeup(); ringloop->wakeup();
} }
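
Editor's note: the loop() rework above drops std::list erasure in favour of single-pass in-place compaction of the submit queue vector. A minimal standalone sketch of the same idiom, with illustrative names only (ints stand in for blockstore ops, even values pretend to complete with "wr_st == 2"):

    #include <cstdio>
    #include <vector>

    int main()
    {
        std::vector<int> queue = {1, 2, 3, 4, 5, 6};
        int op_idx = 0, new_idx = 0;
        for (; op_idx < (int)queue.size(); op_idx++, new_idx++)
        {
            queue[new_idx] = queue[op_idx]; // shift survivors left
            if (queue[op_idx] % 2 == 0)
                new_idx--; // "completed": next survivor overwrites this slot
        }
        if (op_idx != new_idx)
            queue.resize(new_idx);
        for (int op: queue)
            printf("%d ", op); // prints "1 3 5"
        printf("\n");
        return 0;
    }

The same sweep also explains the new 0/1/2 return convention: 2 compacts the op away, 0 rolls back submitted SQEs and keeps it, 1 keeps it while I/O is in flight.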

@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 (see README.md for details)
+// License: VNPL-1.1 (see README.md for details)
 
 #pragma once
@@ -160,8 +160,6 @@ struct blockstore_op_private_t
     // Sync
     std::vector<obj_ver_id> sync_big_writes, sync_small_writes;
     int sync_small_checked, sync_big_checked;
-    std::list<blockstore_op_t*>::iterator in_progress_ptr;
-    int prev_sync_count;
 };
 
 // https://github.com/algorithm-ninja/cpp-btree
@@ -199,7 +197,10 @@ class blockstore_impl_t
     // Suitable only for server SSDs with capacitors, requires disabled data and journal fsyncs
     int immediate_commit = IMMEDIATE_NONE;
     bool inmemory_meta = false;
-    int flusher_count;
+    // Maximum and minimum flusher count
+    unsigned max_flusher_count, min_flusher_count;
+    // Maximum queue depth
+    unsigned max_write_iodepth = 128;
     /******* END OF OPTIONS *******/
 
     struct ring_consumer_t ring_consumer;
@@ -207,9 +208,9 @@ class blockstore_impl_t
     blockstore_clean_db_t clean_db;
     uint8_t *clean_bitmap = NULL;
     blockstore_dirty_db_t dirty_db;
-    std::list<blockstore_op_t*> submit_queue; // FIXME: funny thing is that vector is better here
+    std::vector<blockstore_op_t*> submit_queue;
     std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
-    std::list<blockstore_op_t*> in_progress_syncs; // ...and probably here, too
+    int unsynced_big_write_count = 0;
     allocator *data_alloc = NULL;
     uint8_t *zero_object;
@@ -226,6 +227,7 @@ class blockstore_impl_t
     struct journal_t journal;
     journal_flusher_t *flusher;
+    int write_iodepth = 0;
 
     bool live = false, queue_stall = false;
     ring_loop_t *ringloop;
@@ -267,6 +269,7 @@ class blockstore_impl_t
     // Write
     bool enqueue_write(blockstore_op_t *op);
+    void cancel_all_writes(blockstore_op_t *op, blockstore_dirty_db_t::iterator dirty_it, int retval);
     int dequeue_write(blockstore_op_t *op);
     int dequeue_del(blockstore_op_t *op);
     int continue_write(blockstore_op_t *op);
@@ -274,16 +277,14 @@ class blockstore_impl_t
     void handle_write_event(ring_data_t *data, blockstore_op_t *op);
 
     // Sync
-    int dequeue_sync(blockstore_op_t *op);
+    int continue_sync(blockstore_op_t *op, bool queue_has_in_progress_sync);
     void handle_sync_event(ring_data_t *data, blockstore_op_t *op);
-    int continue_sync(blockstore_op_t *op);
-    void ack_one_sync(blockstore_op_t *op);
-    int ack_sync(blockstore_op_t *op);
+    void ack_sync(blockstore_op_t *op);
 
     // Stabilize
     int dequeue_stable(blockstore_op_t *op);
     int continue_stable(blockstore_op_t *op);
-    void mark_stable(const obj_ver_id & ov);
+    void mark_stable(const obj_ver_id & ov, bool forget_dirty = false);
     void handle_stable_event(ring_data_t *data, blockstore_op_t *op);
     void stabilize_object(object_id oid, uint64_t max_ver);
@@ -318,7 +319,7 @@ public:
     bool is_stalled();
 
     // Submission
-    void enqueue_op(blockstore_op_t *op, bool first = false);
+    void enqueue_op(blockstore_op_t *op);
 
     // Unstable writes are added here (map of object_id -> version)
     std::unordered_map<object_id, uint64_t> unstable_writes;
@@ -326,5 +327,5 @@ public:
     inline uint32_t get_block_size() { return block_size; }
     inline uint64_t get_block_count() { return block_count; }
     inline uint64_t get_free_block_count() { return data_alloc->get_free_count(); }
-    inline uint32_t get_disk_alignment() { return disk_alignment; }
+    inline uint32_t get_bitmap_granularity() { return disk_alignment; }
 };

@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 (see README.md for details)
+// License: VNPL-1.1 (see README.md for details)
 
 #include "blockstore_impl.h"
@@ -111,7 +111,10 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
         {
             // free the previous block
 #ifdef BLOCKSTORE_DEBUG
-            printf("Free block %lu (new location is %lu)\n", clean_it->second.location >> block_order, done_cnt+i >> block_order);
+            printf("Free block %lu from %lx:%lx v%lu (new location is %lu)\n",
+                clean_it->second.location >> block_order,
+                clean_it->first.inode, clean_it->first.stripe, clean_it->second.version,
+                done_cnt+i);
 #endif
             bs->data_alloc->set(clean_it->second.location >> block_order, false);
         }
@@ -399,6 +402,18 @@ resume_1:
             }
         }
     }
+    for (auto ov: double_allocs)
+    {
+        auto dirty_it = bs->dirty_db.find(ov);
+        if (dirty_it != bs->dirty_db.end() &&
+            IS_BIG_WRITE(dirty_it->second.state) &&
+            dirty_it->second.location == UINT64_MAX)
+        {
+            printf("Fatal error (bug): %lx:%lx v%lu big_write journal_entry was allocated over another object\n",
+                dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
+            exit(1);
+        }
+    }
     bs->flusher->mark_trim_possible();
     bs->journal.dirty_start = bs->journal.next_free;
     printf(
@@ -549,7 +564,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                 unstab = unstab < ov.version ? ov.version : unstab;
                 if (je->type == JE_SMALL_WRITE_INSTANT)
                 {
-                    bs->mark_stable(ov);
+                    bs->mark_stable(ov, true);
                 }
             }
         }
@@ -557,9 +572,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
         {
 #ifdef BLOCKSTORE_DEBUG
             printf(
-                "je_big_write%s oid=%lx:%lx ver=%lu loc=%08lx\n",
+                "je_big_write%s oid=%lx:%lx ver=%lu loc=%lu\n",
                 je->type == JE_BIG_WRITE_INSTANT ? "_instant" : "",
-                je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location
+                je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location >> bs->block_order
             );
 #endif
             auto dirty_it = bs->dirty_db.upper_bound((obj_ver_id){
@@ -570,35 +585,19 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
             {
                 dirty_it--;
                 if (dirty_it->first.oid == je->big_write.oid &&
-                    dirty_it->first.version >= je->big_write.version &&
                     (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE)
                 {
                     // It is allowed to overwrite a deleted object with a
-                    // version number less than deletion version number,
+                    // version number smaller than deletion version number,
                     // because the presence of a BIG_WRITE entry means that
-                    // the data for it is already on disk.
-                    // Purge all dirty and clean entries for this object.
-                    auto dirty_end = dirty_it;
-                    dirty_end++;
-                    while (1)
-                    {
-                        if (dirty_it == bs->dirty_db.begin())
-                        {
-                            break;
-                        }
-                        dirty_it--;
-                        if (dirty_it->first.oid != je->big_write.oid)
-                        {
-                            dirty_it++;
-                            break;
-                        }
-                    }
-                    bs->erase_dirty(dirty_it, dirty_end, UINT64_MAX);
-                    auto clean_it = bs->clean_db.find(je->big_write.oid);
-                    if (clean_it != bs->clean_db.end())
-                    {
-                        bs->data_alloc->set(clean_it->second.location >> bs->block_order, false);
-                        bs->clean_db.erase(clean_it);
-                    }
+                    // its data and metadata are already flushed.
+                    // We don't know if newer versions are flushed, but
+                    // the previous delete definitely is.
+                    // So we forget previous dirty entries, but retain the clean one.
+                    // This feature is required for writes happening shortly
+                    // after deletes.
+                    erase_dirty_object(dirty_it);
                 }
             }
             auto clean_it = bs->clean_db.find(je->big_write.oid);
@@ -610,18 +609,33 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                     .oid = je->big_write.oid,
                     .version = je->big_write.version,
                 };
-                bs->dirty_db.emplace(ov, (dirty_entry){
+                auto dirty_it = bs->dirty_db.emplace(ov, (dirty_entry){
                     .state = (BS_ST_BIG_WRITE | BS_ST_SYNCED),
                     .flags = 0,
                     .location = je->big_write.location,
                     .offset = je->big_write.offset,
                     .len = je->big_write.len,
                     .journal_sector = proc_pos,
-                });
+                }).first;
+                if (bs->data_alloc->get(je->big_write.location >> bs->block_order))
+                {
+                    // This is probably a big_write that's already flushed and freed, but it may
+                    // also indicate a bug. So we remember such entries and recheck them afterwards.
+                    // If it's not a bug they won't be present after reading the whole journal.
+                    dirty_it->second.location = UINT64_MAX;
+                    double_allocs.push_back(ov);
+                }
+                else
+                {
 #ifdef BLOCKSTORE_DEBUG
-                printf("Allocate block %lu\n", je->big_write.location >> bs->block_order);
+                    printf(
+                        "Allocate block (journal) %lu: %lx:%lx v%lu\n",
+                        je->big_write.location >> bs->block_order,
+                        ov.oid.inode, ov.oid.stripe, ov.version
+                    );
 #endif
-                bs->data_alloc->set(je->big_write.location >> bs->block_order, true);
+                    bs->data_alloc->set(je->big_write.location >> bs->block_order, true);
+                }
                 bs->journal.used_sectors[proc_pos]++;
 #ifdef BLOCKSTORE_DEBUG
                 printf(
@@ -633,7 +647,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                 unstab = unstab < ov.version ? ov.version : unstab;
                 if (je->type == JE_BIG_WRITE_INSTANT)
                 {
-                    bs->mark_stable(ov);
+                    bs->mark_stable(ov, true);
                 }
             }
         }
@@ -647,7 +661,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                 .oid = je->stable.oid,
                 .version = je->stable.version,
             };
-            bs->mark_stable(ov);
+            bs->mark_stable(ov, true);
         }
         else if (je->type == JE_ROLLBACK)
         {
@@ -666,9 +680,26 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
 #ifdef BLOCKSTORE_DEBUG
             printf("je_delete oid=%lx:%lx ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version);
 #endif
+            bool dirty_exists = false;
+            auto dirty_it = bs->dirty_db.upper_bound((obj_ver_id){
+                .oid = je->del.oid,
+                .version = UINT64_MAX,
+            });
+            if (dirty_it != bs->dirty_db.begin())
+            {
+                dirty_it--;
+                dirty_exists = dirty_it->first.oid == je->del.oid;
+            }
             auto clean_it = bs->clean_db.find(je->del.oid);
-            if (clean_it == bs->clean_db.end() ||
-                clean_it->second.version < je->del.version)
+            bool clean_exists = (clean_it != bs->clean_db.end() &&
+                clean_it->second.version < je->del.version);
+            if (!clean_exists && dirty_exists)
+            {
+                // Clean entry doesn't exist. This means that the delete is already flushed.
+                // So we must not flush this object anymore.
+                erase_dirty_object(dirty_it);
+            }
+            else if (clean_exists || dirty_exists)
             {
                 // oid, version
                 obj_ver_id ov = {
@@ -686,8 +717,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                 bs->journal.used_sectors[proc_pos]++;
                 // Deletions are treated as immediately stable, because
                 // "2-phase commit" (write->stabilize) isn't sufficient for them anyway
-                bs->mark_stable(ov);
+                bs->mark_stable(ov, true);
             }
+            // Ignore delete if neither preceding dirty entries nor the clean one are present
         }
         started = true;
         pos += je->size;
@@ -698,3 +730,30 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
     bs->journal.next_free = next_free;
     return 1;
 }
+
+void blockstore_init_journal::erase_dirty_object(blockstore_dirty_db_t::iterator dirty_it)
+{
+    auto oid = dirty_it->first.oid;
+    auto dirty_end = dirty_it;
+    dirty_end++;
+    while (1)
+    {
+        if (dirty_it == bs->dirty_db.begin())
+        {
+            break;
+        }
+        dirty_it--;
+        if (dirty_it->first.oid != oid)
+        {
+            dirty_it++;
+            break;
+        }
+    }
+    auto clean_it = bs->clean_db.find(oid);
+    uint64_t clean_loc = clean_it != bs->clean_db.end()
+        ? clean_it->second.location : UINT64_MAX;
+    bs->erase_dirty(dirty_it, dirty_end, clean_loc);
+    // Remove it from the flusher's queue, too
+    // Otherwise it may end up referring to a small unstable write after reading the rest of the journal
+    bs->flusher->remove_flush(oid);
+}
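
Editor's note: the double_allocs pass above defers the "same block allocated twice" verdict until the whole journal has been read, because a later entry can legitimately explain the collision (the real code re-checks whether the suspect dirty entry survived with location == UINT64_MAX). A stripped-down sketch of the remember-then-recheck shape, with a plain bitmap standing in for the allocator (hypothetical names, much simplified):

    #include <cstdio>
    #include <vector>

    int main()
    {
        std::vector<bool> used(8, false);  // stand-in for the block allocator bitmap
        std::vector<int> suspects;         // blocks seen allocated twice during replay
        int replayed_blocks[] = {3, 5, 3}; // journal says: allocate 3, 5, then 3 again
        for (int b: replayed_blocks)
        {
            if (used[b])
                suspects.push_back(b);     // don't fail yet - record and move on
            else
                used[b] = true;
        }
        // After replay: report whatever is still unexplained
        for (int b: suspects)
            printf("Fatal error (bug): block %d allocated twice\n", b);
        return 0;
    }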

@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 (see README.md for details)
+// License: VNPL-1.1 (see README.md for details)
 
 #pragma once
@@ -36,6 +36,7 @@ class blockstore_init_journal
     bool started = false;
     uint64_t next_free;
     std::vector<bs_init_journal_done> done;
+    std::vector<obj_ver_id> double_allocs;
     uint64_t journal_pos = 0;
     uint64_t continue_pos = 0;
     void *init_write_buf = NULL;
@@ -48,6 +49,7 @@ class blockstore_init_journal
     std::function<void(ring_data_t*)> simple_callback;
     int handle_journal_part(void *buf, uint64_t done_pos, uint64_t len);
     void handle_event(ring_data_t *data);
+    void erase_dirty_object(blockstore_dirty_db_t::iterator dirty_it);
 public:
     blockstore_init_journal(blockstore_impl_t* bs);
     int loop();

@@ -1,12 +1,12 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 (see README.md for details)
+// License: VNPL-1.1 (see README.md for details)
 
 #include "blockstore_impl.h"
 
 blockstore_journal_check_t::blockstore_journal_check_t(blockstore_impl_t *bs)
 {
     this->bs = bs;
-    sectors_required = 0;
+    sectors_to_write = 0;
     next_pos = bs->journal.next_free;
     next_sector = bs->journal.cur_sector;
     first_sector = -1;
@@ -20,23 +20,26 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
     int required = entries_required;
     while (1)
     {
-        int fits = bs->journal.no_same_sector_overwrites && bs->journal.sector_info[next_sector].written
+        int fits = bs->journal.no_same_sector_overwrites && next_pos == bs->journal.next_free && bs->journal.sector_info[next_sector].written
             ? 0
             : (bs->journal.block_size - next_in_pos) / size;
         if (fits > 0)
        {
+            if (fits > required)
+            {
+                fits = required;
+            }
             if (first_sector == -1)
             {
                 first_sector = next_sector;
             }
             required -= fits;
             next_in_pos += fits * size;
-            sectors_required++;
+            sectors_to_write++;
         }
         else if (bs->journal.sector_info[next_sector].dirty)
         {
-            // sectors_required is more like "sectors to write"
-            sectors_required++;
+            sectors_to_write++;
         }
         if (required <= 0)
         {
@@ -59,7 +62,7 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
             " is too small for a batch of "+std::to_string(entries_required)+" entries of "+std::to_string(size)+" bytes"
         );
     }
-    if (bs->journal.sector_info[next_sector].usage_count > 0 ||
+    if (bs->journal.sector_info[next_sector].flush_count > 0 ||
         bs->journal.sector_info[next_sector].dirty)
     {
         // No memory buffer available. Wait for it.
@@ -71,17 +74,18 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
                 dirty++;
                 used++;
             }
-            if (bs->journal.sector_info[i].usage_count > 0)
+            if (bs->journal.sector_info[i].flush_count > 0)
             {
                 used++;
             }
         }
         // In fact, it's even more rare than "ran out of journal space", so print a warning
         printf(
-            "Ran out of journal sector buffers: %d/%lu buffers used (%d dirty), next buffer (%ld) is %s and flushed %lu times\n",
+            "Ran out of journal sector buffers: %d/%lu buffers used (%d dirty), next buffer (%ld)"
+            " is %s and flushed %lu times. Consider increasing \'journal_sector_buffer_count\'\n",
             used, bs->journal.sector_count, dirty, next_sector,
             bs->journal.sector_info[next_sector].dirty ? "dirty" : "not dirty",
-            bs->journal.sector_info[next_sector].usage_count
+            bs->journal.sector_info[next_sector].flush_count
         );
         PRIV(op)->wait_for = WAIT_JOURNAL_BUFFER;
         return 0;
@@ -100,11 +104,8 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
     {
         // No space in the journal. Wait until used_start changes.
         printf(
-            "Ran out of journal space (free space: %lu bytes, sectors to write: %d)\n",
-            (bs->journal.next_free >= bs->journal.used_start
-                ? bs->journal.len-bs->journal.block_size - (bs->journal.next_free-bs->journal.used_start)
-                : bs->journal.used_start - bs->journal.next_free),
-            sectors_required
+            "Ran out of journal space (used_start=%08lx, next_free=%08lx, dirty_start=%08lx)\n",
+            bs->journal.used_start, bs->journal.next_free, bs->journal.dirty_start
         );
         PRIV(op)->wait_for = WAIT_JOURNAL;
         bs->flusher->request_trim();
@@ -116,22 +117,21 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
 
 journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type, uint32_t size)
 {
-    if (journal.block_size - journal.in_sector_pos < size ||
-        journal.no_same_sector_overwrites && journal.sector_info[journal.cur_sector].written)
+    if (!journal.entry_fits(size))
     {
         assert(!journal.sector_info[journal.cur_sector].dirty);
         // Move to the next journal sector
-        journal.sector_info[journal.cur_sector].written = false;
-        if (journal.sector_info[journal.cur_sector].usage_count > 0)
+        if (journal.sector_info[journal.cur_sector].flush_count > 0)
         {
             // Also select next sector buffer in memory
             journal.cur_sector = ((journal.cur_sector + 1) % journal.sector_count);
-            assert(!journal.sector_info[journal.cur_sector].usage_count);
+            assert(!journal.sector_info[journal.cur_sector].flush_count);
         }
         else
         {
             journal.dirty_start = journal.next_free;
         }
+        journal.sector_info[journal.cur_sector].written = false;
         journal.sector_info[journal.cur_sector].offset = journal.next_free;
         journal.in_sector_pos = 0;
         journal.next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
@@ -157,7 +157,7 @@ void prepare_journal_sector_write(journal_t & journal, int cur_sector, io_uring_
 {
     journal.sector_info[cur_sector].dirty = false;
     journal.sector_info[cur_sector].written = true;
-    journal.sector_info[cur_sector].usage_count++;
+    journal.sector_info[cur_sector].flush_count++;
     ring_data_t *data = ((ring_data_t*)sqe->user_data);
     data->iov = (struct iovec){
         (journal.inmemory
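
Editor's note: the rename to sectors_to_write reflects what the counter really is - the number of sector writes a batch will trigger: each sector written once when filled, plus a final partial one. A self-contained approximation of that arithmetic (simplified: no dirty-sector carry-over and no same-sector-overwrite rules):

    #include <cassert>
    #include <cstdio>

    // How many journal sector writes does a batch of `entries` records of
    // `size` bytes need, starting at offset `in_sector_pos` of a sector?
    static int sectors_to_write(int block_size, int in_sector_pos, int entries, int size)
    {
        int sectors = 0;
        while (entries > 0)
        {
            int fits = (block_size - in_sector_pos) / size;
            if (fits > entries)
                fits = entries;   // mirrors the "if (fits > required)" clamp above
            if (fits > 0)
            {
                entries -= fits;
                sectors++;        // this sector gets written once, when filled
            }
            in_sector_pos = 0;    // subsequent entries start at a fresh sector
        }
        return sectors;
    }

    int main()
    {
        // 100 stable entries of 32 bytes into 4 KB sectors, starting mid-sector:
        printf("%d\n", sectors_to_write(4096, 4000, 100, 32)); // 3 fit, then 97 more: 2 writes
        assert(sectors_to_write(4096, 0, 128, 32) == 1);       // exactly one full sector
        return 0;
    }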

@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 (see README.md for details)
+// License: VNPL-1.1 (see README.md for details)
 
 #pragma once
@@ -133,7 +133,7 @@ inline uint32_t je_crc32(journal_entry *je)
 struct journal_sector_info_t
 {
     uint64_t offset;
-    uint64_t usage_count;
+    uint64_t flush_count;
     bool written;
     bool dirty;
 };
@@ -143,6 +143,7 @@ struct journal_t
     int fd;
     uint64_t device_size;
     bool inmemory = false;
+    bool flush_journal = false;
     void *buffer = NULL;
 
     uint64_t block_size;
@@ -170,13 +171,18 @@ struct journal_t
     ~journal_t();
     bool trim();
     uint64_t get_trim_pos();
+    inline bool entry_fits(int size)
+    {
+        return !(block_size - in_sector_pos < size ||
+            no_same_sector_overwrites && sector_info[cur_sector].written);
+    }
 };
 
 struct blockstore_journal_check_t
 {
     blockstore_impl_t *bs;
     uint64_t next_pos, next_sector, next_in_pos;
-    int sectors_required, first_sector;
+    int sectors_to_write, first_sector;
     bool right_dir; // writing to the end or the beginning of the ring buffer
 
     blockstore_journal_check_t(blockstore_impl_t *bs);

@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 (see README.md for details)
+// License: VNPL-1.1 (see README.md for details)
 
 #include <sys/file.h>
 #include "blockstore_impl.h"
@@ -42,6 +42,11 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
     {
         disable_flock = true;
     }
+    if (config["flush_journal"] == "true" || config["flush_journal"] == "1" || config["flush_journal"] == "yes")
+    {
+        // Only flush journal and exit
+        journal.flush_journal = true;
+    }
     if (config["immediate_commit"] == "all")
     {
         immediate_commit = IMMEDIATE_ALL;
@@ -69,7 +74,11 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
     journal_block_size = strtoull(config["journal_block_size"].c_str(), NULL, 10);
     meta_block_size = strtoull(config["meta_block_size"].c_str(), NULL, 10);
     bitmap_granularity = strtoull(config["bitmap_granularity"].c_str(), NULL, 10);
-    flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
+    max_flusher_count = strtoull(config["max_flusher_count"].c_str(), NULL, 10);
+    if (!max_flusher_count)
+        max_flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
+    min_flusher_count = strtoull(config["min_flusher_count"].c_str(), NULL, 10);
+    max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
     // Validate
     if (!block_size)
     {
@@ -79,9 +88,17 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
     {
         throw std::runtime_error("Bad block size");
     }
-    if (!flusher_count)
+    if (!max_flusher_count)
     {
-        flusher_count = 32;
+        max_flusher_count = 256;
+    }
+    if (!min_flusher_count || journal.flush_journal)
+    {
+        min_flusher_count = 1;
+    }
+    if (!max_write_iodepth)
+    {
+        max_write_iodepth = 128;
     }
     if (!disk_alignment)
     {
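
Editor's note: the option renaming stays backwards compatible - max_flusher_count falls back to the legacy flusher_count key, and zero means "use the default". A minimal reproduction of that lookup order, with std::map standing in for blockstore_config_t:

    #include <stdio.h>
    #include <stdlib.h>
    #include <map>
    #include <string>

    int main()
    {
        std::map<std::string, std::string> config;
        config["flusher_count"] = "32"; // only the old key is set
        unsigned max_flusher_count = strtoull(config["max_flusher_count"].c_str(), NULL, 10);
        if (!max_flusher_count)
            max_flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10); // legacy fallback
        if (!max_flusher_count)
            max_flusher_count = 256; // built-in default
        printf("max_flusher_count = %u\n", max_flusher_count); // prints 32
        return 0;
    }

(operator[] on a missing key yields an empty string, which strtoull parses as 0 - exactly why 0 can serve as the "unset" sentinel.)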

@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 (see README.md for details)
+// License: VNPL-1.1 (see README.md for details)
 
 #include "blockstore_impl.h"
@@ -112,7 +112,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
         read_op->version = 0;
         read_op->retval = read_op->len;
         FINISH_OP(read_op);
-        return 1;
+        return 2;
     }
     uint64_t fulfilled = 0;
     PRIV(read_op)->pending_ops = 0;
@@ -191,8 +191,8 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
                 if (bmp_end > bmp_start)
                 {
                     // fill with zeroes
-                    fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
-                        bmp_end * bitmap_granularity, (BS_ST_DELETE | BS_ST_STABLE), 0, 0);
+                    assert(fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
+                        bmp_end * bitmap_granularity, (BS_ST_DELETE | BS_ST_STABLE), 0, 0));
                 }
                 bmp_start = bmp_end;
                 while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size)
@@ -218,7 +218,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
     else if (fulfilled < read_op->len)
     {
         // fill remaining parts with zeroes
-        fulfill_read(read_op, fulfilled, 0, block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0);
+        assert(fulfill_read(read_op, fulfilled, 0, block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0));
     }
     assert(fulfilled == read_op->len);
     read_op->version = result_version;
@@ -232,10 +232,10 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
         }
         read_op->retval = read_op->len;
         FINISH_OP(read_op);
-        return 1;
+        return 2;
     }
     read_op->retval = 0;
-    return 1;
+    return 2;
 }
 
 void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op)

@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 (see README.md for details)
+// License: VNPL-1.1 (see README.md for details)
 
 #include "blockstore_impl.h"
@@ -50,7 +50,7 @@ skip_ov:
     {
         op->retval = -EBUSY;
         FINISH_OP(op);
-        return 1;
+        return 2;
     }
     if (dirty_it == dirty_db.begin())
     {
@@ -66,7 +66,7 @@ skip_ov:
         // Already rolled back
         op->retval = 0;
         FINISH_OP(op);
-        return 1;
+        return 2;
     }
     // Check journal space
     blockstore_journal_check_t space_check(this);
@@ -75,44 +75,35 @@ skip_ov:
         return 0;
     }
     // There is sufficient space. Get SQEs
-    struct io_uring_sqe *sqe[space_check.sectors_required];
-    for (i = 0; i < space_check.sectors_required; i++)
+    struct io_uring_sqe *sqe[space_check.sectors_to_write];
+    for (i = 0; i < space_check.sectors_to_write; i++)
     {
         BS_SUBMIT_GET_SQE_DECL(sqe[i]);
     }
     // Prepare and submit journal entries
     auto cb = [this, op](ring_data_t *data) { handle_rollback_event(data, op); };
     int s = 0, cur_sector = -1;
-    if ((journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_rollback) &&
-        journal.sector_info[journal.cur_sector].dirty)
-    {
-        if (cur_sector == -1)
-            PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
-        cur_sector = journal.cur_sector;
-        prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
-    }
     for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
     {
+        if (!journal.entry_fits(sizeof(journal_entry_rollback)) &&
+            journal.sector_info[journal.cur_sector].dirty)
+        {
+            if (cur_sector == -1)
+                PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
+            prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], cb);
+            cur_sector = journal.cur_sector;
+        }
         journal_entry_rollback *je = (journal_entry_rollback*)
             prefill_single_journal_entry(journal, JE_ROLLBACK, sizeof(journal_entry_rollback));
+        journal.sector_info[journal.cur_sector].dirty = false;
         je->oid = v->oid;
         je->version = v->version;
         je->crc32 = je_crc32((journal_entry*)je);
         journal.crc32_last = je->crc32;
-        if (cur_sector != journal.cur_sector)
-        {
-            // Write previous sector. We should write the sector only after filling it,
-            // because otherwise we'll write a lot more sectors in the "no_same_sector_overwrite" mode
-            if (cur_sector != -1)
-                prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
-            else
-                PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
-            cur_sector = journal.cur_sector;
-        }
     }
-    if (cur_sector != -1)
-        prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
+    prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], cb);
+    assert(s == space_check.sectors_to_write);
+    if (cur_sector == -1)
+        PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
     PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
     PRIV(op)->pending_ops = s;
     PRIV(op)->op_state = 1;
@@ -135,11 +126,8 @@ resume_2:
 resume_3:
     if (!disable_journal_fsync)
     {
-        io_uring_sqe *sqe = get_sqe();
-        if (!sqe)
-        {
-            return 0;
-        }
+        io_uring_sqe *sqe;
+        BS_SUBMIT_GET_SQE_DECL(sqe);
         ring_data_t *data = ((ring_data_t*)sqe->user_data);
         my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
         data->iov = { 0 };
@@ -160,7 +148,7 @@ resume_5:
     // Acknowledge op
     op->retval = 0;
     FINISH_OP(op);
-    return 1;
+    return 2;
 }
 
 void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
@@ -175,10 +163,7 @@ void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
         auto rm_start = it;
         auto rm_end = it;
         it--;
-        while (it->first.oid == ov.oid &&
-            it->first.version > ov.version &&
-            !IS_IN_FLIGHT(it->second.state) &&
-            !IS_STABLE(it->second.state))
+        while (1)
         {
             if (it->first.oid != ov.oid)
                 break;
@@ -188,7 +173,7 @@ void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
                 max_unstable = it->first.version;
                 break;
             }
-            else if (IS_STABLE(it->second.state))
+            else if (IS_IN_FLIGHT(it->second.state) || IS_STABLE(it->second.state))
                 break;
             // Remove entry
             rm_start = it;
@@ -199,14 +184,14 @@ void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
         if (rm_start != rm_end)
         {
             erase_dirty(rm_start, rm_end, UINT64_MAX);
-        }
-        auto unstab_it = unstable_writes.find(ov.oid);
-        if (unstab_it != unstable_writes.end())
-        {
-            if (max_unstable == 0)
-                unstable_writes.erase(unstab_it);
-            else
-                unstab_it->second = max_unstable;
+            auto unstab_it = unstable_writes.find(ov.oid);
+            if (unstab_it != unstable_writes.end())
+            {
+                if (max_unstable == 0)
+                    unstable_writes.erase(unstab_it);
+                else
+                    unstab_it->second = max_unstable;
+            }
         }
     }
 }
@@ -225,10 +210,7 @@ void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t
     if (PRIV(op)->pending_ops == 0)
     {
         PRIV(op)->op_state++;
-        if (!continue_rollback(op))
-        {
-            submit_queue.push_front(op);
-        }
+        ringloop->wakeup();
     }
 }
@@ -243,6 +225,9 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
         if (IS_DELETE(dirty_it->second.state))
         {
             object_id oid = dirty_it->first.oid;
+#ifdef BLOCKSTORE_DEBUG
+            printf("Unblock writes-after-delete %lx:%lx v%lx\n", oid.inode, oid.stripe, dirty_it->first.version);
+#endif
             dirty_it = dirty_end;
             // Unblock operations blocked by delete flushing
             uint32_t next_state = BS_ST_IN_FLIGHT;
@@ -263,10 +248,12 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
     }
     while (1)
     {
-        if (IS_BIG_WRITE(dirty_it->second.state) && dirty_it->second.location != clean_loc)
+        if (IS_BIG_WRITE(dirty_it->second.state) && dirty_it->second.location != clean_loc &&
+            dirty_it->second.location != UINT64_MAX)
         {
 #ifdef BLOCKSTORE_DEBUG
-            printf("Free block %lu\n", dirty_it->second.location >> block_order);
+            printf("Free block %lu from %lx:%lx v%lu\n", dirty_it->second.location >> block_order,
+                dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
 #endif
             data_alloc->set(dirty_it->second.location >> block_order, false);
         }
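
Editor's note: rollback, stabilize and sync now share one journal-batching pattern - make space *before* prefilling an entry, write a sector only once it is full, flush the final partial sector after the loop, and assert that exactly the precomputed number of sector writes was issued. The control flow reduced to counters (no io_uring, purely illustrative):

    #include <cassert>
    #include <cstdio>

    int main()
    {
        const int block_size = 4096, entry_size = 32, entries = 200;
        // Precompute the expected sector-write count (empty journal assumed)
        int expected = (entries * entry_size + block_size - 1) / block_size;
        int in_sector_pos = 0, s = 0;
        for (int i = 0; i < entries; i++)
        {
            if (block_size - in_sector_pos < entry_size)
            {
                s++;                 // sector full - write it before the next entry
                in_sector_pos = 0;
            }
            in_sector_pos += entry_size; // "prefill" the journal entry
        }
        s++;                             // final, possibly partial sector write
        assert(s == expected);           // mirrors assert(s == space_check.sectors_to_write)
        printf("%d entries -> %d sector writes\n", entries, s);
        return 0;
    }

Writing a sector only after it is filled is what keeps the write count low under no_same_sector_overwrites, as the removed comment in the old code explained.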

@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 (see README.md for details)
+// License: VNPL-1.1 (see README.md for details)
 
 #include "blockstore_impl.h"
@@ -60,7 +60,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
             // No such object version
             op->retval = -ENOENT;
             FINISH_OP(op);
-            return 1;
+            return 2;
         }
         else
         {
@@ -77,7 +77,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
             // Object not synced yet. Caller must sync it first
             op->retval = -EBUSY;
             FINISH_OP(op);
-            return 1;
+            return 2;
         }
         else if (!IS_STABLE(dirty_it->second.state))
         {
@@ -89,7 +89,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
         // Already stable
         op->retval = 0;
         FINISH_OP(op);
-        return 1;
+        return 2;
     }
     // Check journal space
     blockstore_journal_check_t space_check(this);
@@ -98,45 +98,36 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
        return 0;
     }
     // There is sufficient space. Get SQEs
-    struct io_uring_sqe *sqe[space_check.sectors_required];
-    for (i = 0; i < space_check.sectors_required; i++)
+    struct io_uring_sqe *sqe[space_check.sectors_to_write];
+    for (i = 0; i < space_check.sectors_to_write; i++)
     {
         BS_SUBMIT_GET_SQE_DECL(sqe[i]);
     }
     // Prepare and submit journal entries
     auto cb = [this, op](ring_data_t *data) { handle_stable_event(data, op); };
     int s = 0, cur_sector = -1;
-    if ((journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_stable) &&
-        journal.sector_info[journal.cur_sector].dirty)
-    {
-        if (cur_sector == -1)
-            PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
-        cur_sector = journal.cur_sector;
-        prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
-    }
     for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
     {
         // FIXME: Only stabilize versions that aren't stable yet
+        if (!journal.entry_fits(sizeof(journal_entry_stable)) &&
+            journal.sector_info[journal.cur_sector].dirty)
+        {
+            if (cur_sector == -1)
+                PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
+            prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], cb);
+            cur_sector = journal.cur_sector;
+        }
         journal_entry_stable *je = (journal_entry_stable*)
             prefill_single_journal_entry(journal, JE_STABLE, sizeof(journal_entry_stable));
+        journal.sector_info[journal.cur_sector].dirty = false;
         je->oid = v->oid;
         je->version = v->version;
         je->crc32 = je_crc32((journal_entry*)je);
         journal.crc32_last = je->crc32;
-        if (cur_sector != journal.cur_sector)
-        {
-            // Write previous sector. We should write the sector only after filling it,
-            // because otherwise we'll write a lot more sectors in the "no_same_sector_overwrite" mode
-            if (cur_sector != -1)
-                prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
-            else
-                PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
-            cur_sector = journal.cur_sector;
-        }
     }
-    if (cur_sector != -1)
-        prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
+    prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], cb);
+    assert(s == space_check.sectors_to_write);
+    if (cur_sector == -1)
+        PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
     PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
     PRIV(op)->pending_ops = s;
     PRIV(op)->op_state = 1;
@@ -159,11 +150,8 @@ resume_2:
 resume_3:
     if (!disable_journal_fsync)
     {
-        io_uring_sqe *sqe = get_sqe();
-        if (!sqe)
-        {
-            return 0;
-        }
+        io_uring_sqe *sqe;
+        BS_SUBMIT_GET_SQE_DECL(sqe);
         ring_data_t *data = ((ring_data_t*)sqe->user_data);
         my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
         data->iov = { 0 };
@@ -180,30 +168,50 @@ resume_5:
     for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
     {
         // Mark all dirty_db entries up to op->version as stable
+#ifdef BLOCKSTORE_DEBUG
+        printf("Stabilize %lx:%lx v%lu\n", v->oid.inode, v->oid.stripe, v->version);
+#endif
         mark_stable(*v);
     }
     // Acknowledge op
     op->retval = 0;
     FINISH_OP(op);
-    return 1;
+    return 2;
 }
 
-void blockstore_impl_t::mark_stable(const obj_ver_id & v)
+void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty)
 {
     auto dirty_it = dirty_db.find(v);
     if (dirty_it != dirty_db.end())
     {
         while (1)
         {
+            bool was_stable = IS_STABLE(dirty_it->second.state);
             if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_SYNCED)
             {
                 dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_STABLE;
             }
-            else if (IS_STABLE(dirty_it->second.state))
+            if (forget_dirty && (IS_BIG_WRITE(dirty_it->second.state) ||
+                IS_DELETE(dirty_it->second.state)))
             {
+                // Big write overrides all previous dirty entries
+                auto erase_end = dirty_it;
+                while (dirty_it != dirty_db.begin())
+                {
+                    dirty_it--;
+                    if (dirty_it->first.oid != v.oid)
+                    {
+                        dirty_it++;
+                        break;
+                    }
+                }
+                auto clean_it = clean_db.find(v.oid);
+                uint64_t clean_loc = clean_it != clean_db.end()
+                    ? clean_it->second.location : UINT64_MAX;
+                erase_dirty(dirty_it, erase_end, clean_loc);
                 break;
             }
-            if (dirty_it == dirty_db.begin())
+            if (was_stable || dirty_it == dirty_db.begin())
            {
                 break;
             }
@@ -213,9 +221,6 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v)
                 break;
             }
         }
-#ifdef BLOCKSTORE_DEBUG
-        printf("enqueue_flush %lx:%lx v%lu\n", v.oid.inode, v.oid.stripe, v.version);
-#endif
         flusher->enqueue_flush(v);
     }
     auto unstab_it = unstable_writes.find(v.oid);
@@ -240,9 +245,6 @@ void blockstore_impl_t::handle_stable_event(ring_data_t *data, blockstore_op_t *
     if (PRIV(op)->pending_ops == 0)
     {
         PRIV(op)->op_state++;
-        if (!continue_stable(op))
-        {
-            submit_queue.push_front(op);
-        }
+        ringloop->wakeup();
     }
 }
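
Editor's note: with forget_dirty, a stable big write or delete wipes every older dirty entry of the same object in one range erase, keeping only the overriding entry itself. The underlying std::map manipulation in isolation (keys simplified to an (object, version) pair; not the real dirty_db types):

    #include <cstdio>
    #include <iterator>
    #include <map>
    #include <utility>

    int main()
    {
        // (object id, version) -> state, ordered exactly like dirty_db
        std::map<std::pair<int, int>, const char*> dirty;
        dirty[{1, 1}] = "small_write";
        dirty[{1, 2}] = "small_write";
        dirty[{1, 3}] = "big_write";   // became stable: overrides older versions
        dirty[{2, 1}] = "small_write"; // another object, must be left alone
        auto erase_end = dirty.find({1, 3}); // keep the stable big write itself
        auto erase_begin = erase_end;
        while (erase_begin != dirty.begin())
        {
            auto prev = std::prev(erase_begin);
            if (prev->first.first != 1) // walked past object 1
                break;
            erase_begin = prev;
        }
        dirty.erase(erase_begin, erase_end); // drops (1,1) and (1,2) only
        for (auto & kv: dirty)
            printf("%d v%d: %s\n", kv.first.first, kv.first.second, kv.second);
        return 0;
    }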

@@ -1,5 +1,5 @@
// Copyright (c) Vitaliy Filippov, 2019+ // Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.0 (see README.md for details) // License: VNPL-1.1 (see README.md for details)
#include "blockstore_impl.h" #include "blockstore_impl.h"
@@ -12,11 +12,19 @@
#define SYNC_JOURNAL_SYNC_SENT 7 #define SYNC_JOURNAL_SYNC_SENT 7
#define SYNC_DONE 8 #define SYNC_DONE 8
int blockstore_impl_t::dequeue_sync(blockstore_op_t *op) int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_progress_sync)
{ {
if (immediate_commit == IMMEDIATE_ALL)
{
// We can return immediately because sync is only dequeued after all previous writes
op->retval = 0;
FINISH_OP(op);
return 2;
}
if (PRIV(op)->op_state == 0) if (PRIV(op)->op_state == 0)
{ {
stop_sync_submitted = false; stop_sync_submitted = false;
unsynced_big_write_count -= unsynced_big_writes.size();
PRIV(op)->sync_big_writes.swap(unsynced_big_writes); PRIV(op)->sync_big_writes.swap(unsynced_big_writes);
PRIV(op)->sync_small_writes.swap(unsynced_small_writes); PRIV(op)->sync_small_writes.swap(unsynced_small_writes);
PRIV(op)->sync_small_checked = 0; PRIV(op)->sync_small_checked = 0;
@@ -29,34 +37,15 @@ int blockstore_impl_t::dequeue_sync(blockstore_op_t *op)
PRIV(op)->op_state = SYNC_HAS_SMALL; PRIV(op)->op_state = SYNC_HAS_SMALL;
else else
PRIV(op)->op_state = SYNC_DONE; PRIV(op)->op_state = SYNC_DONE;
// Always add sync to in_progress_syncs because we clear unsynced_big_writes and unsynced_small_writes
PRIV(op)->prev_sync_count = in_progress_syncs.size();
PRIV(op)->in_progress_ptr = in_progress_syncs.insert(in_progress_syncs.end(), op);
} }
continue_sync(op);
// Always dequeue because we always add syncs to in_progress_syncs
return 1;
}
int blockstore_impl_t::continue_sync(blockstore_op_t *op)
{
auto cb = [this, op](ring_data_t *data) { handle_sync_event(data, op); };
if (PRIV(op)->op_state == SYNC_HAS_SMALL) if (PRIV(op)->op_state == SYNC_HAS_SMALL)
{ {
// No big writes, just fsync the journal // No big writes, just fsync the journal
for (; PRIV(op)->sync_small_checked < PRIV(op)->sync_small_writes.size(); PRIV(op)->sync_small_checked++)
{
if (IS_IN_FLIGHT(dirty_db[PRIV(op)->sync_small_writes[PRIV(op)->sync_small_checked]].state))
{
// Wait for small inflight writes to complete
return 0;
}
}
if (journal.sector_info[journal.cur_sector].dirty) if (journal.sector_info[journal.cur_sector].dirty)
{ {
// Write out the last journal sector if it happens to be dirty // Write out the last journal sector if it happens to be dirty
BS_SUBMIT_GET_ONLY_SQE(sqe); BS_SUBMIT_GET_ONLY_SQE(sqe);
prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb); prepare_journal_sector_write(journal, journal.cur_sector, sqe, [this, op](ring_data_t *data) { handle_sync_event(data, op); });
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector; PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops = 1; PRIV(op)->pending_ops = 1;
PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT; PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT;
@@ -69,21 +58,13 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
} }
if (PRIV(op)->op_state == SYNC_HAS_BIG) if (PRIV(op)->op_state == SYNC_HAS_BIG)
{ {
for (; PRIV(op)->sync_big_checked < PRIV(op)->sync_big_writes.size(); PRIV(op)->sync_big_checked++)
{
if (IS_IN_FLIGHT(dirty_db[PRIV(op)->sync_big_writes[PRIV(op)->sync_big_checked]].state))
{
// Wait for big inflight writes to complete
return 0;
}
}
// 1st step: fsync data // 1st step: fsync data
if (!disable_data_fsync) if (!disable_data_fsync)
{ {
BS_SUBMIT_GET_SQE(sqe, data); BS_SUBMIT_GET_SQE(sqe, data);
my_uring_prep_fsync(sqe, data_fd, IORING_FSYNC_DATASYNC); my_uring_prep_fsync(sqe, data_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 }; data->iov = { 0 };
data->callback = cb; data->callback = [this, op](ring_data_t *data) { handle_sync_event(data, op); };
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0; PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
PRIV(op)->pending_ops = 1; PRIV(op)->pending_ops = 1;
PRIV(op)->op_state = SYNC_DATA_SYNC_SENT; PRIV(op)->op_state = SYNC_DATA_SYNC_SENT;
@@ -96,14 +77,6 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
} }
if (PRIV(op)->op_state == SYNC_DATA_SYNC_DONE) if (PRIV(op)->op_state == SYNC_DATA_SYNC_DONE)
{ {
for (; PRIV(op)->sync_small_checked < PRIV(op)->sync_small_writes.size(); PRIV(op)->sync_small_checked++)
{
if (IS_IN_FLIGHT(dirty_db[PRIV(op)->sync_small_writes[PRIV(op)->sync_small_checked]].state))
{
// Wait for small inflight writes to complete
return 0;
}
}
 // 2nd step: Data device is synced, prepare & write journal entries
 // Check space in the journal and journal memory buffers
 blockstore_journal_check_t space_check(this);
@@ -112,30 +85,29 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
     return 0;
 }
 // Get SQEs. Don't bother about merging, submit each journal sector as a separate request
-struct io_uring_sqe *sqe[space_check.sectors_required];
-for (int i = 0; i < space_check.sectors_required; i++)
+struct io_uring_sqe *sqe[space_check.sectors_to_write];
+for (int i = 0; i < space_check.sectors_to_write; i++)
 {
     BS_SUBMIT_GET_SQE_DECL(sqe[i]);
 }
 // Prepare and submit journal entries
 auto it = PRIV(op)->sync_big_writes.begin();
 int s = 0, cur_sector = -1;
-if ((journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_big_write) &&
-    journal.sector_info[journal.cur_sector].dirty)
-{
-    if (cur_sector == -1)
-        PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
-    cur_sector = journal.cur_sector;
-    prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
-}
 while (it != PRIV(op)->sync_big_writes.end())
 {
+    if (!journal.entry_fits(sizeof(journal_entry_big_write)) &&
+        journal.sector_info[journal.cur_sector].dirty)
+    {
+        if (cur_sector == -1)
+            PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
+        prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], [this, op](ring_data_t *data) { handle_sync_event(data, op); });
+        cur_sector = journal.cur_sector;
+    }
     journal_entry_big_write *je = (journal_entry_big_write*)prefill_single_journal_entry(
         journal, (dirty_db[*it].state & BS_ST_INSTANT) ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
         sizeof(journal_entry_big_write)
     );
     dirty_db[*it].journal_sector = journal.sector_info[journal.cur_sector].offset;
-    journal.sector_info[journal.cur_sector].dirty = false;
     journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
 #ifdef BLOCKSTORE_DEBUG
     printf(
@@ -152,19 +124,11 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
     je->crc32 = je_crc32((journal_entry*)je);
     journal.crc32_last = je->crc32;
     it++;
-    if (cur_sector != journal.cur_sector)
-    {
-        // Write previous sector. We should write the sector only after filling it,
-        // because otherwise we'll write a lot more sectors in the "no_same_sector_overwrite" mode
-        if (cur_sector != -1)
-            prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
-        else
-            PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
-        cur_sector = journal.cur_sector;
-    }
 }
-if (cur_sector != -1)
-    prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
-if (cur_sector == -1)
-    PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
+prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], [this, op](ring_data_t *data) { handle_sync_event(data, op); });
+assert(s == space_check.sectors_to_write);
 PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
 PRIV(op)->pending_ops = s;
 PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT;
@@ -177,7 +141,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
 BS_SUBMIT_GET_SQE(sqe, data);
 my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
 data->iov = { 0 };
-data->callback = cb;
+data->callback = [this, op](ring_data_t *data) { handle_sync_event(data, op); };
 PRIV(op)->pending_ops = 1;
 PRIV(op)->op_state = SYNC_JOURNAL_SYNC_SENT;
 return 1;
@@ -187,9 +151,10 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
     PRIV(op)->op_state = SYNC_DONE;
 }
 }
-if (PRIV(op)->op_state == SYNC_DONE)
+if (PRIV(op)->op_state == SYNC_DONE && !queue_has_in_progress_sync)
 {
-    return ack_sync(op);
+    ack_sync(op);
+    return 2;
 }
 return 1;
 }
@@ -221,42 +186,16 @@ void blockstore_impl_t::handle_sync_event(ring_data_t *data, blockstore_op_t *op)
 else if (PRIV(op)->op_state == SYNC_JOURNAL_SYNC_SENT)
 {
     PRIV(op)->op_state = SYNC_DONE;
-    ack_sync(op);
 }
 else
 {
     throw std::runtime_error("BUG: unexpected sync op state");
 }
+ringloop->wakeup();
 }
 }
-int blockstore_impl_t::ack_sync(blockstore_op_t *op)
-{
-    if (PRIV(op)->op_state == SYNC_DONE && PRIV(op)->prev_sync_count == 0)
-    {
-        // Remove dependency of subsequent syncs
-        auto it = PRIV(op)->in_progress_ptr;
-        int done_syncs = 1;
-        ++it;
-        // Acknowledge sync
-        ack_one_sync(op);
-        while (it != in_progress_syncs.end())
-        {
-            auto & next_sync = *it++;
-            PRIV(next_sync)->prev_sync_count -= done_syncs;
-            if (PRIV(next_sync)->prev_sync_count == 0 && PRIV(next_sync)->op_state == SYNC_DONE)
-            {
-                done_syncs++;
-                // Acknowledge next_sync
-                ack_one_sync(next_sync);
-            }
-        }
-        return 2;
-    }
-    return 0;
-}
-void blockstore_impl_t::ack_one_sync(blockstore_op_t *op)
+void blockstore_impl_t::ack_sync(blockstore_op_t *op)
 {
 // Handle states
 for (auto it = PRIV(op)->sync_big_writes.begin(); it != PRIV(op)->sync_big_writes.end(); it++)
@@ -304,7 +243,6 @@ void blockstore_impl_t::ack_one_sync(blockstore_op_t *op)
 }
 }
 }
-in_progress_syncs.erase(PRIV(op)->in_progress_ptr);
 op->retval = 0;
 FINISH_OP(op);
 }
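The reworked sync path above drops the old in_progress_syncs bookkeeping in favour of a plain state-machine contract: a continuation returns 0 when the operation cannot proceed yet (no SQEs or journal space) and stays queued, 1 when it is submitted and waiting for I/O, and 2 when it has been finished and acknowledged. A minimal self-contained sketch of a queue loop consuming this convention (names are illustrative, not the actual submit_queue code):

    #include <list>

    struct op_t { int state = 0; };   // illustrative stand-in for blockstore_op_t

    // Stand-in for continue_sync()/continue_write():
    // 0 = can't proceed yet, 1 = in progress, 2 = done and already acknowledged
    static int continue_op(op_t *op) { return op->state == 2 ? 2 : 1; }

    static void run_queue(std::list<op_t*> & queue)
    {
        for (auto it = queue.begin(); it != queue.end(); )
        {
            if (continue_op(*it) == 2)
                it = queue.erase(it);   // finished: drop from the queue
            else
                it++;                   // 0 or 1: keep it queued / in flight
        }
    }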
@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 (see README.md for details)
+// License: VNPL-1.1 (see README.md for details)
 #include "blockstore_impl.h"
@@ -57,13 +57,16 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
 {
     // It's allowed to write versions with low numbers over deletes
     // However, we have to flush those deletes first as we use version number for ordering
+#ifdef BLOCKSTORE_DEBUG
+    printf("Write %lx:%lx v%lu over delete (real v%lu) offset=%u len=%u\n", op->oid.inode, op->oid.stripe, version, op->version, op->offset, op->len);
+#endif
     wait_del = true;
     PRIV(op)->real_version = op->version;
     op->version = version;
     flusher->unshift_flush((obj_ver_id){
         .oid = op->oid,
         .version = version-1,
-    });
+    }, true);
 }
 else
 {
@@ -87,7 +90,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
 #ifdef BLOCKSTORE_DEBUG
     if (is_del)
         printf("Delete %lx:%lx v%lu\n", op->oid.inode, op->oid.stripe, op->version);
-    else
+    else if (!wait_del)
         printf("Write %lx:%lx v%lu offset=%u len=%u\n", op->oid.inode, op->oid.stripe, op->version, op->offset, op->len);
 #endif
 // FIXME No strict need to add it into dirty_db here, it's just left
@@ -121,6 +124,29 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
 return true;
 }
+void blockstore_impl_t::cancel_all_writes(blockstore_op_t *op, blockstore_dirty_db_t::iterator dirty_it, int retval)
+{
+    while (dirty_it != dirty_db.end() && dirty_it->first.oid == op->oid)
+    {
+        dirty_db.erase(dirty_it++);
+    }
+    bool found = false;
+    for (auto other_op: submit_queue)
+    {
+        if (!found && other_op == op)
+            found = true;
+        else if (found && other_op->oid == op->oid &&
+            (other_op->opcode == BS_OP_WRITE || other_op->opcode == BS_OP_WRITE_STABLE))
+        {
+            // Mark operations to cancel them
+            PRIV(other_op)->real_version = UINT64_MAX;
+            other_op->retval = retval;
+        }
+    }
+    op->retval = retval;
+    FINISH_OP(op);
+}
 // First step of the write algorithm: dequeue operation and submit initial write(s)
 int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
 {
@@ -140,17 +166,24 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
 }
 if (PRIV(op)->real_version != 0)
 {
+    if (PRIV(op)->real_version == UINT64_MAX)
+    {
+        // This is the flag value used to cancel operations
+        FINISH_OP(op);
+        return 2;
+    }
     // Restore original low version number for unblocked operations
+#ifdef BLOCKSTORE_DEBUG
+    printf("Restoring %lx:%lx version: v%lu -> v%lu\n", op->oid.inode, op->oid.stripe, op->version, PRIV(op)->real_version);
+#endif
     auto prev_it = dirty_it;
     prev_it--;
     if (prev_it->first.oid == op->oid && prev_it->first.version >= PRIV(op)->real_version)
     {
         // Original version is still invalid
-        // FIXME Oops. Successive small writes will currently break in an unexpected way. Fix it
-        dirty_db.erase(dirty_it);
-        op->retval = -EEXIST;
-        FINISH_OP(op);
-        return 1;
+        // All subsequent writes to the same object must be canceled too
+        cancel_all_writes(op, dirty_it, -EEXIST);
+        return 2;
     }
     op->version = PRIV(op)->real_version;
     PRIV(op)->real_version = 0;
@@ -161,10 +194,14 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
     .version = op->version,
 }, e).first;
 }
+if (write_iodepth >= max_write_iodepth)
+{
+    return 0;
+}
 if ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE)
 {
     blockstore_journal_check_t space_check(this);
-    if (!space_check.check_available(op, unsynced_big_writes.size() + 1, sizeof(journal_entry_big_write), JOURNAL_STABILIZE_RESERVATION))
+    if (!space_check.check_available(op, unsynced_big_write_count + 1, sizeof(journal_entry_big_write), JOURNAL_STABILIZE_RESERVATION))
     {
         return 0;
     }
@@ -179,17 +216,18 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
         PRIV(op)->wait_for = WAIT_FREE;
         return 0;
     }
-    // FIXME Oops. Successive small writes will currently break in an unexpected way. Fix it
-    dirty_db.erase(dirty_it);
-    op->retval = -ENOSPC;
-    FINISH_OP(op);
-    return 1;
+    cancel_all_writes(op, dirty_it, -ENOSPC);
+    return 2;
 }
+write_iodepth++;
 BS_SUBMIT_GET_SQE(sqe, data);
 dirty_it->second.location = loc << block_order;
 dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SUBMITTED;
 #ifdef BLOCKSTORE_DEBUG
-printf("Allocate block %lu\n", loc);
+printf(
    "Allocate block %lu for %lx:%lx v%lu\n",
    loc, op->oid.inode, op->oid.stripe, op->version
+);
 #endif
 data_alloc->set(loc, true);
 uint64_t stripe_offset = (op->offset % bitmap_granularity);
@@ -215,11 +253,8 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
 PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
 if (immediate_commit != IMMEDIATE_ALL)
 {
-    // Remember big write as unsynced
-    unsynced_big_writes.push_back((obj_ver_id){
-        .oid = op->oid,
-        .version = op->version,
-    });
+    // Increase the counter, but don't save into unsynced_writes yet (can't sync until the write is finished)
+    unsynced_big_write_count++;
     PRIV(op)->op_state = 3;
 }
 else
@@ -232,11 +267,12 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
 // Small (journaled) write
 // First check if the journal has sufficient space
 blockstore_journal_check_t space_check(this);
-if (unsynced_big_writes.size() && !space_check.check_available(op, unsynced_big_writes.size(), sizeof(journal_entry_big_write), 0)
+if (unsynced_big_write_count && !space_check.check_available(op, unsynced_big_write_count, sizeof(journal_entry_big_write), 0)
     || !space_check.check_available(op, 1, sizeof(journal_entry_small_write), op->len + JOURNAL_STABILIZE_RESERVATION))
 {
     return 0;
 }
+write_iodepth++;
 // There is sufficient space. Get SQE(s)
 struct io_uring_sqe *sqe1 = NULL;
 if (immediate_commit != IMMEDIATE_NONE ||
@@ -323,18 +359,10 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
 {
     journal.next_free = journal_block_size;
 }
-if (immediate_commit == IMMEDIATE_NONE)
-{
-    // Remember small write as unsynced
-    unsynced_small_writes.push_back((obj_ver_id){
-        .oid = op->oid,
-        .version = op->version,
-    });
-}
 if (!PRIV(op)->pending_ops)
 {
     PRIV(op)->op_state = 4;
-    continue_write(op);
+    return continue_write(op);
 }
 else
 {
@@ -348,30 +376,29 @@ int blockstore_impl_t::continue_write(blockstore_op_t *op)
 {
 io_uring_sqe *sqe = NULL;
 journal_entry_big_write *je;
+int op_state = PRIV(op)->op_state;
+if (op_state != 2 && op_state != 4)
+{
+    // In progress
+    return 1;
+}
 auto dirty_it = dirty_db.find((obj_ver_id){
     .oid = op->oid,
     .version = op->version,
 });
 assert(dirty_it != dirty_db.end());
-if (PRIV(op)->op_state == 2)
+if (op_state == 2)
     goto resume_2;
-else if (PRIV(op)->op_state == 4)
+else if (op_state == 4)
     goto resume_4;
-else
-    return 1;
 resume_2:
 // Only for the immediate_commit mode: prepare and submit big_write journal entry
-sqe = get_sqe();
-if (!sqe)
-{
-    return 0;
-}
+BS_SUBMIT_GET_SQE_DECL(sqe);
 je = (journal_entry_big_write*)prefill_single_journal_entry(
     journal, op->opcode == BS_OP_WRITE_STABLE ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
     sizeof(journal_entry_big_write)
 );
 dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
-journal.sector_info[journal.cur_sector].dirty = false;
 journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
 #ifdef BLOCKSTORE_DEBUG
 printf(
@@ -396,7 +423,7 @@ resume_2:
 resume_4:
 // Switch object state
 #ifdef BLOCKSTORE_DEBUG
-printf("Ack write %lx:%lx v%lu = %d\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
+printf("Ack write %lx:%lx v%lu = state 0x%x\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
 #endif
 bool imm = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE
     ? (immediate_commit == IMMEDIATE_ALL)
@@ -410,11 +437,31 @@ resume_4:
     | (imm ? BS_ST_SYNCED : BS_ST_WRITTEN);
 if (imm && ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT)))
 {
-    // Deletions are treated as immediately stable
+    // Deletions and 'instant' operations are treated as immediately stable
     mark_stable(dirty_it->first);
 }
-if (immediate_commit == IMMEDIATE_ALL)
+if (!imm)
 {
+    if ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE)
+    {
+        // Remember big write as unsynced
+        unsynced_big_writes.push_back((obj_ver_id){
+            .oid = op->oid,
+            .version = op->version,
+        });
+    }
+    else
+    {
+        // Remember small write as unsynced
+        unsynced_small_writes.push_back((obj_ver_id){
+            .oid = op->oid,
+            .version = op->version,
+        });
+    }
+}
+if (imm && (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE)
+{
+    // Unblock small writes
     dirty_it++;
     while (dirty_it != dirty_db.end() && dirty_it->first.oid == op->oid)
     {
@@ -427,8 +474,9 @@ resume_4:
 }
 // Acknowledge write
 op->retval = op->len;
+write_iodepth--;
 FINISH_OP(op);
-return 1;
+return 2;
 }
 void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *op)
@@ -447,10 +495,7 @@ void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *op)
 {
     release_journal_sectors(op);
     PRIV(op)->op_state++;
-    if (!continue_write(op))
-    {
-        submit_queue.push_front(op);
-    }
+    ringloop->wakeup();
 }
 }
@@ -463,8 +508,8 @@ void blockstore_impl_t::release_journal_sectors(blockstore_op_t *op)
 uint64_t s = PRIV(op)->min_flushed_journal_sector;
 while (1)
 {
-    journal.sector_info[s-1].usage_count--;
-    if (s != (1+journal.cur_sector) && journal.sector_info[s-1].usage_count == 0)
+    journal.sector_info[s-1].flush_count--;
+    if (s != (1+journal.cur_sector) && journal.sector_info[s-1].flush_count == 0)
    {
        // We know for sure that we won't write into this sector anymore
        uint64_t new_ds = journal.sector_info[s-1].offset + journal.block_size;
@@ -488,6 +533,10 @@ void blockstore_impl_t::release_journal_sectors(blockstore_op_t *op)
 int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
 {
+if (PRIV(op)->op_state)
+{
+    return continue_write(op);
+}
 auto dirty_it = dirty_db.find((obj_ver_id){
     .oid = op->oid,
     .version = op->version,
@@ -498,6 +547,7 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
 {
     return 0;
 }
+write_iodepth++;
 io_uring_sqe *sqe = NULL;
 if (immediate_commit != IMMEDIATE_NONE ||
     (journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del) &&
@@ -545,18 +595,10 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
     PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
     PRIV(op)->pending_ops++;
 }
-else
-{
-    // Remember delete as unsynced
-    unsynced_small_writes.push_back((obj_ver_id){
-        .oid = op->oid,
-        .version = op->version,
-    });
-}
 if (!PRIV(op)->pending_ops)
 {
     PRIV(op)->op_state = 4;
-    continue_write(op);
+    return continue_write(op);
 }
 else
 {
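One consequence of the cancellation scheme above: when a blocked write turns out to be unrecoverable (-EEXIST version conflict or -ENOSPC), cancel_all_writes() erases the object's dirty_db entries and tags every later queued write to the same object with the flag value real_version = UINT64_MAX, so dequeue_write() completes them immediately with the stored error instead of submitting them. A compact sketch of this marker pattern with hypothetical names:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    struct mini_op { uint64_t real_version = 0; int retval = 0; bool done = false; };

    // Tag all queued ops after 'from' as canceled (cf. cancel_all_writes above)
    static void cancel_rest(std::vector<mini_op*> & queue, size_t from, int retval)
    {
        for (size_t i = from; i < queue.size(); i++)
        {
            queue[i]->real_version = UINT64_MAX;   // flag value, never a real version
            queue[i]->retval = retval;             // e.g. -EEXIST or -ENOSPC
        }
    }

    // Later, the dequeue step finishes tagged ops without touching the disk
    static bool dequeue(mini_op *op)
    {
        if (op->real_version == UINT64_MAX)
        {
            op->done = true;                       // finish with the pre-set error
            return true;
        }
        return false;                              // normal submission path
    }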

856
src/cluster_client.cpp Normal file
View File

@@ -0,0 +1,856 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
#include <stdexcept>
#include <assert.h>
#include "cluster_client.h"
#define PART_SENT 1
#define PART_DONE 2
#define PART_ERROR 4
#define CACHE_DIRTY 1
#define CACHE_FLUSHING 2
#define CACHE_REPEATING 3
#define OP_FLUSH_BUFFER 2
cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
{
this->ringloop = ringloop;
this->tfd = tfd;
this->config = config;
msgr.osd_num = 0;
msgr.tfd = tfd;
msgr.ringloop = ringloop;
msgr.repeer_pgs = [this](osd_num_t peer_osd)
{
if (msgr.osd_peer_fds.find(peer_osd) != msgr.osd_peer_fds.end())
{
// peer_osd just connected
continue_ops();
}
else if (dirty_buffers.size())
{
// peer_osd just dropped connection
// determine WHICH dirty_buffers are now obsolete and repeat them
for (auto & wr: dirty_buffers)
{
if (affects_osd(wr.first.inode, wr.first.stripe, wr.second.len, peer_osd) &&
wr.second.state != CACHE_REPEATING)
{
// FIXME: Flush in larger parts
flush_buffer(wr.first, &wr.second);
}
}
continue_ops();
}
};
msgr.exec_op = [this](osd_op_t *op)
{
// Garbage in
printf("Incoming garbage from peer %d\n", op->peer_fd);
msgr.stop_client(op->peer_fd);
delete op;
};
msgr.init();
st_cli.tfd = tfd;
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
st_cli.on_change_osd_state_hook = [this](uint64_t peer_osd) { on_change_osd_state_hook(peer_osd); };
st_cli.on_change_hook = [this](json11::Json::object & changes) { on_change_hook(changes); };
st_cli.on_load_pgs_hook = [this](bool success) { on_load_pgs_hook(success); };
st_cli.parse_config(config);
st_cli.load_global_config();
if (ringloop)
{
consumer.loop = [this]()
{
msgr.read_requests();
msgr.send_replies();
this->ringloop->submit();
};
ringloop->register_consumer(&consumer);
}
}
cluster_client_t::~cluster_client_t()
{
for (auto bp: dirty_buffers)
{
free(bp.second.buf);
}
dirty_buffers.clear();
if (ringloop)
{
ringloop->unregister_consumer(&consumer);
}
}
void cluster_client_t::continue_ops(bool up_retry)
{
if (!pgs_loaded)
{
// We're offline
return;
}
if (continuing_ops)
{
// Attempt to reenter the function
continuing_ops = 2;
return;
}
restart:
continuing_ops = 1;
op_queue_pos = 0;
bool has_flushes = false, has_writes = false;
while (op_queue_pos < op_queue.size())
{
auto op = op_queue[op_queue_pos];
bool rm = false, is_flush = op->flags & OP_FLUSH_BUFFER;
auto opcode = op->opcode;
if (!op->up_wait || up_retry)
{
op->up_wait = false;
if (opcode == OSD_OP_READ || opcode == OSD_OP_WRITE)
{
if (is_flush || !has_flushes)
{
// Regular writes can't proceed before buffer flushes
rm = continue_rw(op);
}
}
else if (opcode == OSD_OP_SYNC)
{
if (!has_writes)
{
// SYNC can't proceed before previous writes
rm = continue_sync(op);
}
}
}
if (opcode == OSD_OP_WRITE)
{
has_writes = has_writes || !rm;
if (is_flush)
{
has_flushes = has_writes || !rm;
}
}
else if (opcode == OSD_OP_SYNC)
{
// Postpone writes until previous SYNC completes
// ...so dirty_writes can't contain anything newer than SYNC
has_flushes = has_writes || !rm;
}
if (rm)
{
op_queue.erase(op_queue.begin()+op_queue_pos, op_queue.begin()+op_queue_pos+1);
}
else
{
op_queue_pos++;
}
if (continuing_ops == 2)
{
goto restart;
}
}
continuing_ops = 0;
}
static uint32_t is_power_of_two(uint64_t value)
{
uint32_t l = 0;
while (value > 1)
{
if (value & 1)
{
return 64;
}
value = value >> 1;
l++;
}
return l;
}
void cluster_client_t::on_load_config_hook(json11::Json::object & config)
{
bs_block_size = config["block_size"].uint64_value();
bs_bitmap_granularity = config["bitmap_granularity"].uint64_value();
if (!bs_block_size)
{
bs_block_size = DEFAULT_BLOCK_SIZE;
}
if (!bs_bitmap_granularity)
{
bs_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
}
uint32_t block_order;
if ((block_order = is_power_of_two(bs_block_size)) >= 64 || bs_block_size < MIN_BLOCK_SIZE || bs_block_size >= MAX_BLOCK_SIZE)
{
throw std::runtime_error("Bad block size");
}
if (config["immediate_commit"] == "all")
{
// Cluster-wide immediate_commit mode
immediate_commit = true;
}
if (config.find("client_max_dirty_bytes") != config.end())
{
client_max_dirty_bytes = config["client_max_dirty_bytes"].uint64_value();
}
else if (config.find("client_dirty_limit") != config.end())
{
// Old name
client_max_dirty_bytes = config["client_dirty_limit"].uint64_value();
}
if (config.find("client_max_dirty_ops") != config.end())
{
client_max_dirty_ops = config["client_max_dirty_ops"].uint64_value();
}
if (!client_max_dirty_bytes)
{
client_max_dirty_bytes = DEFAULT_CLIENT_MAX_DIRTY_BYTES;
}
if (!client_max_dirty_ops)
{
client_max_dirty_ops = DEFAULT_CLIENT_MAX_DIRTY_OPS;
}
up_wait_retry_interval = config["up_wait_retry_interval"].uint64_value();
if (!up_wait_retry_interval)
{
up_wait_retry_interval = 500;
}
else if (up_wait_retry_interval < 50)
{
up_wait_retry_interval = 50;
}
msgr.parse_config(config);
msgr.parse_config(this->config);
st_cli.load_pgs();
}
void cluster_client_t::on_load_pgs_hook(bool success)
{
for (auto pool_item: st_cli.pool_config)
{
pg_counts[pool_item.first] = pool_item.second.real_pg_count;
}
pgs_loaded = true;
for (auto fn: on_ready_hooks)
{
fn();
}
on_ready_hooks.clear();
for (auto op: offline_ops)
{
execute(op);
}
offline_ops.clear();
continue_ops();
}
void cluster_client_t::on_change_hook(json11::Json::object & changes)
{
for (auto pool_item: st_cli.pool_config)
{
if (pg_counts[pool_item.first] != pool_item.second.real_pg_count)
{
// At this point, all pool operations should have been suspended
// And now they have to be resliced!
for (auto op: op_queue)
{
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ) &&
INODE_POOL(op->inode) == pool_item.first)
{
op->needs_reslice = true;
}
}
pg_counts[pool_item.first] = pool_item.second.real_pg_count;
}
}
continue_ops();
}
void cluster_client_t::on_change_osd_state_hook(uint64_t peer_osd)
{
if (msgr.wanted_peers.find(peer_osd) != msgr.wanted_peers.end())
{
msgr.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
}
}
bool cluster_client_t::is_ready()
{
return pgs_loaded;
}
void cluster_client_t::on_ready(std::function<void(void)> fn)
{
if (pgs_loaded)
{
fn();
}
else
{
on_ready_hooks.push_back(fn);
}
}
/**
* How writes are synced when immediate_commit is false
*
* "Continue" WRITE:
* 1) if the operation is not sliced yet - slice it
* 2) if the operation doesn't require reslice - try to connect & send all remaining parts
* 3) if any of them fail due to disconnected peers or PGs not up, repeat after reconnecting or small timeout
* 4) if any of them fail due to other errors, fail the operation and forget it from the current "unsynced batch"
* 5) if PG count changes before all parts are done, wait for all in-progress parts to finish,
* throw all results away, reslice and resubmit op
* 6) when all parts are done, try to "continue" the current SYNC
* 7) if the operation succeeds, but then some OSDs drop their connections, repeat
* parts from the current "unsynced batch" previously sent to those OSDs in any order
*
* "Continue" current SYNC:
* 1) take all unsynced operations from the current batch
* 2) check if all affected OSDs are still alive
* 3) if yes, send all SYNCs. otherwise, leave current SYNC as is.
* 4) if any of them fail due to disconnected peers, repeat SYNC after repeating all writes
* 5) if any of them fail due to other errors, fail the SYNC operation
*/
void cluster_client_t::execute(cluster_op_t *op)
{
if (op->opcode != OSD_OP_SYNC && op->opcode != OSD_OP_READ && op->opcode != OSD_OP_WRITE)
{
op->retval = -EINVAL;
std::function<void(cluster_op_t*)>(op->callback)(op);
return;
}
op->retval = 0;
if (op->opcode == OSD_OP_WRITE && !immediate_commit)
{
if (dirty_bytes >= client_max_dirty_bytes || dirty_ops >= client_max_dirty_ops)
{
// Push an extra SYNC operation to flush previous writes
cluster_op_t *sync_op = new cluster_op_t;
sync_op->opcode = OSD_OP_SYNC;
sync_op->callback = [](cluster_op_t* sync_op)
{
delete sync_op;
};
op_queue.push_back(sync_op);
dirty_bytes = 0;
dirty_ops = 0;
}
dirty_bytes += op->len;
dirty_ops++;
}
else if (op->opcode == OSD_OP_SYNC)
{
dirty_bytes = 0;
dirty_ops = 0;
}
op_queue.push_back(op);
continue_ops();
}
void cluster_client_t::copy_write(cluster_op_t *op, std::map<object_id, cluster_buffer_t> & dirty_buffers)
{
// Save operation for replay when one of PGs goes out of sync
// (primary OSD drops our connection in this case)
auto dirty_it = dirty_buffers.lower_bound((object_id){
.inode = op->inode,
.stripe = op->offset,
});
while (dirty_it != dirty_buffers.begin())
{
dirty_it--;
if (dirty_it->first.inode != op->inode ||
(dirty_it->first.stripe + dirty_it->second.len) <= op->offset)
{
dirty_it++;
break;
}
}
uint64_t pos = op->offset, len = op->len, iov_idx = 0, iov_pos = 0;
while (len > 0)
{
uint64_t new_len = 0;
if (dirty_it == dirty_buffers.end())
{
new_len = len;
}
else if (dirty_it->first.inode != op->inode || dirty_it->first.stripe > pos)
{
new_len = dirty_it->first.stripe - pos;
if (new_len > len)
{
new_len = len;
}
}
if (new_len > 0)
{
dirty_it = dirty_buffers.emplace_hint(dirty_it, (object_id){
.inode = op->inode,
.stripe = pos,
}, (cluster_buffer_t){
.buf = malloc_or_die(new_len),
.len = new_len,
});
}
// FIXME: Split big buffers into smaller ones on overwrites. But this will require refcounting
dirty_it->second.state = CACHE_DIRTY;
uint64_t cur_len = (dirty_it->first.stripe + dirty_it->second.len - pos);
if (cur_len > len)
{
cur_len = len;
}
while (cur_len > 0 && iov_idx < op->iov.count)
{
unsigned iov_len = (op->iov.buf[iov_idx].iov_len - iov_pos);
if (iov_len <= cur_len)
{
memcpy(dirty_it->second.buf + pos - dirty_it->first.stripe,
op->iov.buf[iov_idx].iov_base + iov_pos, iov_len);
pos += iov_len;
len -= iov_len;
cur_len -= iov_len;
iov_pos = 0;
iov_idx++;
}
else
{
memcpy(dirty_it->second.buf + pos - dirty_it->first.stripe,
op->iov.buf[iov_idx].iov_base + iov_pos, cur_len);
pos += cur_len;
len -= cur_len;
iov_pos += cur_len;
cur_len = 0;
}
}
dirty_it++;
}
}
void cluster_client_t::flush_buffer(const object_id & oid, cluster_buffer_t *wr)
{
wr->state = CACHE_REPEATING;
cluster_op_t *op = new cluster_op_t;
op->flags = OP_FLUSH_BUFFER;
op->opcode = OSD_OP_WRITE;
op->inode = oid.inode;
op->offset = oid.stripe;
op->len = wr->len;
op->iov.push_back(wr->buf, wr->len);
op->callback = [wr](cluster_op_t* op)
{
if (wr->state == CACHE_REPEATING)
{
wr->state = CACHE_DIRTY;
}
delete op;
};
op_queue.insert(op_queue.begin(), op);
if (continuing_ops)
{
continuing_ops = 2;
op_queue_pos++;
}
}
int cluster_client_t::continue_rw(cluster_op_t *op)
{
if (op->state == 0)
goto resume_0;
else if (op->state == 1)
goto resume_1;
else if (op->state == 2)
goto resume_2;
else if (op->state == 3)
goto resume_3;
resume_0:
if (!op->len || op->offset % bs_bitmap_granularity || op->len % bs_bitmap_granularity)
{
op->retval = -EINVAL;
std::function<void(cluster_op_t*)>(op->callback)(op);
return 1;
}
{
pool_id_t pool_id = INODE_POOL(op->inode);
if (!pool_id)
{
op->retval = -EINVAL;
std::function<void(cluster_op_t*)>(op->callback)(op);
return 1;
}
if (st_cli.pool_config.find(pool_id) == st_cli.pool_config.end() ||
st_cli.pool_config[pool_id].real_pg_count == 0)
{
// Postpone operations to unknown pools
return 0;
}
}
if (op->opcode == OSD_OP_WRITE)
{
if (!immediate_commit && !(op->flags & OP_FLUSH_BUFFER))
{
copy_write(op, dirty_buffers);
}
}
resume_1:
// Slice the operation into parts
slice_rw(op);
op->needs_reslice = false;
resume_2:
// Send unsent parts, if they're not subject to change
op->state = 3;
if (op->needs_reslice)
{
for (int i = 0; i < op->parts.size(); i++)
{
if (!(op->parts[i].flags & PART_SENT) && op->retval)
{
op->retval = -EPIPE;
}
}
goto resume_3;
}
for (int i = 0; i < op->parts.size(); i++)
{
if (!(op->parts[i].flags & PART_SENT))
{
if (!try_send(op, i))
{
// We'll need to retry again
op->up_wait = true;
if (!retry_timeout_id)
{
retry_timeout_id = tfd->set_timer(up_wait_retry_interval, false, [this](int)
{
retry_timeout_id = 0;
continue_ops(true);
});
}
op->state = 2;
}
}
}
if (op->state == 2)
{
return 0;
}
resume_3:
if (op->inflight_count > 0)
{
op->state = 3;
return 0;
}
if (op->done_count >= op->parts.size())
{
// Finished successfully
// Even if the PG count has changed in meanwhile we treat it as success
// because if some operations were invalid for the new PG count we'd get errors
op->retval = op->len;
std::function<void(cluster_op_t*)>(op->callback)(op);
return 1;
}
else if (op->retval != 0 && op->retval != -EPIPE)
{
// Fatal error (not -EPIPE)
std::function<void(cluster_op_t*)>(op->callback)(op);
return 1;
}
else
{
// -EPIPE - clear the error and retry
op->retval = 0;
if (op->needs_reslice)
{
op->parts.clear();
op->done_count = 0;
goto resume_1;
}
else
{
for (int i = 0; i < op->parts.size(); i++)
{
op->parts[i].flags = 0;
}
goto resume_2;
}
}
return 0;
}
void cluster_client_t::slice_rw(cluster_op_t *op)
{
// Slice the request into individual object stripe requests
// Primary OSDs still operate individual stripes, but their size is multiplied by PG minsize in case of EC
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->inode));
uint32_t pg_data_size = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
uint64_t pg_block_size = bs_block_size * pg_data_size;
uint64_t first_stripe = (op->offset / pg_block_size) * pg_block_size;
uint64_t last_stripe = ((op->offset + op->len + pg_block_size - 1) / pg_block_size - 1) * pg_block_size;
op->retval = 0;
op->parts.resize((last_stripe - first_stripe) / pg_block_size + 1);
int iov_idx = 0;
size_t iov_pos = 0;
int i = 0;
for (uint64_t stripe = first_stripe; stripe <= last_stripe; stripe += pg_block_size)
{
pg_num_t pg_num = (op->inode + stripe/pool_cfg.pg_stripe_size) % pool_cfg.real_pg_count + 1;
uint64_t begin = (op->offset < stripe ? stripe : op->offset);
uint64_t end = (op->offset + op->len) > (stripe + pg_block_size)
? (stripe + pg_block_size) : (op->offset + op->len);
op->parts[i] = (cluster_op_part_t){
.parent = op,
.offset = begin,
.len = (uint32_t)(end - begin),
.pg_num = pg_num,
.flags = 0,
};
int left = end-begin;
while (left > 0 && iov_idx < op->iov.count)
{
if (op->iov.buf[iov_idx].iov_len - iov_pos < left)
{
op->parts[i].iov.push_back(op->iov.buf[iov_idx].iov_base + iov_pos, op->iov.buf[iov_idx].iov_len - iov_pos);
left -= (op->iov.buf[iov_idx].iov_len - iov_pos);
iov_pos = 0;
iov_idx++;
}
else
{
op->parts[i].iov.push_back(op->iov.buf[iov_idx].iov_base + iov_pos, left);
iov_pos += left;
left = 0;
}
}
assert(left == 0);
i++;
}
}
bool cluster_client_t::affects_osd(uint64_t inode, uint64_t offset, uint64_t len, osd_num_t osd)
{
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(inode));
uint32_t pg_data_size = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
uint64_t pg_block_size = bs_block_size * pg_data_size;
uint64_t first_stripe = (offset / pg_block_size) * pg_block_size;
uint64_t last_stripe = ((offset + len + pg_block_size - 1) / pg_block_size - 1) * pg_block_size;
for (uint64_t stripe = first_stripe; stripe <= last_stripe; stripe += pg_block_size)
{
pg_num_t pg_num = (stripe/pool_cfg.pg_stripe_size) % pool_cfg.real_pg_count + 1; // like map_to_pg()
auto pg_it = pool_cfg.pg_config.find(pg_num);
if (pg_it != pool_cfg.pg_config.end() && pg_it->second.cur_primary == osd)
{
return true;
}
}
return false;
}
bool cluster_client_t::try_send(cluster_op_t *op, int i)
{
auto part = &op->parts[i];
auto & pool_cfg = st_cli.pool_config[INODE_POOL(op->inode)];
auto pg_it = pool_cfg.pg_config.find(part->pg_num);
if (pg_it != pool_cfg.pg_config.end() &&
!pg_it->second.pause && pg_it->second.cur_primary)
{
osd_num_t primary_osd = pg_it->second.cur_primary;
auto peer_it = msgr.osd_peer_fds.find(primary_osd);
if (peer_it != msgr.osd_peer_fds.end())
{
int peer_fd = peer_it->second;
part->osd_num = primary_osd;
part->flags |= PART_SENT;
op->inflight_count++;
part->op = (osd_op_t){
.op_type = OSD_OP_OUT,
.peer_fd = peer_fd,
.req = { .rw = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = op_id++,
.opcode = op->opcode,
},
.inode = op->inode,
.offset = part->offset,
.len = part->len,
} },
.callback = [this, part](osd_op_t *op_part)
{
handle_op_part(part);
},
};
part->op.iov = part->iov;
msgr.outbox_push(&part->op);
return true;
}
else if (msgr.wanted_peers.find(primary_osd) == msgr.wanted_peers.end())
{
msgr.connect_peer(primary_osd, st_cli.peer_states[primary_osd]);
}
}
return false;
}
int cluster_client_t::continue_sync(cluster_op_t *op)
{
if (op->state == 1)
goto resume_1;
if (immediate_commit || !dirty_osds.size())
{
// Sync is not required in the immediate_commit mode or if there are no dirty_osds
op->retval = 0;
std::function<void(cluster_op_t*)>(op->callback)(op);
return 1;
}
// Check that all OSD connections are still alive
for (auto sync_osd: dirty_osds)
{
auto peer_it = msgr.osd_peer_fds.find(sync_osd);
if (peer_it == msgr.osd_peer_fds.end())
{
return 0;
}
}
// Post sync to affected OSDs
for (auto & prev_op: dirty_buffers)
{
if (prev_op.second.state == CACHE_DIRTY)
{
prev_op.second.state = CACHE_FLUSHING;
}
}
op->parts.resize(dirty_osds.size());
op->retval = 0;
{
int i = 0;
for (auto sync_osd: dirty_osds)
{
op->parts[i] = {
.parent = op,
.osd_num = sync_osd,
.flags = 0,
};
send_sync(op, &op->parts[i]);
i++;
}
}
dirty_osds.clear();
resume_1:
if (op->inflight_count > 0)
{
op->state = 1;
return 0;
}
if (op->retval != 0)
{
for (auto uw_it = dirty_buffers.begin(); uw_it != dirty_buffers.end(); uw_it++)
{
if (uw_it->second.state == CACHE_FLUSHING)
{
uw_it->second.state = CACHE_DIRTY;
}
}
if (op->retval == -EPIPE)
{
// Retry later
op->parts.clear();
op->retval = 0;
op->inflight_count = 0;
op->done_count = 0;
op->state = 0;
return 0;
}
}
else
{
for (auto uw_it = dirty_buffers.begin(); uw_it != dirty_buffers.end(); )
{
if (uw_it->second.state == CACHE_FLUSHING)
{
free(uw_it->second.buf);
dirty_buffers.erase(uw_it++);
}
else
uw_it++;
}
}
std::function<void(cluster_op_t*)>(op->callback)(op);
return 1;
}
void cluster_client_t::send_sync(cluster_op_t *op, cluster_op_part_t *part)
{
auto peer_it = msgr.osd_peer_fds.find(part->osd_num);
assert(peer_it != msgr.osd_peer_fds.end());
part->flags |= PART_SENT;
op->inflight_count++;
part->op = (osd_op_t){
.op_type = OSD_OP_OUT,
.peer_fd = peer_it->second,
.req = {
.hdr = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = op_id++,
.opcode = OSD_OP_SYNC,
},
},
.callback = [this, part](osd_op_t *op_part)
{
handle_op_part(part);
},
};
msgr.outbox_push(&part->op);
}
void cluster_client_t::handle_op_part(cluster_op_part_t *part)
{
cluster_op_t *op = part->parent;
op->inflight_count--;
int expected = part->op.req.hdr.opcode == OSD_OP_SYNC ? 0 : part->op.req.rw.len;
if (part->op.reply.hdr.retval != expected)
{
// Operation failed, retry
printf(
"%s operation failed on OSD %lu: retval=%ld (expected %d), dropping connection\n",
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
);
if (part->op.reply.hdr.retval == -EPIPE)
{
// Mark op->up_wait = true before stopping the client
op->up_wait = true;
if (!retry_timeout_id)
{
retry_timeout_id = tfd->set_timer(up_wait_retry_interval, false, [this](int)
{
retry_timeout_id = 0;
continue_ops(true);
});
}
}
if (!op->retval || op->retval == -EPIPE)
{
// Don't overwrite other errors with -EPIPE
op->retval = part->op.reply.hdr.retval;
}
msgr.stop_client(part->op.peer_fd);
part->flags |= PART_ERROR;
}
else
{
// OK
dirty_osds.insert(part->osd_num);
part->flags |= PART_DONE;
op->done_count++;
}
if (op->inflight_count == 0)
{
continue_ops();
}
}
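To make the client contract above concrete, here is a hedged usage sketch built only from the interface shown in this file (execute(), the cluster_op_t fields that flush_buffer() itself sets, and the SYNC-based durability model). The cli pointer, the buffer and the inode value are assumptions for illustration; in the non-immediate_commit mode the write is only buffered for replay until the SYNC completes:

    // cli is assumed to be a constructed cluster_client_t*; buf holds 4 KB of data
    cluster_op_t *wr = new cluster_op_t;
    wr->opcode = OSD_OP_WRITE;
    wr->inode = 0x1000000000001;    // illustrative inode; the pool ID lives in the high bits (cf. INODE_POOL)
    wr->offset = 0;
    wr->len = 4096;
    wr->iov.push_back(buf, 4096);
    wr->callback = [](cluster_op_t *op)
    {
        // op->retval == op->len on success, a negative errno-style code on failure
        delete op;
    };
    cli->execute(wr);

    cluster_op_t *sync = new cluster_op_t;
    sync->opcode = OSD_OP_SYNC;     // makes all previously completed writes durable on the OSDs
    sync->callback = [](cluster_op_t *op) { delete op; };
    cli->execute(sync);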
@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
 #pragma once
@@ -10,7 +10,8 @@
 #define MAX_BLOCK_SIZE 128*1024*1024
 #define DEFAULT_DISK_ALIGNMENT 4096
 #define DEFAULT_BITMAP_GRANULARITY 4096
-#define DEFAULT_CLIENT_DIRTY_LIMIT 32*1024*1024
+#define DEFAULT_CLIENT_MAX_DIRTY_BYTES 32*1024*1024
+#define DEFAULT_CLIENT_MAX_DIRTY_OPS 1024
 struct cluster_op_t;
@@ -22,8 +23,7 @@ struct cluster_op_part_t
 pg_num_t pg_num;
 osd_num_t osd_num;
 osd_op_buf_list_t iov;
-bool sent;
-bool done;
+unsigned flags;
 osd_op_t op;
 };
@@ -37,70 +37,78 @@ struct cluster_op_t
 osd_op_buf_list_t iov;
 std::function<void(cluster_op_t*)> callback;
 protected:
+int flags = 0;
+int state = 0;
 void *buf = NULL;
 cluster_op_t *orig_op = NULL;
-bool is_internal = false;
 bool needs_reslice = false;
 bool up_wait = false;
-int sent_count = 0, done_count = 0;
+int inflight_count = 0, done_count = 0;
 std::vector<cluster_op_part_t> parts;
 friend class cluster_client_t;
 };
+struct cluster_buffer_t
+{
+    void *buf;
+    uint64_t len;
+    int state;
+};
+// FIXME: Split into public and private interfaces
 class cluster_client_t
 {
 timerfd_manager_t *tfd;
 ring_loop_t *ringloop;
 uint64_t bs_block_size = 0;
-uint64_t bs_disk_alignment = 0;
 uint64_t bs_bitmap_granularity = 0;
 std::map<pool_id_t, uint64_t> pg_counts;
 bool immediate_commit = false;
 // FIXME: Implement inmemory_commit mode. Note that it requires to return overlapping reads from memory.
-uint64_t client_dirty_limit = 0;
+uint64_t client_max_dirty_bytes = 0;
+uint64_t client_max_dirty_ops = 0;
 int log_level;
 int up_wait_retry_interval = 500; // ms
-uint64_t op_id = 1;
-ring_consumer_t consumer;
-// operations currently in progress
-std::set<cluster_op_t*> cur_ops;
 int retry_timeout_id = 0;
-// unsynced operations are copied in memory to allow replay when cluster isn't in the immediate_commit mode
-// unsynced_writes are replayed in any order (because only the SYNC operation guarantees ordering)
-std::vector<cluster_op_t*> unsynced_writes;
-std::vector<cluster_op_t*> syncing_writes;
-cluster_op_t* cur_sync = NULL;
-std::vector<cluster_op_t*> next_writes;
+uint64_t op_id = 1;
 std::vector<cluster_op_t*> offline_ops;
-uint64_t queued_bytes = 0;
+std::vector<cluster_op_t*> op_queue;
+std::map<object_id, cluster_buffer_t> dirty_buffers;
+std::set<osd_num_t> dirty_osds;
+uint64_t dirty_bytes = 0, dirty_ops = 0;
 bool pgs_loaded = false;
+ring_consumer_t consumer;
 std::vector<std::function<void(void)>> on_ready_hooks;
+int continuing_ops = 0;
+int op_queue_pos = 0;
 public:
 etcd_state_client_t st_cli;
 osd_messenger_t msgr;
+json11::Json config;
 cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
 ~cluster_client_t();
 void execute(cluster_op_t *op);
+bool is_ready();
 void on_ready(std::function<void(void)> fn);
-void stop();
-protected:
+static void copy_write(cluster_op_t *op, std::map<object_id, cluster_buffer_t> & dirty_buffers);
 void continue_ops(bool up_retry = false);
+protected:
+bool affects_osd(uint64_t inode, uint64_t offset, uint64_t len, osd_num_t osd);
+void flush_buffer(const object_id & oid, cluster_buffer_t *wr);
 void on_load_config_hook(json11::Json::object & config);
 void on_load_pgs_hook(bool success);
 void on_change_hook(json11::Json::object & changes);
 void on_change_osd_state_hook(uint64_t peer_osd);
-void continue_rw(cluster_op_t *op);
+int continue_rw(cluster_op_t *op);
 void slice_rw(cluster_op_t *op);
-bool try_send(cluster_op_t *op, cluster_op_part_t *part);
-void execute_sync(cluster_op_t *op);
-void continue_sync();
-void finish_sync();
+bool try_send(cluster_op_t *op, int i);
+int continue_sync(cluster_op_t *op);
 void send_sync(cluster_op_t *op, cluster_op_part_t *part);
 void handle_op_part(cluster_op_part_t *part);
 };
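For orientation, the dirty-buffer lifecycle implied by cluster_buffer_t's state field and the CACHE_* constants in cluster_client.cpp reads like this:

    // As read from the code above (not a new state machine):
    //   write arrives   -> CACHE_DIRTY      (copy_write() stores a copy for replay)
    //   SYNC issued     -> CACHE_FLUSHING   (continue_sync() marks dirty buffers)
    //   SYNC succeeds   -> buffer freed and erased from dirty_buffers
    //   SYNC fails      -> CACHE_DIRTY      (rolled back, will be flushed again)
    //   peer OSD drops  -> CACHE_REPEATING  (flush_buffer() re-queues the write;
    //                      back to CACHE_DIRTY when the repeated write completes)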
@@ -8,4 +8,10 @@
 // unsigned __int64 _mm_crc32_u64 (unsigned __int64 crc, unsigned __int64 v)
 // unsigned int _mm_crc32_u8 (unsigned int crc, unsigned char v)
+#ifdef __cplusplus
+extern "C" {
+#endif
 uint32_t crc32c(uint32_t crc, const void *buf, size_t len);
+#ifdef __cplusplus
+};
+#endif
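The extern "C" guards added above give crc32c() C linkage, so one header serves both C and C++ callers: without them, a C++ translation unit would look for a mangled symbol (something like _Z6crc32cjPKvm) while the C implementation exports plain crc32c, and the link would fail. A minimal sketch of the shared declaration and a caller (the 0 seed is an assumption here, not taken from this diff):

    #include <stdint.h>
    #include <stddef.h>

    #ifdef __cplusplus
    extern "C" {
    #endif
    uint32_t crc32c(uint32_t crc, const void *buf, size_t len);
    #ifdef __cplusplus
    }
    #endif

    static inline uint32_t checksum_block(const void *buf, size_t len)
    {
        return crc32c(0, buf, len);   // assumed initial CRC value of 0
    }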
@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 (see README.md for details)
+// License: VNPL-1.1 (see README.md for details)
 #define _LARGEFILE64_SOURCE
 #include <sys/types.h>
@@ -26,23 +26,32 @@ struct journal_dump_t
 uint64_t journal_offset;
 uint64_t journal_len;
 uint64_t journal_pos;
+bool all;
+bool started;
 int fd;
+uint32_t crc32_last;
-void dump_block(void *buf);
+int dump_block(void *buf);
 };
 int main(int argc, char *argv[])
 {
-    if (argc < 5)
+    journal_dump_t self = { 0 };
+    int b = 1;
+    if (argc >= 2 && !strcmp(argv[1], "--all"))
     {
-        printf("USAGE: %s <journal_file> <journal_block_size> <offset> <size>\n", argv[0]);
+        self.all = true;
+        b = 2;
+    }
+    if (argc < b+4)
+    {
+        printf("USAGE: %s [--all] <journal_file> <journal_block_size> <offset> <size>\n", argv[0]);
         return 1;
     }
-    journal_dump_t self;
-    self.journal_device = argv[1];
-    self.journal_block = strtoul(argv[2], NULL, 10);
-    self.journal_offset = strtoull(argv[3], NULL, 10);
-    self.journal_len = strtoull(argv[4], NULL, 10);
+    self.journal_device = argv[b];
+    self.journal_block = strtoul(argv[b+1], NULL, 10);
+    self.journal_offset = strtoull(argv[b+2], NULL, 10);
+    self.journal_len = strtoull(argv[b+3], NULL, 10);
     if (self.journal_block < MEM_ALIGNMENT || (self.journal_block % MEM_ALIGNMENT) ||
         self.journal_block > 128*1024)
     {
@@ -57,30 +66,64 @@ int main(int argc, char *argv[])
 }
 void *data = memalign(MEM_ALIGNMENT, self.journal_block);
 self.journal_pos = 0;
-while (self.journal_pos < self.journal_len)
+if (self.all)
+{
+    while (self.journal_pos < self.journal_len)
+    {
+        int r = pread(self.fd, data, self.journal_block, self.journal_offset+self.journal_pos);
+        assert(r == self.journal_block);
+        uint64_t s;
+        for (s = 0; s < self.journal_block; s += 8)
+        {
+            if (*((uint64_t*)(data+s)) != 0)
+                break;
+        }
+        if (s == self.journal_block)
+        {
+            printf("offset %08lx: zeroes\n", self.journal_pos);
+            self.journal_pos += self.journal_block;
+        }
+        else if (((journal_entry*)data)->magic == JOURNAL_MAGIC)
+        {
+            printf("offset %08lx:\n", self.journal_pos);
+            self.dump_block(data);
+        }
+        else
+        {
+            printf("offset %08lx: no magic in the beginning, looks like random data (pattern=%lx)\n", self.journal_pos, *((uint64_t*)data));
+            self.journal_pos += self.journal_block;
+        }
+    }
+}
+else
 {
     int r = pread(self.fd, data, self.journal_block, self.journal_offset+self.journal_pos);
     assert(r == self.journal_block);
-    uint64_t s;
-    for (s = 0; s < self.journal_block; s += 8)
-    {
-        if (*((uint64_t*)(data+s)) != 0)
-            break;
-    }
-    if (s == self.journal_block)
-    {
-        printf("offset %08lx: zeroes\n", self.journal_pos);
-        self.journal_pos += self.journal_block;
-    }
-    else if (((journal_entry*)data)->magic == JOURNAL_MAGIC)
-    {
-        printf("offset %08lx:\n", self.journal_pos);
-        self.dump_block(data);
-    }
+    journal_entry *je = (journal_entry*)(data);
+    if (je->magic != JOURNAL_MAGIC || je->type != JE_START || je_crc32(je) != je->crc32)
+    {
+        printf("offset %08lx: journal superblock is invalid\n", self.journal_pos);
+    }
     else
     {
-        printf("offset %08lx: no magic in the beginning, looks like random data (pattern=%lx)\n", self.journal_pos, *((uint64_t*)data));
-        self.journal_pos += self.journal_block;
+        printf("offset %08lx:\n", self.journal_pos);
+        self.dump_block(data);
+        self.started = false;
+        self.journal_pos = je->start.journal_start;
+        while (1)
+        {
+            if (self.journal_pos >= self.journal_len)
+                self.journal_pos = self.journal_block;
+            r = pread(self.fd, data, self.journal_block, self.journal_offset+self.journal_pos);
+            assert(r == self.journal_block);
+            printf("offset %08lx:\n", self.journal_pos);
+            r = self.dump_block(data);
+            if (r <= 0)
+            {
+                printf("end of the journal\n");
+                break;
+            }
+        }
     }
 }
 free(data);
@@ -88,7 +131,7 @@ int main(int argc, char *argv[])
 return 0;
 }
-void journal_dump_t::dump_block(void *buf)
+int journal_dump_t::dump_block(void *buf)
 {
 uint32_t pos = 0;
 journal_pos += journal_block;
@@ -97,12 +140,19 @@ void journal_dump_t::dump_block(void *buf)
 while (pos < journal_block)
 {
     journal_entry *je = (journal_entry*)(buf + pos);
-    if (je->magic != JOURNAL_MAGIC || je->type < JE_MIN || je->type > JE_MAX)
+    if (je->magic != JOURNAL_MAGIC || je->type < JE_MIN || je->type > JE_MAX ||
+        !all && started && je->crc32_prev != crc32_last)
     {
         break;
     }
-    const char *crc32_valid = je_crc32(je) == je->crc32 ? "(valid)" : "(invalid)";
-    printf("entry % 3d: crc32=%08x %s prev=%08x ", entry, je->crc32, crc32_valid, je->crc32_prev);
+    bool crc32_valid = je_crc32(je) == je->crc32;
+    if (!all && !crc32_valid)
+    {
+        break;
+    }
+    started = true;
+    crc32_last = je->crc32;
+    printf("entry % 3d: crc32=%08x %s prev=%08x ", entry, je->crc32, (crc32_valid ? "(valid)" : "(invalid)"), je->crc32_prev);
     if (je->type == JE_START)
     {
         printf("je_start start=%08lx\n", je->start.journal_start);
@@ -170,4 +220,5 @@ void journal_dump_t::dump_block(void *buf)
 {
     journal_pos = journal_len;
 }
+return entry;
 }
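With these changes the dumper gains two modes: by default it validates the JE_START superblock, jumps to journal_start and follows the entry chain, stopping once crc32_prev no longer matches the previous entry's CRC; with --all it linearly scans every block in the given range and prints whatever it finds (valid entries, zeroes, or random data). Per the updated USAGE string, an invocation takes the form <binary> [--all] <journal_file> <journal_block_size> <offset> <size>, e.g. something like ./dump_journal --all ./journal.bin 4096 0 16777216 (binary name and numbers purely illustrative).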
@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
 #include <sys/epoll.h>
 #include <sys/poll.h>
@@ -84,8 +84,12 @@ void epoll_manager_t::handle_epoll_events()
 nfds = epoll_wait(epoll_fd, events, MAX_EPOLL_EVENTS, 0);
 for (int i = 0; i < nfds; i++)
 {
-    auto & cb = epoll_handlers[events[i].data.fd];
-    cb(events[i].data.fd, events[i].events);
+    auto cb_it = epoll_handlers.find(events[i].data.fd);
+    if (cb_it != epoll_handlers.end())
+    {
+        auto & cb = cb_it->second;
+        cb(events[i].data.fd, events[i].events);
+    }
 }
 } while (nfds == MAX_EPOLL_EVENTS);
 }
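The find()-based lookup above closes a subtle hole: if the callback for one fd in an epoll_wait() batch unregisters another fd from the same batch, the old epoll_handlers[fd] indexing would default-construct an empty std::function for the now-missing key, and invoking it throws std::bad_function_call. A self-contained illustration of the two patterns:

    #include <functional>
    #include <map>

    int main()
    {
        std::map<int, std::function<void()>> handlers;
        handlers.erase(5);            // fd 5 was unregistered by an earlier callback
        // handlers[5]();             // old pattern: inserts an empty handler and throws
        auto it = handlers.find(5);   // fixed pattern: look up first,
        if (it != handlers.end())     // silently skip fds that are no longer registered
            it->second();
        return 0;
    }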
@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
 #pragma once
@@ -1,12 +1,27 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
 #include "osd_ops.h"
 #include "pg_states.h"
 #include "etcd_state_client.h"
+#ifndef __MOCK__
 #include "http_client.h"
 #include "base64.h"
+#endif
+etcd_state_client_t::~etcd_state_client_t()
+{
+    etcd_watches_initialised = -1;
+#ifndef __MOCK__
+    if (etcd_watch_ws)
+    {
+        etcd_watch_ws->close();
+        etcd_watch_ws = NULL;
+    }
+#endif
+}
+#ifndef __MOCK__
 json_kv_t etcd_state_client_t::parse_etcd_kv(const json11::Json & kv_json)
 {
 json_kv_t kv;
@@ -46,6 +61,23 @@ void etcd_state_client_t::etcd_call(std::string api, json11::Json payload, int t
 http_request_json(tfd, etcd_address, req, timeout, callback);
 }
+void etcd_state_client_t::add_etcd_url(std::string addr)
+{
+    if (addr.length() > 0)
+    {
+        if (strtolower(addr.substr(0, 7)) == "http://")
+            addr = addr.substr(7);
+        else if (strtolower(addr.substr(0, 8)) == "https://")
+        {
+            printf("HTTPS is unsupported for etcd. Either use plain HTTP or setup a local proxy for etcd interaction\n");
+            exit(1);
+        }
+        if (addr.find('/') < 0)
+            addr += "/v3";
+        this->etcd_addresses.push_back(addr);
+    }
+}
 void etcd_state_client_t::parse_config(json11::Json & config)
 {
 this->etcd_addresses.clear();
@@ -55,13 +87,7 @@ void etcd_state_client_t::parse_config(json11::Json & config)
 while (1)
 {
     int pos = ea.find(',');
-    std::string addr = pos >= 0 ? ea.substr(0, pos) : ea;
-    if (addr.length() > 0)
-    {
-        if (addr.find('/') < 0)
-            addr += "/v3";
-        this->etcd_addresses.push_back(addr);
-    }
+    add_etcd_url(pos >= 0 ? ea.substr(0, pos) : ea);
     if (pos >= 0)
         ea = ea.substr(pos+1);
     else
@@ -72,13 +98,7 @@ void etcd_state_client_t::parse_config(json11::Json & config)
 {
     for (auto & ea: config["etcd_address"].array_items())
     {
-        std::string addr = ea.string_value();
-        if (addr != "")
-        {
-            if (addr.find('/') < 0)
-                addr += "/v3";
-            this->etcd_addresses.push_back(addr);
-        }
+        add_etcd_url(ea.string_value());
     }
 }
 this->etcd_prefix = config["etcd_prefix"].string_value();
@@ -160,7 +180,7 @@ void etcd_state_client_t::start_etcd_watcher()
     start_etcd_watcher();
 });
 }
-else
+else if (etcd_watches_initialised > 0)
 {
     // Connection was live, retry immediately
     start_etcd_watcher();
@@ -308,6 +328,26 @@ void etcd_state_client_t::load_pgs()
     start_etcd_watcher();
 });
 }
+#else
+void etcd_state_client_t::parse_config(json11::Json & config)
+{
+}
+void etcd_state_client_t::load_global_config()
+{
+    json11::Json::object global_config;
+    on_load_config_hook(global_config);
+}
+void etcd_state_client_t::load_pgs()
+{
+}
+#endif
+void etcd_state_client_t::parse_state(const json_kv_t & kv)
+{
+    parse_state(kv.key, kv.value);
+}
 void etcd_state_client_t::parse_state(const std::string & key, const json11::Json & value)
 {
@@ -321,8 +361,10 @@ void etcd_state_client_t::parse_state(const std::string & key, const json11::Json & value)
 {
     pool_config_t pc;
     // ID
-    pool_id_t pool_id = stoull_full(pool_item.first);
-    if (!pool_id || pool_id >= POOL_ID_MAX)
+    pool_id_t pool_id;
+    char null_byte = 0;
+    sscanf(pool_item.first.c_str(), "%u%c", &pool_id, &null_byte);
+    if (!pool_id || pool_id >= POOL_ID_MAX || null_byte != 0)
     {
         printf("Pool ID %s is invalid (must be a number less than 0x%x), skipping pool\n", pool_item.first.c_str(), POOL_ID_MAX);
         continue;
@@ -407,6 +449,7 @@ void etcd_state_client_t::parse_state(const std::string & key, const json11::Json & value)
 if (pc.pg_stripe_size < min_stripe_size)
     pc.pg_stripe_size = min_stripe_size;
 // Save
+pc.real_pg_count = this->pool_config[pool_id].real_pg_count;
 std::swap(pc.pg_config, this->pool_config[pool_id].pg_config);
 std::swap(this->pool_config[pool_id], pc);
 auto & parsed_cfg = this->pool_config[pool_id];
@@ -433,16 +476,19 @@ void etcd_state_client_t::parse_state(const std::string & key, const json11::Json & value)
 }
 for (auto & pool_item: value["items"].object_items())
 {
-    pool_id_t pool_id = stoull_full(pool_item.first);
-    if (!pool_id || pool_id >= POOL_ID_MAX)
+    pool_id_t pool_id;
+    char null_byte = 0;
+    sscanf(pool_item.first.c_str(), "%u%c", &pool_id, &null_byte);
+    if (!pool_id || pool_id >= POOL_ID_MAX || null_byte != 0)
     {
        printf("Pool ID %s is invalid in PG configuration (must be a number less than 0x%x), skipping pool\n", pool_item.first.c_str(), POOL_ID_MAX);
        continue;
    }
    for (auto & pg_item: pool_item.second.object_items())
    {
-        pg_num_t pg_num = stoull_full(pg_item.first);
-        if (!pg_num)
+        pg_num_t pg_num = 0;
+        sscanf(pg_item.first.c_str(), "%u%c", &pg_num, &null_byte);
+        if (!pg_num || null_byte != 0)
        {
            printf("Bad key in pool %u PG configuration: %s (must be a number), skipped\n", pool_id, pg_item.first.c_str());
            continue;
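For reference, the normalization intended by the new add_etcd_url() above maps the common input forms as follows:

    // Intended behaviour, as read from add_etcd_url():
    //   "10.0.0.1:2379"             -> "10.0.0.1:2379/v3"    ("/v3" appended when no path is given)
    //   "http://etcd.local:2379/v3" -> "etcd.local:2379/v3"  ("http://" prefix stripped)
    //   "https://etcd.local:2379"   -> fatal: HTTPS is unsupported, use plain HTTP or a local proxy
    //   ""                          -> ignored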
@@ -1,10 +1,10 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
 #pragma once
+#include "json11/json11.hpp"
 #include "osd_id.h"
-#include "http_client.h"
 #include "timerfd_manager.h"
 #define ETCD_CONFIG_WATCH_ID 1
@@ -52,8 +52,15 @@ struct pool_config_t
     std::map<pg_num_t, pg_config_t> pg_config;
 };
+struct websocket_t;
 struct etcd_state_client_t
 {
+protected:
+    websocket_t *etcd_watch_ws = NULL;
+    uint64_t bs_block_size = DEFAULT_BLOCK_SIZE;
+    void add_etcd_url(std::string);
+public:
     std::vector<std::string> etcd_addresses;
     std::string etcd_prefix;
     int log_level = 0;
@@ -61,8 +68,6 @@ struct etcd_state_client_t
     int etcd_watches_initialised = 0;
     uint64_t etcd_watch_revision = 0;
-    websocket_t *etcd_watch_ws = NULL;
-    uint64_t bs_block_size = 0;
     std::map<pool_id_t, pool_config_t> pool_config;
     std::map<osd_num_t, json11::Json> peer_states;
@@ -79,6 +84,8 @@ struct etcd_state_client_t
     void start_etcd_watcher();
     void load_global_config();
     void load_pgs();
+    void parse_state(const json_kv_t & kv);
     void parse_state(const std::string & key, const json11::Json & value);
     void parse_config(json11::Json & config);
+    ~etcd_state_client_t();
 };
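The include changes above swap the heavy http_client.h header for a forward declaration of websocket_t. A minimal self-contained sketch of that decoupling pattern (all names illustrative): a pointer to an incomplete type compiles fine, so only the implementation file has to see the full definition.

// Forward declaration instead of #include: enough for a pointer member.
struct websocket_t;
struct state_client_t
{
    websocket_t *etcd_watch_ws = nullptr; // OK: pointer to incomplete type
};
// Only the .cpp that dereferences the pointer needs the real definition:
struct websocket_t { int fd; };
int main()
{
    state_client_t cli;
    (void)cli;
    return 0;
}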


@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
 // FIO engine to test cluster I/O
 //
@@ -93,7 +93,7 @@ static struct fio_option options[] = {
    {
        .name = "cluster_log_level",
        .lname = "cluster log level",
-       .type = FIO_OPT_BOOL,
+       .type = FIO_OPT_INT,
        .off1 = offsetof(struct sec_options, cluster_log),
        .help = "Set log level for the Vitastor client",
        .def = "0",
@@ -117,8 +117,15 @@ static struct fio_option options[] = {
 static int sec_setup(struct thread_data *td)
 {
+    sec_options *o = (sec_options*)td->eo;
     sec_data *bsd;
+    if (!o->etcd_host)
+    {
+        td_verror(td, EINVAL, "etcd address is missing");
+        return 1;
+    }
     bsd = new sec_data;
     if (!bsd)
     {
@@ -145,9 +152,7 @@ static void sec_cleanup(struct thread_data *td)
        delete bsd->cli;
        delete bsd->epmgr;
        delete bsd->ringloop;
-       bsd->cli = NULL;
-       bsd->epmgr = NULL;
-       bsd->ringloop = NULL;
+       delete bsd;
    }
 }


@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 (see README.md for details)
+// License: VNPL-1.1 (see README.md for details)
 // FIO engine to test Blockstore
 //


@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
 // FIO engine to test Blockstore through Secondary OSD interface
 //
@@ -140,6 +140,7 @@ static void sec_cleanup(struct thread_data *td)
    if (bsd)
    {
        close(bsd->connect_fd);
+       delete bsd;
    }
 }
@@ -312,6 +313,7 @@ static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int
        exit(1);
    }
    io_u* io = it->second;
+   bsd->queue.erase(it);
    if (io->ddir == DDIR_READ)
    {
        if (reply.hdr.retval != io->xfer_buflen)


@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
 #include <netinet/tcp.h>
 #include <sys/epoll.h>
@@ -22,7 +22,6 @@
 #define READ_BUFFER_SIZE 9000
 static int extract_port(std::string & host);
-static std::string strtolower(const std::string & in);
 static std::string trim(const std::string & in);
 static std::string ws_format_frame(int type, uint64_t size);
 static bool ws_parse_frame(std::string & buf, int & type, std::string & res);
@@ -673,7 +672,7 @@ static int extract_port(std::string & host)
    return port;
 }
-static std::string strtolower(const std::string & in)
+std::string strtolower(const std::string & in)
 {
    std::string s = in;
    for (int i = 0; i < s.length(); i++)


@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
 #pragma once
 #include <string>
@@ -49,6 +49,8 @@ std::vector<std::string> getifaddr_list(bool include_v6 = false);
 uint64_t stoull_full(const std::string & str, int base = 10);
+std::string strtolower(const std::string & in);
 void http_request(timerfd_manager_t *tfd, const std::string & host, const std::string & request,
     const http_options_t & options, std::function<void(const http_response_t *response)> callback);


@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
 #pragma once


@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
 #include <unistd.h>
 #include <fcntl.h>
@@ -10,30 +10,119 @@
 #include "messenger.h"
-osd_op_t::~osd_op_t()
-{
-    assert(!bs_op);
-    assert(!op_data);
-    if (rmw_buf)
-    {
-        free(rmw_buf);
-    }
-    if (buf)
-    {
-        // Note: reusing osd_op_t WILL currently lead to memory leaks
-        // So we don't reuse it, but free it every time
-        free(buf);
-    }
-}
+void osd_messenger_t::init()
+{
+    keepalive_timer_id = tfd->set_timer(1000, true, [this](int)
+    {
+        std::vector<int> to_stop;
+        std::vector<osd_op_t*> to_ping;
+        for (auto cl_it = clients.begin(); cl_it != clients.end(); cl_it++)
+        {
+            auto cl = cl_it->second;
+            if (!cl->osd_num || cl->peer_state != PEER_CONNECTED)
+            {
+                // Do not run keepalive on regular clients
+                continue;
+            }
+            if (cl->ping_time_remaining > 0)
+            {
+                cl->ping_time_remaining--;
+                if (!cl->ping_time_remaining)
+                {
+                    // Ping timed out, stop the client
+                    printf("Ping timed out for OSD %lu (client %d), disconnecting peer\n", cl->osd_num, cl->peer_fd);
+                    to_stop.push_back(cl->peer_fd);
+                }
+            }
+            else if (cl->idle_time_remaining > 0)
+            {
+                cl->idle_time_remaining--;
+                if (!cl->idle_time_remaining)
+                {
+                    // Connection is idle for <osd_idle_time>, send ping
+                    osd_op_t *op = new osd_op_t();
+                    op->op_type = OSD_OP_OUT;
+                    op->peer_fd = cl->peer_fd;
+                    op->req = (osd_any_op_t){
+                        .hdr = {
+                            .magic = SECONDARY_OSD_OP_MAGIC,
+                            .id = this->next_subop_id++,
+                            .opcode = OSD_OP_PING,
+                        },
+                    };
+                    op->callback = [this, cl](osd_op_t *op)
+                    {
+                        int fail_fd = (op->reply.hdr.retval != 0 ? op->peer_fd : -1);
+                        cl->ping_time_remaining = 0;
+                        delete op;
+                        if (fail_fd >= 0)
+                        {
+                            printf("Ping failed for OSD %lu (client %d), disconnecting peer\n", cl->osd_num, cl->peer_fd);
+                            stop_client(fail_fd, true);
+                        }
+                    };
+                    to_ping.push_back(op);
+                    cl->ping_time_remaining = osd_ping_timeout;
+                    cl->idle_time_remaining = osd_idle_timeout;
+                }
+            }
+            else
+            {
+                cl->idle_time_remaining = osd_idle_timeout;
+            }
+        }
+        // Don't stop clients while a 'clients' iterator is still active
+        for (int peer_fd: to_stop)
+        {
+            stop_client(peer_fd, true);
+        }
+        for (auto op: to_ping)
+        {
+            outbox_push(op);
+        }
+    });
+}
 osd_messenger_t::~osd_messenger_t()
 {
+    if (keepalive_timer_id >= 0)
+    {
+        tfd->clear_timer(keepalive_timer_id);
+        keepalive_timer_id = -1;
+    }
     while (clients.size() > 0)
     {
-        stop_client(clients.begin()->first);
+        stop_client(clients.begin()->first, true);
     }
 }
+void osd_messenger_t::parse_config(const json11::Json & config)
+{
+    this->use_sync_send_recv = config["use_sync_send_recv"].bool_value() ||
+        config["use_sync_send_recv"].uint64_value();
+    this->peer_connect_interval = config["peer_connect_interval"].uint64_value();
+    if (!this->peer_connect_interval)
+    {
+        this->peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
+    }
+    this->peer_connect_timeout = config["peer_connect_timeout"].uint64_value();
+    if (!this->peer_connect_timeout)
+    {
+        this->peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
+    }
+    this->osd_idle_timeout = config["osd_idle_timeout"].uint64_value();
+    if (!this->osd_idle_timeout)
+    {
+        this->osd_idle_timeout = DEFAULT_OSD_PING_TIMEOUT;
+    }
+    this->osd_ping_timeout = config["osd_ping_timeout"].uint64_value();
+    if (!this->osd_ping_timeout)
+    {
+        this->osd_ping_timeout = DEFAULT_OSD_PING_TIMEOUT;
+    }
+    this->log_level = config["log_level"].uint64_value();
+}
 void osd_messenger_t::connect_peer(uint64_t peer_osd, json11::Json peer_state)
 {
     if (wanted_peers.find(peer_osd) == wanted_peers.end())
@@ -49,17 +138,14 @@ void osd_messenger_t::connect_peer(uint64_t peer_osd, json11::Json peer_state)
        wanted_peers[peer_osd].port = (int)peer_state["port"].int64_value();
    }
    wanted_peers[peer_osd].address_changed = true;
-   if (!wanted_peers[peer_osd].connecting &&
-       (time(NULL) - wanted_peers[peer_osd].last_connect_attempt) >= peer_connect_interval)
-   {
-       try_connect_peer(peer_osd);
-   }
+   try_connect_peer(peer_osd);
 }
 void osd_messenger_t::try_connect_peer(uint64_t peer_osd)
 {
    auto wp_it = wanted_peers.find(peer_osd);
-   if (wp_it == wanted_peers.end())
+   if (wp_it == wanted_peers.end() || wp_it->second.connecting ||
+       (time(NULL) - wp_it->second.last_connect_attempt) < peer_connect_interval)
    {
        return;
    }
@@ -105,31 +191,29 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port)
        on_connect_peer(peer_osd, -errno);
        return;
    }
-   int timeout_id = -1;
-   if (peer_connect_timeout > 0)
-   {
-       timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, peer_fd](int timer_id)
-       {
-           osd_num_t peer_osd = clients.at(peer_fd)->osd_num;
-           stop_client(peer_fd);
-           on_connect_peer(peer_osd, -EIO);
-           return;
-       });
-   }
-   clients[peer_fd] = new osd_client_t((osd_client_t){
-       .peer_addr = addr,
-       .peer_port = peer_port,
-       .peer_fd = peer_fd,
-       .peer_state = PEER_CONNECTING,
-       .connect_timeout_id = timeout_id,
-       .osd_num = peer_osd,
-       .in_buf = malloc_or_die(receive_buffer_size),
-   });
+   clients[peer_fd] = new osd_client_t();
+   clients[peer_fd]->peer_addr = addr;
+   clients[peer_fd]->peer_port = peer_port;
+   clients[peer_fd]->peer_fd = peer_fd;
+   clients[peer_fd]->peer_state = PEER_CONNECTING;
+   clients[peer_fd]->connect_timeout_id = -1;
+   clients[peer_fd]->osd_num = peer_osd;
+   clients[peer_fd]->in_buf = malloc_or_die(receive_buffer_size);
    tfd->set_fd_handler(peer_fd, true, [this](int peer_fd, int epoll_events)
    {
        // Either OUT (connected) or HUP
        handle_connect_epoll(peer_fd);
    });
+   if (peer_connect_timeout > 0)
+   {
+       clients[peer_fd]->connect_timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, peer_fd](int timer_id)
+       {
+           osd_num_t peer_osd = clients.at(peer_fd)->osd_num;
+           stop_client(peer_fd, true);
+           on_connect_peer(peer_osd, -EIO);
+           return;
+       });
+   }
 }
 void osd_messenger_t::handle_connect_epoll(int peer_fd)
@@ -149,7 +233,7 @@ void osd_messenger_t::handle_connect_epoll(int peer_fd)
    }
    if (result != 0)
    {
-       stop_client(peer_fd);
+       stop_client(peer_fd, true);
        on_connect_peer(peer_osd, -result);
        return;
    }
@@ -171,7 +255,7 @@ void osd_messenger_t::handle_peer_epoll(int peer_fd, int epoll_events)
    {
        // Stop client
        printf("[OSD %lu] client %d disconnected\n", this->osd_num, peer_fd);
-       stop_client(peer_fd);
+       stop_client(peer_fd, true);
    }
    else if (epoll_events & EPOLLIN)
    {
@@ -281,112 +365,6 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
    outbox_push(op);
 }
-void osd_messenger_t::cancel_osd_ops(osd_client_t *cl)
-{
-    for (auto p: cl->sent_ops)
-    {
-        cancel_op(p.second);
-    }
-    cl->sent_ops.clear();
-    cl->outbox.clear();
-}
-void osd_messenger_t::cancel_op(osd_op_t *op)
-{
-    if (op->op_type == OSD_OP_OUT)
-    {
-        op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
-        op->reply.hdr.id = op->req.hdr.id;
-        op->reply.hdr.opcode = op->req.hdr.opcode;
-        op->reply.hdr.retval = -EPIPE;
-        // Copy lambda to be unaffected by `delete op`
-        std::function<void(osd_op_t*)>(op->callback)(op);
-    }
-    else
-    {
-        // This function is only called in stop_client(), so it's fine to destroy the operation
-        delete op;
-    }
-}
-void osd_messenger_t::stop_client(int peer_fd)
-{
-    assert(peer_fd != 0);
-    auto it = clients.find(peer_fd);
-    if (it == clients.end())
-    {
-        return;
-    }
-    uint64_t repeer_osd = 0;
-    osd_client_t *cl = it->second;
-    if (cl->peer_state == PEER_CONNECTED)
-    {
-        if (cl->osd_num)
-        {
-            // Reload configuration from etcd when the connection is dropped
-            if (log_level > 0)
-                printf("[OSD %lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl->osd_num);
-            repeer_osd = cl->osd_num;
-        }
-        else
-        {
-            if (log_level > 0)
-                printf("[OSD %lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
-        }
-    }
-    cl->peer_state = PEER_STOPPED;
-    clients.erase(it);
-    tfd->set_fd_handler(peer_fd, false, NULL);
-    if (cl->connect_timeout_id >= 0)
-    {
-        tfd->clear_timer(cl->connect_timeout_id);
-        cl->connect_timeout_id = -1;
-    }
-    if (cl->osd_num)
-    {
-        osd_peer_fds.erase(cl->osd_num);
-    }
-    if (cl->read_op)
-    {
-        delete cl->read_op;
-        cl->read_op = NULL;
-    }
-    for (auto rit = read_ready_clients.begin(); rit != read_ready_clients.end(); rit++)
-    {
-        if (*rit == peer_fd)
-        {
-            read_ready_clients.erase(rit);
-            break;
-        }
-    }
-    for (auto wit = write_ready_clients.begin(); wit != write_ready_clients.end(); wit++)
-    {
-        if (*wit == peer_fd)
-        {
-            write_ready_clients.erase(wit);
-            break;
-        }
-    }
-    free(cl->in_buf);
-    cl->in_buf = NULL;
-    close(peer_fd);
-    if (repeer_osd)
-    {
-        // First repeer PGs as canceling OSD ops may push new operations
-        // and we need correct PG states when we do that
-        repeer_pgs(repeer_osd);
-    }
-    if (cl->osd_num)
-    {
-        // Cancel outbound operations
-        cancel_osd_ops(cl);
-    }
-    if (cl->refs <= 0)
-    {
-        delete cl;
-    }
-}
 void osd_messenger_t::accept_connections(int listen_fd)
 {
     // Accept new connections
@@ -402,13 +380,12 @@ void osd_messenger_t::accept_connections(int listen_fd)
    fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
    int one = 1;
    setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
-   clients[peer_fd] = new osd_client_t((osd_client_t){
-       .peer_addr = addr,
-       .peer_port = ntohs(addr.sin_port),
-       .peer_fd = peer_fd,
-       .peer_state = PEER_CONNECTED,
-       .in_buf = malloc_or_die(receive_buffer_size),
-   });
+   clients[peer_fd] = new osd_client_t();
+   clients[peer_fd]->peer_addr = addr;
+   clients[peer_fd]->peer_port = ntohs(addr.sin_port);
+   clients[peer_fd]->peer_fd = peer_fd;
+   clients[peer_fd]->peer_state = PEER_CONNECTED;
+   clients[peer_fd]->in_buf = malloc_or_die(receive_buffer_size);
    // Add FD to epoll
    tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
    {
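The to_stop/to_ping vectors in init() above exist to avoid erasing from 'clients' while an iterator over it is live. A tiny self-contained model of that deferral pattern (stand-in types, not the real messenger):

#include <map>
#include <vector>
#include <cstdio>

std::map<int, int> clients; // fd -> fake state

static void stop_client(int fd) { clients.erase(fd); }

int main()
{
    clients = {{3, 0}, {4, 0}, {5, 0}};
    std::vector<int> to_stop;
    for (auto & p: clients)       // iterator stays valid: no erase in here
        if (p.first != 4)
            to_stop.push_back(p.first);
    for (int fd: to_stop)         // safe: iteration already finished
        stop_client(fd);
    printf("%zu client(s) left\n", clients.size()); // 1
}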

src/messenger.h (new file, 162 lines)

@@ -0,0 +1,162 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
#pragma once
#include <sys/types.h>
#include <stdint.h>
#include <arpa/inet.h>
#include <set>
#include <map>
#include <deque>
#include <vector>
#include "malloc_or_die.h"
#include "json11/json11.hpp"
#include "msgr_op.h"
#include "timerfd_manager.h"
#include <ringloop.h>
#define CL_READ_HDR 1
#define CL_READ_DATA 2
#define CL_READ_REPLY_DATA 3
#define CL_WRITE_READY 1
#define CL_WRITE_REPLY 2
#define PEER_CONNECTING 1
#define PEER_CONNECTED 2
#define PEER_STOPPED 3
#define DEFAULT_PEER_CONNECT_INTERVAL 5
#define DEFAULT_PEER_CONNECT_TIMEOUT 5
#define DEFAULT_OSD_PING_TIMEOUT 5
struct osd_client_t
{
int refs = 0;
sockaddr_in peer_addr;
int peer_port;
int peer_fd;
int peer_state;
int connect_timeout_id = -1;
int ping_time_remaining = 0;
int idle_time_remaining = 0;
osd_num_t osd_num = 0;
void *in_buf = NULL;
// Read state
int read_ready = 0;
osd_op_t *read_op = NULL;
iovec read_iov = { 0 };
msghdr read_msg = { 0 };
int read_remaining = 0;
int read_state = 0;
osd_op_buf_list_t recv_list;
// Incoming operations
std::vector<osd_op_t*> received_ops;
// Outbound operations
std::map<uint64_t, osd_op_t*> sent_ops;
// PGs dirtied by this client's primary-writes
std::set<pool_pg_num_t> dirty_pgs;
// Write state
msghdr write_msg = { 0 };
int write_state = 0;
std::vector<iovec> send_list, next_send_list;
std::vector<osd_op_t*> outbox, next_outbox;
~osd_client_t()
{
free(in_buf);
in_buf = NULL;
}
};
struct osd_wanted_peer_t
{
json11::Json address_list;
int port;
time_t last_connect_attempt;
bool connecting, address_changed;
int address_index;
std::string cur_addr;
int cur_port;
};
struct osd_op_stats_t
{
uint64_t op_stat_sum[OSD_OP_MAX+1] = { 0 };
uint64_t op_stat_count[OSD_OP_MAX+1] = { 0 };
uint64_t op_stat_bytes[OSD_OP_MAX+1] = { 0 };
uint64_t subop_stat_sum[OSD_OP_MAX+1] = { 0 };
uint64_t subop_stat_count[OSD_OP_MAX+1] = { 0 };
};
struct osd_messenger_t
{
protected:
int keepalive_timer_id = -1;
// FIXME: make receive_buffer_size configurable
int receive_buffer_size = 64*1024;
int peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
int peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
int osd_idle_timeout = DEFAULT_OSD_PING_TIMEOUT;
int osd_ping_timeout = DEFAULT_OSD_PING_TIMEOUT;
int log_level = 0;
bool use_sync_send_recv = false;
std::vector<int> read_ready_clients;
std::vector<int> write_ready_clients;
std::vector<std::function<void()>> set_immediate;
public:
timerfd_manager_t *tfd;
ring_loop_t *ringloop;
// osd_num_t is only for logging and asserts
osd_num_t osd_num;
uint64_t next_subop_id = 1;
std::map<int, osd_client_t*> clients;
std::map<osd_num_t, osd_wanted_peer_t> wanted_peers;
std::map<uint64_t, int> osd_peer_fds;
// op statistics
osd_op_stats_t stats;
void init();
void parse_config(const json11::Json & config);
void connect_peer(uint64_t osd_num, json11::Json peer_state);
void stop_client(int peer_fd, bool force = false);
void outbox_push(osd_op_t *cur_op);
std::function<void(osd_op_t*)> exec_op;
std::function<void(osd_num_t)> repeer_pgs;
void read_requests();
void send_replies();
void accept_connections(int listen_fd);
~osd_messenger_t();
protected:
void try_connect_peer(uint64_t osd_num);
void try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port);
void handle_peer_epoll(int peer_fd, int epoll_events);
void handle_connect_epoll(int peer_fd);
void on_connect_peer(osd_num_t peer_osd, int peer_fd);
void check_peer_config(osd_client_t *cl);
void cancel_osd_ops(osd_client_t *cl);
void cancel_op(osd_op_t *op);
bool try_send(osd_client_t *cl);
void measure_exec(osd_op_t *cur_op);
void handle_send(int result, osd_client_t *cl);
bool handle_read(int result, osd_client_t *cl);
bool handle_finished_read(osd_client_t *cl);
void handle_op_hdr(osd_client_t *cl);
bool handle_reply_hdr(osd_client_t *cl);
void handle_reply_ready(osd_op_t *op);
};
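The two counters declared above (ping_time_remaining / idle_time_remaining) drive the keepalive: each timer tick decrements one of them, and hitting zero either sends a ping or kills the peer. A toy simulation of that tick logic with the defaults from this header (everything else is illustrative, and it models a peer that never answers):

#include <cstdio>

struct client_sim
{
    int ping_time_remaining = 0; // > 0: ping sent, waiting for a reply
    int idle_time_remaining = 5; // > 0: idle countdown until the next ping
};

int main()
{
    client_sim cl;
    const int osd_idle_timeout = 5, osd_ping_timeout = 5;
    for (int tick = 1; tick <= 12; tick++)
    {
        if (cl.ping_time_remaining > 0)
        {
            if (!--cl.ping_time_remaining)
            {
                printf("tick %d: ping timed out -> stop client\n", tick);
                break;
            }
        }
        else if (!--cl.idle_time_remaining)
        {
            printf("tick %d: idle -> send OSD_OP_PING\n", tick);
            cl.ping_time_remaining = osd_ping_timeout; // a reply would reset this to 0
            cl.idle_time_remaining = osd_idle_timeout;
        }
    }
}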

src/mock/build.sh (new file, 1 line)

@@ -0,0 +1 @@
g++ -D__MOCK__ -fsanitize=address -g -Wno-pointer-arith pg_states.cpp osd_ops.cpp test_cluster_client.cpp cluster_client.cpp msgr_op.cpp msgr_stop.cpp mock/messenger.cpp etcd_state_client.cpp timerfd_manager.cpp ../json11/json11.cpp -I mock -I . -I ..; ./a.out

src/mock/messenger.cpp (new file, 44 lines)

@@ -0,0 +1,44 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
#include <unistd.h>
#include <stdexcept>
#include <assert.h>
#include "messenger.h"
void osd_messenger_t::init()
{
}
osd_messenger_t::~osd_messenger_t()
{
while (clients.size() > 0)
{
stop_client(clients.begin()->first, true);
}
}
void osd_messenger_t::outbox_push(osd_op_t *cur_op)
{
clients[cur_op->peer_fd]->sent_ops[cur_op->req.hdr.id] = cur_op;
}
void osd_messenger_t::parse_config(const json11::Json & config)
{
}
void osd_messenger_t::connect_peer(uint64_t peer_osd, json11::Json peer_state)
{
wanted_peers[peer_osd] = (osd_wanted_peer_t){
.port = 1,
};
}
void osd_messenger_t::read_requests()
{
}
void osd_messenger_t::send_replies()
{
}
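A sketch of how a unit test might drive the client through this mock: outbox_push() records the op in sent_ops instead of writing to a socket, so the test can inspect it and complete it by hand. It assumes a build like src/mock/build.sh; the harness itself is hypothetical and not part of the patch:

#include <assert.h>
#include <functional>
#include "messenger.h"

int main()
{
    osd_messenger_t msgr;
    msgr.clients[1] = new osd_client_t();
    msgr.clients[1]->peer_fd = 1;
    msgr.clients[1]->peer_state = PEER_CONNECTED;
    osd_op_t *op = new osd_op_t();
    op->op_type = OSD_OP_OUT;
    op->peer_fd = 1;
    op->req.hdr.magic = SECONDARY_OSD_OP_MAGIC;
    op->req.hdr.id = msgr.next_subop_id++;
    op->req.hdr.opcode = OSD_OP_PING;
    bool done = false;
    op->callback = [&done](osd_op_t *op) { done = true; delete op; };
    msgr.outbox_push(op); // mock: just lands in sent_ops
    // The "network" round-trip is the test's job: complete the op manually
    osd_op_t *sent = msgr.clients[1]->sent_ops.begin()->second;
    msgr.clients[1]->sent_ops.clear();
    sent->reply.hdr.retval = 0;
    // Copy the lambda first, as cancel_op() does, so `delete op` is safe
    std::function<void(osd_op_t*)>(sent->callback)(sent);
    assert(done);
    return 0;
}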

src/mock/ringloop.h (new file, 25 lines)

@@ -0,0 +1,25 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
#pragma once
#include <functional>
struct ring_consumer_t
{
std::function<void(void)> loop;
};
class ring_loop_t
{
public:
void register_consumer(ring_consumer_t *consumer)
{
}
void unregister_consumer(ring_consumer_t *consumer)
{
}
void submit()
{
}
};

src/msgr_op.cpp (new file, 22 lines)

@@ -0,0 +1,22 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
#include <assert.h>
#include "msgr_op.h"
osd_op_t::~osd_op_t()
{
assert(!bs_op);
assert(!op_data);
if (rmw_buf)
{
free(rmw_buf);
}
if (buf)
{
// Note: reusing osd_op_t WILL currently lead to memory leaks
// So we don't reuse it, but free it every time
free(buf);
}
}


@@ -1,40 +1,21 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
 #pragma once
-#include <sys/types.h>
+#include <sys/uio.h>
 #include <stdint.h>
-#include <arpa/inet.h>
-#include <set>
-#include <map>
-#include <deque>
-#include <vector>
-#include "malloc_or_die.h"
-#include "json11/json11.hpp"
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
 #include "osd_ops.h"
-#include "timerfd_manager.h"
-#include "ringloop.h"
 #define OSD_OP_IN 0
 #define OSD_OP_OUT 1
-#define CL_READ_HDR 1
-#define CL_READ_DATA 2
-#define CL_READ_REPLY_DATA 3
-#define CL_WRITE_READY 1
-#define CL_WRITE_REPLY 2
 #define OSD_OP_INLINE_BUF_COUNT 16
-#define PEER_CONNECTING 1
-#define PEER_CONNECTED 2
-#define PEER_STOPPED 3
-#define DEFAULT_PEER_CONNECT_INTERVAL 5
-#define DEFAULT_PEER_CONNECT_TIMEOUT 5
 // Kind of a vector with small-list-optimisation
 struct osd_op_buf_list_t
 {
@@ -188,119 +169,3 @@ struct osd_op_t
     ~osd_op_t();
 };
-struct osd_client_t
-{
-    int refs = 0;
-    sockaddr_in peer_addr;
-    int peer_port;
-    int peer_fd;
-    int peer_state;
-    int connect_timeout_id = -1;
-    osd_num_t osd_num = 0;
-    void *in_buf = NULL;
-    // Read state
-    int read_ready = 0;
-    osd_op_t *read_op = NULL;
-    iovec read_iov = { 0 };
-    msghdr read_msg = { 0 };
-    int read_remaining = 0;
-    int read_state = 0;
-    osd_op_buf_list_t recv_list;
-    // Incoming operations
-    std::vector<osd_op_t*> received_ops;
-    // Outbound operations
-    std::map<uint64_t, osd_op_t*> sent_ops;
-    // PGs dirtied by this client's primary-writes
-    std::set<pool_pg_num_t> dirty_pgs;
-    // Write state
-    msghdr write_msg = { 0 };
-    int write_state = 0;
-    std::vector<iovec> send_list, next_send_list;
-    std::vector<osd_op_t*> outbox, next_outbox;
-};
-struct osd_wanted_peer_t
-{
-    json11::Json address_list;
-    int port;
-    time_t last_connect_attempt;
-    bool connecting, address_changed;
-    int address_index;
-    std::string cur_addr;
-    int cur_port;
-};
-struct osd_op_stats_t
-{
-    uint64_t op_stat_sum[OSD_OP_MAX+1] = { 0 };
-    uint64_t op_stat_count[OSD_OP_MAX+1] = { 0 };
-    uint64_t op_stat_bytes[OSD_OP_MAX+1] = { 0 };
-    uint64_t subop_stat_sum[OSD_OP_MAX+1] = { 0 };
-    uint64_t subop_stat_count[OSD_OP_MAX+1] = { 0 };
-};
-struct osd_messenger_t
-{
-    timerfd_manager_t *tfd;
-    ring_loop_t *ringloop;
-    // osd_num_t is only for logging and asserts
-    osd_num_t osd_num;
-    // FIXME: make receive_buffer_size configurable
-    int receive_buffer_size = 64*1024;
-    int peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
-    int peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
-    int log_level = 0;
-    bool use_sync_send_recv = false;
-    std::map<osd_num_t, osd_wanted_peer_t> wanted_peers;
-    std::map<uint64_t, int> osd_peer_fds;
-    uint64_t next_subop_id = 1;
-    std::map<int, osd_client_t*> clients;
-    std::vector<int> read_ready_clients;
-    std::vector<int> write_ready_clients;
-    std::vector<std::function<void()>> set_immediate;
-    // op statistics
-    osd_op_stats_t stats;
-public:
-    void connect_peer(uint64_t osd_num, json11::Json peer_state);
-    void stop_client(int peer_fd);
-    void outbox_push(osd_op_t *cur_op);
-    std::function<void(osd_op_t*)> exec_op;
-    std::function<void(osd_num_t)> repeer_pgs;
-    void handle_peer_epoll(int peer_fd, int epoll_events);
-    void read_requests();
-    void send_replies();
-    void accept_connections(int listen_fd);
-    ~osd_messenger_t();
-protected:
-    void try_connect_peer(uint64_t osd_num);
-    void try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port);
-    void handle_connect_epoll(int peer_fd);
-    void on_connect_peer(osd_num_t peer_osd, int peer_fd);
-    void check_peer_config(osd_client_t *cl);
-    void cancel_osd_ops(osd_client_t *cl);
-    void cancel_op(osd_op_t *op);
-    bool try_send(osd_client_t *cl);
-    void measure_exec(osd_op_t *cur_op);
-    void handle_send(int result, osd_client_t *cl);
-    bool handle_read(int result, osd_client_t *cl);
-    bool handle_finished_read(osd_client_t *cl);
-    void handle_op_hdr(osd_client_t *cl);
-    bool handle_reply_hdr(osd_client_t *cl);
-    void handle_reply_ready(osd_op_t *op);
-};


@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
 #include "messenger.h"
@@ -9,6 +9,10 @@ void osd_messenger_t::read_requests()
    {
        int peer_fd = read_ready_clients[i];
        osd_client_t *cl = clients[peer_fd];
+       if (cl->read_msg.msg_iovlen)
+       {
+           continue;
+       }
        if (cl->read_remaining < receive_buffer_size)
        {
            cl->read_iov.iov_base = cl->in_buf;
@@ -29,6 +33,7 @@ void osd_messenger_t::read_requests()
        io_uring_sqe* sqe = ringloop->get_sqe();
        if (!sqe)
        {
+           cl->read_msg.msg_iovlen = 0;
            read_ready_clients.erase(read_ready_clients.begin(), read_ready_clients.begin() + i);
            return;
        }
@@ -52,6 +57,7 @@ void osd_messenger_t::read_requests()
 bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
 {
    bool ret = false;
+   cl->read_msg.msg_iovlen = 0;
    cl->refs--;
    if (cl->peer_state == PEER_STOPPED)
    {
@@ -160,8 +166,14 @@ bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
    {
        if (cl->read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC)
            return handle_reply_hdr(cl);
-       else
+       else if (cl->read_op->req.hdr.magic == SECONDARY_OSD_OP_MAGIC)
            handle_op_hdr(cl);
+       else
+       {
+           printf("Received garbage: magic=%lx id=%lu opcode=%lx from %d\n", cl->read_op->req.hdr.magic, cl->read_op->req.hdr.id, cl->read_op->req.hdr.opcode, cl->peer_fd);
+           stop_client(cl->peer_fd);
+           return false;
+       }
    }
    else if (cl->read_state == CL_READ_DATA)
    {
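The msg_iovlen guard added above acts as an "already submitted" flag: a non-zero value marks a read that is still in flight in io_uring, so a second pass over read_ready_clients must skip the client instead of queueing a duplicate recvmsg into the same buffer. A minimal model of why the flag must also be cleared in handle_read() (stand-in types only):

#include <cstdio>

struct client_model
{
    unsigned msg_iovlen = 0; // doubles as the "read in flight" flag
};

static bool submit_read(client_model & cl)
{
    if (cl.msg_iovlen)
        return false;    // already submitted, don't double-queue
    cl.msg_iovlen = 1;   // set when the iovec is filled for recvmsg
    return true;
}

int main()
{
    client_model cl;
    printf("%d\n", submit_read(cl)); // 1: submitted
    printf("%d\n", submit_read(cl)); // 0: skipped, completion still pending
    cl.msg_iovlen = 0;               // handle_read() clears the flag
    printf("%d\n", submit_read(cl)); // 1: can submit again
}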


@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
 #define _XOPEN_SOURCE
 #include <limits.h>
@@ -46,7 +46,8 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
        to_send_list.push_back((iovec){ .iov_base = cur_op->req.buf, .iov_len = OSD_PACKET_SIZE });
        cl->sent_ops[cur_op->req.hdr.id] = cur_op;
    }
-   // Pre-defined send_lists
+   to_outbox.push_back(NULL);
+   // Operation data
    if ((cur_op->op_type == OSD_OP_IN
        ? (cur_op->req.hdr.opcode == OSD_OP_READ ||
           cur_op->req.hdr.opcode == OSD_OP_SEC_READ ||
@@ -58,17 +59,17 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
           cur_op->req.hdr.opcode == OSD_OP_SEC_STABILIZE ||
           cur_op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK)) && cur_op->iov.count > 0)
    {
-       to_outbox.push_back(NULL);
        for (int i = 0; i < cur_op->iov.count; i++)
        {
            assert(cur_op->iov.buf[i].iov_base);
            to_send_list.push_back(cur_op->iov.buf[i]);
-           to_outbox.push_back(i == cur_op->iov.count-1 ? cur_op : NULL);
+           to_outbox.push_back(NULL);
        }
    }
-   else if (cur_op->op_type == OSD_OP_IN)
+   if (cur_op->op_type == OSD_OP_IN)
    {
-       to_outbox.push_back(cur_op);
+       // To free it later
+       to_outbox[to_outbox.size()-1] = cur_op;
    }
    if (!ringloop)
    {
@@ -92,6 +93,10 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
 void osd_messenger_t::measure_exec(osd_op_t *cur_op)
 {
    // Measure execution latency
+   if (cur_op->req.hdr.opcode > OSD_OP_MAX)
+   {
+       return;
+   }
    timespec tv_end;
    clock_gettime(CLOCK_REALTIME, &tv_end);
    stats.op_stat_count[cur_op->req.hdr.opcode]++;
@@ -175,7 +180,7 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
    cl->refs--;
    if (cl->peer_state == PEER_STOPPED)
    {
-       if (!cl->refs)
+       if (cl->refs <= 0)
        {
            delete cl;
        }
@@ -198,11 +203,8 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
    {
        if (cl->outbox[done])
        {
-           // Operation fully sent
-           if (cl->outbox[done]->op_type == OSD_OP_IN)
-           {
-               delete cl->outbox[done];
-           }
+           // Reply fully sent
+           delete cl->outbox[done];
        }
        result -= iov.iov_len;
        done++;
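The NULL-padded to_outbox above keeps exactly one osd_op_t* slot per iovec in to_send_list, with the marker placed only on an operation's final iovec, so handle_send() can free a reply exactly once, when its last byte has really left the socket. A compact model of that bookkeeping (stand-in types, reply-style ops):

#include <cstdio>
#include <vector>

struct op_model { const char *name; };

int main()
{
    std::vector<int> send_list;      // iovec sizes (model only)
    std::vector<op_model*> outbox;   // parallel markers, NULL = not last iovec
    op_model pong{"pong reply"}, readv{"read reply"};
    // header-only reply: its single iovec carries the marker
    send_list.push_back(64);   outbox.push_back(&pong);
    // reply with payload: header slot NULL, last payload slot marked
    send_list.push_back(64);   outbox.push_back(nullptr);
    send_list.push_back(4096); outbox.push_back(nullptr);
    send_list.push_back(4096); outbox.push_back(&readv);
    // after sendmsg() confirms all four iovecs, free each reply exactly once
    for (size_t done = 0; done < send_list.size(); done++)
        if (outbox[done])
            printf("'%s' fully sent after iovec %zu\n", outbox[done]->name, done);
}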

src/msgr_stop.cpp (new file, 137 lines)

@@ -0,0 +1,137 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
#include <unistd.h>
#include <assert.h>
#include "messenger.h"
void osd_messenger_t::cancel_osd_ops(osd_client_t *cl)
{
std::vector<osd_op_t*> cancel_ops;
cancel_ops.resize(cl->sent_ops.size());
int i = 0;
for (auto p: cl->sent_ops)
{
cancel_ops[i++] = p.second;
}
cl->sent_ops.clear();
cl->outbox.clear();
for (auto op: cancel_ops)
{
cancel_op(op);
}
}
void osd_messenger_t::cancel_op(osd_op_t *op)
{
if (op->op_type == OSD_OP_OUT)
{
op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
op->reply.hdr.id = op->req.hdr.id;
op->reply.hdr.opcode = op->req.hdr.opcode;
op->reply.hdr.retval = -EPIPE;
// Copy lambda to be unaffected by `delete op`
std::function<void(osd_op_t*)>(op->callback)(op);
}
else
{
// This function is only called in stop_client(), so it's fine to destroy the operation
delete op;
}
}
void osd_messenger_t::stop_client(int peer_fd, bool force)
{
assert(peer_fd != 0);
auto it = clients.find(peer_fd);
if (it == clients.end())
{
return;
}
osd_client_t *cl = it->second;
if (cl->peer_state == PEER_CONNECTING && !force || cl->peer_state == PEER_STOPPED)
{
return;
}
if (log_level > 0)
{
if (cl->osd_num)
{
printf("[OSD %lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl->osd_num);
}
else
{
printf("[OSD %lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
}
}
// First set state to STOPPED so another stop_client() call doesn't try to free it again
cl->refs++;
cl->peer_state = PEER_STOPPED;
if (cl->osd_num)
{
// ...and forget OSD peer
osd_peer_fds.erase(cl->osd_num);
}
#ifndef __MOCK__
// Then remove FD from the eventloop so we don't accidentally read something
tfd->set_fd_handler(peer_fd, false, NULL);
if (cl->connect_timeout_id >= 0)
{
tfd->clear_timer(cl->connect_timeout_id);
cl->connect_timeout_id = -1;
}
for (auto rit = read_ready_clients.begin(); rit != read_ready_clients.end(); rit++)
{
if (*rit == peer_fd)
{
read_ready_clients.erase(rit);
break;
}
}
for (auto wit = write_ready_clients.begin(); wit != write_ready_clients.end(); wit++)
{
if (*wit == peer_fd)
{
write_ready_clients.erase(wit);
break;
}
}
#endif
if (cl->osd_num)
{
// Then repeer PGs because cancel_op() callbacks can try to perform
// some actions and we need correct PG states to not do something silly
repeer_pgs(cl->osd_num);
}
// Then cancel all operations
if (cl->read_op)
{
if (!cl->read_op->callback)
{
delete cl->read_op;
}
cl->read_op = NULL;
}
if (cl->osd_num)
{
// Cancel outbound operations
cancel_osd_ops(cl);
}
#ifndef __MOCK__
// And close the FD only when everything is done
// ...because peer_fd number can get reused after close()
close(peer_fd);
#endif
// Find the item again because it can be invalidated at this point
it = clients.find(peer_fd);
if (it != clients.end())
{
clients.erase(it);
}
cl->refs--;
if (cl->refs <= 0)
{
delete cl;
}
}
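stop_client() above guards itself with refs++/refs-- plus the early PEER_STOPPED return, so callbacks that run in between (repeer_pgs, cancel_op) can call stop_client() again without the client being freed under an active stack frame. A minimal model of why that makes reentrant calls safe (stand-in types and states):

#include <cstdio>

struct client_model
{
    int refs = 0;
    int peer_state = 2; // stands in for PEER_CONNECTED
};

static void stop(client_model *&cl)
{
    if (!cl || cl->peer_state == 3) // stopped: a second call is a no-op
        return;
    cl->refs++;          // hold a reference across the callbacks
    cl->peer_state = 3;  // mark stopped before anything reentrant runs
    stop(cl);            // reentrant call from a cancel callback: returns early
    cl->refs--;
    if (cl->refs <= 0)
    {
        printf("deleting client exactly once\n");
        delete cl;
        cl = nullptr;
    }
}

int main()
{
    client_model *cl = new client_model();
    stop(cl);
    return 0;
}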


@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 (see README.md for details)
+// License: VNPL-1.1 (see README.md for details)
 // Similar to qemu-nbd, but sets timeout and uses io_uring
 #include <linux/nbd.h>
@@ -111,7 +111,7 @@ public:
    {
        printf(
            "Vitastor NBD proxy\n"
-           "(c) Vitaliy Filippov, 2020 (VNPL-1.0)\n\n"
+           "(c) Vitaliy Filippov, 2020 (VNPL-1.1)\n\n"
            "USAGE:\n"
            "  %s map --etcd_address <etcd_address> --pool <pool> --inode <inode> --size <size in bytes>\n"
            "  %s unmap /dev/nbd0\n"


@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
 #pragma once


@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 (see README.md for details)
+// License: VNPL-1.1 (see README.md for details)
 #include <sys/socket.h>
 #include <sys/poll.h>
@@ -8,16 +8,20 @@
 #include <arpa/inet.h>
 #include "osd.h"
+#include "http_client.h"
-osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop)
+osd_t::osd_t(blockstore_config_t & config, ring_loop_t *ringloop)
 {
+    config["entry_attr_size"] = "0";
     this->config = config;
-    this->bs = bs;
     this->ringloop = ringloop;
+    // FIXME: Create Blockstore from on-disk superblock config and check it against the OSD cluster config
+    this->bs = new blockstore_t(config, ringloop);
     this->bs_block_size = bs->get_block_size();
-    // FIXME: use bitmap granularity instead
-    this->bs_disk_alignment = bs->get_disk_alignment();
+    this->bs_bitmap_granularity = bs->get_bitmap_granularity();
     parse_config(config);
@@ -37,6 +41,7 @@ osd_t::osd_t(blockstore_config_t & config, ring_loop_t *ringloop)
     c_cli.ringloop = this->ringloop;
     c_cli.exec_op = [this](osd_op_t *op) { exec_op(op); };
     c_cli.repeer_pgs = [this](osd_num_t peer_osd) { repeer_pgs(peer_osd); };
+    c_cli.init();
     init_cluster();
@@ -48,6 +53,7 @@ osd_t::~osd_t()
 {
     ringloop->unregister_consumer(&consumer);
     delete epmgr;
+    delete bs;
     close(listen_fd);
 }
@@ -55,6 +61,7 @@ void osd_t::parse_config(blockstore_config_t & config)
 {
     if (config.find("log_level") == config.end())
         config["log_level"] = "1";
+    log_level = strtoull(config["log_level"].c_str(), NULL, 10);
     // Initial startup configuration
     json11::Json json_config = json11::Json(config);
     st_cli.parse_config(json_config);
@@ -66,6 +73,8 @@ void osd_t::parse_config(blockstore_config_t & config)
        throw std::runtime_error("osd_num is required in the configuration");
    c_cli.osd_num = osd_num;
    run_primary = config["run_primary"] != "false" && config["run_primary"] != "0" && config["run_primary"] != "no";
+   no_rebalance = config["no_rebalance"] == "true" || config["no_rebalance"] == "1" || config["no_rebalance"] == "yes";
+   no_recovery = config["no_recovery"] == "true" || config["no_recovery"] == "1" || config["no_recovery"] == "yes";
    // Cluster configuration
    bind_address = config["bind_address"];
    if (bind_address == "")
@@ -92,6 +101,9 @@ void osd_t::parse_config(blockstore_config_t & config)
    recovery_queue_depth = strtoull(config["recovery_queue_depth"].c_str(), NULL, 10);
    if (recovery_queue_depth < 1 || recovery_queue_depth > MAX_RECOVERY_QUEUE)
        recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
+   recovery_sync_batch = strtoull(config["recovery_sync_batch"].c_str(), NULL, 10);
+   if (recovery_sync_batch < 1 || recovery_sync_batch > MAX_RECOVERY_QUEUE)
+       recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
    if (config["readonly"] == "true" || config["readonly"] == "1" || config["readonly"] == "yes")
        readonly = true;
    print_stats_interval = strtoull(config["print_stats_interval"].c_str(), NULL, 10);
@@ -100,14 +112,7 @@ void osd_t::parse_config(blockstore_config_t & config)
    slow_log_interval = strtoull(config["slow_log_interval"].c_str(), NULL, 10);
    if (!slow_log_interval)
        slow_log_interval = 10;
-   c_cli.peer_connect_interval = strtoull(config["peer_connect_interval"].c_str(), NULL, 10);
-   if (!c_cli.peer_connect_interval)
-       c_cli.peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
-   c_cli.peer_connect_timeout = strtoull(config["peer_connect_timeout"].c_str(), NULL, 10);
-   if (!c_cli.peer_connect_timeout)
-       c_cli.peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
-   log_level = strtoull(config["log_level"].c_str(), NULL, 10);
-   c_cli.log_level = log_level;
+   c_cli.parse_config(json_config);
 }
 void osd_t::bind_socket()
@@ -171,7 +176,7 @@ bool osd_t::shutdown()
    {
        return false;
    }
-   return bs->is_safe_to_stop();
+   return !bs || bs->is_safe_to_stop();
 }
 void osd_t::loop()
@@ -191,6 +196,8 @@ void osd_t::exec_op(osd_op_t *cur_op)
        delete cur_op;
        return;
    }
+   // Clear the reply buffer
+   memset(cur_op->reply.buf, 0, OSD_PACKET_SIZE);
    inflight_ops++;
    if (cur_op->req.hdr.magic != SECONDARY_OSD_OP_MAGIC ||
        cur_op->req.hdr.opcode < OSD_OP_MIN || cur_op->req.hdr.opcode > OSD_OP_MAX ||
@@ -198,19 +205,25 @@ void osd_t::exec_op(osd_op_t *cur_op)
        cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
        cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE) &&
        (cur_op->req.sec_rw.len > OSD_RW_MAX ||
-        cur_op->req.sec_rw.len % bs_disk_alignment ||
-        cur_op->req.sec_rw.offset % bs_disk_alignment)) ||
+        cur_op->req.sec_rw.len % bs_bitmap_granularity ||
+        cur_op->req.sec_rw.offset % bs_bitmap_granularity)) ||
       ((cur_op->req.hdr.opcode == OSD_OP_READ ||
        cur_op->req.hdr.opcode == OSD_OP_WRITE ||
        cur_op->req.hdr.opcode == OSD_OP_DELETE) &&
        (cur_op->req.rw.len > OSD_RW_MAX ||
-        cur_op->req.rw.len % bs_disk_alignment ||
-        cur_op->req.rw.offset % bs_disk_alignment)))
+        cur_op->req.rw.len % bs_bitmap_granularity ||
+        cur_op->req.rw.offset % bs_bitmap_granularity)))
    {
        // Bad command
        finish_op(cur_op, -EINVAL);
        return;
    }
+   if (cur_op->req.hdr.opcode == OSD_OP_PING)
+   {
+       // Pong
+       finish_op(cur_op, 0);
+       return;
+   }
    if (readonly &&
        cur_op->req.hdr.opcode != OSD_OP_SEC_READ &&
        cur_op->req.hdr.opcode != OSD_OP_SEC_LIST &&
@@ -261,9 +274,9 @@ void osd_t::reset_stats()
 void osd_t::print_stats()
 {
-   for (int i = 0; i <= OSD_OP_MAX; i++)
+   for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
    {
-       if (c_cli.stats.op_stat_count[i] != prev_stats.op_stat_count[i])
+       if (c_cli.stats.op_stat_count[i] != prev_stats.op_stat_count[i] && i != OSD_OP_PING)
        {
           uint64_t avg = (c_cli.stats.op_stat_sum[i] - prev_stats.op_stat_sum[i])/(c_cli.stats.op_stat_count[i] - prev_stats.op_stat_count[i]);
          uint64_t bw = (c_cli.stats.op_stat_bytes[i] - prev_stats.op_stat_bytes[i]) / print_stats_interval;
@@ -284,7 +297,7 @@ void osd_t::print_stats()
            prev_stats.op_stat_bytes[i] = c_cli.stats.op_stat_bytes[i];
        }
    }
-   for (int i = 0; i <= OSD_OP_MAX; i++)
+   for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
    {
        if (c_cli.stats.subop_stat_count[i] != prev_stats.subop_stat_count[i])
        {
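The added memset clears the whole reply packet before an op executes: reply headers are fixed-size, and an opcode-specific reply may fill only part of the buffer, so zeroing first keeps the unused tail deterministic instead of leaking stale stack or heap bytes to the peer. A small self-contained illustration (the union here is a simplified stand-in for osd_any_reply_t, and PACKET_SIZE is illustrative):

#include <cstring>
#include <cstdio>
#include <stdint.h>

#define PACKET_SIZE 128

union any_reply_model
{
    struct { uint64_t magic, id, opcode; int64_t retval; } hdr;
    uint8_t buf[PACKET_SIZE];
};

int main()
{
    any_reply_model reply;
    memset(reply.buf, 0, PACKET_SIZE); // as in exec_op() above
    reply.hdr.retval = 0;              // fill only what this op needs
    // every remaining byte is now guaranteed to be zero:
    for (int i = sizeof(reply.hdr); i < PACKET_SIZE; i++)
        if (reply.buf[i] != 0)
            printf("dirty byte at %d\n", i);
    printf("reply tail is clean\n");
}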


@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 (see README.md for details)
+// License: VNPL-1.1 (see README.md for details)
 #pragma once
@@ -37,6 +37,7 @@
 #define DEFAULT_AUTOSYNC_INTERVAL 5
 #define MAX_RECOVERY_QUEUE 2048
 #define DEFAULT_RECOVERY_QUEUE 4
+#define DEFAULT_RECOVERY_BATCH 16
 //#define OSD_STUB
@@ -64,6 +65,8 @@ class osd_t
     bool readonly = false;
     osd_num_t osd_num = 1; // OSD numbers start with 1
     bool run_primary = false;
+    bool no_rebalance = false;
+    bool no_recovery = false;
     std::string bind_address;
     int bind_port, listen_backlog;
     // FIXME: Implement client queue depth limit
@@ -74,6 +77,7 @@ class osd_t
     int immediate_commit = IMMEDIATE_NONE;
     int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // sync every 5 seconds
     int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
+    int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
     int log_level = 0;
     // cluster state
@@ -95,9 +99,11 @@ class osd_t
     std::map<pool_pg_num_t, pg_t> pgs;
     std::set<pool_pg_num_t> dirty_pgs;
     std::set<osd_num_t> dirty_osds;
+    int copies_to_delete_after_sync_count = 0;
     uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0;
     int peering_state = 0;
     std::map<object_id, osd_recovery_op_t> recovery_ops;
+    int recovery_done = 0;
     osd_op_t *autosync_op = NULL;
     // Unstable writes
@@ -109,7 +115,7 @@ class osd_t
     bool stopping = false;
     int inflight_ops = 0;
     blockstore_t *bs;
-    uint32_t bs_block_size, bs_disk_alignment;
+    uint32_t bs_block_size, bs_bitmap_granularity;
     ring_loop_t *ringloop;
     timerfd_manager_t *tfd = NULL;
     epoll_manager_t *epmgr = NULL;
@@ -160,6 +166,7 @@ class osd_t
     void submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps);
     void discard_list_subop(osd_op_t *list_op);
     bool stop_pg(pg_t & pg);
+    void reset_pg(pg_t & pg);
     void finish_stop_pg(pg_t & pg);
     // flushing, recovery and backfill
@@ -191,6 +198,7 @@ class osd_t
     void continue_primary_del(osd_op_t *cur_op);
     bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
     void remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg);
+    void free_object_state(pg_t & pg, pg_osd_set_state_t **object_state);
     bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
     void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);
     void handle_primary_bs_subop(osd_op_t *subop);
@@ -198,9 +206,12 @@ class osd_t
     void pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid, int retval);
     void submit_primary_subops(int submit_type, uint64_t op_version, int pg_size, const uint64_t* osd_set, osd_op_t *cur_op);
     void submit_primary_del_subops(osd_op_t *cur_op, uint64_t *cur_set, uint64_t set_size, pg_osd_set_t & loc_set);
-    void submit_primary_sync_subops(osd_op_t *cur_op);
+    void submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_delete, int chunks_to_delete_count);
+    int submit_primary_sync_subops(osd_op_t *cur_op);
     void submit_primary_stab_subops(osd_op_t *cur_op);
+    uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state);
     inline pg_num_t map_to_pg(object_id oid, uint64_t pg_stripe_size)
     {
         uint64_t pg_count = pg_counts[INODE_POOL(oid.inode)];
@@ -210,7 +221,7 @@ class osd_t
     }
 public:
-    osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop);
+    osd_t(blockstore_config_t & config, ring_loop_t *ringloop);
     ~osd_t();
     void force_stop(int exitcode);
     bool shutdown();


@@ -1,9 +1,10 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 (see README.md for details)
+// License: VNPL-1.1 (see README.md for details)
 #include "osd.h"
 #include "base64.h"
 #include "etcd_state_client.h"
+#include "http_client.h"
 #include "osd_rmw.h"
 // Startup sequence:
@@ -37,7 +38,7 @@ void osd_t::init_cluster()
        .pg_cursize = 0,
        .pg_size = 3,
        .pg_minsize = 2,
-       .parity_chunks = 1,
+       .pg_data_size = 2,
        .pool_id = 1,
        .pg_num = 1,
        .target_set = { 1, 2, 3 },
@@ -142,7 +143,7 @@ json11::Json osd_t::get_statistics()
    }
    st["host"] = self_state["host"];
    json11::Json::object op_stats, subop_stats;
-   for (int i = 0; i <= OSD_OP_MAX; i++)
+   for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
    {
        op_stats[osd_op_names[i]] = json11::Json::object {
            { "count", c_cli.stats.op_stat_count[i] },
@@ -150,7 +151,7 @@ json11::Json osd_t::get_statistics()
            { "bytes", c_cli.stats.op_stat_bytes[i] },
        };
    }
-   for (int i = 0; i <= OSD_OP_MAX; i++)
+   for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
    {
        subop_stats[osd_op_names[i]] = json11::Json::object {
            { "count", c_cli.stats.subop_stat_count[i] },
@@ -384,6 +385,7 @@ void osd_t::create_osd_state()
        {
            st_cli.load_pgs();
        }
+       report_statistics();
    });
 }
@@ -494,7 +496,11 @@ void osd_t::apply_pg_count()
    }
    if (still_active > 0)
    {
-       printf("[OSD %lu] PG count change detected, but %d PG(s) are still active. This is not allowed. Exiting\n", this->osd_num, still_active);
+       printf(
+           "[OSD %lu] PG count change detected for pool %u (new is %lu, old is %u),"
+           " but %u PG(s) are still active. This is not allowed. Exiting\n",
+           this->osd_num, pool_item.first, pool_item.second.real_pg_count, pg_counts[pool_item.first], still_active
+       );
        force_stop(1);
        return;
    }
@@ -552,7 +558,7 @@ void osd_t::apply_pg_config()
    }
    if (currently_taken)
    {
-       if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING))
+       if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING | PG_REPEERING))
        {
            if (pg_it->second.target_set == pg_cfg.target_set)
            {
@@ -588,7 +594,10 @@ void osd_t::apply_pg_config()
        }
        else
        {
-           throw std::runtime_error("Unexpected PG "+std::to_string(pg_num)+" state: "+std::to_string(pg_it->second.state));
+           throw std::runtime_error(
+               "Unexpected PG "+std::to_string(pool_id)+"/"+std::to_string(pg_num)+
+               " state: "+std::to_string(pg_it->second.state)
+           );
        }
    }
    auto & pg = this->pgs[{ .pool_id = pool_id, .pg_num = pg_num }];
@@ -598,7 +607,8 @@ void osd_t::apply_pg_config()
        .pg_cursize = 0,
        .pg_size = pool_item.second.pg_size,
        .pg_minsize = pool_item.second.pg_minsize,
-       .parity_chunks = pool_item.second.parity_chunks,
+       .pg_data_size = pg.scheme == POOL_SCHEME_REPLICATED
+           ? 1 : pool_item.second.pg_size - pool_item.second.parity_chunks,
        .pool_id = pool_id,
        .pg_num = pg_num,
        .reported_epoch = pg_cfg.epoch,
@@ -608,7 +618,7 @@ void osd_t::apply_pg_config()
    };
    if (pg.scheme == POOL_SCHEME_JERASURE)
    {
-       use_jerasure(pg.pg_size, pg.pg_size-pg.parity_chunks, true);
+       use_jerasure(pg.pg_size, pg.pg_data_size, true);
    }
    this->pg_state_dirty.insert({ .pool_id = pool_id, .pg_num = pg_num });
    pg.print_state();
@@ -656,7 +666,21 @@ void osd_t::report_pg_states()
    auto & pg = pg_it->second;
    reporting_pgs.push_back({ *it, pg.history_changed });
    std::string state_key_base64 = base64_encode(st_cli.etcd_prefix+"/pg/state/"+std::to_string(pg.pool_id)+"/"+std::to_string(pg.pg_num));
-   if (pg.state == PG_STARTING)
+   bool pg_state_exists = false;
+   if (pg.state != PG_STARTING)
+   {
+       auto pool_it = st_cli.pool_config.find(pg.pool_id);
+       if (pool_it != st_cli.pool_config.end())
+       {
+           auto pg_it = pool_it->second.pg_config.find(pg.pg_num);
+           if (pg_it != pool_it->second.pg_config.end() &&
+               pg_it->second.cur_state != 0)
+           {
+               pg_state_exists = true;
+           }
+       }
+   }
+   if (!pg_state_exists)
    {
        // Check that the PG key does not exist
        // Failed check indicates an unsuccessful PG lock attempt in this case
@@ -668,9 +692,7 @@ void osd_t::report_pg_states()
    }
    else
    {
-       // Check that the key is ours
-       // Failed check indicates success for OFFLINE pgs (PG lock is already deleted)
-       // and an unexpected race condition for started pgs (PG lock is held by someone else)
+       // Check that the key is ours if it already exists
        checks.push_back(json11::Json::object {
            { "target", "LEASE" },
            { "lease", etcd_lease_id },
@@ -792,17 +814,16 @@ void osd_t::report_pg_states()
    for (auto pp: reporting_pgs)
    {
        auto pg_it = this->pgs.find(pp.first);
-       if (pg_it != this->pgs.end())
+       if (pg_it != this->pgs.end() &&
+           pg_it->second.state == PG_OFFLINE &&
+           pg_state_dirty.find(pp.first) == pg_state_dirty.end())
        {
-           if (pg_it->second.state == PG_OFFLINE)
+           // Forget offline PGs after reporting their state
+           if (pg_it->second.scheme == POOL_SCHEME_JERASURE)
            {
-               // Remove offline PGs after reporting their state
-               this->pgs.erase(pg_it);
-               if (pg_it->second.scheme == POOL_SCHEME_JERASURE)
-               {
-                   use_jerasure(pg_it->second.pg_size, pg_it->second.pg_size-pg_it->second.parity_chunks, false);
-               }
+               use_jerasure(pg_it->second.pg_size, pg_it->second.pg_data_size, false);
            }
+           this->pgs.erase(pg_it);
        }
    }
    // Push other PG state updates, if any
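The LEASE check assembled above is an etcd v3 transaction guard: the write is only applied if the PG state key is held under this OSD's own lease. A standalone json11 sketch of its shape (the key, the lease id, and the "compare" wrapper here are made up for illustration and do not come from the patch):

#include "json11/json11.hpp"
#include <cstdio>
#include <string>

int main()
{
    uint64_t etcd_lease_id = 0x1234; // hypothetical lease id
    json11::Json::array checks;
    checks.push_back(json11::Json::object {
        { "target", "LEASE" },
        { "lease", std::to_string(etcd_lease_id) },
        { "key", "/vitastor/pg/state/1/1" },
    });
    json11::Json txn = json11::Json::object {
        { "compare", checks },
        // { "success", ... } would carry the actual PG state update
    };
    printf("%s\n", txn.dump().c_str());
}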


@@ -1,5 +1,5 @@
// Copyright (c) Vitaliy Filippov, 2019+ // Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.0 (see README.md for details) // License: VNPL-1.1 (see README.md for details)
#include "osd.h" #include "osd.h"
@@ -95,7 +95,7 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
 {
     // This flush batch is done
     std::vector<osd_op_t*> continue_ops;
-    auto & pg = pgs[pg_id];
+    auto & pg = pgs.at(pg_id);
     auto it = pg.flush_actions.begin(), prev_it = it;
     auto erase_start = it;
     while (1)
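
Replacing pgs[pg_id] with pgs.at(pg_id) is a small hardening change: std::map::operator[] silently default-constructs a value for a missing key, while at() throws std::out_of_range, turning a lookup for a vanished PG into an immediate, diagnosable failure instead of a phantom empty pg_t. In miniature:

    #include <map>

    int main()
    {
        std::map<int, int> m;
        int a = m[5];    // quietly inserts {5, 0} and returns 0
        (void)a;
        // m.at(6);      // would throw std::out_of_range instead
        return 0;
    }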
@@ -149,10 +149,14 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
         {
             continue_primary_write(op);
         }
-        if (pg.inflight == 0 && (pg.state & PG_STOPPING))
+        if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
         {
             finish_stop_pg(pg);
         }
+        else if ((pg.state & PG_REPEERING) && pg.inflight == 0 && !pg.flush_batch)
+        {
+            start_pg_peering(pg);
+        }
     }
 }
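
Both exit paths now require the same quiescence condition: no in-flight operations and no pending flush batch. Only then may a stopping PG finish stopping, and (new in this change) a re-peering PG restart peering instead of acting on a state that is still changing underneath it. The shared predicate, written out as a hypothetical helper:

    // Hypothetical predicate matching the two guards above; pg_t comes
    // from the project's headers
    static inline bool pg_quiescent(const pg_t & pg)
    {
        return pg.inflight == 0 && !pg.flush_batch;
    }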
@@ -209,32 +213,39 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
 bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
 {
-    for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
+    if (!no_recovery)
     {
-        if ((pg_it->second.state & (PG_ACTIVE | PG_HAS_DEGRADED)) == (PG_ACTIVE | PG_HAS_DEGRADED))
+        for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
         {
-            for (auto obj_it = pg_it->second.degraded_objects.begin(); obj_it != pg_it->second.degraded_objects.end(); obj_it++)
+            if ((pg_it->second.state & (PG_ACTIVE | PG_HAS_DEGRADED)) == (PG_ACTIVE | PG_HAS_DEGRADED))
             {
-                if (recovery_ops.find(obj_it->first) == recovery_ops.end())
+                for (auto obj_it = pg_it->second.degraded_objects.begin(); obj_it != pg_it->second.degraded_objects.end(); obj_it++)
                 {
-                    op.degraded = true;
-                    op.oid = obj_it->first;
-                    return true;
+                    if (recovery_ops.find(obj_it->first) == recovery_ops.end())
+                    {
+                        op.degraded = true;
+                        op.oid = obj_it->first;
+                        return true;
+                    }
                 }
             }
         }
     }
-    for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
+    if (!no_rebalance)
     {
-        if ((pg_it->second.state & (PG_ACTIVE | PG_HAS_MISPLACED)) == (PG_ACTIVE | PG_HAS_MISPLACED))
+        for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
         {
-            for (auto obj_it = pg_it->second.misplaced_objects.begin(); obj_it != pg_it->second.misplaced_objects.end(); obj_it++)
+            // Don't try to "recover" misplaced objects if "recovery" would make them degraded
+            if ((pg_it->second.state & (PG_ACTIVE | PG_DEGRADED | PG_HAS_MISPLACED)) == (PG_ACTIVE | PG_HAS_MISPLACED))
             {
-                if (recovery_ops.find(obj_it->first) == recovery_ops.end())
+                for (auto obj_it = pg_it->second.misplaced_objects.begin(); obj_it != pg_it->second.misplaced_objects.end(); obj_it++)
                 {
-                    op.degraded = false;
-                    op.oid = obj_it->first;
-                    return true;
+                    if (recovery_ops.find(obj_it->first) == recovery_ops.end())
+                    {
+                        op.degraded = false;
+                        op.oid = obj_it->first;
+                        return true;
+                    }
                 }
             }
         }
     }
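
The reorganised function also encodes a policy in its bitmask: by including PG_DEGRADED in the mask but not in the expected value, the misplaced-object pass selects only PGs that are active, have misplaced objects, and are not degraded, since moving a misplaced copy away from a degraded PG would reduce redundancy further. The new no_recovery and no_rebalance flags additionally allow disabling either pass outright. The mask idiom, checked with illustrative flag values (not the real PG_* constants):

    #include <cassert>
    #include <cstdint>

    int main()
    {
        const uint32_t ACTIVE = 1, DEGRADED = 2, HAS_MISPLACED = 4; // illustrative values
        const uint32_t mask = ACTIVE | DEGRADED | HAS_MISPLACED;
        const uint32_t want = ACTIVE | HAS_MISPLACED;
        assert(((ACTIVE | HAS_MISPLACED) & mask) == want);            // eligible for rebalance
        assert(((ACTIVE | DEGRADED | HAS_MISPLACED) & mask) != want); // skipped while degraded
        return 0;
    }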
@@ -264,7 +275,6 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
     }
     op->osd_op->callback = [this, op](osd_op_t *osd_op)
     {
-        // Don't sync the write, it will be synced by our regular sync coroutine
         if (osd_op->reply.hdr.retval < 0)
         {
             // Error recovering object
@@ -286,6 +296,17 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
         op->osd_op = NULL;
         recovery_ops.erase(op->oid);
         delete osd_op;
+        if (immediate_commit != IMMEDIATE_ALL)
+        {
+            recovery_done++;
+            if (recovery_done >= recovery_sync_batch)
+            {
+                // Force a sync every <recovery_sync_batch> operations,
+                // otherwise uncommitted delete operations pile up without bound
+                autosync();
+                recovery_done = 0;
+            }
+        }
         continue_recovery();
     };
     exec_op(op->osd_op);
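
When immediate_commit is not "all", recovery writes and the delete operations that follow them remain uncommitted until a sync, so they accumulate in the journal. The counter added above bounds that growth by forcing an autosync after every recovery_sync_batch completed recovery operations. The pattern reduced to a skeleton (the threshold value shown is an assumption; it is configurable):

    // Skeleton of the batched-sync pattern used above
    void autosync(); // provided by the OSD; forces a journal sync

    int recovery_done = 0;
    int recovery_sync_batch = 16; // assumed default, configurable

    void on_recovery_op_complete()
    {
        if (++recovery_done >= recovery_sync_batch)
        {
            autosync();        // commit accumulated writes and deletes
            recovery_done = 0;
        }
    }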

@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
 #pragma once

@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 (see README.md for details)
+// License: VNPL-1.1 (see README.md for details)
 #include "osd.h"
@@ -41,16 +41,13 @@ int main(int narg, char *args[])
     signal(SIGINT, handle_sigint);
     signal(SIGTERM, handle_sigint);
     ring_loop_t *ringloop = new ring_loop_t(512);
-    // FIXME: Create Blockstore from on-disk superblock config and check it against the OSD cluster config
-    blockstore_t *bs = new blockstore_t(config, ringloop);
-    osd = new osd_t(config, bs, ringloop);
+    osd = new osd_t(config, ringloop);
     while (1)
     {
         ringloop->loop();
         ringloop->wait();
     }
     delete osd;
-    delete bs;
     delete ringloop;
     return 0;
 }
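
main() no longer builds a blockstore_t and hands it to the OSD: osd_t now takes just the config and the ring loop, which implies it constructs and owns its blockstore internally (and takes the removed FIXME about superblock validation with it). A hedged sketch of the assumed constructor shape, not the actual implementation:

    // Sketch only: assumed shape of the osd_t constructor after this change
    osd_t::osd_t(blockstore_config_t & config, ring_loop_t *ringloop)
    {
        this->config = config;
        this->ringloop = ringloop;
        this->bs = new blockstore_t(config, ringloop); // owned by osd_t now
        // ... remaining initialisation ...
    }

Besides shrinking main(), this keeps the blockstore's lifetime strictly inside the OSD's, so the shutdown order can no longer be gotten wrong at the call site.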

@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
 #include "osd_ops.h"
@@ -19,4 +19,5 @@ const char* osd_op_names[] = {
"primary_write", "primary_write",
"primary_sync", "primary_sync",
"primary_delete", "primary_delete",
"ping",
}; };

@@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
 #pragma once
@@ -27,7 +27,8 @@
 #define OSD_OP_WRITE 12
 #define OSD_OP_SYNC 13
 #define OSD_OP_DELETE 14
-#define OSD_OP_MAX 14
+#define OSD_OP_PING 15
+#define OSD_OP_MAX 15
 // Alignment & limit for read/write operations
 #ifndef MEM_ALIGNMENT
 #define MEM_ALIGNMENT 512
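
Adding the ping opcode touches two artefacts that must move together: the OSD_OP_PING / OSD_OP_MAX defines here and the osd_op_names[] table above, which is indexed by opcode. A hypothetical bounds-checked lookup (assuming opcodes start at 1 and the table holds OSD_OP_MAX entries) shows the invariant being protected:

    // Hypothetical helper, not from this diff: safe opcode-to-name lookup
    const char* osd_op_name(uint64_t opcode)
    {
        if (opcode < 1 || opcode > OSD_OP_MAX)
            return "unknown";
        return osd_op_names[opcode-1];
    }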

Some files were not shown because too many files have changed in this diff.