Trace I/O operations (SQEs, recvmsg/sendmsg, uring_submit)

Replace io_uring sendmsg/recvmsg with synchronous sendmsg/recvmsg
Oops, fix fio_sec_osd block_order parsing
2020-06-09 00:52:29 +03:00 · 2020-06-09 00:52:29 +03:00 · 2020-06-09 00:52:00 +03:00 · 2020-06-08 01:54:44 +03:00 · 2020-06-08 01:32:16 +03:00 · 2020-06-07 00:30:15 +03:00
76 changed files with 10891 additions and 2552 deletions
--- a/Make-gen.pl
+++ b/Make-gen.pl
@@ -0,0 +1,56 @@
+#!/usr/bin/perl
+
+# Prints Makefile rules for all *.cpp files in the current directory.
+# Dependencies are the quoted #include's of each file, expanded transitively
+# through the other *.cpp / *.h files of the same directory.
+
+use strict;
+use warnings;
+
+# $deps->{$file}->{$included} = 1 for every `#include "..."` line reported by grep
+my $deps = {};
+for my $line (split /\n/, `grep '^#include "' *.cpp *.h`)
+{
+    # grep prints "<file>:<matching line>" when it is given several files
+    if ($line =~ /^([^:]+):\#include "([^"]+)"/s)
+    {
+        $deps->{$1}->{$2} = 1;
+    }
+}
+
+# Transitive closure: keep copying dependencies of dependencies
+# until a whole pass adds nothing new
+my $added;
+do
+{
+    $added = 0;
+    for my $file (keys %$deps)
+    {
+        for my $dep (keys %{$deps->{$file}})
+        {
+            if ($deps->{$dep})
+            {
+                for my $subdep (keys %{$deps->{$dep}})
+                {
+                    if (!$deps->{$file}->{$subdep})
+                    {
+                        $added = 1;
+                        $deps->{$file}->{$subdep} = 1;
+                    }
+                }
+            }
+        }
+    }
+} while ($added);
+
+# Emit one compile rule per .cpp file; sorting keeps the output stable between runs
+for my $file (sort keys %$deps)
+{
+    if ($file =~ /\.cpp$/)
+    {
+        my $obj = $file;
+        $obj =~ s/\.cpp$/.o/s;
+        print "$obj: $file ".join(" ", sort keys %{$deps->{$file}})."\n";
+        print "\tg++ \$(CXXFLAGS) -c -o \$\@ \$\<\n";
+    }
+}
--- a/189
+++ b/189
@@ -1,66 +1,153 @@
 BLOCKSTORE_OBJS := allocator.o blockstore.o blockstore_impl.o blockstore_init.o blockstore_open.o blockstore_journal.o blockstore_read.o \
-	blockstore_write.o blockstore_sync.o blockstore_stable.o blockstore_rollback.o blockstore_flush.o crc32c.o ringloop.o timerfd_interval.o
+	blockstore_write.o blockstore_sync.o blockstore_stable.o blockstore_rollback.o blockstore_flush.o crc32c.o ringloop.o
 # -fsanitize=address
 CXXFLAGS := -g -O3 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fPIC -fdiagnostics-color=always
-all: $(BLOCKSTORE_OBJS) libfio_blockstore.so osd libfio_sec_osd.so test_blockstore stub_osd stub_bench osd_test
+all: libfio_blockstore.so osd libfio_sec_osd.so libfio_cluster.so stub_osd stub_uring_osd stub_bench osd_test dump_journal
 clean:
 	rm -f *.o

-crc32c.o: crc32c.c
+dump_journal: dump_journal.cpp crc32c.o blockstore_journal.h
+	g++ $(CXXFLAGS) -o $@ $< crc32c.o
+
+libblockstore.so: $(BLOCKSTORE_OBJS)
+	g++ $(CXXFLAGS) -o $@ -shared $(BLOCKSTORE_OBJS) -ltcmalloc_minimal -luring
+libfio_blockstore.so: ./libblockstore.so fio_engine.o json11.o
+	g++ $(CXXFLAGS) -shared -o $@ fio_engine.o json11.o ./libblockstore.so -ltcmalloc_minimal -luring
+
+OSD_OBJS := osd.o osd_secondary.o msgr_receive.o msgr_send.o osd_peering.o osd_flush.o osd_peering_pg.o \
+	osd_primary.o osd_primary_subops.o etcd_state_client.o messenger.o osd_cluster.o http_client.o pg_states.o \
+	osd_rmw.o json11.o base64.o timerfd_manager.o
+osd: ./libblockstore.so osd_main.cpp osd.h osd_ops.h $(OSD_OBJS)
+	g++ $(CXXFLAGS) -o $@ osd_main.cpp $(OSD_OBJS) ./libblockstore.so -ltcmalloc_minimal -luring
+
+stub_osd: stub_osd.o rw_blocking.o
+	g++ $(CXXFLAGS) -o $@ stub_osd.o rw_blocking.o -ltcmalloc_minimal
+
+STUB_URING_OSD_OBJS := stub_uring_osd.o epoll_manager.o messenger.o msgr_send.o msgr_receive.o ringloop.o timerfd_manager.o json11.o
+stub_uring_osd: $(STUB_URING_OSD_OBJS) # libraries go after the objects: linkers using --as-needed drop libraries listed first
+	g++ $(CXXFLAGS) -o $@ $(STUB_URING_OSD_OBJS) -ltcmalloc_minimal -luring
+stub_bench: stub_bench.cpp osd_ops.h rw_blocking.o
+	g++ $(CXXFLAGS) -o $@ stub_bench.cpp rw_blocking.o -ltcmalloc_minimal
+osd_test: osd_test.cpp osd_ops.h rw_blocking.o
+	g++ $(CXXFLAGS) -o $@ osd_test.cpp rw_blocking.o -ltcmalloc_minimal
+osd_peering_pg_test: osd_peering_pg_test.cpp osd_peering_pg.o
+	g++ $(CXXFLAGS) -o $@ $< osd_peering_pg.o -ltcmalloc_minimal
+
+libfio_sec_osd.so: fio_sec_osd.o rw_blocking.o # libraries go after the objects: linkers using --as-needed drop libraries listed first
+	g++ $(CXXFLAGS) -shared -o $@ fio_sec_osd.o rw_blocking.o -ltcmalloc_minimal
+
+FIO_CLUSTER_OBJS := fio_cluster.o cluster_client.o epoll_manager.o etcd_state_client.o \
+	messenger.o msgr_send.o msgr_receive.o ringloop.o json11.o http_client.o pg_states.o timerfd_manager.o base64.o
+libfio_cluster.so: $(FIO_CLUSTER_OBJS) # libraries go after the objects: linkers using --as-needed drop libraries listed first
+	g++ $(CXXFLAGS) -shared -o $@ $(FIO_CLUSTER_OBJS) -ltcmalloc_minimal -luring
+
+test_blockstore: ./libblockstore.so test_blockstore.cpp timerfd_interval.o
+	g++ $(CXXFLAGS) -o test_blockstore test_blockstore.cpp timerfd_interval.o ./libblockstore.so -ltcmalloc_minimal -luring
+test: test.cpp osd_peering_pg.o
+	g++ $(CXXFLAGS) -o test test.cpp osd_peering_pg.o -luring -lm
+test_allocator: test_allocator.cpp allocator.o
+	g++ $(CXXFLAGS) -o test_allocator test_allocator.cpp allocator.o
+
+crc32c.o: crc32c.c crc32c.h
 	g++ $(CXXFLAGS) -c -o $@ $<
 json11.o: json11/json11.cpp
 	g++ $(CXXFLAGS) -c -o json11.o json11/json11.cpp
+
+# Autogenerated
+
 allocator.o: allocator.cpp allocator.h
 	g++ $(CXXFLAGS) -c -o $@ $<
+base64.o: base64.cpp base64.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore.o: blockstore.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_flush.o: blockstore_flush.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_impl.o: blockstore_impl.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_init.o: blockstore_init.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_journal.o: blockstore_journal.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_open.o: blockstore_open.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_read.o: blockstore_read.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_rollback.o: blockstore_rollback.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_stable.o: blockstore_stable.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_sync.o: blockstore_sync.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_write.o: blockstore_write.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+cluster_client.o: cluster_client.cpp cluster_client.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+dump_journal.o: dump_journal.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+epoll_manager.o: epoll_manager.cpp epoll_manager.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+etcd_state_client.o: etcd_state_client.cpp base64.h etcd_state_client.h http_client.h json11/json11.hpp object_id.h osd_id.h osd_ops.h pg_states.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+fio_cluster.o: fio_cluster.cpp cluster_client.h epoll_manager.h etcd_state_client.h fio/fio.h fio/optgroup.h http_client.h json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+fio_engine.o: fio_engine.cpp blockstore.h fio/fio.h fio/optgroup.h json11/json11.hpp object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+fio_sec_osd.o: fio_sec_osd.cpp fio/fio.h fio/optgroup.h object_id.h osd_id.h osd_ops.h rw_blocking.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+http_client.o: http_client.cpp http_client.h json11/json11.hpp timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+messenger.o: messenger.cpp json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+msgr_receive.o: msgr_receive.cpp json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+msgr_send.o: msgr_send.cpp json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd.o: osd.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_cluster.o: osd_cluster.cpp base64.h blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_flush.o: osd_flush.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_main.o: osd_main.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_peering.o: osd_peering.cpp base64.h blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_peering_pg.o: osd_peering_pg.cpp cpp-btree/btree_map.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_peering_pg_test.o: osd_peering_pg_test.cpp cpp-btree/btree_map.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_primary.o: osd_primary.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h osd_primary.h osd_rmw.h pg_states.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_primary_subops.o: osd_primary_subops.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h osd_primary.h osd_rmw.h pg_states.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_rmw.o: osd_rmw.cpp object_id.h osd_id.h osd_rmw.h xor.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_rmw_test.o: osd_rmw_test.cpp object_id.h osd_id.h osd_rmw.cpp osd_rmw.h test_pattern.h xor.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_secondary.o: osd_secondary.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_test.o: osd_test.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h test_pattern.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+pg_states.o: pg_states.cpp pg_states.h
+	g++ $(CXXFLAGS) -c -o $@ $<
 ringloop.o: ringloop.cpp ringloop.h
 	g++ $(CXXFLAGS) -c -o $@ $<
-timerfd_interval.o: timerfd_interval.cpp timerfd_interval.h ringloop.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-
-%.o: %.cpp allocator.h blockstore_flush.h blockstore.h blockstore_impl.h blockstore_init.h blockstore_journal.h crc32c.h ringloop.h timerfd_interval.h object_id.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-
-libblockstore.so: $(BLOCKSTORE_OBJS)
-	g++ $(CXXFLAGS) -o libblockstore.so -shared $(BLOCKSTORE_OBJS) -ltcmalloc_minimal -luring
-libfio_blockstore.so: ./libblockstore.so fio_engine.cpp json11.o
-	g++ $(CXXFLAGS) -shared -o libfio_blockstore.so fio_engine.cpp json11.o ./libblockstore.so -ltcmalloc_minimal -luring
-
-OSD_OBJS := osd.o osd_secondary.o osd_receive.o osd_send.o osd_peering.o osd_peering_pg.o osd_primary.o osd_rmw.o json11.o timerfd_interval.o
-osd_secondary.o: osd_secondary.cpp osd.h osd_ops.h ringloop.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-osd_receive.o: osd_receive.cpp osd.h osd_ops.h ringloop.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-osd_send.o: osd_send.cpp osd.h osd_ops.h ringloop.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-osd_peering.o: osd_peering.cpp osd.h osd_ops.h osd_peering_pg.h ringloop.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-osd_peering_pg.o: osd_peering_pg.cpp object_id.h osd_peering_pg.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-osd_rmw.o: osd_rmw.cpp osd_rmw.h xor.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-osd_rmw_test: osd_rmw_test.cpp osd_rmw.cpp osd_rmw.h xor.h
-	g++ $(CXXFLAGS) -o $@ $<
-osd_primary.o: osd_primary.cpp osd.h osd_ops.h osd_peering_pg.h xor.h ringloop.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-osd.o: osd.cpp osd.h osd_ops.h osd_peering_pg.h ringloop.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-osd: ./libblockstore.so osd_main.cpp osd.h osd_ops.h $(OSD_OBJS)
-	g++ $(CXXFLAGS) -o osd osd_main.cpp $(OSD_OBJS) ./libblockstore.so -ltcmalloc_minimal -luring
-stub_osd: stub_osd.cpp osd_ops.h rw_blocking.o
-	g++ $(CXXFLAGS) -o stub_osd stub_osd.cpp rw_blocking.o -ltcmalloc_minimal
-stub_bench: stub_bench.cpp osd_ops.h rw_blocking.o
-	g++ $(CXXFLAGS) -o stub_bench stub_bench.cpp rw_blocking.o -ltcmalloc_minimal
 rw_blocking.o: rw_blocking.cpp rw_blocking.h
 	g++ $(CXXFLAGS) -c -o $@ $<
-osd_test: osd_test.cpp osd_ops.h rw_blocking.o
-	g++ $(CXXFLAGS) -o osd_test osd_test.cpp rw_blocking.o -ltcmalloc_minimal
-
-libfio_sec_osd.so: fio_sec_osd.cpp osd_ops.h rw_blocking.o
-	g++ $(CXXFLAGS) -ltcmalloc_minimal -shared -o libfio_sec_osd.so fio_sec_osd.cpp rw_blocking.o -luring
-
-test_blockstore: ./libblockstore.so test_blockstore.cpp
-	g++ $(CXXFLAGS) -o test_blockstore test_blockstore.cpp ./libblockstore.so -ltcmalloc_minimal -luring
-test: test.cpp osd_peering_pg.o
-	g++ $(CXXFLAGS) -o test test.cpp osd_peering_pg.o -luring
-test_allocator: test_allocator.cpp allocator.o
-	g++ $(CXXFLAGS) -o test_allocator test_allocator.cpp allocator.o
+stub_bench.o: stub_bench.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+stub_osd.o: stub_osd.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+stub_uring_osd.o: stub_uring_osd.cpp epoll_manager.h json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+test.o: test.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+test_allocator.o: test_allocator.cpp allocator.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+test_blockstore.o: test_blockstore.cpp blockstore.h object_id.h ringloop.h timerfd_interval.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+timerfd_interval.o: timerfd_interval.cpp ringloop.h timerfd_interval.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+timerfd_manager.o: timerfd_manager.cpp timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
--- a/base64.cpp
+++ b/base64.cpp
@@ -0,0 +1,64 @@
+#include "base64.h"
+
+// Standard base64 alphabet; the index of a character is its 6-bit value
+static const char base64_alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+// Encode the bytes of <in> as base64, padded with '=' to a multiple of 4 characters
+std::string base64_encode(const std::string &in)
+{
+    std::string out;
+    // val buffers input bits; valb + 6 is the number of buffered bits not yet written out
+    unsigned val = 0;
+    int valb = -6;
+    for (unsigned char c: in)
+    {
+        val = (val << 8) + c;
+        valb += 8;
+        while (valb >= 0)
+        {
+            out.push_back(base64_alphabet[(val>>valb) & 0x3F]);
+            valb -= 6;
+        }
+    }
+    // Fewer than 6 bits are left over: emit them as one more character, zero-filled on the right
+    if (valb > -6)
+        out.push_back(base64_alphabet[((val<<8)>>(valb+8)) & 0x3F]);
+    while (out.size() % 4)
+        out.push_back('=');
+    return out;
+}
+
+// Reverse lookup table: character code -> 6-bit value, or -1 if the character is not in the alphabet.
+// Explicitly signed: where plain char is unsigned, -1 would be stored as 255 and the "== -1" check below would never match.
+// Filled on first use without any locking; T[0] == 0 means "not filled yet" (a filled table has T[0] == -1).
+static signed char T[256] = { 0 };
+
+// Decode base64 text <in> into raw bytes.
+// Stops at the first character outside the alphabet (including '=' padding) and returns the bytes decoded so far.
+std::string base64_decode(const std::string &in)
+{
+    std::string out;
+    if (T[0] == 0)
+    {
+        for (int i = 0; i < 256; i++)
+            T[i] = -1;
+        for (int i = 0; i < 64; i++)
+            T[(unsigned char)base64_alphabet[i]] = i;
+    }
+    // val buffers decoded bits; valb + 8 is the number of buffered bits not yet written out
+    unsigned val = 0;
+    int valb = -8;
+    for (unsigned char c: in)
+    {
+        if (T[c] == -1)
+            break;
+        val = (val<<6) + T[c];
+        valb += 6;
+        if (valb >= 0)
+        {
+            out.push_back(char((val >> valb) & 0xFF));
+            valb -= 8;
+        }
+    }
+    return out;
+}
--- a/base64.h
+++ b/base64.h
@@ -0,0 +1,5 @@
+#pragma once
+#include <string>
+
+std::string base64_encode(const std::string &in); // bytes -> base64 text, '='-padded to a multiple of 4 characters
+std::string base64_decode(const std::string &in); // base64 text -> bytes; stops at the first character outside the alphabet
--- a/blockstore.cpp
+++ b/blockstore.cpp
@@ -55,6 +55,11 @@ uint64_t blockstore_t::get_block_count()
    return impl->get_block_count();
 }

+uint64_t blockstore_t::get_free_block_count()
+{
+    return impl->get_free_block_count(); // forwarded to the implementation object, like the neighbouring getters
+}
+
 uint32_t blockstore_t::get_disk_alignment()
 {
    return impl->get_disk_alignment();
--- a/blockstore.h
+++ b/blockstore.h
@@ -15,7 +15,9 @@

 // Memory alignment for direct I/O (usually 512 bytes)
 // All other alignments must be a multiple of this one
+#ifndef MEM_ALIGNMENT
 #define MEM_ALIGNMENT 512
+#endif

 // Default block size is 128 KB, current allowed range is 4K - 128M
 #define DEFAULT_ORDER 17
@@ -50,6 +52,7 @@ Input:
  - version == 0: read the last stable version,
  - version == UINT64_MAX: read the last version,
  - otherwise: read the newest version that is <= the specified version
+  - reads aren't guaranteed to return data from previous unfinished writes
  For writes:
  - if version == 0, a new version is assigned automatically
  - if version != 0, it is assigned for the new write if possible, otherwise -EINVAL is returned
@@ -92,7 +95,7 @@ Input:
 - buf = pre-allocated obj_ver_id array <len> units long

 Output:
- retval = 0 or negative error number (-EINVAL)
+- retval = 0 or negative error number (-EINVAL, -ENOENT if no such version or -EBUSY if not synced)

 ## BS_OP_SYNC_STAB_ALL

@@ -175,6 +178,7 @@ public:
    // FIXME rename to object_size
    uint32_t get_block_size();
    uint64_t get_block_count();
+    uint64_t get_free_block_count();

    uint32_t get_disk_alignment();
 };
--- a/blockstore_flush.cpp
+++ b/blockstore_flush.cpp
@@ -4,9 +4,11 @@ journal_flusher_t::journal_flusher_t(int flusher_count, blockstore_impl_t *bs)
 {
    this->bs = bs;
    this->flusher_count = flusher_count;
+    dequeuing = false;
    active_flushers = 0;
-    sync_threshold = flusher_count == 1 ? 1 : flusher_count/2;
-    journal_trim_interval = sync_threshold;
+    syncing_flushers = 0;
+    flusher_start_threshold = bs->journal_block_size / sizeof(journal_entry_stable);
+    journal_trim_interval = flusher_start_threshold;
    journal_trim_counter = 0;
    journal_superblock = bs->journal.inmemory ? bs->journal.buffer : memalign(MEM_ALIGNMENT, bs->journal_block_size);
    co = new journal_flusher_co[flusher_count];
@@ -31,6 +33,12 @@ journal_flusher_co::journal_flusher_co()
            );
        }
        wait_count--;
+        if (!wait_count)
+        {
+            timespec now;
+            clock_gettime(CLOCK_REALTIME, &now);
+            printf("finished %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
+        }
    };
    simple_callback_w = [this](ring_data_t* data)
    {
@@ -43,6 +51,12 @@ journal_flusher_co::journal_flusher_co()
            );
        }
        wait_count--;
+        if (!wait_count)
+        {
+            timespec now;
+            clock_gettime(CLOCK_REALTIME, &now);
+            printf("finished %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
+        }
    };
 }

@@ -55,17 +69,13 @@ journal_flusher_t::~journal_flusher_t()

 bool journal_flusher_t::is_active()
 {
-    return active_flushers > 0 || start_forced && flush_queue.size() > 0 || flush_queue.size() >= sync_threshold;
+    return active_flushers > 0 || dequeuing;
 }

 void journal_flusher_t::loop()
 {
-    for (int i = 0; i < flusher_count; i++)
+    for (int i = 0; (active_flushers > 0 || dequeuing) && i < flusher_count; i++)
    {
-        if (!active_flushers && (start_forced ? !flush_queue.size() : (flush_queue.size() < sync_threshold)))
-        {
-            return;
-        }
        co[i].loop();
    }
 }
@@ -83,6 +93,11 @@ void journal_flusher_t::enqueue_flush(obj_ver_id ov)
        flush_versions[ov.oid] = ov.version;
        flush_queue.push_back(ov.oid);
    }
+    if (!dequeuing && flush_queue.size() >= flusher_start_threshold)
+    {
+        dequeuing = true;
+        bs->ringloop->wakeup();
+    }
 }

 void journal_flusher_t::unshift_flush(obj_ver_id ov)
@@ -98,16 +113,32 @@ void journal_flusher_t::unshift_flush(obj_ver_id ov)
        flush_versions[ov.oid] = ov.version;
        flush_queue.push_front(ov.oid);
    }
+    if (!dequeuing && flush_queue.size() >= flusher_start_threshold)
+    {
+        dequeuing = true;
+        bs->ringloop->wakeup();
+    }
 }

-void journal_flusher_t::force_start()
+void journal_flusher_t::request_trim()
 {
-    start_forced = true;
+    dequeuing = true;
+    trim_wanted++;
    bs->ringloop->wakeup();
 }

+void journal_flusher_t::release_trim()
+{
+    trim_wanted--; // drops one request_trim(); NOTE(review): nothing guards against going negative - presumably calls are paired, confirm
+}
+
 #define await_sqe(label) \
    resume_##label:\
+        {\
+            timespec now;\
+            clock_gettime(CLOCK_REALTIME, &now);\
+            printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);\
+        }\
        sqe = bs->get_sqe();\
        if (!sqe)\
        {\
@@ -116,6 +147,7 @@ void journal_flusher_t::force_start()
        }\
        data = ((ring_data_t*)sqe->user_data);

+// FIXME: Implement batch flushing
 bool journal_flusher_co::loop()
 {
    // This is much better than implementing the whole function as an FSM
@@ -155,10 +187,9 @@ bool journal_flusher_co::loop()
    else if (wait_state == 18)
        goto resume_18;
 resume_0:
-    if (!flusher->flush_queue.size() ||
-        !flusher->start_forced && !flusher->active_flushers && flusher->flush_queue.size() < flusher->sync_threshold)
+    if (!flusher->flush_queue.size() || !flusher->dequeuing)
    {
-        flusher->start_forced = false;
+        flusher->dequeuing = false;
        wait_state = 0;
        return true;
    }
@@ -169,6 +200,76 @@ resume_0:
    dirty_end = bs->dirty_db.find(cur);
    if (dirty_end != bs->dirty_db.end())
    {
+        if (dirty_end->second.journal_sector >= bs->journal.dirty_start &&
+            (bs->journal.dirty_start >= bs->journal.used_start ||
+            dirty_end->second.journal_sector < bs->journal.used_start))
+        {
+            flusher->enqueue_flush(cur);
+            // We can't flush journal sectors that are still written to
+            // However, as we group flushes by oid, current oid may have older writes to flush!
+            // And it may even block writes if we don't flush the older version
+            // (if it's in the beginning of the journal)...
+            // So first try to find an older version of the same object to flush.
+            bool found = false;
+            while (dirty_end != bs->dirty_db.begin())
+            {
+                dirty_end--;
+                if (dirty_end->first.oid != cur.oid)
+                {
+                    break;
+                }
+                if (!(dirty_end->second.journal_sector >= bs->journal.dirty_start &&
+                    (bs->journal.dirty_start >= bs->journal.used_start ||
+                    dirty_end->second.journal_sector < bs->journal.used_start)))
+                {
+                    found = true;
+                    cur.version = dirty_end->first.version;
+                    break;
+                }
+            }
+            if (!found)
+            {
+                // Try other objects
+                int search_left = flusher->flush_queue.size() - 1;
+#ifdef BLOCKSTORE_DEBUG
+                printf("Flusher overran writers (dirty_start=%08lx) - searching for older flushes (%d left)\n", bs->journal.dirty_start, search_left);
+#endif
+                while (search_left > 0)
+                {
+                    cur.oid = flusher->flush_queue.front();
+                    cur.version = flusher->flush_versions[cur.oid];
+                    flusher->flush_queue.pop_front();
+                    flusher->flush_versions.erase(cur.oid);
+                    dirty_end = bs->dirty_db.find(cur);
+                    if (dirty_end != bs->dirty_db.end())
+                    {
+                        if (dirty_end->second.journal_sector >= bs->journal.dirty_start &&
+                            (bs->journal.dirty_start >= bs->journal.used_start ||
+                            dirty_end->second.journal_sector < bs->journal.used_start))
+                        {
+#ifdef BLOCKSTORE_DEBUG
+                            printf("Write %lu:%lu v%lu is too new: offset=%08lx\n", cur.oid.inode, cur.oid.stripe, cur.version, dirty_end->second.journal_sector);
+#endif
+                            flusher->enqueue_flush(cur);
+                        }
+                        else
+                        {
+                            break;
+                        }
+                    }
+                    search_left--;
+                }
+                if (search_left <= 0)
+                {
+#ifdef BLOCKSTORE_DEBUG
+                    printf("No older flushes, stopping\n");
+#endif
+                    flusher->dequeuing = false;
+                    wait_state = 0;
+                    return true;
+                }
+            }
+        }
        repeat_it = flusher->sync_to_repeat.find(cur.oid);
        if (repeat_it != flusher->sync_to_repeat.end())
        {
@@ -191,32 +292,26 @@ resume_0:
 #endif
        flusher->active_flushers++;
 resume_1:
+        // Find it in clean_db
+        clean_it = bs->clean_db.find(cur.oid);
+        old_clean_loc = (clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX);
        // Scan dirty versions of the object
        if (!scan_dirty(1))
        {
            wait_state += 1;
            return false;
        }
-        if (copy_count == 0 && clean_loc == UINT64_MAX && !has_delete && !has_empty)
+        // Writes and deletes shouldn't happen at the same time
+        assert(!(copy_count > 0 || has_writes) || !has_delete);
+        if (copy_count == 0 && !has_writes && !has_delete || has_delete && old_clean_loc == UINT64_MAX)
        {
            // Nothing to flush
-            flusher->active_flushers--;
-            repeat_it = flusher->sync_to_repeat.find(cur.oid);
-            if (repeat_it != flusher->sync_to_repeat.end() && repeat_it->second > cur.version)
-            {
-                // Requeue version
-                flusher->unshift_flush({ .oid = cur.oid, .version = repeat_it->second });
-            }
-            flusher->sync_to_repeat.erase(repeat_it);
-            wait_state = 0;
-            goto resume_0;
+            bs->erase_dirty(dirty_start, std::next(dirty_end), clean_loc);
+            goto trim_journal;
        }
-        // Find it in clean_db
-        clean_it = bs->clean_db.find(cur.oid);
-        old_clean_loc = (clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX);
        if (clean_loc == UINT64_MAX)
        {
-            if (copy_count > 0 && has_delete || old_clean_loc == UINT64_MAX)
+            if (old_clean_loc == UINT64_MAX)
            {
                // Object not allocated. This is a bug.
                char err[1024];
@@ -331,6 +426,7 @@ resume_1:
        else
        {
            clean_disk_entry *new_entry = (clean_disk_entry*)(meta_new.buf + meta_new.pos*bs->clean_entry_size);
+            assert(new_entry->oid.inode == 0 || new_entry->oid == cur.oid);
            new_entry->oid = cur.oid;
            new_entry->version = cur.version;
            if (!bs->inmemory_meta)
@@ -386,8 +482,9 @@ resume_1:
        }
        // Update clean_db and dirty_db, free old data locations
        update_clean_db();
+    trim_journal:
        // Clear unused part of the journal every <journal_trim_interval> flushes
-        if (!((++flusher->journal_trim_counter) % flusher->journal_trim_interval))
+        if (!((++flusher->journal_trim_counter) % flusher->journal_trim_interval) || flusher->trim_wanted > 0)
        {
            flusher->journal_trim_counter = 0;
            if (bs->journal.trim())
@@ -417,7 +514,7 @@ resume_1:
        }
        // All done
 #ifdef BLOCKSTORE_DEBUG
-        printf("Flushed %lu:%lu v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version);
+        printf("Flushed %lu:%lu v%lu (%ld left)\n", cur.oid.inode, cur.oid.stripe, cur.version, flusher->flush_queue.size());
 #endif
        flusher->active_flushers--;
        repeat_it = flusher->sync_to_repeat.find(cur.oid);
@@ -445,7 +542,7 @@ bool journal_flusher_co::scan_dirty(int wait_base)
    copy_count = 0;
    clean_loc = UINT64_MAX;
    has_delete = false;
-    has_empty = false;
+    has_writes = false;
    skip_copy = false;
    clean_init_bitmap = false;
    while (1)
@@ -453,11 +550,8 @@ bool journal_flusher_co::scan_dirty(int wait_base)
        if (dirty_it->second.state == ST_J_STABLE && !skip_copy)
        {
            // First we submit all reads
-            if (dirty_it->second.len == 0)
-            {
-                has_empty = true;
-            }
-            else
+            has_writes = true;
+            if (dirty_it->second.len != 0)
            {
                offset = dirty_it->second.offset;
                end_offset = dirty_it->second.offset + dirty_it->second.len;
@@ -499,6 +593,7 @@ bool journal_flusher_co::scan_dirty(int wait_base)
        else if (dirty_it->second.state == ST_D_STABLE && !skip_copy)
        {
            // There is an unflushed big write. Copy small writes in its position
+            has_writes = true;
            clean_loc = dirty_it->second.location;
            clean_init_bitmap = true;
            clean_bitmap_offset = dirty_it->second.offset;
@@ -632,7 +727,8 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
        });
    sync_found:
        cur_sync->ready_count++;
-        if (cur_sync->ready_count >= flusher->sync_threshold || !flusher->flush_queue.size())
+        flusher->syncing_flushers++;
+        if (flusher->syncing_flushers >= flusher->flusher_count || !flusher->flush_queue.size())
        {
            // Sync batch is ready. Do it.
            await_sqe(0);
@@ -658,6 +754,7 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
            wait_state = 2;
            return false;
        }
+        flusher->syncing_flushers--;
        cur_sync->ready_count--;
        if (cur_sync->ready_count == 0)
        {
--- a/blockstore_flush.h
+++ b/blockstore_flush.h
@@ -45,8 +45,8 @@ class journal_flusher_co
    std::map<object_id, uint64_t>::iterator repeat_it;
    std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;

-    bool skip_copy, has_delete, has_empty;
-    spp::sparse_hash_map<object_id, clean_entry>::iterator clean_it;
+    bool skip_copy, has_delete, has_writes;
+    blockstore_clean_db_t::iterator clean_it;
    std::vector<copy_buffer_t> v;
    std::vector<copy_buffer_t>::iterator it;
    int copy_count;
@@ -73,9 +73,10 @@ public:
 // Journal flusher itself
 class journal_flusher_t
 {
-    bool start_forced = false;
+    int trim_wanted = 0;
+    bool dequeuing;
    int flusher_count;
-    int sync_threshold;
+    int flusher_start_threshold;
    journal_flusher_co *co;
    blockstore_impl_t *bs;
    friend class journal_flusher_co;
@@ -84,6 +85,7 @@ class journal_flusher_t
    void* journal_superblock;

    int active_flushers;
+    int syncing_flushers;
    std::list<flusher_sync_t> syncs;
    std::map<object_id, uint64_t> sync_to_repeat;

@@ -95,7 +97,8 @@ public:
    ~journal_flusher_t();
    void loop();
    bool is_active();
-    void force_start();
+    void request_trim();
+    void release_trim();
    void enqueue_flush(obj_ver_id oid);
    void unshift_flush(obj_ver_id oid);
 };
--- a/blockstore_impl.cpp
+++ b/blockstore_impl.cpp
@@ -5,7 +5,7 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
    assert(sizeof(blockstore_op_private_t) <= BS_OP_PRIVATE_DATA_SIZE);
    this->ringloop = ringloop;
    ring_consumer.loop = [this]() { loop(); };
-    ringloop->register_consumer(ring_consumer);
+    ringloop->register_consumer(&ring_consumer);
    initialized = 0;
    zero_object = (uint8_t*)memalign(MEM_ALIGNMENT, block_size);
    data_fd = meta_fd = journal.fd = -1;
@@ -36,7 +36,7 @@ blockstore_impl_t::~blockstore_impl_t()
    delete data_alloc;
    delete flusher;
    free(zero_object);
-    ringloop->unregister_consumer(ring_consumer);
+    ringloop->unregister_consumer(&ring_consumer);
    if (data_fd >= 0)
        close(data_fd);
    if (meta_fd >= 0 && meta_fd != data_fd)
@@ -98,10 +98,19 @@ void blockstore_impl_t::loop()
    {
        // try to submit ops
        unsigned initial_ring_space = ringloop->space_left();
+        // FIXME: rework this "sync polling"
        auto cur_sync = in_progress_syncs.begin();
        while (cur_sync != in_progress_syncs.end())
        {
-            continue_sync(*cur_sync++);
+            if (continue_sync(*cur_sync) != 2)
+            {
+                // List is unmodified
+                cur_sync++;
+            }
+            else
+            {
+                cur_sync = in_progress_syncs.begin();
+            }
        }
        auto cur = submit_queue.begin();
        int has_writes = 0;
@@ -115,12 +124,6 @@ void blockstore_impl_t::loop()
            if (PRIV(op)->wait_for)
            {
                check_wait(op);
-#ifdef BLOCKSTORE_DEBUG
-                if (PRIV(op)->wait_for)
-                {
-                    printf("still waiting for %d\n", PRIV(op)->wait_for);
-                }
-#endif
                if (PRIV(op)->wait_for == WAIT_SQE)
                {
                    break;
@@ -136,12 +139,12 @@ void blockstore_impl_t::loop()
            }
            unsigned ring_space = ringloop->space_left();
            unsigned prev_sqe_pos = ringloop->save();
-            int dequeue_op = 0;
+            bool dequeue_op = false;
            if (op->opcode == BS_OP_READ)
            {
                dequeue_op = dequeue_read(op);
            }
-            else if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_DELETE)
+            else if (op->opcode == BS_OP_WRITE)
            {
                if (has_writes == 2)
                {
@@ -151,6 +154,16 @@ void blockstore_impl_t::loop()
                dequeue_op = dequeue_write(op);
                has_writes = dequeue_op ? 1 : 2;
            }
+            else if (op->opcode == BS_OP_DELETE)
+            {
+                if (has_writes == 2)
+                {
+                    // Some writes could not be submitted
+                    break;
+                }
+                dequeue_op = dequeue_del(op);
+                has_writes = dequeue_op ? 1 : 2;
+            }
            else if (op->opcode == BS_OP_SYNC)
            {
                // wait for all small writes to be submitted
@@ -166,16 +179,33 @@ void blockstore_impl_t::loop()
            }
            else if (op->opcode == BS_OP_STABLE)
            {
+                if (has_writes == 2)
+                {
+                    // Don't submit additional flushes before completing previous LISTs
+                    break;
+                }
                dequeue_op = dequeue_stable(op);
            }
            else if (op->opcode == BS_OP_ROLLBACK)
            {
+                if (has_writes == 2)
+                {
+                    // Don't submit additional flushes before completing previous LISTs
+                    break;
+                }
                dequeue_op = dequeue_rollback(op);
            }
            else if (op->opcode == BS_OP_LIST)
            {
-                process_list(op);
-                dequeue_op = true;
+                // Block LIST operation by previous modifications,
+                // so it always returns a consistent state snapshot
+                if (has_writes == 2 || inflight_writes > 0)
+                    has_writes = 2;
+                else
+                {
+                    process_list(op);
+                    dequeue_op = true;
+                }
            }
            if (dequeue_op)
            {
@@ -205,7 +235,7 @@ void blockstore_impl_t::loop()
        {
            live = true;
        }
-        queue_stall = !live && !ringloop->get_loop_again();
+        queue_stall = !live && !ringloop->has_work();
        live = false;
    }
 }
@@ -245,19 +275,9 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
        if (ringloop->space_left() < PRIV(op)->wait_detail)
        {
            // stop submission if there's still no free space
-            return;
-        }
-        PRIV(op)->wait_for = 0;
-    }
-    else if (PRIV(op)->wait_for == WAIT_IN_FLIGHT)
-    {
-        auto dirty_it = dirty_db.find((obj_ver_id){
-            .oid = op->oid,
-            .version = PRIV(op)->wait_detail,
-        });
-        if (dirty_it != dirty_db.end() && IS_IN_FLIGHT(dirty_it->second.state))
-        {
-            // do not submit
+#ifdef BLOCKSTORE_DEBUG
+            printf("Still waiting for %lu SQE(s)\n", PRIV(op)->wait_detail);
+#endif
            return;
        }
        PRIV(op)->wait_for = 0;
@@ -267,8 +287,12 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
        if (journal.used_start == PRIV(op)->wait_detail)
        {
            // do not submit
+#ifdef BLOCKSTORE_DEBUG
+            printf("Still waiting to flush journal offset %08lx\n", PRIV(op)->wait_detail);
+#endif
            return;
        }
+        flusher->release_trim();
        PRIV(op)->wait_for = 0;
    }
    else if (PRIV(op)->wait_for == WAIT_JOURNAL_BUFFER)
@@ -278,6 +302,9 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
            journal.sector_info[next].dirty)
        {
            // do not submit
+#ifdef BLOCKSTORE_DEBUG
+            printf("Still waiting for a journal buffer\n");
+#endif
            return;
        }
        PRIV(op)->wait_for = 0;
@@ -286,6 +313,9 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
    {
        if (!data_alloc->get_free_count() && !flusher->is_active())
        {
+#ifdef BLOCKSTORE_DEBUG
+            printf("Still waiting for free space on the data device\n");
+#endif
            return;
        }
        PRIV(op)->wait_for = 0;
@@ -304,12 +334,12 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
            op->len > block_size-op->offset ||
            (op->len % disk_alignment)
        )) ||
-        readonly && op->opcode != BS_OP_READ ||
+        readonly && op->opcode != BS_OP_READ && op->opcode != BS_OP_LIST ||
        first && op->opcode == BS_OP_WRITE)
    {
        // Basic verification not passed
        op->retval = -EINVAL;
-        op->callback(op);
+        std::function<void (blockstore_op_t*)>(op->callback)(op);
        return;
    }
    if (op->opcode == BS_OP_SYNC_STAB_ALL)
@@ -350,21 +380,21 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
            }
        };
    }
-    if (op->opcode == BS_OP_WRITE && !enqueue_write(op))
+    if ((op->opcode == BS_OP_WRITE || op->opcode == BS_OP_DELETE) && !enqueue_write(op))
    {
-        op->callback(op);
+        std::function<void (blockstore_op_t*)>(op->callback)(op);
        return;
    }
-    if (0 && op->opcode == BS_OP_SYNC && immediate_commit)
+    if (op->opcode == BS_OP_SYNC && immediate_commit == IMMEDIATE_ALL)
    {
        op->retval = 0;
-        op->callback(op);
+        std::function<void (blockstore_op_t*)>(op->callback)(op);
        return;
    }
    // Call constructor without allocating memory. We'll call destructor before returning op back
    new ((void*)op->private_data) blockstore_op_private_t;
    PRIV(op)->wait_for = 0;
-    PRIV(op)->sync_state = 0;
+    PRIV(op)->op_state = 0;
    PRIV(op)->pending_ops = 0;
    if (!first)
    {
@@ -377,82 +407,174 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
    ringloop->wakeup();
 }

+// Binary-search the sorted range list[search_start, search_end) for <oid> and,
+// if an entry with that oid is found, overwrite its version with <version>.
+// Returns true if an entry was found and updated, false otherwise.
+static bool replace_stable(object_id oid, uint64_t version, int search_start, int search_end, obj_ver_id* list)
+{
+    while (search_start < search_end)
+    {
+        int pos = search_start+(search_end-search_start)/2;
+        if (oid < list[pos].oid)
+        {
+            search_end = pos;
+        }
+        else if (list[pos].oid < oid)
+        {
+            search_start = pos+1;
+        }
+        else
+        {
+            list[pos].version = version;
+            return true;
+        }
+    }
+    return false;
+}
+
+// Handle BS_OP_LIST: op->offset is the PG number to list, op->len is the PG count
+// (0 = list everything) and op->oid.stripe is the PG stripe size.
+// On success op->buf receives a malloc()ed obj_ver_id array with stable entries first
+// and unstable entries after them, op->version is the number of stable entries and
+// op->retval is the total number of entries. On failure op->retval is -EINVAL or -ENOMEM.
 void blockstore_impl_t::process_list(blockstore_op_t *op)
 {
-    // Count objects
+    // Check PG
    uint32_t list_pg = op->offset;
    uint32_t pg_count = op->len;
-    uint64_t parity_block_size = op->oid.stripe;
-    if (pg_count != 0 && (parity_block_size < MIN_BLOCK_SIZE || list_pg >= pg_count))
+    uint64_t pg_stripe_size = op->oid.stripe;
+    if (pg_count != 0 && (pg_stripe_size < MIN_BLOCK_SIZE || list_pg >= pg_count))
    {
        op->retval = -EINVAL;
        FINISH_OP(op);
        return;
    }
-    uint64_t stable_count = 0;
-    if (pg_count > 0)
-    {
-        for (auto it = clean_db.begin(); it != clean_db.end(); it++)
-        {
-            uint32_t pg = (it->first.inode + it->first.stripe / parity_block_size) % pg_count;
-            if (pg == list_pg)
-            {
-                stable_count++;
-            }
-        }
-    }
-    else
-    {
-        stable_count = clean_db.size();
-    }
-    uint64_t total_count = stable_count;
-    for (auto it = dirty_db.begin(); it != dirty_db.end(); it++)
-    {
-        if (!pg_count || ((it->first.oid.inode + it->first.oid.stripe / parity_block_size) % pg_count) == list_pg)
-        {
-            if (IS_STABLE(it->second.state))
-            {
-                stable_count++;
-            }
-            total_count++;
-        }
-    }
-    // Allocate memory
-    op->version = stable_count;
-    op->retval = total_count;
-    op->buf = malloc(sizeof(obj_ver_id) * total_count);
-    if (!op->buf)
+    // Copy clean_db entries (sorted)
+    int stable_count = 0, stable_alloc = clean_db.size() / (pg_count ? pg_count : 1);
+    obj_ver_id *stable = (obj_ver_id*)malloc(sizeof(obj_ver_id) * stable_alloc);
+    if (!stable)
    {
        op->retval = -ENOMEM;
        FINISH_OP(op);
        return;
    }
-    obj_ver_id *vers = (obj_ver_id*)op->buf;
-    int i = 0;
    for (auto it = clean_db.begin(); it != clean_db.end(); it++)
    {
-        if (!pg_count || ((it->first.inode + it->first.stripe / parity_block_size) % pg_count) == list_pg)
+        if (!pg_count || ((it->first.inode + it->first.stripe / pg_stripe_size) % pg_count) == list_pg)
        {
-            vers[i++] = {
+            if (stable_count >= stable_alloc)
+            {
+                stable_alloc += 32768;
+                stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
+                if (!stable)
+                {
+                    op->retval = -ENOMEM;
+                    FINISH_OP(op);
+                    return;
+                }
+            }
+            stable[stable_count++] = {
                .oid = it->first,
                .version = it->second.version,
            };
        }
    }
-    int j = stable_count;
+    int clean_stable_count = stable_count;
+    // Copy dirty_db entries (sorted, too)
+    int unstable_count = 0, unstable_alloc = 0;
+    obj_ver_id *unstable = NULL;
    for (auto it = dirty_db.begin(); it != dirty_db.end(); it++)
    {
-        if (!pg_count || ((it->first.oid.inode + it->first.oid.stripe / parity_block_size) % pg_count) == list_pg)
+        if (!pg_count || ((it->first.oid.inode + it->first.oid.stripe / pg_stripe_size) % pg_count) == list_pg)
        {
-            if (IS_STABLE(it->second.state))
+            if (IS_DELETE(it->second.state))
            {
-                vers[i++] = it->first;
+                // Deletions are always stable, so try to zero out two possible entries
+                if (!replace_stable(it->first.oid, 0, 0, clean_stable_count, stable))
+                {
+                    replace_stable(it->first.oid, 0, clean_stable_count, stable_count, stable);
+                }
+            }
+            else if (IS_STABLE(it->second.state))
+            {
+                // First try to replace a clean stable version in the first part of the list
+                if (!replace_stable(it->first.oid, it->first.version, 0, clean_stable_count, stable))
+                {
+                    // Then try to replace the last dirty stable version in the second part of the list
+                    // (the list may still be empty here, so stable_count must be checked first)
+                    if (stable_count > 0 && stable[stable_count-1].oid == it->first.oid)
+                    {
+                        stable[stable_count-1].version = it->first.version;
+                    }
+                    else
+                    {
+                        if (stable_count >= stable_alloc)
+                        {
+                            stable_alloc += 32768;
+                            stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
+                            if (!stable)
+                            {
+                                if (unstable)
+                                    free(unstable);
+                                op->retval = -ENOMEM;
+                                FINISH_OP(op);
+                                return;
+                            }
+                        }
+                        stable[stable_count++] = it->first;
+                    }
+                }
            }
            else
            {
-                vers[j++] = it->first;
+                if (unstable_count >= unstable_alloc)
+                {
+                    unstable_alloc += 32768;
+                    unstable = (obj_ver_id*)realloc(unstable, sizeof(obj_ver_id) * unstable_alloc);
+                    if (!unstable)
+                    {
+                        if (stable)
+                            free(stable);
+                        op->retval = -ENOMEM;
+                        FINISH_OP(op);
+                        return;
+                    }
+                }
+                unstable[unstable_count++] = it->first;
            }
        }
    }
+    // Remove zeroed out stable entries
+    int j = 0;
+    for (int i = 0; i < stable_count; i++)
+    {
+        if (stable[i].version != 0)
+        {
+            stable[j++] = stable[i];
+        }
+    }
+    stable_count = j;
+    if (stable_count+unstable_count > stable_alloc)
+    {
+        stable_alloc = stable_count+unstable_count;
+        stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
+        if (!stable)
+        {
+            if (unstable)
+                free(unstable);
+            op->retval = -ENOMEM;
+            FINISH_OP(op);
+            return;
+        }
+    }
+    // Copy unstable entries
+    for (int i = 0; i < unstable_count; i++)
+    {
+        stable[j++] = unstable[i];
+    }
+    free(unstable);
+    op->version = stable_count;
+    op->retval = stable_count+unstable_count;
+    op->buf = stable;
    FINISH_OP(op);
 }
--- a/blockstore_impl.h
+++ b/blockstore_impl.h
@@ -1,7 +1,6 @@
 #pragma once

 #include "blockstore.h"
-#include "timerfd_interval.h"

 #include <sys/types.h>
 #include <sys/ioctl.h>
@@ -16,7 +15,7 @@
 #include <deque>
 #include <new>

-#include "sparsepp/sparsepp/spp.h"
+#include "cpp-btree/btree_map.h"

 #include "allocator.h"

@@ -25,17 +24,17 @@
 // States are not stored on disk. Instead, they're deduced from the journal
 // FIXME: Rename to BS_ST_*

-#define ST_J_IN_FLIGHT 1
-#define ST_J_SUBMITTED 2
-#define ST_J_WRITTEN 3
-#define ST_J_SYNCED 4
-#define ST_J_STABLE 5
+#define ST_J_WAIT_BIG 1
+#define ST_J_IN_FLIGHT 2
+#define ST_J_SUBMITTED 3
+#define ST_J_WRITTEN 4
+#define ST_J_SYNCED 5
+#define ST_J_STABLE 6

 #define ST_D_IN_FLIGHT 15
 #define ST_D_SUBMITTED 16
 #define ST_D_WRITTEN 17
-#define ST_D_META_WRITTEN 19
-#define ST_D_META_SYNCED 20
+#define ST_D_SYNCED 20
 #define ST_D_STABLE 21

 #define ST_DEL_IN_FLIGHT 31
@@ -46,19 +45,28 @@

 #define ST_CURRENT 48

-#define IS_IN_FLIGHT(st) (st == ST_J_IN_FLIGHT || st == ST_D_IN_FLIGHT || st == ST_DEL_IN_FLIGHT || st == ST_J_SUBMITTED || st == ST_D_SUBMITTED || st == ST_DEL_SUBMITTED)
+#define IMMEDIATE_NONE 0
+#define IMMEDIATE_SMALL 1
+#define IMMEDIATE_ALL 2
+
+#define IS_IN_FLIGHT(st) (st == ST_J_WAIT_BIG || st == ST_J_IN_FLIGHT || st == ST_D_IN_FLIGHT || st == ST_DEL_IN_FLIGHT || st == ST_J_SUBMITTED || st == ST_D_SUBMITTED || st == ST_DEL_SUBMITTED)
 #define IS_STABLE(st) (st == ST_J_STABLE || st == ST_D_STABLE || st == ST_DEL_STABLE || st == ST_CURRENT)
-#define IS_SYNCED(st) (IS_STABLE(st) || st == ST_J_SYNCED || st == ST_D_META_SYNCED || st == ST_DEL_SYNCED)
-#define IS_JOURNAL(st) (st >= ST_J_SUBMITTED && st <= ST_J_STABLE)
-#define IS_BIG_WRITE(st) (st >= ST_D_SUBMITTED && st <= ST_D_STABLE)
-#define IS_DELETE(st) (st >= ST_DEL_SUBMITTED && st <= ST_DEL_STABLE)
-#define IS_UNSYNCED(st) (st >= ST_J_SUBMITTED && st <= ST_J_WRITTEN || st >= ST_D_SUBMITTED && st <= ST_D_META_WRITTEN || st >= ST_DEL_SUBMITTED && st <= ST_DEL_WRITTEN)
+#define IS_SYNCED(st) (IS_STABLE(st) || st == ST_J_SYNCED || st == ST_D_SYNCED || st == ST_DEL_SYNCED)
+#define IS_JOURNAL(st) (st >= ST_J_WAIT_BIG && st <= ST_J_STABLE)
+#define IS_BIG_WRITE(st) (st >= ST_D_IN_FLIGHT && st <= ST_D_STABLE)
+#define IS_DELETE(st) (st >= ST_DEL_IN_FLIGHT && st <= ST_DEL_STABLE)
+#define IS_UNSYNCED(st) (st >= ST_J_WAIT_BIG && st <= ST_J_WRITTEN || st >= ST_D_IN_FLIGHT && st <= ST_D_WRITTEN|| st >= ST_DEL_IN_FLIGHT && st <= ST_DEL_WRITTEN)

 #define BS_SUBMIT_GET_SQE(sqe, data) \
    BS_SUBMIT_GET_ONLY_SQE(sqe); \
    struct ring_data_t *data = ((ring_data_t*)sqe->user_data)

 #define BS_SUBMIT_GET_ONLY_SQE(sqe) \
+        {\
+            timespec now;\
+            clock_gettime(CLOCK_REALTIME, &now);\
+            printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);\
+        }\
    struct io_uring_sqe *sqe = get_sqe();\
    if (!sqe)\
    {\
@@ -68,6 +76,11 @@
    }

 #define BS_SUBMIT_GET_SQE_DECL(sqe) \
+        {\
+            timespec now;\
+            clock_gettime(CLOCK_REALTIME, &now);\
+            printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);\
+        }\
    sqe = get_sqe();\
    if (!sqe)\
    {\
@@ -124,8 +137,6 @@ struct __attribute__((__packed__)) dirty_entry

 // Suspend operation until there are more free SQEs
 #define WAIT_SQE 1
-// Suspend operation until version <wait_detail> of object <oid> is written
-#define WAIT_IN_FLIGHT 2
 // Suspend operation until there are <wait_detail> bytes of free space in the journal on disk
 #define WAIT_JOURNAL 3
 // Suspend operation until the next journal sector buffer is free
@@ -139,7 +150,7 @@ struct fulfill_read_t
 };

 #define PRIV(op) ((blockstore_op_private_t*)(op)->private_data)
-#define FINISH_OP(op) PRIV(op)->~blockstore_op_private_t(); op->callback(op)
+#define FINISH_OP(op) PRIV(op)->~blockstore_op_private_t(); std::function<void (blockstore_op_t*)>(op->callback)(op)

 struct blockstore_op_private_t
 {
@@ -147,12 +158,13 @@ struct blockstore_op_private_t
    int wait_for;
    uint64_t wait_detail;
    int pending_ops;
+    int op_state;

    // Read
    std::vector<fulfill_read_t> read_vec;

    // Sync, write
-    uint64_t min_used_journal_sector, max_used_journal_sector;
+    uint64_t min_flushed_journal_sector, max_flushed_journal_sector;

    // Write
    struct iovec iov_zerofill[3];
@@ -161,9 +173,13 @@ struct blockstore_op_private_t
    std::vector<obj_ver_id> sync_big_writes, sync_small_writes;
    int sync_small_checked, sync_big_checked;
    std::list<blockstore_op_t*>::iterator in_progress_ptr;
-    int sync_state, prev_sync_count;
+    int prev_sync_count;
 };

+// https://github.com/algorithm-ninja/cpp-btree
+// https://github.com/greg7mdp/sparsepp/ was used previously, but it was TERRIBLY slow after resizing
+// with sparsepp, random reads dropped to ~700 iops very fast with just as much as ~32k objects in the DB
+typedef btree::btree_map<object_id, clean_entry> blockstore_clean_db_t;
 typedef std::map<obj_ver_id, dirty_entry> blockstore_dirty_db_t;

 #include "blockstore_init.h"
@@ -177,29 +193,30 @@ class blockstore_impl_t
    uint32_t block_size;
    uint64_t meta_offset;
    uint64_t data_offset;
-    uint64_t cfg_journal_size;
+    uint64_t cfg_journal_size, cfg_data_size;
    // Required write alignment and journal/metadata/data areas' location alignment
-    uint32_t disk_alignment = 512;
+    uint32_t disk_alignment = 4096;
    // Journal block size - minimum_io_size of the journal device is the best choice
-    uint64_t journal_block_size = 512;
+    uint64_t journal_block_size = 4096;
    // Metadata block size - minimum_io_size of the metadata device is the best choice
-    uint64_t meta_block_size = 512;
+    uint64_t meta_block_size = 4096;
    // Sparse write tracking granularity. 4 KB is a good choice. Must be a multiple of disk_alignment
    uint64_t bitmap_granularity = 4096;
    bool readonly = false;
+    // By default, Blockstore locks all opened devices exclusively. This option can be used to disable locking
+    bool disable_flock = false;
    // It is safe to disable fsync() if drive write cache is writethrough
    bool disable_data_fsync = false, disable_meta_fsync = false, disable_journal_fsync = false;
    // Enable if you want every operation to be executed with an "implicit fsync"
-    // FIXME Not implemented yet
-    bool immediate_commit = false;
+    // Suitable only for server SSDs with capacitors, requires disabled data and journal fsyncs
+    int immediate_commit = IMMEDIATE_NONE;
    bool inmemory_meta = false;
    int flusher_count;
    /******* END OF OPTIONS *******/

    struct ring_consumer_t ring_consumer;

-    // Another option is https://github.com/algorithm-ninja/cpp-btree
-    spp::sparse_hash_map<object_id, clean_entry> clean_db;
+    blockstore_clean_db_t clean_db;
    uint8_t *clean_bitmap = NULL;
    blockstore_dirty_db_t dirty_db;
    std::list<blockstore_op_t*> submit_queue; // FIXME: funny thing is that vector is better here
@@ -224,6 +241,7 @@ class blockstore_impl_t

    bool live = false, queue_stall = false;
    ring_loop_t *ringloop;
+    int inflight_writes = 0;

    bool stop_sync_submitted;

@@ -264,7 +282,7 @@ class blockstore_impl_t
    bool enqueue_write(blockstore_op_t *op);
    int dequeue_write(blockstore_op_t *op);
    int dequeue_del(blockstore_op_t *op);
-    void ack_write(blockstore_op_t *op);
+    int continue_write(blockstore_op_t *op);
    void release_journal_sectors(blockstore_op_t *op);
    void handle_write_event(ring_data_t *data, blockstore_op_t *op);

@@ -277,11 +295,15 @@ class blockstore_impl_t

    // Stabilize
    int dequeue_stable(blockstore_op_t *op);
+    int continue_stable(blockstore_op_t *op);
+    void mark_stable(const obj_ver_id & ov);
    void handle_stable_event(ring_data_t *data, blockstore_op_t *op);
    void stabilize_object(object_id oid, uint64_t max_ver);

    // Rollback
    int dequeue_rollback(blockstore_op_t *op);
+    int continue_rollback(blockstore_op_t *op);
+    void mark_rolled_back(const obj_ver_id & ov);
    void handle_rollback_event(ring_data_t *data, blockstore_op_t *op);
    void erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc);

@@ -316,5 +338,6 @@ public:

    inline uint32_t get_block_size() { return block_size; }
    inline uint64_t get_block_count() { return block_count; }
+    inline uint64_t get_free_block_count() { return data_alloc->get_free_count(); }
    inline uint32_t get_disk_alignment() { return disk_alignment; }
 };
--- a/blockstore_init.cpp
+++ b/blockstore_init.cpp
@@ -402,8 +402,9 @@ resume_1:
    }
    // Trim journal on start so we don't stall when all entries are older
    bs->journal.trim();
+    bs->journal.dirty_start = bs->journal.next_free;
    printf(
-        "Journal entries loaded: %lu, free journal space: %lu bytes (%lu..%lu is used), free blocks: %lu / %lu\n",
+        "Journal entries loaded: %lu, free journal space: %lu bytes (%08lx..%08lx is used), free blocks: %lu / %lu\n",
        entries_loaded,
        (bs->journal.next_free >= bs->journal.used_start
            ? bs->journal.len-bs->journal.block_size - (bs->journal.next_free-bs->journal.used_start)
@@ -439,7 +440,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
        {
            journal_entry *je = (journal_entry*)(buf + proc_pos - done_pos + pos);
            if (je->magic != JOURNAL_MAGIC || je_crc32(je) != je->crc32 ||
-                je->type < JE_SMALL_WRITE || je->type > JE_DELETE || started && je->crc32_prev != crc32_last)
+                je->type < JE_MIN || je->type > JE_MAX || started && je->crc32_prev != crc32_last)
            {
                if (pos == 0)
                {
@@ -474,7 +475,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                if (location != je->small_write.data_offset)
                {
                    char err[1024];
-                    snprintf(err, 1024, "BUG: calculated journal data offset (%lu) != stored journal data offset (%lu)", location, je->small_write.data_offset);
+                    snprintf(err, 1024, "BUG: calculated journal data offset (%08lx) != stored journal data offset (%08lx)", location, je->small_write.data_offset);
                    throw std::runtime_error(err);
                }
                uint32_t data_crc32 = 0;
@@ -509,7 +510,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                if (data_crc32 != je->small_write.crc32_data)
                {
                    // journal entry is corrupt, stop here
-                    // interesting thing is that we must clear the corrupt entry if we're not readonly
+                    // interesting thing is that we must clear the corrupt entry if we're not readonly,
+                    // because we don't write next entries in the same journal block
+                    printf("Journal entry data is corrupt (data crc32 %x != %x)\n", data_crc32, je->small_write.crc32_data);
                    memset(buf + proc_pos - done_pos + pos, 0, bs->journal.block_size - pos);
                    bs->journal.next_free = prev_free;
                    init_write_buf = buf + proc_pos - done_pos;
@@ -518,7 +521,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                }
                auto clean_it = bs->clean_db.find(je->small_write.oid);
                if (clean_it == bs->clean_db.end() ||
-                    clean_it->second.version < je->big_write.version)
+                    clean_it->second.version < je->small_write.version)
                {
                    obj_ver_id ov = {
                        .oid = je->small_write.oid,
@@ -534,7 +537,10 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                    });
                    bs->journal.used_sectors[proc_pos]++;
 #ifdef BLOCKSTORE_DEBUG
-                    printf("journal offset %lu is used by %lu:%lu v%lu\n", proc_pos, ov.oid.inode, ov.oid.stripe, ov.version);
+                    printf(
+                        "journal offset %08lx is used by %lu:%lu v%lu (%lu refs)\n",
+                        proc_pos, ov.oid.inode, ov.oid.stripe, ov.version, bs->journal.used_sectors[proc_pos]
+                    );
 #endif
                    auto & unstab = bs->unstable_writes[ov.oid];
                    unstab = unstab < ov.version ? ov.version : unstab;
@@ -555,7 +561,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                        .version = je->big_write.version,
                    };
                    bs->dirty_db.emplace(ov, (dirty_entry){
-                        .state = ST_D_META_SYNCED,
+                        .state = ST_D_SYNCED,
                        .flags = 0,
                        .location = je->big_write.location,
                        .offset = je->big_write.offset,
@@ -581,33 +587,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                    .oid = je->stable.oid,
                    .version = je->stable.version,
                };
-                auto it = bs->dirty_db.find(ov);
-                if (it == bs->dirty_db.end())
-                {
-                    // journal contains a legitimate STABLE entry for a non-existing dirty write
-                    // this probably means that journal was trimmed between WRITE and STABLE entries
-                    // skip it
-                }
-                else
-                {
-                    while (1)
-                    {
-                        it->second.state = (it->second.state == ST_D_META_SYNCED
-                            ? ST_D_STABLE
-                            : (it->second.state == ST_DEL_SYNCED ? ST_DEL_STABLE : ST_J_STABLE));
-                        if (it == bs->dirty_db.begin())
-                            break;
-                        it--;
-                        if (it->first.oid != ov.oid || IS_STABLE(it->second.state))
-                            break;
-                    }
-                    bs->flusher->enqueue_flush(ov);
-                }
-                auto unstab_it = bs->unstable_writes.find(ov.oid);
-                if (unstab_it != bs->unstable_writes.end() && unstab_it->second <= ov.version)
-                {
-                    bs->unstable_writes.erase(unstab_it);
-                }
+                bs->mark_stable(ov);
            }
            else if (je->type == JE_ROLLBACK)
            {
@@ -615,70 +595,39 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                printf("je_rollback oid=%lu:%lu ver=%lu\n", je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version);
 #endif
                // rollback dirty writes of <oid> up to <version>
-                auto it = bs->dirty_db.lower_bound((obj_ver_id){
+                obj_ver_id ov = {
                    .oid = je->rollback.oid,
-                    .version = UINT64_MAX,
-                });
-                if (it != bs->dirty_db.begin())
-                {
-                    uint64_t max_unstable = 0;
-                    auto rm_start = it;
-                    auto rm_end = it;
-                    it--;
-                    while (it->first.oid == je->rollback.oid &&
-                        it->first.version > je->rollback.version &&
-                        !IS_IN_FLIGHT(it->second.state) &&
-                        !IS_STABLE(it->second.state))
-                    {
-                        if (it->first.oid != je->rollback.oid)
-                            break;
-                        else if (it->first.version <= je->rollback.version)
-                        {
-                            if (!IS_STABLE(it->second.state))
-                                max_unstable = it->first.version;
-                            break;
-                        }
-                        else if (IS_STABLE(it->second.state))
-                            break;
-                        // Remove entry
-                        rm_start = it;
-                        if (it == bs->dirty_db.begin())
-                            break;
-                        it--;
-                    }
-                    if (rm_start != rm_end)
-                    {
-                        bs->erase_dirty(rm_start, rm_end, UINT64_MAX);
-                    }
-                    auto unstab_it = bs->unstable_writes.find(je->rollback.oid);
-                    if (unstab_it != bs->unstable_writes.end())
-                    {
-                        if (max_unstable == 0)
-                            bs->unstable_writes.erase(unstab_it);
-                        else
-                            unstab_it->second = max_unstable;
-                    }
-                }
+                    .version = je->rollback.version,
+                };
+                bs->mark_rolled_back(ov);
            }
            else if (je->type == JE_DELETE)
            {
 #ifdef BLOCKSTORE_DEBUG
                printf("je_delete oid=%lu:%lu ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version);
 #endif
-                // oid, version
-                obj_ver_id ov = {
-                    .oid = je->del.oid,
-                    .version = je->del.version,
-                };
-                bs->dirty_db.emplace(ov, (dirty_entry){
-                    .state = ST_DEL_SYNCED,
-                    .flags = 0,
-                    .location = 0,
-                    .offset = 0,
-                    .len = 0,
-                    .journal_sector = proc_pos,
-                });
-                bs->journal.used_sectors[proc_pos]++;
+                auto clean_it = bs->clean_db.find(je->del.oid);
+                if (clean_it == bs->clean_db.end() ||
+                    clean_it->second.version < je->del.version)
+                {
+                    // oid, version
+                    obj_ver_id ov = {
+                        .oid = je->del.oid,
+                        .version = je->del.version,
+                    };
+                    bs->dirty_db.emplace(ov, (dirty_entry){
+                        .state = ST_DEL_SYNCED,
+                        .flags = 0,
+                        .location = 0,
+                        .offset = 0,
+                        .len = 0,
+                        .journal_sector = proc_pos,
+                    });
+                    bs->journal.used_sectors[proc_pos]++;
+                    // Deletions are treated as immediately stable, because
+                    // "2-phase commit" (write->stabilize) isn't sufficient for them anyway
+                    bs->mark_stable(ov);
+                }
            }
            started = true;
            pos += je->size;
--- a/blockstore_journal.cpp
+++ b/blockstore_journal.cpp
@@ -6,18 +6,24 @@ blockstore_journal_check_t::blockstore_journal_check_t(blockstore_impl_t *bs)
    sectors_required = 0;
    next_pos = bs->journal.next_free;
    next_sector = bs->journal.cur_sector;
+    first_sector = -1;
    next_in_pos = bs->journal.in_sector_pos;
    right_dir = next_pos >= bs->journal.used_start;
 }

 // Check if we can write <required> entries of <size> bytes and <data_after> data bytes after them to the journal
-int blockstore_journal_check_t::check_available(blockstore_op_t *op, int required, int size, int data_after)
+int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries_required, int size, int data_after)
 {
+    int required = entries_required;
    while (1)
    {
        int fits = (bs->journal.block_size - next_in_pos) / size;
        if (fits > 0)
        {
+            if (first_sector == -1)
+            {
+                first_sector = next_sector;
+            }
            required -= fits;
            next_in_pos += fits * size;
            sectors_required++;
@@ -38,19 +44,40 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int require
            right_dir = false;
        }
        next_in_pos = 0;
-        if (bs->journal.sector_info[next_sector].usage_count > 0 ||
-            bs->journal.sector_info[next_sector].dirty)
+        next_sector = ((next_sector + 1) % bs->journal.sector_count);
+        if (next_sector == first_sector)
        {
-            next_sector = ((next_sector + 1) % bs->journal.sector_count);
+            // next_sector may wrap when all sectors are flushed and the incoming batch is too big
+            // This is an error condition, we can't wait for anything in this case
+            throw std::runtime_error(
+                "Blockstore journal_sector_buffer_count="+std::to_string(bs->journal.sector_count)+
+                " is too small for a batch of "+std::to_string(entries_required)+" entries of "+std::to_string(size)+" bytes"
+            );
        }
        if (bs->journal.sector_info[next_sector].usage_count > 0 ||
            bs->journal.sector_info[next_sector].dirty)
        {
            // No memory buffer available. Wait for it.
-#ifdef BLOCKSTORE_DEBUG
-            printf("next journal buffer %d is still dirty=%d used=%d\n", next_sector,
-                bs->journal.sector_info[next_sector].dirty, bs->journal.sector_info[next_sector].usage_count);
-#endif
+            int used = 0, dirty = 0;
+            for (int i = 0; i < bs->journal.sector_count; i++)
+            {
+                if (bs->journal.sector_info[i].dirty)
+                {
+                    dirty++;
+                    used++;
+                }
+                if (bs->journal.sector_info[i].usage_count > 0)
+                {
+                    used++;
+                }
+            }
+            // In fact, it's even more rare than "ran out of journal space", so print a warning
+            printf(
+                "Ran out of journal sector buffers: %d/%lu buffers used (%d dirty), next buffer (%ld) is %s and flushed %lu times\n",
+                used, bs->journal.sector_count, dirty, next_sector,
+                bs->journal.sector_info[next_sector].dirty ? "dirty" : "not dirty",
+                bs->journal.sector_info[next_sector].usage_count
+            );
            PRIV(op)->wait_for = WAIT_JOURNAL_BUFFER;
            return 0;
        }
@@ -74,7 +101,7 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int require
                : bs->journal.used_start - bs->journal.next_free)
        );
        PRIV(op)->wait_for = WAIT_JOURNAL;
-        bs->flusher->force_start();
+        bs->flusher->request_trim();
        PRIV(op)->wait_detail = bs->journal.used_start;
        return 0;
    }
@@ -91,6 +118,11 @@ journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type,
        {
            // Also select next sector buffer in memory
            journal.cur_sector = ((journal.cur_sector + 1) % journal.sector_count);
+            assert(!journal.sector_info[journal.cur_sector].usage_count);
+        }
+        else
+        {
+            journal.dirty_start = journal.next_free;
        }
        journal.sector_info[journal.cur_sector].offset = journal.next_free;
        journal.in_sector_pos = 0;
@@ -148,8 +180,8 @@ bool journal_t::trim()
    auto journal_used_it = used_sectors.lower_bound(used_start);
 #ifdef BLOCKSTORE_DEBUG
    printf(
-        "Trimming journal (used_start=%lu, next_free=%lu, first_used=%lu, usage_count=%lu)\n",
-        used_start, next_free,
+        "Trimming journal (used_start=%08lx, next_free=%08lx, dirty_start=%08lx, new_start=%08lx, new_refcount=%ld)\n",
+        used_start, next_free, dirty_start,
        journal_used_it == used_sectors.end() ? 0 : journal_used_it->first,
        journal_used_it == used_sectors.end() ? 0 : journal_used_it->second
    );
@@ -180,7 +212,7 @@ bool journal_t::trim()
        return false;
    }
 #ifdef BLOCKSTORE_DEBUG
-    printf("Journal trimmed to %lu (next_free=%lu)\n", used_start, next_free);
+    printf("Journal trimmed to %08lx (next_free=%08lx)\n", used_start, next_free);
 #endif
    return true;
 }
--- a/blockstore_journal.h
+++ b/blockstore_journal.h
@@ -12,12 +12,14 @@
 // Journal entries
 // Journal entries are linked to each other by their crc32 value
 // The journal is almost a blockchain, because object versions constantly increase
+#define JE_MIN         0x01
 #define JE_START       0x01
 #define JE_SMALL_WRITE 0x02
 #define JE_BIG_WRITE   0x03
 #define JE_STABLE      0x04
 #define JE_DELETE      0x05
 #define JE_ROLLBACK    0x06
+#define JE_MAX         0x06

 // crc32c comes first to ease calculation and is equal to crc32()
 struct __attribute__((__packed__)) journal_entry_start
@@ -135,10 +137,14 @@ struct journal_t
    bool inmemory = false;
    void *buffer = NULL;

-    uint64_t block_size = 512;
+    uint64_t block_size;
    uint64_t offset, len;
+    // Next free block offset
    uint64_t next_free = 0;
+    // First occupied block offset
    uint64_t used_start = 0;
+    // End of the last block not used for writing anymore
+    uint64_t dirty_start = 0;
    uint32_t crc32_last = 0;

    // Current sector(s) used for writing
@@ -160,7 +166,7 @@ struct blockstore_journal_check_t
 {
    blockstore_impl_t *bs;
    uint64_t next_pos, next_sector, next_in_pos;
-    int sectors_required;
+    int sectors_required, first_sector;
    bool right_dir; // writing to the end or the beginning of the ring buffer

    blockstore_journal_check_t(blockstore_impl_t *bs);
--- a/blockstore_open.cpp
+++ b/blockstore_open.cpp
@@ -1,3 +1,4 @@
+#include <sys/file.h>
 #include "blockstore_impl.h"

 static uint32_t is_power_of_two(uint64_t value)
@@ -34,10 +35,23 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
    {
        disable_journal_fsync = true;
    }
+    if (config["disable_device_lock"] == "true" || config["disable_device_lock"] == "1" || config["disable_device_lock"] == "yes")
+    {
+        disable_flock = true;
+    }
+    if (config["immediate_commit"] == "all")
+    {
+        immediate_commit = IMMEDIATE_ALL;
+    }
+    else if (config["immediate_commit"] == "small")
+    {
+        immediate_commit = IMMEDIATE_SMALL;
+    }
    metadata_buf_size = strtoull(config["meta_buf_size"].c_str(), NULL, 10);
    cfg_journal_size = strtoull(config["journal_size"].c_str(), NULL, 10);
    data_device = config["data_device"];
    data_offset = strtoull(config["data_offset"].c_str(), NULL, 10);
+    cfg_data_size = strtoull(config["data_size"].c_str(), NULL, 10);
    meta_device = config["meta_device"];
    meta_offset = strtoull(config["meta_offset"].c_str(), NULL, 10);
    block_size = strtoull(config["block_size"].c_str(), NULL, 10);
@@ -66,7 +80,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
    }
    if (!disk_alignment)
    {
-        disk_alignment = 512;
+        disk_alignment = 4096;
    }
    else if (disk_alignment % MEM_ALIGNMENT)
    {
@@ -74,7 +88,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
    }
    if (!journal_block_size)
    {
-        journal_block_size = 512;
+        journal_block_size = 4096;
    }
    else if (journal_block_size % MEM_ALIGNMENT)
    {
@@ -82,7 +96,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
    }
    if (!meta_block_size)
    {
-        meta_block_size = 512;
+        meta_block_size = 4096;
    }
    else if (meta_block_size % MEM_ALIGNMENT)
    {
@@ -128,6 +142,22 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
    {
        metadata_buf_size = 4*1024*1024;
    }
+    if (meta_device == "")
+    {
+        disable_meta_fsync = disable_data_fsync;
+    }
+    if (journal_device == "")
+    {
+        disable_journal_fsync = disable_meta_fsync;
+    }
+    if (immediate_commit != IMMEDIATE_NONE && !disable_journal_fsync)
+    {
+        throw std::runtime_error("immediate_commit requires disable_journal_fsync");
+    }
+    if (immediate_commit == IMMEDIATE_ALL && !disable_data_fsync)
+    {
+        throw std::runtime_error("immediate_commit=all requires disable_journal_fsync and disable_data_fsync");
+    }
    // init some fields
    clean_entry_bitmap_size = block_size / bitmap_granularity / 8;
    clean_entry_size = sizeof(clean_disk_entry) + clean_entry_bitmap_size;
@@ -151,6 +181,15 @@ void blockstore_impl_t::calc_lengths()
        data_len = data_len < journal.offset-data_offset
            ? data_len : journal.offset-data_offset;
    }
+    if (cfg_data_size != 0)
+    {
+        if (data_len < cfg_data_size)
+        {
+            throw std::runtime_error("Data area ("+std::to_string(data_len)+
+                " bytes) is less than configured size ("+std::to_string(cfg_data_size)+" bytes)");
+        }
+        data_len = cfg_data_size;
+    }
    // meta
    meta_area = (meta_fd == data_fd ? data_size : meta_size) - meta_offset;
    if (meta_fd == data_fd && meta_offset <= data_offset)
@@ -252,6 +291,10 @@ void blockstore_impl_t::open_data()
    {
        throw std::runtime_error("data_offset exceeds device size = "+std::to_string(data_size));
    }
+    if (!disable_flock && flock(data_fd, LOCK_EX|LOCK_NB) != 0)
+    {
+        throw std::runtime_error(std::string("Failed to lock data device: ") + strerror(errno));
+    }
 }

 void blockstore_impl_t::open_meta()
@@ -269,11 +312,14 @@ void blockstore_impl_t::open_meta()
        {
            throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(meta_size));
        }
+        if (!disable_flock && flock(meta_fd, LOCK_EX|LOCK_NB) != 0)
+        {
+            throw std::runtime_error(std::string("Failed to lock metadata device: ") + strerror(errno));
+        }
    }
    else
    {
        meta_fd = data_fd;
-        disable_meta_fsync = disable_data_fsync;
        meta_size = 0;
        if (meta_offset >= data_size)
        {
@@ -291,12 +337,15 @@ void blockstore_impl_t::open_journal()
        {
            throw std::runtime_error("Failed to open journal device");
        }
-        check_size(journal.fd, &journal.device_size, "metadata device");
+        check_size(journal.fd, &journal.device_size, "journal device");
+        if (!disable_flock && flock(journal.fd, LOCK_EX|LOCK_NB) != 0)
+        {
+            throw std::runtime_error(std::string("Failed to lock journal device: ") + strerror(errno));
+        }
    }
    else
    {
        journal.fd = meta_fd;
-        disable_journal_fsync = disable_meta_fsync;
        journal.device_size = 0;
        if (journal.offset >= data_size)
        {
--- a/blockstore_read.cpp
+++ b/blockstore_read.cpp
@@ -8,12 +8,10 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_
        // Zero-length version - skip
        return 1;
    }
-    if (IS_IN_FLIGHT(item_state))
+    else if (IS_IN_FLIGHT(item_state))
    {
-        // Pause until it's written somewhere
-        PRIV(op)->wait_for = WAIT_IN_FLIGHT;
-        PRIV(op)->wait_detail = item_version;
-        return 0;
+        // Write not finished yet - skip
+        return 1;
    }
    else if (IS_DELETE(item_state))
    {
@@ -133,63 +131,66 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
            dirty_it--;
        }
    }
-    if (clean_it != clean_db.end() && fulfilled < read_op->len)
+    if (clean_it != clean_db.end())
    {
        if (!result_version)
        {
            result_version = clean_it->second.version;
        }
-        if (!clean_entry_bitmap_size)
+        if (fulfilled < read_op->len)
        {
-            if (!fulfill_read(read_op, fulfilled, 0, block_size, ST_CURRENT, 0, clean_it->second.location))
+            if (!clean_entry_bitmap_size)
            {
-                // need to wait. undo added requests, don't dequeue op
-                PRIV(read_op)->read_vec.clear();
-                return 0;
-            }
-        }
-        else
-        {
-            uint64_t meta_loc = clean_it->second.location >> block_order;
-            uint8_t *clean_entry_bitmap;
-            if (inmemory_meta)
-            {
-                uint64_t sector = (meta_loc / (meta_block_size / clean_entry_size)) * meta_block_size;
-                uint64_t pos = (meta_loc % (meta_block_size / clean_entry_size));
-                clean_entry_bitmap = (uint8_t*)(metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry));
+                if (!fulfill_read(read_op, fulfilled, 0, block_size, ST_CURRENT, 0, clean_it->second.location))
+                {
+                    // need to wait. undo added requests, don't dequeue op
+                    PRIV(read_op)->read_vec.clear();
+                    return 0;
+                }
            }
            else
            {
-                clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*clean_entry_bitmap_size);
-            }
-            uint64_t bmp_start = 0, bmp_end = 0, bmp_size = block_size/bitmap_granularity;
-            while (bmp_start < bmp_size)
-            {
-                while (!(clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7))) && bmp_end < bmp_size)
+                uint64_t meta_loc = clean_it->second.location >> block_order;
+                uint8_t *clean_entry_bitmap;
+                if (inmemory_meta)
                {
-                    bmp_end++;
+                    uint64_t sector = (meta_loc / (meta_block_size / clean_entry_size)) * meta_block_size;
+                    uint64_t pos = (meta_loc % (meta_block_size / clean_entry_size));
+                    clean_entry_bitmap = (uint8_t*)(metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry));
                }
-                if (bmp_end > bmp_start)
+                else
                {
-                    // fill with zeroes
-                    fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
-                        bmp_end * bitmap_granularity, ST_DEL_STABLE, 0, 0);
+                    clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*clean_entry_bitmap_size);
                }
-                bmp_start = bmp_end;
-                while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size)
+                uint64_t bmp_start = 0, bmp_end = 0, bmp_size = block_size/bitmap_granularity;
+                while (bmp_start < bmp_size)
                {
-                    bmp_end++;
-                }
-                if (bmp_end > bmp_start)
-                {
-                    if (!fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
-                        bmp_end * bitmap_granularity, ST_CURRENT, 0, clean_it->second.location + bmp_start * bitmap_granularity))
+                    while (!(clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7))) && bmp_end < bmp_size)
                    {
-                        // need to wait. undo added requests, don't dequeue op
-                        PRIV(read_op)->read_vec.clear();
-                        return 0;
+                        bmp_end++;
+                    }
+                    if (bmp_end > bmp_start)
+                    {
+                        // fill with zeroes
+                        fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
+                            bmp_end * bitmap_granularity, ST_DEL_STABLE, 0, 0);
                    }
                    bmp_start = bmp_end;
+                    while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size)
+                    {
+                        bmp_end++;
+                    }
+                    if (bmp_end > bmp_start)
+                    {
+                        if (!fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
+                            bmp_end * bitmap_granularity, ST_CURRENT, 0, clean_it->second.location + bmp_start * bitmap_granularity))
+                        {
+                            // need to wait. undo added requests, don't dequeue op
+                            PRIV(read_op)->read_vec.clear();
+                            return 0;
+                        }
+                        bmp_start = bmp_end;
+                    }
                }
            }
        }
--- a/blockstore_rollback.cpp
+++ b/blockstore_rollback.cpp
@@ -2,6 +2,10 @@

 int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
 {
+    if (PRIV(op)->op_state)
+    {
+        return continue_rollback(op);
+    }
    obj_ver_id* v;
    int i, todo = op->len;
    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
@@ -14,8 +18,13 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
        });
        if (dirty_it == dirty_db.begin())
        {
+            if (v->version == 0)
+            {
+                // Already rolled back
+                // FIXME Skip this object version
+            }
        bad_op:
-            op->retval = -EINVAL;
+            op->retval = -ENOENT;
            FINISH_OP(op);
            return 1;
        }
@@ -31,7 +40,9 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
                if (!IS_SYNCED(dirty_it->second.state) ||
                    IS_STABLE(dirty_it->second.state))
                {
-                    goto bad_op;
+                    op->retval = -EBUSY;
+                    FINISH_OP(op);
+                    return 1;
                }
                if (dirty_it == dirty_db.begin())
                {
@@ -60,39 +71,12 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
        journal.sector_info[journal.cur_sector].dirty)
    {
        if (cur_sector == -1)
-            PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
+            PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
        cur_sector = journal.cur_sector;
        prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
    }
    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
    {
-        // FIXME This is here only for the purpose of tracking unstable_writes. Remove if not required
-        // FIXME ...aaaand this is similar to blockstore_init.cpp - maybe dedup it?
-        auto dirty_it = dirty_db.lower_bound((obj_ver_id){
-            .oid = v->oid,
-            .version = UINT64_MAX,
-        });
-        uint64_t max_unstable = 0;
-        while (dirty_it != dirty_db.begin())
-        {
-            dirty_it--;
-            if (dirty_it->first.oid != v->oid)
-                break;
-            else if (dirty_it->first.version <= v->version)
-            {
-                if (!IS_STABLE(dirty_it->second.state))
-                    max_unstable = dirty_it->first.version;
-                break;
-            }
-        }
-        auto unstab_it = unstable_writes.find(v->oid);
-        if (unstab_it != unstable_writes.end())
-        {
-            if (max_unstable == 0)
-                unstable_writes.erase(unstab_it);
-            else
-                unstab_it->second = max_unstable;
-        }
        journal_entry_rollback *je = (journal_entry_rollback*)
            prefill_single_journal_entry(journal, JE_ROLLBACK, sizeof(journal_entry_rollback));
        journal.sector_info[journal.cur_sector].dirty = false;
@@ -103,21 +87,117 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
        if (cur_sector != journal.cur_sector)
        {
            if (cur_sector == -1)
-                PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
+                PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
            cur_sector = journal.cur_sector;
            prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
        }
    }
-    PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
+    PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
    PRIV(op)->pending_ops = s;
+    PRIV(op)->op_state = 1;
+    inflight_writes++;
    return 1;
 }

+int blockstore_impl_t::continue_rollback(blockstore_op_t *op) // resume a rollback op at the step selected by PRIV(op)->op_state
+{
+    if (PRIV(op)->op_state == 2) // journal writes submitted by dequeue_rollback have all completed (state 1 -> 2)
+        goto resume_2;
+    else if (PRIV(op)->op_state == 3) // NOTE(review): nothing visible here sets state 3; it only skips the sector release
+        goto resume_3;
+    else if (PRIV(op)->op_state == 5) // journal fsync has completed (state 4 -> 5)
+        goto resume_5;
+    else
+        return 1; // other states (1, 4): completions still pending, nothing to do yet
+resume_2:
+    // Release used journal sectors
+    release_journal_sectors(op);
+resume_3:
+    if (!disable_journal_fsync)
+    {
+        io_uring_sqe *sqe = get_sqe();
+        if (!sqe)
+        {
+            return 0; // no free SQE: report "not done" so the caller can retry the op later
+        }
+        ring_data_t *data = ((ring_data_t*)sqe->user_data);
+        my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
+        data->iov = { 0 }; // handle_rollback_event compares data->res with data->iov.iov_len
+        data->callback = [this, op](ring_data_t *data) { handle_rollback_event(data, op); };
+        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
+        PRIV(op)->pending_ops = 1;
+        PRIV(op)->op_state = 4; // becomes 5 when handle_rollback_event sees pending_ops drop to 0
+        return 1;
+    }
+resume_5: // also reached by fall-through when disable_journal_fsync is set
+    obj_ver_id* v;
+    int i;
+    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++) // op->buf is read as op->len obj_ver_id items
+    {
+        mark_rolled_back(*v);
+    }
+    journal.trim();
+    inflight_writes--; // pairs with inflight_writes++ at the end of dequeue_rollback
+    // Acknowledge op
+    op->retval = 0;
+    FINISH_OP(op);
+    return 1;
+}
+
+// Erase dirty entries of <ov.oid> newer than <ov.version>, stopping at the first in-flight or stable one.
+// An existing unstable_writes entry is set to the newest version <= ov.version if that one is unstable, else erased.
+void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
+{
+    auto it = dirty_db.lower_bound((obj_ver_id){
+        .oid = ov.oid,
+        .version = UINT64_MAX,
+    });
+    if (it != dirty_db.begin())
+    {
+        uint64_t max_unstable = 0;
+        auto rm_start = it;
+        auto rm_end = it;
+        it--;
+        while (1)
+        {
+            if (it->first.oid != ov.oid)
+                break;
+            else if (it->first.version <= ov.version)
+            {
+                // Rollback target reached: it stays in dirty_db, remember it if it is still unstable
+                if (!IS_STABLE(it->second.state))
+                    max_unstable = it->first.version;
+                break;
+            }
+            else if (IS_IN_FLIGHT(it->second.state) || IS_STABLE(it->second.state))
+                break;
+            // Remove entry
+            rm_start = it;
+            if (it == dirty_db.begin())
+                break;
+            it--;
+        }
+        if (rm_start != rm_end)
+        {
+            erase_dirty(rm_start, rm_end, UINT64_MAX);
+        }
+        auto unstab_it = unstable_writes.find(ov.oid);
+        if (unstab_it != unstable_writes.end())
+        {
+            if (max_unstable == 0)
+                unstable_writes.erase(unstab_it);
+            else
+                unstab_it->second = max_unstable;
+        }
+    }
+}
+
 void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t *op)
 {
    live = true;
    if (data->res != data->iov.iov_len)
    {
+        inflight_writes--;
        throw std::runtime_error(
            "write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
            "). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
@@ -126,37 +206,11 @@ void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t
    PRIV(op)->pending_ops--;
    if (PRIV(op)->pending_ops == 0)
    {
-        // Release used journal sectors
-        release_journal_sectors(op);
-        obj_ver_id* v;
-        int i;
-        for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
+        PRIV(op)->op_state++;
+        if (!continue_rollback(op))
        {
-            // Erase dirty_db entries
-            auto rm_end = dirty_db.lower_bound((obj_ver_id){
-                .oid = v->oid,
-                .version = UINT64_MAX,
-            });
-            rm_end--;
-            auto rm_start = rm_end;
-            while (1)
-            {
-                if (rm_end->first.oid != v->oid)
-                    break;
-                else if (rm_end->first.version <= v->version)
-                    break;
-                rm_start = rm_end;
-                if (rm_end == dirty_db.begin())
-                    break;
-                rm_end--;
-            }
-            if (rm_end != rm_start)
-                erase_dirty(rm_start, rm_end, UINT64_MAX);
+            submit_queue.push_front(op);
        }
-        journal.trim();
-        // Acknowledge op
-        op->retval = 0;
-        FINISH_OP(op);
    }
 }

@@ -173,11 +227,13 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
 #endif
            data_alloc->set(dirty_it->second.location >> block_order, false);
        }
-#ifdef BLOCKSTORE_DEBUG
-        printf("remove usage of journal offset %lu by %lu:%lu v%lu\n", dirty_it->second.journal_sector,
-            dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
-#endif
        int used = --journal.used_sectors[dirty_it->second.journal_sector];
+#ifdef BLOCKSTORE_DEBUG
+        printf(
+            "remove usage of journal offset %08lx by %lu:%lu v%lu (%d refs)\n", dirty_it->second.journal_sector,
+            dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, used
+        );
+#endif
        if (used == 0)
        {
            journal.used_sectors.erase(dirty_it->second.journal_sector);
--- a/blockstore_stable.cpp
+++ b/blockstore_stable.cpp
@@ -40,6 +40,10 @@

 int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
 {
+    if (PRIV(op)->op_state)
+    {
+        return continue_stable(op);
+    }
    obj_ver_id* v;
    int i, todo = 0;
    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
@@ -51,7 +55,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
            if (clean_it == clean_db.end() || clean_it->second.version < v->version)
            {
                // No such object version
-                op->retval = -EINVAL;
+                op->retval = -ENOENT;
                FINISH_OP(op);
                return 1;
            }
@@ -63,7 +67,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
        else if (IS_UNSYNCED(dirty_it->second.state))
        {
            // Object not synced yet. Caller must sync it first
-            op->retval = EAGAIN;
+            op->retval = -EBUSY;
            FINISH_OP(op);
            return 1;
        }
@@ -98,18 +102,13 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
        journal.sector_info[journal.cur_sector].dirty)
    {
        if (cur_sector == -1)
-            PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
+            PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
        cur_sector = journal.cur_sector;
        prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
    }
    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
    {
-        auto unstab_it = unstable_writes.find(v->oid);
-        if (unstab_it != unstable_writes.end() &&
-            unstab_it->second <= v->version)
-        {
-            unstable_writes.erase(unstab_it);
-        }
+        // FIXME: Only stabilize versions that aren't stable yet
        journal_entry_stable *je = (journal_entry_stable*)
            prefill_single_journal_entry(journal, JE_STABLE, sizeof(journal_entry_stable));
        journal.sector_info[journal.cur_sector].dirty = false;
@@ -120,21 +119,121 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
        if (cur_sector != journal.cur_sector)
        {
            if (cur_sector == -1)
-                PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
+                PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
            cur_sector = journal.cur_sector;
            prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
        }
    }
-    PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
+    PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
    PRIV(op)->pending_ops = s;
+    PRIV(op)->op_state = 1;
+    inflight_writes++;
    return 1;
 }

+int blockstore_impl_t::continue_stable(blockstore_op_t *op) // Resume a stabilize op at the step chosen by op_state; returns 0 when no SQE was available (retry later), 1 otherwise
+{
+    if (PRIV(op)->op_state == 2) // 2 = the journal writes submitted by dequeue_stable (state 1) have all completed
+        goto resume_2;
+    else if (PRIV(op)->op_state == 3) // NOTE(review): nothing visible here assigns state 3 - confirm this entry point is still used
+        goto resume_3;
+    else if (PRIV(op)->op_state == 5) // 5 = the journal fsync submitted below (state 4) has completed
+        goto resume_5;
+    else
+        return 1; // any other state (e.g. 1 or 4, set while I/O is pending): nothing to resume
+resume_2:
+    // Journal sector writes are done: release the journal sectors flushed by this op
+    release_journal_sectors(op);
+resume_3:
+    if (!disable_journal_fsync) // fsync the journal before marking anything stable, unless journal fsync is disabled
+    {
+        { // unconditional timing trace printed right before get_sqe()
+            timespec now;
+            clock_gettime(CLOCK_REALTIME, &now);
+            printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
+        }
+        io_uring_sqe *sqe = get_sqe();
+        if (!sqe)
+        {
+            return 0; // no SQE: op_state is left unchanged, so the same step is re-entered on retry
+        }
+        ring_data_t *data = ((ring_data_t*)sqe->user_data);
+        my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
+        data->iov = { 0 }; // zeroed iov, so the res != iov_len check in handle_stable_event expects res == 0
+        data->callback = [this, op](ring_data_t *data) { handle_stable_event(data, op); };
+        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0; // 0 = no journal sectors attributed to the fsync
+        PRIV(op)->pending_ops = 1;
+        PRIV(op)->op_state = 4; // handle_stable_event increments this to 5 when the fsync completes
+        return 1;
+    }
+resume_5: // also reached by fall-through when disable_journal_fsync is set
+    // Mark dirty_db entries as stable, acknowledge op completion
+    obj_ver_id* v;
+    int i;
+    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++) // op->buf is read as an array of op->len obj_ver_id items
+    {
+        // Mark the dirty_db entries of this object up to v->version as stable
+        mark_stable(*v);
+    }
+    inflight_writes--; // balances the inflight_writes++ done at the end of dequeue_stable
+    // Acknowledge op
+    op->retval = 0;
+    FINISH_OP(op);
+    return 1;
+}
+
+void blockstore_impl_t::mark_stable(const obj_ver_id & v) // Promote v and the preceding dirty_db entries of the same object from *_SYNCED to *_STABLE, enqueue a flush of v, and drop the object's unstable_writes record if it is not newer than v
+{
+    auto dirty_it = dirty_db.find(v);
+    if (dirty_it != dirty_db.end()) // state promotion and flush are skipped when v has no dirty_db entry
+    {
+        while (1) // walk backwards from v; presumably dirty_db is ordered by (oid, version) - confirm
+        {
+            if (dirty_it->second.state == ST_J_SYNCED)
+            {
+                dirty_it->second.state = ST_J_STABLE;
+            }
+            else if (dirty_it->second.state == ST_D_SYNCED)
+            {
+                dirty_it->second.state = ST_D_STABLE;
+            }
+            else if (dirty_it->second.state == ST_DEL_SYNCED)
+            {
+                dirty_it->second.state = ST_DEL_STABLE;
+            }
+            else if (IS_STABLE(dirty_it->second.state))
+            {
+                break; // reached an already stable entry: stop walking
+            }
+            if (dirty_it == dirty_db.begin()) // entries in any other state are left unchanged and the walk continues
+            {
+                break; // no earlier entry exists
+            }
+            dirty_it--;
+            if (dirty_it->first.oid != v.oid)
+            {
+                break; // stepped onto a different object
+            }
+        }
+#ifdef BLOCKSTORE_DEBUG
+        printf("enqueue_flush %lu:%lu v%lu\n", v.oid.inode, v.oid.stripe, v.version);
+#endif
+        flusher->enqueue_flush(v); // hand the stabilized version over to the flusher
+    }
+    auto unstab_it = unstable_writes.find(v.oid); // this part runs even when v was not found in dirty_db
+    if (unstab_it != unstable_writes.end() &&
+        unstab_it->second <= v.version) // keep the record if a version newer than v is still unstable
+    {
+        unstable_writes.erase(unstab_it);
+    }
+}
+
 void blockstore_impl_t::handle_stable_event(ring_data_t *data, blockstore_op_t *op)
 {
    live = true;
    if (data->res != data->iov.iov_len)
    {
+        inflight_writes--;
        throw std::runtime_error(
            "write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
            "). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
@@ -143,53 +242,15 @@ void blockstore_impl_t::handle_stable_event(ring_data_t *data, blockstore_op_t *
    PRIV(op)->pending_ops--;
    if (PRIV(op)->pending_ops == 0)
    {
-        // Release used journal sectors
-        release_journal_sectors(op);
-        // Mark dirty_db entries as stable, acknowledge op completion
-        obj_ver_id* v;
-        int i;
-        for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
        {
-            // Mark all dirty_db entries up to op->version as stable
-            auto dirty_it = dirty_db.find(*v);
-            if (dirty_it != dirty_db.end())
-            {
-                while (1)
-                {
-                    if (dirty_it->second.state == ST_J_SYNCED)
-                    {
-                        dirty_it->second.state = ST_J_STABLE;
-                    }
-                    else if (dirty_it->second.state == ST_D_META_SYNCED)
-                    {
-                        dirty_it->second.state = ST_D_STABLE;
-                    }
-                    else if (dirty_it->second.state == ST_DEL_SYNCED)
-                    {
-                        dirty_it->second.state = ST_DEL_STABLE;
-                    }
-                    else if (IS_STABLE(dirty_it->second.state))
-                    {
-                        break;
-                    }
-                    if (dirty_it == dirty_db.begin())
-                    {
-                        break;
-                    }
-                    dirty_it--;
-                    if (dirty_it->first.oid != v->oid)
-                    {
-                        break;
-                    }
-                }
-#ifdef BLOCKSTORE_DEBUG
-                printf("enqueue_flush %lu:%lu v%lu\n", v->oid.inode, v->oid.stripe, v->version);
-#endif
-                flusher->enqueue_flush(*v);
-            }
+            timespec now;
+            clock_gettime(CLOCK_REALTIME, &now);
+            printf("finished %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
+        }
+        PRIV(op)->op_state++;
+        if (!continue_stable(op))
+        {
+            submit_queue.push_front(op);
        }
-        // Acknowledge op
-        op->retval = 0;
-        FINISH_OP(op);
    }
 }
--- a/blockstore_sync.cpp
+++ b/blockstore_sync.cpp
@@ -11,7 +11,7 @@

 int blockstore_impl_t::dequeue_sync(blockstore_op_t *op)
 {
-    if (PRIV(op)->sync_state == 0)
+    if (PRIV(op)->op_state == 0)
    {
        stop_sync_submitted = false;
        PRIV(op)->sync_big_writes.swap(unsynced_big_writes);
@@ -21,11 +21,11 @@ int blockstore_impl_t::dequeue_sync(blockstore_op_t *op)
        unsynced_big_writes.clear();
        unsynced_small_writes.clear();
        if (PRIV(op)->sync_big_writes.size() > 0)
-            PRIV(op)->sync_state = SYNC_HAS_BIG;
+            PRIV(op)->op_state = SYNC_HAS_BIG;
        else if (PRIV(op)->sync_small_writes.size() > 0)
-            PRIV(op)->sync_state = SYNC_HAS_SMALL;
+            PRIV(op)->op_state = SYNC_HAS_SMALL;
        else
-            PRIV(op)->sync_state = SYNC_DONE;
+            PRIV(op)->op_state = SYNC_DONE;
        // Always add sync to in_progress_syncs because we clear unsynced_big_writes and unsynced_small_writes
        PRIV(op)->prev_sync_count = in_progress_syncs.size();
        PRIV(op)->in_progress_ptr = in_progress_syncs.insert(in_progress_syncs.end(), op);
@@ -38,7 +38,7 @@ int blockstore_impl_t::dequeue_sync(blockstore_op_t *op)
 int blockstore_impl_t::continue_sync(blockstore_op_t *op)
 {
    auto cb = [this, op](ring_data_t *data) { handle_sync_event(data, op); };
-    if (PRIV(op)->sync_state == SYNC_HAS_SMALL)
+    if (PRIV(op)->op_state == SYNC_HAS_SMALL)
    {
        // No big writes, just fsync the journal
        for (; PRIV(op)->sync_small_checked < PRIV(op)->sync_small_writes.size(); PRIV(op)->sync_small_checked++)
@@ -54,17 +54,17 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
            // Write out the last journal sector if it happens to be dirty
            BS_SUBMIT_GET_ONLY_SQE(sqe);
            prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
-            PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
+            PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
            PRIV(op)->pending_ops = 1;
-            PRIV(op)->sync_state = SYNC_JOURNAL_WRITE_SENT;
+            PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT;
            return 1;
        }
        else
        {
-            PRIV(op)->sync_state = SYNC_JOURNAL_WRITE_DONE;
+            PRIV(op)->op_state = SYNC_JOURNAL_WRITE_DONE;
        }
    }
-    if (PRIV(op)->sync_state == SYNC_HAS_BIG)
+    if (PRIV(op)->op_state == SYNC_HAS_BIG)
    {
        for (; PRIV(op)->sync_big_checked < PRIV(op)->sync_big_writes.size(); PRIV(op)->sync_big_checked++)
        {
@@ -81,17 +81,17 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
            my_uring_prep_fsync(sqe, data_fd, IORING_FSYNC_DATASYNC);
            data->iov = { 0 };
            data->callback = cb;
-            PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
+            PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
            PRIV(op)->pending_ops = 1;
-            PRIV(op)->sync_state = SYNC_DATA_SYNC_SENT;
+            PRIV(op)->op_state = SYNC_DATA_SYNC_SENT;
            return 1;
        }
        else
        {
-            PRIV(op)->sync_state = SYNC_DATA_SYNC_DONE;
+            PRIV(op)->op_state = SYNC_DATA_SYNC_DONE;
        }
    }
-    if (PRIV(op)->sync_state == SYNC_DATA_SYNC_DONE)
+    if (PRIV(op)->op_state == SYNC_DATA_SYNC_DONE)
    {
        for (; PRIV(op)->sync_small_checked < PRIV(op)->sync_small_writes.size(); PRIV(op)->sync_small_checked++)
        {
@@ -121,7 +121,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
            journal.sector_info[journal.cur_sector].dirty)
        {
            if (cur_sector == -1)
-                PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
+                PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
            cur_sector = journal.cur_sector;
            prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
        }
@@ -133,7 +133,11 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
            journal.sector_info[journal.cur_sector].dirty = false;
            journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
 #ifdef BLOCKSTORE_DEBUG
-            printf("journal offset %lu is used by %lu:%lu v%lu\n", dirty_db[*it].journal_sector, it->oid.inode, it->oid.stripe, it->version);
+            printf(
+                "journal offset %08lx is used by %lu:%lu v%lu (%lu refs)\n",
+                dirty_db[*it].journal_sector, it->oid.inode, it->oid.stripe, it->version,
+                journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
+            );
 #endif
            je->oid = it->oid;
            je->version = it->version;
@@ -146,17 +150,17 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
            if (cur_sector != journal.cur_sector)
            {
                if (cur_sector == -1)
-                    PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
+                    PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
                cur_sector = journal.cur_sector;
                prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
            }
        }
-        PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
+        PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
        PRIV(op)->pending_ops = s;
-        PRIV(op)->sync_state = SYNC_JOURNAL_WRITE_SENT;
+        PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT;
        return 1;
    }
-    if (PRIV(op)->sync_state == SYNC_JOURNAL_WRITE_DONE)
+    if (PRIV(op)->op_state == SYNC_JOURNAL_WRITE_DONE)
    {
        if (!disable_journal_fsync)
        {
@@ -165,17 +169,17 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
            data->iov = { 0 };
            data->callback = cb;
            PRIV(op)->pending_ops = 1;
-            PRIV(op)->sync_state = SYNC_JOURNAL_SYNC_SENT;
+            PRIV(op)->op_state = SYNC_JOURNAL_SYNC_SENT;
            return 1;
        }
        else
        {
-            PRIV(op)->sync_state = SYNC_DONE;
+            PRIV(op)->op_state = SYNC_DONE;
        }
    }
-    if (PRIV(op)->sync_state == SYNC_DONE)
+    if (PRIV(op)->op_state == SYNC_DONE)
    {
-        ack_sync(op);
+        return ack_sync(op);
    }
    return 1;
 }
@@ -196,17 +200,17 @@ void blockstore_impl_t::handle_sync_event(ring_data_t *data, blockstore_op_t *op
        // Release used journal sectors
        release_journal_sectors(op);
        // Handle states
-        if (PRIV(op)->sync_state == SYNC_DATA_SYNC_SENT)
+        if (PRIV(op)->op_state == SYNC_DATA_SYNC_SENT)
        {
-            PRIV(op)->sync_state = SYNC_DATA_SYNC_DONE;
+            PRIV(op)->op_state = SYNC_DATA_SYNC_DONE;
        }
-        else if (PRIV(op)->sync_state == SYNC_JOURNAL_WRITE_SENT)
+        else if (PRIV(op)->op_state == SYNC_JOURNAL_WRITE_SENT)
        {
-            PRIV(op)->sync_state = SYNC_JOURNAL_WRITE_DONE;
+            PRIV(op)->op_state = SYNC_JOURNAL_WRITE_DONE;
        }
-        else if (PRIV(op)->sync_state == SYNC_JOURNAL_SYNC_SENT)
+        else if (PRIV(op)->op_state == SYNC_JOURNAL_SYNC_SENT)
        {
-            PRIV(op)->sync_state = SYNC_DONE;
+            PRIV(op)->op_state = SYNC_DONE;
            ack_sync(op);
        }
        else
@@ -218,7 +222,7 @@ void blockstore_impl_t::handle_sync_event(ring_data_t *data, blockstore_op_t *op

 int blockstore_impl_t::ack_sync(blockstore_op_t *op)
 {
-    if (PRIV(op)->sync_state == SYNC_DONE && PRIV(op)->prev_sync_count == 0)
+    if (PRIV(op)->op_state == SYNC_DONE && PRIV(op)->prev_sync_count == 0)
    {
        // Remove dependency of subsequent syncs
        auto it = PRIV(op)->in_progress_ptr;
@@ -230,14 +234,14 @@ int blockstore_impl_t::ack_sync(blockstore_op_t *op)
        {
            auto & next_sync = *it++;
            PRIV(next_sync)->prev_sync_count -= done_syncs;
-            if (PRIV(next_sync)->prev_sync_count == 0 && PRIV(next_sync)->sync_state == SYNC_DONE)
+            if (PRIV(next_sync)->prev_sync_count == 0 && PRIV(next_sync)->op_state == SYNC_DONE)
            {
                done_syncs++;
                // Acknowledge next_sync
                ack_one_sync(next_sync);
            }
        }
-        return 1;
+        return 2;
    }
    return 0;
 }
@@ -252,7 +256,17 @@ void blockstore_impl_t::ack_one_sync(blockstore_op_t *op)
 #endif
        auto & unstab = unstable_writes[it->oid];
        unstab = unstab < it->version ? it->version : unstab;
-        dirty_db[*it].state = ST_D_META_SYNCED;
+        auto dirty_it = dirty_db.find(*it);
+        dirty_it->second.state = ST_D_SYNCED;
+        dirty_it++;
+        while (dirty_it != dirty_db.end() && dirty_it->first.oid == it->oid)
+        {
+            if (dirty_it->second.state == ST_J_WAIT_BIG)
+            {
+                dirty_it->second.state = ST_J_IN_FLIGHT;
+            }
+            dirty_it++;
+        }
    }
    for (auto it = PRIV(op)->sync_small_writes.begin(); it != PRIV(op)->sync_small_writes.end(); it++)
    {
@@ -261,7 +275,16 @@ void blockstore_impl_t::ack_one_sync(blockstore_op_t *op)
 #endif
        auto & unstab = unstable_writes[it->oid];
        unstab = unstab < it->version ? it->version : unstab;
-        dirty_db[*it].state = dirty_db[*it].state == ST_DEL_WRITTEN ? ST_DEL_SYNCED : ST_J_SYNCED;
+        if (dirty_db[*it].state == ST_DEL_WRITTEN)
+        {
+            dirty_db[*it].state = ST_DEL_SYNCED;
+            // Deletions are treated as immediately stable
+            mark_stable(*it);
+        }
+        else /* == ST_J_WRITTEN */
+        {
+            dirty_db[*it].state = ST_J_SYNCED;
+        }
    }
    in_progress_syncs.erase(PRIV(op)->in_progress_ptr);
    op->retval = 0;
--- a/blockstore_write.cpp
+++ b/blockstore_write.cpp
@@ -4,6 +4,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
 {
    // Check or assign version number
    bool found = false, deleted = false, is_del = (op->opcode == BS_OP_DELETE);
+    bool is_inflight_big = false;
    uint64_t version = 1;
    if (dirty_db.size() > 0)
    {
@@ -17,6 +18,9 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
            found = true;
            version = dirty_it->first.version + 1;
            deleted = IS_DELETE(dirty_it->second.state);
+            is_inflight_big = dirty_it->second.state >= ST_D_IN_FLIGHT &&
+                dirty_it->second.state < ST_D_SYNCED ||
+                dirty_it->second.state == ST_J_WAIT_BIG;
        }
    }
    if (!found)
@@ -38,7 +42,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
    else if (op->version < version)
    {
        // Invalid version requested
-        op->retval = -EINVAL;
+        op->retval = -EEXIST;
        return false;
    }
    if (deleted && is_del)
@@ -47,10 +51,26 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
        op->retval = 0;
        return false;
    }
-    // Immediately add the operation into dirty_db, so subsequent reads could see it
+    if (is_inflight_big && !is_del && !deleted && op->len < block_size &&
+        immediate_commit != IMMEDIATE_ALL)
+    {
+        // Issue an additional sync so that the previous big write can reach the journal
+        blockstore_op_t *sync_op = new blockstore_op_t;
+        sync_op->opcode = BS_OP_SYNC;
+        sync_op->callback = [this, op](blockstore_op_t *sync_op)
+        {
+            delete sync_op;
+        };
+        enqueue_op(sync_op);
+    }
 #ifdef BLOCKSTORE_DEBUG
-    printf("%s %lu:%lu v%lu\n", is_del ? "Delete" : "Write", op->oid.inode, op->oid.stripe, op->version);
+    if (is_del)
+        printf("Delete %lu:%lu v%lu\n", op->oid.inode, op->oid.stripe, op->version);
+    else
+        printf("Write %lu:%lu v%lu offset=%u len=%u\n", op->oid.inode, op->oid.stripe, op->version, op->offset, op->len);
 #endif
+    // No strict need to add it into dirty_db here, it's just left
+    // from the previous implementation where reads waited for writes
    dirty_db.emplace((obj_ver_id){
        .oid = op->oid,
        .version = op->version,
@@ -58,7 +78,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
        .state = (uint32_t)(
            is_del
                ? ST_DEL_IN_FLIGHT
-                : (op->len == block_size || deleted ? ST_D_IN_FLIGHT : ST_J_IN_FLIGHT)
+                : (op->len == block_size || deleted ? ST_D_IN_FLIGHT : (is_inflight_big ? ST_J_WAIT_BIG : ST_J_IN_FLIGHT))
        ),
        .flags = 0,
        .location = 0,
@@ -72,11 +92,20 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
 // First step of the write algorithm: dequeue operation and submit initial write(s)
 int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
 {
+    if (PRIV(op)->op_state)
+    {
+        return continue_write(op);
+    }
    auto dirty_it = dirty_db.find((obj_ver_id){
        .oid = op->oid,
        .version = op->version,
    });
-    if (dirty_it->second.state == ST_D_IN_FLIGHT)
+    assert(dirty_it != dirty_db.end());
+    if (dirty_it->second.state == ST_J_WAIT_BIG)
+    {
+        return 0;
+    }
+    else if (dirty_it->second.state == ST_D_IN_FLIGHT)
    {
        blockstore_journal_check_t space_check(this);
        if (!space_check.check_available(op, unsynced_big_writes.size() + 1, sizeof(journal_entry_big_write), JOURNAL_STABILIZE_RESERVATION))
@@ -125,12 +154,20 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
            sqe, data_fd, PRIV(op)->iov_zerofill, vcnt, data_offset + (loc << block_order) + op->offset - stripe_offset
        );
        PRIV(op)->pending_ops = 1;
-        PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
-        // Remember big write as unsynced
-        unsynced_big_writes.push_back((obj_ver_id){
-            .oid = op->oid,
-            .version = op->version,
-        });
+        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
+        if (immediate_commit != IMMEDIATE_ALL)
+        {
+            // Remember big write as unsynced
+            unsynced_big_writes.push_back((obj_ver_id){
+                .oid = op->oid,
+                .version = op->version,
+            });
+            PRIV(op)->op_state = 3;
+        }
+        else
+        {
+            PRIV(op)->op_state = 1;
+        }
    }
    else
    {
@@ -144,10 +181,11 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        }
        // There is sufficient space. Get SQE(s)
        struct io_uring_sqe *sqe1 = NULL;
-        if ((journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_small_write) &&
+        if (immediate_commit != IMMEDIATE_NONE ||
+            (journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_small_write) &&
            journal.sector_info[journal.cur_sector].dirty)
        {
-            // Write current journal sector only if it's dirty and full
+            // Write current journal sector only if it's dirty and full, or in the immediate_commit mode
            BS_SUBMIT_GET_SQE_DECL(sqe1);
        }
        struct io_uring_sqe *sqe2 = NULL;
@@ -157,16 +195,18 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        }
        // Got SQEs. Prepare previous journal sector write if required
        auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
-        if (sqe1)
+        if (immediate_commit == IMMEDIATE_NONE)
        {
-            prepare_journal_sector_write(journal, journal.cur_sector, sqe1, cb);
-            // FIXME rename to min/max _flushing
-            PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
-            PRIV(op)->pending_ops++;
-        }
-        else
-        {
-            PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
+            if (sqe1)
+            {
+                prepare_journal_sector_write(journal, journal.cur_sector, sqe1, cb);
+                PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
+                PRIV(op)->pending_ops++;
+            }
+            else
+            {
+                PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
+            }
        }
        // Then pre-fill journal entry
        journal_entry_small_write *je = (journal_entry_small_write*)
@@ -174,7 +214,11 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
        journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
 #ifdef BLOCKSTORE_DEBUG
-        printf("journal offset %lu is used by %lu:%lu v%lu\n", dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
+        printf(
+            "journal offset %08lx is used by %lu:%lu v%lu (%lu refs)\n",
+            dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
+            journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
+        );
 #endif
        // Figure out where data will be
        journal.next_free = (journal.next_free + op->len) <= journal.len ? journal.next_free : journal_block_size;
@@ -186,6 +230,12 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        je->crc32_data = crc32c(0, op->buf, op->len);
        je->crc32 = je_crc32((journal_entry*)je);
        journal.crc32_last = je->crc32;
+        if (immediate_commit != IMMEDIATE_NONE)
+        {
+            prepare_journal_sector_write(journal, journal.cur_sector, sqe1, cb);
+            PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
+            PRIV(op)->pending_ops++;
+        }
        if (op->len > 0)
        {
            // Prepare journal data write
@@ -213,16 +263,130 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        {
            journal.next_free = journal_block_size;
        }
-        // Remember small write as unsynced
-        unsynced_small_writes.push_back((obj_ver_id){
-            .oid = op->oid,
-            .version = op->version,
-        });
+        if (immediate_commit == IMMEDIATE_NONE)
+        {
+            // Remember small write as unsynced
+            unsynced_small_writes.push_back((obj_ver_id){
+                .oid = op->oid,
+                .version = op->version,
+            });
+        }
        if (!PRIV(op)->pending_ops)
        {
-            ack_write(op);
+            PRIV(op)->op_state = 4;
+            continue_write(op);
+        }
+        else
+        {
+            PRIV(op)->op_state = 3;
        }
    }
+    inflight_writes++;
+    return 1;
+}
+
+int blockstore_impl_t::continue_write(blockstore_op_t *op) // Resume a write op at the step chosen by op_state; returns 0 when no SQE was available (retry later), 1 otherwise
+{
+    io_uring_sqe *sqe = NULL; // locals are declared ahead of the goto dispatch below
+    journal_entry_big_write *je;
+    auto dirty_it = dirty_db.find((obj_ver_id){
+        .oid = op->oid,
+        .version = op->version,
+    });
+    assert(dirty_it != dirty_db.end()); // enqueue_write emplaces this op's own entry into dirty_db
+    if (PRIV(op)->op_state == 2) // 2 = data write of a big write finished (dequeue_write set state 1 under IMMEDIATE_ALL)
+        goto resume_2;
+    else if (PRIV(op)->op_state == 4) // 4 = all I/O of the op finished, or dequeue_write had nothing pending
+        goto resume_4;
+    else
+        return 1; // any other state: I/O is still pending, nothing to resume
+resume_2:
+    // Only for the immediate_commit mode: prepare and submit big_write journal entry
+        { // unconditional timing trace printed right before get_sqe()
+            timespec now;
+            clock_gettime(CLOCK_REALTIME, &now);
+            printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
+        }
+    sqe = get_sqe();
+    if (!sqe)
+    {
+        return 0; // no SQE: op_state stays 2, so this step is re-entered on retry
+    }
+    je = (journal_entry_big_write*)prefill_single_journal_entry(journal, JE_BIG_WRITE, sizeof(journal_entry_big_write));
+    dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
+    journal.sector_info[journal.cur_sector].dirty = false; // this sector is submitted for writing right below
+    journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++; // one more reference to this journal sector offset
+#ifdef BLOCKSTORE_DEBUG
+    printf(
+        "journal offset %08lx is used by %lu:%lu v%lu (%lu refs)\n",
+        journal.sector_info[journal.cur_sector].offset, op->oid.inode, op->oid.stripe, op->version,
+        journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
+    );
+#endif
+    je->oid = op->oid;
+    je->version = op->version;
+    je->offset = op->offset;
+    je->len = op->len;
+    je->location = dirty_it->second.location;
+    je->crc32 = je_crc32((journal_entry*)je);
+    journal.crc32_last = je->crc32;
+    prepare_journal_sector_write(journal, journal.cur_sector, sqe,
+        [this, op](ring_data_t *data) { handle_write_event(data, op); });
+    PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector; // stored as 1 + index; 0 means "none" in release_journal_sectors
+    PRIV(op)->pending_ops = 1;
+    PRIV(op)->op_state = 3; // handle_write_event increments this to 4 when the sector write completes
+    return 1;
+resume_4:
+    // Switch object state
+        { // unconditional timing trace printed when the write is done
+            timespec now;
+            clock_gettime(CLOCK_REALTIME, &now);
+            printf("write_done %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
+        }
+#ifdef BLOCKSTORE_DEBUG
+    printf("Ack write %lu:%lu v%lu = %d\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
+#endif
+    bool imm = dirty_it->second.state == ST_D_SUBMITTED // imm = treat this write as already synced
+        ? (immediate_commit == IMMEDIATE_ALL) // ST_D_SUBMITTED entries: only under IMMEDIATE_ALL
+        : (immediate_commit != IMMEDIATE_NONE); // every other state: under any mode except IMMEDIATE_NONE
+    if (imm)
+    {
+        auto & unstab = unstable_writes[op->oid]; // operator[] creates the record if it is missing
+        unstab = unstab < op->version ? op->version : unstab; // raise the recorded version to at least op->version
+    }
+    if (dirty_it->second.state == ST_J_SUBMITTED)
+    {
+        dirty_it->second.state = imm ? ST_J_SYNCED : ST_J_WRITTEN;
+    }
+    else if (dirty_it->second.state == ST_D_SUBMITTED)
+    {
+        dirty_it->second.state = imm ? ST_D_SYNCED : ST_D_WRITTEN;
+    }
+    else if (dirty_it->second.state == ST_DEL_SUBMITTED)
+    {
+        dirty_it->second.state = imm ? ST_DEL_SYNCED : ST_DEL_WRITTEN;
+        if (imm)
+        {
+            // Deletions are treated as immediately stable
+            mark_stable(dirty_it->first);
+        }
+    }
+    if (immediate_commit == IMMEDIATE_ALL) // release later entries of this object parked in ST_J_WAIT_BIG
+    {
+        dirty_it++;
+        while (dirty_it != dirty_db.end() && dirty_it->first.oid == op->oid)
+        {
+            if (dirty_it->second.state == ST_J_WAIT_BIG)
+            {
+                dirty_it->second.state = ST_J_IN_FLIGHT; // dequeue_write returns 0 while the state is ST_J_WAIT_BIG
+            }
+            dirty_it++;
+        }
+    }
+    inflight_writes--; // paired with the inflight_writes++ at the end of dequeue_write
+    // Acknowledge write
+    op->retval = op->len;
+    FINISH_OP(op);
    return 1;
 }

@@ -231,6 +395,7 @@ void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *o
    live = true;
    if (data->res != data->iov.iov_len)
    {
+        inflight_writes--;
        // FIXME: our state becomes corrupted after a write error. maybe do something better than just die
        throw std::runtime_error(
            "write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
@@ -241,88 +406,117 @@ void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *o
    if (PRIV(op)->pending_ops == 0)
    {
        release_journal_sectors(op);
-        ack_write(op);
+        PRIV(op)->op_state++;
+        if (!continue_write(op))
+        {
+            submit_queue.push_front(op);
+        }
    }
 }

+// Drops the references this operation holds on the journal sectors it flushed.
+// The sectors are the 1-based range min_flushed_journal_sector..max_flushed_journal_sector,
+// walked with wrap-around at journal.sector_count; the range is reset to 0 afterwards.
 void blockstore_impl_t::release_journal_sectors(blockstore_op_t *op)
 {
-    // Release used journal sectors
-    if (PRIV(op)->min_used_journal_sector > 0 &&
-        PRIV(op)->max_used_journal_sector > 0)
+    // Release flushed journal sectors
+    // (a zero bound means that no sector range is recorded for this operation)
+    if (PRIV(op)->min_flushed_journal_sector > 0 &&
+        PRIV(op)->max_flushed_journal_sector > 0)
    {
-        uint64_t s = PRIV(op)->min_used_journal_sector;
+        uint64_t s = PRIV(op)->min_flushed_journal_sector;
        while (1)
        {
            journal.sector_info[s-1].usage_count--;
-            if (s == PRIV(op)->max_used_journal_sector)
+            // s is 1-based, so the current sector is 1+journal.cur_sector
+            if (s != (1+journal.cur_sector) && journal.sector_info[s-1].usage_count == 0)
+            {
+                // We know for sure that we won't write into this sector anymore
+                uint64_t new_ds = journal.sector_info[s-1].offset + journal.block_size;
+                if (new_ds >= journal.len)
+                {
+                    // Wrap to the position right after the first journal block
+                    new_ds = journal.block_size;
+                }
+                // Compare both positions in ring order relative to used_start:
+                // offsets below used_start get journal.len added before the comparison,
+                // so dirty_start is only ever moved forward
+                if ((journal.dirty_start + (journal.dirty_start >= journal.used_start ? 0 : journal.len)) <
+                    (new_ds + (new_ds >= journal.used_start ? 0 : journal.len)))
+                {
+                    journal.dirty_start = new_ds;
+                }
+            }
+            if (s == PRIV(op)->max_flushed_journal_sector)
                break;
            s = 1 + s % journal.sector_count;
        }
-        PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
+        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
    }
 }

-void blockstore_impl_t::ack_write(blockstore_op_t *op)
-{
-    // Switch object state
-    auto & dirty_entry = dirty_db[(obj_ver_id){
-        .oid = op->oid,
-        .version = op->version,
-    }];
-#ifdef BLOCKSTORE_DEBUG
-    printf("Ack write %lu:%lu v%lu = %d\n", op->oid.inode, op->oid.stripe, op->version, dirty_entry.state);
-#endif
-    if (dirty_entry.state == ST_J_SUBMITTED)
-    {
-        dirty_entry.state = ST_J_WRITTEN;
-    }
-    else if (dirty_entry.state == ST_D_SUBMITTED)
-    {
-        dirty_entry.state = ST_D_WRITTEN;
-    }
-    else if (dirty_entry.state == ST_DEL_SUBMITTED)
-    {
-        dirty_entry.state = ST_DEL_WRITTEN;
-    }
-    // Acknowledge write without sync
-    op->retval = op->len;
-    FINISH_OP(op);
-}
-
+// Starts a delete operation: appends a JE_DELETE entry to the journal and, depending on
+// immediate_commit, submits the journal sector write right away or leaves it buffered.
+// Returns 0 when check_available() reports that the journal cannot take the entry now,
+// 1 once the entry has been written into the journal buffer.
 int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
 {
    auto dirty_it = dirty_db.find((obj_ver_id){
        .oid = op->oid,
        .version = op->version,
    });
+    assert(dirty_it != dirty_db.end());
    blockstore_journal_check_t space_check(this);
    if (!space_check.check_available(op, 1, sizeof(journal_entry_del), 0))
    {
        return 0;
    }
-    BS_SUBMIT_GET_ONLY_SQE(sqe);
+    io_uring_sqe *sqe = NULL;
+    if (immediate_commit != IMMEDIATE_NONE ||
+        (journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del) &&
+        journal.sector_info[journal.cur_sector].dirty)
+    {
+        // Write current journal sector only if it's dirty and full, or in the immediate_commit mode
+        // NOTE(review): BS_SUBMIT_GET_SQE_DECL presumably returns from this function when
+        // no SQE is available - confirm against the macro definition
+        BS_SUBMIT_GET_SQE_DECL(sqe);
+    }
+    auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
    // Prepare journal sector write
+    // Without immediate_commit the sector written here is the one that was current
+    // BEFORE the new entry is pre-filled below
+    if (immediate_commit == IMMEDIATE_NONE)
+    {
+        if (sqe)
+        {
+            prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
+            PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
+            PRIV(op)->pending_ops++;
+        }
+        else
+        {
+            PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
+        }
+    }
+    // Pre-fill journal entry
    journal_entry_del *je = (journal_entry_del*)
        prefill_single_journal_entry(journal, JE_DELETE, sizeof(struct journal_entry_del));
    dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
    journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
 #ifdef BLOCKSTORE_DEBUG
-    printf("journal offset %lu is used by %lu:%lu v%lu\n", dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
+    printf(
+        "journal offset %08lx is used by %lu:%lu v%lu (%lu refs)\n",
+        dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
+        journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
+    );
 #endif
    je->oid = op->oid;
    je->version = op->version;
    je->crc32 = je_crc32((journal_entry*)je);
    journal.crc32_last = je->crc32;
-    auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
-    prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
-    PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
-    PRIV(op)->pending_ops = 1;
    dirty_it->second.state = ST_DEL_SUBMITTED;
-    // Remember small write as unsynced
-    unsynced_small_writes.push_back((obj_ver_id){
-        .oid = op->oid,
-        .version = op->version,
-    });
+    // In the immediate_commit mode the sector containing the new entry is written right away
+    if (immediate_commit != IMMEDIATE_NONE)
+    {
+        prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
+        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
+        PRIV(op)->pending_ops++;
+        // Remember small write as unsynced
+        // NOTE(review): the pre-change code recorded the delete in unsynced_small_writes
+        // unconditionally; now it happens only when immediate_commit != IMMEDIATE_NONE -
+        // confirm that this branch (rather than its else) is the intended place
+        unsynced_small_writes.push_back((obj_ver_id){
+            .oid = op->oid,
+            .version = op->version,
+        });
+    }
+    // handle_write_event() increments op_state and calls continue_write() once pending_ops
+    // drops to 0, so state 3 with a write in flight ends up in the same state 4 that is
+    // entered directly when nothing was submitted
+    if (!PRIV(op)->pending_ops)
+    {
+        PRIV(op)->op_state = 4;
+        continue_write(op);
+    }
+    else
+    {
+        PRIV(op)->op_state = 3;
+    }
    return 1;
 }
--- a/cluster_client.cpp
+++ b/cluster_client.cpp
@@ -0,0 +1,349 @@
+#include "cluster_client.h"
+
+// Wires the messenger and the etcd state client to this client and starts loading
+// the global configuration.
+cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
+{
+    this->ringloop = ringloop;
+    this->tfd = tfd;
+    log_level = config["log_level"].int64_value();
+
+    // Messenger: shares the event loop and the timer manager with the client
+    msgr.tfd = tfd;
+    msgr.ringloop = ringloop;
+    msgr.repeer_pgs = [this](osd_num_t peer_osd)
+    {
+        // Called both when peer_osd connects and when it drops the connection;
+        // queued operations can only make progress in the first case
+        if (msgr.osd_peer_fds.find(peer_osd) == msgr.osd_peer_fds.end())
+        {
+            return;
+        }
+        continue_ops();
+    };
+
+    // State client: forward every hook to the method of the same name
+    st_cli.tfd = tfd;
+    st_cli.on_load_config_hook = [this](json11::Json::object & cfg)
+    {
+        on_load_config_hook(cfg);
+    };
+    st_cli.on_change_osd_state_hook = [this](uint64_t peer_osd)
+    {
+        on_change_osd_state_hook(peer_osd);
+    };
+    st_cli.on_change_hook = [this](json11::Json::object & changes)
+    {
+        on_change_hook(changes);
+    };
+    st_cli.on_load_pgs_hook = [this](bool success)
+    {
+        on_load_pgs_hook(success);
+    };
+
+    st_cli.parse_config(config);
+    st_cli.load_global_config();
+}
+
+// Walks the queue of unsent operations: (re)slices the ones that have no parts,
+// retries sending the remaining parts of the others and moves fully sent
+// operations to sent_ops.
+void cluster_client_t::continue_ops()
+{
+    auto cur = unsent_ops.begin();
+    while (cur != unsent_ops.end())
+    {
+        cluster_op_t *op = *cur;
+        if (op->needs_reslice && !op->sent_count)
+        {
+            // Nothing is in flight anymore, so the stale slicing can be dropped
+            op->parts.clear();
+            op->done_count = 0;
+            op->needs_reslice = false;
+        }
+        if (op->parts.empty())
+        {
+            // Not sliced yet (or just reset above): run it through execute() again.
+            // The iterator is advanced before execute() because it may re-insert op
+            cur = unsent_ops.erase(cur);
+            execute(op);
+            continue;
+        }
+        if (op->needs_reslice)
+        {
+            // Some parts are still in flight, wait for them before reslicing
+            ++cur;
+            continue;
+        }
+        for (auto & part: op->parts)
+        {
+            if (part.sent || part.done)
+            {
+                continue;
+            }
+            try_send(op, &part);
+        }
+        if (op->sent_count == op->parts.size() - op->done_count)
+        {
+            // Every unfinished part is in flight now
+            cur = unsent_ops.erase(cur);
+            sent_ops.insert(op);
+        }
+        else
+        {
+            ++cur;
+        }
+    }
+}
+
+// Returns log2(value) when value is a power of two and 64 otherwise.
+// 64 can never be a valid order of a 64-bit value, so callers test the result with ">= 64".
+static uint32_t is_power_of_two(uint64_t value)
+{
+    if (!value)
+    {
+        // Zero is not a power of two; without this check it would be reported as 2^0
+        return 64;
+    }
+    uint32_t l = 0;
+    while (value > 1)
+    {
+        if (value & 1)
+        {
+            // A set bit below the highest one means more than one bit is set
+            return 64;
+        }
+        value = value >> 1;
+        l++;
+    }
+    return l;
+}
+
+// Applies the cluster-wide configuration: block store geometry, PG stripe size,
+// immediate_commit mode and peer connection timings. Missing or zero values fall
+// back to the DEFAULT_* constants.
+// Throws std::runtime_error when the block size is not a power of two or is out of range.
+void cluster_client_t::on_load_config_hook(json11::Json::object & config)
+{
+    bs_block_size = config["block_size"].uint64_value();
+    bs_disk_alignment = config["disk_alignment"].uint64_value();
+    bs_bitmap_granularity = config["bitmap_granularity"].uint64_value();
+    if (!bs_block_size)
+        bs_block_size = DEFAULT_BLOCK_SIZE;
+    if (!bs_disk_alignment)
+        bs_disk_alignment = DEFAULT_DISK_ALIGNMENT;
+    if (!bs_bitmap_granularity)
+        bs_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
+    // is_power_of_two() returns 64 for values that are not a power of two
+    if (is_power_of_two(bs_block_size) >= 64 || bs_block_size < MIN_BLOCK_SIZE || bs_block_size >= MAX_BLOCK_SIZE)
+        throw std::runtime_error("Bad block size");
+    if (config.find("pg_stripe_size") != config.end())
+    {
+        pg_stripe_size = config["pg_stripe_size"].uint64_value();
+    }
+    if (!pg_stripe_size)
+    {
+        // execute() treats pg_stripe_size == 0 as "config is not loaded yet", so the default
+        // also has to be applied when the key is absent, otherwise every operation would
+        // stay queued forever
+        pg_stripe_size = DEFAULT_PG_STRIPE_SIZE;
+    }
+    if (config["immediate_commit"] == "all")
+    {
+        // Cluster-wide immediate_commit mode
+        immediate_commit = true;
+    }
+    msgr.peer_connect_interval = config["peer_connect_interval"].uint64_value();
+    if (!msgr.peer_connect_interval)
+        msgr.peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
+    msgr.peer_connect_timeout = config["peer_connect_timeout"].uint64_value();
+    if (!msgr.peer_connect_timeout)
+        msgr.peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
+}
+
+// Called by the state client after an attempt to load the PG configuration.
+// On success remembers the PG count and resumes the queued operations.
+void cluster_client_t::on_load_pgs_hook(bool success)
+{
+    if (!success)
+    {
+        return;
+    }
+    pg_count = st_cli.pg_config.size();
+    continue_ops();
+}
+
+// Called by the state client when the watched state changes.
+// A changed PG count invalidates the slicing of every known operation.
+void cluster_client_t::on_change_hook(json11::Json::object & changes)
+{
+    const uint64_t new_pg_count = st_cli.pg_config.size();
+    if (pg_count != new_pg_count)
+    {
+        // At this point, all operations should be suspended
+        // And they need to be resliced!
+        for (auto ops: { &unsent_ops, &sent_ops })
+        {
+            for (auto op: *ops)
+            {
+                op->needs_reslice = true;
+            }
+        }
+        pg_count = new_pg_count;
+    }
+    continue_ops();
+}
+
+// Called by the state client when the state of an OSD changes.
+// (Re)connects to the OSD, but only if the messenger already wants a connection to it.
+void cluster_client_t::on_change_osd_state_hook(uint64_t peer_osd)
+{
+    if (msgr.wanted_peers.find(peer_osd) == msgr.wanted_peers.end())
+    {
+        return;
+    }
+    msgr.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
+}
+
+// Entry point for client operations: validates the operation, slices it into
+// per-stripe parts and tries to send every part to the primary OSD of its PG.
+// Parts that can't be sent right now leave the operation in unsent_ops until
+// continue_ops() retries it. The result is reported through op->callback:
+// retval is op->len on success, -EINVAL for an invalid operation and 0 for
+// a SYNC in the immediate_commit mode.
+// FIXME: Implement OSD_OP_SYNC for immediate_commit == false
+void cluster_client_t::execute(cluster_op_t *op)
+{
+    if (op->opcode == OSD_OP_SYNC && immediate_commit)
+    {
+        // Syncs are not required in the immediate_commit mode
+        op->retval = 0;
+        std::function<void(cluster_op_t*)>(op->callback)(op);
+        return;
+    }
+    // Checks that do not depend on the loaded configuration.
+    // Only reads and writes are accepted; OSD_OP_OUT is a messenger op type, not an opcode
+    if (op->opcode != OSD_OP_READ && op->opcode != OSD_OP_WRITE || !op->inode || !op->len)
+    {
+        op->retval = -EINVAL;
+        std::function<void(cluster_op_t*)>(op->callback)(op);
+        return;
+    }
+    if (!pg_stripe_size || !pg_count)
+    {
+        // Config or PGs are not loaded yet. This has to be checked before the alignment
+        // check and the PG selection below: bs_disk_alignment and pg_count are still 0
+        // and would be used as divisors
+        unsent_ops.insert(op);
+        return;
+    }
+    if (op->offset % bs_disk_alignment || op->len % bs_disk_alignment)
+    {
+        op->retval = -EINVAL;
+        std::function<void(cluster_op_t*)>(op->callback)(op);
+        return;
+    }
+    if (op->opcode == OSD_OP_WRITE && !immediate_commit)
+    {
+        // Copy operation
+        // NOTE(review): the copy has no callback and is made again each time the same
+        // operation is re-executed after a reslice - confirm once SYNC is implemented
+        cluster_op_t *op_copy = new cluster_op_t();
+        op_copy->opcode = op->opcode;
+        op_copy->inode = op->inode;
+        op_copy->offset = op->offset;
+        op_copy->len = op->len;
+        op_copy->buf = malloc(op->len);
+        memcpy(op_copy->buf, op->buf, op->len);
+        unsynced_ops.push_back(op_copy);
+        unsynced_bytes += op->len;
+        if (inmemory_commit)
+        {
+            // Immediately acknowledge write and continue with the copy
+            op->retval = op->len;
+            std::function<void(cluster_op_t*)>(op->callback)(op);
+            op = op_copy;
+        }
+        if (unsynced_bytes >= inmemory_dirty_limit)
+        {
+            // Push an extra SYNC operation
+        }
+    }
+    // Slice the request into individual object stripe requests
+    // Primary OSDs still operate individual stripes, but their size is multiplied by PG minsize in case of EC
+    uint64_t pg_block_size = bs_block_size * pg_part_count;
+    uint64_t first_stripe = (op->offset / pg_block_size) * pg_block_size;
+    uint64_t last_stripe = ((op->offset + op->len + pg_block_size - 1) / pg_block_size - 1) * pg_block_size;
+    int part_count = 0;
+    for (uint64_t stripe = first_stripe; stripe <= last_stripe; stripe += pg_block_size)
+    {
+        if (op->offset < (stripe+pg_block_size) && (op->offset+op->len) > stripe)
+        {
+            part_count++;
+        }
+    }
+    op->parts.resize(part_count);
+    bool resend = false;
+    int i = 0;
+    for (uint64_t stripe = first_stripe; stripe <= last_stripe; stripe += pg_block_size)
+    {
+        uint64_t stripe_end = stripe + pg_block_size;
+        if (op->offset < stripe_end && (op->offset+op->len) > stripe)
+        {
+            pg_num_t pg_num = (op->inode + stripe/pg_stripe_size) % pg_count + 1;
+            // Intersection of the request with this stripe. The length has to be derived
+            // from both ends: the first part may start in the middle of a stripe and
+            // the last part may end in the middle of one
+            uint64_t begin = op->offset < stripe ? stripe : op->offset;
+            uint64_t end = (op->offset+op->len) > stripe_end ? stripe_end : op->offset+op->len;
+            op->parts[i] = {
+                .parent = op,
+                .offset = begin,
+                .len = (uint32_t)(end - begin),
+                .pg_num = pg_num,
+                .buf = op->buf + (begin - op->offset),
+                .sent = false,
+                .done = false,
+            };
+            if (!try_send(op, &op->parts[i]))
+            {
+                // Part needs to be sent later
+                resend = true;
+            }
+            i++;
+        }
+    }
+    if (resend)
+    {
+        unsent_ops.insert(op);
+    }
+    else
+    {
+        sent_ops.insert(op);
+    }
+}
+
+// Tries to send one part of an operation to the current primary OSD of its PG.
+// Returns true when the request was queued to the messenger. Returns false when
+// the PG is unknown, paused or has no primary, or when the primary is not
+// connected yet (a connection attempt is started unless one is already wanted).
+bool cluster_client_t::try_send(cluster_op_t *op, cluster_op_part_t *part)
+{
+    auto pg_it = st_cli.pg_config.find(part->pg_num);
+    if (pg_it == st_cli.pg_config.end() ||
+        pg_it->second.pause || !pg_it->second.cur_primary)
+    {
+        return false;
+    }
+    osd_num_t primary_osd = pg_it->second.cur_primary;
+    auto peer_it = msgr.osd_peer_fds.find(primary_osd);
+    if (peer_it == msgr.osd_peer_fds.end())
+    {
+        // Not connected: ask for a connection unless it's already being established
+        if (msgr.wanted_peers.find(primary_osd) == msgr.wanted_peers.end())
+        {
+            msgr.connect_peer(primary_osd, st_cli.peer_states[primary_osd]);
+        }
+        return false;
+    }
+    part->osd_num = primary_osd;
+    part->sent = true;
+    op->sent_count++;
+    part->op = {
+        .op_type = OSD_OP_OUT,
+        .peer_fd = peer_it->second,
+        .req = { .rw = {
+            .header = {
+                .magic = SECONDARY_OSD_OP_MAGIC,
+                .id = op_id++,
+                .opcode = op->opcode,
+            },
+            .inode = op->inode,
+            .offset = part->offset,
+            .len = part->len,
+        } },
+        .callback = [this, part](osd_op_t *op_part)
+        {
+            handle_op_part(part);
+        },
+    };
+    // The request packet always goes first
+    part->op.send_list.push_back(part->op.req.buf, OSD_PACKET_SIZE);
+    if (op->opcode != OSD_OP_WRITE)
+    {
+        // Not a write: the part's buffer is attached to the operation instead of being sent
+        part->op.buf = part->buf;
+    }
+    else
+    {
+        // Write: the payload follows the request packet
+        part->op.send_list.push_back(part->buf, part->len);
+    }
+    msgr.outbox_push(&part->op);
+    return true;
+}
+
+// Completion handler of a single operation part (installed as the callback in try_send()).
+// A reply whose retval equals the requested length marks the part as done and finishes
+// the whole operation when it was the last one. Any other reply drops the connection
+// to the OSD and puts the operation back into unsent_ops for a retry.
+void cluster_client_t::handle_op_part(cluster_op_part_t *part)
+{
+    cluster_op_t *op = part->parent;
+    // The part is not in flight anymore, regardless of the outcome
+    part->sent = false;
+    op->sent_count--;
+    part->op.buf = NULL;
+    if (part->op.reply.hdr.retval != part->op.req.rw.len)
+    {
+        // Operation failed, retry
+        printf(
+            "Operation part failed on OSD %lu: retval=%ld (expected %u), reconnecting\n",
+            part->osd_num, part->op.reply.hdr.retval, part->op.req.rw.len
+        );
+        msgr.stop_client(part->op.peer_fd);
+        // sent_count was already decremented above, so this condition holds when all
+        // unfinished parts were in flight before this failure, i.e. for the first failed part
+        if (op->sent_count == op->parts.size() - op->done_count - 1)
+        {
+            // Resend later when OSDs come up
+            // FIXME: Check for different types of errors
+            // FIXME: Repeat operations after a small timeout, for the case when OSD is coming up
+            sent_ops.erase(op);
+            unsent_ops.insert(op);
+        }
+        if (op->sent_count == 0 && op->needs_reslice)
+        {
+            // PG count has changed, reslice the operation
+            unsent_ops.erase(op);
+            op->parts.clear();
+            op->done_count = 0;
+            op->needs_reslice = false;
+            // execute() rebuilds op->parts, so `part` must not be used after this call
+            execute(op);
+        }
+    }
+    else
+    {
+        // OK
+        part->done = true;
+        op->done_count++;
+        if (op->done_count >= op->parts.size())
+        {
+            // Finished!
+            sent_ops.erase(op);
+            op->retval = op->len;
+            std::function<void(cluster_op_t*)>(op->callback)(op);
+        }
+    }
+}
--- a/cluster_client.h
+++ b/cluster_client.h
@@ -0,0 +1,80 @@
+#pragma once
+
+#include "messenger.h"
+#include "etcd_state_client.h"
+
+// Block store geometry limits and defaults, in bytes.
+// The expressions are parenthesized so that they stay intact next to operators
+// of equal or higher precedence (e.g. `x / MIN_BLOCK_SIZE` or `x % MAX_BLOCK_SIZE`).
+#define MIN_BLOCK_SIZE (4*1024)
+#define MAX_BLOCK_SIZE (128*1024*1024)
+#define DEFAULT_BLOCK_SIZE (128*1024)
+#define DEFAULT_PG_STRIPE_SIZE (4*1024*1024)
+#define DEFAULT_DISK_ALIGNMENT 4096
+#define DEFAULT_BITMAP_GRANULARITY 4096
+
+struct cluster_op_t;
+
+// One slice of a cluster operation: the part of the request that falls into
+// a single stripe and is sent to the primary OSD of one PG.
+struct cluster_op_part_t
+{
+    // Operation this part belongs to
+    cluster_op_t *parent;
+    // Start offset and length of the part
+    uint64_t offset;
+    uint32_t len;
+    // PG the part maps to (numbering starts from 1, see execute())
+    pg_num_t pg_num;
+    // OSD the part was last sent to (set by try_send())
+    osd_num_t osd_num;
+    // Points into the parent's buffer
+    void *buf;
+    // sent: a request for this part is in flight; done: it was completed successfully
+    bool sent;
+    bool done;
+    // Messenger-level operation used to send the part
+    osd_op_t op;
+};
+
+// Client-side operation. The caller fills the public fields and passes the operation
+// to cluster_client_t::execute(); the result is delivered through callback.
+struct cluster_op_t
+{
+    uint64_t opcode; // OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC
+    uint64_t inode;
+    uint64_t offset;
+    uint64_t len;
+    // Set before callback is invoked: len on success, -EINVAL for an invalid operation,
+    // 0 for a SYNC in the immediate_commit mode
+    int retval;
+    void *buf;
+    std::function<void(cluster_op_t*)> callback;
+protected:
+    // Bookkeeping owned by cluster_client_t
+    // needs_reslice: PG count changed while the operation was pending, parts must be rebuilt
+    bool needs_reslice = false;
+    // Number of parts currently in flight / completed successfully
+    int sent_count = 0, done_count = 0;
+    std::vector<cluster_op_part_t> parts;
+    friend class cluster_client_t;
+};
+
+// Cluster client: slices operations into per-stripe parts, sends them to the primary
+// OSDs of the corresponding PGs and retries the parts that failed or could not be sent.
+class cluster_client_t
+{
+    timerfd_manager_t *tfd;
+    ring_loop_t *ringloop;
+
+    // Multiplier applied to bs_block_size to get the stripe size used for slicing in execute()
+    uint64_t pg_part_count = 2;
+    // Zero until the configuration is loaded; execute() queues operations while it is zero
+    uint64_t pg_stripe_size = 0;
+    // Block store parameters, filled by on_load_config_hook()
+    uint64_t bs_block_size = 0;
+    uint64_t bs_disk_alignment = 0;
+    uint64_t bs_bitmap_granularity = 0;
+    // Number of PGs, updated by on_load_pgs_hook() and on_change_hook()
+    uint64_t pg_count = 0;
+    // Set when the cluster configuration has immediate_commit == "all"
+    bool immediate_commit = false;
+    // When set, execute() acknowledges non-immediate writes right after copying them
+    bool inmemory_commit = false;
+    uint64_t inmemory_dirty_limit = 32*1024*1024;
+    int log_level;
+
+    // ID assigned to the next request sent to an OSD
+    uint64_t op_id = 1;
+    etcd_state_client_t st_cli;
+    osd_messenger_t msgr;
+    // sent_ops: every unfinished part is in flight; unsent_ops: something still has to be (re)sent
+    std::set<cluster_op_t*> sent_ops, unsent_ops;
+    // unsynced operations are copied in memory to allow replay when cluster isn't in the immediate_commit mode
+    std::vector<cluster_op_t*> unsynced_ops;
+    uint64_t unsynced_bytes = 0;
+
+public:
+    cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
+    // Starts an operation; completion is reported through op->callback
+    void execute(cluster_op_t *op);
+
+protected:
+    // Retries everything in unsent_ops
+    void continue_ops();
+    // Hooks installed into st_cli by the constructor
+    void on_load_config_hook(json11::Json::object & cfg);
+    void on_load_pgs_hook(bool success);
+    void on_change_hook(json11::Json::object & changes);
+    void on_change_osd_state_hook(uint64_t peer_osd);
+    // Sends one part if its primary OSD is connected; returns false otherwise
+    bool try_send(cluster_op_t *op, cluster_op_part_t *part);
+    // Completion handler of a single part
+    void handle_op_part(cluster_op_part_t *part);
+};
--- a/dump_journal.cpp
+++ b/dump_journal.cpp
@@ -0,0 +1,165 @@
+#define _LARGEFILE64_SOURCE
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <malloc.h>
+#include <linux/fs.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <stdio.h>
+
+#include "blockstore_impl.h"
+#include "crc32c.h"
+
+// State of the journal dump tool; filled from the command line in main()
+struct journal_dump_t
+{
+    // Path of the file or device containing the journal
+    char *journal_device;
+    // Journal block size in bytes
+    uint32_t journal_block;
+    // Offset of the journal inside the device and its length, in bytes
+    uint64_t journal_offset;
+    uint64_t journal_len;
+    // Current read position, relative to journal_offset
+    uint64_t journal_pos;
+    // Descriptor of the opened journal device
+    int fd;
+
+    // Prints all entries of one journal block and advances journal_pos
+    void dump_block(void *buf);
+};
+
+// Journal dump tool entry point.
+// Arguments: <journal_file> <journal_block_size> <offset> <size>, all numbers are decimal.
+// Reads the journal block by block and prints a description of every block.
+// Returns 0 on success and 1 on a usage, open, allocation or read error.
+int main(int argc, char *argv[])
+{
+    if (argc < 5)
+    {
+        printf("USAGE: %s <journal_file> <journal_block_size> <offset> <size>\n", argv[0]);
+        return 1;
+    }
+    journal_dump_t self;
+    self.journal_device = argv[1];
+    self.journal_block = strtoul(argv[2], NULL, 10);
+    self.journal_offset = strtoull(argv[3], NULL, 10);
+    self.journal_len = strtoull(argv[4], NULL, 10);
+    if (self.journal_block < MEM_ALIGNMENT || (self.journal_block % MEM_ALIGNMENT) ||
+        self.journal_block > 128*1024)
+    {
+        printf("Invalid journal block size\n");
+        return 1;
+    }
+    self.fd = open(self.journal_device, O_DIRECT|O_RDONLY);
+    if (self.fd == -1)
+    {
+        printf("Failed to open journal\n");
+        return 1;
+    }
+    // The buffer is allocated with memalign() because the file is opened with O_DIRECT
+    void *data = memalign(MEM_ALIGNMENT, self.journal_block);
+    if (!data)
+    {
+        printf("Failed to allocate %u bytes\n", self.journal_block);
+        close(self.fd);
+        return 1;
+    }
+    self.journal_pos = 0;
+    while (self.journal_pos < self.journal_len)
+    {
+        // Read errors are reported explicitly instead of through assert(),
+        // which is compiled out when NDEBUG is defined
+        ssize_t r = pread(self.fd, data, self.journal_block, self.journal_offset+self.journal_pos);
+        if (r != (ssize_t)self.journal_block)
+        {
+            printf(
+                "offset %08lx: failed to read journal block: %s\n",
+                self.journal_pos, r < 0 ? strerror(errno) : "short read"
+            );
+            free(data);
+            close(self.fd);
+            return 1;
+        }
+        uint64_t s;
+        for (s = 0; s < self.journal_block; s += 8)
+        {
+            if (*((uint64_t*)(data+s)) != 0)
+                break;
+        }
+        if (s == self.journal_block)
+        {
+            printf("offset %08lx: zeroes\n", self.journal_pos);
+            self.journal_pos += self.journal_block;
+        }
+        else if (((journal_entry*)data)->magic == JOURNAL_MAGIC)
+        {
+            printf("offset %08lx:\n", self.journal_pos);
+            // dump_block() advances journal_pos itself
+            self.dump_block(data);
+        }
+        else
+        {
+            printf("offset %08lx: no magic in the beginning, looks like random data (pattern=%lx)\n", self.journal_pos, *((uint64_t*)data));
+            self.journal_pos += self.journal_block;
+        }
+    }
+    free(data);
+    close(self.fd);
+    return 0;
+}
+
+// Prints every journal entry found in one journal block.
+// buf holds the block contents. journal_pos is advanced past the block and past the data
+// of the small writes described in it; it is set to journal_len (which ends the loop
+// in main()) when that data wraps around the end of the journal.
+void journal_dump_t::dump_block(void *buf)
+{
+    uint32_t pos = 0;
+    journal_pos += journal_block;
+    int entry = 0;
+    bool wrapped = false;
+    while (pos < journal_block)
+    {
+        journal_entry *je = (journal_entry*)(buf + pos);
+        if (je->magic != JOURNAL_MAGIC || je->type < JE_START || je->type > JE_DELETE)
+        {
+            break;
+        }
+        const char *crc32_valid = je_crc32(je) == je->crc32 ? "(valid)" : "(invalid)";
+        printf("entry % 3d: crc32=%08x %s prev=%08x ", entry, je->crc32, crc32_valid, je->crc32_prev);
+        if (je->type == JE_START)
+        {
+            printf("je_start start=%08lx\n", je->start.journal_start);
+        }
+        else if (je->type == JE_SMALL_WRITE)
+        {
+            printf(
+                "je_small_write oid=%lu:%lu ver=%lu offset=%u len=%u loc=%08lx",
+                je->small_write.oid.inode, je->small_write.oid.stripe,
+                je->small_write.version, je->small_write.offset, je->small_write.len,
+                je->small_write.data_offset
+            );
+            if (journal_pos + je->small_write.len > journal_len)
+            {
+                // data continues from the beginning of the journal
+                journal_pos = journal_block;
+                wrapped = true;
+            }
+            if (journal_pos != je->small_write.data_offset)
+            {
+                printf(" (mismatched, calculated = %lu)", journal_pos);
+            }
+            journal_pos += je->small_write.len;
+            if (journal_pos >= journal_len)
+            {
+                journal_pos = journal_block;
+                wrapped = true;
+            }
+            uint32_t data_crc32 = 0;
+            void *data = memalign(MEM_ALIGNMENT, je->small_write.len);
+            // The read must not be placed inside assert(): it would be compiled out with
+            // NDEBUG and the checksum would be calculated over uninitialized memory
+            ssize_t r = (data || !je->small_write.len)
+                ? pread(fd, data, je->small_write.len, journal_offset+je->small_write.data_offset)
+                : -1;
+            if (r == (ssize_t)je->small_write.len)
+            {
+                data_crc32 = crc32c(0, data, je->small_write.len);
+                printf(
+                    " data_crc32=%08x%s", je->small_write.crc32_data,
+                    (data_crc32 != je->small_write.crc32_data) ? " (invalid)" : " (valid)"
+                );
+            }
+            else
+            {
+                printf(" data_crc32=%08x (failed to read data)", je->small_write.crc32_data);
+            }
+            free(data);
+            printf("\n");
+        }
+        else if (je->type == JE_BIG_WRITE)
+        {
+            printf("je_big_write oid=%lu:%lu ver=%lu loc=%08lx\n", je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location);
+        }
+        else if (je->type == JE_STABLE)
+        {
+            printf("je_stable oid=%lu:%lu ver=%lu\n", je->stable.oid.inode, je->stable.oid.stripe, je->stable.version);
+        }
+        else if (je->type == JE_ROLLBACK)
+        {
+            printf("je_rollback oid=%lu:%lu ver=%lu\n", je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version);
+        }
+        else if (je->type == JE_DELETE)
+        {
+            printf("je_delete oid=%lu:%lu ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version);
+        }
+        if (!je->size)
+        {
+            // A zero-sized entry would never advance pos and the loop would not terminate
+            printf("entry % 3d has zero size, skipping the rest of the block\n", entry);
+            break;
+        }
+        pos += je->size;
+        entry++;
+    }
+    if (wrapped)
+    {
+        journal_pos = journal_len;
+    }
+}
--- a/epoll_manager.cpp
+++ b/epoll_manager.cpp
@@ -0,0 +1,92 @@
+#include <sys/epoll.h>
+#include <sys/poll.h>
+#include <unistd.h>
+
+#include "epoll_manager.h"
+
+#define MAX_EPOLL_EVENTS 64
+
+// Creates the epoll instance and the timer manager on top of it, then arms
+// the first poll request for the epoll descriptor in the ring loop.
+// Throws std::runtime_error when epoll_create() fails.
+epoll_manager_t::epoll_manager_t(ring_loop_t *ringloop)
+{
+    this->ringloop = ringloop;
+
+    epoll_fd = epoll_create(1);
+    if (epoll_fd < 0)
+    {
+        throw std::runtime_error(std::string("epoll_create: ") + strerror(errno));
+    }
+
+    // The timer manager registers its descriptors through this manager
+    auto register_fd = [this](int fd, std::function<void(int, int)> handler)
+    {
+        set_fd_handler(fd, handler);
+    };
+    tfd = new timerfd_manager_t(register_fd);
+
+    handle_epoll_events();
+}
+
+// Destroys the timer manager and closes the epoll descriptor.
+epoll_manager_t::~epoll_manager_t()
+{
+    // delete is a no-op for a null pointer, so no explicit check is needed
+    delete tfd;
+    tfd = NULL;
+    close(epoll_fd);
+}
+
+// Registers, replaces or removes the event handler of a descriptor.
+// A non-empty handler (re)registers fd for edge-triggered input, output and hangup
+// events; an empty handler removes fd from the epoll set.
+// Throws std::runtime_error when epoll_ctl() fails (ENOENT on removal is ignored).
+void epoll_manager_t::set_fd_handler(int fd, std::function<void(int, int)> handler)
+{
+    if (!handler)
+    {
+        // Removal
+        if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, NULL) < 0 && errno != ENOENT)
+        {
+            throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
+        }
+        epoll_handlers.erase(fd);
+        return;
+    }
+    // A descriptor that already has a handler is modified, a new one is added
+    const int ctl_op = epoll_handlers.find(fd) != epoll_handlers.end() ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
+    epoll_event ev;
+    ev.data.fd = fd;
+    ev.events = EPOLLOUT | EPOLLIN | EPOLLRDHUP | EPOLLET;
+    if (epoll_ctl(epoll_fd, ctl_op, fd, &ev) < 0)
+    {
+        throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
+    }
+    epoll_handlers[fd] = handler;
+}
+
+// Re-arms the poll request for the epoll descriptor in the ring loop and then
+// dispatches all events that are currently pending to the registered handlers.
+// Throws std::runtime_error when no SQE is available or when the poll request fails.
+void epoll_manager_t::handle_epoll_events()
+{
+    {
+        timespec now;
+        clock_gettime(CLOCK_REALTIME, &now);
+        printf("epoll %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
+    }
+    io_uring_sqe *sqe = ringloop->get_sqe();
+    if (!sqe)
+    {
+        throw std::runtime_error("can't get SQE, will fall out of sync with EPOLLET");
+    }
+    ring_data_t *data = ((ring_data_t*)sqe->user_data);
+    my_uring_prep_poll_add(sqe, epoll_fd, POLLIN);
+    data->callback = [this](ring_data_t *data)
+    {
+        if (data->res < 0)
+        {
+            throw std::runtime_error(std::string("epoll failed: ") + strerror(-data->res));
+        }
+        handle_epoll_events();
+    };
+    ringloop->submit();
+    int nfds;
+    epoll_event events[MAX_EPOLL_EVENTS];
+    do
+    {
+        // Zero timeout: only drain what is already pending
+        nfds = epoll_wait(epoll_fd, events, MAX_EPOLL_EVENTS, 0);
+        for (int i = 0; i < nfds; i++)
+        {
+            // A handler called earlier in this batch may have unregistered this descriptor.
+            // operator[] would insert an empty std::function here and calling it would throw
+            auto cb_it = epoll_handlers.find(events[i].data.fd);
+            if (cb_it == epoll_handlers.end())
+            {
+                continue;
+            }
+            // Call a copy: the handler may unregister or replace itself through
+            // set_fd_handler(), which would destroy the stored function while it is running
+            auto cb = cb_it->second;
+            cb(events[i].data.fd, events[i].events);
+        }
+    } while (nfds == MAX_EPOLL_EVENTS);
+}
--- a/epoll_manager.h
+++ b/epoll_manager.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include <map>
+
+#include "ringloop.h"
+#include "timerfd_manager.h"
+
+// Dispatches epoll events to per-descriptor handlers. The epoll descriptor itself
+// is polled through the ring loop (see handle_epoll_events()).
+class epoll_manager_t
+{
+    int epoll_fd;
+    ring_loop_t *ringloop;
+    // Handlers by file descriptor; called with (fd, epoll event mask)
+    std::map<int, std::function<void(int, int)>> epoll_handlers;
+public:
+    epoll_manager_t(ring_loop_t *ringloop);
+    ~epoll_manager_t();
+    // Registers or replaces the handler of fd; an empty handler unregisters fd
+    void set_fd_handler(int fd, std::function<void(int, int)> handler);
+    // Re-arms polling of epoll_fd and dispatches the pending events
+    void handle_epoll_events();
+
+    // Timer manager created by the constructor and deleted by the destructor
+    timerfd_manager_t *tfd;
+};
--- a/etcd_state_client.cpp
+++ b/etcd_state_client.cpp
@@ -0,0 +1,424 @@
+#include "osd_ops.h"
+#include "pg_states.h"
+#include "etcd_state_client.h"
+#include "http_client.h"
+#include "base64.h"
+
+// Decodes one etcd key/value pair: both fields arrive base64-encoded and the
+// value is expected to hold JSON text. An empty value yields a null JSON.
+// If the value is not valid JSON, the problem is logged and the returned key
+// is cleared so that callers can recognise and skip the entry.
+json_kv_t etcd_state_client_t::parse_etcd_kv(const json11::Json & kv_json)
+{
+    json_kv_t result;
+    std::string parse_error;
+    result.key = base64_decode(kv_json["key"].string_value());
+    const std::string raw_value = base64_decode(kv_json["value"].string_value());
+    if (!raw_value.empty())
+    {
+        result.value = json11::Json::parse(raw_value, parse_error);
+    }
+    if (!parse_error.empty())
+    {
+        printf("Bad JSON in etcd key %s: %s (value: %s)\n", result.key.c_str(), parse_error.c_str(), raw_value.c_str());
+        result.key = "";
+    }
+    return result;
+}
+
+// Runs an etcd transaction: forwards <txn> to the "/kv/txn" API via etcd_call().
+// <callback> receives (error string, response JSON) exactly as etcd_call() passes them.
+void etcd_state_client_t::etcd_txn(json11::Json txn, int timeout, std::function<void(std::string, json11::Json)> callback)
+{
+    etcd_call("/kv/txn", txn, timeout, callback);
+}
+
+// Sends a JSON POST request to a randomly chosen etcd endpoint.
+//
+// api      - API path relative to the endpoint's base path, e.g. "/kv/range"
+// payload  - request body, serialized with dump()
+// timeout  - passed through to http_request_json()
+// callback - invoked with (error string, response JSON); when no etcd address
+//            is configured it is invoked immediately with a non-empty error
+void etcd_state_client_t::etcd_call(std::string api, json11::Json payload, int timeout, std::function<void(std::string, json11::Json)> callback)
+{
+    if (etcd_addresses.empty())
+    {
+        // rand() % 0 below would be a division by zero, so report the problem instead
+        if (callback)
+        {
+            callback("etcd address is not configured", json11::Json());
+        }
+        return;
+    }
+    std::string etcd_address = etcd_addresses[rand() % etcd_addresses.size()];
+    std::string etcd_api_path;
+    // Configured addresses look like HOST:PORT[/PATH]; split off the base API path
+    size_t pos = etcd_address.find('/');
+    if (pos != std::string::npos)
+    {
+        etcd_api_path = etcd_address.substr(pos);
+        etcd_address = etcd_address.substr(0, pos);
+    }
+    std::string req = payload.dump();
+    req = "POST "+etcd_api_path+api+" HTTP/1.1\r\n"
+        "Host: "+etcd_address+"\r\n"
+        "Content-Type: application/json\r\n"
+        "Content-Length: "+std::to_string(req.size())+"\r\n"
+        "Connection: close\r\n"
+        "\r\n"+req;
+    http_request_json(tfd, etcd_address, req, timeout, callback);
+}
+
+// Reads client settings from <config>:
+//   etcd_address - comma-separated string or array of HOST:PORT[/PATH] entries;
+//                  entries without a path get the default "/v3" API path
+//   etcd_prefix  - key prefix, "/microceph" when empty; a leading '/' is added if missing
+//   log_level    - integer log level
+void etcd_state_client_t::parse_config(json11::Json & config)
+{
+    // Appends one address, adding the default API path when none is given.
+    // std::string::find() returns an unsigned size_t, so a missing '/' must be
+    // detected by comparing with npos (a "< 0" check can never be true).
+    auto add_address = [this](std::string addr)
+    {
+        if (addr.find('/') == std::string::npos)
+            addr += "/v3";
+        this->etcd_addresses.push_back(addr);
+    };
+    this->etcd_addresses.clear();
+    if (config["etcd_address"].is_string())
+    {
+        std::string ea = config["etcd_address"].string_value();
+        while (1)
+        {
+            size_t pos = ea.find(',');
+            std::string addr = pos != std::string::npos ? ea.substr(0, pos) : ea;
+            if (addr.length() > 0)
+            {
+                add_address(addr);
+            }
+            if (pos != std::string::npos)
+                ea = ea.substr(pos+1);
+            else
+                break;
+        }
+    }
+    else if (config["etcd_address"].array_items().size())
+    {
+        for (auto & ea: config["etcd_address"].array_items())
+        {
+            std::string addr = ea.string_value();
+            if (addr != "")
+            {
+                add_address(addr);
+            }
+        }
+    }
+    this->etcd_prefix = config["etcd_prefix"].string_value();
+    if (this->etcd_prefix == "")
+    {
+        this->etcd_prefix = "/microceph";
+    }
+    else if (this->etcd_prefix[0] != '/')
+    {
+        this->etcd_prefix = "/"+this->etcd_prefix;
+    }
+    this->log_level = config["log_level"].int64_value();
+}
+
+// Opens a websocket to the "/watch" API of a randomly chosen etcd endpoint and
+// subscribes to the config, OSD state, PG state and PG history key ranges,
+// starting from the revision after etcd_watch_revision.
+// Incoming events are fed to parse_state() and then reported through
+// on_change_hook. When the connection ends, the watcher restarts itself:
+// immediately if at least one watch had been created, otherwise after
+// ETCD_SLOW_TIMEOUT.
+void etcd_state_client_t::start_etcd_watcher()
+{
+    // Split the chosen "HOST:PORT/PATH" entry into the address and the base API path
+    std::string endpoint = etcd_addresses[rand() % etcd_addresses.size()];
+    std::string api_path;
+    int slash = endpoint.find('/');
+    if (slash >= 0)
+    {
+        api_path = endpoint.substr(slash);
+        endpoint = endpoint.substr(0, slash);
+    }
+    etcd_watches_initialised = 0;
+    auto on_message = [this](const http_response_t *msg)
+    {
+        if (msg->body.length())
+        {
+            std::string parse_error;
+            json11::Json event = json11::Json::parse(msg->body, parse_error);
+            if (parse_error != "")
+            {
+                printf("Bad JSON in etcd event: %s, ignoring event\n", parse_error.c_str());
+            }
+            else
+            {
+                const json11::Json & result = event["result"];
+                if (result["created"].bool_value())
+                {
+                    etcd_watches_initialised++;
+                }
+                if (etcd_watches_initialised == 4)
+                {
+                    etcd_watch_revision = result["header"]["revision"].uint64_value();
+                }
+                // Collapse the batch into a map first so that only the last value of each key is applied
+                json11::Json::object changes;
+                for (auto & ev: result["events"].array_items())
+                {
+                    json_kv_t item = parse_etcd_kv(ev["kv"]);
+                    if (item.key != "")
+                    {
+                        changes[item.key] = item.value;
+                    }
+                }
+                for (auto & change: changes)
+                {
+                    if (this->log_level > 0)
+                    {
+                        printf("Incoming event: %s -> %s\n", change.first.c_str(), change.second.dump().c_str());
+                    }
+                    parse_state(change.first, change.second);
+                }
+                // Let the owner react to the whole batch
+                if (on_change_hook)
+                {
+                    on_change_hook(changes);
+                }
+            }
+        }
+        if (msg->eof)
+        {
+            etcd_watch_ws = NULL;
+            if (etcd_watches_initialised != 0)
+            {
+                // The connection had been working, so reconnect right away
+                start_etcd_watcher();
+            }
+            else
+            {
+                // The connection never got established, so retry after <ETCD_SLOW_TIMEOUT>
+                tfd->set_timer(ETCD_SLOW_TIMEOUT, false, [this](int)
+                {
+                    start_etcd_watcher();
+                });
+            }
+        }
+    };
+    etcd_watch_ws = open_websocket(tfd, endpoint, api_path+"/watch", ETCD_SLOW_TIMEOUT, on_message);
+    // One watch per key range: [<prefix><dir>/, <prefix><dir>0)
+    struct watch_spec_t
+    {
+        const char *dir;
+        int watch_id;
+    };
+    const watch_spec_t watches[] = {
+        { "/config", ETCD_CONFIG_WATCH_ID },
+        { "/osd/state", ETCD_OSD_STATE_WATCH_ID },
+        { "/pg/state", ETCD_PG_STATE_WATCH_ID },
+        { "/pg/history", ETCD_PG_HISTORY_WATCH_ID },
+    };
+    for (const auto & w: watches)
+    {
+        const std::string range = etcd_prefix+w.dir;
+        etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
+            { "create_request", json11::Json::object {
+                { "key", base64_encode(range+"/") },
+                { "range_end", base64_encode(range+"0") },
+                { "start_revision", etcd_watch_revision+1 },
+                { "watch_id", w.watch_id },
+            } }
+        }).dump());
+    }
+}
+
+// Reads <etcd_prefix>/config/global from etcd and passes the resulting object
+// (empty when the key is absent or is not a JSON object) to on_load_config_hook.
+// On error the request is retried after ETCD_SLOW_TIMEOUT.
+// Also remembers the response revision in etcd_watch_revision if none is set yet.
+void etcd_state_client_t::load_global_config()
+{
+    json11::Json request = json11::Json::object {
+        { "key", base64_encode(etcd_prefix+"/config/global") }
+    };
+    auto on_response = [this](std::string err, json11::Json data)
+    {
+        if (!err.empty())
+        {
+            printf("Error reading OSD configuration from etcd: %s\n", err.c_str());
+            tfd->set_timer(ETCD_SLOW_TIMEOUT, false, [this](int timer_id)
+            {
+                load_global_config();
+            });
+            return;
+        }
+        if (etcd_watch_revision == 0)
+        {
+            etcd_watch_revision = data["header"]["revision"].uint64_value();
+        }
+        json11::Json::object global_config;
+        const auto & kvs = data["kvs"].array_items();
+        if (!kvs.empty())
+        {
+            json_kv_t first = parse_etcd_kv(kvs[0]);
+            if (first.value.is_object())
+            {
+                global_config = first.value.object_items();
+            }
+        }
+        on_load_config_hook(global_config);
+    };
+    etcd_call("/kv/range", request, ETCD_SLOW_TIMEOUT, on_response);
+}
+
+// Loads the PG configuration, PG history, PG states and OSD states from etcd in
+// one transaction and feeds every returned key to parse_state().
+// If load_pgs_checks_hook returns a non-empty array, it is sent as the
+// transaction's "compare" section. on_load_pgs_hook receives false when the
+// transaction did not succeed and true after all keys are parsed.
+// On a request error the load is retried after ETCD_SLOW_TIMEOUT.
+void etcd_state_client_t::load_pgs()
+{
+    // Range request covering every key under <etcd_prefix><dir>/
+    auto range_of = [this](const std::string & dir)
+    {
+        return json11::Json::object {
+            { "request_range", json11::Json::object {
+                { "key", base64_encode(etcd_prefix+dir+"/") },
+                { "range_end", base64_encode(etcd_prefix+dir+"0") },
+            } }
+        };
+    };
+    json11::Json::array txn;
+    txn.push_back(json11::Json::object {
+        { "request_range", json11::Json::object {
+            { "key", base64_encode(etcd_prefix+"/config/pgs") },
+        } }
+    });
+    txn.push_back(range_of("/pg/history"));
+    txn.push_back(range_of("/pg/state"));
+    txn.push_back(range_of("/osd/state"));
+    json11::Json::object req = { { "success", txn } };
+    json11::Json checks;
+    if (load_pgs_checks_hook)
+    {
+        checks = load_pgs_checks_hook();
+    }
+    if (!checks.array_items().empty())
+    {
+        req["compare"] = checks;
+    }
+    auto on_response = [this](std::string err, json11::Json data)
+    {
+        if (!err.empty())
+        {
+            printf("Error loading PGs from etcd: %s\n", err.c_str());
+            tfd->set_timer(ETCD_SLOW_TIMEOUT, false, [this](int timer_id)
+            {
+                load_pgs();
+            });
+            return;
+        }
+        if (!data["succeeded"].bool_value())
+        {
+            on_load_pgs_hook(false);
+            return;
+        }
+        for (auto & response: data["responses"].array_items())
+        {
+            for (auto & item: response["response_range"]["kvs"].array_items())
+            {
+                json_kv_t kv = parse_etcd_kv(item);
+                parse_state(kv.key, kv.value);
+            }
+        }
+        on_load_pgs_hook(true);
+    };
+    etcd_txn(req, ETCD_SLOW_TIMEOUT, on_response);
+}
+
+// Applies one etcd key/value pair to the in-memory cluster state.
+//
+// Recognised keys (all below <etcd_prefix>):
+//   /config/pgs       - PG configuration, rebuilds the "exists"/target fields of pg_config
+//   /pg/history/<pg>  - OSD set history and additional peers of a PG
+//   /pg/state/<pg>    - current primary and state bits of a PG
+//   /osd/state/<osd>  - peer OSD state, stored in peer_states while the OSD is "up"
+// Any other key is ignored. Malformed keys and values are logged and skipped.
+void etcd_state_client_t::parse_state(const std::string & key, const json11::Json & value)
+{
+    if (key == etcd_prefix+"/config/pgs")
+    {
+        // Mark everything as absent first; PGs present in the new config are re-marked below
+        for (auto & pg_item: this->pg_config)
+        {
+            pg_item.second.exists = false;
+        }
+        for (auto & pg_item: value["items"].object_items())
+        {
+            pg_num_t pg_num = stoull_full(pg_item.first);
+            if (!pg_num)
+            {
+                printf("Bad key in PG configuration: %s (must be a number), skipped\n", pg_item.first.c_str());
+                continue;
+            }
+            this->pg_config[pg_num].exists = true;
+            this->pg_config[pg_num].pause = pg_item.second["pause"].bool_value();
+            this->pg_config[pg_num].primary = pg_item.second["primary"].uint64_value();
+            this->pg_config[pg_num].target_set.clear();
+            for (auto pg_osd: pg_item.second["osd_set"].array_items())
+            {
+                this->pg_config[pg_num].target_set.push_back(pg_osd.uint64_value());
+            }
+            if (this->pg_config[pg_num].target_set.size() != 3)
+            {
+                // Only 3-OSD sets are accepted: pad/truncate to 3 and pause the PG
+                printf("Bad PG %u config format: incorrect osd_set = %s\n", pg_num, pg_item.second["osd_set"].dump().c_str());
+                this->pg_config[pg_num].target_set.resize(3);
+                this->pg_config[pg_num].pause = true;
+            }
+        }
+    }
+    else if (key.substr(0, etcd_prefix.length()+12) == etcd_prefix+"/pg/history/")
+    {
+        // <etcd_prefix>/pg/history/%d
+        pg_num_t pg_num = stoull_full(key.substr(etcd_prefix.length()+12));
+        if (!pg_num)
+        {
+            printf("Bad etcd key %s, ignoring\n", key.c_str());
+        }
+        else
+        {
+            auto & pg_cfg = this->pg_config[pg_num];
+            pg_cfg.target_history.clear();
+            pg_cfg.all_peers.clear();
+            // Refuse to start PG if any set of the <osd_sets> has no live OSDs
+            for (auto hist_item: value["osd_sets"].array_items())
+            {
+                std::vector<osd_num_t> history_set;
+                for (auto pg_osd: hist_item.array_items())
+                {
+                    history_set.push_back(pg_osd.uint64_value());
+                }
+                pg_cfg.target_history.push_back(history_set);
+            }
+            // Include these additional OSDs when peering the PG
+            for (auto pg_osd: value["all_peers"].array_items())
+            {
+                pg_cfg.all_peers.push_back(pg_osd.uint64_value());
+            }
+        }
+    }
+    else if (key.substr(0, etcd_prefix.length()+10) == etcd_prefix+"/pg/state/")
+    {
+        // <etcd_prefix>/pg/state/%d
+        pg_num_t pg_num = stoull_full(key.substr(etcd_prefix.length()+10));
+        if (!pg_num)
+        {
+            printf("Bad etcd key %s, ignoring\n", key.c_str());
+        }
+        else if (value.is_null())
+        {
+            // Null value: reset the current primary and state of the PG
+            this->pg_config[pg_num].cur_primary = 0;
+            this->pg_config[pg_num].cur_state = 0;
+        }
+        else
+        {
+            osd_num_t cur_primary = value["primary"].uint64_value();
+            int state = 0;
+            // Translate the list of state keywords into a bit mask
+            for (auto & e: value["state"].array_items())
+            {
+                int i;
+                for (i = 0; i < pg_state_bit_count; i++)
+                {
+                    if (e.string_value() == pg_state_names[i])
+                    {
+                        state = state | pg_state_bits[i];
+                        break;
+                    }
+                }
+                if (i >= pg_state_bit_count)
+                {
+                    printf("Unexpected PG %u state keyword in etcd: %s\n", pg_num, e.dump().c_str());
+                    return;
+                }
+            }
+            // OFFLINE, PEERING and INCOMPLETE may not be combined with any other bit
+            if (!cur_primary || !value["state"].is_array() || !state ||
+                (state & PG_OFFLINE) && state != PG_OFFLINE ||
+                (state & PG_PEERING) && state != PG_PEERING ||
+                (state & PG_INCOMPLETE) && state != PG_INCOMPLETE)
+            {
+                printf("Unexpected PG %u state in etcd: primary=%lu, state=%s\n", pg_num, cur_primary, value["state"].dump().c_str());
+                return;
+            }
+            this->pg_config[pg_num].cur_primary = cur_primary;
+            this->pg_config[pg_num].cur_state = state;
+        }
+    }
+    else if (key.substr(0, etcd_prefix.length()+11) == etcd_prefix+"/osd/state/")
+    {
+        // <etcd_prefix>/osd/state/%d
+        // Parsed with stoull_full() like the PG keys above: std::stoull() would throw
+        // on a non-numeric suffix, while a zero result here simply skips the key
+        osd_num_t peer_osd = stoull_full(key.substr(etcd_prefix.length()+11));
+        if (peer_osd > 0)
+        {
+            if (value.is_object() && value["state"] == "up" &&
+                value["addresses"].is_array() &&
+                value["port"].int64_value() > 0 && value["port"].int64_value() < 65536)
+            {
+                this->peer_states[peer_osd] = value;
+            }
+            else
+            {
+                this->peer_states.erase(peer_osd);
+            }
+            if (on_change_osd_state_hook != NULL)
+            {
+                on_change_osd_state_hook(peer_osd);
+            }
+        }
+    }
+}
--- a/etcd_state_client.h
+++ b/etcd_state_client.h
@@ -0,0 +1,61 @@
+#pragma once
+
+#include "osd_id.h"
+#include "http_client.h"
+#include "timerfd_manager.h"
+
+#define ETCD_CONFIG_WATCH_ID 1
+#define ETCD_PG_STATE_WATCH_ID 2
+#define ETCD_PG_HISTORY_WATCH_ID 3
+#define ETCD_OSD_STATE_WATCH_ID 4
+
+#define MAX_ETCD_ATTEMPTS 5
+#define ETCD_SLOW_TIMEOUT 5000
+#define ETCD_QUICK_TIMEOUT 1000
+
+// Per-PG information collected from etcd by etcd_state_client_t::parse_state()
+struct pg_config_t
+{
+    // true if the PG is listed in the latest <etcd_prefix>/config/pgs
+    bool exists;
+    // "primary" from the PG configuration
+    osd_num_t primary;
+    // "osd_set" from the PG configuration
+    std::vector<osd_num_t> target_set;
+    // "osd_sets" from <etcd_prefix>/pg/history/<pg>
+    std::vector<std::vector<osd_num_t>> target_history;
+    // "all_peers" from <etcd_prefix>/pg/history/<pg>
+    std::vector<osd_num_t> all_peers;
+    // "pause" from the PG configuration; also forced on when osd_set is malformed
+    bool pause;
+    // "primary" from <etcd_prefix>/pg/state/<pg>, 0 when the state is null
+    osd_num_t cur_primary;
+    // Bit mask built from the "state" keywords of <etcd_prefix>/pg/state/<pg>
+    int cur_state;
+};
+
+// A decoded etcd key/value pair, as returned by etcd_state_client_t::parse_etcd_kv()
+struct json_kv_t
+{
+    // Decoded key; parse_etcd_kv() sets it to "" when the value is not valid JSON
+    std::string key;
+    // Parsed JSON value (null JSON for an empty value)
+    json11::Json value;
+};
+
+// Client that loads cluster state from etcd and keeps it up to date via watches
+struct etcd_state_client_t
+{
+    // etcd endpoints in the form HOST:PORT/PATH, filled by parse_config()
+    std::vector<std::string> etcd_addresses;
+    // Key prefix, always starts with '/'
+    std::string etcd_prefix;
+    int log_level = 0;
+    // Timer manager used for retries and passed to the HTTP client; must be set by the owner
+    timerfd_manager_t *tfd = NULL;
+
+    // Number of watches confirmed as created on the current websocket
+    int etcd_watches_initialised = 0;
+    // Revision to resume watching from (watches start at etcd_watch_revision+1)
+    uint64_t etcd_watch_revision = 0;
+    // Current watch websocket, NULL after the connection ends
+    websocket_t *etcd_watch_ws = NULL;
+    std::map<pg_num_t, pg_config_t> pg_config;
+    // State JSON of the OSDs that are currently reported as "up"
+    std::map<osd_num_t, json11::Json> peer_states;
+
+    // Called with the batch of changed keys after each watch event is applied
+    std::function<void(json11::Json::object &)> on_change_hook;
+    // Called by load_global_config() with the global configuration object
+    std::function<void(json11::Json::object &)> on_load_config_hook;
+    // Optional; its result is used as the "compare" section of the load_pgs() transaction
+    std::function<json11::Json()> load_pgs_checks_hook;
+    // Called by load_pgs() with false if the transaction did not succeed, true otherwise
+    std::function<void(bool)> on_load_pgs_hook;
+    // Called with the OSD number whenever an OSD state key is processed
+    std::function<void(uint64_t)> on_change_osd_state_hook;
+
+    json_kv_t parse_etcd_kv(const json11::Json & kv_json);
+    void etcd_call(std::string api, json11::Json payload, int timeout, std::function<void(std::string, json11::Json)> callback);
+    void etcd_txn(json11::Json txn, int timeout, std::function<void(std::string, json11::Json)> callback);
+    void start_etcd_watcher();
+    void load_global_config();
+    void load_pgs();
+    void parse_state(const std::string & key, const json11::Json & value);
+    void parse_config(json11::Json & config);
+};
--- a/fio_cluster.cpp
+++ b/fio_cluster.cpp
@@ -0,0 +1,298 @@
+// FIO engine to test cluster I/O
+//
+// Random write:
+//
+// fio -thread -ioengine=./libfio_cluster.so -name=test -bs=4k -direct=1 -fsync=16 -iodepth=16 -rw=randwrite \
+//     -etcd=127.0.0.1:2379 [-etcd_prefix=/microceph] -size=1000M
+//
+// Linear write:
+//
+// fio -thread -ioengine=./libfio_cluster.so -name=test -bs=128k -direct=1 -fsync=32 -iodepth=32 -rw=write \
+//     -etcd=127.0.0.1:2379 [-etcd_prefix=/microceph] -size=1000M
+//
+// Random read (run with -iodepth=32 or -iodepth=1):
+//
+// fio -thread -ioengine=./libfio_cluster.so -name=test -bs=4k -direct=1 -iodepth=32 -rw=randread \
+//     -etcd=127.0.0.1:2379 [-etcd_prefix=/microceph] -size=1000M
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+
+#include <vector>
+#include <unordered_map>
+
+#include "epoll_manager.h"
+#include "cluster_client.h"
+extern "C" {
+#define CONFIG_HAVE_GETTID
+#define CONFIG_PWRITEV2
+#include "fio/fio.h"
+#include "fio/optgroup.h"
+}
+
+// Per-thread engine state, stored in td->io_ops_data
+struct sec_data
+{
+    // Event loop, epoll manager and cluster client; created in sec_init()
+    ring_loop_t *ringloop = NULL;
+    epoll_manager_t *epmgr = NULL;
+    cluster_client_t *cli = NULL;
+    // true if the last queued operation was a sync (a repeated sync is then skipped)
+    bool last_sync = false;
+    /* The list of completed io_u structs. */
+    std::vector<io_u*> completed;
+    // op_n: number of operations queued so far; inflight: operations not yet completed
+    uint64_t op_n = 0, inflight = 0;
+    // Print a trace line for every completed operation
+    bool trace = false;
+};
+
+// Engine options filled in by fio according to the `options` table
+struct sec_options
+{
+    // NOTE(review): presumably padding so that no real option sits at offset 0, as fio engines conventionally do - confirm
+    int __pad;
+    // etcd address (HOST:PORT[/PATH])
+    char *etcd_host = NULL;
+    // etcd key prefix; sec_init() falls back to "/microceph" when it is NULL
+    char *etcd_prefix = NULL;
+    // Inode to run tests on
+    int inode = 0;
+    // Non-zero enables operation tracing
+    int trace = 0;
+};
+
+// Engine-specific fio options; each entry stores its value in the sec_options
+// field given by .off1. The list is terminated by an entry with a NULL name.
+static struct fio_option options[] = {
+    {
+        .name   = "etcd",
+        .lname  = "etcd address",
+        .type   = FIO_OPT_STR_STORE,
+        .off1   = offsetof(struct sec_options, etcd_host),
+        .help   = "etcd address in the form HOST:PORT[/PATH]",
+        .category = FIO_OPT_C_ENGINE,
+        .group  = FIO_OPT_G_FILENAME,
+    },
+    {
+        // Must not share the name "etcd" with the address option above,
+        // otherwise -etcd_prefix=... cannot be told apart from -etcd=...
+        .name   = "etcd_prefix",
+        .lname  = "etcd key prefix",
+        .type   = FIO_OPT_STR_STORE,
+        .off1   = offsetof(struct sec_options, etcd_prefix),
+        .help   = "etcd key prefix, by default /microceph",
+        .category = FIO_OPT_C_ENGINE,
+        .group  = FIO_OPT_G_FILENAME,
+    },
+    {
+        .name   = "inode",
+        .lname  = "inode to run tests on",
+        .type   = FIO_OPT_INT,
+        .off1   = offsetof(struct sec_options, inode),
+        .help   = "inode to run tests on (1 by default)",
+        .category = FIO_OPT_C_ENGINE,
+        .group  = FIO_OPT_G_FILENAME,
+    },
+    {
+        .name   = "osd_trace",
+        .lname  = "OSD trace",
+        .type   = FIO_OPT_BOOL,
+        .off1   = offsetof(struct sec_options, trace),
+        .help   = "Trace OSD operations",
+        .def    = "0",
+        .category = FIO_OPT_C_ENGINE,
+        .group  = FIO_OPT_G_FILENAME,
+    },
+    {
+        .name = NULL,
+    },
+};
+
+// fio "setup" callback: allocates the per-thread engine state and, when the
+// job defines no files, registers a single placeholder file "osd_cluster".
+// Returns 0 on success, 1 on allocation failure.
+static int sec_setup(struct thread_data *td)
+{
+    sec_data *engine_state = new sec_data;
+    if (engine_state == NULL)
+    {
+        td_verror(td, errno, "calloc");
+        return 1;
+    }
+    td->io_ops_data = engine_state;
+
+    if (td->files_index == 0)
+    {
+        add_file(td, "osd_cluster", 0, 0);
+        if (!td->o.nr_files)
+        {
+            td->o.nr_files = 1;
+        }
+        td->o.open_files++;
+    }
+
+    return 0;
+}
+
+// fio "cleanup" callback: destroys the cluster client, epoll manager and ring
+// loop (in that order) and frees the engine state allocated by sec_setup().
+// Safe to call again afterwards because td->io_ops_data is reset to NULL.
+static void sec_cleanup(struct thread_data *td)
+{
+    sec_data *bsd = (sec_data*)td->io_ops_data;
+    if (bsd)
+    {
+        delete bsd->cli;
+        delete bsd->epmgr;
+        delete bsd->ringloop;
+        bsd->cli = NULL;
+        bsd->epmgr = NULL;
+        bsd->ringloop = NULL;
+        // The state itself was allocated with `new` in sec_setup() and was never released
+        delete bsd;
+        td->io_ops_data = NULL;
+    }
+}
+
+/* Connect to the server from each thread. */
+// Builds the client configuration from the engine options and creates the
+// ring loop, epoll manager and cluster client for this thread.
+// Returns 0 on success, 1 if the mandatory etcd address option is missing.
+static int sec_init(struct thread_data *td)
+{
+    sec_options *o = (sec_options*)td->eo;
+    sec_data *bsd = (sec_data*)td->io_ops_data;
+
+    if (!o->etcd_host)
+    {
+        // Constructing std::string from a NULL pointer is undefined behaviour
+        td_verror(td, EINVAL, "etcd address is missing");
+        return 1;
+    }
+
+    json11::Json cfg = json11::Json::object {
+        { "etcd_address", std::string(o->etcd_host) },
+        { "etcd_prefix", std::string(o->etcd_prefix ? o->etcd_prefix : "/microceph") },
+    };
+
+    bsd->ringloop = new ring_loop_t(512);
+    bsd->epmgr = new epoll_manager_t(bsd->ringloop);
+    bsd->cli = new cluster_client_t(bsd->ringloop, bsd->epmgr->tfd, cfg);
+
+    bsd->trace = o->trace ? true : false;
+
+    return 0;
+}
+
+/* Begin read or write request. */
+// Translates the fio request into a cluster_op_t and passes it to the cluster
+// client. The operation deletes itself in its completion callback, which also
+// appends <io> to the list of completed requests.
+// Returns FIO_Q_COMPLETED for a repeated sync, an unsupported direction
+// (io->error = EINVAL) or an error already set when execute() returns;
+// FIO_Q_QUEUED otherwise.
+static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
+{
+    sec_options *opt = (sec_options*)td->eo;
+    sec_data *bsd = (sec_data*)td->io_ops_data;
+    // Sequence number of this operation, used only in trace output
+    int n = bsd->op_n;
+
+    fio_ro_check(td, io);
+    if (io->ddir == DDIR_SYNC && bsd->last_sync)
+    {
+        // Nothing was written since the previous sync
+        return FIO_Q_COMPLETED;
+    }
+
+    io->engine_data = bsd;
+    cluster_op_t *op = new cluster_op_t;
+
+    switch (io->ddir)
+    {
+    case DDIR_READ:
+        op->opcode = OSD_OP_READ;
+        op->inode = opt->inode;
+        op->offset = io->offset;
+        op->len = io->xfer_buflen;
+        op->buf = io->xfer_buf;
+        bsd->last_sync = false;
+        break;
+    case DDIR_WRITE:
+        op->opcode = OSD_OP_WRITE;
+        op->inode = opt->inode;
+        op->offset = io->offset;
+        op->len = io->xfer_buflen;
+        op->buf = io->xfer_buf;
+        bsd->last_sync = false;
+        break;
+    case DDIR_SYNC:
+        op->opcode = OSD_OP_SYNC;
+        bsd->last_sync = true;
+        break;
+    default:
+        // The operation is never executed on this path, so free it here
+        delete op;
+        io->error = EINVAL;
+        return FIO_Q_COMPLETED;
+    }
+
+    op->callback = [io, n](cluster_op_t *op)
+    {
+        io->error = op->retval < 0 ? -op->retval : 0;
+        sec_data *bsd = (sec_data*)io->engine_data;
+        bsd->inflight--;
+        bsd->completed.push_back(io);
+        if (bsd->trace)
+        {
+            printf("--- %s n=%d retval=%d\n", io->ddir == DDIR_READ ? "READ" :
+                (io->ddir == DDIR_WRITE ? "WRITE" : "SYNC"), n, op->retval);
+        }
+        delete op;
+    };
+
+    if (opt->trace)
+    {
+        printf("+++ %s # %d\n", io->ddir == DDIR_READ ? "READ" :
+            (io->ddir == DDIR_WRITE ? "WRITE" : "SYNC"), n);
+    }
+
+    io->error = 0;
+    bsd->inflight++;
+    bsd->op_n++;
+    bsd->cli->execute(op);
+
+    if (io->error != 0)
+        return FIO_Q_COMPLETED;
+    return FIO_Q_QUEUED;
+}
+
+// fio "getevents" callback: runs the ring loop until at least <min> requests
+// have completed and returns the number of completed requests collected.
+// <max> and <t> are not used.
+static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int max, const struct timespec *t)
+{
+    sec_data *bsd = (sec_data*)td->io_ops_data;
+    bsd->ringloop->loop();
+    while (bsd->completed.size() < min)
+    {
+        // Not enough completions yet: wait for more events and process them
+        bsd->ringloop->wait();
+        bsd->ringloop->loop();
+    }
+    return bsd->completed.size();
+}
+
+// fio "event" callback: hands out one completed request, taking the most
+// recently completed one first. Returns NULL when nothing is left.
+static struct io_u *sec_event(struct thread_data *td, int event)
+{
+    sec_data *bsd = (sec_data*)td->io_ops_data;
+    if (bsd->completed.empty())
+    {
+        return NULL;
+    }
+    /* FIXME We ignore the event number and assume fio calls us exactly once for [0..nr_events-1] */
+    struct io_u *finished = bsd->completed.back();
+    bsd->completed.pop_back();
+    return finished;
+}
+
+// fio "io_u_init" callback: clears the engine data pointer of a new io_u; always returns 0
+static int sec_io_u_init(struct thread_data *td, struct io_u *io)
+{
+    io->engine_data = NULL;
+    return 0;
+}
+
+// fio "io_u_free" callback: nothing to release, the engine keeps no per-io_u allocations
+static void sec_io_u_free(struct thread_data *td, struct io_u *io)
+{
+}
+
+// fio "open_file" callback: no-op that always reports success
+static int sec_open_file(struct thread_data *td, struct fio_file *f)
+{
+    return 0;
+}
+
+// fio "invalidate" callback: no-op that always reports success
+static int sec_invalidate(struct thread_data *td, struct fio_file *f)
+{
+    return 0;
+}
+
+// Engine descriptor registered with fio: binds the callbacks defined above
+// and describes the engine-specific options.
+struct ioengine_ops ioengine = {
+    .name               = "microceph_cluster",
+    .version            = FIO_IOOPS_VERSION,
+    .flags              = FIO_MEMALIGN | FIO_DISKLESSIO | FIO_NOEXTEND,
+    .setup              = sec_setup,
+    .init               = sec_init,
+    .queue              = sec_queue,
+    .getevents          = sec_getevents,
+    .event              = sec_event,
+    .cleanup            = sec_cleanup,
+    .open_file          = sec_open_file,
+    .invalidate         = sec_invalidate,
+    .io_u_init          = sec_io_u_init,
+    .io_u_free          = sec_io_u_free,
+    .option_struct_size = sizeof(struct sec_options),
+    .options            = options,
+};
+
+// Registers the engine with fio (function is marked with fio's fio_init attribute macro)
+static void fio_init fio_sec_register(void)
+{
+    register_ioengine(&ioengine);
+}
+
+// Unregisters the engine from fio (function is marked with fio's fio_exit attribute macro)
+static void fio_exit fio_sec_unregister(void)
+{
+    unregister_ioengine(&ioengine);
+}
--- a/fio_engine.cpp
+++ b/fio_engine.cpp
@@ -23,6 +23,7 @@

 #include "blockstore.h"
 extern "C" {
+#define CONFIG_HAVE_GETTID
 #define CONFIG_PWRITEV2
 #include "fio/fio.h"
 #include "fio/optgroup.h"
@@ -100,7 +101,7 @@ static void bs_cleanup(struct thread_data *td)
                bsd->ringloop->loop();
                if (bsd->bs->is_safe_to_stop())
                    goto safe;
-            } while (bsd->ringloop->get_loop_again());
+            } while (bsd->ringloop->has_work());
            bsd->ringloop->wait();
        }
    safe:
--- a/fio_sec_osd.cpp
+++ b/fio_sec_osd.cpp
@@ -5,7 +5,7 @@
 // Random write:
 //
 // fio -thread -ioengine=./libfio_sec_osd.so -name=test -bs=4k -direct=1 -fsync=16 -iodepth=16 -rw=randwrite \
-//     -host=127.0.0.1 -port=11203 [-single_primary=1] -size=1000M
+//     -host=127.0.0.1 -port=11203 [-block_size_order=17] [-single_primary=1] -size=1000M
 //
 // Linear write:
 //
@@ -28,6 +28,7 @@
 #include "rw_blocking.h"
 #include "osd_ops.h"
 extern "C" {
+#define CONFIG_HAVE_GETTID
 #define CONFIG_PWRITEV2
 #include "fio/fio.h"
 #include "fio/optgroup.h"
@@ -52,6 +53,7 @@ struct sec_options
    int port = 0;
    int single_primary = 0;
    int trace = 0;
+    int block_order = 17;
 };

 static struct fio_option options[] = {
@@ -73,6 +75,15 @@ static struct fio_option options[] = {
        .category = FIO_OPT_C_ENGINE,
        .group  = FIO_OPT_G_FILENAME,
    },
+    {
+        .name   = "block_size_order",
+        .lname  = "Blockstore block size order",
+        .type   = FIO_OPT_INT,
+        .off1   = offsetof(struct sec_options, block_order),
+        .help   = "Blockstore block size order (size = 2^order)",
+        .category = FIO_OPT_C_ENGINE,
+        .group  = FIO_OPT_G_FILENAME,
+    },
    {
        .name   = "single_primary",
        .lname  = "Single Primary",
@@ -139,6 +150,8 @@ static int sec_init(struct thread_data *td)
 {
    sec_options *o = (sec_options*)td->eo;
    sec_data *bsd = (sec_data*)td->io_ops_data;
+    bsd->block_order = o->block_order == 0 ? 17 : o->block_order; // 0 means "not set": use the default order 17
+    bsd->block_size = 1 << bsd->block_order; // derive from the defaulted order, not from the raw option

    struct sockaddr_in addr;
    int r;
--- a/http_client.cpp
+++ b/http_client.cpp
@@ -0,0 +1,680 @@
+#include <netinet/tcp.h>
+#include <sys/epoll.h>
+
+#include <net/if.h>
+#include <arpa/inet.h>
+#include <ifaddrs.h>
+
+#include <ctype.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+
+#include "json11/json11.hpp"
+#include "http_client.h"
+#include "timerfd_manager.h"
+
+#define READ_BUFFER_SIZE 9000
+
+static int extract_port(std::string & host);
+static std::string strtolower(const std::string & in);
+static std::string trim(const std::string & in);
+static std::string ws_format_frame(int type, uint64_t size);
+static bool ws_parse_frame(std::string & buf, int & type, std::string & res);
+
+// FIXME: Use keepalive
+// State of one in-flight HTTP request or websocket connection.
+// The object deletes itself through end(); stackin()/stackout() postpone that
+// deletion while one of its own methods is still running.
+struct http_co_t
+{
+    timerfd_manager_t *tfd;
+
+    // Value passed to tfd->set_timer(); a timer is armed only when it is > 0
+    int request_timeout = 0;
+    std::string host;
+    // Outgoing bytes; see `sent` below
+    std::string request;
+    // Websocket frames queued by post_message() until the upgrade is accepted
+    std::string ws_outbox;
+    // Received bytes not yet consumed by handle_read()
+    std::string response;
+    // When true, chunked body data is handed to the callback as it arrives
+    bool want_streaming;
+
+    http_response_t parsed;
+    // Taken from the Content-Length header; 0 when unknown
+    uint64_t target_response_size = 0;
+
+    // One of the HTTP_CO_* constants, 0 before the connection is started
+    int state = 0;
+    int peer_fd = -1;
+    // Timer id returned by tfd->set_timer(), -1 when no timer is armed
+    int timeout_id = -1;
+    // Epoll event bits accumulated by the fd handler, consumed by handle_events()
+    int epoll_events = 0;
+    // Number of bytes of `request` already written to the socket
+    int sent = 0;
+    // Receive buffer, resized to READ_BUFFER_SIZE in submit_read()
+    std::vector<char> rbuf;
+    iovec read_iov, send_iov;
+    msghdr read_msg = { 0 }, send_msg = { 0 };
+
+    std::function<void(const http_response_t*)> callback;
+
+    websocket_t ws;
+
+    // Nesting depth of member functions currently executing
+    int onstack = 0;
+    // Set by end(); the object is deleted once onstack drops to 0
+    bool ended = false;
+
+    // Finalizes the response and invokes the callback one last time
+    ~http_co_t();
+    inline void stackin() { onstack++; }
+    inline void stackout() { onstack--; if (!onstack && ended) end(); }
+    inline void end() { ended = true; if (!onstack) { delete this; } }
+    void start_connection();
+    void handle_events();
+    void handle_connect_result();
+    void submit_read();
+    void submit_send();
+    bool handle_read();
+    void post_message(int type, const std::string & msg);
+};
+
+#define HTTP_CO_CONNECTING 1
+#define HTTP_CO_SENDING_REQUEST 2
+#define HTTP_CO_REQUEST_SENT 3
+#define HTTP_CO_HEADERS_RECEIVED 4
+#define HTTP_CO_WEBSOCKET 5
+#define HTTP_CO_CHUNKED 6
+
+#define DEFAULT_TIMEOUT 5000
+
+// Start an asynchronous HTTP request to <host>. <request> is the complete raw
+// request text. The handler object owns itself and is deleted when the request ends.
+void http_request(timerfd_manager_t *tfd, const std::string & host, const std::string & request,
+    const http_options_t & options, std::function<void(const http_response_t *response)> callback)
+{
+    // Negative timeout disables the timer, zero selects DEFAULT_TIMEOUT
+    int effective_timeout = options.timeout;
+    if (effective_timeout < 0)
+    {
+        effective_timeout = 0;
+    }
+    else if (effective_timeout == 0)
+    {
+        effective_timeout = DEFAULT_TIMEOUT;
+    }
+    http_co_t *co = new http_co_t();
+    co->tfd = tfd;
+    co->host = host;
+    co->request = request;
+    co->callback = callback;
+    co->request_timeout = effective_timeout;
+    co->want_streaming = options.want_streaming;
+    co->ws.co = co;
+    co->start_connection();
+}
+
+void http_request_json(timerfd_manager_t *tfd, const std::string & host, const std::string & request,
+    int timeout, std::function<void(std::string, json11::Json r)> callback)
+{
+    http_request(tfd, host, request, { .timeout = timeout }, [callback](const http_response_t* res)
+    {
+        if (res->error_code != 0)
+        {
+            callback("Error code: "+std::to_string(res->error_code)+" ("+std::string(strerror(res->error_code))+")", json11::Json());
+            return;
+        }
+        if (res->status_code != 200)
+        {
+            callback("HTTP "+std::to_string(res->status_code)+" "+res->status_line+" body: "+trim(res->body), json11::Json());
+            return;
+        }
+        std::string json_err;
+        json11::Json data = json11::Json::parse(res->body, json_err);
+        if (json_err != "")
+        {
+            callback("Bad JSON: "+json_err+" (response: "+trim(res->body)+")", json11::Json());
+            return;
+        }
+        callback(std::string(), data);
+    });
+}
+
+// Open a websocket connection to <host> by sending an HTTP upgrade request for <path>.
+// The callback is invoked for every received message and once more when the
+// connection object is destroyed.
+// NOTE(review): start_connection() deletes the handler when it fails synchronously
+// (bad address, socket() or connect() error); the pointer returned here would then
+// be dangling - confirm how callers are expected to detect that.
+websocket_t* open_websocket(timerfd_manager_t *tfd, const std::string & host, const std::string & path,
+    int timeout, std::function<void(const http_response_t *msg)> callback)
+{
+    // The key is a fixed constant; the server's accept value is not validated (see handle_read)
+    std::string request = "GET "+path+" HTTP/1.1\r\n"
+        "Host: "+host+"\r\n"
+        "Upgrade: websocket\r\n"
+        "Connection: upgrade\r\n"
+        "Sec-WebSocket-Key: x3JJHMbDL1EzLkh9GBhXDw==\r\n"
+        "Sec-WebSocket-Version: 13\r\n"
+        "\r\n";
+    http_co_t *handler = new http_co_t();
+    // Negative timeout = no timer (only values > 0 arm it), 0 = DEFAULT_TIMEOUT
+    handler->request_timeout = timeout < 0 ? -1 : (timeout == 0 ? DEFAULT_TIMEOUT : timeout);
+    handler->want_streaming = false;
+    handler->tfd = tfd;
+    handler->host = host;
+    handler->request = request;
+    handler->callback = callback;
+    handler->ws.co = handler;
+    handler->start_connection();
+    return &handler->ws;
+}
+
+// Forward a message of the given frame type (WS_*) to the owning connection
+void websocket_t::post_message(int type, const std::string & msg)
+{
+    co->post_message(type, msg);
+}
+
+// End the owning connection; it deletes itself (and this websocket_t with it)
+// as soon as none of its methods is on the call stack
+void websocket_t::close()
+{
+    co->end();
+}
+
+// Releases the timer and the socket, moves the data still left in the receive
+// buffer into parsed.body and delivers the final (eof) response to the callback.
+http_co_t::~http_co_t()
+{
+    if (timeout_id >= 0)
+    {
+        tfd->clear_timer(timeout_id);
+        timeout_id = -1;
+    }
+    if (peer_fd >= 0)
+    {
+        tfd->set_fd_handler(peer_fd, NULL);
+        close(peer_fd);
+        peer_fd = -1;
+    }
+    if (parsed.headers["transfer-encoding"] == "chunked")
+    {
+        // Salvage the chunks that are still unparsed. Offsets are size_t and are
+        // never advanced past the buffer, so a bogus chunk size can't make them wrap.
+        size_t prev = 0, pos = 0;
+        while ((pos = response.find("\r\n", prev)) != std::string::npos)
+        {
+            uint64_t len = strtoull(response.c_str()+prev, NULL, 16);
+            size_t data_pos = pos+2;
+            size_t avail = response.size() - data_pos;
+            // substr() clamps the length to the end of the buffer
+            parsed.body += response.substr(data_pos, len);
+            if (len >= avail)
+            {
+                // This (possibly truncated) chunk consumed the rest of the buffer
+                break;
+            }
+            prev = data_pos+len+2;
+        }
+    }
+    else
+    {
+        std::swap(parsed.body, response);
+    }
+    parsed.eof = true;
+    // An empty std::function would throw out of the destructor and terminate the process
+    if (callback)
+    {
+        callback(&parsed);
+    }
+}
+
+// Parses "host[:port]" (IPv4 literal only), creates a non-blocking socket, arms
+// the request timer and starts connect(). On any immediate failure the error is
+// stored in parsed.error_code and the object is ended (which deletes it).
+void http_co_t::start_connection()
+{
+    stackin();
+    int port = extract_port(host);
+    // Zero-initialize so no uninitialized padding bytes are handed to connect()
+    struct sockaddr_in addr = {};
+    int r;
+    if ((r = inet_pton(AF_INET, host.c_str(), &addr.sin_addr)) != 1)
+    {
+        parsed.error_code = ENXIO;
+        stackout();
+        end();
+        return;
+    }
+    addr.sin_family = AF_INET;
+    addr.sin_port = htons(port ? port : 80);
+    peer_fd = socket(AF_INET, SOCK_STREAM, 0);
+    if (peer_fd < 0)
+    {
+        parsed.error_code = errno;
+        stackout();
+        end();
+        return;
+    }
+    fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
+    if (request_timeout > 0)
+    {
+        timeout_id = tfd->set_timer(request_timeout, false, [this](int timer_id)
+        {
+            // Report a timeout only if nothing has been received at all
+            if (response.length() == 0)
+            {
+                parsed.error_code = ETIME;
+            }
+            end();
+        });
+    }
+    epoll_events = 0;
+    // Finally call connect
+    r = ::connect(peer_fd, (sockaddr*)&addr, sizeof(addr));
+    if (r < 0 && errno != EINPROGRESS)
+    {
+        parsed.error_code = errno;
+        stackout();
+        end();
+        return;
+    }
+    tfd->set_fd_handler(peer_fd, [this](int peer_fd, int epoll_events)
+    {
+        // Accumulate the event bits; handle_events() clears them as they are processed
+        this->epoll_events |= epoll_events;
+        handle_events();
+    });
+    state = HTTP_CO_CONNECTING;
+    stackout();
+}
+
+// Processes the accumulated epoll event bits until none are left or the
+// connection has been ended.
+void http_co_t::handle_events()
+{
+    stackin();
+    // Stop as soon as end() was called by one of the handlers below, otherwise
+    // a failed connection would keep spinning on event bits that never get cleared
+    while (epoll_events && !ended)
+    {
+        if (state == HTTP_CO_CONNECTING)
+        {
+            handle_connect_result();
+        }
+        else
+        {
+            epoll_events &= ~EPOLLOUT;
+            if (epoll_events & EPOLLIN)
+            {
+                submit_read();
+            }
+            else if (epoll_events & (EPOLLRDHUP|EPOLLERR|EPOLLHUP))
+            {
+                end();
+                break;
+            }
+            else
+            {
+                // Bits we have no handler for - drop them instead of looping forever
+                epoll_events = 0;
+            }
+        }
+    }
+    stackout();
+}
+
+// Checks the outcome of the non-blocking connect(). On failure stores the error
+// and ends the request; on success enables TCP_NODELAY and starts sending the request.
+void http_co_t::handle_connect_result()
+{
+    stackin();
+    int result = 0;
+    socklen_t result_len = sizeof(result);
+    // SO_ERROR holds the pending error of the asynchronous connect
+    if (getsockopt(peer_fd, SOL_SOCKET, SO_ERROR, &result, &result_len) < 0)
+    {
+        result = errno;
+    }
+    if (result != 0)
+    {
+        parsed.error_code = result;
+        stackout();
+        end();
+        return;
+    }
+    int one = 1;
+    setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
+    state = HTTP_CO_SENDING_REQUEST;
+    submit_send();
+    stackout();
+}
+
+// Reads once from the socket. New data is appended to `response` and passed to
+// handle_read(); EAGAIN clears EPOLLIN; EOF or a hard error ends the request.
+void http_co_t::submit_read()
+{
+    stackin();
+    int res;
+    if (rbuf.size() != READ_BUFFER_SIZE)
+    {
+        rbuf.resize(READ_BUFFER_SIZE);
+    }
+    read_iov = { .iov_base = rbuf.data(), .iov_len = READ_BUFFER_SIZE };
+    read_msg.msg_iov = &read_iov;
+    read_msg.msg_iovlen = 1;
+    res = recvmsg(peer_fd, &read_msg, 0);
+    if (res < 0)
+    {
+        res = -errno;
+    }
+    if (res == -EAGAIN)
+    {
+        // Socket is drained, wait for the next EPOLLIN
+        epoll_events = epoll_events & ~EPOLLIN;
+    }
+    else if (res <= 0)
+    {
+        // 0 means the peer closed the connection, <0 is a hard error. Nothing more
+        // can be read in either case, so clear EPOLLIN (otherwise the caller
+        // would retry the read forever) and finish the request.
+        epoll_events = epoll_events & ~EPOLLIN;
+        end();
+    }
+    else
+    {
+        response.append(rbuf.data(), res);
+        handle_read();
+    }
+    stackout();
+}
+
+// Writes the unsent part of `request` to the socket. In HTTP_CO_SENDING_REQUEST
+// state it switches to HTTP_CO_REQUEST_SENT once everything is written; in
+// HTTP_CO_WEBSOCKET state the written part is dropped from `request`.
+// A send error ends the request.
+void http_co_t::submit_send()
+{
+    stackin();
+    int res;
+again:
+    if (sent < request.size())
+    {
+        send_iov = (iovec){ .iov_base = (void*)(request.c_str()+sent), .iov_len = request.size()-sent };
+        send_msg.msg_iov = &send_iov;
+        send_msg.msg_iovlen = 1;
+        // MSG_NOSIGNAL: get EPIPE instead of SIGPIPE on a closed connection
+        res = sendmsg(peer_fd, &send_msg, MSG_NOSIGNAL);
+        if (res < 0)
+        {
+            res = -errno;
+        }
+        if (res == -EAGAIN)
+        {
+            // Counted as "0 bytes sent"; the jumps to `again` below then retry
+            // immediately instead of waiting for EPOLLOUT
+            res = 0;
+        }
+        else if (res < 0)
+        {
+            stackout();
+            end();
+            return;
+        }
+        sent += res;
+        if (state == HTTP_CO_SENDING_REQUEST)
+        {
+            if (sent >= request.size())
+            {
+                state = HTTP_CO_REQUEST_SENT;
+            }
+            else
+                goto again;
+        }
+        else if (state == HTTP_CO_WEBSOCKET)
+        {
+            // Drop the part that was sent and continue with the remainder
+            request = request.substr(sent);
+            sent = 0;
+            goto again;
+        }
+    }
+    stackout();
+}
+
+// Interprets the bytes accumulated in `response` according to the current state:
+// parses the headers, then handles plain (Content-Length), chunked or websocket
+// bodies. Returns false when the request was ended here, true otherwise.
+bool http_co_t::handle_read()
+{
+    stackin();
+    if (state == HTTP_CO_REQUEST_SENT)
+    {
+        int pos = response.find("\r\n\r\n");
+        if (pos >= 0)
+        {
+            // Headers are complete - the request timer is no longer needed
+            if (timeout_id >= 0)
+            {
+                tfd->clear_timer(timeout_id);
+                timeout_id = -1;
+            }
+            state = HTTP_CO_HEADERS_RECEIVED;
+            // Also strips the header part from `response`
+            parse_http_headers(response, &parsed);
+            if (parsed.status_code == 101 &&
+                parsed.headers.find("sec-websocket-accept") != parsed.headers.end() &&
+                parsed.headers["upgrade"] == "websocket" &&
+                parsed.headers["connection"] == "upgrade")
+            {
+                // Don't care about validating the key
+                state = HTTP_CO_WEBSOCKET;
+                // Flush the messages queued while the upgrade was pending
+                request = ws_outbox;
+                ws_outbox = "";
+                sent = 0;
+                submit_send();
+            }
+            else if (parsed.headers["transfer-encoding"] == "chunked")
+            {
+                state = HTTP_CO_CHUNKED;
+            }
+            else if (parsed.headers["connection"] != "close")
+            {
+                target_response_size = stoull_full(parsed.headers["content-length"]);
+                if (!target_response_size)
+                {
+                    // Sorry, unsupported response
+                    stackout();
+                    end();
+                    return false;
+                }
+            }
+        }
+    }
+    if (state == HTTP_CO_HEADERS_RECEIVED && target_response_size > 0 && response.size() >= target_response_size)
+    {
+        // The whole body has arrived
+        stackout();
+        end();
+        return false;
+    }
+    if (state == HTTP_CO_CHUNKED && response.size() > 0)
+    {
+        size_t prev = 0, pos = 0;
+        while ((pos = response.find("\r\n", prev)) != std::string::npos)
+        {
+            uint64_t len = strtoull(response.c_str()+prev, NULL, 16);
+            if (!len)
+            {
+                // Zero length chunk indicates EOF
+                parsed.eof = true;
+                break;
+            }
+            // Compare against the bytes actually available instead of adding to
+            // len, so a huge chunk size sent by the server can't wrap around
+            size_t avail = response.size() - (pos+2);
+            if (avail < 2 || len > avail-2)
+            {
+                // Chunk (plus its trailing CRLF) is not complete yet
+                break;
+            }
+            parsed.body += response.substr(pos+2, len);
+            prev = pos+2+len+2;
+        }
+        if (prev > 0)
+        {
+            response = response.substr(prev);
+        }
+        if (parsed.eof)
+        {
+            stackout();
+            end();
+            return false;
+        }
+        if (want_streaming && parsed.body.size() > 0)
+        {
+            callback(&parsed);
+            parsed.body = "";
+        }
+    }
+    if (state == HTTP_CO_WEBSOCKET && response.size() > 0)
+    {
+        // Deliver every complete frame as a separate message
+        while (ws_parse_frame(response, parsed.ws_msg_type, parsed.body))
+        {
+            callback(&parsed);
+            parsed.body = "";
+        }
+    }
+    stackout();
+    return true;
+}
+
+// Sends a websocket message of the given type, or queues it in ws_outbox
+// while the connection is not yet in websocket state.
+void http_co_t::post_message(int type, const std::string & msg)
+{
+    stackin();
+    std::string frame = ws_format_frame(type, msg.size());
+    frame += msg;
+    if (state != HTTP_CO_WEBSOCKET)
+    {
+        ws_outbox += frame;
+    }
+    else
+    {
+        request += frame;
+        submit_send();
+    }
+    stackout();
+}
+
+// Strictly parses the whole string as an unsigned integer in the given base.
+// Returns 0 if the string has leading whitespace, a minus sign, trailing
+// garbage, or does not fit into 64 bits.
+uint64_t stoull_full(const std::string & str, int base)
+{
+    // strtoull() would silently skip whitespace and negate a "-N" value.
+    // The cast keeps isspace() defined for bytes >= 0x80.
+    if (isspace((unsigned char)str[0]) || str[0] == '-')
+    {
+        return 0;
+    }
+    char *end = NULL;
+    errno = 0;
+    uint64_t r = strtoull(str.c_str(), &end, base);
+    if (errno == ERANGE || end != str.c_str()+str.length())
+    {
+        return 0;
+    }
+    return r;
+}
+
+// Parses the status line and the headers at the beginning of <res> into <parsed>.
+// Header names are lowercased; the values of connection, upgrade and
+// transfer-encoding are lowercased too. When the empty line terminating the
+// headers is found, everything up to and including it is removed from <res>.
+void parse_http_headers(std::string & res, http_response_t *parsed)
+{
+    int pos = res.find("\r\n");
+    pos = pos < 0 ? res.length() : pos+2;
+    std::string status_line = res.substr(0, pos);
+    int http_version = 0;
+    char *status_text = NULL;
+    // %m[^\r\n] captures the whole reason phrase ("Not Found"), not just its first word
+    sscanf(status_line.c_str(), "HTTP/1.%d %d %m[^\r\n]", &http_version, &parsed->status_code, &status_text);
+    if (status_text)
+    {
+        parsed->status_line = status_text;
+        // %m = allocate a buffer
+        free(status_text);
+        status_text = NULL;
+    }
+    int prev = pos;
+    while ((pos = res.find("\r\n", prev)) >= prev)
+    {
+        if (pos == prev)
+        {
+            // Empty line = end of headers
+            res = res.substr(pos+2);
+            break;
+        }
+        std::string header = res.substr(prev, pos-prev);
+        int p2 = header.find(":");
+        if (p2 >= 0)
+        {
+            std::string key = strtolower(header.substr(0, p2));
+            int p3 = p2+1;
+            // Skip blanks after the colon; the cast keeps isblank() defined for bytes >= 0x80
+            while (p3 < header.length() && isblank((unsigned char)header[p3]))
+                p3++;
+            parsed->headers[key] = key == "connection" || key == "upgrade" || key == "transfer-encoding"
+                ? strtolower(header.substr(p3)) : header.substr(p3);
+        }
+        prev = pos+2;
+    }
+}
+
+// Builds the header of a single final (FIN) websocket frame of the given type
+// and payload size. The mask bit is set and the 4-byte masking key is all
+// zeroes, so the payload can be appended unmodified.
+static std::string ws_format_frame(int type, uint64_t size)
+{
+    std::string frame;
+    frame.reserve(14);
+    frame.push_back((char)(0x80 | type));
+    if (size < 126)
+    {
+        frame.push_back((char)(size | /*mask*/0x80));
+    }
+    else
+    {
+        // 16-bit or 64-bit extended length, most significant byte first
+        const bool is_short = size < 65536;
+        frame.push_back((char)((is_short ? 126 : 127) | /*mask*/0x80));
+        for (int shift = is_short ? 8 : 56; shift >= 0; shift -= 8)
+        {
+            frame.push_back((char)((size >> shift) & 0xFF));
+        }
+    }
+    // Always zero mask
+    frame.append(4, '\0');
+    return frame;
+}
+
+// Tries to extract one websocket frame from the beginning of <buf>.
+// On success the opcode is stored in <type>, the (unmasked) payload is appended
+// to <res>, the frame is removed from <buf> and true is returned. Returns false
+// when <buf> does not contain a complete frame yet (<type> may already be set).
+static bool ws_parse_frame(std::string & buf, int & type, std::string & res)
+{
+    uint64_t hdr = 2;
+    if (buf.size() < hdr)
+    {
+        return false;
+    }
+    // Opcode is the low nibble of the first byte. Go through uint8_t: plain char
+    // may be signed, and a byte with the FIN bit set would become negative.
+    type = (uint8_t)buf[0] & 0x0F;
+    bool mask = !!(buf[1] & 0x80);
+    hdr += mask ? 4 : 0;
+    uint64_t len = ((uint8_t)buf[1] & ~0x80);
+    if (len == 126)
+    {
+        hdr += 2;
+        if (buf.size() < hdr)
+        {
+            return false;
+        }
+        len = ((uint64_t)(uint8_t)buf[2] << 8) | ((uint64_t)(uint8_t)buf[3] << 0);
+    }
+    else if (len == 127)
+    {
+        hdr += 8;
+        if (buf.size() < hdr)
+        {
+            return false;
+        }
+        len = ((uint64_t)(uint8_t)buf[2] << 56) |
+            ((uint64_t)(uint8_t)buf[3] << 48) |
+            ((uint64_t)(uint8_t)buf[4] << 40) |
+            ((uint64_t)(uint8_t)buf[5] << 32) |
+            ((uint64_t)(uint8_t)buf[6] << 24) |
+            ((uint64_t)(uint8_t)buf[7] << 16) |
+            ((uint64_t)(uint8_t)buf[8] << 8) |
+            ((uint64_t)(uint8_t)buf[9] << 0);
+    }
+    // Written without hdr+len so a huge declared length can't overflow the check
+    if (buf.size() < hdr || len > buf.size() - hdr)
+    {
+        return false;
+    }
+    if (mask)
+    {
+        // The masking key is the 4 bytes right before the payload
+        for (uint64_t i = 0; i < len; i++)
+            buf[hdr+i] ^= buf[hdr-4+(i & 3)];
+    }
+    res += buf.substr(hdr, len);
+    buf = buf.substr(hdr+len);
+    return true;
+}
+
+// Returns the textual addresses of all interfaces that are up, running and not
+// loopback. IPv4 addresses are always included, IPv6 only when include_v6 is set.
+// Throws std::runtime_error if getifaddrs() or inet_ntop() fails.
+std::vector<std::string> getifaddr_list(bool include_v6)
+{
+    std::vector<std::string> addresses;
+    ifaddrs *list, *ifa;
+    if (getifaddrs(&list) == -1)
+    {
+        throw std::runtime_error(std::string("getifaddrs: ") + strerror(errno));
+    }
+    for (ifa = list; ifa != NULL; ifa = ifa->ifa_next)
+    {
+        if (!ifa->ifa_addr)
+        {
+            continue;
+        }
+        int family = ifa->ifa_addr->sa_family;
+        if ((family == AF_INET || (family == AF_INET6 && include_v6)) &&
+            (ifa->ifa_flags & (IFF_UP | IFF_RUNNING | IFF_LOOPBACK)) == (IFF_UP | IFF_RUNNING))
+        {
+            void *addr_ptr;
+            if (family == AF_INET)
+                addr_ptr = &((sockaddr_in *)ifa->ifa_addr)->sin_addr;
+            else
+                addr_ptr = &((sockaddr_in6 *)ifa->ifa_addr)->sin6_addr;
+            char addr[INET6_ADDRSTRLEN];
+            if (!inet_ntop(family, addr_ptr, addr, INET6_ADDRSTRLEN))
+            {
+                // Release the list before throwing so it is not leaked;
+                // save errno first because freeifaddrs() may change it
+                int err = errno;
+                freeifaddrs(list);
+                throw std::runtime_error(std::string("inet_ntop: ") + strerror(err));
+            }
+            addresses.push_back(std::string(addr));
+        }
+    }
+    freeifaddrs(list);
+    return addresses;
+}
+
+// Splits "host:port" in place: <host> is cut at the first ':' and the port is
+// returned. Returns 0 when there is no port or it is not a decimal number
+// below 65536.
+static int extract_port(std::string & host)
+{
+    int port = 0;
+    size_t pos = host.find(':');
+    if (pos != std::string::npos)
+    {
+        const char *port_str = host.c_str() + pos + 1;
+        // Require a leading digit: strtoull() alone would also accept
+        // whitespace and signs ("-1" would turn into a huge number)
+        if (isdigit((unsigned char)*port_str))
+        {
+            // Range-check the full 64-bit value before narrowing it to int
+            unsigned long long value = strtoull(port_str, NULL, 10);
+            if (value < 0x10000)
+            {
+                port = (int)value;
+            }
+        }
+        host = host.substr(0, pos);
+    }
+    return port;
+}
+
+// Returns a copy of <in> with every character converted by tolower()
+static std::string strtolower(const std::string & in)
+{
+    std::string s = in;
+    for (size_t i = 0; i < s.length(); i++)
+    {
+        // tolower() is only defined for unsigned char values (and EOF)
+        s[i] = (char)tolower((unsigned char)s[i]);
+    }
+    return s;
+}
+
+// Returns <in> without leading and trailing spaces, tabs, CRs and LFs
+static std::string trim(const std::string & in)
+{
+    static const char *blanks = " \n\r\t";
+    const size_t first = in.find_first_not_of(blanks);
+    if (first == std::string::npos)
+    {
+        // Empty or whitespace-only
+        return "";
+    }
+    const size_t last = in.find_last_not_of(blanks);
+    return in.substr(first, last-first+1);
+}
--- a/http_client.h
+++ b/http_client.h
@@ -0,0 +1,56 @@
+#pragma once
+#include <string>
+#include <vector>
+#include <map>
+#include <functional>
+#include "json11/json11.hpp"
+
+#define WS_CONTINUATION 0
+#define WS_TEXT 1
+#define WS_BINARY 2
+#define WS_CLOSE 8
+#define WS_PING 9
+#define WS_PONG 10
+
+class timerfd_manager_t;
+
+// Options for http_request()
+struct http_options_t
+{
+    // Request timeout: < 0 disables it, 0 selects the built-in default
+    int timeout;
+    // Deliver chunked response data to the callback as it arrives
+    bool want_streaming;
+};
+
+// Response (or websocket message) passed to the request callbacks
+struct http_response_t
+{
+    // True when this is the final callback for the request
+    bool eof = false;
+    // errno-style transport error, 0 if none
+    int error_code = 0;
+    // HTTP status code, 0 if no status line was parsed
+    int status_code = 0;
+    // Status text following the status code
+    std::string status_line;
+    // Response headers; names are lowercased by parse_http_headers()
+    std::map<std::string, std::string> headers;
+    // Frame type of a received websocket message (WS_*), -1 otherwise
+    int ws_msg_type = -1;
+    std::string body;
+};
+
+struct http_co_t;
+
+// Handle of an open websocket connection, returned by open_websocket()
+struct websocket_t
+{
+    // Connection object that owns this handle
+    http_co_t *co;
+    // Send a message; <type> is one of the WS_* frame types
+    void post_message(int type, const std::string & msg);
+    // Close the connection
+    void close();
+};
+
+// Parse the status line and headers at the start of <res> into <parsed>
+void parse_http_headers(std::string & res, http_response_t *parsed);
+
+// List the addresses of the local network interfaces
+std::vector<std::string> getifaddr_list(bool include_v6 = false);
+
+// Parse a whole string as an unsigned integer, returning 0 if it is malformed
+uint64_t stoull_full(const std::string & str, int base = 10);
+
+// Send raw HTTP request text <request> to <host> and report the response via <callback>
+void http_request(timerfd_manager_t *tfd, const std::string & host, const std::string & request,
+    const http_options_t & options, std::function<void(const http_response_t *response)> callback);
+
+// Same, but the callback receives (error message, parsed JSON body)
+void http_request_json(timerfd_manager_t *tfd, const std::string & host, const std::string & request,
+    int timeout, std::function<void(std::string, json11::Json r)> callback);
+
+// Open a websocket connection to <path> on <host>; <callback> receives the incoming messages
+websocket_t* open_websocket(timerfd_manager_t *tfd, const std::string & host, const std::string & path,
+    int timeout, std::function<void(const http_response_t *msg)> callback);
--- a/lp/lp-optimizer.js
+++ b/lp/lp-optimizer.js
@@ -0,0 +1,521 @@
+// Data distribution optimizer using linear programming (lp_solve)
+
+const child_process = require('child_process');
+
+const NO_OSD = 'Z';
+
+async function lp_solve(text)
+{
+    const cp = child_process.spawn('lp_solve');
+    let stdout = '', stderr = '', finish_cb;
+    cp.stdout.on('data', buf => stdout += buf.toString());
+    cp.stderr.on('data', buf => stderr += buf.toString());
+    cp.on('exit', () => finish_cb && finish_cb());
+    cp.stdin.write(text);
+    cp.stdin.end();
+    if (cp.exitCode == null)
+    {
+        await new Promise(ok => finish_cb = ok);
+    }
+    if (!stdout.trim())
+    {
+        return null;
+    }
+    let score = 0;
+    let vars = {};
+    for (const line of stdout.split(/\n/))
+    {
+        let m = /^(^Value of objective function: ([\d\.]+)|Actual values of the variables:)\s*$/.exec(line);
+        if (m)
+        {
+            if (m[2])
+            {
+                score = m[2];
+            }
+            continue;
+        }
+        else if (/This problem is (infeasible|unbounded)/.exec(line))
+        {
+            return null;
+        }
+        let [ k, v ] = line.trim().split(/\s+/, 2);
+        if (v)
+        {
+            vars[k] = v;
+        }
+    }
+    return { score, vars };
+}
+
+// Generate an initial PG distribution for <pg_count> PGs with lp_solve.
+// osd_tree = { failure_domain1: { osd1: size1, ... }, ... }
+// At most about <max_combinations> (default 10000) candidate PGs are considered;
+// when there are more, a random subset is taken.
+// Returns { score, weights, int_pgs, space, total_space }.
+async function optimize_initial(osd_tree, pg_count, max_combinations)
+{
+    max_combinations = max_combinations || 10000;
+    // Merge the OSDs of all failure domains into one { osd: size } map
+    const all_weights = Object.assign({}, ...Object.values(osd_tree));
+    const total_weight = Object.values(all_weights).reduce((a, c) => Number(a) + Number(c), 0);
+    let all_pgs = all_combinations(osd_tree, null, true);
+    if (all_pgs.length > max_combinations)
+    {
+        // Keep each combination with a probability that gives ~max_combinations in total
+        const prob = max_combinations/all_pgs.length;
+        all_pgs = all_pgs.filter(pg => Math.random() < prob);
+    }
+    // osd => names of the LP variables (candidate PGs) that include it
+    const pg_per_osd = {};
+    for (const pg of all_pgs)
+    {
+        for (const osd of pg)
+        {
+            pg_per_osd[osd] = pg_per_osd[osd] || [];
+            pg_per_osd[osd].push("pg_"+pg.join("_"));
+        }
+    }
+    const pg_size = Math.min(Object.keys(osd_tree).length, 3);
+    let lp = '';
+    // Objective: maximize the sum of all PG variables
+    lp += "max: "+all_pgs.map(pg => 'pg_'+pg.join('_')).join(' + ')+";\n";
+    for (const osd in pg_per_osd)
+    {
+        if (osd !== NO_OSD)
+        {
+            // Each OSD gets a share of PGs proportional to its weight
+            let osd_pg_count = all_weights[osd]/total_weight*pg_size*pg_count;
+            lp += pg_per_osd[osd].join(' + ')+' <= '+osd_pg_count+';\n';
+        }
+    }
+    for (const pg of all_pgs)
+    {
+        lp += 'pg_'+pg.join('_')+" >= 0;\n";
+    }
+    lp += "sec "+all_pgs.map(pg => 'pg_'+pg.join('_')).join(', ')+";\n";
+    const lp_result = await lp_solve(lp);
+    if (!lp_result)
+    {
+        throw new Error('Problem is infeasible or unbounded - is it a bug?');
+    }
+    // Turn the fractional solution into a list of pg_count concrete PGs
+    const int_pgs = make_int_pgs(lp_result.vars, pg_count);
+    const eff = pg_list_space_efficiency(int_pgs, all_weights);
+    return { score: lp_result.score, weights: lp_result.vars, int_pgs, space: eff*pg_size, total_space: total_weight };
+}
+
+function make_int_pgs(weights, pg_count)
+{
+    const total_weight = Object.values(weights).reduce((a, c) => Number(a) + Number(c), 0);
+    let int_pgs = [];
+    let pg_left = pg_count;
+    let weight_left = total_weight;
+    for (const pg_name in weights)
+    {
+        let n = Math.round(weights[pg_name] / weight_left * pg_left);
+        for (let i = 0; i < n; i++)
+        {
+            int_pgs.push(pg_name.substr(3).split('_'));
+        }
+        weight_left -= weights[pg_name];
+        pg_left -= n;
+    }
+    return int_pgs;
+}
+
+// Try to minimize data movement
+// Compute a new PG distribution for a changed <osd_tree>, starting from the
+// previous list of PGs <prev_int_pgs> (arrays of OSD ids) and favouring PGs
+// that are similar to the old ones.
+// Returns { prev_pgs, score, weights, int_pgs, differs, osd_differs, space, total_space }.
+async function optimize_change(prev_int_pgs, osd_tree, max_combinations)
+{
+    max_combinations = max_combinations || 10000;
+    const pg_size = Math.min(Object.keys(osd_tree).length, 3);
+    const pg_count = prev_int_pgs.length;
+    // pg_name => number of previous PGs with exactly this OSD set
+    const prev_weights = {};
+    // osd => names of the previous PGs that include it (one entry per PG)
+    const prev_pg_per_osd = {};
+    for (const pg of prev_int_pgs)
+    {
+        const pg_name = 'pg_'+pg.join('_');
+        prev_weights[pg_name] = (prev_weights[pg_name]||0) + 1;
+        for (const osd of pg)
+        {
+            prev_pg_per_osd[osd] = prev_pg_per_osd[osd] || [];
+            prev_pg_per_osd[osd].push(pg_name);
+        }
+    }
+    // Get all combinations
+    let all_pgs = all_combinations(osd_tree, null, true);
+    if (all_pgs.length > max_combinations)
+    {
+        // Too many candidates: sample randomly, preferring combinations that already exist
+        const intersecting = all_pgs.filter(pg => prev_weights['pg_'+pg.join('_')]);
+        if (intersecting.length > max_combinations)
+        {
+            const prob = max_combinations/intersecting.length;
+            all_pgs = intersecting.filter(pg => Math.random() < prob);
+        }
+        else
+        {
+            const prob = (max_combinations-intersecting.length)/all_pgs.length;
+            all_pgs = all_pgs.filter(pg => Math.random() < prob || prev_weights['pg_'+pg.join('_')]);
+        }
+    }
+    const pg_per_osd = {};
+    for (const pg of all_pgs)
+    {
+        const pg_name = 'pg_'+pg.join('_');
+        for (const osd of pg)
+        {
+            pg_per_osd[osd] = pg_per_osd[osd] || [];
+            pg_per_osd[osd].push(pg_name);
+        }
+    }
+    // Penalize PGs based on their similarity to old PGs
+    // Keys with one position filled get 2, keys with two positions filled get 1
+    const intersect = {};
+    for (const pg_name in prev_weights)
+    {
+        const pg = pg_name.substr(3).split(/_/);
+        intersect[pg[0]+'::'] = intersect[':'+pg[1]+':'] = intersect['::'+pg[2]] = 2;
+        intersect[pg[0]+'::'+pg[2]] = intersect[':'+pg[1]+':'+pg[2]] = intersect[pg[0]+':'+pg[1]+':'] = 1;
+    }
+    // pg_name => 1, 2 or 3 (3 = shares no position with any previous PG)
+    const move_weights = {};
+    for (const pg of all_pgs)
+    {
+        move_weights['pg_'+pg.join('_')] =
+            intersect[pg[0]+'::'+pg[2]] || intersect[':'+pg[1]+':'+pg[2]] || intersect[pg[0]+':'+pg[1]+':'] ||
+            intersect[pg[0]+'::'] || intersect[':'+pg[1]+':'] || intersect['::'+pg[2]] ||
+            3;
+    }
+    // Calculate total weight - old PG weights
+    const all_pg_names = all_pgs.map(pg => 'pg_'+pg.join('_'));
+    const all_weights = Object.assign({}, ...Object.values(osd_tree));
+    const total_weight = Object.values(all_weights).reduce((a, c) => Number(a) + Number(c), 0);
+    // Generate the LP problem
+    // Existing PGs are represented by a pair of variables add_<pg>/del_<pg>, new ones by <pg>
+    let lp = '';
+    lp += 'max: '+all_pg_names.map(pg_name => (
+        prev_weights[pg_name] ? `${4-move_weights[pg_name]}*add_${pg_name} - 4*del_${pg_name}` : `${4-move_weights[pg_name]}*${pg_name}`
+    )).join(' + ')+';\n';
+    for (const osd in pg_per_osd)
+    {
+        if (osd !== NO_OSD)
+        {
+            const osd_sum = (pg_per_osd[osd]||[]).map(pg_name => prev_weights[pg_name] ? `add_${pg_name} - del_${pg_name}` : pg_name).join(' + ');
+            // Previous PGs of this OSD that are still among the candidates
+            const rm_osd_pg_count = (prev_pg_per_osd[osd]||[]).filter(old_pg_name => move_weights[old_pg_name]).length;
+            let osd_pg_count = all_weights[osd]*3/total_weight*pg_count - rm_osd_pg_count;
+            lp += osd_sum + ' <= ' + osd_pg_count + ';\n';
+        }
+    }
+    let pg_vars = [];
+    for (const pg_name of all_pg_names)
+    {
+        if (prev_weights[pg_name])
+        {
+            pg_vars.push(`add_${pg_name}`, `del_${pg_name}`);
+            // Can't add or remove less than zero
+            lp += `add_${pg_name} >= 0;\n`;
+            lp += `del_${pg_name} >= 0;\n`;
+            // Can't remove more than the PG already has
+            lp += `add_${pg_name} - del_${pg_name} >= -${prev_weights[pg_name]};\n`;
+        }
+        else
+        {
+            pg_vars.push(pg_name);
+            lp += `${pg_name} >= 0;\n`;
+        }
+    }
+    lp += 'sec '+pg_vars.join(', ')+';\n';
+    // Solve it
+    const lp_result = await lp_solve(lp);
+    if (!lp_result)
+    {
+        console.log(lp);
+        throw new Error('Problem is infeasible or unbounded - is it a bug?');
+    }
+    // Generate the new distribution
+    const weights = { ...prev_weights };
+    for (const k in prev_weights)
+    {
+        // Drop previous PGs that are not among the current candidates
+        if (!move_weights[k])
+        {
+            delete weights[k];
+        }
+    }
+    for (const k in lp_result.vars)
+    {
+        if (k.substr(0, 4) === 'add_')
+        {
+            weights[k.substr(4)] = (weights[k.substr(4)] || 0) + Number(lp_result.vars[k]);
+        }
+        else if (k.substr(0, 4) === 'del_')
+        {
+            weights[k.substr(4)] = (weights[k.substr(4)] || 0) - Number(lp_result.vars[k]);
+        }
+        else
+        {
+            weights[k] = Number(lp_result.vars[k]);
+        }
+    }
+    for (const k in weights)
+    {
+        // Remove zero (and NaN) weights
+        if (!weights[k])
+        {
+            delete weights[k];
+        }
+    }
+    const int_pgs = make_int_pgs(weights, pg_count);
+    // Align them with most similar previous PGs
+    const new_pgs = align_pgs(prev_int_pgs, int_pgs);
+    // differs = changed PGs, osd_differs = changed individual PG positions
+    let differs = 0, osd_differs = 0;
+    for (let i = 0; i < pg_count; i++)
+    {
+        if (new_pgs[i].join('_') != prev_int_pgs[i].join('_'))
+        {
+            differs++;
+        }
+        for (let j = 0; j < 3; j++)
+        {
+            if (new_pgs[i][j] != prev_int_pgs[i][j])
+            {
+                osd_differs++;
+            }
+        }
+    }
+    return {
+        prev_pgs: prev_int_pgs,
+        score: lp_result.score,
+        weights,
+        int_pgs: new_pgs,
+        differs,
+        osd_differs,
+        space: pg_size * pg_list_space_efficiency(new_pgs, all_weights),
+        total_space: total_weight,
+    };
+}
+
+function print_change_stats(retval, detailed)
+{
+    const new_pgs = retval.int_pgs;
+    const prev_int_pgs = retval.prev_pgs;
+    if (prev_int_pgs)
+    {
+        if (detailed)
+        {
+            for (let i = 0; i < new_pgs.length; i++)
+            {
+                if (new_pgs[i].join('_') != prev_int_pgs[i].join('_'))
+                {
+                    console.log("pg "+i+": "+prev_int_pgs[i].join(' ')+" -> "+new_pgs[i].join(' '));
+                }
+            }
+        }
+        console.log(
+            "Data movement: "+retval.differs+" pgs, "+
+            retval.osd_differs+" pg*osds = "+Math.round(retval.osd_differs / prev_int_pgs.length / 3 * 10000)/100+" %"
+        );
+    }
+    console.log(
+        "Total space (raw): "+Math.round(retval.space*100)/100+" TB, space efficiency: "+
+        Math.round(retval.space/(retval.total_space||1)*10000)/100+" %"
+    );
+}
+
+function align_pgs(prev_int_pgs, int_pgs)
+{
+    const aligned_pgs = [];
+    put_aligned_pgs(aligned_pgs, int_pgs, prev_int_pgs, (pg) => [ pg.join(':') ]);
+    put_aligned_pgs(aligned_pgs, int_pgs, prev_int_pgs, (pg) => [ pg[0]+'::'+pg[2], ':'+pg[1]+':'+pg[2], pg[0]+':'+pg[1]+':' ]);
+    put_aligned_pgs(aligned_pgs, int_pgs, prev_int_pgs, (pg) => [ pg[0]+'::', ':'+pg[1]+':', '::'+pg[2] ]);
+    const free_slots = prev_int_pgs.map((pg, i) => !aligned_pgs[i] ? i : null).filter(i => i != null);
+    for (const pg of int_pgs)
+    {
+        if (!free_slots.length)
+        {
+            throw new Error("Can't place unaligned PG");
+        }
+        aligned_pgs[free_slots.shift()] = pg;
+    }
+    return aligned_pgs;
+}
+
+// One alignment pass: move every PG of <int_pgs> that shares a key (as produced
+// by <keygen>) with a previous PG into that previous PG's slot of <aligned_pgs>,
+// if the slot is still empty. Placed PGs are removed from <int_pgs>.
+function put_aligned_pgs(aligned_pgs, int_pgs, prev_int_pgs, keygen)
+{
+    // key => indexes of the previous PGs that produce it
+    let prev_indexes = {};
+    for (let i = 0; i < prev_int_pgs.length; i++)
+    {
+        for (let k of keygen(prev_int_pgs[i]))
+        {
+            prev_indexes[k] = prev_indexes[k] || [];
+            prev_indexes[k].push(i);
+        }
+    }
+    // Iterate backwards so that splice() doesn't shift the not yet visited items
+    PG: for (let i = int_pgs.length-1; i >= 0; i--)
+    {
+        let pg = int_pgs[i];
+        let keys = keygen(int_pgs[i]);
+        for (let k of keys)
+        {
+            // Candidate slots are consumed even if they turn out to be occupied
+            while (prev_indexes[k] && prev_indexes[k].length)
+            {
+                let idx = prev_indexes[k].shift();
+                if (!aligned_pgs[idx])
+                {
+                    aligned_pgs[idx] = pg;
+                    int_pgs.splice(i, 1);
+                    continue PG;
+                }
+            }
+        }
+    }
+}
+
+// Convert multi-level osd_tree = { level: number|string, id?: string, size?: number, children?: osd_tree }[]
+// levels = { string: number }
+// to a two-level osd_tree suitable for all_combinations()
+function flatten_tree(osd_tree, levels, failure_domain_level, osd_level, domains = {}, i = { i: 1 })
+{
+    osd_level = levels[osd_level] || osd_level;
+    failure_domain_level = levels[failure_domain_level] || failure_domain_level;
+    for (const node of osd_tree)
+    {
+        if ((levels[node.level] || node.level) < failure_domain_level)
+        {
+            flatten_tree(node.children||[], levels, failure_domain_level, osd_level, domains, i);
+        }
+        else
+        {
+            domains['dom'+(i.i++)] = extract_osds([ node ], levels, osd_level);
+        }
+    }
+    return domains;
+}
+
+function extract_osds(osd_tree, levels, osd_level, osds = {})
+{
+    for (const node of osd_tree)
+    {
+        if ((levels[node.level] || node.level) >= osd_level)
+        {
+            osds[node.id] = node.size;
+        }
+        else
+        {
+            extract_osds(node.children||[], levels, osd_level, osds);
+        }
+    }
+    return osds;
+}
+
+// FIXME: support different pg_sizes, not just 3
+// osd_tree = { failure_domain1: { osd1: size1, ... }, ... }
+function all_combinations(osd_tree, count, ordered)
+{
+    const hosts = Object.keys(osd_tree).sort();
+    const osds = Object.keys(osd_tree).reduce((a, c) => { a[c] = Object.keys(osd_tree[c]).sort(); return a; }, {});
+    while (hosts.length < 3)
+    {
+        osds[NO_OSD] = [ NO_OSD ];
+        hosts.push(NO_OSD);
+    }
+    let host_idx = [ 0, 1, 2 ];
+    let osd_idx = [ 0, 0, 0 ];
+    const r = [];
+    while (!count || count < 0 || r.length < count)
+    {
+        let inc;
+        if (host_idx[2] != host_idx[1] && host_idx[2] != host_idx[0] && host_idx[1] != host_idx[0])
+        {
+            r.push(host_idx.map((hi, i) => osds[hosts[hi]][osd_idx[i]]));
+            inc = 2;
+            while (inc >= 0)
+            {
+                osd_idx[inc]++;
+                if (osd_idx[inc] >= osds[hosts[host_idx[inc]]].length)
+                {
+                    osd_idx[inc] = 0;
+                    inc--;
+                }
+                else
+                {
+                    break;
+                }
+            }
+        }
+        else
+        {
+            inc = -1;
+        }
+        if (inc < 0)
+        {
+            // no osds left in current host combination, select the next one
+            osd_idx = [ 0, 0, 0 ];
+            host_idx[2]++;
+            if (host_idx[2] >= hosts.length)
+            {
+                host_idx[1]++;
+                host_idx[2] = ordered ? host_idx[1]+1 : 0;
+                if ((ordered ? host_idx[2] : host_idx[1]) >= hosts.length)
+                {
+                    host_idx[0]++;
+                    host_idx[1] = ordered ? host_idx[0]+1 : 0;
+                    host_idx[2] = ordered ? host_idx[1]+1 : 0;
+                    if ((ordered ? host_idx[2] : host_idx[0]) >= hosts.length)
+                    {
+                        break;
+                    }
+                }
+            }
+        }
+    }
+    return r;
+}
+
+function pg_weights_space_efficiency(weights, pg_count, osd_sizes)
+{
+    const per_osd = {};
+    for (const pg_name in weights)
+    {
+        for (const osd of pg_name.substr(3).split(/_/))
+        {
+            per_osd[osd] = (per_osd[osd]||0) + weights[pg_name];
+        }
+    }
+    return pg_per_osd_space_efficiency(per_osd, pg_count, osd_sizes);
+}
+
+function pg_list_space_efficiency(pgs, osd_sizes)
+{
+    const per_osd = {};
+    for (const pg of pgs)
+    {
+        for (const osd of pg)
+        {
+            per_osd[osd] = (per_osd[osd]||0) + 1;
+        }
+    }
+    return pg_per_osd_space_efficiency(per_osd, pgs.length, osd_sizes);
+}
+
+function pg_per_osd_space_efficiency(per_osd, pg_count, osd_sizes)
+{
+    // each PG gets randomly selected in 1/N cases
+    // & there are x PGs per OSD
+    // => an OSD is selected in x/N cases
+    // => total space * x/N <= OSD size
+    // => total space <= OSD size * N/x
+    let space;
+    for (let osd in per_osd)
+    {
+        if (osd in osd_sizes)
+        {
+            const space_estimate = osd_sizes[osd] * pg_count / per_osd[osd];
+            if (space == null || space > space_estimate)
+            {
+                space = space_estimate;
+            }
+        }
+    }
+    return space == null ? 0 : space;
+}
+
+module.exports = {
+    NO_OSD,
+
+    optimize_initial,
+    optimize_change,
+    print_change_stats,
+    pg_weights_space_efficiency,
+    pg_list_space_efficiency,
+    pg_per_osd_space_efficiency,
+    flatten_tree,
+
+    lp_solve,
+    make_int_pgs,
+    align_pgs,
+    all_combinations,
+};
--- a/lp/mon-main.js
+++ b/lp/mon-main.js
@@ -0,0 +1,22 @@
+#!/usr/bin/node
+
+const Mon = require('./mon.js');
+
+const options = {};
+
+for (let i = 2; i < process.argv.length; i++)
+{
+    if (process.argv[i].substr(0, 2) == '--')
+    {
+        options[process.argv[i].substr(2)] = process.argv[i+1];
+        i++;
+    }
+}
+
+if (!options.etcd_url)
+{
+    console.error('USAGE: '+process.argv[0]+' '+process.argv[1]+' --etcd_url "http://127.0.0.1:2379,..." --etcd_prefix "/rage" --etcd_start_timeout 5');
+    process.exit();
+}
+
+new Mon(options).start();
--- a/lp/mon.js
+++ b/lp/mon.js
@@ -0,0 +1,858 @@
+const http = require('http');
+const os = require('os');
+const WebSocket = require('ws');
+const LPOptimizer = require('./lp-optimizer.js');
+const stableStringify = require('./stable-stringify.js');
+
+class Mon
+{
+    static etcd_tree = { // template of the key hierarchy under etcd_prefix; deep-copied into this.state and walked by parse_kv()
+        config: {
+            global: null, // parse_kv() exposes the value of this key as this.config
+            /* placement_tree = {
+                levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... },
+                nodes: { host1: { level: 'host', parent: 'rack1' }, ... },
+                failure_domain: 'host',
+            } */
+            placement_tree: null,
+            osd: {}, // per-OSD settings; get_osd_tree() reads .reweight
+            pgs: {}, // { hash, items: { pg_num: { osd_set, primary, pause? } } } as written by save_new_pgs() / stop_all_pgs()
+        },
+        osd: {
+            state: {}, // keyed by OSD number
+            stats: {}, // keyed by OSD number; get_osd_tree() reads .size, .time and .host
+        },
+        mon: {
+            master: null, // written by become_master() under the monitor's lease
+        },
+        pg: {
+            change_stamp: null,
+            state: {}, // keyed by PG number; stop_all_pgs() reads .state (array of strings)
+            stats: {}, // keyed by PG number; sum_stats() reads the *_count fields
+            history: {}, // keyed by PG number; entries hold osd_sets and all_peers
+        },
+    }
+
+    constructor(config)
+    {
+        // FIXME: Maybe prefer local etcd
+        this.etcd_urls = [];
+        for (let url of config.etcd_url.split(/,/))
+        {
+            let scheme = 'http';
+            url = url.trim().replace(/^(https?):\/\//, (m, m1) => { scheme = m1; return ''; });
+            if (!/\/[^\/]/.exec(url))
+                url += '/v3';
+            this.etcd_urls.push(scheme+'://'+url);
+        }
+        this.etcd_prefix = config.etcd_prefix || '/rage';
+        this.etcd_prefix = this.etcd_prefix.replace(/\/\/+/g, '/').replace(/^\/?(.*[^\/])\/?$/, '/$1');
+        this.etcd_start_timeout = (config.etcd_start_timeout || 5) * 1000;
+        this.state = JSON.parse(JSON.stringify(Mon.etcd_tree));
+    }
+
+    async start()
+    {
+        await this.load_config();
+        await this.get_lease();
+        await this.become_master();
+        await this.load_cluster_state();
+        await this.start_watcher();
+        await this.recheck_pgs();
+    }
+
+    async load_config()
+    {
+        const res = await this.etcd_call('/txn', { success: [
+            { requestRange: { key: b64(this.etcd_prefix+'/config/global') } }
+        ] }, this.etcd_start_timeout, -1);
+        this.parse_kv(res.responses[0].response_range.kvs[0]);
+        this.check_config();
+    }
+
+    check_config()
+    {
+        this.config.etcd_mon_timeout = Number(this.config.etcd_mon_timeout) || 0;
+        if (this.config.etcd_mon_timeout <= 0)
+        {
+            this.config.etcd_mon_timeout = 1000;
+        }
+        this.config.etcd_mon_retries = Number(this.config.etcd_mon_retries) || 5;
+        if (this.config.etcd_mon_retries < 0)
+        {
+            this.config.etcd_mon_retries = 0;
+        }
+        this.config.mon_change_timeout = Number(this.config.mon_change_timeout) || 1000;
+        if (this.config.mon_change_timeout < 100)
+        {
+            this.config.mon_change_timeout = 100;
+        }
+        this.config.mon_stats_timeout = Number(this.config.mon_stats_timeout) || 1000;
+        if (this.config.mon_stats_timeout < 100)
+        {
+            this.config.mon_stats_timeout = 100;
+        }
+        // After this number of seconds, a dead OSD will be removed from PG distribution
+        this.config.osd_out_time = Number(this.config.osd_out_time) || 0;
+        if (!this.config.osd_out_time)
+        {
+            this.config.osd_out_time = 30*60; // 30 minutes by default
+        }
+        this.config.max_osd_combinations = Number(this.config.max_osd_combinations) || 10000;
+        if (this.config.max_osd_combinations < 100)
+        {
+            this.config.max_osd_combinations = 100;
+        }
+    }
+
+    async start_watcher(retries)
+    {
+        let retry = 0;
+        if (retries >= 0 && retries < 1)
+        {
+            retries = 1;
+        }
+        while (retries < 0 || retry < retries)
+        {
+            const base = 'ws'+this.etcd_urls[Math.floor(Math.random()*this.etcd_urls.length)].substr(4);
+            const ok = await new Promise((ok, no) =>
+            {
+                const timer_id = setTimeout(() =>
+                {
+                    this.ws.close();
+                    ok(false);
+                }, timeout);
+                this.ws = new WebSocket(base+'/watch');
+                this.ws.on('open', () =>
+                {
+                    if (timer_id)
+                        clearTimeout(timer_id);
+                    ok(true);
+                });
+            });
+            if (!ok)
+            {
+                this.ws = null;
+            }
+            retry++;
+        }
+        if (!this.ws)
+        {
+            this.die('Failed to open etcd watch websocket');
+        }
+        this.ws.send(JSON.stringify({
+            create_request: {
+                key: b64(this.etcd_prefix+'/'),
+                range_end: b64(this.etcd_prefix+'0'),
+                start_revision: ''+this.etcd_watch_revision,
+                watch_id: 1,
+            },
+        }));
+        this.ws.on('message', (msg) =>
+        {
+            let data;
+            try
+            {
+                data = JSON.parse(msg);
+            }
+            catch (e)
+            {
+            }
+            if (!data || !data.result || !data.result.events)
+            {
+                console.error('Garbage received from watch websocket: '+msg);
+            }
+            else
+            {
+                let stats_changed = false, changed = false;
+                console.log('Revision '+data.result.header.revision+' events: ');
+                for (const e of data.result.events)
+                {
+                    this.parse_kv(e.kv);
+                    const key = e.kv.key.substr(this.etcd_prefix.length);
+                    if (key.substr(0, 11) == '/osd/stats/' || key.substr(0, 10) == '/pg/stats/')
+                    {
+                        stats_changed = true;
+                    }
+                    else if (key != '/stats')
+                    {
+                        changed = true;
+                    }
+                    console.log(e);
+                }
+                if (stats_changed)
+                {
+                    this.schedule_update_stats();
+                }
+                if (changed)
+                {
+                    this.schedule_recheck();
+                }
+            }
+        });
+    }
+
+    async get_lease()
+    {
+        const max_ttl = this.config.etcd_mon_ttl + this.config.etcd_mon_timeout/1000*this.config.etcd_mon_retries;
+        const res = await this.etcd_call('/lease/grant', { TTL: max_ttl }, this.config.etcd_mon_timeout, this.config.etcd_mon_retries);
+        this.etcd_lease_id = res.ID;
+        setInterval(async () =>
+        {
+            const res = await this.etcd_call('/lease/keepalive', { ID: this.etcd_lease_id }, this.config.etcd_mon_timeout, this.config.etcd_mon_retries);
+            if (!res.result.TTL)
+            {
+                this.die('Lease expired');
+            }
+        }, config.etcd_mon_timeout);
+    }
+
+    async become_master()
+    {
+        const state = { ip: this.local_ips() };
+        while (1)
+        {
+            const res = await this.etcd_call('/txn', {
+                compare: [ { target: 'CREATE', create_revision: 0, key: b64(this.etcd_prefix+'/mon/master') } ],
+                success: [ { key: b64(this.etcd_prefix+'/mon/master'), value: b64(JSON.stringify(state)), lease: ''+this.etcd_lease_id } ],
+            }, this.etcd_start_timeout, 0);
+            if (!res.succeeded)
+            {
+                await new Promise(ok => setTimeout(ok, this.etcd_start_timeout));
+            }
+        }
+    }
+
+    async load_cluster_state()
+    {
+        const res = await this.etcd_call('/txn', { success: [
+            { requestRange: { key: b64(this.etcd_prefix+'/'), range_end: b64(this.etcd_prefix+'0') } },
+        ] }, this.etcd_start_timeout, -1);
+        this.etcd_watch_revision = BigInt(res.header.revision)+BigInt(1);
+        const data = JSON.parse(JSON.stringify(Mon.etcd_tree));
+        for (const response of res.responses)
+        {
+            for (const kv of response.response_range.kvs)
+            {
+                this.parse_kv(kv);
+            }
+        }
+        this.state = data;
+    }
+
+    all_osds()
+    {
+        return Object.keys(this.state.osd.stats);
+    }
+
+    get_osd_tree()
+    {
+        this.state.config.placement_tree = this.state.config.placement_tree||{};
+        const levels = this.state.config.placement_tree.levels||{};
+        levels.host = levels.host || 100;
+        levels.osd = levels.osd || 101;
+        const tree = { '': { children: [] } };
+        for (const node_id in this.state.config.placement_tree.nodes||{})
+        {
+            const node_cfg = this.state.config.placement_tree.nodes[node_id];
+            if (!node_id || /^\d/.exec(node_id) ||
+                !node_cfg.level || !levels[node_cfg.level])
+            {
+                // All nodes must have non-empty non-numeric IDs and valid levels
+                continue;
+            }
+            tree[node_id] = { id: node_id, level: node_cfg.level, parent: node_cfg.parent, children: [] };
+        }
+        // This requires monitor system time to be in sync with OSD system times (at least to some extent)
+        const down_time = Date.now()/1000 - this.config.osd_out_time;
+        for (const osd_num of this.all_osds().sort((a, b) => a - b))
+        {
+            const stat = this.state.osd.stats[osd_num];
+            if (stat.size && (this.state.osd.state[osd_num] || Number(stat.time) >= down_time))
+            {
+                // Numeric IDs are reserved for OSDs
+                const reweight = this.state.config.osd[osd_num] && Number(this.state.config.osd[osd_num].reweight) || 1;
+                tree[osd_num] = tree[osd_num] || { id: osd_num, parent: stat.host };
+                tree[osd_num].level = 'osd';
+                tree[osd_num].size = reweight * stat.size / 1024 / 1024 / 1024 / 1024; // terabytes
+                delete tree[osd_num].children;
+            }
+        }
+        for (const node_id in tree)
+        {
+            if (node_id === '')
+            {
+                continue;
+            }
+            const node_cfg = tree[node_id];
+            const node_level = levels[node_cfg.level] || node_cfg.level;
+            let parent_level = node_cfg.parent && tree[node_cfg.parent] && tree[node_cfg.parent].children
+                && tree[node_cfg.parent].level;
+            parent_level = parent_level ? (levels[parent_level] || parent_level) : null;
+            // Parent's level must be less than child's; OSDs must be leaves
+            const parent = parent_level && parent_level < node_level ? tree[node_cfg.parent] : '';
+            tree[parent].children.push(tree[node_id]);
+            delete node_cfg.parent;
+        }
+        return LPOptimizer.flatten_tree(tree[''].children, levels, this.state.config.failure_domain, 'osd');
+    }
+
+    async stop_all_pgs()
+    {
+        let has_online = false, paused = true;
+        for (const pg in this.state.config.pgs.items||{})
+        {
+            const cur_state = ((this.state.pg.state[pg]||{}).state||[]).join(',');
+            if (cur_state != '' && cur_state != 'offline')
+            {
+                has_online = true;
+            }
+            if (!this.state.config.pgs.items[pg].pause)
+            {
+                paused = false;
+            }
+        }
+        if (!paused)
+        {
+            console.log('Stopping all PGs before changing PG count');
+            const new_cfg = JSON.parse(JSON.stringify(this.state.config.pgs));
+            for (const pg in new_cfg.items)
+            {
+                new_cfg.items[pg].pause = true;
+            }
+            // Check that no OSDs change their state before we pause PGs
+            // Doing this we make sure that OSDs don't wake up in the middle of our "transaction"
+            // and can't see the old PG configuration
+            const checks = [];
+            for (const osd_num of this.all_osds())
+            {
+                const key = b64(this.etcd_prefix+'/osd/state/'+osd_num);
+                checks.push({ key, target: 'MOD', result: 'LESS', mod_revision: ''+this.etcd_watch_revision });
+            }
+            const res = await this.etcd_call('/txn', {
+                compare: [
+                    { key: b64(this.etcd_prefix+'/mon/master'), target: 'LEASE', lease: ''+this.etcd_lease_id },
+                    { key: b64(this.etcd_prefix+'/config/pgs'), target: 'MOD', mod_revision: ''+this.etcd_watch_revision, result: 'LESS' },
+                    ...checks,
+                ],
+                success: [
+                    { requestPut: { key: b64(this.etcd_prefix+'/config/pgs'), value: b64(JSON.stringify(new_cfg)) } },
+                ],
+            }, this.config.etcd_mon_timeout, 0);
+            if (!res.succeeded)
+            {
+                return false;
+            }
+            this.state.config.pgs = new_cfg;
+        }
+        return !has_online;
+    }
+
+    scale_pg_count(prev_pgs, pg_history, new_pg_count)
+    {
+        const old_pg_count = prev_pgs.length;
+        // Add all possibly intersecting PGs into the history of new PGs
+        if (!(new_pg_count % old_pg_count))
+        {
+            // New PG count is a multiple of the old PG count
+            const mul = (new_pg_count / old_pg_count);
+            for (let i = 0; i < new_pg_count; i++)
+            {
+                const old_i = Math.floor(new_pg_count / mul);
+                pg_history[i] = JSON.parse(JSON.stringify(this.state.pg.history[1+old_i]));
+            }
+        }
+        else if (!(old_pg_count % new_pg_count))
+        {
+            // Old PG count is a multiple of the new PG count
+            const mul = (old_pg_count / new_pg_count);
+            for (let i = 0; i < new_pg_count; i++)
+            {
+                pg_history[i] = {
+                    osd_sets: [],
+                    all_peers: [],
+                };
+                for (let j = 0; j < mul; j++)
+                {
+                    pg_history[i].osd_sets.push(prev_pgs[i*mul]);
+                    const hist = this.state.pg.history[1+i*mul+j];
+                    if (hist && hist.osd_sets && hist.osd_sets.length)
+                    {
+                        Array.prototype.push.apply(pg_history[i].osd_sets, hist.osd_sets);
+                    }
+                    if (hist && hist.all_peers && hist.all_peers.length)
+                    {
+                        Array.prototype.push.apply(pg_history[i].all_peers, hist.all_peers);
+                    }
+                }
+            }
+        }
+        else
+        {
+            // Any PG may intersect with any PG after non-multiple PG count change
+            // So, merge ALL PGs history
+            let all_sets = {};
+            let all_peers = {};
+            for (const pg of prev_pgs)
+            {
+                all_sets[pg.join(' ')] = pg;
+            }
+            for (const pg in this.state.pg.history)
+            {
+                const hist = this.state.pg.history[pg];
+                if (hist && hist.osd_sets)
+                {
+                    for (const pg of hist.osd_sets)
+                    {
+                        all_sets[pg.join(' ')] = pg;
+                    }
+                }
+                if (hist && hist.all_peers)
+                {
+                    for (const osd_num of hist.all_peers)
+                    {
+                        all_peers[osd_num] = Number(osd_num);
+                    }
+                }
+            }
+            all_sets = Object.values(all_sets);
+            all_peers = Object.values(all_peers);
+            for (let i = 0; i < new_pg_count; i++)
+            {
+                pg_history[i] = { osd_sets: all_sets, all_peers };
+            }
+        }
+        // Mark history keys for removed PGs as removed
+        for (let i = new_pg_count; i < old_pg_count; i++)
+        {
+            pg_history[i] = null;
+        }
+        if (old_pg_count < new_pg_count)
+        {
+            for (let i = new_pg_count-1; i >= 0; i--)
+            {
+                prev_pgs[i] = prev_pgs[Math.floor(i/new_pg_count*old_pg_count)];
+            }
+        }
+        else if (old_pg_count > new_pg_count)
+        {
+            for (let i = 0; i < new_pg_count; i++)
+            {
+                prev_pgs[i] = prev_pgs[Math.round(i/new_pg_count*old_pg_count)];
+            }
+            prev_pgs.splice(new_pg_count, old_pg_count-new_pg_count);
+        }
+    }
+
+    async save_new_pgs(prev_pgs, new_pgs, pg_history, tree_hash)
+    {
+        const txn = [], checks = [];
+        const pg_items = {};
+        new_pgs.map((osd_set, i) =>
+        {
+            osd_set = osd_set.map(osd_num => osd_num === LPOptimizer.NO_OSD ? 0 : osd_num);
+            const alive_set = osd_set.filter(osd_num => osd_num);
+            pg_items[i+1] = {
+                osd_set,
+                primary: alive_set.length ? alive_set[Math.floor(Math.random()*alive_set.length)] : 0,
+            };
+            if (prev_pgs[i] && prev_pgs[i].join(' ') != osd_set.join(' '))
+            {
+                pg_history[i] = pg_history[i] || {};
+                pg_history[i].osd_sets = pg_history[i].osd_sets || [];
+                pg_history[i].osd_sets.push(prev_pgs[i]);
+            }
+        });
+        for (let i = 0; i < new_pgs.length || i < prev_pgs.length; i++)
+        {
+            checks.push({
+                key: b64(this.etcd_prefix+'/pg/history/'+(i+1)),
+                target: 'MOD',
+                mod_revision: ''+this.etcd_watch_revision,
+                result: 'LESS',
+            });
+            if (pg_history[i])
+            {
+                txn.push({
+                    requestPut: {
+                        key: b64(this.etcd_prefix+'/pg/history/'+(i+1)),
+                        value: b64(JSON.stringify(pg_history[i])),
+                    },
+                });
+            }
+            else
+            {
+                txn.push({
+                    requestDeleteRange: {
+                        key: b64(this.etcd_prefix+'/pg/history/'+(i+1)),
+                    },
+                });
+            }
+        }
+        this.state.config.pgs = {
+            hash: tree_hash,
+            items: pg_items,
+        };
+        const res = await this.etcd_call('/txn', {
+            compare: [
+                { key: b64(this.etcd_prefix+'/mon/master'), target: 'LEASE', lease: ''+this.etcd_lease_id },
+                { key: b64(this.etcd_prefix+'/config/pgs'), target: 'MOD', mod_revision: ''+this.etcd_watch_revision, result: 'LESS' },
+                ...checks,
+            ],
+            success: [
+                { requestPut: { key: b64(this.etcd_prefix+'/config/pgs'), value: b64(JSON.stringify(this.state.config.pgs)) } },
+                ...txn,
+            ],
+        }, this.config.etcd_mon_timeout, 0);
+        return res.succeeded;
+    }
+
+    async recheck_pgs()
+    {
+        // Take configuration and state, check it against the stored configuration hash
+        // Recalculate PGs and save them to etcd if the configuration is changed
+        const tree_cfg = {
+            osd_tree: this.get_osd_tree(),
+            pg_count: this.config.pg_count || Object.keys(this.state.config.pgs.items||{}).length || 128,
+            max_osd_combinations: this.config.max_osd_combinations,
+        };
+        const tree_hash = sha1hex(stableStringify(tree_cfg));
+        if (this.state.config.pgs.hash != tree_hash)
+        {
+            // Something has changed
+            const prev_pgs = [];
+            for (const pg in this.state.config.pgs.items||{})
+            {
+                prev_pgs[pg-1] = this.state.config.pgs.items[pg].osd_set;
+            }
+            const pg_history = [];
+            const old_pg_count = prev_pgs.length;
+            let optimize_result;
+            if (old_pg_count > 0)
+            {
+                if (old_pg_count != tree_cfg.pg_count)
+                {
+                    // PG count changed. Need to bring all PGs down.
+                    if (!await this.stop_all_pgs())
+                    {
+                        this.schedule_recheck();
+                        return;
+                    }
+                    this.scale_pg_count(prev_pgs, pg_history, new_pg_count);
+                }
+                optimize_result = await LPOptimizer.optimize_change(prev_pgs, tree_cfg.osd_tree, tree_cfg.max_osd_combinations);
+            }
+            else
+            {
+                optimize_result = await LPOptimizer.optimize_initial(tree_cfg.osd_tree, tree_cfg.pg_count, tree_cfg.max_osd_combinations);
+            }
+            if (!await this.save_new_pgs(prev_pgs, optimize_result.int_pgs, pg_history, tree_hash))
+            {
+                console.log('Someone changed PG configuration while we also tried to change it. Retrying in '+this.config.mon_change_timeout+' ms');
+                this.schedule_recheck();
+                return;
+            }
+            console.log('PG configuration successfully changed');
+            if (old_pg_count != optimize_result.int_pgs.length)
+            {
+                console.log(`PG count changed from: ${old_pg_count} to ${optimize_result.int_pgs.length}`);
+            }
+            LPOptimizer.print_change_stats(optimize_result);
+        }
+    }
+
+    schedule_recheck()
+    {
+        if (this.recheck_timer)
+        {
+            clearTimeout(this.recheck_timer);
+            this.recheck_timer = null;
+        }
+        this.recheck_timer = setTimeout(() =>
+        {
+            this.recheck_timer = null;
+            this.recheck_pgs().catch(console.error);
+        }, this.config.mon_change_timeout || 1000);
+    }
+
+    sum_stats()
+    {
+        let overflow = false;
+        this.prev_stats = this.prev_stats || { op_stats: {}, subop_stats: {}, recovery_stats: {} };
+        const op_stats = {}, subop_stats = {}, recovery_stats = {};
+        for (const osd in this.state.osd.stats)
+        {
+            const st = this.state.osd.stats[osd];
+            for (const op in st.op_stats||{})
+            {
+                op_stats[op] = op_stats[op] || { count: 0n, usec: 0n, bytes: 0n };
+                op_stats[op].count += BigInt(st.op_stats.count||0);
+                op_stats[op].usec += BigInt(st.op_stats.usec||0);
+                op_stats[op].bytes += BigInt(st.op_stats.bytes||0);
+            }
+            for (const op in st.subop_stats||{})
+            {
+                subop_stats[op] = subop_stats[op] || { count: 0n, usec: 0n };
+                subop_stats[op].count += BigInt(st.subop_stats.count||0);
+                subop_stats[op].usec += BigInt(st.subop_stats.usec||0);
+            }
+            for (const op in st.recovery_stats||{})
+            {
+                recovery_stats[op] = recovery_stats[op] || { count: 0n, bytes: 0n };
+                recovery_stats[op].count += BigInt(st.recovery_stats.count||0);
+                recovery_stats[op].bytes += BigInt(st.recovery_stats.bytes||0);
+            }
+        }
+        for (const op in op_stats)
+        {
+            if (op_stats[op].count >= 0x10000000000000000n)
+            {
+                if (!this.prev_stats.op_stats[op])
+                {
+                    overflow = true;
+                }
+                else
+                {
+                    op_stats[op].count -= this.prev_stats.op_stats[op].count;
+                    op_stats[op].usec -= this.prev_stats.op_stats[op].usec;
+                    op_stats[op].bytes -= this.prev_stats.op_stats[op].bytes;
+                }
+            }
+        }
+        for (const op in subop_stats)
+        {
+            if (subop_stats[op].count >= 0x10000000000000000n)
+            {
+                if (!this.prev_stats.subop_stats[op])
+                {
+                    overflow = true;
+                }
+                else
+                {
+                    subop_stats[op].count -= this.prev_stats.subop_stats[op].count;
+                    subop_stats[op].usec -= this.prev_stats.subop_stats[op].usec;
+                }
+            }
+        }
+        for (const op in recovery_stats)
+        {
+            if (recovery_stats[op].count >= 0x10000000000000000n)
+            {
+                if (!this.prev_stats.recovery_stats[op])
+                {
+                    overflow = true;
+                }
+                else
+                {
+                    recovery_stats[op].count -= this.prev_stats.recovery_stats[op].count;
+                    recovery_stats[op].bytes -= this.prev_stats.recovery_stats[op].bytes;
+                }
+            }
+        }
+        const object_counts = { object: 0n, clean: 0n, misplaced: 0n, degraded: 0n, incomplete: 0n };
+        for (const pg_num in this.state.pg.stats)
+        {
+            const st = this.state.pg.stats[pg_num];
+            for (const k in object_counts)
+            {
+                if (st[k+'_count'])
+                {
+                    object_counts[k] += BigInt(st[k+'_count']);
+                }
+            }
+        }
+        return (this.prev_stats = { overflow, op_stats, subop_stats, recovery_stats, object_counts });
+    }
+
+    async update_total_stats()
+    {
+        const stats = this.sum_stats();
+        if (!stats.overflow)
+        {
+            // Convert to strings, serialize and save
+            const ser = {};
+            for (const st of [ 'op_stats', 'subop_stats', 'recovery_stats' ])
+            {
+                ser[st] = {};
+                for (const op in stats[st])
+                {
+                    ser[st][op] = {};
+                    for (const k in stats[st][op])
+                    {
+                        ser[st][op][k] = ''+stats[st][op][k];
+                    }
+                }
+            }
+            ser.object_counts = {};
+            for (const k in stats.object_counts)
+            {
+                ser.object_counts[k] = ''+stats.object_counts[k];
+            }
+            await this.etcd_call('/txn', {
+                success: [ { requestPut: { key: b64(this.etcd_prefix+'/stats'), value: b64(JSON.stringify(ser)) } } ],
+            }, this.config.etcd_mon_timeout, 0);
+        }
+    }
+
+    schedule_update_stats()
+    {
+        if (this.stats_timer)
+        {
+            clearTimeout(this.stats_timer);
+            this.stats_timer = null;
+        }
+        this.stats_timer = setTimeout(() =>
+        {
+            this.stats_timer = null;
+            this.update_total_stats().catch(console.error);
+        }, this.config.mon_stats_timeout || 1000);
+    }
+
+    parse_kv(kv)
+    {
+        if (!kv || !kv.key)
+        {
+            return;
+        }
+        kv.key = de64(kv.key);
+        kv.value = kv.value ? JSON.parse(de64(kv.value)) : null;
+        const key = kv.key.substr(this.etcd_prefix.length).replace(/^\/+/, '').split('/');
+        const cur = this.state, orig = Mon.etcd_tree;
+        for (let i = 0; i < key.length-1; i++)
+        {
+            if (!orig[key[i]])
+            {
+                console.log('Bad key in etcd: '+kv.key+' = '+kv.value);
+                return;
+            }
+            orig = orig[key[i]];
+            cur = (cur[key[i]] = cur[key[i]] || {});
+        }
+        if (orig[key.length-1])
+        {
+            console.log('Bad key in etcd: '+kv.key+' = '+kv.value);
+            return;
+        }
+        cur[key[key.length-1]] = kv.value;
+        if (key.join('/') === 'config/global')
+        {
+            this.state.config.global = this.state.config.global || {};
+            this.config = this.state.config.global;
+            this.check_config();
+        }
+    }
+
+    async etcd_call(path, body, timeout, retries)
+    {
+        let retry = 0;
+        if (retries >= 0 && retries < 1)
+        {
+            retries = 1;
+        }
+        while (retries < 0 || retry < retries)
+        {
+            const base = this.etcd_urls[Math.floor(Math.random()*this.etcd_urls.length)];
+            const res = await POST(base+path, body, timeout);
+            if (res.json)
+            {
+                if (res.json.error)
+                {
+                    console.log('etcd returned error: '+res.json.error);
+                    break;
+                }
+                return res.json;
+            }
+            retry++;
+        }
+        this.die();
+    }
+
+    die(err)
+    {
+        // In fact we can just try to rejoin
+        console.fatal(err || 'Cluster connection failed');
+        process.exit(1);
+    }
+
+    local_ips()
+    {
+        const ips = [];
+        const ifaces = os.networkInterfaces();
+        for (const ifname in ifaces)
+        {
+            for (const iface of ifaces[ifname])
+            {
+                if (iface.family == 'IPv4' && !iface.internal)
+                {
+                    ips.push(iface.address);
+                }
+            }
+        }
+        return ips;
+    }
+}
+
+function POST(url, body, timeout)
+{
+    return new Promise((ok, no) =>
+    {
+        const body_text = Buffer.from(JSON.stringify(body));
+        let timer_id = timeout > 0 ? setTimeout(() =>
+        {
+            if (req)
+                req.abort();
+            req = null;
+            ok({ error: 'timeout' });
+        }, timeout) : null;
+        let req = http.request(url, { method: 'POST', headers: {
+            'Content-Type': 'application/json',
+            'Content-Length': body_text,
+        } }, (res) =>
+        {
+            if (!req)
+            {
+                return;
+            }
+            clearTimeout(timer_id);
+            if (res.statusCode != 200)
+            {
+                ok({ error: res.statusCode, response: res });
+                return;
+            }
+            let res_body = '';
+            res.setEncoding('utf8');
+            res.on('data', chunk => { res_body += chunk });
+            res.on('end', () =>
+            {
+                try
+                {
+                    res_body = JSON.parse(res_body);
+                    ok({ response: res, json: res_body });
+                }
+                catch (e)
+                {
+                    ok({ error: e, response: res, body: res_body });
+                }
+            });
+        });
+        req.write(body_text);
+        req.end();
+    });
+}
+
+function b64(str)
+{
+    return Buffer.from(str).toString('base64');
+}
+
+function de64(str)
+{
+    return Buffer.from(str, 'base64').toString();
+}
+
+function sha1hex(str)
+{
+    const hash = crypto.createHash('sha1');
+    hash.update(str);
+    return hash.digest('hex');
+}
--- a/lp/package.json
+++ b/lp/package.json
@@ -0,0 +1,14 @@
+{
+  "name": "rage-mon",
+  "version": "1.0.0",
+  "description": "RAGE storage monitor service",
+  "main": "mon.js",
+  "scripts": {
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "author": "Vitaliy Filippov",
+  "license": "UNLICENSED",
+  "dependencies": {
+    "ws": "^7.2.5"
+  }
+}
--- a/lp/test-optimize-undersized.js
+++ b/lp/test-optimize-undersized.js
@@ -0,0 +1,71 @@
+const LPOptimizer = require('./lp-optimizer.js');
+
+// Test fixture: a symmetric 3-level tree. There are 3 level-1 nodes, each with
+// 2 level-2 nodes, each with 2 level-3 leaves (ids 1..12), every leaf has size 3.
+const crush_tree = [
+    { level: 1, children: [
+        { level: 2, children: [
+            { level: 3, id: 1, size: 3 },
+            { level: 3, id: 2, size: 3 },
+        ] },
+        { level: 2, children: [
+            { level: 3, id: 3, size: 3 },
+            { level: 3, id: 4, size: 3 },
+        ] },
+    ] },
+    { level: 1, children: [
+        { level: 2, children: [
+            { level: 3, id: 5, size: 3 },
+            { level: 3, id: 6, size: 3 },
+        ] },
+        { level: 2, children: [
+            { level: 3, id: 7, size: 3 },
+            { level: 3, id: 8, size: 3 },
+        ] },
+    ] },
+    { level: 1, children: [
+        { level: 2, children: [
+            { level: 3, id: 9, size: 3 },
+            { level: 3, id: 10, size: 3 },
+        ] },
+        { level: 2, children: [
+            { level: 3, id: 11, size: 3 },
+            { level: 3, id: 12, size: 3 },
+        ] },
+    ] },
+];
+
+// Flattened form of the tree, printed for reference. run() below indexes it with
+// the keys 'dom1'..'dom3' - presumably flatten_tree(tree, {}, 1, 3) groups level-3
+// leaves by their level-1 ancestor and names the groups that way; verify in lp-optimizer.js
+const osd_tree = LPOptimizer.flatten_tree(crush_tree, {}, 1, 3);
+console.log(osd_tree);
+
+async function run()
+{
+    const cur_tree = {};
+    console.log('Empty tree:');
+    let res = await LPOptimizer.optimize_initial(cur_tree, 256);
+    LPOptimizer.print_change_stats(res, false);
+    console.log('\nAdding 1st failure domain:');
+    cur_tree['dom1'] = osd_tree['dom1'];
+    res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree);
+    LPOptimizer.print_change_stats(res, false);
+    console.log('\nAdding 2nd failure domain:');
+    cur_tree['dom2'] = osd_tree['dom2'];
+    res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree);
+    LPOptimizer.print_change_stats(res, false);
+    console.log('\nAdding 3rd failure domain:');
+    cur_tree['dom3'] = osd_tree['dom3'];
+    res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree);
+    LPOptimizer.print_change_stats(res, false);
+    console.log('\nRemoving 3rd failure domain:');
+    delete cur_tree['dom3'];
+    res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree);
+    LPOptimizer.print_change_stats(res, false);
+    console.log('\nRemoving 2nd failure domain:');
+    delete cur_tree['dom2'];
+    res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree);
+    LPOptimizer.print_change_stats(res, false);
+    console.log('\nRemoving 1st failure domain:');
+    delete cur_tree['dom1'];
+    res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree);
+    LPOptimizer.print_change_stats(res, false);
+}
+
+run().catch(console.error);
--- a/lp/test-optimize.js
+++ b/lp/test-optimize.js
@@ -0,0 +1,94 @@
+const LPOptimizer = require('./lp-optimizer.js');
+
+// Test fixture: two-level map { <group>: { <OSD number>: <number> } }.
+// Inner keys are OSD numbers (run() below adds and removes "osd.8" in group 500);
+// the values are presumably OSD sizes/weights - confirm against lp-optimizer.js.
+const osd_tree = {
+    100: {
+        7: 3.63869,
+    },
+    300: {
+        10: 3.46089,
+        11: 3.46089,
+        12: 3.46089,
+    },
+    400: {
+        1: 3.49309,
+        2: 3.49309,
+        3: 3.49309,
+    },
+    500: {
+        4: 3.58498,
+        // OSD 8 is intentionally absent at start: run() adds it and then removes it again
+//        8: 3.58589,
+        9: 3.63869,
+    },
+    600: {
+        5: 3.63869,
+        6: 3.63869,
+    },
+    // Alternative (disabled) data set: one OSD per group
+/*    100: {
+        1: 2.72800,
+    },
+    200: {
+        2: 2.72900,
+    },
+    300: {
+        3: 1.87000,
+    },
+    400: {
+        4: 1.87000,
+    },
+    500: {
+        5: 3.63869,
+    },*/
+};
+
+// Test fixture: 3-level tree with 3 level-1 nodes, each with 2 level-2 nodes, each
+// with 2 level-3 leaves (ids 1..12) of varying size (1..5). run() below passes it
+// through LPOptimizer.flatten_tree() and feeds the result to optimize_initial().
+const crush_tree = [
+    { level: 1, children: [
+        { level: 2, children: [
+            { level: 3, id: 1, size: 3 },
+            { level: 3, id: 2, size: 2 },
+        ] },
+        { level: 2, children: [
+            { level: 3, id: 3, size: 4 },
+            { level: 3, id: 4, size: 4 },
+        ] },
+    ] },
+    { level: 1, children: [
+        { level: 2, children: [
+            { level: 3, id: 5, size: 4 },
+            { level: 3, id: 6, size: 1 },
+        ] },
+        { level: 2, children: [
+            { level: 3, id: 7, size: 3 },
+            { level: 3, id: 8, size: 5 },
+        ] },
+    ] },
+    { level: 1, children: [
+        { level: 2, children: [
+            { level: 3, id: 9, size: 5 },
+            { level: 3, id: 10, size: 2 },
+        ] },
+        { level: 2, children: [
+            { level: 3, id: 11, size: 3 },
+            { level: 3, id: 12, size: 3 },
+        ] },
+    ] },
+];
+
+async function run()
+{
+    // Test: add 1 OSD of almost the same size. Ideal data movement could be 1/12 = 8.33%. Actual is ~13%
+    // Space efficiency is ~99.5% in both cases.
+    let res = await LPOptimizer.optimize_initial(osd_tree, 256);
+    LPOptimizer.print_change_stats(res, false);
+    console.log('adding osd.8');
+    osd_tree[500][8] = 3.58589;
+    res = await LPOptimizer.optimize_change(res.int_pgs, osd_tree);
+    LPOptimizer.print_change_stats(res, false);
+    console.log('removing osd.8');
+    delete osd_tree[500][8];
+    res = await LPOptimizer.optimize_change(res.int_pgs, osd_tree);
+    LPOptimizer.print_change_stats(res, false);
+    res = await LPOptimizer.optimize_initial(LPOptimizer.flatten_tree(crush_tree, {}, 1, 3), 256);
+    LPOptimizer.print_change_stats(res, false);
+}
+
+run().catch(console.error);
--- a/messenger.cpp
+++ b/messenger.cpp
@@ -0,0 +1,398 @@
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <sys/epoll.h>
+#include <netinet/tcp.h>
+
+#include "messenger.h"
+
+// Release the buffers owned by the operation. The blockstore operation and the
+// primary op data must already be detached (asserted).
+osd_op_t::~osd_op_t()
+{
+    assert(!bs_op);
+    assert(!op_data);
+    // free(NULL) is a no-op, so no explicit NULL checks are needed
+    free(rmw_buf);
+    // Note: reusing osd_op_t WILL currently lead to memory leaks
+    // So we don't reuse it, but free it every time
+    free(buf);
+}
+
+// Register (or refresh) the wish to be connected to OSD <peer_osd>.
+// peer_state["addresses"] and peer_state["port"] are stored in wanted_peers and the
+// entry is flagged as address_changed. A connection attempt is started right away
+// unless one is marked as in progress or the last attempt was made less than
+// peer_connect_interval seconds ago.
+void osd_messenger_t::connect_peer(uint64_t peer_osd, json11::Json peer_state)
+{
+    int new_port = (int)peer_state["port"].int64_value();
+    auto wp_it = wanted_peers.find(peer_osd);
+    if (wp_it != wanted_peers.end())
+    {
+        wp_it->second.address_list = peer_state["addresses"];
+        wp_it->second.port = new_port;
+    }
+    else
+    {
+        wanted_peers[peer_osd] = (osd_wanted_peer_t){
+            .address_list = peer_state["addresses"],
+            .port = new_port,
+        };
+    }
+    auto & wp = wanted_peers[peer_osd];
+    wp.address_changed = true;
+    if (wp.connecting)
+    {
+        return;
+    }
+    if ((time(NULL) - wp.last_connect_attempt) >= peer_connect_interval)
+    {
+        try_connect_peer(peer_osd);
+    }
+}
+
+// Start a connection attempt to the current address (address_index) of a wanted peer.
+// Does nothing if the peer is not wanted or has no address at that index. If the peer
+// is already connected, it is simply removed from wanted_peers.
+void osd_messenger_t::try_connect_peer(uint64_t peer_osd)
+{
+    auto wp_it = wanted_peers.find(peer_osd);
+    if (wp_it == wanted_peers.end())
+    {
+        return;
+    }
+    if (osd_peer_fds.count(peer_osd) > 0)
+    {
+        // Already connected, nothing more is wanted from this peer
+        wanted_peers.erase(wp_it);
+        return;
+    }
+    osd_wanted_peer_t & wp = wp_it->second;
+    if (wp.address_index < wp.address_list.array_items().size())
+    {
+        wp.cur_addr = wp.address_list[wp.address_index].string_value();
+        wp.cur_port = wp.port;
+        try_connect_peer_addr(peer_osd, wp.cur_addr.c_str(), wp.cur_port);
+    }
+}
+
+// Begin a non-blocking TCP connection to <peer_host>:<peer_port> (IPv4 only; port 0
+// means the default port 11203) on behalf of OSD <peer_osd>.
+// Immediate failures are reported through on_connect_peer(peer_osd, -errno).
+// Otherwise a client entry in PEER_CONNECTING state is created and the outcome is
+// handled later by handle_connect_epoll(), or by the connect timeout timer when
+// peer_connect_timeout > 0.
+void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port)
+{
+    // Zero-initialize so that the padding (sin_zero) does not contain stack garbage
+    struct sockaddr_in addr = {};
+    int r;
+    if ((r = inet_pton(AF_INET, peer_host, &addr.sin_addr)) != 1)
+    {
+        on_connect_peer(peer_osd, -EINVAL);
+        return;
+    }
+    addr.sin_family = AF_INET;
+    addr.sin_port = htons(peer_port ? peer_port : 11203);
+    int peer_fd = socket(AF_INET, SOCK_STREAM, 0);
+    if (peer_fd < 0)
+    {
+        on_connect_peer(peer_osd, -errno);
+        return;
+    }
+    fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
+    r = connect(peer_fd, (sockaddr*)&addr, sizeof(addr));
+    if (r < 0 && errno != EINPROGRESS)
+    {
+        // Save errno: close() may overwrite it
+        int connect_errno = errno;
+        close(peer_fd);
+        on_connect_peer(peer_osd, -connect_errno);
+        return;
+    }
+    assert(peer_osd != this->osd_num);
+    // The timeout is armed only AFTER connect() has been started successfully.
+    // Previously it was armed before connect() and never cleared on immediate failure,
+    // so it fired later against a closed (and possibly reused) file descriptor
+    int timeout_id = -1;
+    if (peer_connect_timeout > 0)
+    {
+        timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, peer_fd](int timer_id)
+        {
+            // Look the client up instead of using operator[] which would insert an empty entry
+            auto cl_it = clients.find(peer_fd);
+            if (cl_it == clients.end())
+            {
+                return;
+            }
+            osd_num_t peer_osd = cl_it->second.osd_num;
+            stop_client(peer_fd);
+            on_connect_peer(peer_osd, -EIO);
+            return;
+        });
+    }
+    clients[peer_fd] = (osd_client_t){
+        .peer_addr = addr,
+        .peer_port = peer_port,
+        .peer_fd = peer_fd,
+        .peer_state = PEER_CONNECTING,
+        .connect_timeout_id = timeout_id,
+        .osd_num = peer_osd,
+        .in_buf = malloc(receive_buffer_size),
+    };
+    tfd->set_fd_handler(peer_fd, [this](int peer_fd, int epoll_events)
+    {
+        // Either OUT (connected) or HUP
+        handle_connect_epoll(peer_fd);
+    });
+}
+
+// Epoll callback for a socket in PEER_CONNECTING state: finish the non-blocking connect.
+// On failure the client is stopped and on_connect_peer() receives the negated error.
+// On success the socket is switched to the regular peer handler and the peer's
+// configuration is requested to verify its OSD number.
+void osd_messenger_t::handle_connect_epoll(int peer_fd)
+{
+    osd_client_t & cl = clients[peer_fd];
+    osd_num_t peer_osd = cl.osd_num;
+    // The attempt is over one way or another, so disarm its timeout
+    if (cl.connect_timeout_id >= 0)
+    {
+        tfd->clear_timer(cl.connect_timeout_id);
+        cl.connect_timeout_id = -1;
+    }
+    // Fetch the outcome of connect()
+    int so_error = 0;
+    socklen_t so_error_len = sizeof(so_error);
+    if (getsockopt(peer_fd, SOL_SOCKET, SO_ERROR, &so_error, &so_error_len) < 0)
+    {
+        so_error = errno;
+    }
+    if (so_error != 0)
+    {
+        stop_client(peer_fd);
+        on_connect_peer(peer_osd, -so_error);
+        return;
+    }
+    int enable = 1;
+    setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &enable, sizeof(enable));
+    cl.peer_state = PEER_CONNECTED;
+    // FIXME Disable EPOLLOUT on this fd
+    tfd->set_fd_handler(peer_fd, [this](int peer_fd, int epoll_events)
+    {
+        handle_peer_epoll(peer_fd, epoll_events);
+    });
+    // Check OSD number
+    check_peer_config(cl);
+}
+
+// Epoll callback for an established connection.
+// EPOLLRDHUP takes precedence and stops the client. EPOLLIN marks the client as
+// having data to read and wakes up the ring loop when it becomes ready.
+void osd_messenger_t::handle_peer_epoll(int peer_fd, int epoll_events)
+{
+    if (epoll_events & EPOLLRDHUP)
+    {
+        // Stop client
+        printf("[OSD %lu] client %d disconnected\n", this->osd_num, peer_fd);
+        stop_client(peer_fd);
+        return;
+    }
+    if (!(epoll_events & EPOLLIN))
+    {
+        return;
+    }
+    // Mark client as ready (i.e. some data is available)
+    osd_client_t & cl = clients[peer_fd];
+    if (++cl.read_ready == 1)
+    {
+        // Queue it only on the 0 -> 1 transition so that it is not queued twice
+        read_ready_clients.push_back(cl.peer_fd);
+        ringloop->wakeup();
+    }
+}
+
+// Completion handler of a connection attempt to OSD <peer_osd>.
+// peer_fd >= 0 is the connected socket: the peer is removed from wanted_peers and
+// its PGs are repeered. peer_fd < 0 is a negated errno: the next step is chosen as
+//   - the address list changed meanwhile: restart from the first address;
+//   - more addresses are left: try the next one;
+//   - otherwise: start over from the first address in peer_connect_interval seconds.
+// Throws std::out_of_range (from std::map::at) if the peer is not in wanted_peers.
+void osd_messenger_t::on_connect_peer(osd_num_t peer_osd, int peer_fd)
+{
+    auto & wp = wanted_peers.at(peer_osd);
+    wp.connecting = false;
+    if (peer_fd < 0)
+    {
+        printf("Failed to connect to peer OSD %lu address %s port %d: %s\n", peer_osd, wp.cur_addr.c_str(), wp.cur_port, strerror(-peer_fd));
+        if (wp.address_changed)
+        {
+            wp.address_changed = false;
+            wp.address_index = 0;
+            try_connect_peer(peer_osd);
+        }
+        // Written as index+1 < size instead of index < size-1: size() is unsigned,
+        // so size()-1 wrapped around for an empty address list and the retry timer
+        // below was never scheduled
+        else if ((size_t)(wp.address_index + 1) < wp.address_list.array_items().size())
+        {
+            // Try other addresses
+            wp.address_index++;
+            try_connect_peer(peer_osd);
+        }
+        else
+        {
+            // Retry again in <peer_connect_interval> seconds
+            wp.last_connect_attempt = time(NULL);
+            wp.address_index = 0;
+            tfd->set_timer(1000*peer_connect_interval, false, [this, peer_osd](int)
+            {
+                try_connect_peer(peer_osd);
+            });
+        }
+        return;
+    }
+    printf("Connected with peer OSD %lu (fd %d)\n", peer_osd, peer_fd);
+    wanted_peers.erase(peer_osd);
+    repeer_pgs(peer_osd);
+}
+
+// Ask a freshly connected peer for its configuration (OSD_OP_SHOW_CONFIG) and verify
+// that it really is the OSD we wanted to reach. On success the peer is registered in
+// osd_peer_fds and on_connect_peer() is called with its fd; on any failure the
+// connection is dropped. The operation object is always deleted by the callback.
+void osd_messenger_t::check_peer_config(osd_client_t & cl)
+{
+    osd_op_t *op = new osd_op_t();
+    op->op_type = OSD_OP_OUT;
+    op->send_list.push_back(op->req.buf, OSD_PACKET_SIZE);
+    op->peer_fd = cl.peer_fd;
+    op->req = {
+        .show_conf = {
+            .header = {
+                .magic = SECONDARY_OSD_OP_MAGIC,
+                .id = this->next_subop_id++,
+                .opcode = OSD_OP_SHOW_CONFIG,
+            },
+        },
+    };
+    op->callback = [this](osd_op_t *op)
+    {
+        // The callback is also invoked by cancel_op() from inside stop_client(), i.e.
+        // AFTER the client was erased from <clients>. operator[] would then insert an
+        // empty client, and the stop_client() call below would close the fd a second time
+        auto cl_it = clients.find(op->peer_fd);
+        if (cl_it == clients.end())
+        {
+            printf("Failed to get config from peer (fd %d, retval=%ld): connection is already closed\n", op->peer_fd, op->reply.hdr.retval);
+            delete op;
+            return;
+        }
+        osd_client_t & cl = cl_it->second;
+        std::string json_err;
+        json11::Json config;
+        bool err = false;
+        if (op->reply.hdr.retval < 0)
+        {
+            err = true;
+            printf("Failed to get config from OSD %lu (retval=%ld), disconnecting peer\n", cl.osd_num, op->reply.hdr.retval);
+        }
+        else
+        {
+            // An absent reply buffer is parsed as an empty string and rejected as bad JSON
+            // NOTE(review): assumes op->buf is NUL-terminated by the receiving code - confirm
+            config = json11::Json::parse(op->buf ? std::string((char*)op->buf) : std::string(), json_err);
+            if (json_err != "")
+            {
+                err = true;
+                printf("Failed to get config from OSD %lu: bad JSON: %s, disconnecting peer\n", cl.osd_num, json_err.c_str());
+            }
+            else if (config["osd_num"].uint64_value() != cl.osd_num)
+            {
+                err = true;
+                printf("Connected to OSD %lu instead of OSD %lu, peer state is outdated, disconnecting peer\n", config["osd_num"].uint64_value(), cl.osd_num);
+                on_connect_peer(cl.osd_num, -1);
+            }
+        }
+        if (err)
+        {
+            stop_client(op->peer_fd);
+            delete op;
+            return;
+        }
+        osd_peer_fds[cl.osd_num] = cl.peer_fd;
+        on_connect_peer(cl.osd_num, cl.peer_fd);
+        delete op;
+    };
+    outbox_push(op);
+}
+
+// Cancel every operation queued on a client: ops already sent and awaiting a reply,
+// ops waiting in the outbox and the op currently being written. Each one goes
+// through cancel_op(), and the containers are emptied afterwards.
+void osd_messenger_t::cancel_osd_ops(osd_client_t & cl)
+{
+    for (auto p: cl.sent_ops)
+    {
+        cancel_op(p.second);
+    }
+    cl.sent_ops.clear();
+    for (auto op: cl.outbox)
+    {
+        cancel_op(op);
+    }
+    cl.outbox.clear();
+    if (cl.write_op)
+    {
+        cancel_op(cl.write_op);
+        cl.write_op = NULL;
+    }
+}
+
+// Cancel a single operation.
+// Outbound operations (OSD_OP_OUT) are completed with a synthesized reply carrying
+// retval = -EPIPE, and their callback is invoked (the callback is responsible for
+// the op from then on). Any other operation is simply deleted.
+void osd_messenger_t::cancel_op(osd_op_t *op)
+{
+    if (op->op_type == OSD_OP_OUT)
+    {
+        op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
+        op->reply.hdr.id = op->req.hdr.id;
+        op->reply.hdr.opcode = op->req.hdr.opcode;
+        op->reply.hdr.retval = -EPIPE;
+        // Copy lambda to be unaffected by `delete op`
+        std::function<void(osd_op_t*)>(op->callback)(op);
+    }
+    else
+    {
+        // This function is only called in stop_client(), so it's fine to destroy the operation
+        delete op;
+    }
+}
+
+// Tear down the connection on <peer_fd>: forget the client, drop its fd handler,
+// cancel its outbound operations (OSD peers only), remove it from the ready queues,
+// free its receive buffer and close the socket. Unknown fds are ignored.
+// If the client was a connected OSD peer, repeer_pgs() is called for it at the end.
+void osd_messenger_t::stop_client(int peer_fd)
+{
+    assert(peer_fd != 0);
+    auto it = clients.find(peer_fd);
+    if (it == clients.end())
+    {
+        return;
+    }
+    uint64_t repeer_osd = 0;
+    // Work on a copy: the map entry is erased below, before the cleanup runs
+    osd_client_t cl = it->second;
+    if (cl.peer_state == PEER_CONNECTED)
+    {
+        if (cl.osd_num)
+        {
+            // Reload configuration from etcd when the connection is dropped
+            printf("[OSD %lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl.osd_num);
+            repeer_osd = cl.osd_num;
+        }
+        else
+        {
+            printf("[OSD %lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
+        }
+    }
+    // Erased before cancelling ops, so callbacks run from cancel_op() no longer find this client
+    clients.erase(it);
+    tfd->set_fd_handler(peer_fd, NULL);
+    // NOTE(review): outbox/write_op are cancelled only for OSD peers (osd_num != 0), and
+    // cl.connect_timeout_id is not cleared here - confirm both are intentional
+    if (cl.osd_num)
+    {
+        osd_peer_fds.erase(cl.osd_num);
+        // Cancel outbound operations
+        cancel_osd_ops(cl);
+    }
+    if (cl.read_op)
+    {
+        delete cl.read_op;
+        cl.read_op = NULL;
+    }
+    // Remove the fd from the read-ready queue (only the first occurrence is removed)
+    for (auto rit = read_ready_clients.begin(); rit != read_ready_clients.end(); rit++)
+    {
+        if (*rit == peer_fd)
+        {
+            read_ready_clients.erase(rit);
+            break;
+        }
+    }
+    // Same for the write-ready queue
+    for (auto wit = write_ready_clients.begin(); wit != write_ready_clients.end(); wit++)
+    {
+        if (*wit == peer_fd)
+        {
+            write_ready_clients.erase(wit);
+            break;
+        }
+    }
+    free(cl.in_buf);
+    close(peer_fd);
+    if (repeer_osd)
+    {
+        repeer_pgs(repeer_osd);
+    }
+}
+
+// Accept all pending connections on <listen_fd>. Every accepted socket is made
+// non-blocking, gets TCP_NODELAY, is registered as a connected client and is
+// attached to the peer epoll handler.
+// The loop ends when accept() fails; any failure other than EAGAIN is thrown as
+// std::runtime_error (so listen_fd is assumed to be non-blocking).
+void osd_messenger_t::accept_connections(int listen_fd)
+{
+    while (true)
+    {
+        sockaddr_in addr;
+        socklen_t peer_addr_size = sizeof(addr);
+        int peer_fd = accept(listen_fd, (sockaddr*)&addr, &peer_addr_size);
+        if (peer_fd < 0)
+        {
+            if (peer_fd == -1 && errno != EAGAIN)
+            {
+                throw std::runtime_error(std::string("accept: ") + strerror(errno));
+            }
+            return;
+        }
+        assert(peer_fd != 0);
+        char peer_str[256];
+        printf("[OSD %lu] new client %d: connection from %s port %d\n", this->osd_num, peer_fd,
+            inet_ntop(AF_INET, &addr.sin_addr, peer_str, 256), ntohs(addr.sin_port));
+        fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
+        int enable = 1;
+        setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &enable, sizeof(enable));
+        clients[peer_fd] = {
+            .peer_addr = addr,
+            .peer_port = ntohs(addr.sin_port),
+            .peer_fd = peer_fd,
+            .peer_state = PEER_CONNECTED,
+            .in_buf = malloc(receive_buffer_size),
+        };
+        // Add FD to epoll
+        tfd->set_fd_handler(peer_fd, [this](int peer_fd, int epoll_events)
+        {
+            handle_peer_epoll(peer_fd, epoll_events);
+        });
+    }
+}
--- a/messenger.h
+++ b/messenger.h
@@ -0,0 +1,213 @@
+#pragma once
+
+#include <sys/types.h>
+#include <stdint.h>
+#include <arpa/inet.h>
+#include <malloc.h>
+
+#include <set>
+#include <map>
+#include <deque>
+#include <vector>
+
+#include "json11/json11.hpp"
+#include "osd_ops.h"
+#include "timerfd_manager.h"
+#include "ringloop.h"
+
+#define OSD_OP_IN 0
+#define OSD_OP_OUT 1
+
+#define CL_READ_HDR 1
+#define CL_READ_DATA 2
+#define CL_READ_REPLY_DATA 3
+#define CL_WRITE_READY 1
+#define CL_WRITE_REPLY 2
+#define OSD_OP_INLINE_BUF_COUNT 16
+
+#define PEER_CONNECTING 1
+#define PEER_CONNECTED 2
+
+#define DEFAULT_PEER_CONNECT_INTERVAL 5
+#define DEFAULT_PEER_CONNECT_TIMEOUT 5
+
+// Growable list of iovec entries describing the buffers to send for one operation.
+// The first OSD_OP_INLINE_BUF_COUNT entries live in inline_buf (no allocation);
+// after that the list moves to a heap array which doubles in size when full.
+// <sent> is the number of leading entries already consumed: get_iovec()/get_size()
+// describe only the remaining ones.
+// NOTE: the object must not be copied - <buf> may point into its own inline_buf.
+struct osd_op_buf_list_t
+{
+    int count = 0, alloc = 0, sent = 0;
+    iovec *buf = NULL;
+    iovec inline_buf[OSD_OP_INLINE_BUF_COUNT];
+
+    ~osd_op_buf_list_t()
+    {
+        if (buf && buf != inline_buf)
+        {
+            free(buf);
+        }
+    }
+
+    // Pointer to the first entry that has not been sent yet
+    inline iovec* get_iovec()
+    {
+        return (buf ? buf : inline_buf) + sent;
+    }
+
+    // Number of entries that have not been sent yet
+    inline int get_size()
+    {
+        return count - sent;
+    }
+
+    // Append a buffer of <len> bytes starting at <nbuf>
+    inline void push_back(void *nbuf, size_t len)
+    {
+        if (count >= alloc)
+        {
+            if (!alloc)
+            {
+                alloc = OSD_OP_INLINE_BUF_COUNT;
+                buf = inline_buf;
+            }
+            else if (buf == inline_buf)
+            {
+                // Move from the inline storage to the heap.
+                // The old formula ((alloc/16)*16 + 1) yielded 17 and then stayed at 17
+                // forever, so the 18th entry was written past the end of the heap array
+                int old = alloc;
+                alloc = old * 2;
+                buf = (iovec*)malloc(sizeof(iovec) * alloc);
+                memcpy(buf, inline_buf, sizeof(iovec)*old);
+            }
+            else
+            {
+                alloc = alloc * 2;
+                buf = (iovec*)realloc(buf, sizeof(iovec) * alloc);
+            }
+        }
+        buf[count++] = { .iov_base = nbuf, .iov_len = len };
+    }
+};
+
+struct blockstore_op_t;
+
+struct osd_primary_op_data_t;
+
+// A single operation handled by the messenger: either received from a peer
+// (OSD_OP_IN, the default) or sent to a peer (OSD_OP_OUT).
+// The destructor frees <buf> and <rmw_buf> and asserts that <bs_op> and <op_data>
+// are already NULL.
+struct osd_op_t
+{
+    // presumably the time when processing of the operation started - not set in this header
+    timespec tv_begin;
+    // OSD_OP_IN or OSD_OP_OUT
+    uint64_t op_type = OSD_OP_IN;
+    // Socket of the client this operation belongs to (has no default value)
+    int peer_fd;
+    // Request and reply packets
+    osd_any_op_t req;
+    osd_any_reply_t reply;
+    blockstore_op_t *bs_op = NULL;
+    // Data buffer, freed by the destructor
+    void *buf = NULL;
+    // Additional buffer (presumably for read-modify-write), freed by the destructor
+    void *rmw_buf = NULL;
+    osd_primary_op_data_t* op_data = NULL;
+    // Completion callback; cancel_op() invokes it for cancelled OSD_OP_OUT operations
+    std::function<void(osd_op_t*)> callback;
+
+    // Buffers to be sent for this operation
+    osd_op_buf_list_t send_list;
+
+    ~osd_op_t();
+};
+
+// Per-connection state, stored in osd_messenger_t::clients by socket fd.
+// Describes both accepted connections and outgoing connections to other OSDs.
+struct osd_client_t
+{
+    sockaddr_in peer_addr;
+    int peer_port;
+    int peer_fd;
+    // PEER_CONNECTING or PEER_CONNECTED
+    int peer_state;
+    // Timer id of the pending connect timeout, -1 if none
+    int connect_timeout_id = -1;
+    // Number of the peer OSD, 0 for regular (non-OSD) clients
+    osd_num_t osd_num = 0;
+
+    // Receive buffer of receive_buffer_size bytes, allocated with malloc() when the client is created
+    void *in_buf = NULL;
+
+    // Read state
+    // read_ready is incremented on each EPOLLIN event; the fd is queued for reading when it becomes 1
+    int read_ready = 0;
+    osd_op_t *read_op = NULL;
+    int read_reply_id = 0;
+    iovec read_iov;
+    msghdr read_msg;
+    void *read_buf = NULL;
+    int read_remaining = 0;
+    // presumably one of CL_READ_* - TODO confirm
+    int read_state = 0;
+
+    // Incoming operations
+    std::vector<osd_op_t*> received_ops;
+
+    // Outbound operations
+    std::deque<osd_op_t*> outbox;
+    // presumably keyed by operation id - TODO confirm
+    std::map<int, osd_op_t*> sent_ops;
+
+    // PGs dirtied by this client's primary-writes (FIXME to drop the connection)
+    std::set<pg_num_t> dirty_pgs;
+
+    // Write state
+    osd_op_t *write_op = NULL;
+    msghdr write_msg;
+    // presumably one of CL_WRITE_* - TODO confirm
+    int write_state = 0;
+};
+
+// An OSD we want to be connected to, but are not connected to yet.
+// NOTE: the members have no default initializers; the visible code always creates
+// entries through value/aggregate initialization, which zero-fills the rest.
+struct osd_wanted_peer_t
+{
+    // JSON array of candidate addresses and the common port
+    json11::Json address_list;
+    int port;
+    // time() of the last failed round over all addresses
+    time_t last_connect_attempt;
+    // address_changed: the address list was updated since the last attempt
+    bool connecting, address_changed;
+    // Index in address_list of the address to try next
+    int address_index;
+    // Address and port of the current (or last) attempt, used in log messages
+    std::string cur_addr;
+    int cur_port;
+};
+
+// Operation statistics counters, all zero-initialized.
+// Each array has OSD_OP_MAX+1 slots - presumably indexed by opcode; the *_sum
+// arrays presumably accumulate execution time. TODO confirm against the code that updates them.
+struct osd_op_stats_t
+{
+    uint64_t op_stat_sum[OSD_OP_MAX+1] = { 0 };
+    uint64_t op_stat_count[OSD_OP_MAX+1] = { 0 };
+    uint64_t op_stat_bytes[OSD_OP_MAX+1] = { 0 };
+    uint64_t subop_stat_sum[OSD_OP_MAX+1] = { 0 };
+    uint64_t subop_stat_count[OSD_OP_MAX+1] = { 0 };
+};
+
+// Network messenger: owns all client/peer connections, connects to wanted OSD peers,
+// accepts incoming connections and moves operations in and out of the sockets.
+// The owner has to set <tfd>, <ringloop>, <exec_op> and <repeer_pgs>.
+struct osd_messenger_t
+{
+    // Timer and fd event manager: used for timers and per-fd epoll handlers
+    timerfd_manager_t *tfd;
+    // Woken up when a client becomes ready for reading
+    ring_loop_t *ringloop;
+
+    // osd_num_t is only for logging and asserts
+    osd_num_t osd_num;
+    // Size in bytes of the per-client receive buffer
+    int receive_buffer_size = 9000;
+    // Both are in seconds (multiplied by 1000 when passed to set_timer)
+    int peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
+    int peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
+    int log_level = 0;
+
+    // OSDs we are trying to connect to, by OSD number
+    std::map<osd_num_t, osd_wanted_peer_t> wanted_peers;
+    // Socket fds of connected and verified OSD peers, by OSD number
+    std::map<uint64_t, int> osd_peer_fds;
+    // Id generator for outgoing operations
+    uint64_t next_subop_id = 1;
+
+    // All connections by socket fd
+    std::map<int, osd_client_t> clients;
+    // Queues of fds ready for reading / writing
+    std::vector<int> read_ready_clients;
+    std::vector<int> write_ready_clients;
+
+    // op statistics
+    osd_op_stats_t stats;
+
+public:
+    void connect_peer(uint64_t osd_num, json11::Json peer_state);
+    void stop_client(int peer_fd);
+    void outbox_push(osd_op_t *cur_op);
+    // Callbacks supplied by the owner: exec_op presumably executes a received
+    // operation; repeer_pgs is called when an OSD peer connects or disconnects
+    std::function<void(osd_op_t*)> exec_op;
+    std::function<void(osd_num_t)> repeer_pgs;
+    void handle_peer_epoll(int peer_fd, int epoll_events);
+    void read_requests();
+    void send_replies();
+    void accept_connections(int listen_fd);
+
+protected:
+    // Outgoing connection establishment
+    void try_connect_peer(uint64_t osd_num);
+    void try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port);
+    void handle_connect_epoll(int peer_fd);
+    void on_connect_peer(osd_num_t peer_osd, int peer_fd);
+    void check_peer_config(osd_client_t & cl);
+    // Cancellation of queued operations, used by stop_client()
+    void cancel_osd_ops(osd_client_t & cl);
+    void cancel_op(osd_op_t *op);
+
+    // Sending
+    bool try_send(osd_client_t & cl);
+    void handle_send(int result, int peer_fd);
+
+    // Receiving
+    bool handle_read(int result, int peer_fd);
+    void handle_finished_read(osd_client_t & cl);
+    void handle_op_hdr(osd_client_t *cl);
+    void handle_reply_hdr(osd_client_t *cl);
+};
--- a/msgr_receive.cpp
+++ b/msgr_receive.cpp
@@ -0,0 +1,275 @@
+#include "messenger.h"
+
+// Drain read_ready_clients: take client FDs from the front of the queue, issue one
+// recvmsg() per FD and pass the outcome (byte count or negated errno) to handle_read().
+// handle_read() may append the FD again, so the loop runs until the queue is empty.
+void osd_messenger_t::read_requests()
+{
+    while (read_ready_clients.size() > 0)
+    {
+        int peer_fd = read_ready_clients[0];
+        read_ready_clients.erase(read_ready_clients.begin(), read_ready_clients.begin() + 1);
+        // Use find() rather than clients[peer_fd]: operator[] would silently insert
+        // a default-constructed client for an FD that is no longer registered
+        // and we would then recvmsg() into its uninitialized buffers
+        auto cl_it = clients.find(peer_fd);
+        if (cl_it == clients.end())
+        {
+            continue;
+        }
+        auto & cl = cl_it->second;
+        if (!cl.read_op || cl.read_remaining < receive_buffer_size)
+        {
+            // No operation in progress, or its remainder is smaller than the receive buffer:
+            // read into the per-client buffer, handle_read() splits it into operations
+            cl.read_iov.iov_base = cl.in_buf;
+            cl.read_iov.iov_len = receive_buffer_size;
+        }
+        else
+        {
+            // Large remainder: receive directly into the operation's own buffer
+            cl.read_iov.iov_base = cl.read_buf;
+            cl.read_iov.iov_len = cl.read_remaining;
+        }
+        cl.read_msg.msg_iov = &cl.read_iov;
+        cl.read_msg.msg_iovlen = 1;
+        int result = recvmsg(peer_fd, &cl.read_msg, 0);
+        if (result < 0)
+        {
+            result = -errno;
+        }
+        {
+            // I/O trace
+            timespec now;
+            clock_gettime(CLOCK_REALTIME, &now);
+            printf("recvmsg done %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
+        }
+        handle_read(result, peer_fd);
+    }
+}
+
+// Process the outcome of one recvmsg() on <peer_fd>.
+// <result> is the number of received bytes or a negated errno.
+// Returns true only if data was received and it filled the whole receive buffer;
+// returns false otherwise, including when the client is unknown or was disconnected.
+bool osd_messenger_t::handle_read(int result, int peer_fd)
+{
+    auto cl_it = clients.find(peer_fd);
+    if (cl_it == clients.end())
+    {
+        return false;
+    }
+    auto & cl = cl_it->second;
+    if (result < 0 && result != -EAGAIN)
+    {
+        // this is a client socket, so don't panic. just disconnect it
+        printf("Client %d socket read error: %d (%s). Disconnecting client\n", peer_fd, -result, strerror(-result));
+        stop_client(peer_fd);
+        return false;
+    }
+    if (result == -EAGAIN || result < cl.read_iov.iov_len)
+    {
+        // Short read: one readiness notification is consumed
+        cl.read_ready--;
+        if (cl.read_ready > 0)
+            read_ready_clients.push_back(peer_fd);
+    }
+    else
+    {
+        // Buffer filled completely: more data may be pending, retry
+        read_ready_clients.push_back(peer_fd);
+    }
+    if (result <= 0)
+    {
+        return false;
+    }
+    // Evaluate this now: <cl> must not be accessed once the client has been stopped
+    const bool buffer_filled = result >= cl.read_iov.iov_len;
+    if (cl.read_iov.iov_base == cl.in_buf)
+    {
+        // Compose operation(s) from the buffer
+        int remain = result;
+        void *curbuf = cl.in_buf;
+        while (remain > 0)
+        {
+            if (!cl.read_op)
+            {
+                cl.read_op = new osd_op_t;
+                cl.read_op->peer_fd = peer_fd;
+                cl.read_op->op_type = OSD_OP_IN;
+                cl.read_buf = cl.read_op->req.buf;
+                cl.read_remaining = OSD_PACKET_SIZE;
+                cl.read_state = CL_READ_HDR;
+            }
+            if (cl.read_remaining > remain)
+            {
+                // The buffer ends in the middle of the current part.
+                // read_remaining stays positive here, so nothing can be finished yet
+                memcpy(cl.read_buf, curbuf, remain);
+                cl.read_remaining -= remain;
+                cl.read_buf += remain;
+                remain = 0;
+            }
+            else
+            {
+                memcpy(cl.read_buf, curbuf, cl.read_remaining);
+                curbuf += cl.read_remaining;
+                remain -= cl.read_remaining;
+                cl.read_remaining = 0;
+                cl.read_buf = NULL;
+                handle_finished_read(cl);
+                // handle_finished_read() may disconnect the client (for example on an
+                // out-of-sync reply), which invalidates <cl>. Stop parsing in that case
+                if (clients.find(peer_fd) == clients.end())
+                {
+                    return false;
+                }
+            }
+        }
+    }
+    else
+    {
+        // Long data
+        cl.read_remaining -= result;
+        cl.read_buf += result;
+        if (cl.read_remaining <= 0)
+        {
+            // <cl> is not accessed after this call
+            handle_finished_read(cl);
+        }
+    }
+    return buffer_filled;
+}
+
+// Called when the part currently being received for <cl> (header or payload) is complete.
+// Dispatches on cl.read_state:
+// - CL_READ_HDR: the packet header is parsed as a reply or as a new operation, depending on its magic;
+// - CL_READ_DATA: the payload of an incoming operation is complete, the operation is executed;
+// - CL_READ_REPLY_DATA: the payload of a reply is complete, subop latency is accounted
+//   and the callback of the original request is invoked.
+// Any other state is a programming error (assert).
+void osd_messenger_t::handle_finished_read(osd_client_t & cl)
+{
+    if (cl.read_state == CL_READ_HDR)
+    {
+        if (cl.read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC)
+            handle_reply_hdr(&cl);
+        else
+            handle_op_hdr(&cl);
+    }
+    else if (cl.read_state == CL_READ_DATA)
+    {
+        // Operation is ready. Detach it from the client before running exec_op(),
+        // the same way handle_op_hdr() does: exec_op() may free the operation,
+        // and the client must not keep a pointer to it in read_op
+        osd_op_t *cur_op = cl.read_op;
+        cl.read_op = NULL;
+        cl.read_state = 0;
+        cl.received_ops.push_back(cur_op);
+        exec_op(cur_op);
+    }
+    else if (cl.read_state == CL_READ_REPLY_DATA)
+    {
+        // Reply is ready
+        auto req_it = cl.sent_ops.find(cl.read_reply_id);
+        // The request was found by handle_reply_hdr(); fail loudly instead of dereferencing end()
+        assert(req_it != cl.sent_ops.end());
+        osd_op_t *request = req_it->second;
+        cl.sent_ops.erase(req_it);
+        cl.read_reply_id = 0;
+        delete cl.read_op;
+        cl.read_op = NULL;
+        cl.read_state = 0;
+        // Measure subop latency
+        timespec tv_end;
+        clock_gettime(CLOCK_REALTIME, &tv_end);
+        stats.subop_stat_count[request->req.hdr.opcode]++;
+        if (!stats.subop_stat_count[request->req.hdr.opcode])
+        {
+            // The counter wrapped around: restart the average from this sample
+            stats.subop_stat_count[request->req.hdr.opcode]++;
+            stats.subop_stat_sum[request->req.hdr.opcode] = 0;
+        }
+        stats.subop_stat_sum[request->req.hdr.opcode] += (
+            (tv_end.tv_sec - request->tv_begin.tv_sec)*1000000 +
+            (tv_end.tv_nsec - request->tv_begin.tv_nsec)/1000
+        );
+        // Copy lambda to be unaffected by `delete request` inside the callback (same as in handle_reply_hdr())
+        std::function<void(osd_op_t*)>(request->callback)(request);
+    }
+    else
+    {
+        assert(0);
+    }
+}
+
+// Handle a completely received header of an incoming operation (cl->read_op).
+// Read-type opcodes get a buffer of the requested length but carry no payload;
+// write, stabilize and rollback opcodes get a buffer and expect that many payload bytes.
+// If payload is expected, the client switches to CL_READ_DATA; otherwise the
+// operation is detached from the client and passed to exec_op() right away.
+void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
+{
+    osd_op_t *op = cl->read_op;
+    const auto opcode = op->req.hdr.opcode;
+    if (opcode == OSD_OP_SECONDARY_READ)
+    {
+        if (op->req.sec_rw.len > 0)
+            op->buf = memalign(MEM_ALIGNMENT, op->req.sec_rw.len);
+        cl->read_remaining = 0;
+    }
+    else if (opcode == OSD_OP_SECONDARY_WRITE)
+    {
+        if (op->req.sec_rw.len > 0)
+            op->buf = memalign(MEM_ALIGNMENT, op->req.sec_rw.len);
+        cl->read_remaining = op->req.sec_rw.len;
+    }
+    else if (opcode == OSD_OP_SECONDARY_STABILIZE || opcode == OSD_OP_SECONDARY_ROLLBACK)
+    {
+        if (op->req.sec_stab.len > 0)
+            op->buf = memalign(MEM_ALIGNMENT, op->req.sec_stab.len);
+        cl->read_remaining = op->req.sec_stab.len;
+    }
+    else if (opcode == OSD_OP_READ)
+    {
+        if (op->req.rw.len > 0)
+            op->buf = memalign(MEM_ALIGNMENT, op->req.rw.len);
+        cl->read_remaining = 0;
+    }
+    else if (opcode == OSD_OP_WRITE)
+    {
+        if (op->req.rw.len > 0)
+            op->buf = memalign(MEM_ALIGNMENT, op->req.rw.len);
+        cl->read_remaining = op->req.rw.len;
+    }
+    if (cl->read_remaining > 0)
+    {
+        // Payload follows: keep the operation attached and read its data next
+        cl->read_buf = op->buf;
+        cl->read_state = CL_READ_DATA;
+        return;
+    }
+    // No payload: the operation is complete, detach it and execute
+    cl->read_op = NULL;
+    cl->read_state = 0;
+    cl->received_ops.push_back(op);
+    exec_op(op);
+}
+
+// Handle a completely received reply header (held in cl->read_op).
+// The matching request is looked up in cl->sent_ops by the reply id; an unknown id
+// disconnects the client. The header is copied into the request's reply buffer.
+// If the reply carries payload (read data, object listing, config text), the client
+// switches to CL_READ_REPLY_DATA; otherwise the request is completed immediately:
+// subop latency is accounted and the request's callback is invoked.
+void osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
+{
+    osd_op_t *reply_op = cl->read_op;
+    auto sent_it = cl->sent_ops.find(reply_op->req.hdr.id);
+    if (sent_it == cl->sent_ops.end())
+    {
+        // Command out of sync. Drop connection
+        printf("Client %d command out of sync: id %lu\n", cl->peer_fd, reply_op->req.hdr.id);
+        stop_client(cl->peer_fd);
+        return;
+    }
+    osd_op_t *op = sent_it->second;
+    memcpy(op->reply.buf, reply_op->req.buf, OSD_PACKET_SIZE);
+    const auto reply_opcode = op->reply.hdr.opcode;
+    const bool positive_retval = op->reply.hdr.retval > 0;
+    bool payload_follows = true;
+    if ((reply_opcode == OSD_OP_SECONDARY_READ || reply_opcode == OSD_OP_READ) && positive_retval)
+    {
+        // Read data. In this case we assume that the buffer is preallocated by the caller (!)
+        assert(op->buf);
+        cl->read_remaining = op->reply.hdr.retval;
+    }
+    else if (reply_opcode == OSD_OP_SECONDARY_LIST && positive_retval)
+    {
+        // retval is the number of obj_ver_id entries that follow
+        op->buf = memalign(MEM_ALIGNMENT, sizeof(obj_ver_id) * op->reply.hdr.retval);
+        cl->read_remaining = sizeof(obj_ver_id) * op->reply.hdr.retval;
+    }
+    else if (reply_opcode == OSD_OP_SHOW_CONFIG && positive_retval)
+    {
+        // retval is the number of payload bytes that follow
+        op->buf = malloc(op->reply.hdr.retval);
+        cl->read_remaining = op->reply.hdr.retval;
+    }
+    else
+    {
+        payload_follows = false;
+    }
+    if (payload_follows)
+    {
+        // Continue by receiving the payload into the request's buffer
+        cl->read_state = CL_READ_REPLY_DATA;
+        cl->read_reply_id = op->req.hdr.id;
+        cl->read_buf = op->buf;
+        return;
+    }
+    // Header-only reply: the request is finished
+    delete cl->read_op;
+    cl->read_state = 0;
+    cl->read_op = NULL;
+    cl->sent_ops.erase(sent_it);
+    // Measure subop latency
+    timespec tv_end;
+    clock_gettime(CLOCK_REALTIME, &tv_end);
+    stats.subop_stat_count[op->req.hdr.opcode]++;
+    if (!stats.subop_stat_count[op->req.hdr.opcode])
+    {
+        stats.subop_stat_count[op->req.hdr.opcode]++;
+        stats.subop_stat_sum[op->req.hdr.opcode] = 0;
+    }
+    stats.subop_stat_sum[op->req.hdr.opcode] += (
+        (tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
+        (tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000
+    );
+    // Copy lambda to be unaffected by `delete op`
+    std::function<void(osd_op_t*)>(op->callback)(op);
+}
--- a/msgr_send.cpp
+++ b/msgr_send.cpp
@@ -0,0 +1,154 @@
+#include "messenger.h"
+
+// Queue <cur_op> for sending to the client identified by cur_op->peer_fd.
+// Outgoing requests (OSD_OP_OUT) get their start time stamped for latency accounting.
+// Any other operation is a reply and must be listed in the client's received_ops:
+// it is removed from that list, or deleted without sending if it is not there.
+// If nothing else is pending for the client, a send is attempted immediately;
+// otherwise the client is marked write-ready and the ring loop is woken up.
+// Throws std::out_of_range (from clients.at()) when the client does not exist.
+void osd_messenger_t::outbox_push(osd_op_t *cur_op)
+{
+    assert(cur_op->peer_fd);
+    auto & cl = clients.at(cur_op->peer_fd);
+    if (cur_op->op_type != OSD_OP_OUT)
+    {
+        // Check that operation actually belongs to this client
+        auto it = cl.received_ops.begin();
+        while (it != cl.received_ops.end() && *it != cur_op)
+        {
+            it++;
+        }
+        if (it == cl.received_ops.end())
+        {
+            delete cur_op;
+            return;
+        }
+        cl.received_ops.erase(it, it+1);
+    }
+    else
+    {
+        clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
+    }
+    cl.outbox.push_back(cur_op);
+    // try_send() is only attempted when no write is in progress and this is the only queued op
+    const bool sent_directly = !cl.write_op && cl.outbox.size() <= 1 && try_send(cl);
+    if (!sent_directly)
+    {
+        if (cl.write_state == 0)
+        {
+            cl.write_state = CL_WRITE_READY;
+            write_ready_clients.push_back(cur_op->peer_fd);
+        }
+        ringloop->wakeup();
+    }
+}
+
+// Perform one sendmsg() for the client's current outgoing operation.
+// When no operation is in progress, the next one is taken from the front of cl.outbox
+// (the outbox must not be empty in that case). If it is an incoming operation
+// (OSD_OP_IN, i.e. its reply is about to be sent), execution latency and, for
+// read/write opcodes, the byte counter are accounted in <stats>.
+// The sendmsg() outcome (byte count or negated errno) is passed to handle_send(),
+// which disconnects the client on error, so callers should not touch <cl> afterwards.
+// Always returns true.
+bool osd_messenger_t::try_send(osd_client_t & cl)
+{
+    int peer_fd = cl.peer_fd;
+    if (!cl.write_op)
+    {
+        // pick next command
+        cl.write_op = cl.outbox.front();
+        cl.outbox.pop_front();
+        cl.write_state = CL_WRITE_REPLY;
+        // The opcode of an incoming operation comes from the network. Replies to
+        // rejected commands may carry an arbitrary opcode, so it must be
+        // range-checked before it is used as an index into the statistics arrays
+        // (they have OSD_OP_MAX+1 elements)
+        if (cl.write_op->op_type == OSD_OP_IN &&
+            (uint64_t)cl.write_op->req.hdr.opcode <= OSD_OP_MAX)
+        {
+            // Measure execution latency
+            timespec tv_end;
+            clock_gettime(CLOCK_REALTIME, &tv_end);
+            stats.op_stat_count[cl.write_op->req.hdr.opcode]++;
+            if (!stats.op_stat_count[cl.write_op->req.hdr.opcode])
+            {
+                // The counter wrapped around: restart the statistics from this sample
+                stats.op_stat_count[cl.write_op->req.hdr.opcode]++;
+                stats.op_stat_sum[cl.write_op->req.hdr.opcode] = 0;
+                stats.op_stat_bytes[cl.write_op->req.hdr.opcode] = 0;
+            }
+            stats.op_stat_sum[cl.write_op->req.hdr.opcode] += (
+                (tv_end.tv_sec - cl.write_op->tv_begin.tv_sec)*1000000 +
+                (tv_end.tv_nsec - cl.write_op->tv_begin.tv_nsec)/1000
+            );
+            if (cl.write_op->req.hdr.opcode == OSD_OP_READ ||
+                cl.write_op->req.hdr.opcode == OSD_OP_WRITE)
+            {
+                stats.op_stat_bytes[cl.write_op->req.hdr.opcode] += cl.write_op->req.rw.len;
+            }
+            else if (cl.write_op->req.hdr.opcode == OSD_OP_SECONDARY_READ ||
+                cl.write_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
+            {
+                stats.op_stat_bytes[cl.write_op->req.hdr.opcode] += cl.write_op->req.sec_rw.len;
+            }
+        }
+    }
+    cl.write_msg.msg_iov = cl.write_op->send_list.get_iovec();
+    cl.write_msg.msg_iovlen = cl.write_op->send_list.get_size();
+    // MSG_NOSIGNAL: a closed connection is reported as an error instead of raising SIGPIPE
+    int result = sendmsg(peer_fd, &cl.write_msg, MSG_NOSIGNAL);
+    if (result < 0)
+        result = -errno;
+    {
+        // I/O trace
+        timespec now;
+        clock_gettime(CLOCK_REALTIME, &now);
+        printf("sendmsg done %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
+    }
+    handle_send(result, peer_fd);
+    return true;
+}
+
+// Drain write_ready_clients: take client FDs from the front of the queue and call
+// try_send() for each. handle_send() (called by try_send()) may append the FD again,
+// so the loop runs until the queue is empty.
+void osd_messenger_t::send_replies()
+{
+    while (write_ready_clients.size() > 0)
+    {
+        int peer_fd = write_ready_clients[0];
+        write_ready_clients.erase(write_ready_clients.begin(), write_ready_clients.begin() + 1);
+        // Use find() rather than clients[peer_fd]: operator[] would insert a
+        // default-constructed client for a stale FD, and try_send() would then
+        // take an operation from its empty outbox
+        auto cl_it = clients.find(peer_fd);
+        if (cl_it == clients.end())
+        {
+            continue;
+        }
+        try_send(cl_it->second);
+    }
+}
+
+// Process the outcome of one sendmsg() on <peer_fd>.
+// <result> is the number of bytes sent or a negated errno. Errors other than EAGAIN
+// disconnect the client. Otherwise the sent bytes are subtracted from the current
+// operation's send list; once the list is fully sent, an incoming operation (its reply
+// has been delivered) is deleted and an outgoing request is remembered in sent_ops
+// until its reply arrives. A client that still has something to send is queued again.
+void osd_messenger_t::handle_send(int result, int peer_fd)
+{
+    auto cl_it = clients.find(peer_fd);
+    if (cl_it == clients.end())
+    {
+        return;
+    }
+    auto & cl = cl_it->second;
+    if (result < 0 && result != -EAGAIN)
+    {
+        // this is a client socket, so don't panic. just disconnect it
+        printf("Client %d socket write error: %d (%s). Disconnecting client\n", peer_fd, -result, strerror(-result));
+        stop_client(peer_fd);
+        return;
+    }
+    if (result >= 0)
+    {
+        osd_op_t *cur_op = cl.write_op;
+        // Skip the buffers that were sent completely, shrink the one that was sent partially
+        while (result > 0 && cur_op->send_list.sent < cur_op->send_list.count)
+        {
+            iovec & iov = cur_op->send_list.buf[cur_op->send_list.sent];
+            if (iov.iov_len > result)
+            {
+                iov.iov_len -= result;
+                iov.iov_base += result;
+                break;
+            }
+            result -= iov.iov_len;
+            cur_op->send_list.sent++;
+        }
+        if (cur_op->send_list.sent >= cur_op->send_list.count)
+        {
+            // Done
+            if (cur_op->op_type != OSD_OP_IN)
+            {
+                cl.sent_ops[cur_op->req.hdr.id] = cur_op;
+            }
+            else
+            {
+                delete cur_op;
+            }
+            cl.write_op = NULL;
+            if (cl.outbox.size() > 0)
+                cl.write_state = CL_WRITE_READY;
+            else
+                cl.write_state = 0;
+        }
+    }
+    if (cl.write_state != 0)
+    {
+        write_ready_clients.push_back(peer_fd);
+    }
+}
--- a/osd.cpp
+++ b/osd.cpp
@@ -7,7 +7,9 @@

 #include "osd.h"

-static const char* osd_op_names[] = {
+#define MAX_EPOLL_EVENTS 64
+
+const char* osd_op_names[] = {
    "",
    "read",
    "write",
@@ -21,6 +23,7 @@ static const char* osd_op_names[] = {
    "primary_read",
    "primary_write",
    "primary_sync",
+    "primary_delete",
 };

 osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop)
@@ -28,50 +31,110 @@ osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringlo
    this->config = config;
    this->bs = bs;
    this->ringloop = ringloop;
-    this->tick_tfd = new timerfd_interval(ringloop, 3, [this]()
-    {
-        for (int i = 0; i <= OSD_OP_MAX; i++)
-        {
-            if (op_stat_count[i] != 0)
-            {
-                printf("avg latency for op %d (%s): %ld us\n", i, osd_op_names[i], op_stat_sum[i]/op_stat_count[i]);
-                op_stat_count[i] = 0;
-                op_stat_sum[i] = 0;
-            }
-        }
-        for (int i = 0; i <= OSD_OP_MAX; i++)
-        {
-            if (subop_stat_count[i] != 0)
-            {
-                printf("avg latency for subop %d (%s): %ld us\n", i, osd_op_names[i], subop_stat_sum[i]/subop_stat_count[i]);
-                subop_stat_count[i] = 0;
-                subop_stat_sum[i] = 0;
-            }
-        }
-        if (send_stat_count != 0)
-        {
-            printf("avg latency to send stabilize subop: %ld us\n", send_stat_sum/send_stat_count);
-            send_stat_count = 0;
-            send_stat_sum = 0;
-        }
-    });
+
    this->bs_block_size = bs->get_block_size();
    // FIXME: use bitmap granularity instead
    this->bs_disk_alignment = bs->get_disk_alignment();

-    bind_address = config["bind_address"];
-    if (bind_address == "")
-        bind_address = "0.0.0.0";
-    bind_port = strtoull(config["bind_port"].c_str(), NULL, 10);
-    if (!bind_port || bind_port > 65535)
-        bind_port = 11203;
+    parse_config(config);
+
+    epoll_fd = epoll_create(1);
+    if (epoll_fd < 0)
+    {
+        throw std::runtime_error(std::string("epoll_create: ") + strerror(errno));
+    }
+
+    this->tfd = new timerfd_manager_t([this](int fd, std::function<void(int, int)> handler) { set_fd_handler(fd, handler); });
+    this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
+    {
+        print_stats();
+    });
+
+    c_cli.tfd = this->tfd;
+    c_cli.ringloop = this->ringloop;
+    c_cli.exec_op = [this](osd_op_t *op) { exec_op(op); };
+    c_cli.repeer_pgs = [this](osd_num_t peer_osd) { repeer_pgs(peer_osd); };
+
+    init_cluster();
+
+    consumer.loop = [this]() { loop(); };
+    ringloop->register_consumer(&consumer);
+}
+
+// Release what the OSD owns: the timer manager, the ring loop consumer
+// registration, the epoll descriptor and the listening socket.
+osd_t::~osd_t()
+{
+    // delete on a NULL pointer is a no-op, so no explicit check is needed
+    delete tfd;
+    tfd = NULL;
+    ringloop->unregister_consumer(&consumer);
+    close(epoll_fd);
+    close(listen_fd);
+}
+
+void osd_t::parse_config(blockstore_config_t & config) // Fill OSD and messenger (c_cli) settings from the string configuration; throws std::runtime_error if osd_num is missing or 0
+{
+    // Initial startup configuration
+    json11::Json json_config = json11::Json(config);
+    st_cli.parse_config(json_config);
+    etcd_report_interval = strtoull(config["etcd_report_interval"].c_str(), NULL, 10);
+    if (etcd_report_interval <= 0)
+        etcd_report_interval = 30; // default when the key is missing or not a positive number
    osd_num = strtoull(config["osd_num"].c_str(), NULL, 10);
    if (!osd_num)
        throw std::runtime_error("osd_num is required in the configuration");
-    run_primary = config["run_primary"] == "true" || config["run_primary"] == "1" || config["run_primary"] == "yes";
-    if (run_primary)
-        init_primary();
+    c_cli.osd_num = osd_num;
+    run_primary = config["run_primary"] != "false" && config["run_primary"] != "0" && config["run_primary"] != "no"; // enabled unless explicitly turned off
+    // Cluster configuration
+    bind_address = config["bind_address"];
+    if (bind_address == "")
+        bind_address = "0.0.0.0";
+    bind_port = stoull_full(config["bind_port"]);
+    if (bind_port <= 0 || bind_port > 65535)
+        bind_port = 0; // bind_socket() looks up the actual port with getsockname() when bind_port is 0
+    if (config["immediate_commit"] == "all")
+        immediate_commit = IMMEDIATE_ALL;
+    else if (config["immediate_commit"] == "small")
+        immediate_commit = IMMEDIATE_SMALL;
+    if (config.find("autosync_interval") != config.end())
+    {
+        autosync_interval = strtoull(config["autosync_interval"].c_str(), NULL, 10);
+        if (autosync_interval > MAX_AUTOSYNC_INTERVAL)
+            autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // too large values fall back to the default instead of being clamped
+    }
+    if (config.find("client_queue_depth") != config.end())
+    {
+        client_queue_depth = strtoull(config["client_queue_depth"].c_str(), NULL, 10);
+        if (client_queue_depth < 128)
+            client_queue_depth = 128; // lower bound
+    }
+    if (config.find("pg_stripe_size") != config.end())
+    {
+        pg_stripe_size = strtoull(config["pg_stripe_size"].c_str(), NULL, 10);
+        if (!pg_stripe_size || !bs_block_size || pg_stripe_size < bs_block_size || (pg_stripe_size % bs_block_size) != 0)
+            pg_stripe_size = DEFAULT_PG_STRIPE_SIZE; // must be a non-zero multiple of the blockstore block size
+    }
+    recovery_queue_depth = strtoull(config["recovery_queue_depth"].c_str(), NULL, 10);
+    if (recovery_queue_depth < 1 || recovery_queue_depth > MAX_RECOVERY_QUEUE)
+        recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
+    if (config["readonly"] == "true" || config["readonly"] == "1" || config["readonly"] == "yes")
+        readonly = true; // exec_op() answers -EROFS to opcodes other than reads, list and show_config in this mode
+    print_stats_interval = strtoull(config["print_stats_interval"].c_str(), NULL, 10);
+    if (!print_stats_interval)
+        print_stats_interval = 3; // the constructor passes print_stats_interval*1000 to set_timer(); presumably seconds - TODO confirm
+    c_cli.peer_connect_interval = strtoull(config["peer_connect_interval"].c_str(), NULL, 10);
+    if (!c_cli.peer_connect_interval)
+        c_cli.peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
+    c_cli.peer_connect_timeout = strtoull(config["peer_connect_timeout"].c_str(), NULL, 10);
+    if (!c_cli.peer_connect_timeout)
+        c_cli.peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
+    log_level = strtoull(config["log_level"].c_str(), NULL, 10);
+    c_cli.log_level = log_level;
+}

+void osd_t::bind_socket()
+{
    listen_fd = socket(AF_INET, SOCK_STREAM, 0);
    if (listen_fd < 0)
    {
@@ -88,13 +151,27 @@ osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringlo
        throw std::runtime_error("bind address "+bind_address+(r == 0 ? " is not valid" : ": no ipv4 support"));
    }
    addr.sin_family = AF_INET;
-    addr.sin_port = htons(bind_port);

+    addr.sin_port = htons(bind_port);
    if (bind(listen_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
    {
        close(listen_fd);
        throw std::runtime_error(std::string("bind: ") + strerror(errno));
    }
+    if (bind_port == 0)
+    {
+        socklen_t len = sizeof(addr);
+        if (getsockname(listen_fd, (sockaddr *)&addr, &len) == -1)
+        {
+            close(listen_fd);
+            throw std::runtime_error(std::string("getsockname: ") + strerror(errno));
+        }
+        listening_port = ntohs(addr.sin_port);
+    }
+    else
+    {
+        listening_port = bind_port;
+    }

    if (listen(listen_fd, listen_backlog) < 0)
    {
@@ -104,13 +181,6 @@ osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringlo

    fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);

-    epoll_fd = epoll_create(1);
-    if (epoll_fd < 0)
-    {
-        close(listen_fd);
-        throw std::runtime_error(std::string("epoll_create: ") + strerror(errno));
-    }
-
    epoll_event ev;
    ev.data.fd = listen_fd;
    ev.events = EPOLLIN | EPOLLET;
@@ -120,39 +190,6 @@ osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringlo
        close(epoll_fd);
        throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
    }
-
-    consumer.loop = [this]() { loop(); };
-    ringloop->register_consumer(consumer);
-}
-
-osd_t::~osd_t()
-{
-    delete tick_tfd;
-    ringloop->unregister_consumer(consumer);
-    close(epoll_fd);
-    close(listen_fd);
-}
-
-osd_op_t::~osd_op_t()
-{
-    if (bs_op)
-    {
-        delete bs_op;
-    }
-    if (op_data)
-    {
-        free(op_data);
-    }
-    if (rmw_buf)
-    {
-        free(rmw_buf);
-    }
-    if (buf)
-    {
-        // Note: reusing osd_op_t WILL currently lead to memory leaks
-        // So we don't reuse it, but free it every time
-        free(buf);
-    }
 }

 bool osd_t::shutdown()
@@ -173,13 +210,42 @@ void osd_t::loop()
        wait_state = 1;
    }
    handle_peers();
-    read_requests();
-    send_replies();
+    c_cli.read_requests();
+    c_cli.send_replies();
    ringloop->submit();
 }

+// Register, replace or remove the epoll callback of a file descriptor.
+// An empty <handler> removes <fd> from epoll (an FD that epoll does not know is not an error)
+// and forgets its callback. A non-empty <handler> adds <fd> to epoll, or modifies the existing
+// registration, for edge-triggered input, output and peer-hangup events.
+// Throws std::runtime_error when epoll_ctl() fails.
+void osd_t::set_fd_handler(int fd, std::function<void(int, int)> handler)
+{
+    if (!handler)
+    {
+        // Removal
+        if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, NULL) < 0 && errno != ENOENT)
+        {
+            throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
+        }
+        epoll_handlers.erase(fd);
+        return;
+    }
+    // Registration: modify the epoll entry if a handler for this FD already exists
+    const bool already_registered = epoll_handlers.find(fd) != epoll_handlers.end();
+    const int epoll_op = already_registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
+    epoll_event ev;
+    ev.data.fd = fd;
+    ev.events = EPOLLOUT | EPOLLIN | EPOLLRDHUP | EPOLLET;
+    if (epoll_ctl(epoll_fd, epoll_op, fd, &ev) < 0)
+    {
+        throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
+    }
+    epoll_handlers[fd] = handler;
+}
+
 void osd_t::handle_epoll_events()
 {
+    {
+        timespec now;
+        clock_gettime(CLOCK_REALTIME, &now);
+        printf("epoll %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
+    }
    io_uring_sqe *sqe = ringloop->get_sqe();
    if (!sqe)
    {
@@ -204,63 +270,12 @@ restart:
    {
        if (events[i].data.fd == listen_fd)
        {
-            // Accept new connections
-            sockaddr_in addr;
-            socklen_t peer_addr_size = sizeof(addr);
-            int peer_fd;
-            while ((peer_fd = accept(listen_fd, (sockaddr*)&addr, &peer_addr_size)) >= 0)
-            {
-                char peer_str[256];
-                printf("osd: new client %d: connection from %s port %d\n", peer_fd, inet_ntop(AF_INET, &addr.sin_addr, peer_str, 256), ntohs(addr.sin_port));
-                fcntl(peer_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
-                int one = 1;
-                setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
-                clients[peer_fd] = {
-                    .peer_addr = addr,
-                    .peer_port = ntohs(addr.sin_port),
-                    .peer_fd = peer_fd,
-                    .peer_state = PEER_CONNECTED,
-                };
-                // Add FD to epoll
-                epoll_event ev;
-                ev.data.fd = peer_fd;
-                ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
-                if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, peer_fd, &ev) < 0)
-                {
-                    throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
-                }
-                // Try to accept next connection
-                peer_addr_size = sizeof(addr);
-            }
-            if (peer_fd == -1 && errno != EAGAIN)
-            {
-                throw std::runtime_error(std::string("accept: ") + strerror(errno));
-            }
+            c_cli.accept_connections(listen_fd);
        }
        else
        {
-            auto & cl = clients[events[i].data.fd];
-            if (cl.peer_state == PEER_CONNECTING)
-            {
-                // Either OUT (connected) or HUP
-                handle_connect_result(cl.peer_fd);
-            }
-            else if (events[i].events & EPOLLRDHUP)
-            {
-                // Stop client
-                printf("osd: client %d disconnected\n", cl.peer_fd);
-                stop_client(cl.peer_fd);
-            }
-            else
-            {
-                // Mark client as ready (i.e. some data is available)
-                cl.read_ready++;
-                if (cl.read_ready == 1)
-                {
-                    read_ready_clients.push_back(cl.peer_fd);
-                    ringloop->wakeup();
-                }
-            }
+            auto & cb = epoll_handlers[events[i].data.fd];
+            cb(events[i].data.fd, events[i].events);
        }
    }
    if (nfds == MAX_EPOLL_EVENTS)
@@ -269,85 +284,6 @@ restart:
    }
 }

-void osd_t::cancel_osd_ops(osd_client_t & cl)
-{
-    for (auto p: cl.sent_ops)
-    {
-        cancel_op(p.second);
-    }
-    cl.sent_ops.clear();
-    for (auto op: cl.outbox)
-    {
-        cancel_op(op);
-    }
-    cl.outbox.clear();
-    if (cl.write_op)
-    {
-        cancel_op(cl.write_op);
-        cl.write_op = NULL;
-    }
-}
-
-void osd_t::cancel_op(osd_op_t *op)
-{
-    if (op->op_type == OSD_OP_OUT)
-    {
-        op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
-        op->reply.hdr.id = op->req.hdr.id;
-        op->reply.hdr.opcode = op->req.hdr.opcode;
-        op->reply.hdr.retval = -EPIPE;
-        op->callback(op);
-    }
-    else
-    {
-        delete op;
-    }
-}
-
-void osd_t::stop_client(int peer_fd)
-{
-    auto it = clients.find(peer_fd);
-    if (it == clients.end())
-    {
-        return;
-    }
-    auto & cl = it->second;
-    if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, peer_fd, NULL) < 0)
-    {
-        throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
-    }
-    if (cl.osd_num)
-    {
-        // Cancel outbound operations
-        cancel_osd_ops(cl);
-        osd_peer_fds.erase(cl.osd_num);
-        repeer_pgs(cl.osd_num, false);
-        peering_state |= OSD_PEERING_PEERS;
-    }
-    if (cl.read_op)
-    {
-        delete cl.read_op;
-    }
-    for (auto rit = read_ready_clients.begin(); rit != read_ready_clients.end(); rit++)
-    {
-        if (*rit == peer_fd)
-        {
-            read_ready_clients.erase(rit);
-            break;
-        }
-    }
-    for (auto wit = write_ready_clients.begin(); wit != write_ready_clients.end(); wit++)
-    {
-        if (*wit == peer_fd)
-        {
-            write_ready_clients.erase(wit);
-            break;
-        }
-    }
-    clients.erase(it);
-    close(peer_fd);
-}
-
 void osd_t::exec_op(osd_op_t *cur_op)
 {
    clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
@@ -357,23 +293,29 @@ void osd_t::exec_op(osd_op_t *cur_op)
        delete cur_op;
        return;
    }
+    inflight_ops++;
    cur_op->send_list.push_back(cur_op->reply.buf, OSD_PACKET_SIZE);
    if (cur_op->req.hdr.magic != SECONDARY_OSD_OP_MAGIC ||
        cur_op->req.hdr.opcode < OSD_OP_MIN || cur_op->req.hdr.opcode > OSD_OP_MAX ||
        (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_READ || cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE) &&
-        (cur_op->req.sec_rw.len > OSD_RW_MAX || cur_op->req.sec_rw.len % OSD_RW_ALIGN || cur_op->req.sec_rw.offset % OSD_RW_ALIGN) ||
-        (cur_op->req.hdr.opcode == OSD_OP_READ || cur_op->req.hdr.opcode == OSD_OP_WRITE) &&
-        (cur_op->req.rw.len > OSD_RW_MAX || cur_op->req.rw.len % OSD_RW_ALIGN || cur_op->req.rw.offset % OSD_RW_ALIGN))
+        (cur_op->req.sec_rw.len > OSD_RW_MAX || cur_op->req.sec_rw.len % bs_disk_alignment || cur_op->req.sec_rw.offset % bs_disk_alignment) ||
+        (cur_op->req.hdr.opcode == OSD_OP_READ || cur_op->req.hdr.opcode == OSD_OP_WRITE || cur_op->req.hdr.opcode == OSD_OP_DELETE) &&
+        (cur_op->req.rw.len > OSD_RW_MAX || cur_op->req.rw.len % bs_disk_alignment || cur_op->req.rw.offset % bs_disk_alignment))
    {
        // Bad command
-        cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
-        cur_op->reply.hdr.id = cur_op->req.hdr.id;
-        cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
-        cur_op->reply.hdr.retval = -EINVAL;
-        outbox_push(this->clients[cur_op->peer_fd], cur_op);
+        finish_op(cur_op, -EINVAL);
+        return;
+    }
+    if (readonly &&
+        cur_op->req.hdr.opcode != OSD_OP_SECONDARY_READ &&
+        cur_op->req.hdr.opcode != OSD_OP_SECONDARY_LIST &&
+        cur_op->req.hdr.opcode != OSD_OP_READ &&
+        cur_op->req.hdr.opcode != OSD_OP_SHOW_CONFIG)
+    {
+        // Readonly mode
+        finish_op(cur_op, -EROFS);
        return;
    }
-    inflight_ops++;
    if (cur_op->req.hdr.opcode == OSD_OP_TEST_SYNC_STAB_ALL)
    {
        exec_sync_stab_all(cur_op);
@@ -394,8 +336,84 @@ void osd_t::exec_op(osd_op_t *cur_op)
    {
        continue_primary_sync(cur_op);
    }
+    else if (cur_op->req.hdr.opcode == OSD_OP_DELETE)
+    {
+        continue_primary_del(cur_op);
+    }
    else
    {
        exec_secondary(cur_op);
    }
 }
+
+void osd_t::reset_stats() // Zero every statistics counter this class keeps
+{
+    c_cli.stats = { 0 }; // live op/subop counters owned by the messenger
+    prev_stats = { 0 }; // snapshot that print_stats() diffs against
+    memset(recovery_stat_count, 0, sizeof(recovery_stat_count)); // both rows: running total and last-printed value
+    memset(recovery_stat_bytes, 0, sizeof(recovery_stat_bytes));
+}
+
+void osd_t::print_stats() // Print op/subop/recovery deltas since the previous call, then the incomplete/degraded/misplaced object counters
+{
+    for (int i = 0; i <= OSD_OP_MAX; i++) // ops: one line per opcode whose count changed since the last print
+    {
+        if (c_cli.stats.op_stat_count[i] != prev_stats.op_stat_count[i])
+        {
+            uint64_t avg = (c_cli.stats.op_stat_sum[i] - prev_stats.op_stat_sum[i])/(c_cli.stats.op_stat_count[i] - prev_stats.op_stat_count[i]);
+            uint64_t bw = (c_cli.stats.op_stat_bytes[i] - prev_stats.op_stat_bytes[i]) / print_stats_interval;
+            if (c_cli.stats.op_stat_bytes[i] != 0) // opcodes with a zero total byte counter are printed without the B/W column
+            {
+                printf(
+                    "[OSD %lu] avg latency for op %d (%s): %lu us, B/W: %.2f %s\n", osd_num, i, osd_op_names[i], avg,
+                    (bw > 1024*1024*1024 ? bw/1024.0/1024/1024 : (bw > 1024*1024 ? bw/1024.0/1024 : bw/1024.0)),
+                    (bw > 1024*1024*1024 ? "GB/s" : (bw > 1024*1024 ? "MB/s" : "KB/s"))
+                );
+            }
+            else
+            {
+                printf("[OSD %lu] avg latency for op %d (%s): %lu us\n", osd_num, i, osd_op_names[i], avg);
+            }
+            prev_stats.op_stat_count[i] = c_cli.stats.op_stat_count[i];
+            prev_stats.op_stat_sum[i] = c_cli.stats.op_stat_sum[i];
+            prev_stats.op_stat_bytes[i] = c_cli.stats.op_stat_bytes[i];
+        }
+    }
+    for (int i = 0; i <= OSD_OP_MAX; i++) // subops: same scheme, latency only
+    {
+        if (c_cli.stats.subop_stat_count[i] != prev_stats.subop_stat_count[i])
+        {
+            uint64_t avg = (c_cli.stats.subop_stat_sum[i] - prev_stats.subop_stat_sum[i])/(c_cli.stats.subop_stat_count[i] - prev_stats.subop_stat_count[i]);
+            printf("[OSD %lu] avg latency for subop %d (%s): %lu us\n", osd_num, i, osd_op_names[i], avg); // avg is uint64_t: %lu, matching the op lines above
+            prev_stats.subop_stat_count[i] = c_cli.stats.subop_stat_count[i];
+            prev_stats.subop_stat_sum[i] = c_cli.stats.subop_stat_sum[i];
+        }
+    }
+    for (int i = 0; i < 2; i++) // recovery: i indexes recovery_stat_names; row [0] is the running total, row [1] the value at the previous print
+    {
+        if (recovery_stat_count[0][i] != recovery_stat_count[1][i])
+        {
+            uint64_t bw = (recovery_stat_bytes[0][i] - recovery_stat_bytes[1][i]) / print_stats_interval;
+            printf(
+                "[OSD %lu] %s recovery: %.1f op/s, B/W: %.2f %s\n", osd_num, recovery_stat_names[i],
+                (recovery_stat_count[0][i] - recovery_stat_count[1][i]) * 1.0 / print_stats_interval,
+                (bw > 1024*1024*1024 ? bw/1024.0/1024/1024 : (bw > 1024*1024 ? bw/1024.0/1024 : bw/1024.0)),
+                (bw > 1024*1024*1024 ? "GB/s" : (bw > 1024*1024 ? "MB/s" : "KB/s"))
+            );
+            recovery_stat_count[1][i] = recovery_stat_count[0][i];
+            recovery_stat_bytes[1][i] = recovery_stat_bytes[0][i];
+        }
+    }
+    if (incomplete_objects > 0)
+    {
+        printf("[OSD %lu] %lu object(s) incomplete\n", osd_num, incomplete_objects);
+    }
+    if (degraded_objects > 0)
+    {
+        printf("[OSD %lu] %lu object(s) degraded\n", osd_num, degraded_objects);
+    }
+    if (misplaced_objects > 0)
+    {
+        printf("[OSD %lu] %lu object(s) misplaced\n", osd_num, misplaced_objects);
+    }
+}
--- a/osd.h
+++ b/osd.h
@@ -15,144 +15,29 @@

 #include "blockstore.h"
 #include "ringloop.h"
-#include "timerfd_interval.h"
-#include "osd_ops.h"
+#include "timerfd_manager.h"
 #include "osd_peering_pg.h"
+#include "messenger.h"
+#include "etcd_state_client.h"

-#include "sparsepp/sparsepp/spp.h"
+#define OSD_LOADING_PGS 0x01
+#define OSD_PEERING_PGS 0x04
+#define OSD_FLUSHING_PGS 0x08
+#define OSD_RECOVERING 0x10

-#define OSD_OP_IN 0
-#define OSD_OP_OUT 1
+#define IMMEDIATE_NONE 0
+#define IMMEDIATE_SMALL 1
+#define IMMEDIATE_ALL 2

-#define CL_READ_OP 1
-#define CL_READ_DATA 2
-#define CL_READ_REPLY_DATA 3
-#define CL_WRITE_READY 1
-#define CL_WRITE_REPLY 2
-#define MAX_EPOLL_EVENTS 64
-#define OSD_OP_INLINE_BUF_COUNT 16
-
-#define PEER_CONNECTING 1
-#define PEER_CONNECTED 2
-#define OSD_PEERING_PEERS 1
-#define OSD_PEERING_PGS 2
+#define MAX_AUTOSYNC_INTERVAL 3600
+#define DEFAULT_AUTOSYNC_INTERVAL 5
+#define MAX_RECOVERY_QUEUE 2048
+#define DEFAULT_RECOVERY_QUEUE 4
+#define DEFAULT_PG_STRIPE_SIZE (4*1024*1024) // 4 MB by default; parenthesized so the macro expands as a single value inside larger expressions

 //#define OSD_STUB

-struct osd_op_buf_list_t
-{
-    int count = 0, alloc = 0, sent = 0;
-    iovec *buf = NULL;
-    iovec inline_buf[OSD_OP_INLINE_BUF_COUNT];
-
-    ~osd_op_buf_list_t()
-    {
-        if (buf && buf != inline_buf)
-        {
-            free(buf);
-        }
-    }
-
-    inline iovec* get_iovec()
-    {
-        return (buf ? buf : inline_buf) + sent;
-    }
-
-    inline int get_size()
-    {
-        return count - sent;
-    }
-
-    inline void push_back(void *nbuf, size_t len)
-    {
-        if (count >= alloc)
-        {
-            if (!alloc)
-            {
-                alloc = OSD_OP_INLINE_BUF_COUNT;
-                buf = inline_buf;
-            }
-            else if (buf == inline_buf)
-            {
-                int old = alloc;
-                alloc = ((alloc/16)*16 + 1);
-                buf = (iovec*)malloc(sizeof(iovec) * alloc);
-                memcpy(buf, inline_buf, sizeof(iovec)*old);
-            }
-            else
-            {
-                alloc = ((alloc/16)*16 + 1);
-                buf = (iovec*)realloc(buf, sizeof(iovec) * alloc);
-            }
-        }
-        buf[count++] = { .iov_base = nbuf, .iov_len = len };
-    }
-};
-
-struct osd_primary_op_data_t;
-
-struct osd_op_t
-{
-    timespec tv_begin;
-    timespec tv_send;
-    int op_type = OSD_OP_IN;
-    int peer_fd;
-    osd_any_op_t req;
-    osd_any_reply_t reply;
-    blockstore_op_t *bs_op = NULL;
-    void *buf = NULL;
-    void *rmw_buf = NULL;
-    osd_primary_op_data_t* op_data = NULL;
-    std::function<void(osd_op_t*)> callback;
-
-    osd_op_buf_list_t send_list;
-
-    ~osd_op_t();
-};
-
-struct osd_peer_def_t
-{
-    osd_num_t osd_num = 0;
-    std::string addr;
-    int port = 0;
-    time_t last_connect_attempt = 0;
-};
-
-struct osd_client_t
-{
-    sockaddr_in peer_addr;
-    int peer_port;
-    int peer_fd;
-    int peer_state;
-    std::function<void(osd_num_t, int)> connect_callback;
-    osd_num_t osd_num = 0;
-
-    // Read state
-    int read_ready = 0;
-    osd_op_t *read_op = NULL;
-    int read_reply_id = 0;
-    iovec read_iov;
-    msghdr read_msg;
-    void *read_buf = NULL;
-    int read_remaining = 0;
-    int read_state = 0;
-
-    // Outbound operations sent to this client (which is probably an OSD peer)
-    std::map<int, osd_op_t*> sent_ops;
-
-    // Outbound messages (replies or requests)
-    std::deque<osd_op_t*> outbox;
-
-    // PGs dirtied by this client's primary-writes
-    std::set<pg_num_t> dirty_pgs;
-
-    // Write state
-    osd_op_t *write_op = NULL;
-    msghdr write_msg;
-    int write_state = 0;
-};
-
-struct osd_rmw_stripe_t;
+extern const char* osd_op_names[];

 struct osd_object_id_t
 {
@@ -160,26 +45,58 @@ struct osd_object_id_t
    object_id oid;
 };

+struct osd_recovery_op_t // Value type of osd_t::recovery_ops, which is keyed by object_id
+{
+    int st = 0; // presumably the current step of the recovery state machine — TODO confirm
+    bool degraded = false; // presumably selects "degraded" vs "misplaced" recovery accounting — TODO confirm
+    pg_num_t pg_num = 0;
+    object_id oid = { 0 };
+    osd_op_t *osd_op = NULL; // NULL until an operation is attached; ownership is not visible here
+};
+
 class osd_t
 {
    // config

+    blockstore_config_t config;
+    int etcd_report_interval = 30;
+
+    bool readonly = false;
    osd_num_t osd_num = 1; // OSD numbers start with 1
    bool run_primary = false;
-    std::vector<osd_peer_def_t> peers;
-    blockstore_config_t config;
    std::string bind_address;
    int bind_port, listen_backlog;
+    // FIXME: Implement client queue depth limit
    int client_queue_depth = 128;
    bool allow_test_ops = true;
+    int print_stats_interval = 3;
+    int immediate_commit = IMMEDIATE_NONE;
+    int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // sync every 5 seconds
+    int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
+    int log_level = 0;

-    // peer OSDs
+    // cluster state

-    std::map<uint64_t, int> osd_peer_fds;
-    std::vector<pg_t> pgs;
+    etcd_state_client_t st_cli;
+    osd_messenger_t c_cli;
+    int etcd_failed_attempts = 0;
+    std::string etcd_lease_id;
+    json11::Json self_state;
+    bool loading_peer_config = false;
+    std::set<pg_num_t> pg_state_dirty;
+    bool pg_config_applied = false;
+    bool etcd_reporting_pg_state = false;
+    bool etcd_reporting_stats = false;
+
+    // peers and PGs
+
+    std::map<pg_num_t, pg_t> pgs;
+    std::set<pg_num_t> dirty_pgs;
+    uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0;
    int peering_state = 0;
    unsigned pg_count = 0;
-    uint64_t next_subop_id = 1;
+    std::map<object_id, osd_recovery_op_t> recovery_ops;
+    osd_op_t *autosync_op = NULL;

    // Unstable writes
    std::map<osd_object_id_t, uint64_t> unstable_writes;
@@ -191,53 +108,73 @@ class osd_t
    int inflight_ops = 0;
    blockstore_t *bs;
    uint32_t bs_block_size, bs_disk_alignment;
-    uint64_t parity_block_size = 4*1024*1024; // 4 MB by default
+    uint64_t pg_stripe_size = DEFAULT_PG_STRIPE_SIZE;
    ring_loop_t *ringloop;
-    timerfd_interval *tick_tfd;
+    timerfd_manager_t *tfd = NULL;

    int wait_state = 0;
    int epoll_fd = 0;
+    int listening_port = 0;
    int listen_fd = 0;
    ring_consumer_t consumer;
+    std::map<int, std::function<void(int, int)>> epoll_handlers;

-    std::unordered_map<int,osd_client_t> clients;
-    std::vector<int> read_ready_clients;
-    std::vector<int> write_ready_clients;
-    uint64_t op_stat_sum[OSD_OP_MAX+1] = { 0 };
-    uint64_t op_stat_count[OSD_OP_MAX+1] = { 0 };
-    uint64_t subop_stat_sum[OSD_OP_MAX+1] = { 0 };
-    uint64_t subop_stat_count[OSD_OP_MAX+1] = { 0 };
-    uint64_t send_stat_sum = 0;
-    uint64_t send_stat_count = 0;
+    // op statistics
+    osd_op_stats_t prev_stats;
+    const char* recovery_stat_names[2] = { "degraded", "misplaced" };
+    uint64_t recovery_stat_count[2][2] = { 0 };
+    uint64_t recovery_stat_bytes[2][2] = { 0 };

-    // methods
+    // cluster connection
+    void parse_config(blockstore_config_t & config);
+    void init_cluster();
+    void on_change_osd_state_hook(uint64_t osd_num);
+    void on_change_etcd_state_hook(json11::Json::object & changes);
+    void on_load_config_hook(json11::Json::object & changes);
+    json11::Json on_load_pgs_checks_hook();
+    void on_load_pgs_hook(bool success);
+    void bind_socket();
+    void acquire_lease();
+    json11::Json get_osd_state();
+    void create_osd_state();
+    void renew_lease();
+    void print_stats();
+    void reset_stats();
+    json11::Json get_statistics();
+    void report_statistics();
+    void report_pg_state(pg_t & pg);
+    void report_pg_states();
+    void apply_pg_count();
+    void apply_pg_config();

    // event loop, socket read/write
    void loop();
+    void set_fd_handler(int fd, std::function<void(int, int)> handler);
    void handle_epoll_events();
-    void read_requests();
-    void handle_read(ring_data_t *data, int peer_fd);
-    void handle_op_hdr(osd_client_t *cl);
-    void handle_reply_hdr(osd_client_t *cl);
-    bool try_send(osd_client_t & cl);
-    void send_replies();
-    void handle_send(ring_data_t *data, int peer_fd);
-    void outbox_push(osd_client_t & cl, osd_op_t *op);

    // peer handling (primary OSD logic)
-    void connect_peer(osd_num_t osd_num, const char *peer_host, int peer_port, std::function<void(osd_num_t, int)> callback);
-    void handle_connect_result(int peer_fd);
-    void cancel_osd_ops(osd_client_t & cl);
-    void cancel_op(osd_op_t *op);
-    void stop_client(int peer_fd);
-    osd_peer_def_t parse_peer(std::string peer);
-    void init_primary();
+    void parse_test_peer(std::string peer);
    void handle_peers();
-    void repeer_pgs(osd_num_t osd_num, bool is_connected);
-    void start_pg_peering(int i);
+    void repeer_pgs(osd_num_t osd_num);
+    void start_pg_peering(pg_num_t pg_num);
+    void submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *ps);
+    void submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps);
+    void discard_list_subop(osd_op_t *list_op);
+    bool stop_pg(pg_num_t pg_num);
+    void finish_stop_pg(pg_t & pg);
+
+    // flushing, recovery and backfill
+    void submit_pg_flush_ops(pg_num_t pg_num);
+    void handle_flush_op(bool rollback, pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t peer_osd, int retval);
+    void submit_flush_op(pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data);
+    bool pick_next_recovery(osd_recovery_op_t &op);
+    void submit_recovery_op(osd_recovery_op_t *op);
+    bool continue_recovery();
+    pg_osd_set_state_t* change_osd_set(pg_osd_set_state_t *st, pg_t *pg);

    // op execution
    void exec_op(osd_op_t *cur_op);
+    void finish_op(osd_op_t *cur_op, int retval);

    // secondary ops
    void exec_sync_stab_all(osd_op_t *cur_op);
@@ -246,18 +183,34 @@ class osd_t
    void secondary_op_callback(osd_op_t *cur_op);

    // primary ops
+    void autosync();
    bool prepare_primary_rw(osd_op_t *cur_op);
    void continue_primary_read(osd_op_t *cur_op);
    void continue_primary_write(osd_op_t *cur_op);
+    void cancel_primary_write(osd_op_t *cur_op);
    void continue_primary_sync(osd_op_t *cur_op);
-    void finish_primary_op(osd_op_t *cur_op, int retval);
-    void handle_primary_subop(osd_op_t *cur_op, int ok, uint64_t version);
+    void continue_primary_del(osd_op_t *cur_op);
+    bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
+    void remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg);
+    bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
+    void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);
+    void handle_primary_bs_subop(osd_op_t *subop);
+    void add_bs_subop_stats(osd_op_t *subop);
+    void pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid, int retval);
    void submit_primary_subops(int submit_type, int read_pg_size, const uint64_t* osd_set, osd_op_t *cur_op);
+    void submit_primary_del_subops(osd_op_t *cur_op, uint64_t *cur_set, pg_osd_set_t & loc_set);
    void submit_primary_sync_subops(osd_op_t *cur_op);
    void submit_primary_stab_subops(osd_op_t *cur_op);
+
+    inline pg_num_t map_to_pg(object_id oid) // Map an object to a 1-based PG number from its inode and its stripe index (stripe / pg_stripe_size)
+    {
+        return (oid.inode + oid.stripe / pg_stripe_size) % pg_count + 1; // NOTE(review): pg_count is initialised to 0 in this class; confirm this is never called before a PG count is set (modulo by zero)
+    }
+
 public:
    osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop);
    ~osd_t();
+    void force_stop(int exitcode);
    bool shutdown();
 };

--- a/osd_client.cpp
+++ b/osd_client.cpp
@@ -1,40 +0,0 @@
-void slice()
-{
-    // Slice the request into blockstore requests to individual objects
-    // Primary OSD still operates individual stripes, except they're twice the size of the blockstore's stripe.
-    std::vector read_parts;
-    int block = bs->get_block_size();
-    uint64_t stripe1 = cur_op->req.rw.offset / block / 2;
-    uint64_t stripe2 = (cur_op->req.rw.offset + cur_op->req.rw.len + block*2 - 1) / block / 2 - 1;
-    for (uint64_t s = stripe1; s <= stripe2; s++)
-    {
-        uint64_t start = s == stripe1 ? cur_op->req.rw.offset - stripe1*block*2 : 0;
-        uint64_t end = s == stripe2 ? cur_op->req.rw.offset + cur_op->req.rw.len - stripe2*block*2 : block*2;
-        if (start < block)
-        {
-            read_parts.push_back({
-                .role = 1,
-                .oid = {
-                    .inode = cur_op->req.rw.inode,
-                    .stripe = (s << STRIPE_ROLE_BITS) | 1,
-                },
-                .version = UINT64_MAX,
-                .offset = start,
-                .len = (block < end ? block : end) - start,
-            });
-        }
-        if (end > block)
-        {
-            read_parts.push_back({
-                .role = 2,
-                .oid = {
-                    .inode = cur_op->req.rw.inode,
-                    .stripe = (s << STRIPE_ROLE_BITS) | 2,
-                },
-                .version = UINT64_MAX,
-                .offset = (start > block ? start-block : 0),
-                .len = end - (start > block ? start-block : 0),
-            });
-        }
-    }
-}
--- a/osd_cluster.cpp
+++ b/osd_cluster.cpp
@@ -0,0 +1,746 @@
+#include "osd.h"
+#include "base64.h"
+#include "etcd_state_client.h"
+
+// Startup sequence:
+//   Start etcd watcher -> Load global OSD configuration -> Bind socket -> Acquire lease -> Report&lock OSD state
+//   -> Load PG config -> Report&lock PG states -> Load peers -> Connect to peers -> Peer PGs
+// Event handling:
+//   Wait for PG changes -> Start/Stop PGs when requested
+//   Peer connection is lost -> Reload connection data -> Try to reconnect
+void osd_t::init_cluster()
+{
+    if (!st_cli.etcd_addresses.size()) // no etcd address configured: standalone mode, peers come from the local "peers" option
+    {
+        if (run_primary)
+        {
+            // Test version of clustering code with 1 PG and 2 peers
+            // Example: peers = 2:127.0.0.1:11204,3:127.0.0.1:11205
+            std::string peerstr = config["peers"];
+            while (peerstr.size())
+            {
+                int pos = peerstr.find(','); // std::string::npos is stored into an int here and tested as pos < 0 below
+                parse_test_peer(pos < 0 ? peerstr : peerstr.substr(0, pos));
+                peerstr = pos < 0 ? std::string("") : peerstr.substr(pos+1);
+            }
+            if (st_cli.peer_states.size() < 2)
+            {
+                throw std::runtime_error("run_primary requires at least 2 peers");
+            }
+            pgs[1] = (pg_t){ // the single hard-coded PG of the test setup, targeting OSDs 1, 2 and 3
+                .state = PG_PEERING,
+                .pg_cursize = 0,
+                .pg_num = 1,
+                .target_set = { 1, 2, 3 },
+                .cur_set = { 0, 0, 0 },
+            };
+            report_pg_state(pgs[1]);
+            pg_count = 1;
+        }
+        bind_socket();
+    }
+    else
+    {
+        st_cli.tfd = tfd; // etcd mode: hand the timer manager and log level to the state client, then register callbacks
+        st_cli.log_level = log_level;
+        st_cli.on_change_osd_state_hook = [this](uint64_t peer_osd) { on_change_osd_state_hook(peer_osd); };
+        st_cli.on_change_hook = [this](json11::Json::object & changes) { on_change_etcd_state_hook(changes); };
+        st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
+        st_cli.load_pgs_checks_hook = [this]() { return on_load_pgs_checks_hook(); };
+        st_cli.on_load_pgs_hook = [this](bool success) { on_load_pgs_hook(success); };
+        peering_state = OSD_LOADING_PGS; // this bit is cleared again in on_load_pgs_hook() on success
+        st_cli.load_global_config();
+    }
+    if (run_primary && autosync_interval > 0)
+    {
+        this->tfd->set_timer(autosync_interval*1000, true, [this](int timer_id) // presumably a repeating timer in milliseconds — TODO confirm against timerfd_manager_t
+        {
+            autosync();
+        });
+    }
+}
+
+void osd_t::parse_test_peer(std::string peer)
+{
+    // Parse one "OSD_NUM:IP:PORT" entry, record it in st_cli.peer_states as "up" and start connecting to it; throws std::runtime_error by value on any malformed or duplicate entry
+    int pos1 = peer.find(':');
+    int pos2 = peer.find(':', pos1+1);
+    if (pos1 < 0 || pos2 < 0)
+        throw std::runtime_error("OSD peer string must be in the form OSD_NUM:IP:PORT");
+    std::string addr = peer.substr(pos1+1, pos2-pos1-1);
+    std::string osd_num_str = peer.substr(0, pos1);
+    std::string port_str = peer.substr(pos2+1);
+    osd_num_t peer_osd = strtoull(osd_num_str.c_str(), NULL, 10);
+    if (!peer_osd) // 0 is what strtoull yields for non-numeric text, so it doubles as the parse-failure marker
+        throw std::runtime_error("Could not parse OSD peer osd_num");
+    else if (st_cli.peer_states.find(peer_osd) != st_cli.peer_states.end())
+        throw std::runtime_error("Same osd number "+std::to_string(peer_osd)+" specified twice in peers");
+    int port = strtoull(port_str.c_str(), NULL, 10);
+    if (!port)
+        throw std::runtime_error("Could not parse OSD peer port");
+    st_cli.peer_states[peer_osd] = json11::Json::object {
+        { "state", "up" },
+        { "addresses", json11::Json::array { addr } },
+        { "port", port },
+    };
+    c_cli.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
+}
+
+json11::Json osd_t::get_osd_state() // Build the JSON object describing this OSD: state, addresses, host, port and enabled roles
+{
+    std::vector<char> hostname;
+    hostname.resize(1024);
+    while (gethostname(hostname.data(), hostname.size()) < 0 && errno == ENAMETOOLONG) // grow the buffer by 1024 bytes until the name fits
+        hostname.resize(hostname.size()+1024);
+    hostname.resize(strnlen(hostname.data(), hostname.size())); // trim to the actual name length
+    json11::Json::object st;
+    st["state"] = "up";
+    if (bind_address != "0.0.0.0") // a specific bind address is reported as-is; the wildcard is replaced by getifaddr_list()
+        st["addresses"] = json11::Json::array { bind_address };
+    else
+        st["addresses"] = getifaddr_list();
+    st["host"] = std::string(hostname.data(), hostname.size());
+    st["port"] = listening_port;
+    st["primary_enabled"] = run_primary;
+    st["blockstore_enabled"] = bs ? true : false;
+    return st;
+}
+
+json11::Json osd_t::get_statistics() // Build the JSON statistics object: timestamp, blockstore size/free, host, and cumulative op, subop and recovery counters
+{
+    json11::Json::object st;
+    timespec ts;
+    clock_gettime(CLOCK_REALTIME, &ts);
+    char time_str[50] = { 0 };
+    sprintf(time_str, "%ld.%03ld", ts.tv_sec, ts.tv_nsec/1000000); // wall-clock time as seconds.milliseconds
+    st["time"] = time_str;
+    st["blockstore_ready"] = bs ? bs->is_started() : false; // bs may be NULL (see the check below), so do not dereference it unconditionally
+    if (bs)
+    {
+        st["size"] = bs->get_block_count() * bs->get_block_size();
+        st["free"] = bs->get_free_block_count() * bs->get_block_size();
+    }
+    st["host"] = self_state["host"];
+    json11::Json::object op_stats, subop_stats;
+    for (int i = 0; i <= OSD_OP_MAX; i++) // keyed by opcode name; values are running totals, not deltas
+    {
+        op_stats[osd_op_names[i]] = json11::Json::object {
+            { "count", c_cli.stats.op_stat_count[i] },
+            { "usec", c_cli.stats.op_stat_sum[i] },
+            { "bytes", c_cli.stats.op_stat_bytes[i] },
+        };
+    }
+    for (int i = 0; i <= OSD_OP_MAX; i++)
+    {
+        subop_stats[osd_op_names[i]] = json11::Json::object {
+            { "count", c_cli.stats.subop_stat_count[i] },
+            { "usec", c_cli.stats.subop_stat_sum[i] },
+        };
+    }
+    st["op_stats"] = op_stats;
+    st["subop_stats"] = subop_stats;
+    st["recovery_stats"] = json11::Json::object { // row [0] of the recovery arrays holds the running totals
+        { recovery_stat_names[0], json11::Json::object {
+            { "count", recovery_stat_count[0][0] },
+            { "bytes", recovery_stat_bytes[0][0] },
+        } },
+        { recovery_stat_names[1], json11::Json::object {
+            { "count", recovery_stat_count[0][1] },
+            { "bytes", recovery_stat_bytes[0][1] },
+        } },
+    };
+    return st;
+}
+
+void osd_t::report_statistics() // Write OSD statistics and per-PG statistics to etcd in a single transaction
+{
+    if (etcd_reporting_stats) // a previous report is still in flight: skip this round
+    {
+        return;
+    }
+    etcd_reporting_stats = true;
+    json11::Json::array txn = { json11::Json::object { // first put: <etcd_prefix>/osd/stats/<osd_num>
+        { "request_put", json11::Json::object {
+            { "key", base64_encode(st_cli.etcd_prefix+"/osd/stats/"+std::to_string(osd_num)) },
+            { "value", base64_encode(get_statistics().dump()) },
+        } }
+    } };
+    for (auto & p: pgs)
+    {
+        auto & pg = p.second;
+        if (pg.state & (PG_OFFLINE | PG_STARTING))
+        {
+            // Don't report statistics for offline PGs
+            continue;
+        }
+        json11::Json::object pg_stats;
+        pg_stats["object_count"] = pg.total_count;
+        pg_stats["clean_count"] = pg.clean_count;
+        pg_stats["misplaced_count"] = pg.misplaced_objects.size();
+        pg_stats["degraded_count"] = pg.degraded_objects.size();
+        pg_stats["incomplete_count"] = pg.incomplete_objects.size();
+        pg_stats["write_osd_set"] = pg.cur_set;
+        txn.push_back(json11::Json::object { // one more put per PG: <etcd_prefix>/pg/stats/<pg_num>
+            { "request_put", json11::Json::object {
+                { "key", base64_encode(st_cli.etcd_prefix+"/pg/stats/"+std::to_string(pg.pg_num)) },
+                { "value", base64_encode(json11::Json(pg_stats).dump()) },
+            } }
+        });
+    }
+    st_cli.etcd_txn(json11::Json::object { { "success", txn } }, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json res)
+    {
+        etcd_reporting_stats = false;
+        if (err != "") // the call itself failed: log and schedule a one-shot retry
+        {
+            printf("[OSD %lu] Error reporting state to etcd: %s\n", this->osd_num, err.c_str());
+            // Retry indefinitely
+            tfd->set_timer(ETCD_SLOW_TIMEOUT, false, [this](int timer_id)
+            {
+                report_statistics();
+            });
+        }
+        else if (res["error"].string_value() != "") // etcd answered with an error: treated as fatal
+        {
+            printf("[OSD %lu] Error reporting state to etcd: %s\n", this->osd_num, res["error"].string_value().c_str());
+            force_stop(1);
+        }
+    });
+}
+
+void osd_t::on_change_osd_state_hook(uint64_t peer_osd) // State-client callback for a changed peer OSD state
+{
+    if (c_cli.wanted_peers.find(peer_osd) != c_cli.wanted_peers.end()) // only peers listed in c_cli.wanted_peers are (re)connected
+    {
+        c_cli.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
+    }
+}
+
+void osd_t::on_change_etcd_state_hook(json11::Json::object & changes) // 'changes' is not inspected: every notification re-applies PG count and PG config
+{
+    // FIXME apply config changes in runtime (maybe, some)
+    apply_pg_count();
+    apply_pg_config();
+}
+
+void osd_t::on_load_config_hook(json11::Json::object & global_config) // Merge the global config under the local one, re-parse it, then continue startup
+{
+    blockstore_config_t osd_config = this->config; // start from a copy of the local config
+    for (auto & cfg_var: global_config)
+    {
+        if (this->config.find(cfg_var.first) == this->config.end()) // local keys win: a global value is taken only when the key is absent locally
+        {
+            if (cfg_var.second.is_string())
+            {
+                osd_config[cfg_var.first] = cfg_var.second.string_value();
+            }
+            else
+            {
+                osd_config[cfg_var.first] = cfg_var.second.dump(); // non-string values are stored as their JSON text
+            }
+        }
+    }
+    parse_config(osd_config);
+    bind_socket();
+    st_cli.start_etcd_watcher();
+    acquire_lease();
+}
+
+// Acquire an etcd lease, retrying until granted; on success start the periodic renewal and create the OSD state key
+void osd_t::acquire_lease()
+{
+    // Maximum lease TTL is (report interval) + retries * (timeout + repeat interval)
+    st_cli.etcd_call("/lease/grant", json11::Json::object {
+        { "TTL", etcd_report_interval+(MAX_ETCD_ATTEMPTS*(2*ETCD_QUICK_TIMEOUT)+999)/1000 }
+    }, ETCD_QUICK_TIMEOUT, [this](std::string err, json11::Json data)
+    {
+        if (err != "" || data["ID"].string_value() == "")
+        {
+            printf("Error acquiring a lease from etcd: %s\n", err.c_str());
+            tfd->set_timer(ETCD_QUICK_TIMEOUT, false, [this](int timer_id)
+            {
+                acquire_lease();
+            });
+            return;
+        }
+        etcd_lease_id = data["ID"].string_value();
+        printf("[OSD %lu] reporting to etcd at %s every %d seconds\n", this->osd_num, config["etcd_address"].c_str(), etcd_report_interval);
+        tfd->set_timer(etcd_report_interval*1000, true, [this](int timer_id) // armed only once the lease exists, so retries above cannot stack duplicate renewal timers
+        {
+            renew_lease();
+        });
+        create_osd_state();
+    });
+}
+
+// Report "up" state once, then keep it alive using the lease
+// Do it first to allow "monitors" to check it when moving PGs
+void osd_t::create_osd_state()
+{
+    std::string state_key = base64_encode(st_cli.etcd_prefix+"/osd/state/"+std::to_string(osd_num));
+    self_state = get_osd_state();
+    st_cli.etcd_txn(json11::Json::object {
+        // Check that the state key does not exist
+        { "compare", json11::Json::array {
+            json11::Json::object {
+                { "target", "CREATE" },
+                { "create_revision", 0 },
+                { "key", state_key },
+            }
+        } },
+        { "success", json11::Json::array { // key absent: create it, attached to our lease
+            json11::Json::object {
+                { "request_put", json11::Json::object {
+                    { "key", state_key },
+                    { "value", base64_encode(self_state.dump()) },
+                    { "lease", etcd_lease_id },
+                } }
+            },
+        } },
+        { "failure", json11::Json::array { // key present: read it back so the existing owner can be printed
+            json11::Json::object {
+                { "request_range", json11::Json::object {
+                    { "key", state_key },
+                } }
+            },
+        } },
+    }, ETCD_QUICK_TIMEOUT, [this](std::string err, json11::Json data)
+    {
+        if (err != "")
+        {
+            etcd_failed_attempts++; // counter shared with renew_lease(); it is reset there on success, not in this function
+            printf("Error creating OSD state key: %s\n", err.c_str());
+            if (etcd_failed_attempts > MAX_ETCD_ATTEMPTS)
+            {
+                // Die
+                throw std::runtime_error("Cluster connection failed");
+            }
+            // Retry
+            tfd->set_timer(ETCD_QUICK_TIMEOUT, false, [this](int timer_id)
+            {
+                create_osd_state();
+            });
+            return;
+        }
+        if (!data["succeeded"].bool_value())
+        {
+            // OSD is already up
+            auto kv = st_cli.parse_etcd_kv(data["responses"][0]["response_range"]["kvs"][0]);
+            printf("Key %s already exists in etcd, OSD %lu is still up\n", kv.key.c_str(), this->osd_num);
+            int64_t port = kv.value["port"].int64_value();
+            for (auto & addr: kv.value["addresses"].array_items())
+            {
+                printf("  listening at: %s:%ld\n", addr.string_value().c_str(), port);
+            }
+            force_stop(0); // exits with code 0
+            return;
+        }
+        if (run_primary) // PG configuration is loaded only when this OSD may act as a primary
+        {
+            st_cli.load_pgs();
+        }
+    });
+}
+
+// Send a lease keepalive to etcd; on success report statistics, on error retry up to MAX_ETCD_ATTEMPTS and then throw
+void osd_t::renew_lease()
+{
+    st_cli.etcd_call("/lease/keepalive", json11::Json::object {
+        { "ID", etcd_lease_id }
+    }, ETCD_QUICK_TIMEOUT, [this](std::string err, json11::Json data)
+    {
+        if (err == "" && data["result"]["TTL"].string_value() == "") // the call succeeded but no TTL came back: treated as an expired lease
+        {
+            // Die
+            throw std::runtime_error("etcd lease has expired");
+        }
+        if (err != "")
+        {
+            etcd_failed_attempts++;
+            printf("Error renewing etcd lease: %s\n", err.c_str());
+            if (etcd_failed_attempts > MAX_ETCD_ATTEMPTS)
+            {
+                // Die
+                throw std::runtime_error("Cluster connection failed");
+            }
+            // Retry
+            tfd->set_timer(ETCD_QUICK_TIMEOUT, false, [this](int timer_id)
+            {
+                renew_lease();
+            });
+        }
+        else
+        {
+            etcd_failed_attempts = 0; // any successful keepalive clears the failure counter
+            report_statistics();
+        }
+    });
+}
+
+void osd_t::force_stop(int exitcode) // Terminate the process with 'exitcode', first asking etcd to revoke our lease if we hold one
+{
+    if (etcd_lease_id != "")
+    {
+        st_cli.etcd_call("/kv/lease/revoke", json11::Json::object {
+            { "ID", etcd_lease_id }
+        }, ETCD_QUICK_TIMEOUT, [this, exitcode](std::string err, json11::Json data)
+        {
+            if (err != "") // a failed revoke is only logged; the process exits either way
+            {
+                printf("Error revoking etcd lease: %s\n", err.c_str());
+            }
+            printf("[OSD %lu] Force stopping\n", this->osd_num);
+            exit(exitcode);
+        });
+    }
+    else
+    {
+        printf("[OSD %lu] Force stopping\n", this->osd_num); // no lease held: exit immediately
+        exit(exitcode);
+    }
+}
+
+json11::Json osd_t::on_load_pgs_checks_hook() // Return the compare clauses the state client attaches when loading PGs
+{
+    assert(this->pgs.size() == 0); // must be called before any PG is held locally
+    json11::Json::array checks = {
+        json11::Json::object { // a single LEASE comparison of our OSD state key against etcd_lease_id
+            { "target", "LEASE" },
+            { "lease", etcd_lease_id },
+            { "key", base64_encode(st_cli.etcd_prefix+"/osd/state/"+std::to_string(osd_num)) },
+        }
+    };
+    return checks;
+}
+
+void osd_t::on_load_pgs_hook(bool success) // State-client callback invoked with the outcome of the PG load
+{
+    if (!success)
+    {
+        printf("Error loading PGs from etcd: lease expired\n");
+        force_stop(1);
+    }
+    else
+    {
+        peering_state &= ~OSD_LOADING_PGS; // clear the "loading PGs" bit set in init_cluster()
+        apply_pg_count();
+        apply_pg_config();
+    }
+}
+
+void osd_t::apply_pg_count() // Validate the PG numbering in st_cli.pg_config and adopt its size as this->pg_count
+{
+    pg_num_t pg_count = st_cli.pg_config.size(); // local variable shadows the member; the member is written on the last line
+    if (pg_count > 0 && (st_cli.pg_config.begin()->first != 1 || std::prev(st_cli.pg_config.end())->first != pg_count)) // first key must be 1 and last key must equal the count
+    {
+        printf("Invalid PG configuration: PG numbers don't cover the whole 1..%d range\n", pg_count); // NOTE(review): %d assumes pg_num_t is int-sized — confirm
+        force_stop(1);
+        return;
+    }
+    if (this->pg_count != 0 && this->pg_count != pg_count)
+    {
+        // Check that all PGs are offline. It is not allowed to change PG count when any PGs are online
+        // The external tool must wait for all PGs to come down before changing PG count
+        // If it doesn't wait, a restarted OSD may apply the new count immediately which will lead to bugs
+        // So an OSD just dies if it detects PG count change while there are active PGs
+        int still_active = 0;
+        for (auto & kv: pgs)
+        {
+            if (kv.second.state & PG_ACTIVE)
+            {
+                still_active++;
+            }
+        }
+        if (still_active > 0)
+        {
+            printf("[OSD %lu] PG count change detected, but %d PG(s) are still active. This is not allowed. Exiting\n", this->osd_num, still_active);
+            force_stop(1);
+            return;
+        }
+    }
+    this->pg_count = pg_count;
+}
+
+// Reconcile the local PG table (this->pgs) with the configuration in st_cli.pg_config.
+//
+// For every configured PG this either stops it (it is currently taken but should
+// no longer be), leaves it alone, or (re)creates the local pg_t and starts peering.
+// Changes that cannot be applied yet (PG still stopping, PG lock not yet confirmed
+// via cur_primary) clear all_applied; the result is stored in pg_config_applied.
+// Dirty PG states are pushed with report_pg_states() at the end.
+// Throws std::runtime_error if a taken PG is in a state not handled below.
+void osd_t::apply_pg_config()
+{
+    bool all_applied = true;
+    for (auto & kv: st_cli.pg_config)
+    {
+        pg_num_t pg_num = kv.first;
+        auto & pg_cfg = kv.second;
+        // We should run this PG if it exists, we are its configured primary, it is not paused
+        // and nobody else is recorded as its current primary
+        bool take = pg_cfg.exists && pg_cfg.primary == this->osd_num &&
+            !pg_cfg.pause && (!pg_cfg.cur_primary || pg_cfg.cur_primary == this->osd_num);
+        // "Taken" means we have a local entry for the PG and it is not offline
+        bool currently_taken = this->pgs.find(pg_num) != this->pgs.end() &&
+            this->pgs[pg_num].state != PG_OFFLINE;
+        if (currently_taken && !take)
+        {
+            // Stop this PG
+            stop_pg(pg_num);
+        }
+        else if (take)
+        {
+            // Take this PG
+            // Collect the union of non-zero OSD numbers from the target set,
+            // the configured peer list and every historical target set
+            std::set<osd_num_t> all_peers;
+            for (osd_num_t pg_osd: pg_cfg.target_set)
+            {
+                if (pg_osd != 0)
+                {
+                    all_peers.insert(pg_osd);
+                }
+            }
+            for (osd_num_t pg_osd: pg_cfg.all_peers)
+            {
+                if (pg_osd != 0)
+                {
+                    all_peers.insert(pg_osd);
+                }
+            }
+            for (auto & hist_item: pg_cfg.target_history)
+            {
+                for (auto pg_osd: hist_item)
+                {
+                    if (pg_osd != 0)
+                    {
+                        all_peers.insert(pg_osd);
+                    }
+                }
+            }
+            if (currently_taken)
+            {
+                if (this->pgs[pg_num].state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING))
+                {
+                    if (this->pgs[pg_num].target_set == pg_cfg.target_set)
+                    {
+                        // No change in osd_set; history changes are ignored
+                        continue;
+                    }
+                    else
+                    {
+                        // Stop PG, reapply change after stopping
+                        stop_pg(pg_num);
+                        all_applied = false;
+                        continue;
+                    }
+                }
+                else if (this->pgs[pg_num].state & PG_STOPPING)
+                {
+                    // Reapply change after stopping
+                    all_applied = false;
+                    continue;
+                }
+                else if (this->pgs[pg_num].state & PG_STARTING)
+                {
+                    if (pg_cfg.cur_primary == this->osd_num)
+                    {
+                        // PG locked, continue
+                        // (falls through to the pg_t re-initialisation below)
+                    }
+                    else
+                    {
+                        // Reapply change after locking the PG
+                        all_applied = false;
+                        continue;
+                    }
+                }
+                else
+                {
+                    throw std::runtime_error("Unexpected PG "+std::to_string(pg_num)+" state: "+std::to_string(this->pgs[pg_num].state));
+                }
+            }
+            // (Re)create the local PG entry: PEERING if etcd already records us as
+            // the current primary, STARTING otherwise
+            this->pgs[pg_num] = (pg_t){
+                .state = pg_cfg.cur_primary == this->osd_num ? PG_PEERING : PG_STARTING,
+                .pg_cursize = 0,
+                .pg_num = pg_num,
+                .target_history = pg_cfg.target_history,
+                .all_peers = std::vector<osd_num_t>(all_peers.begin(), all_peers.end()),
+                .target_set = pg_cfg.target_set,
+            };
+            // Schedule the new state for reporting to etcd
+            this->pg_state_dirty.insert(pg_num);
+            this->pgs[pg_num].print_state();
+            if (pg_cfg.cur_primary == this->osd_num)
+            {
+                // Add peers
+                // Only peers other than ourselves that have no entry in osd_peer_fds yet
+                for (auto pg_osd: all_peers)
+                {
+                    if (pg_osd != this->osd_num && c_cli.osd_peer_fds.find(pg_osd) == c_cli.osd_peer_fds.end())
+                    {
+                        c_cli.connect_peer(pg_osd, st_cli.peer_states[pg_osd]);
+                    }
+                }
+                start_pg_peering(pg_num);
+            }
+            else
+            {
+                // Reapply change after locking the PG
+                all_applied = false;
+            }
+        }
+    }
+    report_pg_states();
+    this->pg_config_applied = all_applied;
+}
+
+// Push the states of all PGs listed in pg_state_dirty to etcd in one transaction.
+//
+// Does nothing if a report is already in flight, nothing is dirty, or no etcd
+// addresses are configured. For every dirty PG still present in this->pgs the
+// transaction gets:
+// - a "compare" entry (key must not exist for PG_STARTING, otherwise the key's
+//   lease must equal etcd_lease_id);
+// - a "success" operation (delete the state key for PG_OFFLINE, otherwise put
+//   the state under our lease, plus an optional /pg/history update);
+// - a "failure" operation that reads the state key back.
+// The dirty set is cleared before sending; the callback restores it on failure.
+void osd_t::report_pg_states()
+{
+    if (etcd_reporting_pg_state || !this->pg_state_dirty.size() || !st_cli.etcd_addresses.size())
+    {
+        return;
+    }
+    etcd_reporting_pg_state = true;
+    // (pg_num, history_changed) pairs captured at submit time, so the callback
+    // can restore dirty flags if the transaction fails
+    std::vector<std::pair<pg_num_t,bool>> reporting_pgs;
+    json11::Json::array checks;
+    json11::Json::array success;
+    json11::Json::array failure;
+    for (auto it = pg_state_dirty.begin(); it != pg_state_dirty.end(); it++)
+    {
+        auto pg_it = this->pgs.find(*it);
+        if (pg_it == this->pgs.end())
+        {
+            // Dirty entry for a PG we no longer track: skip it
+            continue;
+        }
+        auto & pg = pg_it->second;
+        reporting_pgs.push_back({ pg.pg_num, pg.history_changed });
+        std::string state_key_base64 = base64_encode(st_cli.etcd_prefix+"/pg/state/"+std::to_string(pg.pg_num));
+        if (pg.state == PG_STARTING)
+        {
+            // Check that the PG key does not exist
+            // Failed check indicates an unsuccessful PG lock attempt in this case
+            checks.push_back(json11::Json::object {
+                { "target", "VERSION" },
+                { "version", 0 },
+                { "key", state_key_base64 },
+            });
+        }
+        else
+        {
+            // Check that the key is ours
+            // Failed check indicates success for OFFLINE pgs (PG lock is already deleted)
+            // and an unexpected race condition for started pgs (PG lock is held by someone else)
+            checks.push_back(json11::Json::object {
+                { "target", "LEASE" },
+                { "lease", etcd_lease_id },
+                { "key", state_key_base64 },
+            });
+        }
+        if (pg.state == PG_OFFLINE)
+        {
+            // Offline PG: remove its state key
+            success.push_back(json11::Json::object {
+                { "request_delete_range", json11::Json::object {
+                    { "key", state_key_base64 },
+                } }
+            });
+        }
+        else
+        {
+            // Translate the state bitmask into a list of state names
+            json11::Json::array pg_state_keywords;
+            for (int i = 0; i < pg_state_bit_count; i++)
+            {
+                if (pg.state & pg_state_bits[i])
+                {
+                    pg_state_keywords.push_back(pg_state_names[i]);
+                }
+            }
+            // Write the state under our lease (same key as state_key_base64)
+            success.push_back(json11::Json::object {
+                { "request_put", json11::Json::object {
+                    { "key", base64_encode(st_cli.etcd_prefix+"/pg/state/"+std::to_string(pg.pg_num)) },
+                    { "value", base64_encode(json11::Json(json11::Json::object {
+                        { "primary", this->osd_num },
+                        { "state", pg_state_keywords },
+                        { "peers", pg.cur_peers },
+                    }).dump()) },
+                    { "lease", etcd_lease_id },
+                } }
+            });
+            if (pg.history_changed)
+            {
+                // Flag is cleared optimistically; the callback sets it again on failure
+                pg.history_changed = false;
+                if (pg.state == PG_ACTIVE)
+                {
+                    // State is exactly PG_ACTIVE (no other bits): delete the history key
+                    success.push_back(json11::Json::object {
+                        { "request_delete_range", json11::Json::object {
+                            { "key", base64_encode(st_cli.etcd_prefix+"/pg/history/"+std::to_string(pg.pg_num)) },
+                        } }
+                    });
+                }
+                else if (pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD))
+                {
+                    // Active with PG_LEFT_ON_DEAD set: store the full peer list in the history key
+                    // (written without a lease, unlike the state key)
+                    success.push_back(json11::Json::object {
+                        { "request_put", json11::Json::object {
+                            { "key", base64_encode(st_cli.etcd_prefix+"/pg/history/"+std::to_string(pg.pg_num)) },
+                            { "value", base64_encode(json11::Json(json11::Json::object {
+                                { "all_peers", pg.all_peers },
+                            }).dump()) },
+                        } }
+                    });
+                }
+            }
+        }
+        // On failure, read the state key back so the callback can see which PG keys exist
+        failure.push_back(json11::Json::object {
+            { "request_range", json11::Json::object {
+                { "key", state_key_base64 },
+            } }
+        });
+    }
+    pg_state_dirty.clear();
+    st_cli.etcd_txn(json11::Json::object {
+        { "compare", checks }, { "success", success }, { "failure", failure }
+    }, ETCD_QUICK_TIMEOUT, [this, reporting_pgs](std::string err, json11::Json data)
+    {
+        etcd_reporting_pg_state = false;
+        // NOTE(review): err is not inspected here; presumably a transport error yields
+        // data without "succeeded" and therefore takes the failure path - confirm
+        if (!data["succeeded"].bool_value())
+        {
+            // One of PG state updates failed, put dirty flags back
+            for (auto pp: reporting_pgs)
+            {
+                this->pg_state_dirty.insert(pp.first);
+                if (pp.second)
+                {
+                    auto pg_it = this->pgs.find(pp.first);
+                    if (pg_it != this->pgs.end())
+                    {
+                        pg_it->second.history_changed = true;
+                    }
+                }
+            }
+            for (auto & res: data["responses"].array_items())
+            {
+                if (res["kvs"].array_items().size())
+                {
+                    auto kv = st_cli.parse_etcd_kv(res["kvs"][0]);
+                    // Skip the prefix plus "/pg/state/" (10 characters) to get the PG number
+                    pg_num_t pg_num = stoull_full(kv.key.substr(st_cli.etcd_prefix.length()+10));
+                    auto pg_it = pgs.find(pg_num);
+                    if (pg_it != pgs.end() && pg_it->second.state != PG_OFFLINE && pg_it->second.state != PG_STARTING)
+                    {
+                        // Live PG state update failed
+                        printf("Failed to report state of PG %u which is live. Race condition detected, exiting\n", pg_num);
+                        force_stop(1);
+                        return;
+                    }
+                }
+            }
+            // Retry after a short pause (hope we'll get some updates and update PG states accordingly)
+            tfd->set_timer(500, false, [this](int) { report_pg_states(); });
+        }
+        else
+        {
+            // Success. We'll get our changes back via the watcher and react to them
+            for (auto pp: reporting_pgs)
+            {
+                auto pg_it = this->pgs.find(pp.first);
+                if (pg_it != this->pgs.end())
+                {
+                    if (pg_it->second.state == PG_OFFLINE)
+                    {
+                        // Remove offline PGs after reporting their state
+                        this->pgs.erase(pg_it);
+                    }
+                }
+            }
+            // Push other PG state updates, if any
+            report_pg_states();
+            if (!this->pg_state_dirty.size())
+            {
+                // Update statistics
+                report_statistics();
+            }
+        }
+    });
+}
--- a/osd_flush.cpp
+++ b/osd_flush.cpp
@@ -0,0 +1,300 @@
+#include "osd.h"
+
+#define FLUSH_BATCH 512
+
+// Build and submit the next batch of rollback/stabilize operations for PG <pg_num>.
+//
+// Walks pg.flush_actions in order, marks each visited action as submitted and
+// appends it to the per-OSD rollback and/or stable list of a new flush batch
+// (stored in pg.flush_batch). One flush operation is then submitted per
+// non-empty per-OSD list via submit_flush_op().
+//
+// A batch is cut only between two different objects (different inode or
+// different stripe with STRIPE_MASK bits cleared) and only once one of the
+// current OSD's lists has reached FLUSH_BATCH entries.
+void osd_t::submit_pg_flush_ops(pg_num_t pg_num)
+{
+    pg_t & pg = pgs[pg_num];
+    pg_flush_batch_t *fb = new pg_flush_batch_t();
+    pg.flush_batch = fb;
+    auto it = pg.flush_actions.begin(), prev_it = pg.flush_actions.begin();
+    bool first = true;
+    while (it != pg.flush_actions.end())
+    {
+        // The size checks must be grouped explicitly: && binds tighter than ||,
+        // so without the parentheses a full stable list would cut the batch
+        // in the middle of an object.
+        if (!first && (it->first.oid.inode != prev_it->first.oid.inode ||
+            (it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK)) &&
+            (fb->rollback_lists[it->first.osd_num].size() >= FLUSH_BATCH ||
+            fb->stable_lists[it->first.osd_num].size() >= FLUSH_BATCH))
+        {
+            // Stop only at the object boundary
+            break;
+        }
+        it->second.submitted = true;
+        if (it->second.rollback)
+        {
+            fb->flush_objects++;
+            fb->rollback_lists[it->first.osd_num].push_back((obj_ver_id){
+                .oid = it->first.oid,
+                .version = it->second.rollback_to,
+            });
+        }
+        if (it->second.make_stable)
+        {
+            fb->flush_objects++;
+            fb->stable_lists[it->first.osd_num].push_back((obj_ver_id){
+                .oid = it->first.oid,
+                .version = it->second.stable_to,
+            });
+        }
+        prev_it = it;
+        first = false;
+        it++;
+    }
+    // One rollback operation per OSD that has anything to roll back
+    for (auto & l: fb->rollback_lists)
+    {
+        if (l.second.size() > 0)
+        {
+            fb->flush_ops++;
+            submit_flush_op(pg.pg_num, fb, true, l.first, l.second.size(), l.second.data());
+        }
+    }
+    // One stabilize operation per OSD that has anything to stabilize
+    for (auto & l: fb->stable_lists)
+    {
+        if (l.second.size() > 0)
+        {
+            fb->flush_ops++;
+            submit_flush_op(pg.pg_num, fb, false, l.first, l.second.size(), l.second.data());
+        }
+    }
+}
+
+// Completion handler for one rollback/stabilize operation of a flush batch.
+//
+// rollback - true for a rollback operation, false for a stabilize operation
+// pg_num   - PG the batch belongs to
+// fb       - the batch; results for a batch that is no longer pg.flush_batch are dropped
+// peer_osd - OSD that executed the operation (may be this OSD)
+// retval   - 0 on success, otherwise a negated errno value
+//
+// A local failure throws std::runtime_error. A remote failure is logged, the
+// peer connection (if any) is stopped and the batch is left incomplete.
+// When the last operation of the batch completes, submitted flush actions are
+// removed, version overrides of flushed objects are erased, writes queued on
+// those objects are continued, and PG_HAS_UNCLEAN is cleared if nothing is left.
+void osd_t::handle_flush_op(bool rollback, pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t peer_osd, int retval)
+{
+    if (pgs.find(pg_num) == pgs.end() || pgs[pg_num].flush_batch != fb)
+    {
+        // Throw the result away
+        return;
+    }
+    if (retval != 0)
+    {
+        if (peer_osd == this->osd_num)
+        {
+            throw std::runtime_error(
+                std::string(rollback
+                    ? "Error while doing local rollback operation: "
+                    : "Error while doing local stabilize operation: "
+                ) + strerror(-retval)
+            );
+        }
+        else
+        {
+            // Report the peer that failed, not our own OSD number
+            printf("Error while doing flush on OSD %lu: %d (%s)\n", peer_osd, retval, strerror(-retval));
+            auto fd_it = c_cli.osd_peer_fds.find(peer_osd);
+            if (fd_it != c_cli.osd_peer_fds.end())
+            {
+                c_cli.stop_client(fd_it->second);
+            }
+            return;
+        }
+    }
+    fb->flush_done++;
+    if (fb->flush_done == fb->flush_ops)
+    {
+        // This flush batch is done
+        std::vector<osd_op_t*> continue_ops;
+        auto & pg = pgs[pg_num];
+        auto it = pg.flush_actions.begin(), prev_it = it;
+        auto erase_start = it;
+        while (1)
+        {
+            // Object boundary (or end of list): the object at prev_it is finished,
+            // so drop its version override and pick up a write queued on it
+            if (it == pg.flush_actions.end() ||
+                it->first.oid.inode != prev_it->first.oid.inode ||
+                (it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK))
+            {
+                pg.ver_override.erase((object_id){
+                    .inode = prev_it->first.oid.inode,
+                    .stripe = (prev_it->first.oid.stripe & ~STRIPE_MASK),
+                });
+                auto wr_it = pg.write_queue.find((object_id){
+                    .inode = prev_it->first.oid.inode,
+                    .stripe = (prev_it->first.oid.stripe & ~STRIPE_MASK),
+                });
+                if (wr_it != pg.write_queue.end())
+                {
+                    continue_ops.push_back(wr_it->second);
+                    pg.write_queue.erase(wr_it);
+                }
+            }
+            // Erase the run of submitted actions [erase_start, it) once it ends
+            if ((it == pg.flush_actions.end() || !it->second.submitted) &&
+                erase_start != it)
+            {
+                pg.flush_actions.erase(erase_start, it);
+            }
+            if (it == pg.flush_actions.end())
+            {
+                break;
+            }
+            prev_it = it;
+            if (!it->second.submitted)
+            {
+                // Not part of this batch: keep it and restart the run after it
+                it++;
+                erase_start = it;
+            }
+            else
+            {
+                it++;
+            }
+        }
+        delete fb;
+        pg.flush_batch = NULL;
+        if (!pg.flush_actions.size())
+        {
+            pg.state = pg.state & ~PG_HAS_UNCLEAN;
+            report_pg_state(pg);
+        }
+        for (osd_op_t *op: continue_ops)
+        {
+            continue_primary_write(op);
+        }
+        if (pg.inflight == 0 && (pg.state & PG_STOPPING))
+        {
+            finish_stop_pg(pg);
+        }
+    }
+}
+
+// Submit one rollback (rollback == true) or stabilize (rollback == false)
+// operation covering <count> object versions from <data> to <peer_osd>.
+// The version list is copied into a buffer owned by the new operation.
+// If peer_osd is this OSD the operation goes to the local blockstore,
+// otherwise it is queued for sending to the peer. Either way the result is
+// delivered to handle_flush_op().
+void osd_t::submit_flush_op(pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data)
+{
+    osd_op_t *op = new osd_op_t();
+    // Copy buffer so it gets freed along with the operation
+    size_t buf_len = sizeof(obj_ver_id) * count;
+    op->buf = malloc(buf_len);
+    memcpy(op->buf, data, buf_len);
+    if (peer_osd != this->osd_num)
+    {
+        // Remote peer: send a secondary rollback/stabilize request
+        op->op_type = OSD_OP_OUT;
+        op->peer_fd = c_cli.osd_peer_fds[peer_osd];
+        op->send_list.push_back(op->req.buf, OSD_PACKET_SIZE);
+        op->send_list.push_back(op->buf, buf_len);
+        op->req = {
+            .sec_stab = {
+                .header = {
+                    .magic = SECONDARY_OSD_OP_MAGIC,
+                    .id = c_cli.next_subop_id++,
+                    .opcode = (uint64_t)(rollback ? OSD_OP_SECONDARY_ROLLBACK : OSD_OP_SECONDARY_STABILIZE),
+                },
+                .len = count * sizeof(obj_ver_id),
+            },
+        };
+        op->callback = [this, pg_num, fb, peer_osd](osd_op_t *op)
+        {
+            bool was_rollback = op->req.hdr.opcode == OSD_OP_SECONDARY_ROLLBACK;
+            handle_flush_op(was_rollback, pg_num, fb, peer_osd, op->reply.hdr.retval);
+            delete op;
+        };
+        c_cli.outbox_push(op);
+        return;
+    }
+    // Local blockstore
+    clock_gettime(CLOCK_REALTIME, &op->tv_begin);
+    op->bs_op = new blockstore_op_t({
+        .opcode = (uint64_t)(rollback ? BS_OP_ROLLBACK : BS_OP_STABLE),
+        .callback = [this, op, pg_num, fb](blockstore_op_t *bs_op)
+        {
+            add_bs_subop_stats(op);
+            bool was_rollback = bs_op->opcode == BS_OP_ROLLBACK;
+            handle_flush_op(was_rollback, pg_num, fb, this->osd_num, bs_op->retval);
+            delete op->bs_op;
+            op->bs_op = NULL;
+            delete op;
+        },
+        .len = (uint32_t)count,
+        .buf = op->buf,
+    });
+    bs->enqueue_op(op->bs_op);
+}
+
+// Choose the next object to recover and describe it in <op>.
+// Degraded objects of all active PGs are considered first, then misplaced ones.
+// Objects that already have an entry in recovery_ops are skipped.
+// Returns true and fills op.degraded, op.pg_num and op.oid when an object is
+// found; returns false (leaving <op> untouched) when there is nothing to do.
+bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
+{
+    // Pass 1: degraded objects
+    for (auto & pg_kv: pgs)
+    {
+        auto & pg = pg_kv.second;
+        if ((pg.state & (PG_ACTIVE | PG_HAS_DEGRADED)) != (PG_ACTIVE | PG_HAS_DEGRADED))
+        {
+            continue;
+        }
+        for (auto & obj_kv: pg.degraded_objects)
+        {
+            if (recovery_ops.find(obj_kv.first) != recovery_ops.end())
+            {
+                // Already being recovered
+                continue;
+            }
+            op.degraded = true;
+            op.pg_num = pg_kv.first;
+            op.oid = obj_kv.first;
+            return true;
+        }
+    }
+    // Pass 2: misplaced objects
+    for (auto & pg_kv: pgs)
+    {
+        auto & pg = pg_kv.second;
+        if ((pg.state & (PG_ACTIVE | PG_HAS_MISPLACED)) != (PG_ACTIVE | PG_HAS_MISPLACED))
+        {
+            continue;
+        }
+        for (auto & obj_kv: pg.misplaced_objects)
+        {
+            if (recovery_ops.find(obj_kv.first) != recovery_ops.end())
+            {
+                // Already being recovered
+                continue;
+            }
+            op.degraded = false;
+            op.pg_num = pg_kv.first;
+            op.oid = obj_kv.first;
+            return true;
+        }
+    }
+    return false;
+}
+
+// Start recovery of the object described by <op> by executing a zero-length
+// OSD_OP_WRITE on it. When the write completes, the entry is removed from
+// recovery_ops and continue_recovery() is called.
+// The callback throws std::runtime_error on any error other than -EPIPE.
+void osd_t::submit_recovery_op(osd_recovery_op_t *op)
+{
+    osd_op_t *write_op = new osd_op_t();
+    op->osd_op = write_op;
+    write_op->op_type = OSD_OP_OUT;
+    write_op->req = {
+        .rw = {
+            .header = {
+                .magic = SECONDARY_OSD_OP_MAGIC,
+                .id = 1,
+                .opcode = OSD_OP_WRITE,
+            },
+            .inode = op->oid.inode,
+            .offset = op->oid.stripe,
+            .len = 0,
+        },
+    };
+    write_op->callback = [this, op](osd_op_t *osd_op)
+    {
+        // Don't sync the write, it will be synced by our regular sync coroutine
+        auto retval = osd_op->reply.hdr.retval;
+        // -EPIPE means the PG is stopped or one of the OSDs is gone, which is harmless;
+        // every other negative result is a real recovery failure
+        if (retval < 0 && retval != -EPIPE)
+        {
+            throw std::runtime_error("Failed to recover an object");
+        }
+        // CAREFUL! op = &recovery_ops[op->oid]. Don't access op->* after recovery_ops.erase()
+        op->osd_op = NULL;
+        recovery_ops.erase(op->oid);
+        delete osd_op;
+        continue_recovery();
+    };
+    exec_op(write_op);
+}
+
+// Keep the recovery queue filled: while fewer than recovery_queue_depth
+// operations are registered, pick the next object and submit a write for it
+// (objects get recovered as a side effect of being written).
+// Returns false when no more objects need recovery, true when the queue is full.
+bool osd_t::continue_recovery()
+{
+    while (recovery_ops.size() < recovery_queue_depth)
+    {
+        osd_recovery_op_t next;
+        if (!pick_next_recovery(next))
+        {
+            return false;
+        }
+        // Register the operation first; submit_recovery_op() works on the stored copy
+        auto & slot = recovery_ops[next.oid];
+        slot = next;
+        submit_recovery_op(&slot);
+    }
+    return true;
+}
--- a/osd_main.cpp
+++ b/osd_main.cpp
@@ -2,8 +2,17 @@

 #include <signal.h>

-void handle_sigint(int sig)
+static osd_t *osd = NULL;
+static bool force_stopping = false;
+
+static void handle_sigint(int sig)
 {
+    if (osd && !force_stopping)
+    {
+        force_stopping = true;
+        osd->force_stop(0);
+        return;
+    }
    exit(0);
 }

@@ -25,9 +34,11 @@ int main(int narg, char *args[])
        }
    }
    signal(SIGINT, handle_sigint);
+    signal(SIGTERM, handle_sigint);
    ring_loop_t *ringloop = new ring_loop_t(512);
+    // FIXME: Create Blockstore from on-disk superblock config and check it against the OSD cluster config
    blockstore_t *bs = new blockstore_t(config, ringloop);
-    osd_t *osd = new osd_t(config, bs, ringloop);
+    osd = new osd_t(config, bs, ringloop);
    while (1)
    {
        ringloop->loop();
--- a/osd_ops.h
+++ b/osd_ops.h
@@ -22,9 +22,12 @@
 #define OSD_OP_READ                 10
 #define OSD_OP_WRITE                11
 #define OSD_OP_SYNC                 12
-#define OSD_OP_MAX                  12
+#define OSD_OP_DELETE               13
+#define OSD_OP_MAX                  13
 // Alignment & limit for read/write operations
-#define OSD_RW_ALIGN                512
+#ifndef MEM_ALIGNMENT
+#define MEM_ALIGNMENT               512
+#endif
 #define OSD_RW_MAX                  64*1024*1024

 // common request and reply headers
@@ -57,6 +60,7 @@ struct __attribute__((__packed__)) osd_op_secondary_rw_t
    // object
    object_id oid;
    // read/write version (automatic or specific)
+    // FIXME deny values close to UINT64_MAX
    uint64_t version;
    // offset
    uint32_t offset;
@@ -130,7 +134,7 @@ struct __attribute__((__packed__)) osd_op_secondary_list_t
    osd_op_header_t header;
    // placement group total number and total count
    pg_num_t list_pg, pg_count;
-    uint64_t parity_block_size;
+    uint64_t pg_stripe_size;
 };

 struct __attribute__((__packed__)) osd_reply_secondary_list_t
@@ -142,6 +146,7 @@ struct __attribute__((__packed__)) osd_reply_secondary_list_t
 };

 // read or write to the primary OSD (must be within individual stripe)
+// FIXME: allow returning the used block bitmap (required for snapshots)
 struct __attribute__((__packed__)) osd_op_rw_t
 {
    osd_op_header_t header;
@@ -169,6 +174,7 @@ struct __attribute__((__packed__)) osd_reply_sync_t
    osd_reply_header_t header;
 };

+// FIXME it would be interesting to try to unify blockstore_op and osd_op formats
 union osd_any_op_t
 {
    osd_op_header_t hdr;
--- a/osd_peering.cpp
+++ b/osd_peering.cpp
@@ -3,286 +3,212 @@

 #include <algorithm>

+#include "base64.h"
 #include "osd.h"

-void osd_t::init_primary()
-{
-    // Initial test version of clustering code requires exactly 2 peers
-    // FIXME Hardcode
-    std::string peerstr = config["peers"];
-    while (peerstr.size())
-    {
-        int pos = peerstr.find(',');
-        peers.push_back(parse_peer(pos < 0 ? peerstr : peerstr.substr(0, pos)));
-        peerstr = pos < 0 ? std::string("") : peerstr.substr(pos+1);
-        for (int i = 0; i < peers.size()-1; i++)
-            if (peers[i].osd_num == peers[peers.size()-1].osd_num)
-                throw std::runtime_error("same osd number "+std::to_string(peers[i].osd_num)+" specified twice in peers");
-    }
-    if (peers.size() < 2)
-        throw std::runtime_error("run_primary requires at least 2 peers");
-    pgs.push_back((pg_t){
-        .state = PG_OFFLINE,
-        .pg_cursize = 0,
-        .pg_num = 1,
-        .target_set = { 1, 2, 3 },
-        .cur_set = { 1, 0, 0 },
-    });
-    pg_count = 1;
-    peering_state = OSD_PEERING_PEERS;
-}
-
-osd_peer_def_t osd_t::parse_peer(std::string peer)
-{
-    // OSD_NUM:IP:PORT
-    int pos1 = peer.find(':');
-    int pos2 = peer.find(':', pos1+1);
-    if (pos1 < 0 || pos2 < 0)
-        throw new std::runtime_error("OSD peer string must be in the form OSD_NUM:IP:PORT");
-    osd_peer_def_t r;
-    r.addr = peer.substr(pos1+1, pos2-pos1-1);
-    std::string osd_num_str = peer.substr(0, pos1);
-    std::string port_str = peer.substr(pos2+1);
-    r.osd_num = strtoull(osd_num_str.c_str(), NULL, 10);
-    if (!r.osd_num)
-        throw new std::runtime_error("Could not parse OSD peer osd_num");
-    r.port = strtoull(port_str.c_str(), NULL, 10);
-    if (!r.port)
-        throw new std::runtime_error("Could not parse OSD peer port");
-    return r;
-}
-
-void osd_t::connect_peer(osd_num_t osd_num, const char *peer_host, int peer_port, std::function<void(osd_num_t, int)> callback)
-{
-    struct sockaddr_in addr;
-    int r;
-    if ((r = inet_pton(AF_INET, peer_host, &addr.sin_addr)) != 1)
-    {
-        callback(osd_num, -EINVAL);
-        return;
-    }
-    addr.sin_family = AF_INET;
-    addr.sin_port = htons(peer_port ? peer_port : 11203);
-    int peer_fd = socket(AF_INET, SOCK_STREAM, 0);
-    if (peer_fd < 0)
-    {
-        callback(osd_num, -errno);
-        return;
-    }
-    fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
-    r = connect(peer_fd, (sockaddr*)&addr, sizeof(addr));
-    if (r < 0 && errno != EINPROGRESS)
-    {
-        close(peer_fd);
-        callback(osd_num, -errno);
-        return;
-    }
-    clients[peer_fd] = (osd_client_t){
-        .peer_addr = addr,
-        .peer_port = peer_port,
-        .peer_fd = peer_fd,
-        .peer_state = PEER_CONNECTING,
-        .connect_callback = callback,
-        .osd_num = osd_num,
-    };
-    osd_peer_fds[osd_num] = peer_fd;
-    // Add FD to epoll (EPOLLOUT for tracking connect() result)
-    epoll_event ev;
-    ev.data.fd = peer_fd;
-    ev.events = EPOLLOUT | EPOLLIN | EPOLLRDHUP | EPOLLET;
-    if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, peer_fd, &ev) < 0)
-    {
-        throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
-    }
-}
-
-void osd_t::handle_connect_result(int peer_fd)
-{
-    auto & cl = clients[peer_fd];
-    osd_num_t osd_num = cl.osd_num;
-    auto callback = cl.connect_callback;
-    int result = 0;
-    socklen_t result_len = sizeof(result);
-    if (getsockopt(peer_fd, SOL_SOCKET, SO_ERROR, &result, &result_len) < 0)
-    {
-        result = errno;
-    }
-    if (result != 0)
-    {
-        stop_client(peer_fd);
-        callback(osd_num, -result);
-        return;
-    }
-    int one = 1;
-    setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
-    // Disable EPOLLOUT on this fd
-    cl.connect_callback = NULL;
-    cl.peer_state = PEER_CONNECTED;
-    epoll_event ev;
-    ev.data.fd = peer_fd;
-    ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET;
-    if (epoll_ctl(epoll_fd, EPOLL_CTL_MOD, peer_fd, &ev) < 0)
-    {
-        throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
-    }
-    callback(osd_num, peer_fd);
-}
-
 // Peering loop
 void osd_t::handle_peers()
 {
-    if (peering_state & OSD_PEERING_PEERS)
-    {
-        for (int i = 0; i < peers.size(); i++)
-        {
-            if (osd_peer_fds.find(peers[i].osd_num) == osd_peer_fds.end() &&
-                time(NULL) - peers[i].last_connect_attempt > 5) // FIXME hardcode 5
-            {
-                peers[i].last_connect_attempt = time(NULL);
-                connect_peer(peers[i].osd_num, peers[i].addr.c_str(), peers[i].port, [this](osd_num_t osd_num, int peer_fd)
-                {
-                    // FIXME: Check peer config after connecting
-                    if (peer_fd < 0)
-                    {
-                        printf("Failed to connect to peer OSD %lu: %s\n", osd_num, strerror(-peer_fd));
-                        return;
-                    }
-                    printf("Connected with peer OSD %lu (fd %d)\n", clients[peer_fd].osd_num, peer_fd);
-                    int i;
-                    for (i = 0; i < peers.size(); i++)
-                    {
-                        if (osd_peer_fds.find(peers[i].osd_num) == osd_peer_fds.end())
-                            break;
-                    }
-                    if (i >= peers.size())
-                    {
-                        // Connected to all peers
-                        peering_state = peering_state & ~OSD_PEERING_PEERS;
-                    }
-                    repeer_pgs(osd_num, true);
-                });
-            }
-        }
-    }
    if (peering_state & OSD_PEERING_PGS)
    {
-        bool still_doing_pgs = false;
-        for (int i = 0; i < pgs.size(); i++)
+        bool still = false;
+        for (auto & p: pgs)
        {
-            if (pgs[i].state == PG_PEERING)
+            if (p.second.state == PG_PEERING)
            {
-                if (!pgs[i].peering_state->list_ops.size())
+                if (!p.second.peering_state->list_ops.size())
                {
-                    pgs[i].calc_object_states();
+                    p.second.calc_object_states(log_level);
+                    report_pg_state(p.second);
+                    incomplete_objects += p.second.incomplete_objects.size();
+                    misplaced_objects += p.second.misplaced_objects.size();
+                    // FIXME: degraded objects may currently include misplaced, too! Report them separately?
+                    degraded_objects += p.second.degraded_objects.size();
+                    if ((p.second.state & (PG_ACTIVE | PG_HAS_UNCLEAN)) == (PG_ACTIVE | PG_HAS_UNCLEAN))
+                        peering_state = peering_state | OSD_FLUSHING_PGS;
+                    else
+                        peering_state = peering_state | OSD_RECOVERING;
                }
                else
                {
-                    still_doing_pgs = true;
+                    still = true;
                }
            }
        }
-        if (!still_doing_pgs)
+        if (!still)
        {
            // Done all PGs
            peering_state = peering_state & ~OSD_PEERING_PGS;
        }
    }
-}
-
-void osd_t::repeer_pgs(osd_num_t osd_num, bool is_connected)
-{
-    // Re-peer affected PGs
-    // FIXME: We shouldn't rely just on target_set. Other OSDs may also contain PG data.
-    osd_num_t real_osd = (is_connected ? osd_num : 0);
-    for (int i = 0; i < pgs.size(); i++)
+    if ((peering_state & OSD_FLUSHING_PGS) && !readonly)
    {
-        bool repeer = false;
-        for (int r = 0; r < pgs[i].target_set.size(); r++)
+        bool still = false;
+        for (auto & p: pgs)
        {
-            if (pgs[i].target_set[r] == osd_num &&
-                pgs[i].cur_set[r] != real_osd)
+            if ((p.second.state & (PG_ACTIVE | PG_HAS_UNCLEAN)) == (PG_ACTIVE | PG_HAS_UNCLEAN))
            {
-                pgs[i].cur_set[r] = real_osd;
-                repeer = true;
-                break;
+                if (!p.second.flush_batch)
+                {
+                    submit_pg_flush_ops(p.first);
+                }
+                still = true;
            }
        }
-        if (repeer)
+        if (!still)
        {
-            // Repeer this pg
-            printf("Repeer PG %d because of OSD %lu\n", i, osd_num);
-            start_pg_peering(i);
-            peering_state |= OSD_PEERING_PGS;
+            peering_state = peering_state & ~OSD_FLUSHING_PGS | OSD_RECOVERING;
+        }
+    }
+    if ((peering_state & OSD_RECOVERING) && !readonly)
+    {
+        if (!continue_recovery())
+        {
+            peering_state = peering_state & ~OSD_RECOVERING;
+        }
+    }
+}
+
+// Restart peering of every PG that lists <peer_osd> in its all_peers.
+// Only PGs whose state has at least one of PG_PEERING, PG_ACTIVE or PG_INCOMPLETE
+// set are examined; other PGs are left untouched.
+void osd_t::repeer_pgs(osd_num_t peer_osd)
+{
+    // Re-peer affected PGs
+    for (auto & p: pgs)
+    {
+        bool repeer = false;
+        if (p.second.state & (PG_PEERING | PG_ACTIVE | PG_INCOMPLETE))
+        {
+            // Linear search for the OSD among all peers known to this PG
+            for (osd_num_t pg_osd: p.second.all_peers)
+            {
+                if (pg_osd == peer_osd)
+                {
+                    repeer = true;
+                    break;
+                }
+            }
+            if (repeer)
+            {
+                // Repeer this pg
+                printf("[PG %u] Repeer because of OSD %lu\n", p.second.pg_num, peer_osd);
+                start_pg_peering(p.second.pg_num);
+            }
         }
     }
 }

 // Repeer on each connect/disconnect peer event
-void osd_t::start_pg_peering(int pg_idx)
+void osd_t::start_pg_peering(pg_num_t pg_num)
 {
-    auto & pg = pgs[pg_idx];
+    auto & pg = pgs[pg_num];
    pg.state = PG_PEERING;
+    this->peering_state |= OSD_PEERING_PGS;
+    report_pg_state(pg);
+    // Reset PG state
+    pg.cur_peers.clear();
    pg.state_dict.clear();
-    pg.obj_states.clear();
+    incomplete_objects -= pg.incomplete_objects.size();
+    misplaced_objects -= pg.misplaced_objects.size();
+    degraded_objects -= pg.degraded_objects.size();
+    pg.incomplete_objects.clear();
+    pg.misplaced_objects.clear();
+    pg.degraded_objects.clear();
+    pg.flush_actions.clear();
    pg.ver_override.clear();
-    pg.pg_cursize = 0;
-    for (int role = 0; role < pg.cur_set.size(); role++)
+    if (pg.flush_batch)
    {
+        delete pg.flush_batch;
+    }
+    pg.flush_batch = NULL;
+    for (auto p: pg.write_queue)
+    {
+        cancel_primary_write(p.second);
+    }
+    pg.write_queue.clear();
+    for (auto it = unstable_writes.begin(); it != unstable_writes.end(); )
+    {
+        // Forget this PG's unstable writes
+        pg_num_t n = (it->first.oid.inode + it->first.oid.stripe / pg_stripe_size) % pg_count + 1;
+        if (n == pg.pg_num)
+            unstable_writes.erase(it++);
+        else
+            it++;
+    }
+    dirty_pgs.erase(pg.pg_num);
+    // Calculate current write OSD set
+    pg.pg_cursize = 0;
+    pg.cur_set.resize(pg.target_set.size());
+    pg.cur_loc_set.clear();
+    for (int role = 0; role < pg.target_set.size(); role++)
+    {
+        pg.cur_set[role] = pg.target_set[role] == this->osd_num ||
+            c_cli.osd_peer_fds.find(pg.target_set[role]) != c_cli.osd_peer_fds.end() ? pg.target_set[role] : 0;
        if (pg.cur_set[role] != 0)
        {
            pg.pg_cursize++;
+            pg.cur_loc_set.push_back({
+                .role = (uint64_t)role,
+                .osd_num = pg.cur_set[role],
+                .outdated = false,
+            });
+        }
+    }
+    if (pg.target_history.size())
+    {
+        // Refuse to start PG if no peers are available from any of the historical OSD sets
+        // (PG history is kept up to the latest active+clean state)
+        for (auto & history_set: pg.target_history)
+        {
+            bool found = false;
+            for (auto history_osd: history_set)
+            {
+                if (history_osd != 0 && c_cli.osd_peer_fds.find(history_osd) != c_cli.osd_peer_fds.end())
+                {
+                    found = true;
+                    break;
+                }
+            }
+            if (!found)
+            {
+                pg.state = PG_INCOMPLETE;
+                report_pg_state(pg);
+            }
        }
    }
    if (pg.pg_cursize < pg.pg_minsize)
    {
        pg.state = PG_INCOMPLETE;
+        report_pg_state(pg);
    }
+    std::set<osd_num_t> cur_peers;
+    for (auto pg_osd: pg.all_peers)
+    {
+        if (pg_osd == this->osd_num || c_cli.osd_peer_fds.find(pg_osd) != c_cli.osd_peer_fds.end())
+        {
+            cur_peers.insert(pg_osd);
+        }
+        else if (c_cli.wanted_peers.find(pg_osd) == c_cli.wanted_peers.end())
+        {
+            c_cli.connect_peer(pg_osd, st_cli.peer_states[pg_osd]);
+        }
+    }
+    pg.cur_peers.insert(pg.cur_peers.begin(), cur_peers.begin(), cur_peers.end());
    if (pg.peering_state)
    {
-        // Adjust the peering operation that's still in progress
-        for (auto it = pg.peering_state->list_ops.begin(); it != pg.peering_state->list_ops.end(); it++)
+        // Adjust the peering operation that's still in progress - discard unneeded results
+        for (auto it = pg.peering_state->list_ops.begin(); it != pg.peering_state->list_ops.end();)
        {
-            int role;
-            for (role = 0; role < pg.cur_set.size(); role++)
-            {
-                if (pg.cur_set[role] == it->first)
-                    break;
-            }
-            if (pg.state == PG_INCOMPLETE || role >= pg.cur_set.size())
+            if (pg.state == PG_INCOMPLETE || cur_peers.find(it->first) == cur_peers.end())
            {
                // Discard the result after completion, which, chances are, will be unsuccessful
-                auto list_op = it->second;
-                if (list_op->peer_fd == 0)
-                {
-                    // Self
-                    list_op->bs_op->callback = [list_op](blockstore_op_t *bs_op)
-                    {
-                        if (list_op->bs_op->buf)
-                            free(list_op->bs_op->buf);
-                        delete list_op;
-                    };
-                }
-                else
-                {
-                    // Peer
-                    list_op->callback = [](osd_op_t *list_op)
-                    {
-                        delete list_op;
-                    };
-                }
+                discard_list_subop(it->second);
                pg.peering_state->list_ops.erase(it);
                it = pg.peering_state->list_ops.begin();
            }
+            else
+                it++;
        }
-        for (auto it = pg.peering_state->list_results.begin(); it != pg.peering_state->list_results.end(); it++)
+        for (auto it = pg.peering_state->list_results.begin(); it != pg.peering_state->list_results.end();)
        {
-            int role;
-            for (role = 0; role < pg.cur_set.size(); role++)
-            {
-                if (pg.cur_set[role] == it->first)
-                    break;
-            }
-            if (pg.state == PG_INCOMPLETE || role >= pg.cur_set.size())
+            if (pg.state == PG_INCOMPLETE || cur_peers.find(it->first) == cur_peers.end())
            {
                if (it->second.buf)
                {
@@ -291,6 +217,8 @@ void osd_t::start_pg_peering(int pg_idx)
                pg.peering_state->list_results.erase(it);
                it = pg.peering_state->list_results.begin();
            }
+            else
+                it++;
        }
    }
    if (pg.state == PG_INCOMPLETE)
@@ -300,107 +228,300 @@ void osd_t::start_pg_peering(int pg_idx)
            delete pg.peering_state;
            pg.peering_state = NULL;
        }
-        printf("PG %d is incomplete\n", pg.pg_num);
        return;
    }
    if (!pg.peering_state)
    {
        pg.peering_state = new pg_peering_state_t();
+        pg.peering_state->pg_num = pg.pg_num;
    }
-    auto ps = pg.peering_state;
-    for (int role = 0; role < pg.cur_set.size(); role++)
+    for (osd_num_t peer_osd: cur_peers)
    {
-        osd_num_t role_osd = pg.cur_set[role];
-        if (!role_osd)
+        if (pg.peering_state->list_ops.find(peer_osd) != pg.peering_state->list_ops.end() ||
+            pg.peering_state->list_results.find(peer_osd) != pg.peering_state->list_results.end())
        {
            continue;
        }
-        if (ps->list_ops.find(role_osd) != ps->list_ops.end() ||
-            ps->list_results.find(role_osd) != ps->list_results.end())
-        {
-            continue;
-        }
-        if (role_osd == this->osd_num)
-        {
-            // Self
-            osd_op_t *op = new osd_op_t();
-            op->op_type = 0;
-            op->peer_fd = 0;
-            op->bs_op = new blockstore_op_t();
-            op->bs_op->opcode = BS_OP_LIST;
-            op->bs_op->oid.stripe = parity_block_size;
-            op->bs_op->len = pg_count,
-            op->bs_op->offset = pg.pg_num-1,
-            op->bs_op->callback = [ps, op, role_osd](blockstore_op_t *bs_op)
-            {
-                if (op->bs_op->retval < 0)
-                {
-                    throw std::runtime_error("local OP_LIST failed");
-                }
-                printf(
-                    "Got object list from OSD %lu (local): %d object versions (%lu of them stable)\n",
-                    role_osd, bs_op->retval, bs_op->version
-                );
-                ps->list_results[role_osd] = {
-                    .buf = (obj_ver_id*)op->bs_op->buf,
-                    .total_count = (uint64_t)op->bs_op->retval,
-                    .stable_count = op->bs_op->version,
-                };
-                ps->list_done++;
-                ps->list_ops.erase(role_osd);
-                delete op;
-            };
-            bs->enqueue_op(op->bs_op);
-            ps->list_ops[role_osd] = op;
-        }
-        else
-        {
-            // Peer
-            auto & cl = clients[osd_peer_fds[role_osd]];
-            osd_op_t *op = new osd_op_t();
-            op->op_type = OSD_OP_OUT;
-            op->send_list.push_back(op->req.buf, OSD_PACKET_SIZE);
-            op->peer_fd = cl.peer_fd;
-            op->req = {
-                .sec_list = {
-                    .header = {
-                        .magic = SECONDARY_OSD_OP_MAGIC,
-                        .id = this->next_subop_id++,
-                        .opcode = OSD_OP_SECONDARY_LIST,
-                    },
-                    .list_pg = pg.pg_num,
-                    .pg_count = pg_count,
-                    .parity_block_size = parity_block_size,
-                },
-            };
-            op->callback = [this, ps, role_osd](osd_op_t *op)
-            {
-                if (op->reply.hdr.retval < 0)
-                {
-                    printf("Failed to get object list from OSD %lu (retval=%ld), disconnecting peer\n", role_osd, op->reply.hdr.retval);
-                    ps->list_ops.erase(role_osd);
-                    stop_client(op->peer_fd);
-                    delete op;
-                    return;
-                }
-                printf(
-                    "Got object list from OSD %lu: %ld object versions (%lu of them stable)\n",
-                    role_osd, op->reply.hdr.retval, op->reply.sec_list.stable_count
-                );
-                ps->list_results[role_osd] = {
-                    .buf = (obj_ver_id*)op->buf,
-                    .total_count = (uint64_t)op->reply.hdr.retval,
-                    .stable_count = op->reply.sec_list.stable_count,
-                };
-                // set op->buf to NULL so it doesn't get freed
-                op->buf = NULL;
-                ps->list_done++;
-                ps->list_ops.erase(role_osd);
-                delete op;
-            };
-            outbox_push(cl, op);
-            ps->list_ops[role_osd] = op;
-        }
+        submit_sync_and_list_subop(peer_osd, pg.peering_state);
    }
    ringloop->wakeup();
 }
+
+// Start the "sync, then list" sequence for one OSD of a peering PG.
+// - In readonly mode no sync is issued and submit_list_subop() is called directly.
+// - For the local OSD (role_osd == this->osd_num) a BS_OP_SYNC is enqueued to the blockstore.
+// - For any other OSD an OSD_OP_SECONDARY_SYNC request is pushed to the peer.
+// In the two latter cases the in-flight operation is stored in ps->list_ops[role_osd]
+// and submit_list_subop() is called from the completion callback on success.
+void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
+{
+    // Sync before listing, if not readonly
+    if (readonly)
+    {
+        submit_list_subop(role_osd, ps);
+    }
+    else if (role_osd == this->osd_num)
+    {
+        // Self
+        osd_op_t *op = new osd_op_t();
+        op->op_type = 0;
+        op->peer_fd = 0;
+        clock_gettime(CLOCK_REALTIME, &op->tv_begin);
+        op->bs_op = new blockstore_op_t();
+        op->bs_op->opcode = BS_OP_SYNC;
+        op->bs_op->callback = [this, ps, op, role_osd](blockstore_op_t *bs_op)
+        {
+            if (bs_op->retval < 0)
+            {
+                // Local sync failure: report it and call force_stop(1).
+                // Note that op is neither freed nor removed from ps->list_ops on this path.
+                printf("Local OP_SYNC failed: %d (%s)\n", bs_op->retval, strerror(-bs_op->retval));
+                force_stop(1);
+                return;
+            }
+            add_bs_subop_stats(op);
+            // bs_op is reset to NULL before op is deleted
+            // (presumably so that deleting op does not touch it again - confirm against osd_op_t)
+            delete op->bs_op;
+            op->bs_op = NULL;
+            delete op;
+            // Drop the finished sync from list_ops; submit_list_subop() registers the list op in its place
+            ps->list_ops.erase(role_osd);
+            submit_list_subop(role_osd, ps);
+        };
+        bs->enqueue_op(op->bs_op);
+        ps->list_ops[role_osd] = op;
+    }
+    else
+    {
+        // Peer
+        // clients.at() is used for the lookup of the peer's client entry by its fd
+        auto & cl = c_cli.clients.at(c_cli.osd_peer_fds[role_osd]);
+        osd_op_t *op = new osd_op_t();
+        op->op_type = OSD_OP_OUT;
+        // send_list refers to op->req.buf, which is filled in by the assignment to op->req below
+        op->send_list.push_back(op->req.buf, OSD_PACKET_SIZE);
+        op->peer_fd = cl.peer_fd;
+        op->req = {
+            .sec_sync = {
+                .header = {
+                    .magic = SECONDARY_OSD_OP_MAGIC,
+                    .id = c_cli.next_subop_id++,
+                    .opcode = OSD_OP_SECONDARY_SYNC,
+                },
+            },
+        };
+        op->callback = [this, ps, role_osd](osd_op_t *op)
+        {
+            if (op->reply.hdr.retval < 0)
+            {
+                // FIXME: Mark peer as failed and don't reconnect immediately after dropping the connection
+                printf("Failed to sync OSD %lu: %ld (%s), disconnecting peer\n", role_osd, op->reply.hdr.retval, strerror(-op->reply.hdr.retval));
+                // The entry is erased before stop_client() is called for the peer's fd
+                ps->list_ops.erase(role_osd);
+                c_cli.stop_client(op->peer_fd);
+                delete op;
+                return;
+            }
+            // Sync succeeded: free the sync op and continue with the listing
+            delete op;
+            ps->list_ops.erase(role_osd);
+            submit_list_subop(role_osd, ps);
+        };
+        c_cli.outbox_push(op);
+        ps->list_ops[role_osd] = op;
+    }
+}
+
+// Request the object version list of PG ps->pg_num from one OSD.
+// For the local OSD (role_osd == this->osd_num) a BS_OP_LIST is enqueued to the blockstore,
+// for any other OSD an OSD_OP_SECONDARY_LIST request is pushed to the peer.
+// The in-flight operation is stored in ps->list_ops[role_osd]; on success the callback
+// moves the result into ps->list_results[role_osd] and removes the list_ops entry.
+void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
+{
+    if (role_osd == this->osd_num)
+    {
+        // Self
+        osd_op_t *op = new osd_op_t();
+        op->op_type = 0;
+        op->peer_fd = 0;
+        clock_gettime(CLOCK_REALTIME, &op->tv_begin);
+        op->bs_op = new blockstore_op_t();
+        op->bs_op->opcode = BS_OP_LIST;
+        // NOTE(review): oid.stripe / len / offset carry the stripe size, the PG count and the
+        // zero-based PG index; presumably the blockstore uses them to select this PG's objects - confirm
+        op->bs_op->oid.stripe = pg_stripe_size;
+        op->bs_op->len = pg_count;
+        op->bs_op->offset = ps->pg_num-1;
+        op->bs_op->callback = [this, ps, op, role_osd](blockstore_op_t *bs_op)
+        {
+            if (op->bs_op->retval < 0)
+            {
+                // A failed local listing is reported by throwing
+                throw std::runtime_error("local OP_LIST failed");
+            }
+            add_bs_subop_stats(op);
+            printf(
+                "[PG %u] Got object list from OSD %lu (local): %d object versions (%lu of them stable)\n",
+                ps->pg_num, role_osd, bs_op->retval, bs_op->version
+            );
+            // The result buffer pointer is stored in list_results;
+            // retval is taken as the total entry count and version as the stable entry count
+            ps->list_results[role_osd] = {
+                .buf = (obj_ver_id*)op->bs_op->buf,
+                .total_count = (uint64_t)op->bs_op->retval,
+                .stable_count = op->bs_op->version,
+            };
+            ps->list_ops.erase(role_osd);
+            // bs_op is reset to NULL before op is deleted
+            delete op->bs_op;
+            op->bs_op = NULL;
+            delete op;
+        };
+        bs->enqueue_op(op->bs_op);
+        ps->list_ops[role_osd] = op;
+    }
+    else
+    {
+        // Peer
+        osd_op_t *op = new osd_op_t();
+        op->op_type = OSD_OP_OUT;
+        // send_list refers to op->req.buf, which is filled in by the assignment to op->req below
+        op->send_list.push_back(op->req.buf, OSD_PACKET_SIZE);
+        op->peer_fd = c_cli.osd_peer_fds[role_osd];
+        op->req = {
+            .sec_list = {
+                .header = {
+                    .magic = SECONDARY_OSD_OP_MAGIC,
+                    .id = c_cli.next_subop_id++,
+                    .opcode = OSD_OP_SECONDARY_LIST,
+                },
+                .list_pg = ps->pg_num,
+                .pg_count = pg_count,
+                .pg_stripe_size = pg_stripe_size,
+            },
+        };
+        op->callback = [this, ps, role_osd](osd_op_t *op)
+        {
+            if (op->reply.hdr.retval < 0)
+            {
+                // Listing failed on the peer: forget the op and drop the connection
+                printf("Failed to get object list from OSD %lu (retval=%ld), disconnecting peer\n", role_osd, op->reply.hdr.retval);
+                ps->list_ops.erase(role_osd);
+                c_cli.stop_client(op->peer_fd);
+                delete op;
+                return;
+            }
+            printf(
+                "[PG %u] Got object list from OSD %lu: %ld object versions (%lu of them stable)\n",
+                ps->pg_num, role_osd, op->reply.hdr.retval, op->reply.sec_list.stable_count
+            );
+            // The reply buffer pointer is stored in list_results
+            ps->list_results[role_osd] = {
+                .buf = (obj_ver_id*)op->buf,
+                .total_count = (uint64_t)op->reply.hdr.retval,
+                .stable_count = op->reply.sec_list.stable_count,
+            };
+            // set op->buf to NULL so it doesn't get freed
+            op->buf = NULL;
+            ps->list_ops.erase(role_osd);
+            delete op;
+        };
+        c_cli.outbox_push(op);
+        ps->list_ops[role_osd] = op;
+    }
+}
+
+// Detach an in-flight sync/list operation from peering: its completion callback is
+// replaced with one that only releases the operation and ignores the result.
+void osd_t::discard_list_subop(osd_op_t *list_op)
+{
+    if (list_op->peer_fd != 0)
+    {
+        // Peer: just free the operation when the reply arrives
+        list_op->callback = [](osd_op_t *finished_op)
+        {
+            delete finished_op;
+        };
+        return;
+    }
+    // Self: free the result buffer (if any), the blockstore op and the operation itself
+    list_op->bs_op->callback = [list_op](blockstore_op_t *)
+    {
+        if (list_op->bs_op->buf)
+        {
+            free(list_op->bs_op->buf);
+        }
+        delete list_op->bs_op;
+        list_op->bs_op = NULL;
+        delete list_op;
+    };
+}
+
+// Begin stopping PG <pg_num>.
+// Any peering in progress is cancelled first: pending list operations are detached with
+// discard_list_subop(), already received list buffers are freed and the peering state is deleted.
+// If the PG is active it is then switched from PG_ACTIVE to PG_STOPPING and, when nothing is
+// in flight and no flush batch exists, finished immediately via finish_stop_pg().
+// Returns false when the PG is unknown or not active, true when stopping was started.
+bool osd_t::stop_pg(pg_num_t pg_num)
+{
+    auto pg_it = pgs.find(pg_num);
+    if (pg_it == pgs.end())
+    {
+        return false;
+    }
+    auto & pg = pg_it->second;
+    if (pg.peering_state)
+    {
+        // Stop peering
+        // (the loops must advance over every entry; iterating without an increment
+        // would spin forever on the first element of a non-empty container)
+        for (auto & list_op: pg.peering_state->list_ops)
+        {
+            discard_list_subop(list_op.second);
+        }
+        for (auto & list_result: pg.peering_state->list_results)
+        {
+            if (list_result.second.buf)
+            {
+                free(list_result.second.buf);
+            }
+        }
+        delete pg.peering_state;
+        pg.peering_state = NULL;
+    }
+    if (!(pg.state & PG_ACTIVE))
+    {
+        return false;
+    }
+    pg.state = (pg.state & ~PG_ACTIVE) | PG_STOPPING;
+    if (pg.inflight == 0 && !pg.flush_batch)
+    {
+        // Nothing to wait for, the PG can go offline right away
+        finish_stop_pg(pg);
+    }
+    else
+    {
+        // Only publish the PG_STOPPING state here; finish_stop_pg() is not called on this path
+        report_pg_state(pg);
+    }
+    return true;
+}
+
+// Complete stopping of a PG: set its state to PG_OFFLINE and report the new state.
+void osd_t::finish_stop_pg(pg_t & pg)
+{
+    pg.state = PG_OFFLINE;
+    report_pg_state(pg);
+}
+
+// Print the PG state, mark it dirty for reporting and trim the PG peer history
+// when the PG is exactly PG_ACTIVE or exactly PG_ACTIVE|PG_LEFT_ON_DEAD.
+// Also applies the pending PG configuration when a PG goes offline and
+// finally calls report_pg_states().
+void osd_t::report_pg_state(pg_t & pg)
+{
+    pg.print_state();
+    this->pg_state_dirty.insert(pg.pg_num);
+    const bool has_history = pg.target_history.size() > 0;
+    const bool has_extra_peers = pg.all_peers.size() > pg.target_set.size();
+    if (pg.state == PG_ACTIVE && (has_history || has_extra_peers))
+    {
+        // Clear history of active+clean PGs
+        pg.history_changed = true;
+        pg.target_history.clear();
+        pg.all_peers = pg.target_set;
+        pg.cur_peers = pg.target_set;
+    }
+    else if (pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD))
+    {
+        // Clear history of active+left_on_dead PGs, but leave dead OSDs in all_peers
+        pg.history_changed = true;
+        pg.target_history.clear();
+        // Start from all known peers and drop the ones that are currently connected
+        std::set<osd_num_t> kept_peers(pg.all_peers.begin(), pg.all_peers.end());
+        for (auto live_osd: pg.cur_peers)
+        {
+            kept_peers.erase(live_osd);
+        }
+        // Non-zero target OSDs become both the new cur_peers and a part of all_peers
+        pg.cur_peers.clear();
+        for (auto target_osd: pg.target_set)
+        {
+            if (!target_osd)
+            {
+                continue;
+            }
+            kept_peers.insert(target_osd);
+            pg.cur_peers.push_back(target_osd);
+        }
+        pg.all_peers.clear();
+        pg.all_peers.insert(pg.all_peers.begin(), kept_peers.begin(), kept_peers.end());
+    }
+    if (pg.state == PG_OFFLINE && !this->pg_config_applied)
+    {
+        apply_pg_config();
+    }
+    report_pg_states();
+}
--- a/osd_peering_pg.cpp
+++ b/osd_peering_pg.cpp
@@ -1,159 +1,360 @@
 #include "osd_peering_pg.h"

-void pg_t::remember_object(pg_obj_state_check_t &st, std::vector<obj_ver_role> &all)
+// One entry of the merged object version list scanned by pg_obj_state_check_t:
+// a single version of an object piece as present on a single OSD.
+struct obj_ver_role
 {
-    auto & pg = *this;
-    // Remember the decision
-    uint64_t state = 0;
-    if (st.n_roles == pg.pg_cursize)
+    // Object ID; the bits of oid.stripe selected by STRIPE_MASK are read as the role (replica) number
+    object_id oid;
+    // Version of the piece
+    uint64_t version;
+    // Number of the OSD on which this entry is present
+    uint64_t osd_num;
+    // True if this version is stable
+    bool is_stable;
+};
+
+// Ordering of object version entries; compares the keys one by one and
+// decides on the first one that differs.
+inline bool operator < (const obj_ver_role & a, const obj_ver_role & b)
+{
+    // ORDER BY inode ASC, stripe & ~STRIPE_MASK ASC, version DESC, role ASC, osd_num ASC
+    if (a.oid.inode != b.oid.inode)
+    {
+        return a.oid.inode < b.oid.inode;
+    }
+    const auto a_base_stripe = a.oid.stripe & ~STRIPE_MASK;
+    const auto b_base_stripe = b.oid.stripe & ~STRIPE_MASK;
+    if (a_base_stripe != b_base_stripe)
+    {
+        return a_base_stripe < b_base_stripe;
+    }
+    if (a.version != b.version)
+    {
+        // Newer versions go first
+        return a.version > b.version;
+    }
+    if (a.oid.stripe != b.oid.stripe)
+    {
+        // Base stripes are equal here, so this compares the STRIPE_MASK (role) bits
+        return a.oid.stripe < b.oid.stripe;
+    }
+    return a.osd_num < b.osd_num;
+}
+
+// Per-piece version summary collected by pg_obj_state_check_t::finish_object()
+// while it decides on rollback / stabilize actions. 0 means "not seen yet".
+struct obj_piece_ver_t
+{
+    // Version of the first list entry seen for the piece
+    uint64_t max_ver = 0;
+    // Version of the first stable list entry seen for the piece
+    uint64_t stable_ver = 0;
+    // Version of the first list entry seen for the piece with version <= target version
+    uint64_t max_target = 0;
+};
+
+// Scanner state used to walk over a list of object versions and compute
+// per-object and per-PG state. walk() is the entry point; it calls start_object(),
+// handle_version() and finish_object() while advancing list_pos.
+struct pg_obj_state_check_t
+{
+    // PG being checked; walk() resets and fills its state and counters
+    pg_t *pg;
+    // Object version entries to scan
+    // NOTE(review): walk() groups adjacent entries, so the list is presumably sorted with operator < beforehand - confirm
+    std::vector<obj_ver_role> list;
+    // Index of the entry currently being processed
+    int list_pos;
+    // [obj_start, obj_end) - entries of the current object,
+    // [ver_start, ver_end) - entries of the version currently counted / selected as target
+    int obj_start = 0, obj_end = 0, ver_start = 0, ver_end = 0;
+    // Current object with the STRIPE_MASK bits of stripe cleared; inode == 0 means "no object started yet"
+    object_id oid = { 0 };
+    // Version of the first entry of the current object
+    uint64_t max_ver = 0;
+    // Version of the entries currently being counted
+    uint64_t last_ver = 0;
+    // Selected target version, 0 while none is selected
+    uint64_t target_ver = 0;
+    // Counters for the version being counted; has_roles is a bitmask of role numbers seen
+    uint64_t n_copies = 0, has_roles = 0, n_roles = 0, n_stable = 0, n_mismatched = 0;
+    // Counters for the whole object: unstable entries and entries with a role number >= pg_size
+    uint64_t n_unstable = 0, n_buggy = 0;
+    // Scratch location set built for the current object
+    pg_osd_set_t osd_set;
+    // Verbosity of diagnostic output
+    int log_level;
+
+    void walk();
+    void start_object();
+    void handle_version();
+    void finish_object();
+};
+
+// Scan the whole list and compute the PG state.
+// Resets pg->clean_count, pg->total_count and pg->state, then processes entries one by one:
+// a change of inode or of stripe (with STRIPE_MASK bits cleared) finishes the previous object
+// and starts a new one. Afterwards PG_DEGRADED, PG_ACTIVE and PG_LEFT_ON_DEAD are added to pg->state.
+void pg_obj_state_check_t::walk()
+{
+    pg->clean_count = 0;
+    pg->total_count = 0;
+    pg->state = 0;
+    for (list_pos = 0; list_pos < list.size(); list_pos++)
     {
-        if (st.n_matched == pg.pg_cursize)
-            state = OBJ_CLEAN;
+        if (oid.inode != list[list_pos].oid.inode ||
+            oid.stripe != (list[list_pos].oid.stripe & ~STRIPE_MASK))
+        {
+            // oid.inode == 0 means that no object was started yet, so there is nothing to finish
+            if (oid.inode != 0)
+            {
+                finish_object();
+            }
+            start_object();
+        }
+        handle_version();
+    }
+    // Finish the last object; list_pos equals list.size() at this point
+    if (oid.inode != 0)
+    {
+        finish_object();
+    }
+    if (pg->pg_cursize < pg->pg_size)
+    {
+        pg->state |= PG_DEGRADED;
+    }
+    pg->state |= PG_ACTIVE;
+    // PG_LEFT_ON_DEAD is only set when no other flag than PG_ACTIVE is present
+    // and some of all_peers are missing from cur_peers
+    if (pg->state == PG_ACTIVE && pg->cur_peers.size() < pg->all_peers.size())
+    {
+        pg->state |= PG_LEFT_ON_DEAD;
+    }
+}
+
+// Begin a new object at list[list_pos]: remember its ID (with the STRIPE_MASK bits
+// of stripe cleared) and its first version, and reset all per-object and per-version counters.
+void pg_obj_state_check_t::start_object()
+{
+    const obj_ver_role & first = list[list_pos];
+    obj_start = list_pos;
+    oid = { .inode = first.oid.inode, .stripe = first.oid.stripe & ~STRIPE_MASK };
+    // The first entry of the object defines both the maximum and the currently counted version
+    max_ver = first.version;
+    last_ver = max_ver;
+    // No target version is selected yet
+    target_ver = 0;
+    ver_start = list_pos;
+    // Per-version counters
+    n_mismatched = 0;
+    n_stable = 0;
+    n_roles = 0;
+    n_copies = 0;
+    has_roles = 0;
+    // Per-object counters
+    n_buggy = 0;
+    n_unstable = 0;
+}
+
+// Account for the entry at list[list_pos] of the current object.
+// While no target version is selected, entries are counted per version: when the version
+// changes and the previous one had a stable copy or at least pg_minsize roles, that previous
+// version becomes the target; otherwise counting restarts for the new version.
+// Unstable entries are counted in n_unstable regardless of the target selection.
+void pg_obj_state_check_t::handle_version()
+{
+    if (!target_ver && last_ver != list[list_pos].version && (n_stable > 0 || n_roles >= pg->pg_minsize))
+    {
+        // Version is either stable or recoverable
+        target_ver = last_ver;
+        ver_end = list_pos;
+    }
+    if (!target_ver)
+    {
+        if (last_ver != list[list_pos].version)
+        {
+            // Previous version was not usable, start counting the next one from scratch
+            ver_start = list_pos;
+            has_roles = n_copies = n_roles = n_stable = n_mismatched = 0;
+            last_ver = list[list_pos].version;
+        }
+        int replica = (list[list_pos].oid.stripe & STRIPE_MASK);
+        n_copies++;
+        if (replica >= pg->pg_size)
+        {
+            // Role number outside of the PG size
+            n_buggy++;
+        }
         else
         {
-            state = OBJ_MISPLACED;
-            pg.state = pg.state | PG_HAS_MISPLACED;
+            if (list[list_pos].is_stable)
+            {
+                n_stable++;
+            }
+            if (pg->cur_set[replica] != list[list_pos].osd_num)
+            {
+                // Piece is stored on an OSD other than the current one for this role
+                n_mismatched++;
+            }
+            // Build the role bit in 64 bits: has_roles is uint64_t, and an int-typed
+            // (1 << replica) is undefined for replica >= 31
+            const uint64_t role_bit = (uint64_t)1 << replica;
+            if (!(has_roles & role_bit))
+            {
+                has_roles = has_roles | role_bit;
+                n_roles++;
+            }
         }
     }
-    else if (st.n_roles < pg.pg_minsize)
+    if (!list[list_pos].is_stable)
     {
-        printf("Object is unfound: inode=%lu stripe=%lu version=%lu/%lu\n", st.oid.inode, st.oid.stripe, st.target_ver, st.max_ver);
-        state = OBJ_INCOMPLETE;
-        pg.state = pg.state | PG_HAS_UNFOUND;
+        n_unstable++;
     }
-    else
+}
+
+void pg_obj_state_check_t::finish_object()
+{
+    if (!target_ver && (n_stable > 0 || n_roles >= pg->pg_minsize))
    {
-        printf("Object is degraded: inode=%lu stripe=%lu version=%lu/%lu\n", st.oid.inode, st.oid.stripe, st.target_ver, st.max_ver);
-        state = OBJ_DEGRADED;
-        pg.state = pg.state | PG_HAS_DEGRADED;
+        // Version is either stable or recoverable
+        target_ver = last_ver;
+        ver_end = list_pos;
    }
-    if (st.n_copies > pg.pg_size)
+    obj_end = list_pos;
+    // Remember the decision
+    uint64_t state = 0;
+    if (n_buggy > 0)
    {
-        state |= OBJ_OVERCOPIED;
-        pg.state = pg.state | PG_HAS_UNCLEAN;
-    }
-    if (st.n_stable < st.n_copies)
-    {
-        state |= OBJ_NEEDS_STABLE;
-        pg.state = pg.state | PG_HAS_UNCLEAN;
-    }
-    if (st.target_ver < st.max_ver || st.has_old_unstable)
-    {
-        state |= OBJ_NEEDS_ROLLBACK;
-        pg.state = pg.state | PG_HAS_UNCLEAN;
-        pg.ver_override[st.oid] = st.target_ver;
-    }
-    if (st.is_buggy)
-    {
-        state |= OBJ_BUGGY;
+        state = OBJ_BUGGY;
        // FIXME: bring pg offline
        throw std::runtime_error("buggy object state");
    }
-    if (state != OBJ_CLEAN)
+    if (n_unstable > 0)
    {
-        st.osd_set.clear();
-        for (int i = st.ver_start; i < st.ver_end; i++)
+        pg->state |= PG_HAS_UNCLEAN;
+        std::unordered_map<obj_piece_id_t, obj_piece_ver_t> pieces;
+        for (int i = obj_start; i < obj_end; i++)
        {
-            st.osd_set.push_back((pg_obj_loc_t){
-                .role = (all[i].oid.stripe & STRIPE_MASK),
-                .osd_num = all[i].osd_num,
-                .stable = all[i].is_stable,
+            auto & pcs = pieces[(obj_piece_id_t){ .oid = list[i].oid, .osd_num = list[i].osd_num }];
+            if (!pcs.max_ver)
+            {
+                pcs.max_ver = list[i].version;
+            }
+            if (list[i].is_stable && !pcs.stable_ver)
+            {
+                pcs.stable_ver = list[i].version;
+            }
+            if (list[i].version <= target_ver && !pcs.max_target)
+            {
+                pcs.max_target = list[i].version;
+            }
+        }
+        for (auto pp: pieces)
+        {
+            auto & pcs = pp.second;
+            if (pcs.stable_ver < pcs.max_ver)
+            {
+                auto & act = pg->flush_actions[pp.first];
+                // osd_set doesn't include rollback/stable states, so don't include them in the state code either
+                if (pcs.max_ver > target_ver)
+                {
+                    act.rollback = true;
+                    act.rollback_to = pcs.max_target;
+                }
+                if (pcs.stable_ver < (pcs.max_ver > target_ver ? pcs.max_target : pcs.max_ver))
+                {
+                    act.make_stable = true;
+                    act.stable_to = pcs.max_ver > target_ver ? pcs.max_target : pcs.max_ver;
+                }
+            }
+        }
+    }
+    if (!target_ver)
+    {
+        return;
+    }
+    if (n_roles < pg->pg_minsize)
+    {
+        if (log_level > 1)
+        {
+            printf("Object is incomplete: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
+        }
+        state = OBJ_INCOMPLETE;
+        pg->state = pg->state | PG_HAS_INCOMPLETE;
+    }
+    else if (n_roles < pg->pg_cursize)
+    {
+        if (log_level > 1)
+        {
+            printf("Object is degraded: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
+        }
+        state = OBJ_DEGRADED;
+        pg->state = pg->state | PG_HAS_DEGRADED;
+    }
+    if (n_mismatched > 0)
+    {
+        if (n_roles >= pg->pg_cursize && log_level > 1)
+        {
+            printf("Object is misplaced: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
+        }
+        state |= OBJ_MISPLACED;
+        pg->state = pg->state | PG_HAS_MISPLACED;
+    }
+    if (log_level > 1 && (n_roles < pg->pg_cursize || n_mismatched > 0))
+    {
+        if (log_level > 2)
+        {
+            for (int i = obj_start; i < obj_end; i++)
+            {
+                printf("v%lu present on: osd %lu, role %ld%s\n", list[i].version, list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
+            }
+        }
+        else
+        {
+            for (int i = ver_start; i < ver_end; i++)
+            {
+                printf("Target version present on: osd %lu, role %ld%s\n", list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
+            }
+        }
+    }
+    pg->total_count++;
+    if (state != 0 || ver_end < obj_end)
+    {
+        osd_set.clear();
+        for (int i = ver_start; i < ver_end; i++)
+        {
+            osd_set.push_back((pg_obj_loc_t){
+                .role = (list[i].oid.stripe & STRIPE_MASK),
+                .osd_num = list[i].osd_num,
+                .outdated = false,
            });
        }
-        std::sort(st.osd_set.begin(), st.osd_set.end());
-        auto it = pg.state_dict.find(st.osd_set);
-        if (it == pg.state_dict.end())
+    }
+    if (ver_end < obj_end)
+    {
+        // Check for outdated versions not present in the current target OSD set
+        for (int i = ver_end; i < obj_end; i++)
+        {
+            int j;
+            for (j = 0; j < osd_set.size(); j++)
+            {
+                if (osd_set[j].osd_num == list[i].osd_num)
+                {
+                    break;
+                }
+            }
+            if (j >= osd_set.size() && pg->cur_set[list[i].oid.stripe & STRIPE_MASK] != list[i].osd_num)
+            {
+                osd_set.push_back((pg_obj_loc_t){
+                    .role = (list[i].oid.stripe & STRIPE_MASK),
+                    .osd_num = list[i].osd_num,
+                    .outdated = true,
+                });
+                state |= OBJ_MISPLACED;
+                pg->state = pg->state | PG_HAS_MISPLACED;
+            }
+        }
+    }
+    if (target_ver < max_ver)
+    {
+        pg->ver_override[oid] = target_ver;
+    }
+    if (state == 0)
+    {
+        pg->clean_count++;
+    }
+    else
+    {
+        auto it = pg->state_dict.find(osd_set);
+        if (it == pg->state_dict.end())
        {
            std::vector<uint64_t> read_target;
-            read_target.resize(pg.pg_size);
-            for (int i = 0; i < pg.pg_size; i++)
+            read_target.resize(pg->pg_size);
+            for (int i = 0; i < pg->pg_size; i++)
            {
                read_target[i] = 0;
            }
-            for (auto & o: st.osd_set)
+            for (auto & o: osd_set)
            {
-                read_target[o.role] = o.osd_num;
+                if (!o.outdated)
+                {
+                    read_target[o.role] = o.osd_num;
+                }
            }
-            pg.state_dict[st.osd_set] = {
+            pg->state_dict[osd_set] = {
                .read_target = read_target,
-                .osd_set = st.osd_set,
+                .osd_set = osd_set,
                .state = state,
                .object_count = 1,
            };
-            it = pg.state_dict.find(st.osd_set);
+            it = pg->state_dict.find(osd_set);
        }
        else
        {
            it->second.object_count++;
        }
-        pg.obj_states[st.oid] = &it->second;
-        if (st.target_ver < st.max_ver)
+        if (state & OBJ_INCOMPLETE)
        {
-            pg.ver_override[st.oid] = st.target_ver;
+            pg->incomplete_objects[oid] = &it->second;
        }
-        if (state & (OBJ_NEEDS_ROLLBACK | OBJ_NEEDS_STABLE))
+        else if (state & OBJ_DEGRADED)
        {
-            spp::sparse_hash_map<obj_piece_id_t, obj_piece_ver_t> pieces;
-            for (int i = st.obj_start; i < st.obj_end; i++)
-            {
-                auto & pcs = pieces[(obj_piece_id_t){ .oid = all[i].oid, .osd_num = all[i].osd_num }];
-                if (!pcs.max_ver)
-                {
-                    pcs.max_ver = all[i].version;
-                }
-                if (all[i].is_stable && !pcs.stable_ver)
-                {
-                    pcs.stable_ver = all[i].version;
-                }
-            }
-            for (auto pp: pieces)
-            {
-                auto & pcs = pp.second;
-                if (pcs.stable_ver < pcs.max_ver)
-                {
-                    auto & act = obj_stab_actions[pp.first];
-                    if (pcs.max_ver > st.target_ver)
-                    {
-                        act.rollback = true;
-                        act.rollback_to = st.target_ver;
-                    }
-                    else if (pcs.max_ver < st.target_ver && pcs.stable_ver < pcs.max_ver)
-                    {
-                        act.rollback = true;
-                        act.rollback_to = pcs.stable_ver;
-                    }
-                    if (pcs.max_ver >= st.target_ver && pcs.stable_ver < st.target_ver)
-                    {
-                        act.make_stable = true;
-                        act.stable_to = st.target_ver;
-                    }
-                }
-            }
+            pg->degraded_objects[oid] = &it->second;
+        }
+        else
+        {
+            pg->misplaced_objects[oid] = &it->second;
        }
    }
-    else
-        pg.clean_count++;
-    pg.total_count++;
 }

 // FIXME: Write at least some tests for this function
-void pg_t::calc_object_states()
+void pg_t::calc_object_states(int log_level)
 {
-    auto & pg = *this;
    // Copy all object lists into one array
-    std::vector<obj_ver_role> all;
-    auto ps = pg.peering_state;
+    pg_obj_state_check_t st;
+    st.log_level = log_level;
+    st.pg = this;
+    auto ps = peering_state;
    for (auto it: ps->list_results)
    {
        auto nstab = it.second.stable_count;
        auto n = it.second.total_count;
        auto osd_num = it.first;
-        uint64_t start = all.size();
-        all.resize(start + n);
+        uint64_t start = st.list.size();
+        st.list.resize(start + n);
        obj_ver_id *ov = it.second.buf;
        for (uint64_t i = 0; i < n; i++, ov++)
        {
-            all[start+i] = {
+            st.list[start+i] = {
                .oid = ov->oid,
                .version = ov->version,
                .osd_num = osd_num,
@@ -165,101 +366,26 @@ void pg_t::calc_object_states()
    }
    ps->list_results.clear();
    // Sort
-    std::sort(all.begin(), all.end());
+    std::sort(st.list.begin(), st.list.end());
    // Walk over it and check object states
-    pg.clean_count = 0;
-    pg.total_count = 0;
-    pg.state = 0;
-    int replica = 0;
-    pg_obj_state_check_t st;
-    for (int i = 0; i < all.size(); i++)
-    {
-        if (st.oid.inode != all[i].oid.inode ||
-            st.oid.stripe != (all[i].oid.stripe & ~STRIPE_MASK))
-        {
-            if (st.oid.inode != 0)
-            {
-                // Remember object state
-                st.obj_end = st.ver_end = i;
-                remember_object(st, all);
-            }
-            st.obj_start = st.ver_start = i;
-            st.oid = { .inode = all[i].oid.inode, .stripe = all[i].oid.stripe & ~STRIPE_MASK };
-            st.max_ver = st.target_ver = all[i].version;
-            st.has_roles = st.n_copies = st.n_roles = st.n_stable = st.n_matched = 0;
-            st.is_buggy = st.has_old_unstable = false;
-        }
-        else if (st.target_ver != all[i].version)
-        {
-            if (st.n_stable > 0 || st.n_roles >= pg.pg_minsize)
-            {
-                // Last processed version is either recoverable or stable, choose it as target and skip previous versions
-                st.ver_end = i;
-                i++;
-                while (i < all.size() && st.oid.inode == all[i].oid.inode &&
-                    st.oid.stripe == (all[i].oid.stripe & ~STRIPE_MASK))
-                {
-                    if (!all[i].is_stable)
-                    {
-                        st.has_old_unstable = true;
-                    }
-                    i++;
-                }
-                st.obj_end = i;
-                i--;
-                continue;
-            }
-            else
-            {
-                // Last processed version is unstable and unrecoverable
-                // We'll know that because target_ver < max_ver
-                st.ver_start = i;
-                st.target_ver = all[i].version;
-                st.has_roles = st.n_copies = st.n_roles = st.n_stable = st.n_matched = 0;
-            }
-        }
-        replica = (all[i].oid.stripe & STRIPE_MASK);
-        st.n_copies++;
-        if (replica >= pg.pg_size)
-        {
-            // FIXME In the future, check it against the PG epoch number to handle replication factor/scheme changes
-            st.is_buggy = true;
-        }
-        else
-        {
-            if (all[i].is_stable)
-            {
-                st.n_stable++;
-            }
-            if (pg.cur_set[replica] == all[i].osd_num)
-            {
-                st.n_matched++;
-            }
-            if (!(st.has_roles & (1 << replica)))
-            {
-                st.has_roles = st.has_roles | (1 << replica);
-                st.n_roles++;
-            }
-        }
-    }
-    if (st.oid.inode != 0)
-    {
-        // Remember object state
-        st.obj_end = st.ver_end = all.size();
-        remember_object(st, all);
-    }
-    if (pg.pg_cursize < pg.pg_size)
-    {
-        pg.state = pg.state | PG_DEGRADED;
-    }
-    printf(
-        "PG %u is active%s%s%s%s%s (%lu objects)\n", pg.pg_num,
-        (pg.state & PG_DEGRADED) ? " + degraded" : "",
-        (pg.state & PG_HAS_UNFOUND) ? " + has_unfound" : "",
-        (pg.state & PG_HAS_DEGRADED) ? " + has_degraded" : "",
-        (pg.state & PG_HAS_MISPLACED) ? " + has_misplaced" : "",
-        (pg.state & PG_HAS_UNCLEAN) ? " + has_unclean" : "",
-        pg.total_count
-    );
-    pg.state = pg.state | PG_ACTIVE;
+    st.walk();
+}
+
+void pg_t::print_state()
+{
+    printf(
+        "[PG %u] is %s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pg_num,
+        (state & PG_STARTING) ? "starting" : "",
+        (state & PG_OFFLINE) ? "offline" : "",
+        (state & PG_PEERING) ? "peering" : "",
+        (state & PG_INCOMPLETE) ? "incomplete" : "",
+        (state & PG_ACTIVE) ? "active" : "",
+        (state & PG_STOPPING) ? "stopping" : "",
+        (state & PG_DEGRADED) ? " + degraded" : "",
+        (state & PG_HAS_INCOMPLETE) ? " + has_incomplete" : "",
+        (state & PG_HAS_DEGRADED) ? " + has_degraded" : "",
+        (state & PG_HAS_MISPLACED) ? " + has_misplaced" : "",
+        (state & PG_HAS_UNCLEAN) ? " + has_unclean" : "",
+        total_count
+    );
 }
--- a/osd_peering_pg.h
+++ b/osd_peering_pg.h
@@ -1,43 +1,19 @@
 #include <map>
+#include <unordered_map>
 #include <vector>
 #include <algorithm>

+#include "cpp-btree/btree_map.h"
+
 #include "object_id.h"
 #include "osd_ops.h"
-
-#include "sparsepp/sparsepp/spp.h"
-
-// Placement group states
-// Exactly one of these:
-#define PG_OFFLINE (1<<0)
-#define PG_PEERING (1<<1)
-#define PG_INCOMPLETE (1<<2)
-#define PG_ACTIVE (1<<3)
-// Plus any of these:
-#define PG_DEGRADED (1<<4)
-#define PG_HAS_UNFOUND (1<<5)
-#define PG_HAS_DEGRADED (1<<6)
-#define PG_HAS_MISPLACED (1<<7)
-#define PG_HAS_UNCLEAN (1<<8)
-
-// FIXME: Safe default that doesn't depend on parity_block_size of pg_parity_size
-#define STRIPE_MASK ((uint64_t)4096 - 1)
-
-// OSD object states
-#define OBJ_CLEAN 0x01
-#define OBJ_MISPLACED 0x02
-#define OBJ_DEGRADED 0x03
-#define OBJ_INCOMPLETE 0x04
-#define OBJ_NEEDS_STABLE 0x10000
-#define OBJ_NEEDS_ROLLBACK 0x20000
-#define OBJ_OVERCOPIED 0x40000
-#define OBJ_BUGGY 0x80000
+#include "pg_states.h"

 struct pg_obj_loc_t
 {
    uint64_t role;
    osd_num_t osd_num;
-    bool stable;
+    bool outdated;
 };

 typedef std::vector<pg_obj_loc_t> pg_osd_set_t;
@@ -64,28 +40,9 @@ struct osd_op_t;
 struct pg_peering_state_t
 {
    // osd_num -> list result
-    spp::sparse_hash_map<osd_num_t, osd_op_t*> list_ops;
-    spp::sparse_hash_map<osd_num_t, pg_list_result_t> list_results;
-    int list_done = 0;
-};
-
-struct pg_obj_state_check_t
-{
-    int obj_start = 0, obj_end = 0, ver_start = 0, ver_end = 0;
-    object_id oid = { 0 };
-    uint64_t max_ver = 0;
-    uint64_t target_ver = 0;
-    uint64_t n_copies = 0, has_roles = 0, n_roles = 0, n_stable = 0, n_matched = 0;
-    bool is_buggy = false, has_old_unstable = false;
-    pg_osd_set_t osd_set;
-};
-
-struct obj_ver_role
-{
-    object_id oid;
-    uint64_t version;
-    uint64_t osd_num;
-    bool is_stable;
+    std::unordered_map<osd_num_t, osd_op_t*> list_ops;
+    std::unordered_map<osd_num_t, pg_list_result_t> list_results;
+    pg_num_t pg_num = 0;
 };

 struct obj_piece_id_t
@@ -94,60 +51,63 @@ struct obj_piece_id_t
    uint64_t osd_num;
 };

-struct obj_piece_ver_t
-{
-    uint64_t max_ver = 0;
-    uint64_t stable_ver = 0;
-};
-
-struct obj_stab_action_t
+struct flush_action_t
 {
    bool rollback = false, make_stable = false;
    uint64_t stable_to = 0, rollback_to = 0;
+    bool submitted = false;
+};
+
+struct pg_flush_batch_t
+{
+    std::map<osd_num_t, std::vector<obj_ver_id>> rollback_lists;
+    std::map<osd_num_t, std::vector<obj_ver_id>> stable_lists;
+    int flush_ops = 0, flush_done = 0;
+    int flush_objects = 0;
 };

 struct pg_t
 {
-    int state;
+    int state = 0;
    uint64_t pg_cursize = 3, pg_size = 3, pg_minsize = 2;
    pg_num_t pg_num;
    uint64_t clean_count = 0, total_count = 0;
+    // target history and all potential peers
+    std::vector<std::vector<osd_num_t>> target_history;
+    std::vector<osd_num_t> all_peers;
+    bool history_changed = false;
+    // peer list from the last peering event
+    std::vector<osd_num_t> cur_peers;
    // target_set is the "correct" peer OSD set for this PG
    std::vector<osd_num_t> target_set;
    // cur_set is the current set of connected peer OSDs for this PG
    // cur_set = (role => osd_num or UINT64_MAX if missing). role numbers begin with zero
    std::vector<osd_num_t> cur_set;
+    // same thing in state_dict-like format
+    pg_osd_set_t cur_loc_set;
    // moved object map. by default, each object is considered to reside on the cur_set.
    // this map stores all objects that differ.
    // it may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario
    // which is up to ~192 MB per 1 TB in the worst case scenario
    std::map<pg_osd_set_t, pg_osd_set_state_t> state_dict;
-    spp::sparse_hash_map<object_id, pg_osd_set_state_t*> obj_states;
-    std::map<obj_piece_id_t, obj_stab_action_t> obj_stab_actions;
-    spp::sparse_hash_map<object_id, uint64_t> ver_override;
+    btree::btree_map<object_id, pg_osd_set_state_t*> incomplete_objects, misplaced_objects, degraded_objects;
+    std::map<obj_piece_id_t, flush_action_t> flush_actions;
+    btree::btree_map<object_id, uint64_t> ver_override;
    pg_peering_state_t *peering_state = NULL;
+    pg_flush_batch_t *flush_batch = NULL;

+    int inflight = 0; // including write_queue
    std::multimap<object_id, osd_op_t*> write_queue;

-    void calc_object_states();
-    void remember_object(pg_obj_state_check_t &st, std::vector<obj_ver_role> &all);
+    void calc_object_states(int log_level);
+    void print_state();
 };

 inline bool operator < (const pg_obj_loc_t &a, const pg_obj_loc_t &b)
 {
-    return a.role < b.role || a.role == b.role && a.osd_num < b.osd_num ||
-        a.role == b.role && a.osd_num == b.osd_num && a.stable < b.stable;
-}
-
-inline bool operator < (const obj_ver_role & a, const obj_ver_role & b)
-{
-    // ORDER BY inode ASC, stripe & ~STRIPE_MASK ASC, version DESC, osd_num ASC
-    return a.oid.inode < b.oid.inode || a.oid.inode == b.oid.inode && (
-        (a.oid.stripe & ~STRIPE_MASK) < (b.oid.stripe & ~STRIPE_MASK) ||
-        (a.oid.stripe & ~STRIPE_MASK) == (b.oid.stripe & ~STRIPE_MASK) && (
-            a.version > b.version || a.version == b.version && a.osd_num < b.osd_num
-        )
-    );
+    return a.outdated < b.outdated ||
+        a.outdated == b.outdated && a.role < b.role ||
+        a.outdated == b.outdated && a.role == b.role && a.osd_num < b.osd_num;
 }

 inline bool operator == (const obj_piece_id_t & a, const obj_piece_id_t & b)
@@ -172,7 +132,6 @@ namespace std
                // Copy-pasted from spp::hash_combine()
                seed ^= (e.role + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
                seed ^= (e.osd_num + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
-                seed ^= ((e.stable ? 1 : 0) + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
            }
            return seed;
        }
--- a/osd_peering_pg_test.cpp
+++ b/osd_peering_pg_test.cpp
@@ -0,0 +1,54 @@
+#define _LARGEFILE64_SOURCE
+
+#include "osd_peering_pg.h"
+#define STRIPE_SHIFT 12
+
+/**
+ * TODO tests for object & pg state calculation.
+ *
+ * 1) pg=1,2,3. objects:
+ *    v1=1s,2s,3s -> clean
+ *    v1=1s,2s,3 v2=1s,2s,_ -> degraded + needs_rollback
+ *    v1=1s,2s,_ -> degraded
+ *    v1=1s,2s,3s v2=1,6,_ -> degraded + needs_stabilize
+ *    v1=2s,1s,3s -> misplaced
+ *    v1=4,5,6 -> misplaced + needs_stabilize
+ *    v1=1s,2s,6s -> misplaced
+ * 2) ...
+ */
+int main(int argc, char *argv[])
+{
+    pg_t pg = {
+        .state = PG_PEERING,
+        .pg_num = 1,
+        .target_set = { 1, 2, 3 },
+        .cur_set = { 1, 2, 3 },
+        .peering_state = new pg_peering_state_t(),
+    };
+    for (uint64_t osd_num = 1; osd_num <= 3; osd_num++)
+    {
+        pg_list_result_t r = {
+            .buf = (obj_ver_id*)malloc(sizeof(obj_ver_id) * 1024*1024*8),
+            .total_count = 1024*1024*8,
+            .stable_count = (uint64_t)(1024*1024*8 - (osd_num == 1 ? 10 : 0)),
+        };
+        for (uint64_t i = 0; i < r.total_count; i++)
+        {
+            r.buf[i] = {
+                .oid = {
+                    .inode = 1,
+                    .stripe = (i << STRIPE_SHIFT) | (osd_num-1),
+                },
+                .version = (uint64_t)(osd_num == 1 && i >= r.total_count - 10 ? 2 : 1),
+            };
+        }
+        pg.peering_state->list_results[osd_num] = r;
+    }
+    pg.calc_object_states(0);
+    printf("deviation variants=%ld clean=%lu\n", pg.state_dict.size(), pg.clean_count);
+    for (auto it: pg.state_dict)
+    {
+        printf("dev: state=%lx\n", it.second.state);
+    }
+    return 0;
+}
--- a/osd_primary.cpp
+++ b/osd_primary.cpp
@@ -1,98 +1,81 @@
-#include "osd.h"
-#include "osd_rmw.h"
-
-#define SUBMIT_READ 0
-#define SUBMIT_RMW_READ 1
-#define SUBMIT_WRITE 2
+#include "osd_primary.h"

 // read: read directly or read paired stripe(s), reconstruct, return
-// write: read paired stripe(s), modify, write
+// write: read paired stripe(s), reconstruct, modify, calculate parity, write
 //
 // nuance: take care to read the same version from paired stripes!
 // to do so, we remember "last readable" version until a write request completes
 // and we postpone other write requests to the same stripe until completion of previous ones
 //
-// sync: sync peers, get unstable versions from somewhere, stabilize them
-
-struct unstable_osd_num_t
-{
-    osd_num_t osd_num;
-    int start, len;
-};
-
-struct osd_primary_op_data_t
-{
-    int st = 0;
-    pg_num_t pg_num;
-    object_id oid;
-    uint64_t target_ver;
-    uint64_t fact_ver = 0;
-    int n_subops = 0, done = 0, errors = 0;
-    int degraded = 0, pg_size, pg_minsize;
-    osd_rmw_stripe_t *stripes;
-    osd_op_t *subops = NULL;
-    // for sync. oops, requires freeing
-    std::vector<unstable_osd_num_t> *unstable_write_osds = NULL;
-    obj_ver_id *unstable_writes = NULL;
-};
-
-void osd_t::finish_primary_op(osd_op_t *cur_op, int retval)
-{
-    // FIXME add separate magic number
-    auto cl_it = clients.find(cur_op->peer_fd);
-    if (cl_it != clients.end())
-    {
-        cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
-        cur_op->reply.hdr.id = cur_op->req.hdr.id;
-        cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
-        cur_op->reply.hdr.retval = retval;
-        outbox_push(cl_it->second, cur_op);
-    }
-    else
-    {
-        delete cur_op;
-    }
-}
+// sync: sync peers, get unstable versions, stabilize them

 bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
 {
    // PG number is calculated from the offset
    // Our EC scheme stores data in fixed chunks equal to (K*block size)
-    // But we must not use K in the process of calculating the PG number
-    // So we calculate the PG number using a separate setting which should be per-inode (FIXME)
-    // FIXME Real pg_num should equal the below expression + 1
-    pg_num_t pg_num = (cur_op->req.rw.inode + cur_op->req.rw.offset / parity_block_size) % pg_count;
-    // FIXME: Postpone operations in inactive PGs
-    if (pg_num > pgs.size() || !(pgs[pg_num].state & PG_ACTIVE))
-    {
-        finish_primary_op(cur_op, -EINVAL);
-        return false;
-    }
-    uint64_t pg_parity_size = bs_block_size * pgs[pg_num].pg_minsize;
+    // K = pg_minsize and will be a property of the inode. Now it's hardcoded (FIXME)
+    uint64_t pg_block_size = bs_block_size * 2;
    object_id oid = {
        .inode = cur_op->req.rw.inode,
-        // oid.stripe = starting offset of the parity stripe, so it can be mapped back to the PG
-        .stripe = (cur_op->req.rw.offset / parity_block_size) * parity_block_size +
-            ((cur_op->req.rw.offset % parity_block_size) / pg_parity_size) * pg_parity_size
+        // oid.stripe = starting offset of the parity stripe
+        .stripe = (cur_op->req.rw.offset/pg_block_size)*pg_block_size,
    };
-    if ((cur_op->req.rw.offset + cur_op->req.rw.len) > (oid.stripe + pg_parity_size) ||
+    pg_num_t pg_num = (cur_op->req.rw.inode + oid.stripe/pg_stripe_size) % pg_count + 1;
+    auto pg_it = pgs.find(pg_num);
+    if (pg_it == pgs.end() || !(pg_it->second.state & PG_ACTIVE))
+    {
+        // This OSD is not primary for this PG or the PG is inactive
+        finish_op(cur_op, -EPIPE);
+        return false;
+    }
+    if ((cur_op->req.rw.offset + cur_op->req.rw.len) > (oid.stripe + pg_block_size) ||
        (cur_op->req.rw.offset % bs_disk_alignment) != 0 ||
        (cur_op->req.rw.len % bs_disk_alignment) != 0)
    {
-        finish_primary_op(cur_op, -EINVAL);
+        finish_op(cur_op, -EINVAL);
        return false;
    }
    osd_primary_op_data_t *op_data = (osd_primary_op_data_t*)calloc(
-        sizeof(osd_primary_op_data_t) + sizeof(osd_rmw_stripe_t) * pgs[pg_num].pg_size, 1
+        sizeof(osd_primary_op_data_t) + sizeof(osd_rmw_stripe_t) * pg_it->second.pg_size, 1
    );
    op_data->pg_num = pg_num;
    op_data->oid = oid;
    op_data->stripes = ((osd_rmw_stripe_t*)(op_data+1));
    cur_op->op_data = op_data;
-    split_stripes(pgs[pg_num].pg_minsize, bs_block_size, (uint32_t)(cur_op->req.rw.offset - oid.stripe), cur_op->req.rw.len, op_data->stripes);
+    split_stripes(pg_it->second.pg_minsize, bs_block_size, (uint32_t)(cur_op->req.rw.offset - oid.stripe), cur_op->req.rw.len, op_data->stripes);
+    pg_it->second.inflight++;
    return true;
 }

+static uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state)
+{
+    if (!(pg.state & (PG_HAS_INCOMPLETE | PG_HAS_DEGRADED | PG_HAS_MISPLACED)))
+    {
+        *object_state = NULL;
+        return def;
+    }
+    auto st_it = pg.incomplete_objects.find(oid);
+    if (st_it != pg.incomplete_objects.end())
+    {
+        *object_state = st_it->second;
+        return st_it->second->read_target.data();
+    }
+    st_it = pg.degraded_objects.find(oid);
+    if (st_it != pg.degraded_objects.end())
+    {
+        *object_state = st_it->second;
+        return st_it->second->read_target.data();
+    }
+    st_it = pg.misplaced_objects.find(oid);
+    if (st_it != pg.misplaced_objects.end())
+    {
+        *object_state = st_it->second;
+        return st_it->second->read_target.data();
+    }
+    *object_state = NULL;
+    return def;
+}
+
 void osd_t::continue_primary_read(osd_op_t *cur_op)
 {
    if (!cur_op->op_data && !prepare_primary_rw(cur_op))
@@ -123,14 +106,10 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
        else
        {
            // PG may be degraded or have misplaced objects
-            auto st_it = pg.obj_states.find(op_data->oid);
-            uint64_t* cur_set = (st_it != pg.obj_states.end()
-                ? st_it->second->read_target.data()
-                : pg.cur_set.data());
+            uint64_t* cur_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
            if (extend_missing_stripes(op_data->stripes, cur_set, pg.pg_minsize, pg.pg_size) < 0)
            {
-                free(op_data);
-                finish_primary_op(cur_op, -EIO);
+                finish_op(cur_op, -EIO);
                return;
            }
            // Submit reads
@@ -147,9 +126,7 @@ resume_1:
 resume_2:
    if (op_data->errors > 0)
    {
-        free(op_data);
-        cur_op->op_data = NULL;
-        finish_primary_op(cur_op, -EIO);
+        finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
        return;
    }
    if (op_data->degraded)
@@ -173,143 +150,34 @@ resume_2:
            }
        }
    }
-    free(op_data);
-    cur_op->op_data = NULL;
-    finish_primary_op(cur_op, cur_op->req.rw.len);
+    finish_op(cur_op, cur_op->req.rw.len);
 }

-void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t* osd_set, osd_op_t *cur_op)
-{
-    bool w = submit_type == SUBMIT_WRITE;
-    osd_primary_op_data_t *op_data = cur_op->op_data;
-    osd_rmw_stripe_t *stripes = op_data->stripes;
-    // Allocate subops
-    int n_subops = 0, zero_read = -1;
-    for (int role = 0; role < pg_size; role++)
-    {
-        if (osd_set[role] == this->osd_num || osd_set[role] != 0 && zero_read == -1)
-        {
-            zero_read = role;
-        }
-        if (osd_set[role] != 0 && (w || stripes[role].read_end != 0))
-        {
-            n_subops++;
-        }
-    }
-    if (!n_subops && submit_type == SUBMIT_RMW_READ)
-    {
-        n_subops = 1;
-    }
-    else
-    {
-        zero_read = -1;
-    }
-    osd_op_t *subops = new osd_op_t[n_subops];
-    op_data->done = op_data->errors = 0;
-    op_data->n_subops = n_subops;
-    op_data->subops = subops;
-    int subop = 0;
-    for (int role = 0; role < pg_size; role++)
-    {
-        // We always submit zero-length writes to all replicas, even if the stripe is not modified
-        if (!(w || stripes[role].read_end != 0 || zero_read == role))
-        {
-            continue;
-        }
-        osd_num_t role_osd_num = osd_set[role];
-        if (role_osd_num != 0)
-        {
-            if (role_osd_num == this->osd_num)
-            {
-                subops[subop].bs_op = new blockstore_op_t({
-                    .opcode = (uint64_t)(w ? BS_OP_WRITE : BS_OP_READ),
-                    .callback = [cur_op, this](blockstore_op_t *subop)
-                    {
-                        handle_primary_subop(cur_op, subop->retval == subop->len, subop->version);
-                    },
-                    .oid = {
-                        .inode = op_data->oid.inode,
-                        .stripe = op_data->oid.stripe | role,
-                    },
-                    .version = w ? 0 : (submit_type == SUBMIT_RMW_READ ? UINT64_MAX : op_data->target_ver),
-                    .offset = w ? stripes[role].write_start : stripes[role].read_start,
-                    .len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
-                    .buf = w ? stripes[role].write_buf : stripes[role].read_buf,
-                });
-                bs->enqueue_op(subops[subop].bs_op);
-            }
-            else
-            {
-                subops[subop].op_type = OSD_OP_OUT;
-                subops[subop].send_list.push_back(subops[subop].req.buf, OSD_PACKET_SIZE);
-                subops[subop].peer_fd = this->osd_peer_fds.at(role_osd_num);
-                subops[subop].req.sec_rw = {
-                    .header = {
-                        .magic = SECONDARY_OSD_OP_MAGIC,
-                        .id = this->next_subop_id++,
-                        .opcode = (uint64_t)(w ? OSD_OP_SECONDARY_WRITE : OSD_OP_SECONDARY_READ),
-                    },
-                    .oid = {
-                        .inode = op_data->oid.inode,
-                        .stripe = op_data->oid.stripe | role,
-                    },
-                    .version = w ? 0 : (submit_type == SUBMIT_RMW_READ ? UINT64_MAX : op_data->target_ver),
-                    .offset = w ? stripes[role].write_start : stripes[role].read_start,
-                    .len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
-                };
-                subops[subop].buf = w ? stripes[role].write_buf : stripes[role].read_buf;
-                if (w && stripes[role].write_end > 0)
-                {
-                    subops[subop].send_list.push_back(stripes[role].write_buf, stripes[role].write_end - stripes[role].write_start);
-                }
-                subops[subop].callback = [cur_op, this](osd_op_t *subop)
-                {
-                    // so it doesn't get freed
-                    subop->buf = NULL;
-                    handle_primary_subop(cur_op, subop->reply.hdr.retval == subop->req.sec_rw.len, subop->reply.sec_rw.version);
-                };
-                outbox_push(clients[subops[subop].peer_fd], &subops[subop]);
-            }
-            subop++;
-        }
-    }
-}
-
-void osd_t::handle_primary_subop(osd_op_t *cur_op, int ok, uint64_t version)
+bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
 {
    osd_primary_op_data_t *op_data = cur_op->op_data;
-    op_data->fact_ver = version;
-    if (!ok)
+    // Check if actions are pending for this object
+    auto act_it = pg.flush_actions.lower_bound((obj_piece_id_t){
+        .oid = op_data->oid,
+        .osd_num = 0,
+    });
+    if (act_it != pg.flush_actions.end() &&
+        act_it->first.oid.inode == op_data->oid.inode &&
+        (act_it->first.oid.stripe & ~STRIPE_MASK) == op_data->oid.stripe)
    {
-        // FIXME: Handle errors
-        op_data->errors++;
+        pg.write_queue.emplace(op_data->oid, cur_op);
+        return false;
    }
-    else
+    // Check if there are other write requests to the same object
+    auto vo_it = pg.write_queue.find(op_data->oid);
+    if (vo_it != pg.write_queue.end())
    {
-        op_data->done++;
-    }
-    if ((op_data->errors + op_data->done) >= op_data->n_subops)
-    {
-        delete[] op_data->subops;
-        op_data->subops = NULL;
-        op_data->st++;
-        if (cur_op->req.hdr.opcode == OSD_OP_READ)
-        {
-            continue_primary_read(cur_op);
-        }
-        else if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
-        {
-            continue_primary_write(cur_op);
-        }
-        else if (cur_op->req.hdr.opcode == OSD_OP_SYNC)
-        {
-            continue_primary_sync(cur_op);
-        }
-        else
-        {
-            throw std::runtime_error("BUG: unknown opcode");
-        }
+        op_data->st = 1;
+        pg.write_queue.emplace(op_data->oid, cur_op);
+        return false;
    }
+    pg.write_queue.emplace(op_data->oid, cur_op);
+    return true;
 }

 void osd_t::continue_primary_write(osd_op_t *cur_op)
@@ -319,89 +187,116 @@ void osd_t::continue_primary_write(osd_op_t *cur_op)
        return;
    }
    osd_primary_op_data_t *op_data = cur_op->op_data;
-    // FIXME: Handle operation cancel
    auto & pg = pgs[op_data->pg_num];
    if (op_data->st == 1)      goto resume_1;
    else if (op_data->st == 2) goto resume_2;
    else if (op_data->st == 3) goto resume_3;
    else if (op_data->st == 4) goto resume_4;
    else if (op_data->st == 5) goto resume_5;
+    else if (op_data->st == 6) goto resume_6;
+    else if (op_data->st == 7) goto resume_7;
+    else if (op_data->st == 8) goto resume_8;
+    else if (op_data->st == 9) goto resume_9;
    assert(op_data->st == 0);
-    // Check if actions are pending for this object
+    printf("primary_write\n");
+    if (!check_write_queue(cur_op, pg))
    {
-        auto act_it = pg.obj_stab_actions.lower_bound((obj_piece_id_t){
-            .oid = op_data->oid,
-            .osd_num = 0,
-        });
-        if (act_it != pg.obj_stab_actions.end() &&
-            act_it->first.oid.inode == op_data->oid.inode &&
-            (act_it->first.oid.stripe & ~STRIPE_MASK) == op_data->oid.stripe)
-        {
-            // FIXME postpone the request until actions are done
-            free(op_data);
-            finish_primary_op(cur_op, -EIO);
-            return;
-        }
-    }
-    // Check if there are other write requests to the same object
-    {
-        auto vo_it = pg.write_queue.find(op_data->oid);
-        if (vo_it != pg.write_queue.end())
-        {
-            op_data->st = 1;
-            pg.write_queue.emplace(op_data->oid, cur_op);
-            return;
-        }
-        pg.write_queue.emplace(op_data->oid, cur_op);
+        return;
    }
 resume_1:
-    // Determine blocks to read
-    cur_op->rmw_buf = calc_rmw_reads(cur_op->buf, op_data->stripes, pg.cur_set.data(), pg.pg_size, pg.pg_minsize, pg.pg_cursize);
+    // Determine blocks to read and write
+    // Missing chunks are allowed to be overwritten even in incomplete objects
+    // FIXME: Allow to do small writes to the old (degraded/misplaced) OSD set for the lower performance impact
+    op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
+    cur_op->rmw_buf = calc_rmw(cur_op->buf, op_data->stripes, op_data->prev_set,
+        pg.pg_size, pg.pg_minsize, pg.pg_cursize, pg.cur_set.data(), bs_block_size);
    // Read required blocks
-    submit_primary_subops(SUBMIT_RMW_READ, pg.pg_size, pg.cur_set.data(), cur_op);
+    submit_primary_subops(SUBMIT_RMW_READ, pg.pg_size, op_data->prev_set, cur_op);
 resume_2:
    op_data->st = 2;
    return;
 resume_3:
+    if (op_data->errors > 0)
+    {
+        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
+        return;
+    }
    // Save version override for parallel reads
    pg.ver_override[op_data->oid] = op_data->fact_ver;
-    // Calculate parity
-    calc_rmw_parity(op_data->stripes, pg.pg_size);
+    // Recover missing stripes, calculate parity
+    calc_rmw_parity(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size);
    // Send writes
    submit_primary_subops(SUBMIT_WRITE, pg.pg_size, pg.cur_set.data(), cur_op);
 resume_4:
    op_data->st = 4;
    return;
 resume_5:
-    // Remember version as unstable
-    osd_num_t *osd_set = pg.cur_set.data();
-    for (int role = 0; role < pg.pg_size; role++)
+    if (op_data->errors > 0)
    {
-        if (osd_set[role] != 0)
-        {
-            this->unstable_writes[(osd_object_id_t){
-                .osd_num = osd_set[role],
-                .oid = {
-                    .inode = op_data->oid.inode,
-                    .stripe = op_data->oid.stripe | role,
-                },
-            }] = op_data->fact_ver;
-        }
+        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
+        return;
+    }
+    if (op_data->fact_ver == 1)
+    {
+        // Object is created
+        pg.clean_count++;
+        pg.total_count++;
+    }
+    if (op_data->object_state)
+    {
+        {
+            int recovery_type = op_data->object_state->state & (OBJ_DEGRADED|OBJ_INCOMPLETE) ? 0 : 1;
+            recovery_stat_count[0][recovery_type]++;
+            if (!recovery_stat_count[0][recovery_type])
+            {
+                recovery_stat_count[0][recovery_type]++;
+                recovery_stat_bytes[0][recovery_type] = 0;
+            }
+            for (int role = 0; role < pg.pg_size; role++)
+            {
+                recovery_stat_bytes[0][recovery_type] += op_data->stripes[role].write_end - op_data->stripes[role].write_start;
+            }
+        }
+        if (op_data->object_state->state & OBJ_MISPLACED)
+        {
+            // Remove extra chunks
+            submit_primary_del_subops(cur_op, pg.cur_set.data(), op_data->object_state->osd_set);
+            if (op_data->n_subops > 0)
+            {
+resume_8:
+                op_data->st = 8;
+                return;
+resume_9:
+                if (op_data->errors > 0)
+                {
+                    pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
+                    return;
+                }
+            }
+        }
+        // Clear object state
+        remove_object_from_state(op_data->oid, op_data->object_state, pg);
+        pg.clean_count++;
    }
-    // Remember PG as dirty to drop the connection when PG goes offline
-    // (this is required because of the "lazy sync")
-    this->clients[cur_op->peer_fd].dirty_pgs.insert(op_data->pg_num);
    // Remove version override
    pg.ver_override.erase(op_data->oid);
-    finish_primary_op(cur_op, cur_op->req.rw.len);
-    // Continue other write operations to the same object
+    // FIXME: Check for immediate_commit == IMMEDIATE_SMALL
+resume_6:
+resume_7:
+    if (!remember_unstable_write(cur_op, pg, pg.cur_loc_set, 6))
+    {
+        return;
+    }
+    object_id oid = op_data->oid;
+    finish_op(cur_op, cur_op->req.rw.len);
+    // Continue other write operations to the same object
+    auto next_it = pg.write_queue.find(oid);
+    auto this_it = next_it;
+    if (this_it != pg.write_queue.end() && this_it->second == cur_op)
    {
-        auto next_it = pg.write_queue.find(op_data->oid);
-        auto this_it = next_it;
        next_it++;
        pg.write_queue.erase(this_it);
-        if (next_it != pg.write_queue.end() &&
-            next_it->first == op_data->oid)
+        if (next_it != pg.write_queue.end() && next_it->first == oid)
        {
            osd_op_t *next_op = next_it->second;
            continue_primary_write(next_op);
@@ -409,27 +304,99 @@ resume_5:
    }
 }

+bool osd_t::remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state)
+{
+    osd_primary_op_data_t *op_data = cur_op->op_data;
+    if (op_data->st == base_state)
+    {
+        goto resume_6;
+    }
+    else if (op_data->st == base_state+1)
+    {
+        goto resume_7;
+    }
+    if (immediate_commit == IMMEDIATE_ALL)
+    {
+        op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
+        op_data->unstable_writes = new obj_ver_id[loc_set.size()];
+        {
+            int last_start = 0;
+            for (auto & chunk: loc_set)
+            {
+                op_data->unstable_writes[last_start] = (obj_ver_id){
+                    .oid = {
+                        .inode = op_data->oid.inode,
+                        .stripe = op_data->oid.stripe | chunk.role,
+                    },
+                    .version = op_data->fact_ver,
+                };
+                op_data->unstable_write_osds->push_back((unstable_osd_num_t){
+                    .osd_num = chunk.osd_num,
+                    .start = last_start,
+                    .len = 1,
+                });
+                last_start++;
+            }
+        }
+        submit_primary_stab_subops(cur_op);
+resume_6:
+        op_data->st = 6;
+        return false;
+resume_7:
+        // FIXME: Free those in the destructor?
+        delete op_data->unstable_write_osds;
+        delete[] op_data->unstable_writes;
+        op_data->unstable_writes = NULL;
+        op_data->unstable_write_osds = NULL;
+        if (op_data->errors > 0)
+        {
+            pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
+            return false;
+        }
+    }
+    else
+    {
+        // Remember version as unstable
+        for (auto & chunk: loc_set)
+        {
+            this->unstable_writes[(osd_object_id_t){
+                .osd_num = chunk.osd_num,
+                .oid = {
+                    .inode = op_data->oid.inode,
+                    .stripe = op_data->oid.stripe | chunk.role,
+                },
+            }] = op_data->fact_ver;
+        }
+        // Remember PG as dirty to drop the connection when PG goes offline
+        // (this is required because of the "lazy sync")
+        c_cli.clients[cur_op->peer_fd].dirty_pgs.insert(op_data->pg_num);
+        dirty_pgs.insert(op_data->pg_num);
+    }
+    return true;
+}
+
 // Save and clear unstable_writes -> SYNC all -> STABLE all
-// FIXME: Run regular automatic syncs based on the number of unstable writes and/or system time
 void osd_t::continue_primary_sync(osd_op_t *cur_op)
 {
    if (!cur_op->op_data)
    {
        cur_op->op_data = (osd_primary_op_data_t*)calloc(sizeof(osd_primary_op_data_t), 1);
    }
-    if (cur_op->op_data->st == 1)      goto resume_1;
-    else if (cur_op->op_data->st == 2) goto resume_2;
-    else if (cur_op->op_data->st == 3) goto resume_3;
-    else if (cur_op->op_data->st == 4) goto resume_4;
-    else if (cur_op->op_data->st == 5) goto resume_5;
-    else if (cur_op->op_data->st == 6) goto resume_6;
-    assert(cur_op->op_data->st == 0);
+    osd_primary_op_data_t *op_data = cur_op->op_data;
+    if (op_data->st == 1)      goto resume_1;
+    else if (op_data->st == 2) goto resume_2;
+    else if (op_data->st == 3) goto resume_3;
+    else if (op_data->st == 4) goto resume_4;
+    else if (op_data->st == 5) goto resume_5;
+    else if (op_data->st == 6) goto resume_6;
+    assert(op_data->st == 0);
+    printf("primary_sync\n");
    if (syncs_in_progress.size() > 0)
    {
        // Wait for previous syncs, if any
        // FIXME: We may try to execute the current one in parallel, like in Blockstore, but I'm not sure if it matters at all
        syncs_in_progress.push_back(cur_op);
-        cur_op->op_data->st = 1;
+        op_data->st = 1;
 resume_1:
        return;
    }
@@ -438,27 +405,28 @@ resume_1:
        syncs_in_progress.push_back(cur_op);
    }
 resume_2:
-    // FIXME: Handle operation cancel
    if (unstable_writes.size() == 0)
    {
        // Nothing to sync
        goto finish;
    }
    // Save and clear unstable_writes
-    // FIXME: This is possible to do it on a per-client basis
-    // It would be cool not to copy them here at all, but someone has to deduplicate them by object IDs anyway
-    cur_op->op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
-    cur_op->op_data->unstable_writes = new obj_ver_id[unstable_writes.size()];
+    // In theory it is possible to do it on a per-client basis, but this seems to be an unnecessary complication
+    // It would be cool not to copy these here at all, but someone has to deduplicate them by object IDs anyway
    {
+        op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
+        op_data->unstable_writes = new obj_ver_id[this->unstable_writes.size()];
+        op_data->dirty_pgs = new pg_num_t[dirty_pgs.size()];
+        op_data->dirty_pg_count = dirty_pgs.size();
        osd_num_t last_osd = 0;
        int last_start = 0, last_end = 0;
-        for (auto it = unstable_writes.begin(); it != unstable_writes.end(); it++)
+        for (auto it = this->unstable_writes.begin(); it != this->unstable_writes.end(); it++)
        {
            if (last_osd != it->first.osd_num)
            {
                if (last_osd != 0)
                {
-                    cur_op->op_data->unstable_write_osds->push_back((unstable_osd_num_t){
+                    op_data->unstable_write_osds->push_back((unstable_osd_num_t){
                        .osd_num = last_osd,
                        .start = last_start,
                        .len = last_end - last_start,
@@ -467,7 +435,7 @@ resume_2:
                last_osd = it->first.osd_num;
                last_start = last_end;
            }
-            cur_op->op_data->unstable_writes[last_end] = (obj_ver_id){
+            op_data->unstable_writes[last_end] = (obj_ver_id){
                .oid = it->first.oid,
                .version = it->second,
            };
@@ -475,129 +443,226 @@ resume_2:
        }
        if (last_osd != 0)
        {
-            cur_op->op_data->unstable_write_osds->push_back((unstable_osd_num_t){
+            op_data->unstable_write_osds->push_back((unstable_osd_num_t){
                .osd_num = last_osd,
                .start = last_start,
                .len = last_end - last_start,
            });
        }
+        int dpg = 0;
+        for (auto dirty_pg_num: dirty_pgs)
+        {
+            pgs[dirty_pg_num].inflight++;
+            op_data->dirty_pgs[dpg++] = dirty_pg_num;
+        }
+        dirty_pgs.clear();
+        this->unstable_writes.clear();
    }
-    unstable_writes.clear();
-    // SYNC
-    submit_primary_sync_subops(cur_op);
+    if (immediate_commit != IMMEDIATE_ALL)
+    {
+        // SYNC
+        submit_primary_sync_subops(cur_op);
 resume_3:
-    cur_op->op_data->st = 3;
-    return;
+        op_data->st = 3;
+        return;
 resume_4:
+        if (op_data->errors > 0)
+        {
+            goto resume_6;
+        }
+    }
    // Stabilize version sets
    submit_primary_stab_subops(cur_op);
 resume_5:
-    cur_op->op_data->st = 5;
+    op_data->st = 5;
    return;
 resume_6:
-    // FIXME: Free them correctly (via a destructor or so)
-    delete cur_op->op_data->unstable_write_osds;
-    delete[] cur_op->op_data->unstable_writes;
-    cur_op->op_data->unstable_writes = NULL;
-    cur_op->op_data->unstable_write_osds = NULL;
+    if (op_data->errors > 0)
+    {
+        // Return objects back into the unstable write set
+        for (auto unstable_osd: *(op_data->unstable_write_osds))
+        {
+            for (int i = 0; i < unstable_osd.len; i++)
+            {
+                // Except those from peered PGs
+                auto & w = op_data->unstable_writes[unstable_osd.start + i]; // each OSD owns the range [start, start+len) of the shared array
+                pg_num_t wpg = map_to_pg(w.oid);
+                if (pgs[wpg].state & PG_ACTIVE)
+                {
+                    uint64_t & dest = this->unstable_writes[(osd_object_id_t){
+                        .osd_num = unstable_osd.osd_num,
+                        .oid = w.oid,
+                    }];
+                    dest = dest < w.version ? w.version : dest; // keep the newest version if the object was written again meanwhile
+                    dirty_pgs.insert(wpg);
+                }
+            }
+        }
+    }
+    for (int i = 0; i < op_data->dirty_pg_count; i++) // undo the inflight++ taken for every dirty PG when this sync captured them
+    {
+        auto & pg = pgs.at(op_data->dirty_pgs[i]);
+        pg.inflight--;
+        if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
+        {
+            finish_stop_pg(pg);
+        }
+    }
+    // FIXME: Free those in the destructor?
+    delete[] op_data->dirty_pgs; // allocated with new pg_num_t[...], so it needs the array form
+    delete op_data->unstable_write_osds;
+    delete[] op_data->unstable_writes;
+    op_data->dirty_pgs = NULL; // finish_op() asserts that all three pointers are NULL
+    op_data->unstable_writes = NULL;
+    op_data->unstable_write_osds = NULL;
+    if (op_data->errors > 0)
+    {
+        finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
+    }
+    else
+    {
 finish:
+        if (cur_op->peer_fd) // only ops that came from a client connection have per-client dirty PGs to clear
+        {
+            auto it = c_cli.clients.find(cur_op->peer_fd);
+            if (it != c_cli.clients.end())
+                it->second.dirty_pgs.clear();
+        }
+        finish_op(cur_op, 0);
+    }
    assert(syncs_in_progress.front() == cur_op);
    syncs_in_progress.pop_front();
-    finish_primary_op(cur_op, 0);
    if (syncs_in_progress.size() > 0)
    {
        cur_op = syncs_in_progress.front();
-        cur_op->op_data->st++;
+        op_data = cur_op->op_data; // op_data of the finished op was freed by finish_op(); switch to the next queued sync
+        op_data->st++;
        goto resume_2;
    }
 }

-void osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
+// Decrement pg_osd_set_state_t's object_count and change PG state accordingly: exactly one of the incomplete/degraded/misplaced counters is adjusted, checked in that order; throws std::runtime_error if none of the three flags is set
+void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t & pg)
 {
-    osd_primary_op_data_t *op_data = cur_op->op_data;
-    int n_osds = op_data->unstable_write_osds->size();
-    osd_op_t *subops = new osd_op_t[n_osds];
-    op_data->done = op_data->errors = 0;
-    op_data->n_subops = n_osds;
-    op_data->subops = subops;
-    for (int i = 0; i < n_osds; i++)
+    if (object_state->state & OBJ_INCOMPLETE)
    {
-        osd_num_t sync_osd = (*(op_data->unstable_write_osds))[i].osd_num;
-        if (sync_osd == this->osd_num)
+        // Successful write means that object is not incomplete anymore
+        this->incomplete_objects--;
+        pg.incomplete_objects.erase(oid);
+        if (!pg.incomplete_objects.size()) // last incomplete object of the PG: drop the PG-level flag and report
        {
-            subops[i].bs_op = new blockstore_op_t({
-                .opcode = BS_OP_SYNC,
-                .callback = [cur_op, this](blockstore_op_t *subop)
-                {
-                    handle_primary_subop(cur_op, subop->retval == 0, 0);
-                },
-            });
-            bs->enqueue_op(subops[i].bs_op);
+            pg.state = pg.state & ~PG_HAS_INCOMPLETE;
+            report_pg_state(pg);
        }
-        else
+    }
+    else if (object_state->state & OBJ_DEGRADED)
+    {
+        this->degraded_objects--;
+        pg.degraded_objects.erase(oid);
+        if (!pg.degraded_objects.size()) // last degraded object of the PG
        {
-            subops[i].op_type = OSD_OP_OUT;
-            subops[i].send_list.push_back(subops[i].req.buf, OSD_PACKET_SIZE);
-            subops[i].peer_fd = osd_peer_fds.at(sync_osd);
-            subops[i].req.sec_sync = {
-                .header = {
-                    .magic = SECONDARY_OSD_OP_MAGIC,
-                    .id = this->next_subop_id++,
-                    .opcode = OSD_OP_SECONDARY_SYNC,
-                },
-            };
-            subops[i].callback = [cur_op, this](osd_op_t *subop)
-            {
-                handle_primary_subop(cur_op, subop->reply.hdr.retval == 0, 0);
-            };
-            outbox_push(clients[subops[i].peer_fd], &subops[i]);
+            pg.state = pg.state & ~PG_HAS_DEGRADED;
+            report_pg_state(pg);
        }
    }
+    else if (object_state->state & OBJ_MISPLACED)
+    {
+        this->misplaced_objects--;
+        pg.misplaced_objects.erase(oid);
+        if (!pg.misplaced_objects.size()) // last misplaced object of the PG
+        {
+            pg.state = pg.state & ~PG_HAS_MISPLACED;
+            report_pg_state(pg);
+        }
+    }
+    else
+    {
+        throw std::runtime_error("BUG: Invalid object state: "+std::to_string(object_state->state));
+    }
+    object_state->object_count--;
+    if (!object_state->object_count) // no object uses this OSD set any more: remove its entry from the PG's state dictionary
+    {
+        pg.state_dict.erase(object_state->osd_set);
+    }
 }

-void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
+void osd_t::continue_primary_del(osd_op_t *cur_op) // Resumable (op_data->st driven) delete: read to learn the version, submit deletes, fix PG counters, then resume the next queued op for the same object
 {
-    osd_primary_op_data_t *op_data = cur_op->op_data;
-    int n_osds = op_data->unstable_write_osds->size();
-    osd_op_t *subops = new osd_op_t[n_osds];
-    op_data->done = op_data->errors = 0;
-    op_data->n_subops = n_osds;
-    op_data->subops = subops;
-    for (int i = 0; i < n_osds; i++)
+    if (!cur_op->op_data && !prepare_primary_rw(cur_op)) // first entry only: op_data does not exist yet
    {
-        auto & stab_osd = (*(op_data->unstable_write_osds))[i];
-        if (stab_osd.osd_num == this->osd_num)
+        return;
+    }
+    osd_primary_op_data_t *op_data = cur_op->op_data;
+    auto & pg = pgs[op_data->pg_num];
+    if (op_data->st == 1)      goto resume_1;
+    else if (op_data->st == 2) goto resume_2;
+    else if (op_data->st == 3) goto resume_3;
+    else if (op_data->st == 4) goto resume_4;
+    else if (op_data->st == 5) goto resume_5;
+    assert(op_data->st == 0);
+    // Delete is forbidden even in active PGs if they're also degraded or have previous dead OSDs
+    if (pg.state & (PG_DEGRADED | PG_LEFT_ON_DEAD))
+    {
+        finish_op(cur_op, -EBUSY);
+        return;
+    }
+    if (!check_write_queue(cur_op, pg)) // queued behind a flush or another write to the same object
+    {
+        return;
+    }
+resume_1:
+    // Determine which OSDs contain this object and delete it
+    op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
+    // Submit 1 read to determine the actual version number
+    submit_primary_subops(SUBMIT_RMW_READ, pg.pg_size, op_data->prev_set, cur_op);
+resume_2:
+    op_data->st = 2; // wait for the read subops
+    return;
+resume_3:
+    if (op_data->errors > 0)
+    {
+        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
+        return;
+    }
+    // Save version override for parallel reads
+    pg.ver_override[op_data->oid] = op_data->fact_ver;
+    // Submit deletes
+    op_data->fact_ver++;
+    submit_primary_del_subops(cur_op, NULL, op_data->object_state ? op_data->object_state->osd_set : pg.cur_loc_set);
+resume_4:
+    op_data->st = 4; // wait for the delete subops
+    return;
+resume_5:
+    if (op_data->errors > 0)
+    {
+        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
+        return;
+    }
+    // Remove version override
+    pg.ver_override.erase(op_data->oid);
+    // Adjust PG stats after "instant stabilize", because we need object_state above
+    if (!op_data->object_state) // no special state recorded for the object: it was counted as clean
+    {
+        pg.clean_count--;
+    }
+    else
+    {
+        remove_object_from_state(op_data->oid, op_data->object_state, pg);
+    }
+    pg.total_count--;
+    object_id oid = op_data->oid; // copy first: finish_op() frees op_data
+    finish_op(cur_op, cur_op->req.rw.len);
+    // Continue other write operations to the same object
+    auto next_it = pg.write_queue.find(oid);
+    auto this_it = next_it;
+    if (this_it != pg.write_queue.end() && this_it->second == cur_op) // only if the first queued entry for oid is this op
+    {
+        next_it++;
+        pg.write_queue.erase(this_it);
+        if (next_it != pg.write_queue.end() &&
+            next_it->first == oid)
        {
-            subops[i].bs_op = new blockstore_op_t({
-                .opcode = BS_OP_STABLE,
-                .callback = [cur_op, this](blockstore_op_t *subop)
-                {
-                    handle_primary_subop(cur_op, subop->retval == 0, 0);
-                },
-                .len = (uint32_t)stab_osd.len,
-                .buf = (void*)(op_data->unstable_writes + stab_osd.start),
-            });
-            bs->enqueue_op(subops[i].bs_op);
-        }
-        else
-        {
-            subops[i].op_type = OSD_OP_OUT;
-            subops[i].send_list.push_back(subops[i].req.buf, OSD_PACKET_SIZE);
-            subops[i].peer_fd = osd_peer_fds.at(stab_osd.osd_num);
-            subops[i].req.sec_stab = {
-                .header = {
-                    .magic = SECONDARY_OSD_OP_MAGIC,
-                    .id = this->next_subop_id++,
-                    .opcode = OSD_OP_SECONDARY_STABILIZE,
-                },
-                .len = (uint64_t)(stab_osd.len * sizeof(obj_ver_id)),
-            };
-            subops[i].send_list.push_back(op_data->unstable_writes + stab_osd.start, stab_osd.len * sizeof(obj_ver_id));
-            subops[i].callback = [cur_op, this](osd_op_t *subop)
-            {
-                handle_primary_subop(cur_op, subop->reply.hdr.retval == 0, 0);
-            };
-            outbox_push(clients[subops[i].peer_fd], &subops[i]);
+            osd_op_t *next_op = next_it->second;
+            continue_primary_write(next_op); // NOTE(review): deletes are queued through check_write_queue() too, yet the next op is always resumed as a write -- confirm this is intended
        }
    }
 }
--- a/osd_primary.h
+++ b/osd_primary.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include "osd.h"
+#include "osd_rmw.h"
+
+#define SUBMIT_READ 0 // submit_primary_subops(): read subops at op_data->target_ver
+#define SUBMIT_RMW_READ 1 // submit_primary_subops(): read subops at version UINT64_MAX; at least one subop is issued even when no stripe needs reading
+#define SUBMIT_WRITE 2 // submit_primary_subops(): write subops at op_data->fact_ver+1, one per non-zero OSD of the set
+
+struct unstable_osd_num_t // One OSD's slice of an op's unstable_writes array
+{
+    osd_num_t osd_num; // OSD the slice belongs to
+    int start, len; // slice is unstable_writes[start .. start+len-1]
+};
+
+struct osd_primary_op_data_t // Per-op state of a primary OSD operation. NOTE(review): continue_primary_sync() allocates it with calloc() and finish_op() releases it with free(), so no constructor/destructor runs on those paths
+{
+    int st = 0; // resume point of the continue_primary_*() state machines; 0 = not started
+    pg_num_t pg_num; // finish_op() does PG inflight accounting only when this is > 0
+    object_id oid;
+    uint64_t target_ver; // version used for SUBMIT_READ subops
+    uint64_t fact_ver = 0; // writes are submitted with fact_ver+1; reset to 0 by submit_primary_subops()
+    int n_subops = 0, done = 0, errors = 0, epipe = 0; // n_subops/done/errors are (re)set by submit_primary_subops(); epipe > 0 makes callers report -EPIPE instead of -EIO
+    int degraded = 0, pg_size, pg_minsize;
+    osd_rmw_stripe_t *stripes; // per-role read/write ranges and buffers used to build subops
+    osd_op_t *subops = NULL; // new[] array created by submit_primary_subops(); must be NULL again before finish_op()
+    uint64_t *prev_set = NULL; // result of get_object_osd_set()
+    pg_osd_set_state_t *object_state = NULL; // filled by get_object_osd_set(); the write/delete paths treat NULL as "object has no special state"
+
+    // for sync. oops, requires freeing (finish_op() asserts these pointers are NULL)
+    std::vector<unstable_osd_num_t> *unstable_write_osds = NULL; // per-OSD slices of unstable_writes
+    pg_num_t *dirty_pgs = NULL; // new[] array of dirty_pg_count PG numbers captured by continue_primary_sync()
+    int dirty_pg_count = 0;
+    obj_ver_id *unstable_writes = NULL; // new[] array of object versions to stabilize
+};
--- a/osd_primary_subops.cpp
+++ b/osd_primary_subops.cpp
@@ -0,0 +1,551 @@
+#include "osd_primary.h"
+
+void osd_t::autosync() // Starts one internal OSD_OP_SYNC unless immediate_commit == IMMEDIATE_ALL or a previous autosync op still exists
+{
+    // FIXME Autosync based on the number of unstable writes to prevent
+    // "journal_sector_buffer_count is too low for this batch" errors
+    if (immediate_commit != IMMEDIATE_ALL && !autosync_op)
+    {
+        autosync_op = new osd_op_t();
+        autosync_op->op_type = OSD_OP_IN;
+        autosync_op->req = {
+            .sync = {
+                .header = {
+                    .magic = SECONDARY_OSD_OP_MAGIC,
+                    .id = 1,
+                    .opcode = OSD_OP_SYNC,
+                },
+            },
+        };
+        autosync_op->callback = [this](osd_op_t *op) // completion callback: log a failure, then free the op so the next autosync() can start
+        {
+            if (op->reply.hdr.retval < 0)
+            {
+                printf("Warning: automatic sync resulted in an error: %ld (%s)\n", -op->reply.hdr.retval, strerror(-op->reply.hdr.retval));
+            }
+            delete autosync_op; // destroys the op that owns this callback; finish_op() invokes a copy of the callback for that reason
+            autosync_op = NULL;
+        };
+        exec_op(autosync_op);
+    }
+}
+
+void osd_t::finish_op(osd_op_t *cur_op, int retval) // Completes cur_op: releases op_data and PG accounting, then runs the local callback (peer_fd == 0) or queues a reply carrying retval
+{
+    inflight_ops--;
+    if (cur_op->op_data)
+    {
+        if (cur_op->op_data->pg_num > 0) // ops with pg_num == 0 are not counted against any PG
+        {
+            auto & pg = pgs[cur_op->op_data->pg_num];
+            pg.inflight--;
+            assert(pg.inflight >= 0);
+            if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch) // last in-flight op of a stopping PG with no flush batch
+            {
+                finish_stop_pg(pg);
+            }
+        }
+        assert(!cur_op->op_data->subops); // callers must have freed and NULLed these four pointers; free() below does not release them
+        assert(!cur_op->op_data->unstable_write_osds);
+        assert(!cur_op->op_data->unstable_writes);
+        assert(!cur_op->op_data->dirty_pgs);
+        free(cur_op->op_data);
+        cur_op->op_data = NULL;
+    }
+    if (!cur_op->peer_fd) // no peer: internally generated op, completion is delivered through its callback
+    {
+        // Copy lambda to be unaffected by `delete op`
+        std::function<void(osd_op_t*)>(cur_op->callback)(cur_op);
+    }
+    else
+    {
+        // FIXME add separate magic number
+        auto cl_it = c_cli.clients.find(cur_op->peer_fd);
+        if (cl_it != c_cli.clients.end())
+        {
+            cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
+            cur_op->reply.hdr.id = cur_op->req.hdr.id; // reply echoes the request id and opcode
+            cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
+            cur_op->reply.hdr.retval = retval;
+            c_cli.outbox_push(cur_op);
+        }
+        else
+        {
+            delete cur_op; // client is no longer registered: nobody to reply to
+        }
+    }
+}
+
+void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t* osd_set, osd_op_t *cur_op) // Builds op_data->subops for one read/RMW-read/write phase and submits each to the local blockstore or to the peer OSD of its role; osd_set[role] == 0 means no OSD for that role
+{
+    bool w = submit_type == SUBMIT_WRITE;
+    osd_primary_op_data_t *op_data = cur_op->op_data;
+    osd_rmw_stripe_t *stripes = op_data->stripes;
+    // Allocate subops
+    int n_subops = 0, zero_read = -1;
+    for (int role = 0; role < pg_size; role++)
+    {
+        if (osd_set[role] == this->osd_num || osd_set[role] != 0 && zero_read == -1) // candidate for the fallback read: the first available role, overridden by the local OSD's role
+        {
+            zero_read = role;
+        }
+        if (osd_set[role] != 0 && (w || stripes[role].read_end != 0))
+        {
+            n_subops++;
+        }
+    }
+    if (!n_subops && submit_type == SUBMIT_RMW_READ) // nothing to read, but an RMW still needs one subop (the callers use it to learn the version)
+    {
+        n_subops = 1;
+    }
+    else
+    {
+        zero_read = -1; // fallback read not needed
+    }
+    uint64_t op_version = w ? op_data->fact_ver+1 : (submit_type == SUBMIT_RMW_READ ? UINT64_MAX : op_data->target_ver); // must be computed before fact_ver is reset below
+    osd_op_t *subops = new osd_op_t[n_subops];
+    op_data->fact_ver = 0;
+    op_data->done = op_data->errors = 0;
+    op_data->n_subops = n_subops;
+    op_data->subops = subops;
+    int i = 0;
+    for (int role = 0; role < pg_size; role++)
+    {
+        // We always submit zero-length writes to all replicas, even if the stripe is not modified
+        if (!(w || stripes[role].read_end != 0 || zero_read == role))
+        {
+            continue;
+        }
+        osd_num_t role_osd_num = osd_set[role];
+        if (role_osd_num != 0)
+        {
+            if (role_osd_num == this->osd_num) // local chunk: goes straight to the blockstore
+            {
+                clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
+                subops[i].op_type = (uint64_t)cur_op; // op_type carries the parent op pointer; handle_primary_bs_subop() casts it back
+                subops[i].bs_op = new blockstore_op_t({
+                    .opcode = (uint64_t)(w ? BS_OP_WRITE : BS_OP_READ),
+                    .callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
+                    {
+                        handle_primary_bs_subop(subop);
+                    },
+                    .oid = {
+                        .inode = op_data->oid.inode,
+                        .stripe = op_data->oid.stripe | role, // role is OR-ed into the stripe field to address this chunk
+                    },
+                    .version = op_version,
+                    .offset = w ? stripes[role].write_start : stripes[role].read_start,
+                    .len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
+                    .buf = w ? stripes[role].write_buf : stripes[role].read_buf,
+                });
+#ifdef OSD_DEBUG
+                printf(
+                    "Submit %s to local: %lu:%lu v%lu %u-%u\n", w ? "write" : "read",
+                    op_data->oid.inode, op_data->oid.stripe | role, op_version,
+                    subops[i].bs_op->offset, subops[i].bs_op->len
+                );
+#endif
+                bs->enqueue_op(subops[i].bs_op);
+            }
+            else // remote chunk: send a secondary read/write to the peer OSD
+            {
+                subops[i].op_type = OSD_OP_OUT;
+                subops[i].send_list.push_back(subops[i].req.buf, OSD_PACKET_SIZE);
+                subops[i].peer_fd = c_cli.osd_peer_fds.at(role_osd_num); // at() throws if there is no fd registered for that OSD
+                subops[i].req.sec_rw = {
+                    .header = {
+                        .magic = SECONDARY_OSD_OP_MAGIC,
+                        .id = c_cli.next_subop_id++,
+                        .opcode = (uint64_t)(w ? OSD_OP_SECONDARY_WRITE : OSD_OP_SECONDARY_READ),
+                    },
+                    .oid = {
+                        .inode = op_data->oid.inode,
+                        .stripe = op_data->oid.stripe | role,
+                    },
+                    .version = op_version,
+                    .offset = w ? stripes[role].write_start : stripes[role].read_start,
+                    .len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
+                };
+#ifdef OSD_DEBUG
+                printf(
+                    "Submit %s to osd %lu: %lu:%lu v%lu %u-%u\n", w ? "write" : "read", role_osd_num,
+                    op_data->oid.inode, op_data->oid.stripe | role, op_version,
+                    subops[i].req.sec_rw.offset, subops[i].req.sec_rw.len
+                );
+#endif
+                subops[i].buf = w ? stripes[role].write_buf : stripes[role].read_buf;
+                if (w && stripes[role].write_end > 0) // write payload follows the request packet
+                {
+                    subops[i].send_list.push_back(stripes[role].write_buf, stripes[role].write_end - stripes[role].write_start);
+                }
+                subops[i].callback = [cur_op, this](osd_op_t *subop)
+                {
+                    int fail_fd = subop->req.hdr.opcode == OSD_OP_SECONDARY_WRITE &&
+                        subop->reply.hdr.retval != subop->req.sec_rw.len ? subop->peer_fd : -1; // captured first: handle_primary_subop() may finish the parent op
+                    // so it doesn't get freed
+                    subop->buf = NULL;
+                    handle_primary_subop(subop, cur_op);
+                    if (fail_fd >= 0)
+                    {
+                        // write operation failed, drop the connection
+                        c_cli.stop_client(fail_fd);
+                    }
+                };
+                c_cli.outbox_push(&subops[i]);
+            }
+            i++;
+        }
+    }
+}
+
+// Translation table from a local blockstore opcode to the secondary OSD opcode
+// under which the same operation is reported. It is indexed directly by
+// blockstore_op_t::opcode; slot 0 is a placeholder that maps to 0.
+// NOTE(review): the layout assumes the BS_OP_* constants are consecutive and start
+// at 1 in exactly the order listed here - confirm against the blockstore header.
+// No bounds check is performed by the users of this table.
+static uint64_t bs_op_to_osd_op[] = {
+    0,
+    OSD_OP_SECONDARY_READ,      // BS_OP_READ
+    OSD_OP_SECONDARY_WRITE,     // BS_OP_WRITE
+    OSD_OP_SECONDARY_SYNC,      // BS_OP_SYNC
+    OSD_OP_SECONDARY_STABILIZE, // BS_OP_STABLE
+    OSD_OP_SECONDARY_DELETE,    // BS_OP_DELETE
+    OSD_OP_SECONDARY_LIST,      // BS_OP_LIST
+    OSD_OP_SECONDARY_ROLLBACK,  // BS_OP_ROLLBACK
+    OSD_OP_TEST_SYNC_STAB_ALL,  // BS_OP_SYNC_STAB_ALL
+};
+
+// Completion handler for a subop that was executed by the local blockstore.
+// Converts the blockstore result into the same request/reply form a remote
+// subop would have, frees the blockstore operation and passes the subop on to
+// handle_primary_subop().
+// The parent operation pointer is carried in subop->op_type.
+// Throws std::runtime_error when any operation other than a read returns
+// an unexpected result.
+void osd_t::handle_primary_bs_subop(osd_op_t *subop)
+{
+    osd_op_t *cur_op = (osd_op_t*)subop->op_type;
+    blockstore_op_t *bs_op = subop->bs_op;
+    const bool is_rw = (bs_op->opcode == BS_OP_READ || bs_op->opcode == BS_OP_WRITE);
+    // Reads and writes must return their length, everything else must return 0
+    int expected = is_rw ? bs_op->len : 0;
+    if (bs_op->opcode != BS_OP_READ && bs_op->retval != expected)
+    {
+        // A failed local modification is fatal
+        throw std::runtime_error(
+            "local blockstore modification failed (opcode = "+std::to_string(bs_op->opcode)+
+            " retval = "+std::to_string(bs_op->retval)+")"
+        );
+    }
+    // Statistics must be collected while bs_op is still alive
+    add_bs_subop_stats(subop);
+    // Fill the fields that handle_primary_subop() inspects
+    subop->req.hdr.opcode = bs_op_to_osd_op[bs_op->opcode];
+    subop->reply.hdr.retval = bs_op->retval;
+    if (is_rw)
+    {
+        subop->req.sec_rw.len = bs_op->len;
+        subop->reply.sec_rw.version = bs_op->version;
+    }
+    subop->bs_op = NULL;
+    delete bs_op;
+    handle_primary_subop(subop, cur_op);
+}
+
+// Account a finished local blockstore subop in the c_cli.stats counters
+// (operation count, total latency in microseconds and, for reads and writes,
+// byte count), under the secondary OSD opcode it corresponds to.
+// Must be called while subop->bs_op is still valid.
+void osd_t::add_bs_subop_stats(osd_op_t *subop)
+{
+    blockstore_op_t *bs_op = subop->bs_op;
+    const uint64_t osd_opcode = bs_op_to_osd_op[bs_op->opcode];
+    timespec now;
+    clock_gettime(CLOCK_REALTIME, &now);
+    auto & stats = c_cli.stats;
+    if (++stats.op_stat_count[osd_opcode] == 0)
+    {
+        // The incremented counter reads zero - presumably it wrapped around,
+        // so restart all accumulators for this opcode
+        stats.op_stat_count[osd_opcode] = 1;
+        stats.op_stat_sum[osd_opcode] = 0;
+        stats.op_stat_bytes[osd_opcode] = 0;
+    }
+    // Elapsed time since subop->tv_begin, in microseconds
+    auto elapsed_us = (
+        (now.tv_sec - subop->tv_begin.tv_sec)*1000000 +
+        (now.tv_nsec - subop->tv_begin.tv_nsec)/1000
+    );
+    stats.op_stat_sum[osd_opcode] += elapsed_us;
+    if (osd_opcode == OSD_OP_SECONDARY_READ || osd_opcode == OSD_OP_SECONDARY_WRITE)
+    {
+        stats.op_stat_bytes[osd_opcode] += bs_op->len;
+    }
+}
+
+// Common completion handler for one subop (local or remote) of the primary
+// operation <cur_op>.
+// Counts the subop as done or failed in cur_op->op_data, verifies that all
+// read/write subops report the same version, and once every subop has been
+// accounted for frees the subop array, advances op_data->st and resumes the
+// primary operation through the matching continue_primary_*() function.
+// Throws std::runtime_error if subops return different versions or if the
+// primary operation has an opcode that is not handled here.
+// NOTE: <subop> is an element of op_data->subops, so it must not be touched
+// by the caller after this function returns - the array may already be deleted.
+void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
+{
+    uint64_t opcode = subop->req.hdr.opcode;
+    int retval = subop->reply.hdr.retval;
+    // Reads and writes are expected to return their length, other subops 0
+    int expected = opcode == OSD_OP_SECONDARY_READ || opcode == OSD_OP_SECONDARY_WRITE
+        ? subop->req.sec_rw.len : 0;
+    osd_primary_op_data_t *op_data = cur_op->op_data;
+    if (retval != expected)
+    {
+        printf("%s subop failed: retval = %d (expected %d)\n", osd_op_names[opcode], retval, expected);
+        // EPIPE failures are counted separately in addition to the total error count
+        if (retval == -EPIPE)
+        {
+            op_data->epipe++;
+        }
+        op_data->errors++;
+    }
+    else
+    {
+        op_data->done++;
+        if (opcode == OSD_OP_SECONDARY_READ || opcode == OSD_OP_SECONDARY_WRITE)
+        {
+            uint64_t version = subop->reply.sec_rw.version;
+#ifdef OSD_DEBUG
+            // A subop without a known client connection is reported as our own OSD
+            uint64_t peer_osd = c_cli.clients.find(subop->peer_fd) != c_cli.clients.end()
+                ? c_cli.clients[subop->peer_fd].osd_num : osd_num;
+            printf("subop %lu from osd %lu: version = %lu\n", opcode, peer_osd, version);
+#endif
+            // fact_ver == 0 means that no version has been recorded yet
+            if (op_data->fact_ver != 0 && op_data->fact_ver != version)
+            {
+                throw std::runtime_error(
+                    "different fact_versions returned from "+std::string(osd_op_names[opcode])+
+                    " subops: "+std::to_string(version)+" vs "+std::to_string(op_data->fact_ver)
+                );
+            }
+            op_data->fact_ver = version;
+        }
+    }
+    // All subops are accounted for - resume the primary operation
+    if ((op_data->errors + op_data->done) >= op_data->n_subops)
+    {
+        // This also destroys <subop> itself, including its callback
+        delete[] op_data->subops;
+        op_data->subops = NULL;
+        op_data->st++;
+        if (cur_op->req.hdr.opcode == OSD_OP_READ)
+        {
+            continue_primary_read(cur_op);
+        }
+        else if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
+        {
+            continue_primary_write(cur_op);
+        }
+        else if (cur_op->req.hdr.opcode == OSD_OP_SYNC)
+        {
+            continue_primary_sync(cur_op);
+        }
+        else if (cur_op->req.hdr.opcode == OSD_OP_DELETE)
+        {
+            continue_primary_del(cur_op);
+        }
+        else
+        {
+            throw std::runtime_error("BUG: unknown opcode");
+        }
+    }
+}
+
+// Cancel a primary write operation.
+// An operation that has no subops in flight is finished with -EPIPE right away.
+// Otherwise the subops are already sent to peer OSDs and can't be thrown away,
+// so the operation is only marked with one extra EPIPE error.
+void osd_t::cancel_primary_write(osd_op_t *cur_op)
+{
+    osd_primary_op_data_t *op_data = cur_op->op_data;
+    if (!op_data || !op_data->subops)
+    {
+        finish_op(cur_op, -EPIPE);
+        return;
+    }
+    // Add a synthetic EPIPE failure and compensate for it in <done>, so
+    // errors + done still reaches n_subops only after all real subops complete.
+    // Caution: `done` must be signed because it may become -1 here
+    op_data->epipe++;
+    op_data->errors++;
+    op_data->done--;
+}
+
+// Submit delete subops for the chunks of an object that are listed in <loc_set>
+// but are not located where <cur_set> expects them.
+//
+// cur_op  - primary operation; its op_data receives the subop array and counters
+// cur_set - OSD numbers indexed by chunk role; a chunk is skipped when
+//           chunk.osd_num equals cur_set[chunk.role]. When NULL, every chunk
+//           from <loc_set> is deleted
+// loc_set - chunk locations of the object
+//
+// Chunks stored on this OSD are deleted through the local blockstore, others
+// are sent to peer OSDs as OSD_OP_SECONDARY_DELETE. All subops use
+// op_data->fact_ver as the version.
+// When there is nothing to delete the function returns with n_subops == 0 and
+// without allocating subops, so no completion callback will ever fire.
+void osd_t::submit_primary_del_subops(osd_op_t *cur_op, uint64_t *cur_set, pg_osd_set_t & loc_set)
+{
+    osd_primary_op_data_t *op_data = cur_op->op_data;
+    // First pass: count subops to size the array
+    int extra_chunks = 0;
+    for (auto & chunk: loc_set)
+    {
+        if (!cur_set || chunk.osd_num != cur_set[chunk.role])
+        {
+            extra_chunks++;
+        }
+    }
+    op_data->n_subops = extra_chunks;
+    op_data->done = op_data->errors = 0;
+    if (!extra_chunks)
+    {
+        return;
+    }
+    osd_op_t *subops = new osd_op_t[extra_chunks];
+    op_data->subops = subops;
+    // Second pass: fill and submit subops
+    int i = 0;
+    for (auto & chunk: loc_set)
+    {
+        if (!cur_set || chunk.osd_num != cur_set[chunk.role])
+        {
+            if (chunk.osd_num == this->osd_num)
+            {
+                // Local chunk: run the delete in our own blockstore.
+                // The parent operation is passed to the handler through op_type
+                clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
+                subops[i].op_type = (uint64_t)cur_op;
+                subops[i].bs_op = new blockstore_op_t({
+                    .opcode = BS_OP_DELETE,
+                    .callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
+                    {
+                        handle_primary_bs_subop(subop);
+                    },
+                    .oid = {
+                        .inode = op_data->oid.inode,
+                        .stripe = op_data->oid.stripe | chunk.role,
+                    },
+                    // Same version as write
+                    .version = op_data->fact_ver,
+                });
+                bs->enqueue_op(subops[i].bs_op);
+            }
+            else
+            {
+                // Remote chunk: send a secondary delete to the peer OSD
+                subops[i].op_type = OSD_OP_OUT;
+                subops[i].send_list.push_back(subops[i].req.buf, OSD_PACKET_SIZE);
+                subops[i].peer_fd = c_cli.osd_peer_fds.at(chunk.osd_num);
+                subops[i].req.sec_del = {
+                    .header = {
+                        .magic = SECONDARY_OSD_OP_MAGIC,
+                        .id = c_cli.next_subop_id++,
+                        .opcode = OSD_OP_SECONDARY_DELETE,
+                    },
+                    .oid = {
+                        .inode = op_data->oid.inode,
+                        .stripe = op_data->oid.stripe | chunk.role,
+                    },
+                    // Same version as write
+                    .version = op_data->fact_ver,
+                };
+                subops[i].callback = [cur_op, this](osd_op_t *subop)
+                {
+                    // handle_primary_subop() may delete[] the whole subop array.
+                    // That destroys <subop> and this very closure with its captures,
+                    // so copy everything needed afterwards to the stack first
+                    osd_t *self = this;
+                    int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
+                    self->handle_primary_subop(subop, cur_op);
+                    if (fail_fd >= 0)
+                    {
+                        // delete operation failed, drop the connection
+                        self->c_cli.stop_client(fail_fd);
+                    }
+                };
+                c_cli.outbox_push(&subops[i]);
+            }
+            i++;
+        }
+    }
+}
+
+// Submit one sync subop to every OSD listed in op_data->unstable_write_osds.
+// The entry that refers to this OSD is executed as BS_OP_SYNC in the local
+// blockstore, all others are sent to peers as OSD_OP_SECONDARY_SYNC.
+// The subop array and the done/errors/n_subops counters are stored in
+// cur_op->op_data.
+// NOTE(review): with an empty OSD list no subop is created and no completion
+// callback fires - presumably the caller guarantees a non-empty list; confirm.
+void osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
+{
+    osd_primary_op_data_t *op_data = cur_op->op_data;
+    int n_osds = op_data->unstable_write_osds->size();
+    osd_op_t *subops = new osd_op_t[n_osds];
+    op_data->done = op_data->errors = 0;
+    op_data->n_subops = n_osds;
+    op_data->subops = subops;
+    for (int i = 0; i < n_osds; i++)
+    {
+        osd_num_t sync_osd = (*(op_data->unstable_write_osds))[i].osd_num;
+        if (sync_osd == this->osd_num)
+        {
+            // Local sync. The parent operation is passed to the handler through op_type
+            clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
+            subops[i].op_type = (uint64_t)cur_op;
+            subops[i].bs_op = new blockstore_op_t({
+                .opcode = BS_OP_SYNC,
+                .callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
+                {
+                    handle_primary_bs_subop(subop);
+                },
+            });
+            bs->enqueue_op(subops[i].bs_op);
+        }
+        else
+        {
+            // Remote sync: the request consists of the header packet only
+            subops[i].op_type = OSD_OP_OUT;
+            subops[i].send_list.push_back(subops[i].req.buf, OSD_PACKET_SIZE);
+            subops[i].peer_fd = c_cli.osd_peer_fds.at(sync_osd);
+            subops[i].req.sec_sync = {
+                .header = {
+                    .magic = SECONDARY_OSD_OP_MAGIC,
+                    .id = c_cli.next_subop_id++,
+                    .opcode = OSD_OP_SECONDARY_SYNC,
+                },
+            };
+            subops[i].callback = [cur_op, this](osd_op_t *subop)
+            {
+                // handle_primary_subop() may delete[] the whole subop array.
+                // That destroys <subop> and this very closure with its captures,
+                // so copy everything needed afterwards to the stack first
+                osd_t *self = this;
+                int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
+                self->handle_primary_subop(subop, cur_op);
+                if (fail_fd >= 0)
+                {
+                    // sync operation failed, drop the connection
+                    self->c_cli.stop_client(fail_fd);
+                }
+            };
+            c_cli.outbox_push(&subops[i]);
+        }
+    }
+}
+
+// Submit one stabilize subop to every OSD listed in op_data->unstable_write_osds.
+// Each entry refers to a range [start, start+len) of items in
+// op_data->unstable_writes which is passed to that OSD.
+// The entry that refers to this OSD is executed as BS_OP_STABLE in the local
+// blockstore (len is the item count), all others are sent to peers as
+// OSD_OP_SECONDARY_STABILIZE (len is the payload size in bytes,
+// i.e. item count * sizeof(obj_ver_id)).
+// The subop array and the done/errors/n_subops counters are stored in
+// cur_op->op_data.
+// NOTE(review): with an empty OSD list no subop is created and no completion
+// callback fires - presumably the caller guarantees a non-empty list; confirm.
+void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
+{
+    osd_primary_op_data_t *op_data = cur_op->op_data;
+    int n_osds = op_data->unstable_write_osds->size();
+    osd_op_t *subops = new osd_op_t[n_osds];
+    op_data->done = op_data->errors = 0;
+    op_data->n_subops = n_osds;
+    op_data->subops = subops;
+    for (int i = 0; i < n_osds; i++)
+    {
+        auto & stab_osd = (*(op_data->unstable_write_osds))[i];
+        if (stab_osd.osd_num == this->osd_num)
+        {
+            // Local stabilize. The parent operation is passed to the handler through op_type
+            clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
+            subops[i].op_type = (uint64_t)cur_op;
+            subops[i].bs_op = new blockstore_op_t({
+                .opcode = BS_OP_STABLE,
+                .callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
+                {
+                    handle_primary_bs_subop(subop);
+                },
+                .len = (uint32_t)stab_osd.len,
+                .buf = (void*)(op_data->unstable_writes + stab_osd.start),
+            });
+            bs->enqueue_op(subops[i].bs_op);
+        }
+        else
+        {
+            // Remote stabilize: header packet followed by the item list
+            subops[i].op_type = OSD_OP_OUT;
+            subops[i].send_list.push_back(subops[i].req.buf, OSD_PACKET_SIZE);
+            subops[i].peer_fd = c_cli.osd_peer_fds.at(stab_osd.osd_num);
+            subops[i].req.sec_stab = {
+                .header = {
+                    .magic = SECONDARY_OSD_OP_MAGIC,
+                    .id = c_cli.next_subop_id++,
+                    .opcode = OSD_OP_SECONDARY_STABILIZE,
+                },
+                .len = (uint64_t)(stab_osd.len * sizeof(obj_ver_id)),
+            };
+            subops[i].send_list.push_back(op_data->unstable_writes + stab_osd.start, stab_osd.len * sizeof(obj_ver_id));
+            subops[i].callback = [cur_op, this](osd_op_t *subop)
+            {
+                // handle_primary_subop() may delete[] the whole subop array.
+                // That destroys <subop> and this very closure with its captures,
+                // so copy everything needed afterwards to the stack first
+                osd_t *self = this;
+                int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
+                self->handle_primary_subop(subop, cur_op);
+                if (fail_fd >= 0)
+                {
+                    // stabilize operation failed, drop the connection
+                    self->c_cli.stop_client(fail_fd);
+                }
+            };
+            c_cli.outbox_push(&subops[i]);
+        }
+    }
+}
+
+// Finish <first_op> with <retval> and, if it is the head of the write queue of
+// object <oid> in <pg>, also finish every other operation queued for the same
+// object with the same <retval> and remove all of them from the queue.
+// If the queue head for <oid> is a different operation (or there is none),
+// only <first_op> is finished and the queue is left untouched.
+void osd_t::pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid, int retval)
+{
+    // st_it remembers the start of the range to erase, it walks over it
+    auto st_it = pg.write_queue.find(oid), it = st_it;
+    // NOTE(review): first_op is finished before it is compared with the queue head
+    // below. Only the pointer value is compared, but this relies on finish_op()
+    // not modifying pg.write_queue (the iterators above must stay valid) - confirm.
+    finish_op(first_op, retval);
+    if (it != pg.write_queue.end() && it->second == first_op)
+    {
+        it++;
+    }
+    else
+    {
+        // Write queue doesn't match the first operation.
+        // first_op is a leftover operation from the previous peering of the same PG.
+        return;
+    }
+    // Finish the remaining operations queued under the same object id
+    while (it != pg.write_queue.end() && it->first == oid)
+    {
+        finish_op(it->second, retval);
+        it++;
+    }
+    // Drop the whole finished range [st_it, it) from the queue
+    if (st_it != it)
+    {
+        pg.write_queue.erase(st_it, it);
+    }
+}
--- a/osd_receive.cpp
+++ b/osd_receive.cpp
@@ -1,204 +0,0 @@
-#include "osd.h"
-
-void osd_t::read_requests()
-{
-    for (int i = 0; i < read_ready_clients.size(); i++)
-    {
-        int peer_fd = read_ready_clients[i];
-        auto & cl = clients[peer_fd];
-        io_uring_sqe* sqe = ringloop->get_sqe();
-        if (!sqe)
-        {
-            read_ready_clients.erase(read_ready_clients.begin(), read_ready_clients.begin() + i);
-            return;
-        }
-        ring_data_t* data = ((ring_data_t*)sqe->user_data);
-        if (!cl.read_buf)
-        {
-            // no reads in progress
-            // so this is either a new command or a reply to a previously sent command
-            if (!cl.read_op)
-            {
-                cl.read_op = new osd_op_t;
-                cl.read_op->peer_fd = peer_fd;
-            }
-            cl.read_op->op_type = OSD_OP_IN;
-            cl.read_buf = &cl.read_op->req.buf;
-            cl.read_remaining = OSD_PACKET_SIZE;
-            cl.read_state = CL_READ_OP;
-        }
-        cl.read_iov.iov_base = cl.read_buf;
-        cl.read_iov.iov_len = cl.read_remaining;
-        cl.read_msg.msg_iov = &cl.read_iov;
-        cl.read_msg.msg_iovlen = 1;
-        data->callback = [this, peer_fd](ring_data_t *data) { handle_read(data, peer_fd); };
-        my_uring_prep_recvmsg(sqe, peer_fd, &cl.read_msg, 0);
-    }
-    read_ready_clients.clear();
-}
-
-void osd_t::handle_read(ring_data_t *data, int peer_fd)
-{
-    auto cl_it = clients.find(peer_fd);
-    if (cl_it != clients.end())
-    {
-        auto & cl = cl_it->second;
-        if (data->res == -EAGAIN)
-        {
-            cl.read_ready--;
-            if (cl.read_ready > 0)
-                read_ready_clients.push_back(peer_fd);
-            return;
-        }
-        else if (data->res < 0)
-        {
-            // this is a client socket, so don't panic. just disconnect it
-            printf("Client %d socket read error: %d (%s). Disconnecting client\n", peer_fd, -data->res, strerror(-data->res));
-            stop_client(peer_fd);
-            return;
-        }
-        read_ready_clients.push_back(peer_fd);
-        if (data->res > 0)
-        {
-            cl.read_remaining -= data->res;
-            cl.read_buf += data->res;
-            if (cl.read_remaining <= 0)
-            {
-                cl.read_buf = NULL;
-                if (cl.read_state == CL_READ_OP)
-                {
-                    if (cl.read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC)
-                    {
-                        handle_reply_hdr(&cl);
-                    }
-                    else
-                    {
-                        handle_op_hdr(&cl);
-                    }
-                }
-                else if (cl.read_state == CL_READ_DATA)
-                {
-                    // Operation is ready
-                    exec_op(cl.read_op);
-                    cl.read_op = NULL;
-                    cl.read_state = 0;
-                }
-                else if (cl.read_state == CL_READ_REPLY_DATA)
-                {
-                    // Reply is ready
-                    auto req_it = cl.sent_ops.find(cl.read_reply_id);
-                    osd_op_t *request = req_it->second;
-                    cl.sent_ops.erase(req_it);
-                    cl.read_reply_id = 0;
-                    cl.read_state = 0;
-                    // Measure subop latency
-                    timespec tv_end;
-                    clock_gettime(CLOCK_REALTIME, &tv_end);
-                    subop_stat_count[request->req.hdr.opcode]++;
-                    subop_stat_sum[request->req.hdr.opcode] += (
-                        (tv_end.tv_sec - request->tv_begin.tv_sec)*1000000 +
-                        (tv_end.tv_nsec - request->tv_begin.tv_nsec)/1000
-                    );
-                    request->callback(request);
-                }
-            }
-        }
-    }
-}
-
-void osd_t::handle_op_hdr(osd_client_t *cl)
-{
-    osd_op_t *cur_op = cl->read_op;
-    if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_READ)
-    {
-        if (cur_op->req.sec_rw.len > 0)
-            cur_op->buf = memalign(512, cur_op->req.sec_rw.len);
-        cl->read_remaining = 0;
-    }
-    else if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
-    {
-        if (cur_op->req.sec_rw.len > 0)
-            cur_op->buf = memalign(512, cur_op->req.sec_rw.len);
-        cl->read_remaining = cur_op->req.sec_rw.len;
-    }
-    else if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_STABILIZE ||
-        cur_op->req.hdr.opcode == OSD_OP_SECONDARY_ROLLBACK)
-    {
-        if (cur_op->req.sec_stab.len > 0)
-            cur_op->buf = memalign(512, cur_op->req.sec_stab.len);
-        cl->read_remaining = cur_op->req.sec_stab.len;
-    }
-    else if (cur_op->req.hdr.opcode == OSD_OP_READ)
-    {
-        if (cur_op->req.rw.len > 0)
-            cur_op->buf = memalign(512, cur_op->req.rw.len);
-        cl->read_remaining = 0;
-    }
-    else if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
-    {
-        if (cur_op->req.rw.len > 0)
-            cur_op->buf = memalign(512, cur_op->req.rw.len);
-        cl->read_remaining = cur_op->req.rw.len;
-    }
-    if (cl->read_remaining > 0)
-    {
-        // Read data
-        cl->read_buf = cur_op->buf;
-        cl->read_state = CL_READ_DATA;
-    }
-    else
-    {
-        // Operation is ready
-        cl->read_op = NULL;
-        cl->read_state = 0;
-        exec_op(cur_op);
-    }
-}
-
-void osd_t::handle_reply_hdr(osd_client_t *cl)
-{
-    osd_op_t *cur_op = cl->read_op;
-    auto req_it = cl->sent_ops.find(cur_op->req.hdr.id);
-    if (req_it == cl->sent_ops.end())
-    {
-        // Command out of sync. Drop connection
-        printf("Client %d command out of sync: id %lu\n", cl->peer_fd, cur_op->req.hdr.id);
-        stop_client(cl->peer_fd);
-        return;
-    }
-    osd_op_t *op = req_it->second;
-    memcpy(op->reply.buf, cur_op->req.buf, OSD_PACKET_SIZE);
-    if (op->reply.hdr.opcode == OSD_OP_SECONDARY_READ &&
-        op->reply.hdr.retval > 0)
-    {
-        // Read data. In this case we assume that the buffer is preallocated by the caller (!)
-        assert(op->buf);
-        cl->read_state = CL_READ_REPLY_DATA;
-        cl->read_reply_id = op->req.hdr.id;
-        cl->read_buf = op->buf;
-        cl->read_remaining = op->reply.hdr.retval;
-    }
-    else if (op->reply.hdr.opcode == OSD_OP_SECONDARY_LIST &&
-        op->reply.hdr.retval > 0)
-    {
-        op->buf = memalign(512, sizeof(obj_ver_id) * op->reply.hdr.retval);
-        cl->read_state = CL_READ_REPLY_DATA;
-        cl->read_reply_id = op->req.hdr.id;
-        cl->read_buf = op->buf;
-        cl->read_remaining = sizeof(obj_ver_id) * op->reply.hdr.retval;
-    }
-    else
-    {
-        cl->read_state = 0;
-        cl->sent_ops.erase(req_it);
-        // Measure subop latency
-        timespec tv_end;
-        clock_gettime(CLOCK_REALTIME, &tv_end);
-        subop_stat_count[op->req.hdr.opcode]++;
-        subop_stat_sum[op->req.hdr.opcode] += (
-            (tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
-            (tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000
-        );
-        op->callback(op);
-    }
-}
--- a/osd_rmw.cpp
+++ b/osd_rmw.cpp
@@ -1,4 +1,5 @@
 #include <malloc.h>
+#include <string.h>
 #include <assert.h>
 #include "xor.h"
 #include "osd_rmw.h"
@@ -55,6 +56,11 @@ static inline void cover_read(uint32_t start, uint32_t end, osd_rmw_stripe_t & s

 void split_stripes(uint64_t pg_minsize, uint32_t bs_block_size, uint32_t start, uint32_t end, osd_rmw_stripe_t *stripes)
 {
+    if (end == 0)
+    {
+        // Zero length request - offset doesn't matter
+        return;
+    }
    end = start+end;
    for (int role = 0; role < pg_minsize; role++)
    {
@@ -79,18 +85,21 @@ void reconstruct_stripe(osd_rmw_stripe_t *stripes, int pg_size, int role)
            }
            else if (prev >= 0)
            {
+                assert(stripes[role].read_start >= stripes[prev].read_start &&
+                    stripes[role].read_start >= stripes[other].read_start);
                memxor(
-                    stripes[prev].read_buf + (stripes[prev].read_start - stripes[role].read_start),
-                    stripes[other].read_buf + (stripes[other].read_start - stripes[other].read_start),
+                    stripes[prev].read_buf + (stripes[role].read_start - stripes[prev].read_start),
+                    stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start),
                    stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start
                );
                prev = -1;
            }
            else
            {
+                assert(stripes[role].read_start >= stripes[other].read_start);
                memxor(
                    stripes[role].read_buf,
-                    stripes[other].read_buf + (stripes[other].read_start - stripes[role].read_start),
+                    stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start),
                    stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start
                );
            }
@@ -156,10 +165,11 @@ void* alloc_read_buffer(osd_rmw_stripe_t *stripes, int read_pg_size, uint64_t ad
    return buf;
 }

-void* calc_rmw_reads(void *write_buf, osd_rmw_stripe_t *stripes, uint64_t *osd_set, uint64_t pg_size, uint64_t pg_minsize, uint64_t pg_cursize)
+void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_set,
+    uint64_t pg_size, uint64_t pg_minsize, uint64_t pg_cursize, uint64_t *write_osd_set, uint64_t chunk_size)
 {
    // Generic parity modification (read-modify-write) algorithm
-    // Reconstruct -> Read -> Calc parity -> Write
+    // Read -> Reconstruct missing chunks -> Calc parity chunks -> Write
    // Now we always read continuous ranges. This means that an update of the beginning
    // of one data stripe and the end of another will lead to a read of full paired stripes.
    // FIXME: (Maybe) read small individual ranges in that case instead.
@@ -174,64 +184,90 @@ void* calc_rmw_reads(void *write_buf, osd_rmw_stripe_t *stripes, uint64_t *osd_s
            stripes[role].write_end = stripes[role].req_end;
        }
    }
-    for (int role = 0; role < pg_minsize; role++)
-    {
-        cover_read(start, end, stripes[role]);
-    }
-    int has_parity = 0;
+    int write_parity = 0;
    for (int role = pg_minsize; role < pg_size; role++)
    {
-        if (osd_set[role] != 0)
+        if (write_osd_set[role] != 0)
        {
-            has_parity++;
+            write_parity = 1;
            stripes[role].write_start = start;
            stripes[role].write_end = end;
        }
-        else
-            stripes[role].missing = true;
+    }
+    if (write_parity)
+    {
+        for (int role = 0; role < pg_minsize; role++)
+        {
+            cover_read(start, end, stripes[role]);
+        }
+    }
+    if (write_osd_set != read_osd_set)
+    {
+        pg_cursize = 0;
+        // Object is degraded/misplaced and will be moved to <write_osd_set>
+        for (int role = 0; role < pg_size; role++)
+        {
+            if (write_osd_set[role] != read_osd_set[role])
+            {
+                // FIXME: For EC more than 2+1: handle case when write_osd_set == 0 and read_osd_set != 0
+                // We need to get data for any moved / recovered chunk
+                // And we need a continuous write buffer so we'll only optimize
+                // for the case when the whole chunk is overwritten in the request
+                if (stripes[role].req_start != 0 ||
+                    stripes[role].req_end != chunk_size)
+                {
+                    stripes[role].read_start = 0;
+                    stripes[role].read_end = chunk_size;
+                    // Warning: We don't modify write_start/write_end here, we do it in calc_rmw_parity()
+                }
+            }
+            if (read_osd_set[role] != 0)
+            {
+                pg_cursize++;
+            }
+        }
    }
    if (pg_cursize < pg_size)
    {
-        if (has_parity == 0)
+        // Some stripe(s) are missing, so we need to read parity
+        for (int role = 0; role < pg_size; role++)
        {
-            // Parity is missing, we don't need to read anything
-            for (int role = 0; role < pg_minsize; role++)
+            if (read_osd_set[role] == 0)
            {
-                stripes[role].read_end = 0;
-            }
-        }
-        else
-        {
-            // Other stripe(s) are missing
-            for (int role = 0; role < pg_minsize; role++)
-            {
-                if (osd_set[role] == 0 && stripes[role].read_end != 0)
+                stripes[role].missing = true;
+                if (stripes[role].read_end != 0)
                {
-                    stripes[role].missing = true;
-                    for (int r2 = 0; r2 < pg_size; r2++)
+                    int found = 0;
+                    for (int r2 = 0; r2 < pg_size && found < pg_minsize; r2++)
                    {
-                        // Read the non-covered range of <role> from all other stripes to reconstruct it
-                        if (r2 != role && osd_set[r2] != 0)
+                        // Read the non-covered range of <role> from at least <minsize> other stripes to reconstruct it
+                        if (read_osd_set[r2] != 0)
                        {
                            extend_read(stripes[role].read_start, stripes[role].read_end, stripes[r2]);
+                            found++;
                        }
                    }
+                    if (found < pg_minsize)
+                    {
+                        // FIXME Object is incomplete - refuse partial overwrite
+                        assert(0);
+                    }
                }
            }
        }
    }
    // Allocate read buffers
-    void *rmw_buf = alloc_read_buffer(stripes, pg_size, has_parity * (end - start));
-    // Position parity & write buffers
+    void *rmw_buf = alloc_read_buffer(stripes, pg_size, (write_parity ? pg_size-pg_minsize : 0) * (end - start));
+    // Position write buffers
    uint64_t buf_pos = 0, in_pos = 0;
    for (int role = 0; role < pg_size; role++)
    {
        if (stripes[role].req_end != 0)
        {
-            stripes[role].write_buf = write_buf + in_pos;
+            stripes[role].write_buf = request_buf + in_pos;
            in_pos += stripes[role].req_end - stripes[role].req_start;
        }
-        else if (role >= pg_minsize && osd_set[role] != 0)
+        else if (role >= pg_minsize && write_osd_set[role] != 0 && end != 0)
        {
            stripes[role].write_buf = rmw_buf + buf_pos;
            buf_pos += end - start;
@@ -321,13 +357,9 @@ static void xor_multiple_buffers(buf_len_t *xor1, int n1, buf_len_t *xor2, int n
    }
 }

-void calc_rmw_parity(osd_rmw_stripe_t *stripes, int pg_size)
+void calc_rmw_parity(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size)
 {
-    if (stripes[pg_size-1].missing)
-    {
-        // Parity OSD is unavailable
-        return;
-    }
+    int pg_minsize = pg_size-1;
    for (int role = 0; role < pg_size; role++)
    {
        if (stripes[role].read_end != 0 && stripes[role].missing)
@@ -337,31 +369,82 @@ void calc_rmw_parity(osd_rmw_stripe_t *stripes, int pg_size)
            break;
        }
    }
-    // Calculate new parity (EC k+1)
-    int parity = pg_size-1, prev = -2;
-    auto wr_end = stripes[parity].write_end;
-    auto wr_start = stripes[parity].write_start;
-    for (int other = 0; other < pg_size-1; other++)
+    uint32_t start = 0, end = 0;
+    if (!stripes[pg_minsize].missing || write_osd_set != read_osd_set)
    {
-        if (prev == -2)
+        for (int role = 0; role < pg_minsize; role++)
        {
-            prev = other;
-        }
-        else
-        {
-            int n1 = 0, n2 = 0;
-            buf_len_t xor1[3], xor2[3];
-            if (prev == -1)
+            if (stripes[role].req_end != 0)
            {
-                xor1[n1++] = { .buf = stripes[parity].write_buf, .len = wr_end-wr_start };
+                start = !end || stripes[role].req_start < start ? stripes[role].req_start : start;
+                end = std::max(stripes[role].req_end, end);
+            }
+        }
+    }
+    if (write_osd_set != read_osd_set)
+    {
+        for (int role = 0; role < pg_minsize; role++)
+        {
+            if (write_osd_set[role] != read_osd_set[role] &&
+                (stripes[role].req_start != 0 || stripes[role].req_end != chunk_size))
+            {
+                // FIXME again, handle case when write_osd_set[role] is 0
+                // Copy modified chunk into the read buffer to write it back
+                memcpy(
+                    stripes[role].read_buf + stripes[role].req_start,
+                    stripes[role].write_buf,
+                    stripes[role].req_end - stripes[role].req_start
+                );
+                stripes[role].write_buf = stripes[role].read_buf;
+                stripes[role].write_start = 0;
+                stripes[role].write_end = chunk_size;
+            }
+        }
+    }
+    if (!stripes[pg_minsize].missing && end != 0)
+    {
+        // Calculate new parity (EC k+1)
+        int parity = pg_minsize, prev = -2;
+        for (int other = 0; other < pg_minsize; other++)
+        {
+            if (prev == -2)
+            {
+                prev = other;
            }
            else
            {
-                get_old_new_buffers(stripes[prev], wr_start, wr_end, xor1, n1);
-                prev = -1;
+                int n1 = 0, n2 = 0;
+                buf_len_t xor1[3], xor2[3];
+                if (prev == -1)
+                {
+                    xor1[n1++] = { .buf = stripes[parity].write_buf, .len = end-start };
+                }
+                else
+                {
+                    get_old_new_buffers(stripes[prev], start, end, xor1, n1);
+                    prev = -1;
+                }
+                get_old_new_buffers(stripes[other], start, end, xor2, n2);
+                xor_multiple_buffers(xor1, n1, xor2, n2, stripes[parity].write_buf, end-start);
+            }
+        }
+    }
+    if (write_osd_set != read_osd_set)
+    {
+        for (int role = pg_minsize; role < pg_size; role++)
+        {
+            if (write_osd_set[role] != read_osd_set[role] && (start != 0 || end != chunk_size))
+            {
+                // Copy new parity into the read buffer to write it back
+                memcpy(
+                    stripes[role].read_buf + start,
+                    stripes[role].write_buf,
+                    end - start
+                );
+                stripes[role].write_buf = stripes[role].read_buf;
+                stripes[role].write_start = 0;
+                stripes[role].write_end = chunk_size;
            }
-            get_old_new_buffers(stripes[other], wr_start, wr_end, xor2, n2);
-            xor_multiple_buffers(xor1, n1, xor2, n2, stripes[parity].write_buf, wr_end-wr_start);
        }
    }
 }
--- a/osd_rmw.h
+++ b/osd_rmw.h
@@ -31,6 +31,7 @@ int extend_missing_stripes(osd_rmw_stripe_t *stripes, osd_num_t *osd_set, int mi

 void* alloc_read_buffer(osd_rmw_stripe_t *stripes, int read_pg_size, uint64_t add_size);

-void* calc_rmw_reads(void *write_buf, osd_rmw_stripe_t *stripes, uint64_t *osd_set, uint64_t pg_size, uint64_t pg_minsize, uint64_t pg_cursize);
+void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_set,
+    uint64_t pg_size, uint64_t pg_minsize, uint64_t pg_cursize, uint64_t *write_osd_set, uint64_t chunk_size);

-void calc_rmw_parity(osd_rmw_stripe_t *stripes, int pg_size);
+void calc_rmw_parity(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size);
--- a/osd_rmw_test.cpp
+++ b/osd_rmw_test.cpp
@@ -2,16 +2,147 @@
 #include "osd_rmw.cpp"
 #include "test_pattern.h"

+void dump_stripes(osd_rmw_stripe_t *stripes, int pg_size);
+void test1();
+void test4();
+void test5();
+void test6();
+void test7();
+void test8();
+void test9();
+
+/***
+
+Cases:
+
+1. split(offset=128K-4K, len=8K)
+   = [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 0 ] ]
+
+2. read(offset=128K-4K, len=8K, osd_set=[1,0,3])
+   = { read: [ [ 0, 128K ], [ 0, 4K ], [ 0, 4K ] ] }
+
+3. cover_read(0, 128K, { req: [ 128K-4K, 4K ] })
+   = { read: [ 0, 128K-4K ] }
+
+4. write(offset=128K-4K, len=8K, osd_set=[1,0,3])
+   = {
+     read: [ [ 0, 128K ], [ 4K, 128K ], [ 4K, 128K ] ],
+     write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2, read0, read1, read2 ],
+   }
+   + check write2 buffer
+
+5. write(offset=0, len=128K+64K, osd_set=[1,0,3])
+   = {
+     req: [ [ 0, 128K ], [ 0, 64K ], [ 0, 0 ] ],
+     read: [ [ 64K, 128K ], [ 64K, 128K ], [ 64K, 128K ] ],
+     write: [ [ 0, 128K ], [ 0, 64K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2, read0, read1, read2 ],
+   }
+
+6. write(offset=0, len=128K+64K, osd_set=[1,2,3])
+   = {
+     req: [ [ 0, 128K ], [ 0, 64K ], [ 0, 0 ] ],
+     read: [ [ 0, 0 ], [ 64K, 128K ], [ 0, 0 ] ],
+     write: [ [ 0, 128K ], [ 0, 64K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2, read1 ],
+   }
+
+7. calc_rmw(offset=128K-4K, len=8K, osd_set=[1,0,3], write_set=[1,2,3])
+   = {
+     read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
+     write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2, read0, read1, read2 ],
+   }
+   then, after calc_rmw_parity(): {
+     write: [ [ 128K-4K, 128K ], [ 0, 128K ], [ 0, 128K ] ],
+     write1==read1,
+   }
+   + check write1 buffer
+   + check write2 buffer
+
+8. calc_rmw(offset=0, len=128K+4K, osd_set=[0,2,3], write_set=[1,2,3])
+   = {
+     read: [ [ 0, 0 ], [ 4K, 128K ], [ 0, 0 ] ],
+     write: [ [ 0, 128K ], [ 0, 4K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2, read1 ],
+   }
+   + check write2 buffer
+
+9. object recovery case:
+   calc_rmw(offset=0, len=0, read_osd_set=[0,2,3], write_osd_set=[1,2,3])
+   = {
+     read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
+     write: [ [ 0, 0 ], [ 0, 0 ], [ 0, 0 ] ],
+     input buffer: NULL,
+     rmw buffer: [ read0, read1, read2 ],
+   }
+   then, after calc_rmw_parity(): {
+     write: [ [ 0, 128K ], [ 0, 0 ], [ 0, 0 ] ],
+     write0==read0,
+   }
+   + check write0 buffer
+
+***/
+
 int main(int narg, char *args[])
+{
+    // Test cases, executed in the same order as they are numbered
+    void (*const cases[])() = { test1, test4, test5, test6, test7, test8, test9 };
+    for (auto run_case: cases)
+    {
+        run_case();
+    }
+    // End
+    printf("all ok\n");
+    return 0;
+}
+
+// Prints the requested, read and written range of every stripe (offsets divided by 1024),
+// one output line per range kind
+void dump_stripes(osd_rmw_stripe_t *stripes, int pg_size)
+{
+    int role;
+    printf("request");
+    for (role = 0; role < pg_size; role++)
+        printf(" {%uK-%uK}", stripes[role].req_start/1024, stripes[role].req_end/1024);
+    // The newline that ends one line is emitted together with the label of the next one
+    printf("\nread");
+    for (role = 0; role < pg_size; role++)
+        printf(" {%uK-%uK}", stripes[role].read_start/1024, stripes[role].read_end/1024);
+    printf("\nwrite");
+    for (role = 0; role < pg_size; role++)
+        printf(" {%uK-%uK}", stripes[role].write_start/1024, stripes[role].write_end/1024);
+    printf("\n");
+}
+
+void test1()
 {
    osd_num_t osd_set[3] = { 1, 0, 3 };
    osd_rmw_stripe_t stripes[3] = { 0 };
-    // Test 1
+    // Test 1.1
    split_stripes(2, 128*1024, 128*1024-4096, 8192, stripes);
    assert(stripes[0].req_start == 128*1024-4096 && stripes[0].req_end == 128*1024);
    assert(stripes[1].req_start == 0 && stripes[1].req_end == 4096);
    assert(stripes[2].req_end == 0);
-    // Test 2
+    // Test 1.2
    for (int i = 0; i < 3; i++)
    {
        stripes[i].read_start = stripes[i].req_start;
@@ -20,18 +151,26 @@ int main(int narg, char *args[])
    assert(extend_missing_stripes(stripes, osd_set, 2, 3) == 0);
    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
    assert(stripes[2].read_start == 0 && stripes[2].read_end == 4096);
-    // Test 3
+    // Test 1.3
    stripes[0] = { .req_start = 128*1024-4096, .req_end = 128*1024 };
    cover_read(0, 128*1024, stripes[0]);
    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024-4096);
+}
+
+void test4()
+{
+    osd_num_t osd_set[3] = { 1, 0, 3 };
+    osd_rmw_stripe_t stripes[3] = { 0 };
    // Test 4.1
-    memset(stripes, 0, sizeof(stripes));
    split_stripes(2, 128*1024, 128*1024-4096, 8192, stripes);
    void* write_buf = malloc(8192);
-    void* rmw_buf = calc_rmw_reads(write_buf, stripes, osd_set, 3, 2, 2);
+    void* rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 2, osd_set, 128*1024);
    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
    assert(stripes[1].read_start == 4096 && stripes[1].read_end == 128*1024);
    assert(stripes[2].read_start == 4096 && stripes[2].read_end == 128*1024);
+    assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
    assert(stripes[0].read_buf == rmw_buf+128*1024);
    assert(stripes[1].read_buf == rmw_buf+128*1024*2);
    assert(stripes[2].read_buf == rmw_buf+128*1024*3-4096);
@@ -43,24 +182,32 @@ int main(int narg, char *args[])
    set_pattern(stripes[0].read_buf, 128*1024, PATTERN1); // old data
    set_pattern(stripes[1].read_buf, 128*1024-4096, UINT64_MAX); // didn't read it, it's missing
    set_pattern(stripes[2].read_buf, 128*1024-4096, 0); // old parity = 0
-    calc_rmw_parity(stripes, 3);
+    calc_rmw_parity(stripes, 3, osd_set, osd_set, 128*1024);
    check_pattern(stripes[2].write_buf, 4096, PATTERN0^PATTERN1); // new parity
    check_pattern(stripes[2].write_buf+4096, 128*1024-4096*2, 0); // new parity
    check_pattern(stripes[2].write_buf+128*1024-4096, 4096, PATTERN0^PATTERN1); // new parity
    free(rmw_buf);
    free(write_buf);
+}
+
+void test5()
+{
+    osd_num_t osd_set[3] = { 1, 0, 3 };
+    osd_rmw_stripe_t stripes[3] = { 0 };
    // Test 5.1
-    memset(stripes, 0, sizeof(stripes));
    split_stripes(2, 128*1024, 0, 64*1024*3, stripes);
    assert(stripes[0].req_start == 0 && stripes[0].req_end == 128*1024);
    assert(stripes[1].req_start == 0 && stripes[1].req_end == 64*1024);
    assert(stripes[2].req_end == 0);
    // Test 5.2
-    write_buf = malloc(64*1024*3);
-    rmw_buf = calc_rmw_reads(write_buf, stripes, osd_set, 3, 2, 2);
+    void *write_buf = malloc(64*1024*3);
+    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 2, osd_set, 128*1024);
    assert(stripes[0].read_start == 64*1024 && stripes[0].read_end == 128*1024);
    assert(stripes[1].read_start == 64*1024 && stripes[1].read_end == 128*1024);
    assert(stripes[2].read_start == 64*1024 && stripes[2].read_end == 128*1024);
+    assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 64*1024);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
    assert(stripes[0].read_buf == rmw_buf+128*1024);
    assert(stripes[1].read_buf == rmw_buf+64*3*1024);
    assert(stripes[2].read_buf == rmw_buf+64*4*1024);
@@ -69,15 +216,22 @@ int main(int narg, char *args[])
    assert(stripes[2].write_buf == rmw_buf);
    free(rmw_buf);
    free(write_buf);
+}
+
+void test6()
+{
+    osd_num_t osd_set[3] = { 1, 2, 3 };
+    osd_rmw_stripe_t stripes[3] = { 0 };
    // Test 6.1
-    memset(stripes, 0, sizeof(stripes));
    split_stripes(2, 128*1024, 0, 64*1024*3, stripes);
-    osd_set[1] = 2;
-    write_buf = malloc(64*1024*3);
-    rmw_buf = calc_rmw_reads(write_buf, stripes, osd_set, 3, 2, 3);
+    void *write_buf = malloc(64*1024*3);
+    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, osd_set, 128*1024);
    assert(stripes[0].read_end == 0);
    assert(stripes[1].read_start == 64*1024 && stripes[1].read_end == 128*1024);
    assert(stripes[2].read_end == 0);
+    assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 64*1024);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
    assert(stripes[0].read_buf == 0);
    assert(stripes[1].read_buf == rmw_buf+128*1024);
    assert(stripes[2].read_buf == 0);
@@ -86,8 +240,121 @@ int main(int narg, char *args[])
    assert(stripes[2].write_buf == rmw_buf);
    free(rmw_buf);
    free(write_buf);
-    osd_set[1] = 0;
-    // End
-    printf("all ok\n");
-    return 0;
+}
+
+void test7() // Case 7: 8K write at offset 128K-4K, read set [1,0,3], write set [1,2,3]
+{
+    osd_num_t osd_set[3] = { 1, 0, 3 };
+    osd_num_t write_osd_set[3] = { 1, 2, 3 };
+    osd_rmw_stripe_t stripes[3] = { 0 };
+    // Test 7.1: ranges and buffer layout planned by calc_rmw()
+    split_stripes(2, 128*1024, 128*1024-4096, 8192, stripes);
+    void *write_buf = malloc(8192);
+    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 2, write_osd_set, 128*1024);
+    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
+    assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
+    assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
+    assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
+    assert(stripes[0].read_buf == rmw_buf+128*1024);
+    assert(stripes[1].read_buf == rmw_buf+128*1024*2);
+    assert(stripes[2].read_buf == rmw_buf+128*1024*3);
+    assert(stripes[0].write_buf == write_buf);
+    assert(stripes[1].write_buf == write_buf+4096);
+    assert(stripes[2].write_buf == rmw_buf);
+    // Test 7.2: write ranges and buffer contents after calc_rmw_parity()
+    set_pattern(write_buf, 8192, PATTERN0);
+    set_pattern(stripes[0].read_buf, 128*1024, PATTERN1); // old data
+    set_pattern(stripes[1].read_buf, 128*1024, UINT64_MAX); // didn't read it, it's missing
+    set_pattern(stripes[2].read_buf, 128*1024, 0); // old parity = 0
+    calc_rmw_parity(stripes, 3, osd_set, write_osd_set, 128*1024);
+    assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 128*1024);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
+    assert(stripes[1].write_buf == stripes[1].read_buf); // stripe 1 must now be written from its read buffer
+    check_pattern(stripes[1].write_buf, 4096, PATTERN0);
+    check_pattern(stripes[1].write_buf+4096, 128*1024-4096, PATTERN1);
+    check_pattern(stripes[2].write_buf, 4096, PATTERN0^PATTERN1); // new parity
+    check_pattern(stripes[2].write_buf+4096, 128*1024-4096*2, 0); // new parity
+    check_pattern(stripes[2].write_buf+128*1024-4096, 4096, PATTERN0^PATTERN1); // new parity
+    free(rmw_buf);
+    free(write_buf);
+}
+
+void test8() // Case 8: 128K+4K write at offset 0, read set [0,2,3], write set [1,2,3]
+{
+    osd_num_t osd_set[3] = { 0, 2, 3 };
+    osd_num_t write_osd_set[3] = { 1, 2, 3 };
+    osd_rmw_stripe_t stripes[3] = { 0 };
+    // Test 8.1: ranges and buffer layout planned by calc_rmw()
+    split_stripes(2, 128*1024, 0, 128*1024+4096, stripes);
+    void *write_buf = malloc(128*1024+4096);
+    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 2, write_osd_set, 128*1024);
+    assert(stripes[0].read_start == 0 && stripes[0].read_end == 0);
+    assert(stripes[1].read_start == 4096 && stripes[1].read_end == 128*1024);
+    assert(stripes[2].read_start == 0 && stripes[2].read_end == 0);
+    assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
+    assert(stripes[0].read_buf == NULL);
+    assert(stripes[1].read_buf == rmw_buf+128*1024);
+    assert(stripes[2].read_buf == NULL);
+    assert(stripes[0].write_buf == write_buf);
+    assert(stripes[1].write_buf == write_buf+128*1024);
+    assert(stripes[2].write_buf == rmw_buf);
+    // Test 8.2: calc_rmw_parity() must keep the planned ranges/buffers and produce the new parity
+    set_pattern(write_buf, 128*1024+4096, PATTERN0);
+    set_pattern(stripes[1].read_buf, 128*1024-4096, PATTERN1);
+    calc_rmw_parity(stripes, 3, osd_set, write_osd_set, 128*1024);
+    assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024); // recheck again
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);     // recheck again
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024); // recheck again
+    assert(stripes[0].write_buf == write_buf);                               // recheck again
+    assert(stripes[1].write_buf == write_buf+128*1024);                      // recheck again
+    assert(stripes[2].write_buf == rmw_buf);                                 // recheck again
+    check_pattern(stripes[2].write_buf, 4096, 0); // new parity
+    check_pattern(stripes[2].write_buf+4096, 128*1024-4096, PATTERN0^PATTERN1); // new parity
+    free(rmw_buf);
+    free(write_buf);
+}
+
+void test9() // Case 9 (object recovery): zero-length request, read set [0,2,3], write set [1,2,3]
+{
+    osd_num_t osd_set[3] = { 0, 2, 3 };
+    osd_num_t write_osd_set[3] = { 1, 2, 3 };
+    osd_rmw_stripe_t stripes[3] = { 0 };
+    // Test 9.0: a zero-length request must leave all requested ranges empty
+    split_stripes(2, 128*1024, 64*1024, 0, stripes);
+    assert(stripes[0].req_start == 0 && stripes[0].req_end == 0);
+    assert(stripes[1].req_start == 0 && stripes[1].req_end == 0);
+    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
+    // Test 9.1: calc_rmw() must plan full reads of all stripes and no writes yet
+    void *write_buf = NULL;
+    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024);
+    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
+    assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
+    assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
+    assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 0);
+    assert(stripes[0].read_buf == rmw_buf);
+    assert(stripes[1].read_buf == rmw_buf+128*1024);
+    assert(stripes[2].read_buf == rmw_buf+128*1024*2);
+    assert(stripes[0].write_buf == NULL);
+    assert(stripes[1].write_buf == NULL);
+    assert(stripes[2].write_buf == NULL);
+    // Test 9.2: after calc_rmw_parity() only stripe 0 is written, from its read buffer
+    set_pattern(stripes[1].read_buf, 128*1024, 0);
+    set_pattern(stripes[2].read_buf, 128*1024, PATTERN1);
+    calc_rmw_parity(stripes, 3, osd_set, write_osd_set, 128*1024);
+    assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 0);
+    assert(stripes[0].write_buf == rmw_buf);
+    assert(stripes[1].write_buf == NULL);
+    assert(stripes[2].write_buf == NULL);
+    check_pattern(stripes[0].read_buf, 128*1024, PATTERN1);
+    check_pattern(stripes[0].write_buf, 128*1024, PATTERN1);
+    free(rmw_buf);
 }
--- a/osd_secondary.cpp
+++ b/osd_secondary.cpp
@@ -4,45 +4,34 @@

 void osd_t::secondary_op_callback(osd_op_t *op)
 {
-    inflight_ops--;
-    auto cl_it = clients.find(op->peer_fd);
-    if (cl_it != clients.end())
+    if (op->req.hdr.opcode == OSD_OP_SECONDARY_READ ||
+        op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
    {
-        op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
-        op->reply.hdr.id = op->req.hdr.id;
-        op->reply.hdr.opcode = op->req.hdr.opcode;
-        op->reply.hdr.retval = op->bs_op->retval;
-        if (op->req.hdr.opcode == OSD_OP_SECONDARY_READ ||
-            op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
-        {
-            op->reply.sec_rw.version = op->bs_op->version;
-        }
-        else if (op->req.hdr.opcode == OSD_OP_SECONDARY_DELETE)
-        {
-            op->reply.sec_del.version = op->bs_op->version;
-        }
-        if (op->req.hdr.opcode == OSD_OP_SECONDARY_READ &&
-            op->reply.hdr.retval > 0)
-        {
-            op->send_list.push_back(op->buf, op->reply.hdr.retval);
-        }
-        else if (op->req.hdr.opcode == OSD_OP_SECONDARY_LIST)
-        {
-            // allocated by blockstore
-            op->buf = op->bs_op->buf;
-            if (op->reply.hdr.retval > 0)
-            {
-                op->send_list.push_back(op->buf, op->reply.hdr.retval * sizeof(obj_ver_id));
-            }
-            op->reply.sec_list.stable_count = op->bs_op->version;
-        }
-        auto & cl = cl_it->second;
-        outbox_push(cl, op);
+        op->reply.sec_rw.version = op->bs_op->version;
    }
-    else
+    else if (op->req.hdr.opcode == OSD_OP_SECONDARY_DELETE)
    {
-        delete op;
+        op->reply.sec_del.version = op->bs_op->version;
    }
+    if (op->req.hdr.opcode == OSD_OP_SECONDARY_READ &&
+        op->bs_op->retval > 0)
+    {
+        op->send_list.push_back(op->buf, op->bs_op->retval); // a positive retval is used as the payload length
+    }
+    else if (op->req.hdr.opcode == OSD_OP_SECONDARY_LIST)
+    {
+        // allocated by blockstore
+        op->buf = op->bs_op->buf;
+        if (op->bs_op->retval > 0)
+        {
+            op->send_list.push_back(op->buf, op->bs_op->retval * sizeof(obj_ver_id)); // retval is used as the number of obj_ver_id entries
+        }
+        op->reply.sec_list.stable_count = op->bs_op->version;
+    }
+    int retval = op->bs_op->retval; // saved here because bs_op is deleted before finish_op() is called
+    delete op->bs_op;
+    op->bs_op = NULL;
+    finish_op(op, retval);
 }

 void osd_t::exec_secondary(osd_op_t *cur_op)
@@ -95,7 +84,7 @@ void osd_t::exec_secondary(osd_op_t *cur_op)
            secondary_op_callback(cur_op);
            return;
        }
-        cur_op->bs_op->oid.stripe = cur_op->req.sec_list.parity_block_size;
+        cur_op->bs_op->oid.stripe = cur_op->req.sec_list.pg_stripe_size;
        cur_op->bs_op->len = cur_op->req.sec_list.pg_count;
        cur_op->bs_op->offset = cur_op->req.sec_list.list_pg - 1;
 #ifdef OSD_STUB
@@ -114,15 +103,10 @@ void osd_t::exec_show_config(osd_op_t *cur_op)
 {
    // FIXME: Send the real config, not its source
    std::string cfg_str = json11::Json(config).dump();
-    cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
-    cur_op->reply.hdr.id = cur_op->req.hdr.id;
-    cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
-    cur_op->reply.hdr.retval = cfg_str.size()+1;
    cur_op->buf = malloc(cfg_str.size()+1);
    memcpy(cur_op->buf, cfg_str.c_str(), cfg_str.size()+1);
-    auto & cl = clients[cur_op->peer_fd];
-    cur_op->send_list.push_back(cur_op->buf, cur_op->reply.hdr.retval);
-    outbox_push(cl, cur_op);
+    cur_op->send_list.push_back(cur_op->buf, cfg_str.size()+1);
+    finish_op(cur_op, cfg_str.size()+1);
 }

 void osd_t::exec_sync_stab_all(osd_op_t *cur_op)
--- a/osd_send.cpp
+++ b/osd_send.cpp
@@ -1,131 +0,0 @@
-#include "osd.h"
-
-void osd_t::outbox_push(osd_client_t & cl, osd_op_t *cur_op)
-{
-    assert(cur_op->peer_fd);
-    if (cur_op->op_type == OSD_OP_OUT)
-    {
-        clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
-    }
-    cl.outbox.push_back(cur_op);
-    if (cl.write_op || cl.outbox.size() > 1 || !try_send(cl))
-    {
-        if (cl.write_state == 0)
-        {
-            cl.write_state = CL_WRITE_READY;
-            write_ready_clients.push_back(cur_op->peer_fd);
-        }
-        ringloop->wakeup();
-    }
-}
-
-bool osd_t::try_send(osd_client_t & cl)
-{
-    int peer_fd = cl.peer_fd;
-    io_uring_sqe* sqe = ringloop->get_sqe();
-    if (!sqe)
-    {
-        return false;
-    }
-    ring_data_t* data = ((ring_data_t*)sqe->user_data);
-    if (!cl.write_op)
-    {
-        // pick next command
-        cl.write_op = cl.outbox.front();
-        cl.outbox.pop_front();
-        cl.write_state = CL_WRITE_REPLY;
-        clock_gettime(CLOCK_REALTIME, &cl.write_op->tv_send);
-        if (cl.write_op->op_type == OSD_OP_IN)
-        {
-            // Measure execution latency
-            timespec tv_end = cl.write_op->tv_send;
-            op_stat_count[cl.write_op->req.hdr.opcode]++;
-            op_stat_sum[cl.write_op->req.hdr.opcode] += (
-                (tv_end.tv_sec - cl.write_op->tv_begin.tv_sec)*1000000 +
-                (tv_end.tv_nsec - cl.write_op->tv_begin.tv_nsec)/1000
-            );
-        }
-    }
-    cl.write_msg.msg_iov = cl.write_op->send_list.get_iovec();
-    cl.write_msg.msg_iovlen = cl.write_op->send_list.get_size();
-    data->callback = [this, peer_fd](ring_data_t *data) { handle_send(data, peer_fd); };
-    my_uring_prep_sendmsg(sqe, peer_fd, &cl.write_msg, 0);
-    return true;
-}
-
-void osd_t::send_replies()
-{
-    for (int i = 0; i < write_ready_clients.size(); i++)
-    {
-        int peer_fd = write_ready_clients[i];
-        if (!try_send(clients[peer_fd]))
-        {
-            write_ready_clients.erase(write_ready_clients.begin(), write_ready_clients.begin() + i);
-            return;
-        }
-    }
-    write_ready_clients.clear();
-}
-
-void osd_t::handle_send(ring_data_t *data, int peer_fd)
-{
-    auto cl_it = clients.find(peer_fd);
-    if (cl_it != clients.end())
-    {
-        auto & cl = cl_it->second;
-        if (data->res < 0 && data->res != -EAGAIN)
-        {
-            // this is a client socket, so don't panic. just disconnect it
-            printf("Client %d socket write error: %d (%s). Disconnecting client\n", peer_fd, -data->res, strerror(-data->res));
-            stop_client(peer_fd);
-            return;
-        }
-        if (data->res >= 0)
-        {
-            osd_op_t *cur_op = cl.write_op;
-            while (data->res > 0 && cur_op->send_list.sent < cur_op->send_list.count)
-            {
-                iovec & iov = cur_op->send_list.buf[cur_op->send_list.sent];
-                if (iov.iov_len <= data->res)
-                {
-                    data->res -= iov.iov_len;
-                    cur_op->send_list.sent++;
-                }
-                else
-                {
-                    iov.iov_len -= data->res;
-                    iov.iov_base += data->res;
-                    break;
-                }
-            }
-            if (cur_op->send_list.sent >= cur_op->send_list.count)
-            {
-                // Done
-                if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_STABILIZE)
-                {
-                    timespec tv_end;
-                    clock_gettime(CLOCK_REALTIME, &tv_end);
-                    send_stat_count++;
-                    send_stat_sum += (
-                        (tv_end.tv_sec - cl.write_op->tv_send.tv_sec)*1000000 +
-                        (tv_end.tv_nsec - cl.write_op->tv_send.tv_nsec)/1000
-                    );
-                }
-                if (cur_op->op_type == OSD_OP_IN)
-                {
-                    delete cur_op;
-                }
-                else
-                {
-                    cl.sent_ops[cl.write_op->req.hdr.id] = cl.write_op;
-                }
-                cl.write_op = NULL;
-                cl.write_state = cl.outbox.size() > 0 ? CL_WRITE_READY : 0;
-            }
-        }
-        if (cl.write_state != 0)
-        {
-            write_ready_clients.push_back(peer_fd);
-        }
-    }
-}
--- a/osd_test.cpp
+++ b/osd_test.cpp
@@ -19,6 +19,8 @@

 int connect_osd(const char *osd_address, int osd_port);

+uint64_t test_read(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t version, uint64_t offset, uint64_t len);
+
 uint64_t test_write(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t version, uint64_t pattern);

 void* test_primary_read(int connect_fd, uint64_t inode, uint64_t offset, uint64_t len);
@@ -29,6 +31,8 @@ void test_primary_sync(int connect_fd);

 void test_sync_stab_all(int connect_fd);

+void test_list_stab(int connect_fd);
+
 int main0(int narg, char *args[])
 {
    int connect_fd;
@@ -94,7 +98,16 @@ int main2(int narg, char *args[])
    return 0;
 }

-int main(int narg, char *args[])
+// Alternative entry point: runs test_list_stab() against the OSD at 127.0.0.1:11203
+int main3(int narg, char *args[])
+{
+    const int osd_fd = connect_osd("127.0.0.1", 11203);
+    test_list_stab(osd_fd);
+    close(osd_fd);
+    return 0;
+}
+
+int main4(int narg, char *args[])
 {
    int connect_fd;
    // Cluster write (sync not implemented yet)
@@ -106,6 +119,15 @@ int main(int narg, char *args[])
    return 0;
 }

+// Entry point: runs test_read() for 128K at offset 0 of object 1:1039663104 (version=UINT64_MAX)
+// against the OSD at 192.168.7.2:43051
+int main(int narg, char *args[])
+{
+    const int osd_fd = connect_osd("192.168.7.2", 43051);
+    test_read(osd_fd, 1, 1039663104, UINT64_MAX, 0, 128*1024);
+    close(osd_fd);
+    return 0;
+}
+
 int connect_osd(const char *osd_address, int osd_port)
 {
    struct sockaddr_in addr;
@@ -148,7 +170,7 @@ bool check_reply(int r, osd_any_op_t & op, osd_any_reply_t & reply, int expected
        printf("bad reply: magic, id or opcode does not match request\n");
        return false;
    }
-    if (reply.hdr.retval != expected)
+    if (expected >= 0 && reply.hdr.retval != expected)
    {
        printf("operation failed, retval=%ld\n", reply.hdr.retval);
        return false;
@@ -156,6 +178,66 @@ bool check_reply(int r, osd_any_op_t & op, osd_any_reply_t & reply, int expected
    return true;
 }

+// Debug helper: reads <len> bytes at <offset> of object <inode>:<stripe> from a secondary OSD
+// and prints the version returned in the reply, then requests an object listing
+// (list_pg=1, pg_count=1) and prints the listed entries that refer to the same object
+// (same inode, same stripe ignoring the low 12 bits).
+// Stops at the first error; always returns 0.
+uint64_t test_read(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t version, uint64_t offset, uint64_t len)
+{
+    osd_any_op_t op;
+    osd_any_reply_t reply;
+    op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
+    op.hdr.id = 1;
+    op.hdr.opcode = OSD_OP_SECONDARY_READ;
+    op.sec_rw.oid = {
+        .inode = inode,
+        .stripe = stripe,
+    };
+    op.sec_rw.version = version;
+    op.sec_rw.offset = offset;
+    op.sec_rw.len = len;
+    void *data = memalign(MEM_ALIGNMENT, op.sec_rw.len);
+    write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
+    int r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
+    if (!check_reply(r, op, reply, op.sec_rw.len))
+    {
+        free(data);
+        return 0;
+    }
+    r = read_blocking(connect_fd, data, len);
+    if (r != len)
+    {
+        // Report the error before free(), which is allowed to modify errno
+        perror("read data");
+        free(data);
+        return 0;
+    }
+    free(data);
+    printf("Read %lu:%lu v%lu = v%lu\n", inode, stripe, version, reply.sec_rw.version);
+    // Reuse the same request packet for the listing
+    op.hdr.opcode = OSD_OP_SECONDARY_LIST;
+    op.sec_list.list_pg = 1;
+    op.sec_list.pg_count = 1;
+    op.sec_list.pg_stripe_size = 4*1024*1024;
+    write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
+    r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
+    // Validate the reply header first (a negative <expected> skips the retval comparison in check_reply),
+    // and only then interpret retval as the number of listed entries
+    if (!check_reply(r, op, reply, -1) || reply.hdr.retval < 0)
+    {
+        return 0;
+    }
+    obj_ver_id *ov = (obj_ver_id*)memalign(MEM_ALIGNMENT, sizeof(obj_ver_id)*reply.hdr.retval);
+    r = read_blocking(connect_fd, ov, sizeof(obj_ver_id)*reply.hdr.retval);
+    if (r != sizeof(obj_ver_id)*reply.hdr.retval)
+    {
+        perror("read data");
+        free(ov);
+        return 0;
+    }
+    for (int i = 0; i < reply.hdr.retval; i++)
+    {
+        if (ov[i].oid.inode == inode && (ov[i].oid.stripe & ~(4096-1)) == (stripe & ~(4096-1)))
+        {
+            printf("list: %lu:%lu v%lu stable=%d\n", ov[i].oid.inode, ov[i].oid.stripe, ov[i].version, i < reply.sec_list.stable_count ? 1 : 0);
+        }
+    }
+    // The listing buffer was previously leaked on this path
+    free(ov);
+    return 0;
+}
+
 uint64_t test_write(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t version, uint64_t pattern)
 {
    osd_any_op_t op;
@@ -170,7 +252,7 @@ uint64_t test_write(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t ve
    op.sec_rw.version = version;
    op.sec_rw.offset = 0;
    op.sec_rw.len = 128*1024;
-    void *data = memalign(512, op.sec_rw.len);
+    void *data = memalign(MEM_ALIGNMENT, op.sec_rw.len);
    for (int i = 0; i < (op.sec_rw.len)/sizeof(uint64_t); i++)
        ((uint64_t*)data)[i] = pattern;
    write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
@@ -205,7 +287,7 @@ void* test_primary_read(int connect_fd, uint64_t inode, uint64_t offset, uint64_
    op.rw.inode = inode;
    op.rw.offset = offset;
    op.rw.len = len;
-    void *data = memalign(512, len);
+    void *data = memalign(MEM_ALIGNMENT, len);
    write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
    int r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
    if (!check_reply(r, op, reply, len))
@@ -233,7 +315,7 @@ void test_primary_write(int connect_fd, uint64_t inode, uint64_t offset, uint64_
    op.rw.inode = inode;
    op.rw.offset = offset;
    op.rw.len = len;
-    void *data = memalign(512, len);
+    void *data = memalign(MEM_ALIGNMENT, len);
    set_pattern(data, len, pattern);
    write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
    write_blocking(connect_fd, data, len);
@@ -265,3 +347,40 @@ void test_sync_stab_all(int connect_fd)
    int r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
    assert(check_reply(r, op, reply, 0));
 }
+
+// Requests an object listing (OSD_OP_SECONDARY_LIST with pg_count=0) and then sends
+// OSD_OP_SECONDARY_STABILIZE for the listed entries [stable_count, total_count),
+// in portions of 32 entries. Any failure aborts via assert().
+void test_list_stab(int connect_fd)
+{
+    osd_any_op_t op;
+    osd_any_reply_t reply;
+    op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
+    op.hdr.id = 1;
+    op.hdr.opcode = OSD_OP_SECONDARY_LIST;
+    op.sec_list.pg_count = 0;
+    // I/O calls are made outside of assert() so that they are not compiled out with -DNDEBUG
+    auto sent = write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
+    assert(sent == OSD_PACKET_SIZE);
+    int r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
+    bool ok = check_reply(r, op, reply, -1);
+    assert(ok);
+    // check_reply() does not compare retval when <expected> is negative,
+    // so reject negative values explicitly before using retval as the entry count
+    assert(reply.hdr.retval >= 0);
+    int total_count = reply.hdr.retval;
+    int stable_count = reply.sec_list.stable_count;
+    obj_ver_id *data = (obj_ver_id*)malloc(total_count * sizeof(obj_ver_id));
+    assert(data);
+    auto received = read_blocking(connect_fd, data, total_count * sizeof(obj_ver_id));
+    assert(received == (total_count * sizeof(obj_ver_id)));
+    int last_start = stable_count;
+    for (int i = stable_count; i <= total_count; i++)
+    {
+        // Stabilize in portions of 32 entries
+        if (i - last_start >= 32 || i == total_count)
+        {
+            op.hdr.opcode = OSD_OP_SECONDARY_STABILIZE;
+            op.sec_stab.len = sizeof(obj_ver_id) * (i - last_start);
+            sent = write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
+            assert(sent == OSD_PACKET_SIZE);
+            sent = write_blocking(connect_fd, data + last_start, op.sec_stab.len);
+            assert(sent == op.sec_stab.len);
+            r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
+            ok = check_reply(r, op, reply, 0);
+            assert(ok);
+            last_start = i;
+        }
+    }
+    free(data);
+}
--- a/pg_states.cpp
+++ b/pg_states.cpp
@@ -0,0 +1,33 @@
+#include "pg_states.h"
+
+// Number of entries in pg_state_bits[] and pg_state_names[].
+// Must match the number of initializers below: a larger value leaves a trailing
+// zero bit and a NULL name at the end of the tables.
+const int pg_state_bit_count = 12;
+
+// PG state flags, in the same order as pg_state_names[]
+const int pg_state_bits[12] = {
+    PG_STARTING,
+    PG_PEERING,
+    PG_INCOMPLETE,
+    PG_ACTIVE,
+    PG_STOPPING,
+    PG_OFFLINE,
+    PG_DEGRADED,
+    PG_HAS_INCOMPLETE,
+    PG_HAS_DEGRADED,
+    PG_HAS_MISPLACED,
+    PG_HAS_UNCLEAN,
+    PG_LEFT_ON_DEAD,
+};
+
+// Textual name of the flag at the same index of pg_state_bits[]
+const char *pg_state_names[12] = {
+    "starting",
+    "peering",
+    "incomplete",
+    "active",
+    "stopping",
+    "offline",
+    "degraded",
+    "has_incomplete",
+    "has_degraded",
+    "has_misplaced",
+    "has_unclean",
+    "left_on_dead",
+};
--- a/pg_states.h
+++ b/pg_states.h
@@ -0,0 +1,33 @@
+#pragma once
+
+// Placement group states
+// STARTING -> [acquire lock] -> PEERING -> INCOMPLETE|ACTIVE -> STOPPING -> OFFLINE -> [release lock]
+// Exactly one of these:
+#define PG_STARTING (1<<0)
+#define PG_PEERING (1<<1)
+#define PG_INCOMPLETE (1<<2)
+#define PG_ACTIVE (1<<3)
+#define PG_STOPPING (1<<4)
+#define PG_OFFLINE (1<<5)
+// Plus any of these:
+#define PG_DEGRADED (1<<6)
+#define PG_HAS_INCOMPLETE (1<<7)
+#define PG_HAS_DEGRADED (1<<8)
+#define PG_HAS_MISPLACED (1<<9)
+#define PG_HAS_UNCLEAN (1<<10)
+#define PG_LEFT_ON_DEAD (1<<11)
+
+// FIXME: Safe default that doesn't depend on pg_stripe_size or pg_block_size
+#define STRIPE_MASK ((uint64_t)4096 - 1)
+
+// OSD object states
+#define OBJ_DEGRADED 0x02
+#define OBJ_INCOMPLETE 0x04
+#define OBJ_MISPLACED 0x08
+#define OBJ_NEEDS_STABLE 0x10000
+#define OBJ_NEEDS_ROLLBACK 0x20000
+#define OBJ_BUGGY 0x80000
+
+extern const int pg_state_bits[];
+extern const char *pg_state_names[];
+extern const int pg_state_bit_count;
--- a/ringloop.cpp
+++ b/ringloop.cpp
@@ -18,6 +18,7 @@ ring_loop_t::ring_loop_t(int qd)
    {
        free_ring_data[i] = i;
    }
+    wait_sqe_id = 1;
 }

 ring_loop_t::~ring_loop_t()
@@ -27,11 +28,10 @@ ring_loop_t::~ring_loop_t()
    io_uring_queue_exit(&ring);
 }

-int ring_loop_t::register_consumer(ring_consumer_t & consumer)
+void ring_loop_t::register_consumer(ring_consumer_t *consumer)
 {
-    consumer.number = consumers.size();
+    unregister_consumer(consumer);
    consumers.push_back(consumer);
-    return consumer.number;
 }

 void ring_loop_t::wakeup()
@@ -39,12 +39,15 @@ void ring_loop_t::wakeup()
    loop_again = true;
 }

-void ring_loop_t::unregister_consumer(ring_consumer_t & consumer)
+void ring_loop_t::unregister_consumer(ring_consumer_t *consumer)
 {
-    if (consumer.number >= 0 && consumer.number < consumers.size())
+    for (int i = 0; i < consumers.size(); i++)
    {
-        consumers[consumer.number].loop = NULL;
-        consumer.number = -1;
+        if (consumers[i] == consumer)
+        {
+            consumers.erase(consumers.begin()+i, consumers.begin()+i+1);
+            break;
+        }
    }
 }

@@ -62,12 +65,17 @@ void ring_loop_t::loop()
        free_ring_data[free_ring_data_ptr++] = d - ring_datas;
        io_uring_cqe_seen(&ring, cqe);
    }
+    while (get_sqe_queue.size() > 0)
+    {
+        (get_sqe_queue[0].second)();
+        get_sqe_queue.erase(get_sqe_queue.begin());
+    }
    do
    {
        loop_again = false;
        for (int i = 0; i < consumers.size(); i++)
        {
-            consumers[i].loop();
+            consumers[i]->loop();
        }
    } while (loop_again);
 }
--- a/ringloop.h
+++ b/ringloop.h
@@ -4,6 +4,8 @@
 #define _LARGEFILE64_SOURCE
 #endif

+#include <stdio.h>
+#include <time.h>
 #include <string.h>
 #include <assert.h>
 #include <liburing.h>
@@ -113,23 +115,24 @@ struct ring_data_t

 struct ring_consumer_t
 {
-    int number;
    std::function<void(void)> loop;
 };

 class ring_loop_t
 {
-    std::vector<ring_consumer_t> consumers;
+    std::vector<std::pair<int,std::function<void()>>> get_sqe_queue;
+    std::vector<ring_consumer_t*> consumers;
    struct ring_data_t *ring_datas;
    int *free_ring_data;
+    int wait_sqe_id;
    unsigned free_ring_data_ptr;
    bool loop_again;
    struct io_uring ring;
 public:
    ring_loop_t(int qd);
    ~ring_loop_t();
-    int register_consumer(ring_consumer_t & consumer);
-    void unregister_consumer(ring_consumer_t & consumer);
+    void register_consumer(ring_consumer_t *consumer);
+    void unregister_consumer(ring_consumer_t *consumer);

    inline struct io_uring_sqe* get_sqe()
    {
@@ -140,9 +143,30 @@ public:
            io_uring_sqe_set_data(sqe, ring_datas + free_ring_data[--free_ring_data_ptr]);
        return sqe;
    }
+    // Appends <cb> to get_sqe_queue and returns the id assigned to this
+    // queue entry; the id can be passed to cancel_wait_sqe()
+    inline int wait_sqe(std::function<void()> cb)
+    {
+        const int assigned_id = wait_sqe_id++;
+        get_sqe_queue.push_back({ assigned_id, cb });
+        return assigned_id;
+    }
+    // Removes the entry registered under <wait_id> by wait_sqe() from get_sqe_queue.
+    // Does nothing if the queue has no entry with this id.
+    inline void cancel_wait_sqe(int wait_id)
+    {
+        for (int i = 0; i < get_sqe_queue.size(); i++)
+        {
+            if (get_sqe_queue[i].first == wait_id)
+            {
+                get_sqe_queue.erase(get_sqe_queue.begin()+i, get_sqe_queue.begin()+i+1);
+                // wait_sqe() hands out every id only once, so there is nothing more to find.
+                // Stopping here also avoids skipping the element which moved into slot <i>.
+                break;
+            }
+        }
+    }
    inline int submit()
    {
-        return io_uring_submit(&ring);
+        int r = io_uring_submit(&ring);
+        {
+            timespec now;
+            clock_gettime(CLOCK_REALTIME, &now);
+            printf("submit %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
+        }
+        return r;
    }
    inline int wait()
    {
@@ -153,7 +177,7 @@ public:
    {
        return free_ring_data_ptr;
    }
-    inline bool get_loop_again()
+    inline bool has_work()
    {
        return loop_again;
    }
--- a/rw_blocking.cpp
+++ b/rw_blocking.cpp
@@ -51,6 +51,40 @@ int write_blocking(int fd, void *write_buf, size_t remaining)
    return done;
 }

+// Reads from <fd> until all <iovcnt> buffers described by <iov> are filled,
+// repeating readv() after short reads.
+// <iov> is modified in place: the already filled part is cut off a partially
+// filled buffer before the next attempt.
+// Returns the total number of bytes read. It is less than the total length of
+// the buffers only if readv() reports end of file before everything is read.
+// Terminates the process on read errors other than EAGAIN, EINTR and EPIPE.
+int readv_blocking(int fd, iovec *iov, int iovcnt)
+{
+    int v = 0;
+    int done = 0;
+    while (v < iovcnt)
+    {
+        if (!iov[v].iov_len)
+        {
+            // Nothing to read into this buffer
+            v++;
+            continue;
+        }
+        // Only pass the buffers which aren't completely filled yet
+        ssize_t r = readv(fd, iov+v, iovcnt-v);
+        if (r < 0)
+        {
+            if (errno != EAGAIN && errno != EINTR && errno != EPIPE)
+            {
+                perror("readv");
+                exit(1);
+            }
+            continue;
+        }
+        if (r == 0)
+        {
+            // End of file: no more data will arrive, retrying would spin forever
+            break;
+        }
+        done += r;
+        // Distribute the received byte count over the buffers
+        while (v < iovcnt && r > 0)
+        {
+            if (iov[v].iov_len > (size_t)r)
+            {
+                // Buffer is filled only partially - continue from the middle of it
+                iov[v].iov_len -= r;
+                iov[v].iov_base = (char*)iov[v].iov_base + r;
+                r = 0;
+            }
+            else
+            {
+                // Buffer is filled completely - the rest of <r> belongs to the next ones
+                r -= iov[v].iov_len;
+                v++;
+            }
+        }
+    }
+    return done;
+}
+
 int writev_blocking(int fd, iovec *iov, int iovcnt)
 {
    int v = 0;
--- a/rw_blocking.h
+++ b/rw_blocking.h
@@ -5,4 +5,5 @@

 int read_blocking(int fd, void *read_buf, size_t remaining);
 int write_blocking(int fd, void *write_buf, size_t remaining);
+int readv_blocking(int fd, iovec *iov, int iovcnt);
 int writev_blocking(int fd, iovec *iov, int iovcnt);
--- a/stub_bench.cpp
+++ b/stub_bench.cpp
@@ -25,20 +25,37 @@ int connect_stub(const char *server_address, int server_port);

 void run_bench(int peer_fd);

+static uint64_t read_sum = 0, read_count = 0;
 static uint64_t write_sum = 0, write_count = 0;
 static uint64_t sync_sum = 0, sync_count = 0;

 void handle_sigint(int sig)
 {
-    printf("4k randwrite: %lu us avg\n", write_sum/write_count);
-    printf("sync: %lu us avg\n", sync_sum/sync_count);
+    printf("4k randread: %lu us avg\n", read_count ? read_sum/read_count : 0);
+    printf("4k randwrite: %lu us avg\n", write_count ? write_sum/write_count : 0);
+    printf("sync: %lu us avg\n", sync_count ? sync_sum/sync_count : 0);
    exit(0);
 }

 int main(int narg, char *args[])
 {
+    if (narg < 2)
+    {
+        printf("USAGE: %s SERVER_IP [PORT]\n", args[0]);
+        return 1;
+    }
+    int port = 11203;
+    if (narg >= 3)
+    {
+        port = atoi(args[2]);
+        if (port <= 0 || port >= 65536)
+        {
+            printf("Bad port number\n");
+            return 1;
+        }
+    }
    signal(SIGINT, handle_sigint);
-    int peer_fd = connect_stub("127.0.0.1", 11203);
+    int peer_fd = connect_stub(args[1], port);
    run_bench(peer_fd);
    close(peer_fd);
    return 0;
@@ -98,10 +115,37 @@ void run_bench(int peer_fd)
    osd_any_reply_t reply;
    void *buf = NULL;
    int r;
+    iovec iov[2];
    timespec tv_begin, tv_end;
    clock_gettime(CLOCK_REALTIME, &tv_begin);
    while (1)
    {
+        // read
+        op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
+        op.hdr.id = 1;
+        op.hdr.opcode = OSD_OP_SECONDARY_READ;
+        op.sec_rw.oid.inode = 3;
+        op.sec_rw.oid.stripe = (rand() << 17) % (1 << 29); // 512 MB
+        op.sec_rw.version = 0;
+        op.sec_rw.len = 4096;
+        op.sec_rw.offset = (rand() * op.sec_rw.len) % (1 << 17);
+        r = write_blocking(peer_fd, op.buf, OSD_PACKET_SIZE) == OSD_PACKET_SIZE;
+        if (!r)
+            break;
+        buf = malloc(op.sec_rw.len);
+        iov[0] = { reply.buf, OSD_PACKET_SIZE };
+        iov[1] = { buf, op.sec_rw.len };
+        r = readv_blocking(peer_fd, iov, 2) == (OSD_PACKET_SIZE + op.sec_rw.len);
+        free(buf);
+        if (!r || !check_reply(OSD_PACKET_SIZE, op, reply, op.sec_rw.len))
+            break;
+        clock_gettime(CLOCK_REALTIME, &tv_end);
+        read_count++;
+        read_sum += (
+            (tv_end.tv_sec - tv_begin.tv_sec)*1000000 +
+            tv_end.tv_nsec/1000 - tv_begin.tv_nsec/1000
+        );
+        tv_begin = tv_end;
        // write
        op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
        op.hdr.id = 1;
@@ -113,9 +157,9 @@ void run_bench(int peer_fd)
        op.sec_rw.offset = (rand() * op.sec_rw.len) % (1 << 17);
        buf = malloc(op.sec_rw.len);
        memset(buf, rand() % 255, op.sec_rw.len);
-        r = write_blocking(peer_fd, op.buf, OSD_PACKET_SIZE) == OSD_PACKET_SIZE;
-        if (r)
-            r = write_blocking(peer_fd, buf, op.sec_rw.len) == op.sec_rw.len;
+        iov[0] = { op.buf, OSD_PACKET_SIZE };
+        iov[1] = { buf, op.sec_rw.len };
+        r = writev_blocking(peer_fd, iov, 2) == (OSD_PACKET_SIZE + op.sec_rw.len);
        free(buf);
        if (!r)
            break;
@@ -128,6 +172,7 @@ void run_bench(int peer_fd)
            (tv_end.tv_sec - tv_begin.tv_sec)*1000000 +
            tv_end.tv_nsec/1000 - tv_begin.tv_nsec/1000
        );
+        tv_begin = tv_end;
        // sync/stab
        op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
        op.hdr.id = 1;
@@ -138,11 +183,12 @@ void run_bench(int peer_fd)
        r = read_blocking(peer_fd, reply.buf, OSD_PACKET_SIZE);
        if (!check_reply(r, op, reply, 0))
            break;
-        clock_gettime(CLOCK_REALTIME, &tv_begin);
+        clock_gettime(CLOCK_REALTIME, &tv_end);
        sync_count++;
        sync_sum += (
-            (tv_begin.tv_sec - tv_end.tv_sec)*1000000 +
-            tv_begin.tv_nsec/1000 - tv_end.tv_nsec/1000
+            (tv_end.tv_sec - tv_begin.tv_sec)*1000000 +
+            tv_end.tv_nsec/1000 - tv_begin.tv_nsec/1000
        );
+        tv_begin = tv_end;
    }
 }
--- a/stub_uring_osd.cpp
+++ b/stub_uring_osd.cpp
@@ -0,0 +1,129 @@
+/**
+ * Stub "OSD" implemented on top of osd_messenger to test & compare
+ * network performance with sync read/write and io_uring
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdlib.h>
+
+#include <stdexcept>
+
+#include "ringloop.h"
+#include "epoll_manager.h"
+#include "messenger.h"
+
+int bind_stub(const char *bind_address, int bind_port);
+
+void stub_exec_op(osd_messenger_t *msgr, osd_op_t *op);
+
+// Entry point of the stub OSD: creates the ring loop, the epoll manager and
+// the messenger, starts listening on 0.0.0.0:11203 and then runs the event
+// loop forever. Command line arguments are not used.
+int main(int narg, char *args[])
+{
+    ring_consumer_t looper;
+    ring_loop_t *ringloop = new ring_loop_t(512);
+    epoll_manager_t *epmgr = new epoll_manager_t(ringloop);
+    osd_messenger_t *msgr = new osd_messenger_t();
+    msgr->osd_num = 1351;
+    msgr->tfd = epmgr->tfd;
+    msgr->ringloop = ringloop;
+    // The stub has no PGs, so the repeer callback does nothing
+    msgr->repeer_pgs = [](osd_num_t) {};
+    // Every operation passed to exec_op is answered by stub_exec_op()
+    msgr->exec_op = [msgr](osd_op_t *op) { stub_exec_op(msgr, op); };
+    // Accept new connections
+    int listen_fd = bind_stub("0.0.0.0", 11203);
+    epmgr->set_fd_handler(listen_fd, [listen_fd, msgr](int fd, int events)
+    {
+        msgr->accept_connections(listen_fd);
+    });
+    // Ring loop consumer: on each iteration let the messenger read requests
+    // and send replies, then submit the ring
+    looper.loop = [msgr, ringloop]()
+    {
+        msgr->read_requests();
+        msgr->send_replies();
+        ringloop->submit();
+    };
+    ringloop->register_consumer(&looper);
+    printf("stub_uring_osd: waiting for clients\n");
+    while (true)
+    {
+        ringloop->loop();
+        ringloop->wait();
+    }
+    // The loop above has no exit condition, so the cleanup below is never reached
+    delete msgr;
+    delete epmgr;
+    delete ringloop;
+    return 0;
+}
+
+// Creates a non-blocking TCP listening socket bound to <bind_address>:<bind_port>.
+// <bind_address> must be a textual IPv4 address (it is parsed with inet_pton(AF_INET)).
+// Returns the listening socket FD.
+// Throws std::runtime_error if the address can't be parsed or if socket(),
+// bind() or listen() fails; the socket is closed before throwing.
+int bind_stub(const char *bind_address, int bind_port)
+{
+    int listen_backlog = 128;
+
+    int listen_fd = socket(AF_INET, SOCK_STREAM, 0);
+    if (listen_fd < 0)
+    {
+        throw std::runtime_error(std::string("socket: ") + strerror(errno));
+    }
+    int enable = 1;
+    setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable));
+
+    // Zero-fill the address so that the sin_zero padding isn't passed to bind() uninitialized
+    sockaddr_in addr = {};
+    int r;
+    if ((r = inet_pton(AF_INET, bind_address, &addr.sin_addr)) != 1)
+    {
+        close(listen_fd);
+        throw std::runtime_error("bind address "+std::string(bind_address)+(r == 0 ? " is not valid" : ": no ipv4 support"));
+    }
+    addr.sin_family = AF_INET;
+    addr.sin_port = htons(bind_port);
+
+    if (bind(listen_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
+    {
+        close(listen_fd);
+        throw std::runtime_error(std::string("bind: ") + strerror(errno));
+    }
+
+    if (listen(listen_fd, listen_backlog) < 0)
+    {
+        close(listen_fd);
+        throw std::runtime_error(std::string("listen: ") + strerror(errno));
+    }
+
+    // Switch the listening socket to non-blocking mode
+    fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
+
+    return listen_fd;
+}
+
+// Stub operation handler: fills a reply header for <op> without doing any
+// real I/O and pushes the operation to the messenger's outbox.
+// Reads get a reply with a data buffer of the requested length, writes and
+// TEST_SYNC_STAB_ALL get a header-only reply, anything else gets -EINVAL.
+void stub_exec_op(osd_messenger_t *msgr, osd_op_t *op)
+{
+    op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
+    op->reply.hdr.id = op->req.hdr.id;
+    op->reply.hdr.opcode = op->req.hdr.opcode;
+    // The reply header is always sent first
+    op->send_list.push_back(op->reply.buf, OSD_PACKET_SIZE);
+    if (op->req.hdr.opcode == OSD_OP_SECONDARY_READ)
+    {
+        op->reply.hdr.retval = op->req.sec_rw.len;
+        // NOTE(review): the buffer is neither checked for NULL nor initialized,
+        // so the reply carries arbitrary heap contents. It isn't freed here -
+        // presumably osd_op_t releases op->buf; verify against its destructor
+        op->buf = malloc(op->req.sec_rw.len);
+        op->send_list.push_back(op->buf, op->req.sec_rw.len);
+    }
+    else if (op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
+    {
+        op->reply.hdr.retval = op->req.sec_rw.len;
+    }
+    else if (op->req.hdr.opcode == OSD_OP_TEST_SYNC_STAB_ALL)
+    {
+        op->reply.hdr.retval = 0;
+    }
+    else
+    {
+        // NOTE(review): %lu assumes that hdr.opcode is an unsigned long - confirm
+        printf("client %d: unsupported stub opcode: %lu\n", op->peer_fd, op->req.hdr.opcode);
+        op->reply.hdr.retval = -EINVAL;
+    }
+    msgr->outbox_push(op);
+}
--- a/test.cpp
+++ b/test.cpp
@@ -13,6 +13,7 @@
 #include <assert.h>
 #include <stdio.h>
 #include <liburing.h>
+#include <math.h>

 #include <sys/socket.h>
 #include <sys/epoll.h>
@@ -61,24 +62,6 @@ static void test_write(struct io_uring *ring, int fd)
    free(buf);
 }

-class obj_ver_hash
-{
-public:
-    size_t operator()(const obj_ver_id &s) const
-    {
-        size_t seed = 0;
-        spp::hash_combine(seed, s.oid.inode);
-        spp::hash_combine(seed, s.oid.stripe);
-        spp::hash_combine(seed, s.version);
-        return seed;
-    }
-};
-
-inline bool operator == (const obj_ver_id & a, const obj_ver_id & b)
-{
-    return a.oid == b.oid && a.version == b.version;
-}
-
 int main00(int argc, char *argv[])
 {
    // queue with random removal: vector is best :D
@@ -170,9 +153,9 @@ int main0(int argc, char *argv[])
    // btree_map 5M entries monotone -> 0.458s, random -> 5.429s
    // absl::btree_map 5M entries random -> 5.09s
    // sparse_hash_map 5M entries -> 2.193s, random -> 2.586s
-    //btree::btree_map<obj_ver_id, dirty_entry> dirty_db;
+    btree::btree_map<obj_ver_id, dirty_entry> dirty_db;
    //std::map<obj_ver_id, dirty_entry> dirty_db;
-    spp::sparse_hash_map<obj_ver_id, dirty_entry, obj_ver_hash> dirty_db;
+    //spp::sparse_hash_map<obj_ver_id, dirty_entry, obj_ver_hash> dirty_db;
    for (int i = 0; i < 5000000; i++)
    {
        dirty_db[(obj_ver_id){
@@ -182,7 +165,7 @@ int main0(int argc, char *argv[])
            },
            .version = 1,
        }] = (dirty_entry){
-            .state = ST_D_META_SYNCED,
+            .state = ST_D_SYNCED,
            .flags = 0,
            .location = (uint64_t)i << 17,
            .offset = 0,
@@ -337,87 +320,253 @@ int main04(int argc, char *argv[])
    return 0;
 }

-int main05(int argc, char *argv[])
+uint64_t jumphash(uint64_t key, int count)
 {
-    // FIXME extract this into a test
-    pg_t pg = {
-        .state = PG_PEERING,
-        .pg_num = 1,
-        .target_set = { 1, 2, 3 },
-        .cur_set = { 1, 2, 3 },
-        .peering_state = new pg_peering_state_t(),
-    };
-    for (uint64_t osd_num = 1; osd_num <= 3; osd_num++)
+    uint64_t b = 0;
+    uint64_t seed = key;
+    for (int j = 1; j < count; j++)
    {
-        pg_list_result_t r = {
-            .buf = (obj_ver_id*)malloc(sizeof(obj_ver_id) * 1024*1024*8),
-            .total_count = 1024*1024*8,
-            .stable_count = (uint64_t)(1024*1024*8 - (osd_num == 1 ? 10 : 0)),
-        };
-        for (uint64_t i = 0; i < r.total_count; i++)
+        seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
+        if (seed < (UINT64_MAX / (j+1)))
        {
-            r.buf[i] = {
-                .oid = {
-                    .inode = 1,
-                    .stripe = (i << STRIPE_SHIFT) | (osd_num-1),
-                },
-                .version = (uint64_t)(osd_num == 1 && i >= r.total_count - 10 ? 2 : 1),
-            };
+            b = j;
        }
-        pg.peering_state->list_results[osd_num] = r;
    }
-    pg.calc_object_states();
-    printf("deviation variants=%ld clean=%lu\n", pg.state_dict.size(), pg.clean_count);
-    for (auto it: pg.state_dict)
+    return b;
+}
+
+void jumphash_prepare(int count, uint64_t *out_weights, uint64_t *in_weights)
+{
+    if (count <= 0)
    {
-        printf("dev: state=%lx\n", it.second.state);
+        return;
+    }
+    uint64_t total_weight = in_weights[0];
+    out_weights[0] = UINT64_MAX;
+    for (int j = 1; j < count; j++)
+    {
+        total_weight += in_weights[j];
+        out_weights[j] = UINT64_MAX / total_weight * in_weights[j];
+    }
+}
+
+uint64_t jumphash_weights(uint64_t key, int count, uint64_t *prepared_weights)
+{
+    uint64_t b = 0;
+    uint64_t seed = key;
+    for (int j = 1; j < count; j++)
+    {
+        seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
+        if (seed < prepared_weights[j])
+        {
+            b = j;
+        }
+    }
+    return b;
+}
+
+void jumphash3(uint64_t key, int count, uint64_t *weights, uint64_t *r)
+{
+    r[0] = 0;
+    r[1] = 1;
+    r[2] = 2;
+    uint64_t total_weight = weights[0]+weights[1]+weights[2];
+    uint64_t seed = key;
+    for (int j = 3; j < count; j++)
+    {
+        seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
+        total_weight += weights[j];
+        if (seed < UINT64_MAX*1.0*weights[j]/total_weight)
+            r[0] = j;
+        else
+        {
+            seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
+            if (seed < UINT64_MAX*1.0*weights[j]/total_weight)
+                r[1] = j;
+            else
+            {
+                seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
+                if (seed < UINT64_MAX*1.0*weights[j]/total_weight)
+                    r[2] = j;
+            }
+        }
+    }
+}
+
+uint64_t crush(uint64_t key, int count, uint64_t *weights)
+{
+    uint64_t b = 0;
+    uint64_t seed = 0;
+    uint64_t max = 0;
+    for (int j = 0; j < count; j++)
+    {
+        seed = (key + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
+        seed ^= (j + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
+        seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
+        seed = -log(((double)seed) / (1ul << 32) / (1ul << 32)) * weights[j];
+        if (seed > max)
+        {
+            max = seed;
+            b = j;
+        }
+    }
+    return b;
+}
+
+void crush3(uint64_t key, int count, uint64_t *weights, uint64_t *r, uint64_t total_weight)
+{
+    uint64_t seed = 0;
+    uint64_t max = 0;
+    for (int k1 = 0; k1 < count; k1++)
+    {
+        for (int k2 = k1+1; k2 < count; k2++)
+        {
+            if (k2 == k1)
+            {
+                continue;
+            }
+            for (int k3 = k2+1; k3 < count; k3++)
+            {
+                if (k3 == k1 || k3 == k2)
+                {
+                    continue;
+                }
+                seed = (key + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
+                seed ^= (k1 + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
+                seed ^= (k2 + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
+                seed ^= (k3 + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
+                seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
+                //seed = ((double)seed) / (1ul << 32) / (1ul << 32) * (weights[k1] + weights[k2] + weights[k3]);
+                seed = ((double)seed) / (1ul << 32) / (1ul << 32) * (1 -
+                    (1 - 1.0*weights[k1]/total_weight)*
+                    (1 - 1.0*weights[k2]/total_weight)*
+                    (1 - 1.0*weights[k3]/total_weight)
+                ) * UINT64_MAX;
+                if (seed > max)
+                {
+                    r[0] = k1;
+                    r[1] = k2;
+                    r[2] = k3;
+                    max = seed;
+                }
+            }
+        }
    }
-    return 0;
 }

 int main(int argc, char *argv[])
 {
-    timeval fill_start, fill_end, filter_end;
-    spp::sparse_hash_map<object_id, clean_entry> clean_db;
-    //std::map<object_id, clean_entry> clean_db;
-    //btree::btree_map<object_id, clean_entry> clean_db;
-    gettimeofday(&fill_start, NULL);
-    printf("filling\n");
-    uint64_t total = 1024*1024*8*4;
-    clean_db.resize(total);
-    for (uint64_t i = 0; i < total; i++)
+    int host_count = 6;
+    uint64_t host_weights[] = {
+        34609*3,
+        34931*3,
+        35850+36387+35859,
+        36387,
+        36387*2,
+        36387,
+    };
+    /*int osd_count[] = { 3, 3, 3, 1, 2 };
+    uint64_t osd_weights[][3] = {
+        { 34609, 34609, 34609 },
+        { 34931, 34931, 34931 },
+        { 35850, 36387, 35859 },
+        { 36387 },
+        { 36387, 36387 },
+    };*/
+    uint64_t total_weight = 0;
+    for (int i = 0; i < host_count; i++)
    {
-        clean_db[(object_id){
-            .inode = 1,
-            //.stripe = (i << STRIPE_SHIFT),
-            .stripe = (((367*i) % total) << STRIPE_SHIFT),
-        }] = (clean_entry){
-            .version = 1,
-            .location = i << DEFAULT_ORDER,
-        };
+        total_weight += host_weights[i];
    }
-    gettimeofday(&fill_end, NULL);
-    // no resize():
-    // spp = 17.87s (seq), 41.81s (rand), 3.29s (seq+resize), 8.3s (rand+resize), ~1.3G RAM in all cases
-    // std::unordered_map = 6.14 sec, ~2.3G RAM
-    // std::map = 13 sec (seq), 5.54 sec (rand), ~2.5G RAM
-    // cpp-btree = 2.47 sec (seq) ~1.2G RAM, 20.6 sec (pseudo-random 367*i % total) ~1.5G RAM
-    printf("filled %.2f sec\n", (fill_end.tv_sec - fill_start.tv_sec) + (fill_end.tv_usec - fill_start.tv_usec) / 1000000.0);
-    for (int pg = 0; pg < 100; pg++)
+    uint64_t host_weights_prepared[host_count];
+    jumphash_prepare(host_count, host_weights_prepared, host_weights);
+    uint64_t total_pgs[host_count] = { 0 };
+    int pg_count = 256;
+    double uniformity[pg_count] = { 0 };
+    for (uint64_t pg = 1; pg <= pg_count; pg++)
    {
-        obj_ver_id* buf1 = (obj_ver_id*)malloc(sizeof(obj_ver_id) * ((total+99)/100));
-        int j = 0;
-        for (auto it: clean_db)
-            if ((it.first % 100) == pg)
-                buf1[j++] = { .oid = it.first, .version = it.second.version };
-        free(buf1);
-        printf("filtered %d\n", j);
+        uint64_t r[3];
+
+/*
+        // Select first host
+        //r[0] = jumphash_weights(pg, host_count, host_weights_prepared);
+        r[0] = crush(pg, host_count, host_weights);
+        // Select second host
+        uint64_t seed = pg;
+        r[1] = r[0];
+        while (r[1] == r[0])
+        {
+            seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
+            //r[1] = jumphash_weights(seed, host_count, host_weights_prepared);
+            r[1] = crush(seed, host_count, host_weights);
+        }
+        // Select third host
+        seed = pg;
+        r[2] = r[0];
+        while (r[2] == r[0] || r[2] == r[1])
+        {
+            seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
+            //r[2] = jumphash_weights(seed, host_count, host_weights_prepared);
+            r[2] = crush(seed, host_count, host_weights);
+        }
+*/
+
+/*
+        // Select second host
+        uint64_t host_weights1[host_count];
+        for (int i = 0; i < r[0]; i++)
+            host_weights1[i] = host_weights[i];
+        for (int i = r[0]+1; i < host_count; i++)
+            host_weights1[i-1] = host_weights[i];
+        r[1] = crush(pg, host_count-1, host_weights1);
+        // Select third host
+        for (int i = r[1]+1; i < host_count-1; i++)
+            host_weights1[i-1] = host_weights[i];
+        r[2] = crush(pg, host_count-2, host_weights1);
+        // Transform numbers
+        r[2] = r[2] >= r[1] ? 1+r[2] : r[2];
+        r[2] = r[2] >= r[0] ? 1+r[2] : r[2];
+        r[1] = r[1] >= r[0] ? 1+r[1] : r[1];
+*/
+
+        crush3(pg, host_count, host_weights, r, total_weight);
+        uint64_t shift = (2862933555777941757ull*pg + 3037000493ull) % host_count;
+        if (shift == 1)
+        {
+            uint64_t tmp;
+            tmp = r[0];
+            r[0] = r[1];
+            r[1] = r[2];
+            r[2] = tmp;
+        }
+        else if (shift == 2)
+        {
+            uint64_t tmp;
+            tmp = r[0];
+            r[0] = r[2];
+            r[2] = r[1];
+            r[1] = tmp;
+        }
+
+        total_pgs[r[0]]++;
+        total_pgs[r[1]]++;
+        total_pgs[r[2]]++;
+
+        double u = 0;
+        for (int i = 0; i < host_count; i++)
+        {
+            double d = abs(1 - total_pgs[i]/3.0/pg * total_weight/host_weights[i]);
+            u += d;
+        }
+        uniformity[pg-1] = u/host_count;
+
+        printf("pg %lu: hosts %lu, %lu, %lu ; avg deviation = %.2f\n", pg, r[0], r[1], r[2], u/host_count);
    }
-    gettimeofday(&filter_end, NULL);
-    // spp = 42.15 sec / 60 sec (rand)
-    // std::unordered_map = 43.7 sec
-    // std::map = 156.13 sec
-    // cpp-btree = 21.87 sec (seq), 44.33 sec (rand)
-    printf("100 times filter %.2f sec\n", (filter_end.tv_sec - fill_end.tv_sec) + (filter_end.tv_usec - fill_end.tv_usec) / 1000000.0);
+    printf("total PGs: ");
+    for (int i = 0; i < host_count; i++)
+    {
+        printf(i > 0 ? ", %lu (%.2f)" : "%lu (%.2f)", total_pgs[i], total_pgs[i]/3.0/pg_count * total_weight/host_weights[i]);
+    }
+    printf("\n");
    return 0;
 }
--- a/test_blockstore.cpp
+++ b/test_blockstore.cpp
@@ -115,7 +115,7 @@ int main(int narg, char *args[])
        }
    };

-    ringloop->register_consumer(main_cons);
+    ringloop->register_consumer(&main_cons);
    while (1)
    {
        ringloop->loop();
--- a/timerfd_interval.cpp
+++ b/timerfd_interval.cpp
@@ -20,14 +20,14 @@ timerfd_interval::timerfd_interval(ring_loop_t *ringloop, int seconds, std::func
        throw std::runtime_error(std::string("timerfd_settime: ") + strerror(errno));
    }
    consumer.loop = [this]() { loop(); };
-    ringloop->register_consumer(consumer);
+    ringloop->register_consumer(&consumer);
    this->ringloop = ringloop;
    this->callback = cb;
 }

 timerfd_interval::~timerfd_interval()
 {
-    ringloop->unregister_consumer(consumer);
+    ringloop->unregister_consumer(&consumer);
    close(timerfd);
 }

--- a/timerfd_interval.h
+++ b/timerfd_interval.h
@@ -6,7 +6,6 @@ class timerfd_interval
 {
    int wait_state;
    int timerfd;
-    int status;
    ring_loop_t *ringloop;
    ring_consumer_t consumer;
    std::function<void(void)> callback;
--- a/timerfd_manager.cpp
+++ b/timerfd_manager.cpp
@@ -0,0 +1,159 @@
+#include <sys/timerfd.h>
+#include <sys/poll.h>
+#include <sys/epoll.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include "timerfd_manager.h"
+
+// Creates a non-blocking CLOCK_MONOTONIC timerfd and registers its event
+// handler through <set_fd_handler>, which is also remembered for the destructor.
+// Throws std::runtime_error if timerfd_create() fails.
+timerfd_manager_t::timerfd_manager_t(std::function<void(int, std::function<void(int, int)>)> set_fd_handler)
+{
+    wait_state = 0;
+    this->set_fd_handler = set_fd_handler;
+    timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
+    if (timerfd < 0)
+    {
+        throw std::runtime_error(std::string("timerfd_create: ") + strerror(errno));
+    }
+    // Any event reported for the timer FD is treated as "timer FD is readable"
+    auto on_timerfd_event = [this](int fd, int events)
+    {
+        handle_readable();
+    };
+    set_fd_handler(timerfd, on_timerfd_event);
+}
+
+// Replaces the timer FD handler with NULL (presumably this unregisters it -
+// verify against the set_fd_handler implementation) and closes the timer FD
+timerfd_manager_t::~timerfd_manager_t()
+{
+    set_fd_handler(timerfd, NULL);
+    close(timerfd);
+}
+
+// Moves <t>.next forward by one period of the timer (<t>.millis milliseconds).
+// Expects <t>.next.tv_nsec to be below one second on entry; in that case the
+// sum is below two seconds and a single carry is enough to normalize it.
+void timerfd_manager_t::inc_timer(timerfd_timer_t & t)
+{
+    t.next.tv_sec += t.millis/1000;
+    t.next.tv_nsec += (t.millis%1000)*1000000;
+    // Exactly 1000000000 ns is already a full second and must be carried too,
+    // otherwise tv_nsec is left outside of the valid [0, 999999999] range
+    if (t.next.tv_nsec >= 1000000000)
+    {
+        t.next.tv_sec++;
+        t.next.tv_nsec -= 1000000000;
+    }
+}
+
+// Registers a new timer which fires <millis> milliseconds from now, and then
+// again every <millis> milliseconds if <repeat> is true. <callback> receives
+// the timer id. Returns the timer id which may be passed to clear_timer().
+int timerfd_manager_t::set_timer(uint64_t millis, bool repeat, std::function<void(int)> callback)
+{
+    timespec now;
+    clock_gettime(CLOCK_MONOTONIC, &now);
+    const int new_id = id++;
+    timers.push_back({
+        .id = new_id,
+        .millis = millis,
+        .start = now,
+        .next = now,
+        .repeat = repeat,
+        .callback = callback,
+    });
+    // The first expiration is one period after the creation time
+    inc_timer(timers.back());
+    // Re-arm the timerfd because the new timer may be the nearest one
+    set_nearest();
+    return new_id;
+}
+
+// Removes the timer with id <timer_id> and re-arms the timerfd for the
+// remaining timers. Does nothing if there is no timer with this id.
+void timerfd_manager_t::clear_timer(int timer_id)
+{
+    for (int idx = 0; idx < timers.size(); idx++)
+    {
+        if (timers[idx].id != timer_id)
+        {
+            continue;
+        }
+        timers.erase(timers.begin()+idx);
+        if (nearest > idx)
+        {
+            // The nearest timer moved one slot down
+            nearest--;
+        }
+        else if (nearest == idx)
+        {
+            // The removed timer was the one the timerfd was armed for
+            nearest = -1;
+            wait_state &= ~1;
+        }
+        set_nearest();
+        return;
+    }
+}
+
+// Finds the timer with the earliest expiration time and arms the timerfd for
+// it. Timers which have already expired are triggered right away, one by one,
+// until the nearest timer is in the future or no timers are left (in the
+// latter case the timerfd is disarmed).
+// Throws std::runtime_error if timerfd_settime() fails.
+void timerfd_manager_t::set_nearest()
+{
+    while (true)
+    {
+        if (!timers.size())
+        {
+            // Nothing to wait for - disarm the timerfd
+            nearest = -1;
+            itimerspec disarm = { 0 };
+            if (timerfd_settime(timerfd, 0, &disarm, NULL))
+            {
+                throw std::runtime_error(std::string("timerfd_settime: ") + strerror(errno));
+            }
+            wait_state = wait_state & ~1;
+            return;
+        }
+        // Select the timer with the smallest <next> timestamp
+        nearest = 0;
+        for (int i = 1; i < timers.size(); i++)
+        {
+            const timespec & candidate = timers[i].next;
+            const timespec & best = timers[nearest].next;
+            if (candidate.tv_sec < best.tv_sec ||
+                (candidate.tv_sec == best.tv_sec && candidate.tv_nsec < best.tv_nsec))
+            {
+                nearest = i;
+            }
+        }
+        // Calculate the time left until it expires
+        timespec now;
+        clock_gettime(CLOCK_MONOTONIC, &now);
+        timespec left = timers[nearest].next;
+        left.tv_sec -= now.tv_sec;
+        left.tv_nsec -= now.tv_nsec;
+        if (left.tv_nsec < 0)
+        {
+            left.tv_sec--;
+            left.tv_nsec += 1000000000;
+        }
+        if (left.tv_sec < 0 || (left.tv_sec == 0 && left.tv_nsec == 0))
+        {
+            // It already happened
+            trigger_nearest();
+            continue;
+        }
+        itimerspec exp = {
+            .it_interval = { 0 },
+            .it_value = left,
+        };
+        if (timerfd_settime(timerfd, 0, &exp, NULL))
+        {
+            throw std::runtime_error(std::string("timerfd_settime: ") + strerror(errno));
+        }
+        wait_state = wait_state | 1;
+        return;
+    }
+}
+
+// Timer FD event handler: reads the 8-byte value from the timerfd, runs the
+// nearest timer if the read succeeded and one is selected, then re-arms the
+// timerfd for the next timer
+void timerfd_manager_t::handle_readable()
+{
+    uint64_t expirations;
+    const bool got_event = read(timerfd, &expirations, sizeof(expirations)) == (ssize_t)sizeof(expirations);
+    if (got_event && nearest >= 0)
+    {
+        trigger_nearest();
+    }
+    wait_state = 0;
+    set_nearest();
+}
+
+// Runs the callback of the timer at index <nearest>. A repeating timer is
+// rescheduled by one period, a one-shot timer is removed. <nearest> is reset
+// to -1 afterwards, so the caller has to select the next timer again.
+void timerfd_manager_t::trigger_nearest()
+{
+    // Copy everything needed for the call first: the timer entry is erased
+    // below if the timer is a one-shot one
+    int nearest_id = timers[nearest].id;
+    auto cb = timers[nearest].callback;
+    if (timers[nearest].repeat)
+    {
+        inc_timer(timers[nearest]);
+    }
+    else
+    {
+        timers.erase(timers.begin()+nearest, timers.begin()+nearest+1);
+    }
+    // The callback is invoked only after the timer list is updated
+    cb(nearest_id);
+    nearest = -1;
+}
--- a/timerfd_manager.h
+++ b/timerfd_manager.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <stdint.h>
+#include <time.h>
+#include <vector>
+#include <functional>
+
+// One registered timer; instances are stored in timerfd_manager_t::timers.
+struct timerfd_timer_t
+{
+    // Identifier handed to the callback when the timer fires
+    int id;
+    // Timer period in milliseconds (presumably the value passed to set_timer() - confirm in the .cpp)
+    uint64_t millis;
+    // `next` is the absolute expiration time; it is compared against CLOCK_MONOTONIC
+    // when the timerfd is armed. `start` is presumably the creation time - TODO confirm
+    timespec start, next;
+    // true: the timer is rescheduled with inc_timer() after firing; false: it is erased after firing
+    bool repeat;
+    // Invoked with `id` each time the timer fires
+    std::function<void(int)> callback;
+};
+
+// Multiplexes any number of one-shot and repeating timers over a single
+// timerfd descriptor: the descriptor is always armed for the timer that
+// expires first, and the next one is selected after each expiration.
+class timerfd_manager_t
+{
+    // Bit 0 is set after the timerfd has been armed for the nearest timer;
+    // the whole value is reset to 0 when the descriptor becomes readable
+    int wait_state = 0;
+    // The timerfd descriptor that is read in handle_readable() and programmed with timerfd_settime()
+    int timerfd;
+    // Index in `timers` of the timer with the earliest `next`, or -1 when none is selected
+    int nearest = -1;
+    // Presumably the next timer id to hand out from set_timer() - TODO confirm in the .cpp
+    int id = 1;
+    // All currently registered timers
+    std::vector<timerfd_timer_t> timers;
+
+    // Called for a repeating timer after it fires; presumably advances t.next by its period - TODO confirm
+    void inc_timer(timerfd_timer_t & t);
+    // Selects the timer with the earliest `next`; fires it right away if it is already due,
+    // otherwise arms the timerfd for it
+    void set_nearest();
+    // Reschedules or removes the timer at index `nearest` and invokes its callback
+    void trigger_nearest();
+    // Handles readability of the timerfd: fires the nearest timer and re-arms via set_nearest()
+    void handle_readable();
+public:
+    // Hook supplied by the owner; presumably registers a readiness handler for a
+    // file descriptor in the owner's event loop - verify against the constructor
+    std::function<void(int, std::function<void(int, int)>)> set_fd_handler;
+
+    timerfd_manager_t(std::function<void(int, std::function<void(int, int)>)> set_fd_handler);
+    ~timerfd_manager_t();
+    // Registers a timer; `callback` receives the timer id when it fires.
+    // The return value is presumably the id to pass to clear_timer() - TODO confirm
+    int set_timer(uint64_t millis, bool repeat, std::function<void(int)> callback);
+    // Cancels the timer identified by `timer_id`
+    void clear_timer(int timer_id);
+};
Author	SHA1	Message	Date
Vitaliy Filippov	735b97fe33	Trace I/O operations (SQEs, recvmsg/sendmsg, uring_submit)	2020-06-09 00:52:29 +03:00
Vitaliy Filippov	d56633843f	Replace io_uring sendmsg/recvmsg with synchronous sendmsg/recvmsg	2020-06-09 00:52:29 +03:00
Vitaliy Filippov	4dde8b8a42	Oops, fix fio_sec_osd block_order parsing	2020-06-09 00:52:00 +03:00
Vitaliy Filippov	f5ccb154af	Benchmark reads in stub_bench, too	2020-06-08 01:54:44 +03:00
Vitaliy Filippov	73c80e2c39	Move accept_connections() to osd_messenger_t, add a simple uring OSD stub	2020-06-08 01:32:16 +03:00
Vitaliy Filippov	437dc5b630	Implement a FIO engine for testing cluster I/O	2020-06-07 00:30:15 +03:00
Vitaliy Filippov	226f5a2945	Allow to override block_size in fio_sec_osd	2020-06-07 00:10:13 +03:00
Vitaliy Filippov	2187d06eac	Add a parameter to pass the initial config to client	2020-06-07 00:10:12 +03:00
Vitaliy Filippov	c573bc6bb3	(Probably almost) implement cluster client	2020-06-07 00:09:36 +03:00
Vitaliy Filippov	2f6cf605a1	Rename cluster_client to osd_messenger	2020-06-04 12:57:54 +03:00
Vitaliy Filippov	05ea97119f	Fix BS_OP_LIST to account for deleted objects: only list the newest stable entry of each object This allows list responses to be unaffected by journal flushes, which, in turn, fixes PG peering when a peer OSD is replaying journal and journal contains deletions	2020-06-02 23:52:48 +03:00
Vitaliy Filippov	571be0f380	Make deletions instantly stable "2-phase" (write->stabilize) process is pointless for deletions because it doesn't protect us from incomplete objects. This happens because it removes the version information from metadata after stabilization. Deletions require "3-phase" process with a potentially very long 3rd phase. So, deletions will be allowed to generate degraded and incomplete objects, and for it to not affect users' ability to delete something, the cluster will allow to delete whole inodes while storing a list of them in etcd. Proper TRIM will be impossible until the implementation of the aforementioned "3-phase" process, though. By the way, this change also fixes a possible write stall after rebalancing which was caused by the lack of "stabilize delete" operations.	2020-06-02 23:45:22 +03:00
Vitaliy Filippov	985c309d7f	Remove duplicate code between blockstore_{rollback,stable} and blockstore_init	2020-06-02 20:37:00 +03:00
Vitaliy Filippov	a56f8cd14e	Simplify handle_primary_subop() arguments	2020-06-02 18:44:23 +03:00
Vitaliy Filippov	46e111272f	Replace assert(this_it == cur_op) with if() for the case of PG repeering	2020-06-02 14:30:57 +03:00
Vitaliy Filippov	165c204555	Fix BS_OP_DELETE (the implementation was untested up to this point)	2020-06-02 14:26:01 +03:00
Vitaliy Filippov	af5cd45071	Oh crap, got SIGPIPE. Add MSG_NOSIGNAL	2020-06-02 11:41:08 +03:00
Vitaliy Filippov	c3fe9ad0d1	Fix rebalancing writes (add a forgotten state resume)	2020-06-02 01:26:14 +03:00
Vitaliy Filippov	0fcdeae18b	Do not die if a peer is already stopped on flush error	2020-06-01 23:07:08 +03:00
Vitaliy Filippov	e6a4b634f8	Fix possible write stall The stall occurred during fio Q=128 random write tests with low flusher_count (4). It was caused by flushers being unable to flush the beginning of the journal because it contained older writes to an object that also had writes in the very end of the journal, after dirty_start.	2020-06-01 16:18:23 +03:00
Vitaliy Filippov	c22e096943	Output journal offsets in debug trace in hex, add detailed "still waiting" messages	2020-06-01 16:18:19 +03:00
Vitaliy Filippov	45b1c2fbf1	Fix canceling of write operations on PG re-peer (which led to use-after-free, too...)	2020-06-01 16:18:14 +03:00
Vitaliy Filippov	3469bead67	Protect "delete this" with a stack refcounter (to fix use-after-free, too, but "delete this" was a time bomb anyway)	2020-06-01 16:18:09 +03:00
Vitaliy Filippov	3a5d488f19	Fix use-after-free in osd_flush.cpp	2020-06-01 01:56:24 +03:00
Vitaliy Filippov	73e4e30b1f	Auto-generate C++ header dependencies	2020-06-01 00:25:25 +03:00
Vitaliy Filippov	5feff1ffb9	Slightly cleanup socket send/receive code	2020-05-31 15:03:27 +03:00
Vitaliy Filippov	b466e215f0	Fix queued OP_SYNC execution	2020-05-27 13:55:25 +03:00
Vitaliy Filippov	36f995367f	Fix bind_address reporting	2020-05-27 10:58:40 +03:00
Vitaliy Filippov	0aca6e9ca8	Extract peer connect and read-write loop into a separate file (to be shared with the client library)	2020-05-26 22:11:30 +03:00
Vitaliy Filippov	fa98be6bc0	Allow to specify multiple etcd addresses	2020-05-25 16:30:05 +03:00
Vitaliy Filippov	256a7f2667	Free op->bs_op manually	2020-05-25 15:31:22 +03:00
Vitaliy Filippov	79bf57b6e2	Allow to override pg_stripe_size	2020-05-25 15:31:22 +03:00
Vitaliy Filippov	53f6aba3e6	Die when journal_sector_buffer_count is too small	2020-05-24 17:26:47 +03:00
Vitaliy Filippov	36595eb669	Print "Ran out of journal sector buffers" warning	2020-05-24 16:48:50 +03:00
Vitaliy Filippov	e09d0e0678	Several bug fixes - Do not block flock() requests - Fix stop_client(0) attempts leading to std::bad_function_call - Fix degraded writes crashing due to an unset stripes[i].missing (at least with a missing parity device) - Fix recovery B/W reporting	2020-05-24 01:51:35 +03:00
Vitaliy Filippov	d1602b50b3	Fix BS_OP_ROLLBACK removing an incorrect version Instead of only removing versions with oid == X and version > Y it was also removing the previous version in list (with the previous oid or with version == Y)	2020-05-24 01:51:28 +03:00
Vitaliy Filippov	7df384031a	Re-peer PGs after stopping the peer Fixes the bug where two peers killed at once have lead to PG state PG_DEGRADED\|PG_HAS_INCOMPLETE instead of PG_INCOMPLETE	2020-05-23 18:45:12 +03:00
Vitaliy Filippov	e614a98543	Add a sad FIXME :-)	2020-05-23 15:43:37 +03:00
Vitaliy Filippov	01dd3ef89e	Fix timerfd_manager triggering of multiple times at the same time	2020-05-23 15:43:37 +03:00
Vitaliy Filippov	cdccc23aff	Print [OSD $osd_num] in stats, print B/W only for ops that log bytes	2020-05-23 15:43:37 +03:00
Vitaliy Filippov	700428829a	Fix autosync_interval default not setting when autosync_interval is skipped in config	2020-05-23 15:43:37 +03:00
Vitaliy Filippov	6488d0044a	Ignore EPOLL_CTL_DEL ENOENT, fix detection of the rollback version	2020-05-23 15:43:37 +03:00
Vitaliy Filippov	393fe75900	Fix creepy (osd_op_t*)(long) casts	2020-05-23 15:43:37 +03:00
Vitaliy Filippov	f036eecf1c	Fix osd_rmw object recovery case (len==0)	2020-05-23 15:43:37 +03:00
Vitaliy Filippov	e56909fb45	Remove tv_send (unused) and timerfd_interval from blockstore	2020-05-22 15:57:08 +03:00
Vitaliy Filippov	fac75b0b57	Handle reweights in mon	2020-05-22 12:52:27 +03:00
Vitaliy Filippov	9f842ec9a5	Remove connect callback because it is always the same	2020-05-22 12:45:12 +03:00
Vitaliy Filippov	f6a01a4819	Extract "state-watching" etcd client into a separate file	2020-05-22 12:38:40 +03:00
Vitaliy Filippov	6202260018	Extract HTTP client functions from osd_t	2020-05-21 11:39:01 +03:00
Vitaliy Filippov	a61ede9951	Remove io_uring usage from osd_http and timerfd_manager For better future interoperability with external event loops such as QEMU's one	2020-05-21 01:25:38 +03:00
Vitaliy Filippov	f57731f8ca	Calculate total stats in the monitor	2020-05-15 01:37:17 +03:00
Vitaliy Filippov	19f25c7cd5	Handle integer overflow of the op_stat_count	2020-05-15 01:37:17 +03:00
Vitaliy Filippov	2c3e84cc41	Implement stop_all_pgs()	2020-05-15 01:37:17 +03:00
Vitaliy Filippov	7bda66b866	Do not crash when optimising PGs in an undersized cluster	2020-05-15 01:29:15 +03:00
Vitaliy Filippov	b467d0559f	Begin node.js storage monitor service	2020-05-15 01:29:15 +03:00
Vitaliy Filippov	c2c2eefea4	Duplicate host in osd/state and osd/stats, take PGs from /config/pgs.items	2020-05-15 01:29:15 +03:00
Vitaliy Filippov	5084ff7c6c	Measure & report recovery op count and bandwidth	2020-05-15 01:29:15 +03:00
Vitaliy Filippov	47b6f64106	Support level names	2020-05-11 15:57:21 +03:00
Vitaliy Filippov	f71d0c117b	Measure & report op bandwidth, include local blockstore ops in stats	2020-05-11 02:58:13 +03:00
Vitaliy Filippov	2b854948f9	Remove dead code	2020-05-09 16:15:02 +03:00
Vitaliy Filippov	e7f897ed65	Report hostname to etcd	2020-05-09 02:33:43 +03:00
Vitaliy Filippov	c26b6e1fc3	Support CRUSH-like multi-level placement trees	2020-05-09 00:55:24 +03:00
Vitaliy Filippov	aaa054e644	Fix optimize_change generating infeasible problems Mainly happened when removing PG combinations (removing OSDs) Also randomize OSD combinations when there's a lot of them Also remove Perl version	2020-05-08 16:42:40 +03:00
Vitaliy Filippov	706a44d4d4	Fix optimize_initial in both perl and js versions	2020-05-06 23:12:03 +03:00
Vitaliy Filippov	842f88f94f	Rewrite LPOptimizer.pm to nodejs	2020-05-06 02:08:15 +03:00
Vitaliy Filippov	e8149e5848	Implement OSD_OP_DELETE	2020-05-05 00:39:51 +03:00
Vitaliy Filippov	6355b968f4	Track osd_set history and all_peers separately	2020-05-04 15:28:07 +03:00
Vitaliy Filippov	00cf24fbd7	Split osd_primary.cpp	2020-05-03 11:04:20 +03:00
Vitaliy Filippov	1bc08174f9	Sync before listing objects so flushes do not fail thereafter	2020-05-01 12:56:49 +03:00
Vitaliy Filippov	cd87333091	Fix PG state comparison leading to unclean PGs not flushing (a & b == b) -> ((a & b) == b) !	2020-05-01 12:56:46 +03:00
Vitaliy Filippov	bd0fe6e4cc	Fix PGs not stopping during sync, fix state reporting autovivification of erased PGs	2020-05-01 01:33:14 +03:00
Vitaliy Filippov	ce78454215	Reply with -EROFS to write commands in readonly mode	2020-05-01 00:54:34 +03:00
Vitaliy Filippov	762bd42096	Fix use-after-free caused by "delete this" in handle_read	2020-04-30 02:15:53 +03:00
Vitaliy Filippov	7b57eeeeb3	Implement PG state locking and PG moving in response to etcd events	2020-04-29 22:23:38 +03:00
Vitaliy Filippov	ec4a52af48	Fix websocket (and timer!) bugs	2020-04-26 01:59:56 +03:00
Vitaliy Filippov	268b497c0b	Implement simple websocket client	2020-04-25 23:11:50 +03:00
Vitaliy Filippov	35481925b1	Implement very simple HTTP streaming to handle etcd watches	2020-04-25 01:35:52 +03:00
Vitaliy Filippov	895a80dfc4	Fix etcd 3.2 compatibility (no compare.target == LEASE, /kv/lease/revoke), fix small bugs	2020-04-25 01:35:52 +03:00
Vitaliy Filippov	caa01c6aaf	Acquire etcd leases, prevent starting two OSDs with the same number	2020-04-25 01:35:52 +03:00
Vitaliy Filippov	d398ddfd3b	Use snake_case for etcd requests	2020-04-25 01:35:52 +03:00
Vitaliy Filippov	0f2b8dbf6f	Use a single timerfd_manager for all timers	2020-04-25 01:35:49 +03:00
Vitaliy Filippov	4f42e9659e	Use etcd instead of Consul	2020-04-24 01:03:55 +03:00
Vitaliy Filippov	7cf71a8031	Fix timerfd_manager: remove timer, then call callback	2020-04-21 12:45:18 +03:00
Vitaliy Filippov	9d22559bcf	Start peering immediately when loading PGs	2020-04-21 02:27:13 +03:00
Vitaliy Filippov	8c03e3ebab	Lock Blockstore devices exclusively by default	2020-04-21 01:59:11 +03:00
Vitaliy Filippov	2a640ba2e8	Remove range port selection (leads to races)	2020-04-21 00:10:59 +03:00
Vitaliy Filippov	6a21ea207e	Check peer config (at least, number) after connecting	2020-04-21 00:08:54 +03:00
Vitaliy Filippov	642802b595	Auto-select port numbers	2020-04-20 17:45:27 +03:00
Vitaliy Filippov	ff38b464a5	Add consul & connect timeouts, report state before loading PGs, move init_primary to osd_cluster	2020-04-20 15:43:07 +03:00
Vitaliy Filippov	663153713b	Reconnect to peers after connecting drops	2020-04-19 01:01:26 +03:00
Vitaliy Filippov	dc57c5c362	Report PG states again, clear PG history on reaching active+clean	2020-04-19 00:48:23 +03:00
Vitaliy Filippov	f95299b769	Take PG history into account when starting PGs	2020-04-19 00:20:18 +03:00
Vitaliy Filippov	9126ffb0f9	Fix PG loading - now it works, at least once	2020-04-17 02:33:44 +03:00
Vitaliy Filippov	2a8e40835e	Fix reporting to Consul, report even if we are purely secondary	2020-04-17 01:59:06 +03:00
Vitaliy Filippov	309486d746	Implement loading PGs from Consul (in theory)	2020-04-16 23:22:32 +03:00
Vitaliy Filippov	582f485578	Extract http & getifaddr_list into a separate file	2020-04-15 15:47:06 +03:00
Vitaliy Filippov	089b4eb208	Retry consul connection attempts and then die	2020-04-15 15:33:18 +03:00
Vitaliy Filippov	d78ce509c6	Add simple timer manager	2020-04-15 13:41:44 +03:00
Vitaliy Filippov	f3a7ccff50	Use 4K blockstore block by default, use MEM_ALIGNMENT in osd code	2020-04-14 19:19:56 +03:00
Vitaliy Filippov	37b27c3025	Implement basic OSD status reporting to Consul	2020-04-14 14:52:06 +03:00
Vitaliy Filippov	edf6d6f897	Fix http_request	2020-04-12 02:08:00 +03:00
Vitaliy Filippov	d11e8dcb5e	Do not flush or recover in readonly mode	2020-04-11 12:06:18 +03:00
Vitaliy Filippov	dd02bc1c44	Add base64 implementation	2020-04-11 12:06:18 +03:00
Vitaliy Filippov	298b013eae	Add simple http request function	2020-04-11 12:05:58 +03:00
Vitaliy Filippov	0880a77c1a	2 FIXME for the future	2020-04-06 00:55:47 +03:00
Vitaliy Filippov	aa849ea07b	Add a test for missing chunk overwrite	2020-04-05 16:14:03 +03:00
Vitaliy Filippov	d59be0e8b4	Delete misplaced chunks after moving the object, reset object state in primary_write	2020-04-05 15:51:22 +03:00
Vitaliy Filippov	cf7de0f181	(Almost) Implement misplaced recovery, integrating it into calc_rmw()	2020-04-05 15:50:53 +03:00
Vitaliy Filippov	6212195440	Implement parallel recovery	2020-04-04 19:23:12 +03:00
Vitaliy Filippov	dfb6e15eaa	Implement graceful stopping of PGs	2020-04-03 13:03:42 +03:00
Vitaliy Filippov	afe2e76c87	Implement regular automatic syncs, split osd_t constructor into some methods	2020-04-02 22:16:46 +03:00
Vitaliy Filippov	0f43f6d3f6	Fix crashes, print some stats Notably: - fix the `delete op` inside lambda callback crash (it frees the lambda itself which results in use-after-free with g++) - fix stop_client() reenterability - fix a bug in the blockstore layer which resulted in always returning version=0 for zero-length reads - change error codes for blockstore_stabilize	2020-03-31 17:55:31 +03:00
Vitaliy Filippov	92c800bb64	Forget unstable writes when re-peering, rename parity_block_size -> pg_stripe_size, pg_parity_size -> pg_block_size	2020-03-31 02:09:25 +03:00
Vitaliy Filippov	8a8b619875	Handle secondary OSD connection errors [in theory]	2020-03-30 19:51:34 +03:00
Vitaliy Filippov	43fe1d88e7	Fix memory leaks with subops, fix recovery crashes	2020-03-28 19:09:20 +03:00
Vitaliy Filippov	1b30120918	Fix stripe reconstruction in recovery, only write modified object parts	2020-03-28 13:58:42 +03:00
Vitaliy Filippov	c0a22d825d	Fix degraded object recovery (it seems to work now)	2020-03-25 02:17:41 +03:00
Vitaliy Filippov	7acfc95f75	CONFIG_HAVE_GETTID	2020-03-25 01:20:20 +03:00
Vitaliy Filippov	250f22c0b6	Implement basic degraded object recovery (integrated into primary_write)	2020-03-25 01:17:50 +03:00
Vitaliy Filippov	dbd8418798	Reply using a single finish_op() method, allow to call OSD ops from inside the OSD	2020-03-24 00:18:52 +03:00
Vitaliy Filippov	036f4c5bf3	Fix unstable flushing, include extra OSDs with old object versions in osd_set	2020-03-23 20:28:47 +03:00
Vitaliy Filippov	fd8e1a8418	Slightly reorganize object state check code	2020-03-23 00:42:17 +03:00
Vitaliy Filippov	a08e0bfacd	Treat misplaced and degraded as separate state parts	2020-03-23 00:40:31 +03:00
Vitaliy Filippov	ddc3e927d3	Solve it in integers	2020-03-20 13:58:54 +03:00
Vitaliy Filippov	2aa605f2bb	Do not check	2020-03-20 13:38:35 +03:00
Vitaliy Filippov	18915b264a	Extract to .pm + fix all_combinations	2020-03-19 21:35:47 +03:00
Vitaliy Filippov	60f795e7eb	Add lp_solve based data distribution optimizer	2020-03-19 17:23:24 +03:00
Vitaliy Filippov	3a4279adbf	Hash-based PG distribution experiments	2020-03-17 18:52:39 +03:00
Vitaliy Filippov	1ec9794376	Extract flushing into a separate file	2020-03-15 18:39:31 +03:00
Vitaliy Filippov	d8164e9d84	Print PG states on every change	2020-03-14 22:19:45 +03:00
Vitaliy Filippov	21d0b06959	Implement flushing (stabilize/rollback) of unstable entries on start of the PG	2020-03-14 02:49:34 +03:00
Vitaliy Filippov	46f9bd2a69	Make blockstore list operation return consistent snapshots	2020-03-14 02:10:25 +03:00
Vitaliy Filippov	6982fe1255	Do not block reads by previous unfinished writes	2020-03-13 21:28:49 +03:00
Vitaliy Filippov	eba053febe	Do not start small writes before finishing the last big write to the same object	2020-03-12 02:15:01 +03:00
Vitaliy Filippov	899946ff96	Add osd_test function to unblock an OSD blocked by the lack of journal space	2020-03-10 17:19:24 +03:00
Vitaliy Filippov	3dd1b22d55	Fix segfault with concurrent OP_SYNCs	2020-03-10 17:00:23 +03:00
Vitaliy Filippov	31f9445030	Use immediate_commit to benefit the primary OSD	2020-03-10 02:20:16 +03:00
Vitaliy Filippov	3f522c66e6	Implement immediate commit mode	2020-03-10 01:59:15 +03:00
Vitaliy Filippov	c3737ae3ff	Add journal fsync to stabilize/rollback	2020-03-09 00:35:58 +03:00
Vitaliy Filippov	c863543bfe	Fix possible journal corruption caused by concurrent flushing and writing of the same journal sector	2020-03-08 01:21:19 +03:00
Vitaliy Filippov	1696446545	Rename min/max _used to _flushed	2020-03-07 16:41:58 +03:00
Vitaliy Filippov	41dddddbf2	Fix some logging	2020-03-07 16:41:53 +03:00
Vitaliy Filippov	2d4e24c9ce	Add journal dumper debugging tool	2020-03-06 02:29:43 +03:00
Vitaliy Filippov	844cacd357	Allow incorrectly forbidden BS_OP_LIST in readonly mode	2020-03-06 02:29:39 +03:00
Vitaliy Filippov	e19d9fde5f	Fix peering_pg, begin tests	2020-03-06 02:02:49 +03:00
Vitaliy Filippov	9cb07d844b	Make [un]register_consumer operate on pointers, rename get_loop_again() to has_work()	2020-03-04 21:00:20 +03:00
Vitaliy Filippov	1e21555343	Add FIXME with Oops	2020-03-04 20:34:45 +03:00
Vitaliy Filippov	94cdbcd085	Stop reading when less than <buffer> data is available	2020-03-04 18:03:16 +03:00
Vitaliy Filippov	8315407558	Incoming data pre-buffering	2020-03-04 17:34:45 +03:00
Vitaliy Filippov	b27ad550cf	Use btree_map instead of sparsepp	2020-03-04 17:12:27 +03:00
Vitaliy Filippov	8e63995306	Allow to specify data area size	2020-03-04 02:32:49 +03:00