Test: use submit_and_wait()

2020-03-03 17:48:47 +03:00
76 changed files with 2545 additions and 10814 deletions
--- a/Make-gen.pl
+++ b/Make-gen.pl
@@ -1,46 +0,0 @@
-#!/usr/bin/perl
-
-use strict;
-
-my $deps = {};
-for my $line (split /\n/, `grep '^#include "' *.cpp *.h`)
-{
-    if ($line =~ /^([^:]+):\#include "([^"]+)"/s)
-    {
-        $deps->{$1}->{$2} = 1;
-    }
-}
-
-my $added;
-do
-{
-    $added = 0;
-    for my $file (keys %$deps)
-    {
-        for my $dep (keys %{$deps->{$file}})
-        {
-            if ($deps->{$dep})
-            {
-                for my $subdep (keys %{$deps->{$dep}})
-                {
-                    if (!$deps->{$file}->{$subdep})
-                    {
-                        $added = 1;
-                        $deps->{$file}->{$subdep} = 1;
-                    }
-                }
-            }
-        }
-    }
-} while ($added);
-
-for my $file (sort keys %$deps)
-{
-    if ($file =~ /\.cpp$/)
-    {
-        my $obj = $file;
-        $obj =~ s/\.cpp$/.o/s;
-        print "$obj: $file ".join(" ", sort keys %{$deps->{$file}})."\n";
-        print "\tg++ \$(CXXFLAGS) -c -o \$\@ \$\<\n";
-    }
-}
--- a/Makefile
+++ b/Makefile
@@ -1,153 +1,66 @@
 BLOCKSTORE_OBJS := allocator.o blockstore.o blockstore_impl.o blockstore_init.o blockstore_open.o blockstore_journal.o blockstore_read.o \
-	blockstore_write.o blockstore_sync.o blockstore_stable.o blockstore_rollback.o blockstore_flush.o crc32c.o ringloop.o
+	blockstore_write.o blockstore_sync.o blockstore_stable.o blockstore_rollback.o blockstore_flush.o crc32c.o ringloop.o timerfd_interval.o
 # -fsanitize=address
 CXXFLAGS := -g -O3 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fPIC -fdiagnostics-color=always
-all: libfio_blockstore.so osd libfio_sec_osd.so libfio_cluster.so stub_osd stub_uring_osd stub_bench osd_test dump_journal
+all: $(BLOCKSTORE_OBJS) libfio_blockstore.so osd libfio_sec_osd.so test_blockstore stub_osd stub_bench osd_test
 clean:
 	rm -f *.o

-dump_journal: dump_journal.cpp crc32c.o blockstore_journal.h
-	g++ $(CXXFLAGS) -o $@ $< crc32c.o
-
-libblockstore.so: $(BLOCKSTORE_OBJS)
-	g++ $(CXXFLAGS) -o $@ -shared $(BLOCKSTORE_OBJS) -ltcmalloc_minimal -luring
-libfio_blockstore.so: ./libblockstore.so fio_engine.o json11.o
-	g++ $(CXXFLAGS) -shared -o $@ fio_engine.o json11.o ./libblockstore.so -ltcmalloc_minimal -luring
-
-OSD_OBJS := osd.o osd_secondary.o msgr_receive.o msgr_send.o osd_peering.o osd_flush.o osd_peering_pg.o \
-	osd_primary.o osd_primary_subops.o etcd_state_client.o messenger.o osd_cluster.o http_client.o pg_states.o \
-	osd_rmw.o json11.o base64.o timerfd_manager.o
-osd: ./libblockstore.so osd_main.cpp osd.h osd_ops.h $(OSD_OBJS)
-	g++ $(CXXFLAGS) -o $@ osd_main.cpp $(OSD_OBJS) ./libblockstore.so -ltcmalloc_minimal -luring
-
-stub_osd: stub_osd.o rw_blocking.o
-	g++ $(CXXFLAGS) -o $@ stub_osd.o rw_blocking.o -ltcmalloc_minimal
-
-STUB_URING_OSD_OBJS := stub_uring_osd.o epoll_manager.o messenger.o msgr_send.o msgr_receive.o ringloop.o timerfd_manager.o json11.o
-stub_uring_osd: $(STUB_URING_OSD_OBJS)
-	g++ $(CXXFLAGS) -o $@ -ltcmalloc_minimal $(STUB_URING_OSD_OBJS) -luring
-stub_bench: stub_bench.cpp osd_ops.h rw_blocking.o
-	g++ $(CXXFLAGS) -o $@ stub_bench.cpp rw_blocking.o -ltcmalloc_minimal
-osd_test: osd_test.cpp osd_ops.h rw_blocking.o
-	g++ $(CXXFLAGS) -o $@ osd_test.cpp rw_blocking.o -ltcmalloc_minimal
-osd_peering_pg_test: osd_peering_pg_test.cpp osd_peering_pg.o
-	g++ $(CXXFLAGS) -o $@ $< osd_peering_pg.o -ltcmalloc_minimal
-
-libfio_sec_osd.so: fio_sec_osd.o rw_blocking.o
-	g++ $(CXXFLAGS) -ltcmalloc_minimal -shared -o $@ fio_sec_osd.o rw_blocking.o
-
-FIO_CLUSTER_OBJS := fio_cluster.o cluster_client.o epoll_manager.o etcd_state_client.o \
-	messenger.o msgr_send.o msgr_receive.o ringloop.o json11.o http_client.o pg_states.o timerfd_manager.o base64.o
-libfio_cluster.so: $(FIO_CLUSTER_OBJS)
-	g++ $(CXXFLAGS) -ltcmalloc_minimal -shared -o $@ $(FIO_CLUSTER_OBJS) -luring
-
-test_blockstore: ./libblockstore.so test_blockstore.cpp timerfd_interval.o
-	g++ $(CXXFLAGS) -o test_blockstore test_blockstore.cpp timerfd_interval.o ./libblockstore.so -ltcmalloc_minimal -luring
-test: test.cpp osd_peering_pg.o
-	g++ $(CXXFLAGS) -o test test.cpp osd_peering_pg.o -luring -lm
-test_allocator: test_allocator.cpp allocator.o
-	g++ $(CXXFLAGS) -o test_allocator test_allocator.cpp allocator.o
-
-crc32c.o: crc32c.c crc32c.h
+crc32c.o: crc32c.c
 	g++ $(CXXFLAGS) -c -o $@ $<
 json11.o: json11/json11.cpp
 	g++ $(CXXFLAGS) -c -o json11.o json11/json11.cpp
-
-# Autogenerated
-
 allocator.o: allocator.cpp allocator.h
 	g++ $(CXXFLAGS) -c -o $@ $<
-base64.o: base64.cpp base64.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-blockstore.o: blockstore.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-blockstore_flush.o: blockstore_flush.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-blockstore_impl.o: blockstore_impl.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-blockstore_init.o: blockstore_init.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-blockstore_journal.o: blockstore_journal.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-blockstore_open.o: blockstore_open.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-blockstore_read.o: blockstore_read.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-blockstore_rollback.o: blockstore_rollback.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-blockstore_stable.o: blockstore_stable.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-blockstore_sync.o: blockstore_sync.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-blockstore_write.o: blockstore_write.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-cluster_client.o: cluster_client.cpp cluster_client.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-dump_journal.o: dump_journal.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h ringloop.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-epoll_manager.o: epoll_manager.cpp epoll_manager.h ringloop.h timerfd_manager.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-etcd_state_client.o: etcd_state_client.cpp base64.h etcd_state_client.h http_client.h json11/json11.hpp object_id.h osd_id.h osd_ops.h pg_states.h timerfd_manager.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-fio_cluster.o: fio_cluster.cpp cluster_client.h epoll_manager.h etcd_state_client.h fio/fio.h fio/optgroup.h http_client.h json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-fio_engine.o: fio_engine.cpp blockstore.h fio/fio.h fio/optgroup.h json11/json11.hpp object_id.h ringloop.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-fio_sec_osd.o: fio_sec_osd.cpp fio/fio.h fio/optgroup.h object_id.h osd_id.h osd_ops.h rw_blocking.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-http_client.o: http_client.cpp http_client.h json11/json11.hpp timerfd_manager.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-messenger.o: messenger.cpp json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-msgr_receive.o: msgr_receive.cpp json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-msgr_send.o: msgr_send.cpp json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-osd.o: osd.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-osd_cluster.o: osd_cluster.cpp base64.h blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-osd_flush.o: osd_flush.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-osd_main.o: osd_main.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-osd_peering.o: osd_peering.cpp base64.h blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-osd_peering_pg.o: osd_peering_pg.cpp cpp-btree/btree_map.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-osd_peering_pg_test.o: osd_peering_pg_test.cpp cpp-btree/btree_map.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-osd_primary.o: osd_primary.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h osd_primary.h osd_rmw.h pg_states.h ringloop.h timerfd_manager.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-osd_primary_subops.o: osd_primary_subops.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h osd_primary.h osd_rmw.h pg_states.h ringloop.h timerfd_manager.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-osd_rmw.o: osd_rmw.cpp object_id.h osd_id.h osd_rmw.h xor.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-osd_rmw_test.o: osd_rmw_test.cpp object_id.h osd_id.h osd_rmw.cpp osd_rmw.h test_pattern.h xor.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-osd_secondary.o: osd_secondary.cpp blockstore.h cpp-btree/btree_map.h etcd_state_client.h http_client.h json11/json11.hpp messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-osd_test.o: osd_test.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h test_pattern.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-pg_states.o: pg_states.cpp pg_states.h
-	g++ $(CXXFLAGS) -c -o $@ $<
 ringloop.o: ringloop.cpp ringloop.h
 	g++ $(CXXFLAGS) -c -o $@ $<
+timerfd_interval.o: timerfd_interval.cpp timerfd_interval.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+
+%.o: %.cpp allocator.h blockstore_flush.h blockstore.h blockstore_impl.h blockstore_init.h blockstore_journal.h crc32c.h ringloop.h timerfd_interval.h object_id.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+
+libblockstore.so: $(BLOCKSTORE_OBJS)
+	g++ $(CXXFLAGS) -o libblockstore.so -shared $(BLOCKSTORE_OBJS) -ltcmalloc_minimal -luring
+libfio_blockstore.so: ./libblockstore.so fio_engine.cpp json11.o
+	g++ $(CXXFLAGS) -shared -o libfio_blockstore.so fio_engine.cpp json11.o ./libblockstore.so -ltcmalloc_minimal -luring
+
+OSD_OBJS := osd.o osd_secondary.o osd_receive.o osd_send.o osd_peering.o osd_peering_pg.o osd_primary.o osd_rmw.o json11.o timerfd_interval.o
+osd_secondary.o: osd_secondary.cpp osd.h osd_ops.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_receive.o: osd_receive.cpp osd.h osd_ops.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_send.o: osd_send.cpp osd.h osd_ops.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_peering.o: osd_peering.cpp osd.h osd_ops.h osd_peering_pg.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_peering_pg.o: osd_peering_pg.cpp object_id.h osd_peering_pg.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_rmw.o: osd_rmw.cpp osd_rmw.h xor.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_rmw_test: osd_rmw_test.cpp osd_rmw.cpp osd_rmw.h xor.h
+	g++ $(CXXFLAGS) -o $@ $<
+osd_primary.o: osd_primary.cpp osd.h osd_ops.h osd_peering_pg.h xor.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd.o: osd.cpp osd.h osd_ops.h osd_peering_pg.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd: ./libblockstore.so osd_main.cpp osd.h osd_ops.h $(OSD_OBJS)
+	g++ $(CXXFLAGS) -o osd osd_main.cpp $(OSD_OBJS) ./libblockstore.so -ltcmalloc_minimal -luring
+stub_osd: stub_osd.cpp osd_ops.h rw_blocking.o
+	g++ $(CXXFLAGS) -o stub_osd stub_osd.cpp rw_blocking.o -ltcmalloc_minimal
+stub_bench: stub_bench.cpp osd_ops.h rw_blocking.o
+	g++ $(CXXFLAGS) -o stub_bench stub_bench.cpp rw_blocking.o -ltcmalloc_minimal
 rw_blocking.o: rw_blocking.cpp rw_blocking.h
 	g++ $(CXXFLAGS) -c -o $@ $<
-stub_bench.o: stub_bench.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-stub_osd.o: stub_osd.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-stub_uring_osd.o: stub_uring_osd.cpp epoll_manager.h json11/json11.hpp messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-test.o: test.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-test_allocator.o: test_allocator.cpp allocator.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-test_blockstore.o: test_blockstore.cpp blockstore.h object_id.h ringloop.h timerfd_interval.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-timerfd_interval.o: timerfd_interval.cpp ringloop.h timerfd_interval.h
-	g++ $(CXXFLAGS) -c -o $@ $<
-timerfd_manager.o: timerfd_manager.cpp timerfd_manager.h
-	g++ $(CXXFLAGS) -c -o $@ $<
+osd_test: osd_test.cpp osd_ops.h rw_blocking.o
+	g++ $(CXXFLAGS) -o osd_test osd_test.cpp rw_blocking.o -ltcmalloc_minimal
+
+libfio_sec_osd.so: fio_sec_osd.cpp osd_ops.h rw_blocking.o
+	g++ $(CXXFLAGS) -ltcmalloc_minimal -shared -o libfio_sec_osd.so fio_sec_osd.cpp rw_blocking.o -luring
+
+test_blockstore: ./libblockstore.so test_blockstore.cpp
+	g++ $(CXXFLAGS) -o test_blockstore test_blockstore.cpp ./libblockstore.so -ltcmalloc_minimal -luring
+test: test.cpp osd_peering_pg.o
+	g++ $(CXXFLAGS) -o test test.cpp osd_peering_pg.o -luring
+test_allocator: test_allocator.cpp allocator.o
+	g++ $(CXXFLAGS) -o test_allocator test_allocator.cpp allocator.o
--- a/base64.cpp
+++ b/base64.cpp
@@ -1,52 +0,0 @@
-#include "base64.h"
-
-std::string base64_encode(const std::string &in)
-{
-    std::string out;
-    unsigned val = 0;
-    int valb = -6;
-    for (unsigned char c: in)
-    {
-        val = (val << 8) + c;
-        valb += 8;
-        while (valb >= 0)
-        {
-            out.push_back("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(val>>valb) & 0x3F]);
-            valb -= 6;
-        }
-    }
-    if (valb > -6)
-        out.push_back("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[((val<<8)>>(valb+8)) & 0x3F]);
-    while (out.size() % 4)
-        out.push_back('=');
-    return out;
-}
-
-static char T[256] = { 0 };
-
-std::string base64_decode(const std::string &in)
-{
-    std::string out;
-    if (T[0] == 0)
-    {
-        for (int i = 0; i < 256; i++)
-            T[i] = -1;
-        for (int i = 0; i < 64; i++)
-            T[(unsigned char)("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[i])] = i;
-    }
-    unsigned val = 0;
-    int valb = -8;
-    for (unsigned char c: in)
-    {
-        if (T[c] == -1)
-            break;
-        val = (val<<6) + T[c];
-        valb += 6;
-        if (valb >= 0)
-        {
-            out.push_back(char((val >> valb) & 0xFF));
-            valb -= 8;
-        }
-    }
-    return out;
-}
--- a/base64.h
+++ b/base64.h
@@ -1,5 +0,0 @@
-#pragma once
-#include <string>
-
-std::string base64_encode(const std::string &in);
-std::string base64_decode(const std::string &in);
--- a/blockstore.cpp
+++ b/blockstore.cpp
@@ -55,11 +55,6 @@ uint64_t blockstore_t::get_block_count()
    return impl->get_block_count();
 }

-uint64_t blockstore_t::get_free_block_count()
-{
-    return impl->get_free_block_count();
-}
-
 uint32_t blockstore_t::get_disk_alignment()
 {
    return impl->get_disk_alignment();
--- a/blockstore.h
+++ b/blockstore.h
@@ -15,9 +15,7 @@

 // Memory alignment for direct I/O (usually 512 bytes)
 // All other alignments must be a multiple of this one
-#ifndef MEM_ALIGNMENT
 #define MEM_ALIGNMENT 512
-#endif

 // Default block size is 128 KB, current allowed range is 4K - 128M
 #define DEFAULT_ORDER 17
@@ -52,7 +50,6 @@ Input:
  - version == 0: read the last stable version,
  - version == UINT64_MAX: read the last version,
  - otherwise: read the newest version that is <= the specified version
-  - reads aren't guaranteed to return data from previous unfinished writes
  For writes:
  - if version == 0, a new version is assigned automatically
  - if version != 0, it is assigned for the new write if possible, otherwise -EINVAL is returned
@@ -95,7 +92,7 @@ Input:
 - buf = pre-allocated obj_ver_id array <len> units long

 Output:
- retval = 0 or negative error number (-EINVAL, -ENOENT if no such version or -EBUSY if not synced)
+- retval = 0 or negative error number (-EINVAL)

 ## BS_OP_SYNC_STAB_ALL

@@ -178,7 +175,6 @@ public:
    // FIXME rename to object_size
    uint32_t get_block_size();
    uint64_t get_block_count();
-    uint64_t get_free_block_count();

    uint32_t get_disk_alignment();
 };
--- a/blockstore_flush.cpp
+++ b/blockstore_flush.cpp
@@ -4,11 +4,9 @@ journal_flusher_t::journal_flusher_t(int flusher_count, blockstore_impl_t *bs)
 {
    this->bs = bs;
    this->flusher_count = flusher_count;
-    dequeuing = false;
    active_flushers = 0;
-    syncing_flushers = 0;
-    flusher_start_threshold = bs->journal_block_size / sizeof(journal_entry_stable);
-    journal_trim_interval = flusher_start_threshold;
+    sync_threshold = flusher_count == 1 ? 1 : flusher_count/2;
+    journal_trim_interval = sync_threshold;
    journal_trim_counter = 0;
    journal_superblock = bs->journal.inmemory ? bs->journal.buffer : memalign(MEM_ALIGNMENT, bs->journal_block_size);
    co = new journal_flusher_co[flusher_count];
@@ -57,13 +55,17 @@ journal_flusher_t::~journal_flusher_t()

 bool journal_flusher_t::is_active()
 {
-    return active_flushers > 0 || dequeuing;
+    return active_flushers > 0 || start_forced && flush_queue.size() > 0 || flush_queue.size() >= sync_threshold;
 }

 void journal_flusher_t::loop()
 {
-    for (int i = 0; (active_flushers > 0 || dequeuing) && i < flusher_count; i++)
+    for (int i = 0; i < flusher_count; i++)
    {
+        if (!active_flushers && (start_forced ? !flush_queue.size() : (flush_queue.size() < sync_threshold)))
+        {
+            return;
+        }
        co[i].loop();
    }
 }
@@ -81,11 +83,6 @@ void journal_flusher_t::enqueue_flush(obj_ver_id ov)
        flush_versions[ov.oid] = ov.version;
        flush_queue.push_back(ov.oid);
    }
-    if (!dequeuing && flush_queue.size() >= flusher_start_threshold)
-    {
-        dequeuing = true;
-        bs->ringloop->wakeup();
-    }
 }

 void journal_flusher_t::unshift_flush(obj_ver_id ov)
@@ -101,25 +98,14 @@ void journal_flusher_t::unshift_flush(obj_ver_id ov)
        flush_versions[ov.oid] = ov.version;
        flush_queue.push_front(ov.oid);
    }
-    if (!dequeuing && flush_queue.size() >= flusher_start_threshold)
-    {
-        dequeuing = true;
-        bs->ringloop->wakeup();
-    }
 }

-void journal_flusher_t::request_trim()
+void journal_flusher_t::force_start()
 {
-    dequeuing = true;
-    trim_wanted++;
+    start_forced = true;
    bs->ringloop->wakeup();
 }

-void journal_flusher_t::release_trim()
-{
-    trim_wanted--;
-}
-
 #define await_sqe(label) \
    resume_##label:\
        sqe = bs->get_sqe();\
@@ -130,7 +116,6 @@ void journal_flusher_t::release_trim()
        }\
        data = ((ring_data_t*)sqe->user_data);

-// FIXME: Implement batch flushing
 bool journal_flusher_co::loop()
 {
    // This is much better than implementing the whole function as an FSM
@@ -170,9 +155,10 @@ bool journal_flusher_co::loop()
    else if (wait_state == 18)
        goto resume_18;
 resume_0:
-    if (!flusher->flush_queue.size() || !flusher->dequeuing)
+    if (!flusher->flush_queue.size() ||
+        !flusher->start_forced && !flusher->active_flushers && flusher->flush_queue.size() < flusher->sync_threshold)
    {
-        flusher->dequeuing = false;
+        flusher->start_forced = false;
        wait_state = 0;
        return true;
    }
@@ -183,76 +169,6 @@ resume_0:
    dirty_end = bs->dirty_db.find(cur);
    if (dirty_end != bs->dirty_db.end())
    {
-        if (dirty_end->second.journal_sector >= bs->journal.dirty_start &&
-            (bs->journal.dirty_start >= bs->journal.used_start ||
-            dirty_end->second.journal_sector < bs->journal.used_start))
-        {
-            flusher->enqueue_flush(cur);
-            // We can't flush journal sectors that are still written to
-            // However, as we group flushes by oid, current oid may have older writes to flush!
-            // And it may even block writes if we don't flush the older version
-            // (if it's in the beginning of the journal)...
-            // So first try to find an older version of the same object to flush.
-            bool found = false;
-            while (dirty_end != bs->dirty_db.begin())
-            {
-                dirty_end--;
-                if (dirty_end->first.oid != cur.oid)
-                {
-                    break;
-                }
-                if (!(dirty_end->second.journal_sector >= bs->journal.dirty_start &&
-                    (bs->journal.dirty_start >= bs->journal.used_start ||
-                    dirty_end->second.journal_sector < bs->journal.used_start)))
-                {
-                    found = true;
-                    cur.version = dirty_end->first.version;
-                    break;
-                }
-            }
-            if (!found)
-            {
-                // Try other objects
-                int search_left = flusher->flush_queue.size() - 1;
-#ifdef BLOCKSTORE_DEBUG
-                printf("Flusher overran writers (dirty_start=%08lx) - searching for older flushes (%d left)\n", bs->journal.dirty_start, search_left);
-#endif
-                while (search_left > 0)
-                {
-                    cur.oid = flusher->flush_queue.front();
-                    cur.version = flusher->flush_versions[cur.oid];
-                    flusher->flush_queue.pop_front();
-                    flusher->flush_versions.erase(cur.oid);
-                    dirty_end = bs->dirty_db.find(cur);
-                    if (dirty_end != bs->dirty_db.end())
-                    {
-                        if (dirty_end->second.journal_sector >= bs->journal.dirty_start &&
-                            (bs->journal.dirty_start >= bs->journal.used_start ||
-                            dirty_end->second.journal_sector < bs->journal.used_start))
-                        {
-#ifdef BLOCKSTORE_DEBUG
-                            printf("Write %lu:%lu v%lu is too new: offset=%08lx\n", cur.oid.inode, cur.oid.stripe, cur.version, dirty_end->second.journal_sector);
-#endif
-                            flusher->enqueue_flush(cur);
-                        }
-                        else
-                        {
-                            break;
-                        }
-                    }
-                    search_left--;
-                }
-                if (search_left <= 0)
-                {
-#ifdef BLOCKSTORE_DEBUG
-                    printf("No older flushes, stopping\n");
-#endif
-                    flusher->dequeuing = false;
-                    wait_state = 0;
-                    return true;
-                }
-            }
-        }
        repeat_it = flusher->sync_to_repeat.find(cur.oid);
        if (repeat_it != flusher->sync_to_repeat.end())
        {
@@ -275,26 +191,32 @@ resume_0:
 #endif
        flusher->active_flushers++;
 resume_1:
-        // Find it in clean_db
-        clean_it = bs->clean_db.find(cur.oid);
-        old_clean_loc = (clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX);
        // Scan dirty versions of the object
        if (!scan_dirty(1))
        {
            wait_state += 1;
            return false;
        }
-        // Writes and deletes shouldn't happen at the same time
-        assert(!(copy_count > 0 || has_writes) || !has_delete);
-        if (copy_count == 0 && !has_writes && !has_delete || has_delete && old_clean_loc == UINT64_MAX)
+        if (copy_count == 0 && clean_loc == UINT64_MAX && !has_delete && !has_empty)
        {
            // Nothing to flush
-            bs->erase_dirty(dirty_start, std::next(dirty_end), clean_loc);
-            goto trim_journal;
+            flusher->active_flushers--;
+            repeat_it = flusher->sync_to_repeat.find(cur.oid);
+            if (repeat_it != flusher->sync_to_repeat.end() && repeat_it->second > cur.version)
+            {
+                // Requeue version
+                flusher->unshift_flush({ .oid = cur.oid, .version = repeat_it->second });
+            }
+            flusher->sync_to_repeat.erase(repeat_it);
+            wait_state = 0;
+            goto resume_0;
        }
+        // Find it in clean_db
+        clean_it = bs->clean_db.find(cur.oid);
+        old_clean_loc = (clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX);
        if (clean_loc == UINT64_MAX)
        {
-            if (old_clean_loc == UINT64_MAX)
+            if (copy_count > 0 && has_delete || old_clean_loc == UINT64_MAX)
            {
                // Object not allocated. This is a bug.
                char err[1024];
@@ -409,7 +331,6 @@ resume_1:
        else
        {
            clean_disk_entry *new_entry = (clean_disk_entry*)(meta_new.buf + meta_new.pos*bs->clean_entry_size);
-            assert(new_entry->oid.inode == 0 || new_entry->oid == cur.oid);
            new_entry->oid = cur.oid;
            new_entry->version = cur.version;
            if (!bs->inmemory_meta)
@@ -465,9 +386,8 @@ resume_1:
        }
        // Update clean_db and dirty_db, free old data locations
        update_clean_db();
-    trim_journal:
        // Clear unused part of the journal every <journal_trim_interval> flushes
-        if (!((++flusher->journal_trim_counter) % flusher->journal_trim_interval) || flusher->trim_wanted > 0)
+        if (!((++flusher->journal_trim_counter) % flusher->journal_trim_interval))
        {
            flusher->journal_trim_counter = 0;
            if (bs->journal.trim())
@@ -497,7 +417,7 @@ resume_1:
        }
        // All done
 #ifdef BLOCKSTORE_DEBUG
-        printf("Flushed %lu:%lu v%lu (%ld left)\n", cur.oid.inode, cur.oid.stripe, cur.version, flusher->flush_queue.size());
+        printf("Flushed %lu:%lu v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version);
 #endif
        flusher->active_flushers--;
        repeat_it = flusher->sync_to_repeat.find(cur.oid);
@@ -525,7 +445,7 @@ bool journal_flusher_co::scan_dirty(int wait_base)
    copy_count = 0;
    clean_loc = UINT64_MAX;
    has_delete = false;
-    has_writes = false;
+    has_empty = false;
    skip_copy = false;
    clean_init_bitmap = false;
    while (1)
@@ -533,8 +453,11 @@ bool journal_flusher_co::scan_dirty(int wait_base)
        if (dirty_it->second.state == ST_J_STABLE && !skip_copy)
        {
            // First we submit all reads
-            has_writes = true;
-            if (dirty_it->second.len != 0)
+            if (dirty_it->second.len == 0)
+            {
+                has_empty = true;
+            }
+            else
            {
                offset = dirty_it->second.offset;
                end_offset = dirty_it->second.offset + dirty_it->second.len;
@@ -576,7 +499,6 @@ bool journal_flusher_co::scan_dirty(int wait_base)
        else if (dirty_it->second.state == ST_D_STABLE && !skip_copy)
        {
            // There is an unflushed big write. Copy small writes in its position
-            has_writes = true;
            clean_loc = dirty_it->second.location;
            clean_init_bitmap = true;
            clean_bitmap_offset = dirty_it->second.offset;
@@ -710,8 +632,7 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
        });
    sync_found:
        cur_sync->ready_count++;
-        flusher->syncing_flushers++;
-        if (flusher->syncing_flushers >= flusher->flusher_count || !flusher->flush_queue.size())
+        if (cur_sync->ready_count >= flusher->sync_threshold || !flusher->flush_queue.size())
        {
            // Sync batch is ready. Do it.
            await_sqe(0);
@@ -737,7 +658,6 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
            wait_state = 2;
            return false;
        }
-        flusher->syncing_flushers--;
        cur_sync->ready_count--;
        if (cur_sync->ready_count == 0)
        {
--- a/blockstore_flush.h
+++ b/blockstore_flush.h
@@ -45,8 +45,8 @@ class journal_flusher_co
    std::map<object_id, uint64_t>::iterator repeat_it;
    std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;

-    bool skip_copy, has_delete, has_writes;
-    blockstore_clean_db_t::iterator clean_it;
+    bool skip_copy, has_delete, has_empty;
+    spp::sparse_hash_map<object_id, clean_entry>::iterator clean_it;
    std::vector<copy_buffer_t> v;
    std::vector<copy_buffer_t>::iterator it;
    int copy_count;
@@ -73,10 +73,9 @@ public:
 // Journal flusher itself
 class journal_flusher_t
 {
-    int trim_wanted = 0;
-    bool dequeuing;
+    bool start_forced = false;
    int flusher_count;
-    int flusher_start_threshold;
+    int sync_threshold;
    journal_flusher_co *co;
    blockstore_impl_t *bs;
    friend class journal_flusher_co;
@@ -85,7 +84,6 @@ class journal_flusher_t
    void* journal_superblock;

    int active_flushers;
-    int syncing_flushers;
    std::list<flusher_sync_t> syncs;
    std::map<object_id, uint64_t> sync_to_repeat;

@@ -97,8 +95,7 @@ public:
    ~journal_flusher_t();
    void loop();
    bool is_active();
-    void request_trim();
-    void release_trim();
+    void force_start();
    void enqueue_flush(obj_ver_id oid);
    void unshift_flush(obj_ver_id oid);
 };
--- a/blockstore_impl.cpp
+++ b/blockstore_impl.cpp
@@ -5,7 +5,7 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
    assert(sizeof(blockstore_op_private_t) <= BS_OP_PRIVATE_DATA_SIZE);
    this->ringloop = ringloop;
    ring_consumer.loop = [this]() { loop(); };
-    ringloop->register_consumer(&ring_consumer);
+    ringloop->register_consumer(ring_consumer);
    initialized = 0;
    zero_object = (uint8_t*)memalign(MEM_ALIGNMENT, block_size);
    data_fd = meta_fd = journal.fd = -1;
@@ -36,7 +36,7 @@ blockstore_impl_t::~blockstore_impl_t()
    delete data_alloc;
    delete flusher;
    free(zero_object);
-    ringloop->unregister_consumer(&ring_consumer);
+    ringloop->unregister_consumer(ring_consumer);
    if (data_fd >= 0)
        close(data_fd);
    if (meta_fd >= 0 && meta_fd != data_fd)
@@ -98,19 +98,10 @@ void blockstore_impl_t::loop()
    {
        // try to submit ops
        unsigned initial_ring_space = ringloop->space_left();
-        // FIXME: rework this "sync polling"
        auto cur_sync = in_progress_syncs.begin();
        while (cur_sync != in_progress_syncs.end())
        {
-            if (continue_sync(*cur_sync) != 2)
-            {
-                // List is unmodified
-                cur_sync++;
-            }
-            else
-            {
-                cur_sync = in_progress_syncs.begin();
-            }
+            continue_sync(*cur_sync++);
        }
        auto cur = submit_queue.begin();
        int has_writes = 0;
@@ -124,6 +115,12 @@ void blockstore_impl_t::loop()
            if (PRIV(op)->wait_for)
            {
                check_wait(op);
+#ifdef BLOCKSTORE_DEBUG
+                if (PRIV(op)->wait_for)
+                {
+                    printf("still waiting for %d\n", PRIV(op)->wait_for);
+                }
+#endif
                if (PRIV(op)->wait_for == WAIT_SQE)
                {
                    break;
@@ -139,12 +136,12 @@ void blockstore_impl_t::loop()
            }
            unsigned ring_space = ringloop->space_left();
            unsigned prev_sqe_pos = ringloop->save();
-            bool dequeue_op = false;
+            int dequeue_op = 0;
            if (op->opcode == BS_OP_READ)
            {
                dequeue_op = dequeue_read(op);
            }
-            else if (op->opcode == BS_OP_WRITE)
+            else if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_DELETE)
            {
                if (has_writes == 2)
                {
@@ -154,16 +151,6 @@ void blockstore_impl_t::loop()
                dequeue_op = dequeue_write(op);
                has_writes = dequeue_op ? 1 : 2;
            }
-            else if (op->opcode == BS_OP_DELETE)
-            {
-                if (has_writes == 2)
-                {
-                    // Some writes could not be submitted
-                    break;
-                }
-                dequeue_op = dequeue_del(op);
-                has_writes = dequeue_op ? 1 : 2;
-            }
            else if (op->opcode == BS_OP_SYNC)
            {
                // wait for all small writes to be submitted
@@ -179,33 +166,16 @@ void blockstore_impl_t::loop()
            }
            else if (op->opcode == BS_OP_STABLE)
            {
-                if (has_writes == 2)
-                {
-                    // Don't submit additional flushes before completing previous LISTs
-                    break;
-                }
                dequeue_op = dequeue_stable(op);
            }
            else if (op->opcode == BS_OP_ROLLBACK)
            {
-                if (has_writes == 2)
-                {
-                    // Don't submit additional flushes before completing previous LISTs
-                    break;
-                }
                dequeue_op = dequeue_rollback(op);
            }
            else if (op->opcode == BS_OP_LIST)
            {
-                // Block LIST operation by previous modifications,
-                // so it always returns a consistent state snapshot
-                if (has_writes == 2 || inflight_writes > 0)
-                    has_writes = 2;
-                else
-                {
-                    process_list(op);
-                    dequeue_op = true;
-                }
+                process_list(op);
+                dequeue_op = true;
            }
            if (dequeue_op)
            {
@@ -226,16 +196,11 @@ void blockstore_impl_t::loop()
        {
            flusher->loop();
        }
-        int ret = ringloop->submit();
-        if (ret < 0)
-        {
-            throw std::runtime_error(std::string("io_uring_submit: ") + strerror(-ret));
-        }
        if ((initial_ring_space - ringloop->space_left()) > 0)
        {
            live = true;
        }
-        queue_stall = !live && !ringloop->has_work();
+        queue_stall = !live && !ringloop->get_loop_again();
        live = false;
    }
 }
@@ -275,9 +240,19 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
        if (ringloop->space_left() < PRIV(op)->wait_detail)
        {
            // stop submission if there's still no free space
-#ifdef BLOCKSTORE_DEBUG
-            printf("Still waiting for %lu SQE(s)\n", PRIV(op)->wait_detail);
-#endif
+            return;
+        }
+        PRIV(op)->wait_for = 0;
+    }
+    else if (PRIV(op)->wait_for == WAIT_IN_FLIGHT)
+    {
+        auto dirty_it = dirty_db.find((obj_ver_id){
+            .oid = op->oid,
+            .version = PRIV(op)->wait_detail,
+        });
+        if (dirty_it != dirty_db.end() && IS_IN_FLIGHT(dirty_it->second.state))
+        {
+            // do not submit
            return;
        }
        PRIV(op)->wait_for = 0;
@@ -287,12 +262,8 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
        if (journal.used_start == PRIV(op)->wait_detail)
        {
            // do not submit
-#ifdef BLOCKSTORE_DEBUG
-            printf("Still waiting to flush journal offset %08lx\n", PRIV(op)->wait_detail);
-#endif
            return;
        }
-        flusher->release_trim();
        PRIV(op)->wait_for = 0;
    }
    else if (PRIV(op)->wait_for == WAIT_JOURNAL_BUFFER)
@@ -302,9 +273,6 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
            journal.sector_info[next].dirty)
        {
            // do not submit
-#ifdef BLOCKSTORE_DEBUG
-            printf("Still waiting for a journal buffer\n");
-#endif
            return;
        }
        PRIV(op)->wait_for = 0;
@@ -313,9 +281,6 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
    {
        if (!data_alloc->get_free_count() && !flusher->is_active())
        {
-#ifdef BLOCKSTORE_DEBUG
-            printf("Still waiting for free space on the data device\n");
-#endif
            return;
        }
        PRIV(op)->wait_for = 0;
@@ -334,12 +299,12 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
            op->len > block_size-op->offset ||
            (op->len % disk_alignment)
        )) ||
-        readonly && op->opcode != BS_OP_READ && op->opcode != BS_OP_LIST ||
+        readonly && op->opcode != BS_OP_READ ||
        first && op->opcode == BS_OP_WRITE)
    {
        // Basic verification not passed
        op->retval = -EINVAL;
-        std::function<void (blockstore_op_t*)>(op->callback)(op);
+        op->callback(op);
        return;
    }
    if (op->opcode == BS_OP_SYNC_STAB_ALL)
@@ -380,21 +345,21 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
            }
        };
    }
-    if ((op->opcode == BS_OP_WRITE || op->opcode == BS_OP_DELETE) && !enqueue_write(op))
+    if (op->opcode == BS_OP_WRITE && !enqueue_write(op))
    {
-        std::function<void (blockstore_op_t*)>(op->callback)(op);
+        op->callback(op);
        return;
    }
-    if (op->opcode == BS_OP_SYNC && immediate_commit == IMMEDIATE_ALL)
+    if (0 && op->opcode == BS_OP_SYNC && immediate_commit)
    {
        op->retval = 0;
-        std::function<void (blockstore_op_t*)>(op->callback)(op);
+        op->callback(op);
        return;
    }
    // Call constructor without allocating memory. We'll call destructor before returning op back
    new ((void*)op->private_data) blockstore_op_private_t;
    PRIV(op)->wait_for = 0;
-    PRIV(op)->op_state = 0;
+    PRIV(op)->sync_state = 0;
    PRIV(op)->pending_ops = 0;
    if (!first)
    {
@@ -407,165 +372,82 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
    ringloop->wakeup();
 }

-static bool replace_stable(object_id oid, uint64_t version, int search_start, int search_end, obj_ver_id* list)
-{
-    while (search_start < search_end)
-    {
-        int pos = search_start+(search_end-search_start)/2;
-        if (oid < list[pos].oid)
-        {
-            search_end = pos;
-        }
-        else if (list[pos].oid < oid)
-        {
-            search_start = pos+1;
-        }
-        else
-        {
-            list[pos].version = version;
-            return true;
-        }
-    }
-    return false;
-}
-
 void blockstore_impl_t::process_list(blockstore_op_t *op)
 {
-    // Check PG
+    // Count objects
    uint32_t list_pg = op->offset;
    uint32_t pg_count = op->len;
-    uint64_t pg_stripe_size = op->oid.stripe;
-    if (pg_count != 0 && (pg_stripe_size < MIN_BLOCK_SIZE || list_pg >= pg_count))
+    uint64_t parity_block_size = op->oid.stripe;
+    if (pg_count != 0 && (parity_block_size < MIN_BLOCK_SIZE || list_pg >= pg_count))
    {
        op->retval = -EINVAL;
        FINISH_OP(op);
        return;
    }
-    // Copy clean_db entries (sorted)
-    int stable_count = 0, stable_alloc = clean_db.size() / (pg_count ? pg_count : 1);
-    obj_ver_id *stable = (obj_ver_id*)malloc(sizeof(obj_ver_id) * stable_alloc);
-    if (!stable)
+    uint64_t stable_count = 0;
+    if (pg_count > 0)
+    {
+        for (auto it = clean_db.begin(); it != clean_db.end(); it++)
+        {
+            uint32_t pg = (it->first.inode + it->first.stripe / parity_block_size) % pg_count;
+            if (pg == list_pg)
+            {
+                stable_count++;
+            }
+        }
+    }
+    else
+    {
+        stable_count = clean_db.size();
+    }
+    uint64_t total_count = stable_count;
+    for (auto it = dirty_db.begin(); it != dirty_db.end(); it++)
+    {
+        if (!pg_count || ((it->first.oid.inode + it->first.oid.stripe / parity_block_size) % pg_count) == list_pg)
+        {
+            if (IS_STABLE(it->second.state))
+            {
+                stable_count++;
+            }
+            total_count++;
+        }
+    }
+    // Allocate the result array; malloc(0) may legally return NULL, so an empty listing is not an error
+    op->version = stable_count;
+    op->retval = total_count;
+    op->buf = malloc(sizeof(obj_ver_id) * total_count);
+    if (!op->buf && total_count > 0)
    {
        op->retval = -ENOMEM;
        FINISH_OP(op);
        return;
    }
+    obj_ver_id *vers = (obj_ver_id*)op->buf;
+    int i = 0;
    for (auto it = clean_db.begin(); it != clean_db.end(); it++)
    {
-        if (!pg_count || ((it->first.inode + it->first.stripe / pg_stripe_size) % pg_count) == list_pg)
+        if (!pg_count || ((it->first.inode + it->first.stripe / parity_block_size) % pg_count) == list_pg)
        {
-            if (stable_count >= stable_alloc)
-            {
-                stable_alloc += 32768;
-                stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
-                if (!stable)
-                {
-                    op->retval = -ENOMEM;
-                    FINISH_OP(op);
-                    return;
-                }
-            }
-            stable[stable_count++] = {
+            vers[i++] = {
                .oid = it->first,
                .version = it->second.version,
            };
        }
    }
-    int clean_stable_count = stable_count;
-    // Copy dirty_db entries (sorted, too)
-    int unstable_count = 0, unstable_alloc = 0;
-    obj_ver_id *unstable = NULL;
+    int j = stable_count;
    for (auto it = dirty_db.begin(); it != dirty_db.end(); it++)
    {
-        if (!pg_count || ((it->first.oid.inode + it->first.oid.stripe / pg_stripe_size) % pg_count) == list_pg)
+        if (!pg_count || ((it->first.oid.inode + it->first.oid.stripe / parity_block_size) % pg_count) == list_pg)
        {
-            if (IS_DELETE(it->second.state))
+            if (IS_STABLE(it->second.state))
            {
-                // Deletions are always stable, so try to zero out two possible entries
-                if (!replace_stable(it->first.oid, 0, 0, clean_stable_count, stable))
-                {
-                    replace_stable(it->first.oid, 0, clean_stable_count, stable_count, stable);
-                }
-            }
-            else if (IS_STABLE(it->second.state))
-            {
-                // First try to replace a clean stable version in the first part of the list
-                if (!replace_stable(it->first.oid, it->first.version, 0, clean_stable_count, stable))
-                {
-                    // Then try to replace the last dirty stable version in the second part of the list
-                    if (stable[stable_count-1].oid == it->first.oid)
-                    {
-                        stable[stable_count-1].version = it->first.version;
-                    }
-                    else
-                    {
-                        if (stable_count >= stable_alloc)
-                        {
-                            stable_alloc += 32768;
-                            stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
-                            if (!stable)
-                            {
-                                if (unstable)
-                                    free(unstable);
-                                op->retval = -ENOMEM;
-                                FINISH_OP(op);
-                                return;
-                            }
-                        }
-                        stable[stable_count++] = it->first;
-                    }
-                }
+                vers[i++] = it->first;
            }
            else
            {
-                if (unstable_count >= unstable_alloc)
-                {
-                    unstable_alloc += 32768;
-                    unstable = (obj_ver_id*)realloc(unstable, sizeof(obj_ver_id) * unstable_alloc);
-                    if (!unstable)
-                    {
-                        if (stable)
-                            free(stable);
-                        op->retval = -ENOMEM;
-                        FINISH_OP(op);
-                        return;
-                    }
-                }
-                unstable[unstable_count++] = it->first;
+                vers[j++] = it->first;
            }
        }
    }
-    // Remove zeroed out stable entries
-    int j = 0;
-    for (int i = 0; i < stable_count; i++)
-    {
-        if (stable[i].version != 0)
-        {
-            stable[j++] = stable[i];
-        }
-    }
-    stable_count = j;
-    if (stable_count+unstable_count > stable_alloc)
-    {
-        stable_alloc = stable_count+unstable_count;
-        stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
-        if (!stable)
-        {
-            if (unstable)
-                free(unstable);
-            op->retval = -ENOMEM;
-            FINISH_OP(op);
-            return;
-        }
-    }
-    // Copy unstable entries
-    for (int i = 0; i < unstable_count; i++)
-    {
-        stable[j++] = unstable[i];
-    }
-    free(unstable);
-    op->version = stable_count;
-    op->retval = stable_count+unstable_count;
-    op->buf = stable;
    FINISH_OP(op);
 }
--- a/blockstore_impl.h
+++ b/blockstore_impl.h
@@ -1,6 +1,7 @@
 #pragma once

 #include "blockstore.h"
+#include "timerfd_interval.h"

 #include <sys/types.h>
 #include <sys/ioctl.h>
@@ -15,7 +16,7 @@
 #include <deque>
 #include <new>

-#include "cpp-btree/btree_map.h"
+#include "sparsepp/sparsepp/spp.h"

 #include "allocator.h"

@@ -24,17 +25,17 @@
 // States are not stored on disk. Instead, they're deduced from the journal
 // FIXME: Rename to BS_ST_*

-#define ST_J_WAIT_BIG 1
-#define ST_J_IN_FLIGHT 2
-#define ST_J_SUBMITTED 3
-#define ST_J_WRITTEN 4
-#define ST_J_SYNCED 5
-#define ST_J_STABLE 6
+#define ST_J_IN_FLIGHT 1
+#define ST_J_SUBMITTED 2
+#define ST_J_WRITTEN 3
+#define ST_J_SYNCED 4
+#define ST_J_STABLE 5

 #define ST_D_IN_FLIGHT 15
 #define ST_D_SUBMITTED 16
 #define ST_D_WRITTEN 17
-#define ST_D_SYNCED 20
+#define ST_D_META_WRITTEN 19
+#define ST_D_META_SYNCED 20
 #define ST_D_STABLE 21

 #define ST_DEL_IN_FLIGHT 31
@@ -45,17 +46,13 @@

 #define ST_CURRENT 48

-#define IMMEDIATE_NONE 0
-#define IMMEDIATE_SMALL 1
-#define IMMEDIATE_ALL 2
-
-#define IS_IN_FLIGHT(st) (st == ST_J_WAIT_BIG || st == ST_J_IN_FLIGHT || st == ST_D_IN_FLIGHT || st == ST_DEL_IN_FLIGHT || st == ST_J_SUBMITTED || st == ST_D_SUBMITTED || st == ST_DEL_SUBMITTED)
+#define IS_IN_FLIGHT(st) (st == ST_J_IN_FLIGHT || st == ST_D_IN_FLIGHT || st == ST_DEL_IN_FLIGHT || st == ST_J_SUBMITTED || st == ST_D_SUBMITTED || st == ST_DEL_SUBMITTED)
 #define IS_STABLE(st) (st == ST_J_STABLE || st == ST_D_STABLE || st == ST_DEL_STABLE || st == ST_CURRENT)
-#define IS_SYNCED(st) (IS_STABLE(st) || st == ST_J_SYNCED || st == ST_D_SYNCED || st == ST_DEL_SYNCED)
-#define IS_JOURNAL(st) (st >= ST_J_WAIT_BIG && st <= ST_J_STABLE)
-#define IS_BIG_WRITE(st) (st >= ST_D_IN_FLIGHT && st <= ST_D_STABLE)
-#define IS_DELETE(st) (st >= ST_DEL_IN_FLIGHT && st <= ST_DEL_STABLE)
-#define IS_UNSYNCED(st) (st >= ST_J_WAIT_BIG && st <= ST_J_WRITTEN || st >= ST_D_IN_FLIGHT && st <= ST_D_WRITTEN|| st >= ST_DEL_IN_FLIGHT && st <= ST_DEL_WRITTEN)
+#define IS_SYNCED(st) (IS_STABLE(st) || st == ST_J_SYNCED || st == ST_D_META_SYNCED || st == ST_DEL_SYNCED)
+#define IS_JOURNAL(st) (st >= ST_J_SUBMITTED && st <= ST_J_STABLE)
+#define IS_BIG_WRITE(st) (st >= ST_D_SUBMITTED && st <= ST_D_STABLE)
+#define IS_DELETE(st) (st >= ST_DEL_SUBMITTED && st <= ST_DEL_STABLE)
+#define IS_UNSYNCED(st) (st >= ST_J_SUBMITTED && st <= ST_J_WRITTEN || st >= ST_D_SUBMITTED && st <= ST_D_META_WRITTEN || st >= ST_DEL_SUBMITTED && st <= ST_DEL_WRITTEN)

 #define BS_SUBMIT_GET_SQE(sqe, data) \
    BS_SUBMIT_GET_ONLY_SQE(sqe); \
@@ -127,6 +124,8 @@ struct __attribute__((__packed__)) dirty_entry

 // Suspend operation until there are more free SQEs
 #define WAIT_SQE 1
+// Suspend operation until version <wait_detail> of object <oid> is written
+#define WAIT_IN_FLIGHT 2
 // Suspend operation until there are <wait_detail> bytes of free space in the journal on disk
 #define WAIT_JOURNAL 3
 // Suspend operation until the next journal sector buffer is free
@@ -140,7 +139,7 @@ struct fulfill_read_t
 };

 #define PRIV(op) ((blockstore_op_private_t*)(op)->private_data)
-#define FINISH_OP(op) PRIV(op)->~blockstore_op_private_t(); std::function<void (blockstore_op_t*)>(op->callback)(op)
+#define FINISH_OP(op) PRIV(op)->~blockstore_op_private_t(); op->callback(op)

 struct blockstore_op_private_t
 {
@@ -148,13 +147,12 @@ struct blockstore_op_private_t
    int wait_for;
    uint64_t wait_detail;
    int pending_ops;
-    int op_state;

    // Read
    std::vector<fulfill_read_t> read_vec;

    // Sync, write
-    uint64_t min_flushed_journal_sector, max_flushed_journal_sector;
+    uint64_t min_used_journal_sector, max_used_journal_sector;

    // Write
    struct iovec iov_zerofill[3];
@@ -163,13 +161,9 @@ struct blockstore_op_private_t
    std::vector<obj_ver_id> sync_big_writes, sync_small_writes;
    int sync_small_checked, sync_big_checked;
    std::list<blockstore_op_t*>::iterator in_progress_ptr;
-    int prev_sync_count;
+    int sync_state, prev_sync_count;
 };

-// https://github.com/algorithm-ninja/cpp-btree
-// https://github.com/greg7mdp/sparsepp/ was used previously, but it was TERRIBLY slow after resizing
-// with sparsepp, random reads dropped to ~700 iops very fast with just as much as ~32k objects in the DB
-typedef btree::btree_map<object_id, clean_entry> blockstore_clean_db_t;
 typedef std::map<obj_ver_id, dirty_entry> blockstore_dirty_db_t;

 #include "blockstore_init.h"
@@ -183,30 +177,29 @@ class blockstore_impl_t
    uint32_t block_size;
    uint64_t meta_offset;
    uint64_t data_offset;
-    uint64_t cfg_journal_size, cfg_data_size;
+    uint64_t cfg_journal_size;
    // Required write alignment and journal/metadata/data areas' location alignment
-    uint32_t disk_alignment = 4096;
+    uint32_t disk_alignment = 512;
    // Journal block size - minimum_io_size of the journal device is the best choice
-    uint64_t journal_block_size = 4096;
+    uint64_t journal_block_size = 512;
    // Metadata block size - minimum_io_size of the metadata device is the best choice
-    uint64_t meta_block_size = 4096;
+    uint64_t meta_block_size = 512;
    // Sparse write tracking granularity. 4 KB is a good choice. Must be a multiple of disk_alignment
    uint64_t bitmap_granularity = 4096;
    bool readonly = false;
-    // By default, Blockstore locks all opened devices exclusively. This option can be used to disable locking
-    bool disable_flock = false;
    // It is safe to disable fsync() if drive write cache is writethrough
    bool disable_data_fsync = false, disable_meta_fsync = false, disable_journal_fsync = false;
    // Enable if you want every operation to be executed with an "implicit fsync"
-    // Suitable only for server SSDs with capacitors, requires disabled data and journal fsyncs
-    int immediate_commit = IMMEDIATE_NONE;
+    // FIXME Not implemented yet
+    bool immediate_commit = false;
    bool inmemory_meta = false;
    int flusher_count;
    /******* END OF OPTIONS *******/

    struct ring_consumer_t ring_consumer;

-    blockstore_clean_db_t clean_db;
+    // Another option is https://github.com/algorithm-ninja/cpp-btree
+    spp::sparse_hash_map<object_id, clean_entry> clean_db;
    uint8_t *clean_bitmap = NULL;
    blockstore_dirty_db_t dirty_db;
    std::list<blockstore_op_t*> submit_queue; // FIXME: funny thing is that vector is better here
@@ -231,7 +224,6 @@ class blockstore_impl_t

    bool live = false, queue_stall = false;
    ring_loop_t *ringloop;
-    int inflight_writes = 0;

    bool stop_sync_submitted;

@@ -272,7 +264,7 @@ class blockstore_impl_t
    bool enqueue_write(blockstore_op_t *op);
    int dequeue_write(blockstore_op_t *op);
    int dequeue_del(blockstore_op_t *op);
-    int continue_write(blockstore_op_t *op);
+    void ack_write(blockstore_op_t *op);
    void release_journal_sectors(blockstore_op_t *op);
    void handle_write_event(ring_data_t *data, blockstore_op_t *op);

@@ -285,15 +277,11 @@ class blockstore_impl_t

    // Stabilize
    int dequeue_stable(blockstore_op_t *op);
-    int continue_stable(blockstore_op_t *op);
-    void mark_stable(const obj_ver_id & ov);
    void handle_stable_event(ring_data_t *data, blockstore_op_t *op);
    void stabilize_object(object_id oid, uint64_t max_ver);

    // Rollback
    int dequeue_rollback(blockstore_op_t *op);
-    int continue_rollback(blockstore_op_t *op);
-    void mark_rolled_back(const obj_ver_id & ov);
    void handle_rollback_event(ring_data_t *data, blockstore_op_t *op);
    void erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc);

@@ -328,6 +316,5 @@ public:

    inline uint32_t get_block_size() { return block_size; }
    inline uint64_t get_block_count() { return block_count; }
-    inline uint64_t get_free_block_count() { return data_alloc->get_free_count(); }
    inline uint32_t get_disk_alignment() { return disk_alignment; }
 };
--- a/blockstore_init.cpp
+++ b/blockstore_init.cpp
@@ -402,9 +402,8 @@ resume_1:
    }
    // Trim journal on start so we don't stall when all entries are older
    bs->journal.trim();
-    bs->journal.dirty_start = bs->journal.next_free;
    printf(
-        "Journal entries loaded: %lu, free journal space: %lu bytes (%08lx..%08lx is used), free blocks: %lu / %lu\n",
+        "Journal entries loaded: %lu, free journal space: %lu bytes (%lu..%lu is used), free blocks: %lu / %lu\n",
        entries_loaded,
        (bs->journal.next_free >= bs->journal.used_start
            ? bs->journal.len-bs->journal.block_size - (bs->journal.next_free-bs->journal.used_start)
@@ -440,7 +439,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
        {
            journal_entry *je = (journal_entry*)(buf + proc_pos - done_pos + pos);
            if (je->magic != JOURNAL_MAGIC || je_crc32(je) != je->crc32 ||
-                je->type < JE_MIN || je->type > JE_MAX || started && je->crc32_prev != crc32_last)
+                je->type < JE_SMALL_WRITE || je->type > JE_DELETE || started && je->crc32_prev != crc32_last)
            {
                if (pos == 0)
                {
@@ -475,7 +474,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                if (location != je->small_write.data_offset)
                {
                    char err[1024];
-                    snprintf(err, 1024, "BUG: calculated journal data offset (%08lx) != stored journal data offset (%08lx)", location, je->small_write.data_offset);
+                    snprintf(err, 1024, "BUG: calculated journal data offset (%lu) != stored journal data offset (%lu)", location, je->small_write.data_offset);
                    throw std::runtime_error(err);
                }
                uint32_t data_crc32 = 0;
@@ -510,9 +509,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                if (data_crc32 != je->small_write.crc32_data)
                {
                    // journal entry is corrupt, stop here
-                    // interesting thing is that we must clear the corrupt entry if we're not readonly,
-                    // because we don't write next entries in the same journal block
-                    printf("Journal entry data is corrupt (data crc32 %x != %x)\n", data_crc32, je->small_write.crc32_data);
+                    // interesting thing is that we must clear the corrupt entry if we're not readonly
                    memset(buf + proc_pos - done_pos + pos, 0, bs->journal.block_size - pos);
                    bs->journal.next_free = prev_free;
                    init_write_buf = buf + proc_pos - done_pos;
@@ -521,7 +518,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                }
                auto clean_it = bs->clean_db.find(je->small_write.oid);
                if (clean_it == bs->clean_db.end() ||
-                    clean_it->second.version < je->small_write.version)
+                    clean_it->second.version < je->small_write.version) // small-write entry: compare its own version field
                {
                    obj_ver_id ov = {
                        .oid = je->small_write.oid,
@@ -537,10 +534,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                    });
                    bs->journal.used_sectors[proc_pos]++;
 #ifdef BLOCKSTORE_DEBUG
-                    printf(
-                        "journal offset %08lx is used by %lu:%lu v%lu (%lu refs)\n",
-                        proc_pos, ov.oid.inode, ov.oid.stripe, ov.version, bs->journal.used_sectors[proc_pos]
-                    );
+                    printf("journal offset %lu is used by %lu:%lu v%lu\n", proc_pos, ov.oid.inode, ov.oid.stripe, ov.version);
 #endif
                    auto & unstab = bs->unstable_writes[ov.oid];
                    unstab = unstab < ov.version ? ov.version : unstab;
@@ -561,7 +555,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                        .version = je->big_write.version,
                    };
                    bs->dirty_db.emplace(ov, (dirty_entry){
-                        .state = ST_D_SYNCED,
+                        .state = ST_D_META_SYNCED,
                        .flags = 0,
                        .location = je->big_write.location,
                        .offset = je->big_write.offset,
@@ -587,7 +581,33 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                    .oid = je->stable.oid,
                    .version = je->stable.version,
                };
-                bs->mark_stable(ov);
+                auto it = bs->dirty_db.find(ov);
+                if (it == bs->dirty_db.end())
+                {
+                    // journal contains a legitimate STABLE entry for a non-existing dirty write
+                    // this probably means that journal was trimmed between WRITE and STABLE entries
+                    // skip it
+                }
+                else
+                {
+                    while (1)
+                    {
+                        it->second.state = (it->second.state == ST_D_META_SYNCED
+                            ? ST_D_STABLE
+                            : (it->second.state == ST_DEL_SYNCED ? ST_DEL_STABLE : ST_J_STABLE));
+                        if (it == bs->dirty_db.begin())
+                            break;
+                        it--;
+                        if (it->first.oid != ov.oid || IS_STABLE(it->second.state))
+                            break;
+                    }
+                    bs->flusher->enqueue_flush(ov);
+                }
+                auto unstab_it = bs->unstable_writes.find(ov.oid);
+                if (unstab_it != bs->unstable_writes.end() && unstab_it->second <= ov.version)
+                {
+                    bs->unstable_writes.erase(unstab_it);
+                }
            }
            else if (je->type == JE_ROLLBACK)
            {
@@ -595,39 +615,70 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                printf("je_rollback oid=%lu:%lu ver=%lu\n", je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version);
 #endif
                // rollback dirty writes of <oid> up to <version>
-                obj_ver_id ov = {
+                auto it = bs->dirty_db.lower_bound((obj_ver_id){
                    .oid = je->rollback.oid,
-                    .version = je->rollback.version,
-                };
-                bs->mark_rolled_back(ov);
+                    .version = UINT64_MAX,
+                });
+                if (it != bs->dirty_db.begin())
+                {
+                    uint64_t max_unstable = 0;
+                    auto rm_start = it;
+                    auto rm_end = it;
+                    it--;
+                    while (it->first.oid == je->rollback.oid &&
+                        it->first.version > je->rollback.version &&
+                        !IS_IN_FLIGHT(it->second.state) &&
+                        !IS_STABLE(it->second.state))
+                    {
+                        if (it->first.oid != je->rollback.oid)
+                            break;
+                        else if (it->first.version <= je->rollback.version)
+                        {
+                            if (!IS_STABLE(it->second.state))
+                                max_unstable = it->first.version;
+                            break;
+                        }
+                        else if (IS_STABLE(it->second.state))
+                            break;
+                        // Remove entry
+                        rm_start = it;
+                        if (it == bs->dirty_db.begin())
+                            break;
+                        it--;
+                    }
+                    if (rm_start != rm_end)
+                    {
+                        bs->erase_dirty(rm_start, rm_end, UINT64_MAX);
+                    }
+                    auto unstab_it = bs->unstable_writes.find(je->rollback.oid);
+                    if (unstab_it != bs->unstable_writes.end())
+                    {
+                        if (max_unstable == 0)
+                            bs->unstable_writes.erase(unstab_it);
+                        else
+                            unstab_it->second = max_unstable;
+                    }
+                }
            }
            else if (je->type == JE_DELETE)
            {
 #ifdef BLOCKSTORE_DEBUG
                printf("je_delete oid=%lu:%lu ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version);
 #endif
-                auto clean_it = bs->clean_db.find(je->del.oid);
-                if (clean_it == bs->clean_db.end() ||
-                    clean_it->second.version < je->del.version)
-                {
-                    // oid, version
-                    obj_ver_id ov = {
-                        .oid = je->del.oid,
-                        .version = je->del.version,
-                    };
-                    bs->dirty_db.emplace(ov, (dirty_entry){
-                        .state = ST_DEL_SYNCED,
-                        .flags = 0,
-                        .location = 0,
-                        .offset = 0,
-                        .len = 0,
-                        .journal_sector = proc_pos,
-                    });
-                    bs->journal.used_sectors[proc_pos]++;
-                    // Deletions are treated as immediately stable, because
-                    // "2-phase commit" (write->stabilize) isn't sufficient for them anyway
-                    bs->mark_stable(ov);
-                }
+                // oid, version
+                obj_ver_id ov = {
+                    .oid = je->del.oid,
+                    .version = je->del.version,
+                };
+                bs->dirty_db.emplace(ov, (dirty_entry){
+                    .state = ST_DEL_SYNCED,
+                    .flags = 0,
+                    .location = 0,
+                    .offset = 0,
+                    .len = 0,
+                    .journal_sector = proc_pos,
+                });
+                bs->journal.used_sectors[proc_pos]++;
            }
            started = true;
            pos += je->size;
--- a/blockstore_journal.cpp
+++ b/blockstore_journal.cpp
@@ -6,24 +6,18 @@ blockstore_journal_check_t::blockstore_journal_check_t(blockstore_impl_t *bs)
    sectors_required = 0;
    next_pos = bs->journal.next_free;
    next_sector = bs->journal.cur_sector;
-    first_sector = -1;
    next_in_pos = bs->journal.in_sector_pos;
    right_dir = next_pos >= bs->journal.used_start;
 }

 // Check if we can write <required> entries of <size> bytes and <data_after> data bytes after them to the journal
-int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries_required, int size, int data_after)
+int blockstore_journal_check_t::check_available(blockstore_op_t *op, int required, int size, int data_after)
 {
-    int required = entries_required;
    while (1)
    {
        int fits = (bs->journal.block_size - next_in_pos) / size;
        if (fits > 0)
        {
-            if (first_sector == -1)
-            {
-                first_sector = next_sector;
-            }
            required -= fits;
            next_in_pos += fits * size;
            sectors_required++;
@@ -44,40 +38,19 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
            right_dir = false;
        }
        next_in_pos = 0;
-        next_sector = ((next_sector + 1) % bs->journal.sector_count);
-        if (next_sector == first_sector)
+        if (bs->journal.sector_info[next_sector].usage_count > 0 ||
+            bs->journal.sector_info[next_sector].dirty)
        {
-            // next_sector may wrap when all sectors are flushed and the incoming batch is too big
-            // This is an error condition, we can't wait for anything in this case
-            throw std::runtime_error(
-                "Blockstore journal_sector_buffer_count="+std::to_string(bs->journal.sector_count)+
-                " is too small for a batch of "+std::to_string(entries_required)+" entries of "+std::to_string(size)+" bytes"
-            );
+            next_sector = ((next_sector + 1) % bs->journal.sector_count);
        }
        if (bs->journal.sector_info[next_sector].usage_count > 0 ||
            bs->journal.sector_info[next_sector].dirty)
        {
            // No memory buffer available. Wait for it.
-            int used = 0, dirty = 0;
-            for (int i = 0; i < bs->journal.sector_count; i++)
-            {
-                if (bs->journal.sector_info[i].dirty)
-                {
-                    dirty++;
-                    used++;
-                }
-                if (bs->journal.sector_info[i].usage_count > 0)
-                {
-                    used++;
-                }
-            }
-            // In fact, it's even more rare than "ran out of journal space", so print a warning
-            printf(
-                "Ran out of journal sector buffers: %d/%lu buffers used (%d dirty), next buffer (%ld) is %s and flushed %lu times\n",
-                used, bs->journal.sector_count, dirty, next_sector,
-                bs->journal.sector_info[next_sector].dirty ? "dirty" : "not dirty",
-                bs->journal.sector_info[next_sector].usage_count
-            );
+#ifdef BLOCKSTORE_DEBUG
+            printf("next journal buffer %d is still dirty=%d used=%d\n", next_sector,
+                bs->journal.sector_info[next_sector].dirty, bs->journal.sector_info[next_sector].usage_count);
+#endif
            PRIV(op)->wait_for = WAIT_JOURNAL_BUFFER;
            return 0;
        }
@@ -101,7 +74,7 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
                : bs->journal.used_start - bs->journal.next_free)
        );
        PRIV(op)->wait_for = WAIT_JOURNAL;
-        bs->flusher->request_trim();
+        bs->flusher->force_start();
        PRIV(op)->wait_detail = bs->journal.used_start;
        return 0;
    }
@@ -118,11 +91,6 @@ journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type,
        {
            // Also select next sector buffer in memory
            journal.cur_sector = ((journal.cur_sector + 1) % journal.sector_count);
-            assert(!journal.sector_info[journal.cur_sector].usage_count);
-        }
-        else
-        {
-            journal.dirty_start = journal.next_free;
        }
        journal.sector_info[journal.cur_sector].offset = journal.next_free;
        journal.in_sector_pos = 0;
@@ -180,8 +148,8 @@ bool journal_t::trim()
    auto journal_used_it = used_sectors.lower_bound(used_start);
 #ifdef BLOCKSTORE_DEBUG
    printf(
-        "Trimming journal (used_start=%08lx, next_free=%08lx, dirty_start=%08lx, new_start=%08lx, new_refcount=%ld)\n",
-        used_start, next_free, dirty_start,
+        "Trimming journal (used_start=%lu, next_free=%lu, first_used=%lu, usage_count=%lu)\n",
+        used_start, next_free,
        journal_used_it == used_sectors.end() ? 0 : journal_used_it->first,
        journal_used_it == used_sectors.end() ? 0 : journal_used_it->second
    );
@@ -212,7 +180,7 @@ bool journal_t::trim()
        return false;
    }
 #ifdef BLOCKSTORE_DEBUG
-    printf("Journal trimmed to %08lx (next_free=%08lx)\n", used_start, next_free);
+    printf("Journal trimmed to %lu (next_free=%lu)\n", used_start, next_free);
 #endif
    return true;
 }
--- a/blockstore_journal.h
+++ b/blockstore_journal.h
@@ -12,14 +12,12 @@
 // Journal entries
 // Journal entries are linked to each other by their crc32 value
 // The journal is almost a blockchain, because object versions constantly increase
-#define JE_MIN         0x01
 #define JE_START       0x01
 #define JE_SMALL_WRITE 0x02
 #define JE_BIG_WRITE   0x03
 #define JE_STABLE      0x04
 #define JE_DELETE      0x05
 #define JE_ROLLBACK    0x06
-#define JE_MAX         0x06

 // crc32c comes first to ease calculation and is equal to crc32()
 struct __attribute__((__packed__)) journal_entry_start
@@ -137,14 +135,10 @@ struct journal_t
    bool inmemory = false;
    void *buffer = NULL;

-    uint64_t block_size;
+    uint64_t block_size = 512;
    uint64_t offset, len;
-    // Next free block offset
    uint64_t next_free = 0;
-    // First occupied block offset
    uint64_t used_start = 0;
-    // End of the last block not used for writing anymore
-    uint64_t dirty_start = 0;
    uint32_t crc32_last = 0;

    // Current sector(s) used for writing
@@ -166,7 +160,7 @@ struct blockstore_journal_check_t
 {
    blockstore_impl_t *bs;
    uint64_t next_pos, next_sector, next_in_pos;
-    int sectors_required, first_sector;
+    int sectors_required;
    bool right_dir; // writing to the end or the beginning of the ring buffer

    blockstore_journal_check_t(blockstore_impl_t *bs);
--- a/blockstore_open.cpp
+++ b/blockstore_open.cpp
@@ -1,4 +1,3 @@
-#include <sys/file.h>
 #include "blockstore_impl.h"

 static uint32_t is_power_of_two(uint64_t value)
@@ -35,23 +34,10 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
    {
        disable_journal_fsync = true;
    }
-    if (config["disable_device_lock"] == "true" || config["disable_device_lock"] == "1" || config["disable_device_lock"] == "yes")
-    {
-        disable_flock = true;
-    }
-    if (config["immediate_commit"] == "all")
-    {
-        immediate_commit = IMMEDIATE_ALL;
-    }
-    else if (config["immediate_commit"] == "small")
-    {
-        immediate_commit = IMMEDIATE_SMALL;
-    }
    metadata_buf_size = strtoull(config["meta_buf_size"].c_str(), NULL, 10);
    cfg_journal_size = strtoull(config["journal_size"].c_str(), NULL, 10);
    data_device = config["data_device"];
    data_offset = strtoull(config["data_offset"].c_str(), NULL, 10);
-    cfg_data_size = strtoull(config["data_size"].c_str(), NULL, 10);
    meta_device = config["meta_device"];
    meta_offset = strtoull(config["meta_offset"].c_str(), NULL, 10);
    block_size = strtoull(config["block_size"].c_str(), NULL, 10);
@@ -80,7 +66,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
    }
    if (!disk_alignment)
    {
-        disk_alignment = 4096;
+        disk_alignment = 512;
    }
    else if (disk_alignment % MEM_ALIGNMENT)
    {
@@ -88,7 +74,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
    }
    if (!journal_block_size)
    {
-        journal_block_size = 4096;
+        journal_block_size = 512;
    }
    else if (journal_block_size % MEM_ALIGNMENT)
    {
@@ -96,7 +82,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
    }
    if (!meta_block_size)
    {
-        meta_block_size = 4096;
+        meta_block_size = 512;
    }
    else if (meta_block_size % MEM_ALIGNMENT)
    {
@@ -142,22 +128,6 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
    {
        metadata_buf_size = 4*1024*1024;
    }
-    if (meta_device == "")
-    {
-        disable_meta_fsync = disable_data_fsync;
-    }
-    if (journal_device == "")
-    {
-        disable_journal_fsync = disable_meta_fsync;
-    }
-    if (immediate_commit != IMMEDIATE_NONE && !disable_journal_fsync)
-    {
-        throw std::runtime_error("immediate_commit requires disable_journal_fsync");
-    }
-    if (immediate_commit == IMMEDIATE_ALL && !disable_data_fsync)
-    {
-        throw std::runtime_error("immediate_commit=all requires disable_journal_fsync and disable_data_fsync");
-    }
    // init some fields
    clean_entry_bitmap_size = block_size / bitmap_granularity / 8;
    clean_entry_size = sizeof(clean_disk_entry) + clean_entry_bitmap_size;
@@ -181,15 +151,6 @@ void blockstore_impl_t::calc_lengths()
        data_len = data_len < journal.offset-data_offset
            ? data_len : journal.offset-data_offset;
    }
-    if (cfg_data_size != 0)
-    {
-        if (data_len < cfg_data_size)
-        {
-            throw std::runtime_error("Data area ("+std::to_string(data_len)+
-                " bytes) is less than configured size ("+std::to_string(cfg_data_size)+" bytes)");
-        }
-        data_len = cfg_data_size;
-    }
    // meta
    meta_area = (meta_fd == data_fd ? data_size : meta_size) - meta_offset;
    if (meta_fd == data_fd && meta_offset <= data_offset)
@@ -291,10 +252,6 @@ void blockstore_impl_t::open_data()
    {
        throw std::runtime_error("data_offset exceeds device size = "+std::to_string(data_size));
    }
-    if (!disable_flock && flock(data_fd, LOCK_EX|LOCK_NB) != 0)
-    {
-        throw std::runtime_error(std::string("Failed to lock data device: ") + strerror(errno));
-    }
 }

 void blockstore_impl_t::open_meta()
@@ -312,14 +269,11 @@ void blockstore_impl_t::open_meta()
        {
            throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(meta_size));
        }
-        if (!disable_flock && flock(meta_fd, LOCK_EX|LOCK_NB) != 0)
-        {
-            throw std::runtime_error(std::string("Failed to lock metadata device: ") + strerror(errno));
-        }
    }
    else
    {
        meta_fd = data_fd;
+        disable_meta_fsync = disable_data_fsync;
        meta_size = 0;
        if (meta_offset >= data_size)
        {
@@ -337,15 +291,12 @@ void blockstore_impl_t::open_journal()
        {
            throw std::runtime_error("Failed to open journal device");
        }
-        check_size(journal.fd, &journal.device_size, "journal device");
-        if (!disable_flock && flock(journal.fd, LOCK_EX|LOCK_NB) != 0)
-        {
-            throw std::runtime_error(std::string("Failed to lock journal device: ") + strerror(errno));
-        }
+        check_size(journal.fd, &journal.device_size, "metadata device");
    }
    else
    {
        journal.fd = meta_fd;
+        disable_journal_fsync = disable_meta_fsync;
        journal.device_size = 0;
        if (journal.offset >= data_size)
        {
--- a/blockstore_read.cpp
+++ b/blockstore_read.cpp
@@ -8,10 +8,12 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_
        // Zero-length version - skip
        return 1;
    }
-    else if (IS_IN_FLIGHT(item_state))
+    if (IS_IN_FLIGHT(item_state))
    {
-        // Write not finished yet - skip
-        return 1;
+        // Pause until it's written somewhere
+        PRIV(op)->wait_for = WAIT_IN_FLIGHT;
+        PRIV(op)->wait_detail = item_version;
+        return 0;
    }
    else if (IS_DELETE(item_state))
    {
@@ -131,66 +133,63 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
            dirty_it--;
        }
    }
-    if (clean_it != clean_db.end())
+    if (clean_it != clean_db.end() && fulfilled < read_op->len)
    {
        if (!result_version)
        {
            result_version = clean_it->second.version;
        }
-        if (fulfilled < read_op->len)
+        if (!clean_entry_bitmap_size)
        {
-            if (!clean_entry_bitmap_size)
+            if (!fulfill_read(read_op, fulfilled, 0, block_size, ST_CURRENT, 0, clean_it->second.location))
            {
-                if (!fulfill_read(read_op, fulfilled, 0, block_size, ST_CURRENT, 0, clean_it->second.location))
-                {
-                    // need to wait. undo added requests, don't dequeue op
-                    PRIV(read_op)->read_vec.clear();
-                    return 0;
-                }
+                // need to wait. undo added requests, don't dequeue op
+                PRIV(read_op)->read_vec.clear();
+                return 0;
+            }
+        }
+        else
+        {
+            uint64_t meta_loc = clean_it->second.location >> block_order;
+            uint8_t *clean_entry_bitmap;
+            if (inmemory_meta)
+            {
+                uint64_t sector = (meta_loc / (meta_block_size / clean_entry_size)) * meta_block_size;
+                uint64_t pos = (meta_loc % (meta_block_size / clean_entry_size));
+                clean_entry_bitmap = (uint8_t*)(metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry));
            }
            else
            {
-                uint64_t meta_loc = clean_it->second.location >> block_order;
-                uint8_t *clean_entry_bitmap;
-                if (inmemory_meta)
+                clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*clean_entry_bitmap_size);
+            }
+            uint64_t bmp_start = 0, bmp_end = 0, bmp_size = block_size/bitmap_granularity;
+            while (bmp_start < bmp_size)
+            {
+                while (!(clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7))) && bmp_end < bmp_size)
                {
-                    uint64_t sector = (meta_loc / (meta_block_size / clean_entry_size)) * meta_block_size;
-                    uint64_t pos = (meta_loc % (meta_block_size / clean_entry_size));
-                    clean_entry_bitmap = (uint8_t*)(metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry));
+                    bmp_end++;
                }
-                else
+                if (bmp_end > bmp_start)
                {
-                    clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*clean_entry_bitmap_size);
+                    // fill with zeroes
+                    fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
+                        bmp_end * bitmap_granularity, ST_DEL_STABLE, 0, 0);
                }
-                uint64_t bmp_start = 0, bmp_end = 0, bmp_size = block_size/bitmap_granularity;
-                while (bmp_start < bmp_size)
+                bmp_start = bmp_end;
+                while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size)
                {
-                    while (!(clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7))) && bmp_end < bmp_size)
+                    bmp_end++;
+                }
+                if (bmp_end > bmp_start)
+                {
+                    if (!fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
+                        bmp_end * bitmap_granularity, ST_CURRENT, 0, clean_it->second.location + bmp_start * bitmap_granularity))
                    {
-                        bmp_end++;
-                    }
-                    if (bmp_end > bmp_start)
-                    {
-                        // fill with zeroes
-                        fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
-                            bmp_end * bitmap_granularity, ST_DEL_STABLE, 0, 0);
+                        // need to wait. undo added requests, don't dequeue op
+                        PRIV(read_op)->read_vec.clear();
+                        return 0;
                    }
                    bmp_start = bmp_end;
-                    while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size)
-                    {
-                        bmp_end++;
-                    }
-                    if (bmp_end > bmp_start)
-                    {
-                        if (!fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
-                            bmp_end * bitmap_granularity, ST_CURRENT, 0, clean_it->second.location + bmp_start * bitmap_granularity))
-                        {
-                            // need to wait. undo added requests, don't dequeue op
-                            PRIV(read_op)->read_vec.clear();
-                            return 0;
-                        }
-                        bmp_start = bmp_end;
-                    }
                }
            }
        }
--- a/blockstore_rollback.cpp
+++ b/blockstore_rollback.cpp
@@ -2,10 +2,6 @@

 int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
 {
-    if (PRIV(op)->op_state)
-    {
-        return continue_rollback(op);
-    }
    obj_ver_id* v;
    int i, todo = op->len;
    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
@@ -18,13 +14,8 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
        });
        if (dirty_it == dirty_db.begin())
        {
-            if (v->version == 0)
-            {
-                // Already rolled back
-                // FIXME Skip this object version
-            }
        bad_op:
-            op->retval = -ENOENT;
+            op->retval = -EINVAL;
            FINISH_OP(op);
            return 1;
        }
@@ -40,9 +31,7 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
                if (!IS_SYNCED(dirty_it->second.state) ||
                    IS_STABLE(dirty_it->second.state))
                {
-                    op->retval = -EBUSY;
-                    FINISH_OP(op);
-                    return 1;
+                    goto bad_op;
                }
                if (dirty_it == dirty_db.begin())
                {
@@ -71,12 +60,39 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
        journal.sector_info[journal.cur_sector].dirty)
    {
        if (cur_sector == -1)
-            PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
+            PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
        cur_sector = journal.cur_sector;
        prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
    }
    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
    {
+        // FIXME: This is here only for the purpose of tracking unstable_writes. Remove it if not required.
+        // FIXME: This is also similar to the code in blockstore_init.cpp - maybe deduplicate it?
+        auto dirty_it = dirty_db.lower_bound((obj_ver_id){
+            .oid = v->oid,
+            .version = UINT64_MAX,
+        });
+        uint64_t max_unstable = 0;
+        while (dirty_it != dirty_db.begin())
+        {
+            dirty_it--;
+            if (dirty_it->first.oid != v->oid)
+                break;
+            else if (dirty_it->first.version <= v->version)
+            {
+                if (!IS_STABLE(dirty_it->second.state))
+                    max_unstable = dirty_it->first.version;
+                break;
+            }
+        }
+        auto unstab_it = unstable_writes.find(v->oid);
+        if (unstab_it != unstable_writes.end())
+        {
+            if (max_unstable == 0)
+                unstable_writes.erase(unstab_it);
+            else
+                unstab_it->second = max_unstable;
+        }
        journal_entry_rollback *je = (journal_entry_rollback*)
            prefill_single_journal_entry(journal, JE_ROLLBACK, sizeof(journal_entry_rollback));
        journal.sector_info[journal.cur_sector].dirty = false;
@@ -87,117 +103,21 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
        if (cur_sector != journal.cur_sector)
        {
            if (cur_sector == -1)
-                PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
+                PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
            cur_sector = journal.cur_sector;
            prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
        }
    }
-    PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
+    PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
    PRIV(op)->pending_ops = s;
-    PRIV(op)->op_state = 1;
-    inflight_writes++;
    return 1;
 }

-int blockstore_impl_t::continue_rollback(blockstore_op_t *op)
-{
-    if (PRIV(op)->op_state == 2)
-        goto resume_2;
-    else if (PRIV(op)->op_state == 3)
-        goto resume_3;
-    else if (PRIV(op)->op_state == 5)
-        goto resume_5;
-    else
-        return 1;
-resume_2:
-    // Release used journal sectors
-    release_journal_sectors(op);
-resume_3:
-    if (!disable_journal_fsync)
-    {
-        io_uring_sqe *sqe = get_sqe();
-        if (!sqe)
-        {
-            return 0;
-        }
-        ring_data_t *data = ((ring_data_t*)sqe->user_data);
-        my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
-        data->iov = { 0 };
-        data->callback = [this, op](ring_data_t *data) { handle_rollback_event(data, op); };
-        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
-        PRIV(op)->pending_ops = 1;
-        PRIV(op)->op_state = 4;
-        return 1;
-    }
-resume_5:
-    obj_ver_id* v;
-    int i;
-    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
-    {
-        mark_rolled_back(*v);
-    }
-    journal.trim();
-    inflight_writes--;
-    // Acknowledge op
-    op->retval = 0;
-    FINISH_OP(op);
-    return 1;
-}
-
-void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
-{
-    auto it = dirty_db.lower_bound((obj_ver_id){
-        .oid = ov.oid,
-        .version = UINT64_MAX,
-    });
-    if (it != dirty_db.begin())
-    {
-        uint64_t max_unstable = 0;
-        auto rm_start = it;
-        auto rm_end = it;
-        it--;
-        while (it->first.oid == ov.oid &&
-            it->first.version > ov.version &&
-            !IS_IN_FLIGHT(it->second.state) &&
-            !IS_STABLE(it->second.state))
-        {
-            if (it->first.oid != ov.oid)
-                break;
-            else if (it->first.version <= ov.version)
-            {
-                if (!IS_STABLE(it->second.state))
-                    max_unstable = it->first.version;
-                break;
-            }
-            else if (IS_STABLE(it->second.state))
-                break;
-            // Remove entry
-            rm_start = it;
-            if (it == dirty_db.begin())
-                break;
-            it--;
-        }
-        if (rm_start != rm_end)
-        {
-            erase_dirty(rm_start, rm_end, UINT64_MAX);
-        }
-        auto unstab_it = unstable_writes.find(ov.oid);
-        if (unstab_it != unstable_writes.end())
-        {
-            if (max_unstable == 0)
-                unstable_writes.erase(unstab_it);
-            else
-                unstab_it->second = max_unstable;
-        }
-    }
-}
-
 void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t *op)
 {
    live = true;
    if (data->res != data->iov.iov_len)
    {
-        inflight_writes--;
        throw std::runtime_error(
            "write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
            "). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
@@ -206,11 +126,37 @@ void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t
    PRIV(op)->pending_ops--;
    if (PRIV(op)->pending_ops == 0)
    {
-        PRIV(op)->op_state++;
-        if (!continue_rollback(op))
+        // Release used journal sectors
+        release_journal_sectors(op);
+        obj_ver_id* v;
+        int i;
+        for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
        {
-            submit_queue.push_front(op);
+            // Erase dirty_db entries
+            auto rm_end = dirty_db.lower_bound((obj_ver_id){
+                .oid = v->oid,
+                .version = UINT64_MAX,
+            });
+            rm_end--;
+            auto rm_start = rm_end;
+            while (1)
+            {
+                if (rm_end->first.oid != v->oid)
+                    break;
+                else if (rm_end->first.version <= v->version)
+                    break;
+                rm_start = rm_end;
+                if (rm_end == dirty_db.begin())
+                    break;
+                rm_end--;
+            }
+            if (rm_end != rm_start)
+                erase_dirty(rm_start, rm_end, UINT64_MAX);
        }
+        journal.trim();
+        // Acknowledge op
+        op->retval = 0;
+        FINISH_OP(op);
    }
 }

@@ -227,13 +173,11 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
 #endif
            data_alloc->set(dirty_it->second.location >> block_order, false);
        }
-        int used = --journal.used_sectors[dirty_it->second.journal_sector];
 #ifdef BLOCKSTORE_DEBUG
-        printf(
-            "remove usage of journal offset %08lx by %lu:%lu v%lu (%d refs)\n", dirty_it->second.journal_sector,
-            dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, used
-        );
+        printf("remove usage of journal offset %lu by %lu:%lu v%lu\n", dirty_it->second.journal_sector,
+            dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
 #endif
+        int used = --journal.used_sectors[dirty_it->second.journal_sector];
        if (used == 0)
        {
            journal.used_sectors.erase(dirty_it->second.journal_sector);
--- a/blockstore_stable.cpp
+++ b/blockstore_stable.cpp
@@ -40,10 +40,6 @@

 int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
 {
-    if (PRIV(op)->op_state)
-    {
-        return continue_stable(op);
-    }
    obj_ver_id* v;
    int i, todo = 0;
    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
@@ -55,7 +51,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
            if (clean_it == clean_db.end() || clean_it->second.version < v->version)
            {
                // No such object version
-                op->retval = -ENOENT;
+                op->retval = -EINVAL;
                FINISH_OP(op);
                return 1;
            }
@@ -67,7 +63,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
        else if (IS_UNSYNCED(dirty_it->second.state))
        {
            // Object not synced yet. Caller must sync it first
-            op->retval = -EBUSY;
+            op->retval = EAGAIN;
            FINISH_OP(op);
            return 1;
        }
@@ -102,13 +98,18 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
        journal.sector_info[journal.cur_sector].dirty)
    {
        if (cur_sector == -1)
-            PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
+            PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
        cur_sector = journal.cur_sector;
        prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
    }
    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
    {
-        // FIXME: Only stabilize versions that aren't stable yet
+        auto unstab_it = unstable_writes.find(v->oid);
+        if (unstab_it != unstable_writes.end() &&
+            unstab_it->second <= v->version)
+        {
+            unstable_writes.erase(unstab_it);
+        }
        journal_entry_stable *je = (journal_entry_stable*)
            prefill_single_journal_entry(journal, JE_STABLE, sizeof(journal_entry_stable));
        journal.sector_info[journal.cur_sector].dirty = false;
@@ -119,116 +120,21 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
        if (cur_sector != journal.cur_sector)
        {
            if (cur_sector == -1)
-                PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
+                PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
            cur_sector = journal.cur_sector;
            prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
        }
    }
-    PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
+    PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
    PRIV(op)->pending_ops = s;
-    PRIV(op)->op_state = 1;
-    inflight_writes++;
    return 1;
 }

-int blockstore_impl_t::continue_stable(blockstore_op_t *op)
-{
-    if (PRIV(op)->op_state == 2)
-        goto resume_2;
-    else if (PRIV(op)->op_state == 3)
-        goto resume_3;
-    else if (PRIV(op)->op_state == 5)
-        goto resume_5;
-    else
-        return 1;
-resume_2:
-    // Release used journal sectors
-    release_journal_sectors(op);
-resume_3:
-    if (!disable_journal_fsync)
-    {
-        io_uring_sqe *sqe = get_sqe();
-        if (!sqe)
-        {
-            return 0;
-        }
-        ring_data_t *data = ((ring_data_t*)sqe->user_data);
-        my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
-        data->iov = { 0 };
-        data->callback = [this, op](ring_data_t *data) { handle_stable_event(data, op); };
-        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
-        PRIV(op)->pending_ops = 1;
-        PRIV(op)->op_state = 4;
-        return 1;
-    }
-resume_5:
-    // Mark dirty_db entries as stable, acknowledge op completion
-    obj_ver_id* v;
-    int i;
-    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
-    {
-        // Mark all dirty_db entries up to op->version as stable
-        mark_stable(*v);
-    }
-    inflight_writes--;
-    // Acknowledge op
-    op->retval = 0;
-    FINISH_OP(op);
-    return 1;
-}
-
-void blockstore_impl_t::mark_stable(const obj_ver_id & v)
-{
-    auto dirty_it = dirty_db.find(v);
-    if (dirty_it != dirty_db.end())
-    {
-        while (1)
-        {
-            if (dirty_it->second.state == ST_J_SYNCED)
-            {
-                dirty_it->second.state = ST_J_STABLE;
-            }
-            else if (dirty_it->second.state == ST_D_SYNCED)
-            {
-                dirty_it->second.state = ST_D_STABLE;
-            }
-            else if (dirty_it->second.state == ST_DEL_SYNCED)
-            {
-                dirty_it->second.state = ST_DEL_STABLE;
-            }
-            else if (IS_STABLE(dirty_it->second.state))
-            {
-                break;
-            }
-            if (dirty_it == dirty_db.begin())
-            {
-                break;
-            }
-            dirty_it--;
-            if (dirty_it->first.oid != v.oid)
-            {
-                break;
-            }
-        }
-#ifdef BLOCKSTORE_DEBUG
-        printf("enqueue_flush %lu:%lu v%lu\n", v.oid.inode, v.oid.stripe, v.version);
-#endif
-        flusher->enqueue_flush(v);
-    }
-    auto unstab_it = unstable_writes.find(v.oid);
-    if (unstab_it != unstable_writes.end() &&
-        unstab_it->second <= v.version)
-    {
-        unstable_writes.erase(unstab_it);
-    }
-}
-
 void blockstore_impl_t::handle_stable_event(ring_data_t *data, blockstore_op_t *op)
 {
    live = true;
    if (data->res != data->iov.iov_len)
    {
-        inflight_writes--;
        throw std::runtime_error(
            "write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
            "). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
@@ -237,10 +143,53 @@ void blockstore_impl_t::handle_stable_event(ring_data_t *data, blockstore_op_t *
    PRIV(op)->pending_ops--;
    if (PRIV(op)->pending_ops == 0)
    {
-        PRIV(op)->op_state++;
-        if (!continue_stable(op))
+        // Release used journal sectors
+        release_journal_sectors(op);
+        // Mark dirty_db entries as stable, acknowledge op completion
+        obj_ver_id* v;
+        int i;
+        for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
        {
-            submit_queue.push_front(op);
+            // Mark all dirty_db entries up to op->version as stable
+            auto dirty_it = dirty_db.find(*v);
+            if (dirty_it != dirty_db.end())
+            {
+                while (1)
+                {
+                    if (dirty_it->second.state == ST_J_SYNCED)
+                    {
+                        dirty_it->second.state = ST_J_STABLE;
+                    }
+                    else if (dirty_it->second.state == ST_D_META_SYNCED)
+                    {
+                        dirty_it->second.state = ST_D_STABLE;
+                    }
+                    else if (dirty_it->second.state == ST_DEL_SYNCED)
+                    {
+                        dirty_it->second.state = ST_DEL_STABLE;
+                    }
+                    else if (IS_STABLE(dirty_it->second.state))
+                    {
+                        break;
+                    }
+                    if (dirty_it == dirty_db.begin())
+                    {
+                        break;
+                    }
+                    dirty_it--;
+                    if (dirty_it->first.oid != v->oid)
+                    {
+                        break;
+                    }
+                }
+#ifdef BLOCKSTORE_DEBUG
+                printf("enqueue_flush %lu:%lu v%lu\n", v->oid.inode, v->oid.stripe, v->version);
+#endif
+                flusher->enqueue_flush(*v);
+            }
        }
+        // Acknowledge op
+        op->retval = 0;
+        FINISH_OP(op);
    }
 }
--- a/blockstore_sync.cpp
+++ b/blockstore_sync.cpp
@@ -11,7 +11,7 @@

 int blockstore_impl_t::dequeue_sync(blockstore_op_t *op)
 {
-    if (PRIV(op)->op_state == 0)
+    if (PRIV(op)->sync_state == 0)
    {
        stop_sync_submitted = false;
        PRIV(op)->sync_big_writes.swap(unsynced_big_writes);
@@ -21,11 +21,11 @@ int blockstore_impl_t::dequeue_sync(blockstore_op_t *op)
        unsynced_big_writes.clear();
        unsynced_small_writes.clear();
        if (PRIV(op)->sync_big_writes.size() > 0)
-            PRIV(op)->op_state = SYNC_HAS_BIG;
+            PRIV(op)->sync_state = SYNC_HAS_BIG;
        else if (PRIV(op)->sync_small_writes.size() > 0)
-            PRIV(op)->op_state = SYNC_HAS_SMALL;
+            PRIV(op)->sync_state = SYNC_HAS_SMALL;
        else
-            PRIV(op)->op_state = SYNC_DONE;
+            PRIV(op)->sync_state = SYNC_DONE;
        // Always add sync to in_progress_syncs because we clear unsynced_big_writes and unsynced_small_writes
        PRIV(op)->prev_sync_count = in_progress_syncs.size();
        PRIV(op)->in_progress_ptr = in_progress_syncs.insert(in_progress_syncs.end(), op);
@@ -38,7 +38,7 @@ int blockstore_impl_t::dequeue_sync(blockstore_op_t *op)
 int blockstore_impl_t::continue_sync(blockstore_op_t *op)
 {
    auto cb = [this, op](ring_data_t *data) { handle_sync_event(data, op); };
-    if (PRIV(op)->op_state == SYNC_HAS_SMALL)
+    if (PRIV(op)->sync_state == SYNC_HAS_SMALL)
    {
        // No big writes, just fsync the journal
        for (; PRIV(op)->sync_small_checked < PRIV(op)->sync_small_writes.size(); PRIV(op)->sync_small_checked++)
@@ -54,17 +54,17 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
            // Write out the last journal sector if it happens to be dirty
            BS_SUBMIT_GET_ONLY_SQE(sqe);
            prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
-            PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
+            PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
            PRIV(op)->pending_ops = 1;
-            PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT;
+            PRIV(op)->sync_state = SYNC_JOURNAL_WRITE_SENT;
            return 1;
        }
        else
        {
-            PRIV(op)->op_state = SYNC_JOURNAL_WRITE_DONE;
+            PRIV(op)->sync_state = SYNC_JOURNAL_WRITE_DONE;
        }
    }
-    if (PRIV(op)->op_state == SYNC_HAS_BIG)
+    if (PRIV(op)->sync_state == SYNC_HAS_BIG)
    {
        for (; PRIV(op)->sync_big_checked < PRIV(op)->sync_big_writes.size(); PRIV(op)->sync_big_checked++)
        {
@@ -81,17 +81,17 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
            my_uring_prep_fsync(sqe, data_fd, IORING_FSYNC_DATASYNC);
            data->iov = { 0 };
            data->callback = cb;
-            PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
+            PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
            PRIV(op)->pending_ops = 1;
-            PRIV(op)->op_state = SYNC_DATA_SYNC_SENT;
+            PRIV(op)->sync_state = SYNC_DATA_SYNC_SENT;
            return 1;
        }
        else
        {
-            PRIV(op)->op_state = SYNC_DATA_SYNC_DONE;
+            PRIV(op)->sync_state = SYNC_DATA_SYNC_DONE;
        }
    }
-    if (PRIV(op)->op_state == SYNC_DATA_SYNC_DONE)
+    if (PRIV(op)->sync_state == SYNC_DATA_SYNC_DONE)
    {
        for (; PRIV(op)->sync_small_checked < PRIV(op)->sync_small_writes.size(); PRIV(op)->sync_small_checked++)
        {
@@ -121,7 +121,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
            journal.sector_info[journal.cur_sector].dirty)
        {
            if (cur_sector == -1)
-                PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
+                PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
            cur_sector = journal.cur_sector;
            prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
        }
@@ -133,11 +133,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
            journal.sector_info[journal.cur_sector].dirty = false;
            journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
 #ifdef BLOCKSTORE_DEBUG
-            printf(
-                "journal offset %08lx is used by %lu:%lu v%lu (%lu refs)\n",
-                dirty_db[*it].journal_sector, it->oid.inode, it->oid.stripe, it->version,
-                journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
-            );
+            printf("journal offset %lu is used by %lu:%lu v%lu\n", dirty_db[*it].journal_sector, it->oid.inode, it->oid.stripe, it->version);
 #endif
            je->oid = it->oid;
            je->version = it->version;
@@ -150,17 +146,17 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
            if (cur_sector != journal.cur_sector)
            {
                if (cur_sector == -1)
-                    PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
+                    PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
                cur_sector = journal.cur_sector;
                prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
            }
        }
-        PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
+        PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
        PRIV(op)->pending_ops = s;
-        PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT;
+        PRIV(op)->sync_state = SYNC_JOURNAL_WRITE_SENT;
        return 1;
    }
-    if (PRIV(op)->op_state == SYNC_JOURNAL_WRITE_DONE)
+    if (PRIV(op)->sync_state == SYNC_JOURNAL_WRITE_DONE)
    {
        if (!disable_journal_fsync)
        {
@@ -169,17 +165,17 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
            data->iov = { 0 };
            data->callback = cb;
            PRIV(op)->pending_ops = 1;
-            PRIV(op)->op_state = SYNC_JOURNAL_SYNC_SENT;
+            PRIV(op)->sync_state = SYNC_JOURNAL_SYNC_SENT;
            return 1;
        }
        else
        {
-            PRIV(op)->op_state = SYNC_DONE;
+            PRIV(op)->sync_state = SYNC_DONE;
        }
    }
-    if (PRIV(op)->op_state == SYNC_DONE)
+    if (PRIV(op)->sync_state == SYNC_DONE)
    {
-        return ack_sync(op);
+        ack_sync(op);
    }
    return 1;
 }
@@ -200,17 +196,17 @@ void blockstore_impl_t::handle_sync_event(ring_data_t *data, blockstore_op_t *op
        // Release used journal sectors
        release_journal_sectors(op);
        // Handle states
-        if (PRIV(op)->op_state == SYNC_DATA_SYNC_SENT)
+        if (PRIV(op)->sync_state == SYNC_DATA_SYNC_SENT)
        {
-            PRIV(op)->op_state = SYNC_DATA_SYNC_DONE;
+            PRIV(op)->sync_state = SYNC_DATA_SYNC_DONE;
        }
-        else if (PRIV(op)->op_state == SYNC_JOURNAL_WRITE_SENT)
+        else if (PRIV(op)->sync_state == SYNC_JOURNAL_WRITE_SENT)
        {
-            PRIV(op)->op_state = SYNC_JOURNAL_WRITE_DONE;
+            PRIV(op)->sync_state = SYNC_JOURNAL_WRITE_DONE;
        }
-        else if (PRIV(op)->op_state == SYNC_JOURNAL_SYNC_SENT)
+        else if (PRIV(op)->sync_state == SYNC_JOURNAL_SYNC_SENT)
        {
-            PRIV(op)->op_state = SYNC_DONE;
+            PRIV(op)->sync_state = SYNC_DONE;
            ack_sync(op);
        }
        else
@@ -222,7 +218,7 @@ void blockstore_impl_t::handle_sync_event(ring_data_t *data, blockstore_op_t *op

 int blockstore_impl_t::ack_sync(blockstore_op_t *op)
 {
-    if (PRIV(op)->op_state == SYNC_DONE && PRIV(op)->prev_sync_count == 0)
+    if (PRIV(op)->sync_state == SYNC_DONE && PRIV(op)->prev_sync_count == 0)
    {
        // Remove dependency of subsequent syncs
        auto it = PRIV(op)->in_progress_ptr;
@@ -234,14 +230,14 @@ int blockstore_impl_t::ack_sync(blockstore_op_t *op)
        {
            auto & next_sync = *it++;
            PRIV(next_sync)->prev_sync_count -= done_syncs;
-            if (PRIV(next_sync)->prev_sync_count == 0 && PRIV(next_sync)->op_state == SYNC_DONE)
+            if (PRIV(next_sync)->prev_sync_count == 0 && PRIV(next_sync)->sync_state == SYNC_DONE)
            {
                done_syncs++;
                // Acknowledge next_sync
                ack_one_sync(next_sync);
            }
        }
-        return 2;
+        return 1;
    }
    return 0;
 }
@@ -256,17 +252,7 @@ void blockstore_impl_t::ack_one_sync(blockstore_op_t *op)
 #endif
        auto & unstab = unstable_writes[it->oid];
        unstab = unstab < it->version ? it->version : unstab;
-        auto dirty_it = dirty_db.find(*it);
-        dirty_it->second.state = ST_D_SYNCED;
-        dirty_it++;
-        while (dirty_it != dirty_db.end() && dirty_it->first.oid == it->oid)
-        {
-            if (dirty_it->second.state == ST_J_WAIT_BIG)
-            {
-                dirty_it->second.state = ST_J_IN_FLIGHT;
-            }
-            dirty_it++;
-        }
+        dirty_db[*it].state = ST_D_META_SYNCED;
    }
    for (auto it = PRIV(op)->sync_small_writes.begin(); it != PRIV(op)->sync_small_writes.end(); it++)
    {
@@ -275,16 +261,7 @@ void blockstore_impl_t::ack_one_sync(blockstore_op_t *op)
 #endif
        auto & unstab = unstable_writes[it->oid];
        unstab = unstab < it->version ? it->version : unstab;
-        if (dirty_db[*it].state == ST_DEL_WRITTEN)
-        {
-            dirty_db[*it].state = ST_DEL_SYNCED;
-            // Deletions are treated as immediately stable
-            mark_stable(*it);
-        }
-        else /* == ST_J_WRITTEN */
-        {
-            dirty_db[*it].state = ST_J_SYNCED;
-        }
+        dirty_db[*it].state = dirty_db[*it].state == ST_DEL_WRITTEN ? ST_DEL_SYNCED : ST_J_SYNCED;
    }
    in_progress_syncs.erase(PRIV(op)->in_progress_ptr);
    op->retval = 0;
--- a/blockstore_write.cpp
+++ b/blockstore_write.cpp
@@ -4,7 +4,6 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
 {
    // Check or assign version number
    bool found = false, deleted = false, is_del = (op->opcode == BS_OP_DELETE);
-    bool is_inflight_big = false;
    uint64_t version = 1;
    if (dirty_db.size() > 0)
    {
@@ -18,9 +17,6 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
            found = true;
            version = dirty_it->first.version + 1;
            deleted = IS_DELETE(dirty_it->second.state);
-            is_inflight_big = dirty_it->second.state >= ST_D_IN_FLIGHT &&
-                dirty_it->second.state < ST_D_SYNCED ||
-                dirty_it->second.state == ST_J_WAIT_BIG;
        }
    }
    if (!found)
@@ -42,7 +38,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
    else if (op->version < version)
    {
        // Invalid version requested
-        op->retval = -EEXIST;
+        op->retval = -EINVAL;
        return false;
    }
    if (deleted && is_del)
@@ -51,26 +47,10 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
        op->retval = 0;
        return false;
    }
-    if (is_inflight_big && !is_del && !deleted && op->len < block_size &&
-        immediate_commit != IMMEDIATE_ALL)
-    {
-        // Issue an additional sync so that the previous big write can reach the journal
-        blockstore_op_t *sync_op = new blockstore_op_t;
-        sync_op->opcode = BS_OP_SYNC;
-        sync_op->callback = [this, op](blockstore_op_t *sync_op)
-        {
-            delete sync_op;
-        };
-        enqueue_op(sync_op);
-    }
+    // Immediately add the operation into dirty_db, so subsequent reads could see it
 #ifdef BLOCKSTORE_DEBUG
-    if (is_del)
-        printf("Delete %lu:%lu v%lu\n", op->oid.inode, op->oid.stripe, op->version);
-    else
-        printf("Write %lu:%lu v%lu offset=%u len=%u\n", op->oid.inode, op->oid.stripe, op->version, op->offset, op->len);
+    printf("%s %lu:%lu v%lu\n", is_del ? "Delete" : "Write", op->oid.inode, op->oid.stripe, op->version);
 #endif
-    // No strict need to add it into dirty_db here, it's just left
-    // from the previous implementation where reads waited for writes
    dirty_db.emplace((obj_ver_id){
        .oid = op->oid,
        .version = op->version,
@@ -78,7 +58,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
        .state = (uint32_t)(
            is_del
                ? ST_DEL_IN_FLIGHT
-                : (op->len == block_size || deleted ? ST_D_IN_FLIGHT : (is_inflight_big ? ST_J_WAIT_BIG : ST_J_IN_FLIGHT))
+                : (op->len == block_size || deleted ? ST_D_IN_FLIGHT : ST_J_IN_FLIGHT)
        ),
        .flags = 0,
        .location = 0,
@@ -92,20 +72,11 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
 // First step of the write algorithm: dequeue operation and submit initial write(s)
 int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
 {
-    if (PRIV(op)->op_state)
-    {
-        return continue_write(op);
-    }
    auto dirty_it = dirty_db.find((obj_ver_id){
        .oid = op->oid,
        .version = op->version,
    });
-    assert(dirty_it != dirty_db.end());
-    if (dirty_it->second.state == ST_J_WAIT_BIG)
-    {
-        return 0;
-    }
-    else if (dirty_it->second.state == ST_D_IN_FLIGHT)
+    if (dirty_it->second.state == ST_D_IN_FLIGHT)
    {
        blockstore_journal_check_t space_check(this);
        if (!space_check.check_available(op, unsynced_big_writes.size() + 1, sizeof(journal_entry_big_write), JOURNAL_STABILIZE_RESERVATION))
@@ -154,20 +125,12 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
            sqe, data_fd, PRIV(op)->iov_zerofill, vcnt, data_offset + (loc << block_order) + op->offset - stripe_offset
        );
        PRIV(op)->pending_ops = 1;
-        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
-        if (immediate_commit != IMMEDIATE_ALL)
-        {
-            // Remember big write as unsynced
-            unsynced_big_writes.push_back((obj_ver_id){
-                .oid = op->oid,
-                .version = op->version,
-            });
-            PRIV(op)->op_state = 3;
-        }
-        else
-        {
-            PRIV(op)->op_state = 1;
-        }
+        PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
+        // Remember big write as unsynced
+        unsynced_big_writes.push_back((obj_ver_id){
+            .oid = op->oid,
+            .version = op->version,
+        });
    }
    else
    {
@@ -181,11 +144,10 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        }
        // There is sufficient space. Get SQE(s)
        struct io_uring_sqe *sqe1 = NULL;
-        if (immediate_commit != IMMEDIATE_NONE ||
-            (journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_small_write) &&
+        if ((journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_small_write) &&
            journal.sector_info[journal.cur_sector].dirty)
        {
-            // Write current journal sector only if it's dirty and full, or in the immediate_commit mode
+            // Write current journal sector only if it's dirty and full
            BS_SUBMIT_GET_SQE_DECL(sqe1);
        }
        struct io_uring_sqe *sqe2 = NULL;
@@ -195,18 +157,16 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        }
        // Got SQEs. Prepare previous journal sector write if required
        auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
-        if (immediate_commit == IMMEDIATE_NONE)
+        if (sqe1)
        {
-            if (sqe1)
-            {
-                prepare_journal_sector_write(journal, journal.cur_sector, sqe1, cb);
-                PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
-                PRIV(op)->pending_ops++;
-            }
-            else
-            {
-                PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
-            }
+            prepare_journal_sector_write(journal, journal.cur_sector, sqe1, cb);
+            // FIXME rename to min/max _flushing
+            PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
+            PRIV(op)->pending_ops++;
+        }
+        else
+        {
+            PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
        }
        // Then pre-fill journal entry
        journal_entry_small_write *je = (journal_entry_small_write*)
@@ -214,11 +174,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
        journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
 #ifdef BLOCKSTORE_DEBUG
-        printf(
-            "journal offset %08lx is used by %lu:%lu v%lu (%lu refs)\n",
-            dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
-            journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
-        );
+        printf("journal offset %lu is used by %lu:%lu v%lu\n", dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
 #endif
        // Figure out where data will be
        journal.next_free = (journal.next_free + op->len) <= journal.len ? journal.next_free : journal_block_size;
@@ -230,12 +186,6 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        je->crc32_data = crc32c(0, op->buf, op->len);
        je->crc32 = je_crc32((journal_entry*)je);
        journal.crc32_last = je->crc32;
-        if (immediate_commit != IMMEDIATE_NONE)
-        {
-            prepare_journal_sector_write(journal, journal.cur_sector, sqe1, cb);
-            PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
-            PRIV(op)->pending_ops++;
-        }
        if (op->len > 0)
        {
            // Prepare journal data write
@@ -263,120 +213,16 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        {
            journal.next_free = journal_block_size;
        }
-        if (immediate_commit == IMMEDIATE_NONE)
-        {
-            // Remember small write as unsynced
-            unsynced_small_writes.push_back((obj_ver_id){
-                .oid = op->oid,
-                .version = op->version,
-            });
-        }
+        // Remember small write as unsynced
+        unsynced_small_writes.push_back((obj_ver_id){
+            .oid = op->oid,
+            .version = op->version,
+        });
        if (!PRIV(op)->pending_ops)
        {
-            PRIV(op)->op_state = 4;
-            continue_write(op);
-        }
-        else
-        {
-            PRIV(op)->op_state = 3;
+            ack_write(op);
        }
    }
-    inflight_writes++;
-    return 1;
-}
-
-int blockstore_impl_t::continue_write(blockstore_op_t *op)
-{
-    io_uring_sqe *sqe = NULL;
-    journal_entry_big_write *je;
-    auto dirty_it = dirty_db.find((obj_ver_id){
-        .oid = op->oid,
-        .version = op->version,
-    });
-    assert(dirty_it != dirty_db.end());
-    if (PRIV(op)->op_state == 2)
-        goto resume_2;
-    else if (PRIV(op)->op_state == 4)
-        goto resume_4;
-    else
-        return 1;
-resume_2:
-    // Only for the immediate_commit mode: prepare and submit big_write journal entry
-    sqe = get_sqe();
-    if (!sqe)
-    {
-        return 0;
-    }
-    je = (journal_entry_big_write*)prefill_single_journal_entry(journal, JE_BIG_WRITE, sizeof(journal_entry_big_write));
-    dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
-    journal.sector_info[journal.cur_sector].dirty = false;
-    journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
-#ifdef BLOCKSTORE_DEBUG
-    printf(
-        "journal offset %08lx is used by %lu:%lu v%lu (%lu refs)\n",
-        journal.sector_info[journal.cur_sector].offset, op->oid.inode, op->oid.stripe, op->version,
-        journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
-    );
-#endif
-    je->oid = op->oid;
-    je->version = op->version;
-    je->offset = op->offset;
-    je->len = op->len;
-    je->location = dirty_it->second.location;
-    je->crc32 = je_crc32((journal_entry*)je);
-    journal.crc32_last = je->crc32;
-    prepare_journal_sector_write(journal, journal.cur_sector, sqe,
-        [this, op](ring_data_t *data) { handle_write_event(data, op); });
-    PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
-    PRIV(op)->pending_ops = 1;
-    PRIV(op)->op_state = 3;
-    return 1;
-resume_4:
-    // Switch object state
-#ifdef BLOCKSTORE_DEBUG
-    printf("Ack write %lu:%lu v%lu = %d\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
-#endif
-    bool imm = dirty_it->second.state == ST_D_SUBMITTED
-        ? (immediate_commit == IMMEDIATE_ALL)
-        : (immediate_commit != IMMEDIATE_NONE);
-    if (imm)
-    {
-        auto & unstab = unstable_writes[op->oid];
-        unstab = unstab < op->version ? op->version : unstab;
-    }
-    if (dirty_it->second.state == ST_J_SUBMITTED)
-    {
-        dirty_it->second.state = imm ? ST_J_SYNCED : ST_J_WRITTEN;
-    }
-    else if (dirty_it->second.state == ST_D_SUBMITTED)
-    {
-        dirty_it->second.state = imm ? ST_D_SYNCED : ST_D_WRITTEN;
-    }
-    else if (dirty_it->second.state == ST_DEL_SUBMITTED)
-    {
-        dirty_it->second.state = imm ? ST_DEL_SYNCED : ST_DEL_WRITTEN;
-        if (imm)
-        {
-            // Deletions are treated as immediately stable
-            mark_stable(dirty_it->first);
-        }
-    }
-    if (immediate_commit == IMMEDIATE_ALL)
-    {
-        dirty_it++;
-        while (dirty_it != dirty_db.end() && dirty_it->first.oid == op->oid)
-        {
-            if (dirty_it->second.state == ST_J_WAIT_BIG)
-            {
-                dirty_it->second.state = ST_J_IN_FLIGHT;
-            }
-            dirty_it++;
-        }
-    }
-    inflight_writes--;
-    // Acknowledge write
-    op->retval = op->len;
-    FINISH_OP(op);
    return 1;
 }

@@ -385,7 +231,6 @@ void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *o
    live = true;
    if (data->res != data->iov.iov_len)
    {
-        inflight_writes--;
        // FIXME: our state becomes corrupted after a write error. maybe do something better than just die
        throw std::runtime_error(
            "write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
@@ -396,117 +241,88 @@ void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *o
    if (PRIV(op)->pending_ops == 0)
    {
        release_journal_sectors(op);
-        PRIV(op)->op_state++;
-        if (!continue_write(op))
-        {
-            submit_queue.push_front(op);
-        }
+        ack_write(op);
    }
 }

 void blockstore_impl_t::release_journal_sectors(blockstore_op_t *op)
 {
-    // Release flushed journal sectors
-    if (PRIV(op)->min_flushed_journal_sector > 0 &&
-        PRIV(op)->max_flushed_journal_sector > 0)
+    // Release used journal sectors: drop one usage_count reference on every sector in the op's inclusive [min_used, max_used] range, wrapping from sector_count back to 1 (sector numbers are 1-based, 0 means "nothing to release")
+    if (PRIV(op)->min_used_journal_sector > 0 &&
+        PRIV(op)->max_used_journal_sector > 0)
    {
-        uint64_t s = PRIV(op)->min_flushed_journal_sector;
+        uint64_t s = PRIV(op)->min_used_journal_sector; // 1-based sector number; sector_info is indexed with s-1
        while (1)
        {
            journal.sector_info[s-1].usage_count--;
-            if (s != (1+journal.cur_sector) && journal.sector_info[s-1].usage_count == 0)
-            {
-                // We know for sure that we won't write into this sector anymore
-                uint64_t new_ds = journal.sector_info[s-1].offset + journal.block_size;
-                if (new_ds >= journal.len)
-                {
-                    new_ds = journal.block_size;
-                }
-                if ((journal.dirty_start + (journal.dirty_start >= journal.used_start ? 0 : journal.len)) <
-                    (new_ds + (new_ds >= journal.used_start ? 0 : journal.len)))
-                {
-                    journal.dirty_start = new_ds;
-                }
-            }
-            if (s == PRIV(op)->max_flushed_journal_sector)
+            if (s == PRIV(op)->max_used_journal_sector) // the range is inclusive: stop only after the last sector has been released
                break;
            s = 1 + s % journal.sector_count;
        }
-        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
+        PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0; // reset to 0 so a repeated call for the same op releases nothing
    }
 }

+void blockstore_impl_t::ack_write(blockstore_op_t *op)
+{
+    // Called from handle_write_event() once the op has no pending operations left: switch object state from *_SUBMITTED to *_WRITTEN
+    auto & dirty_entry = dirty_db[(obj_ver_id){ // NOTE(review): presumably the (oid, version) entry always exists here; if dirty_db is a std::map-like container, operator[] would silently insert a default entry for a missing key - confirm
+        .oid = op->oid,
+        .version = op->version,
+    }];
+#ifdef BLOCKSTORE_DEBUG
+    printf("Ack write %lu:%lu v%lu = %d\n", op->oid.inode, op->oid.stripe, op->version, dirty_entry.state);
+#endif
+    if (dirty_entry.state == ST_J_SUBMITTED)
+    {
+        dirty_entry.state = ST_J_WRITTEN;
+    }
+    else if (dirty_entry.state == ST_D_SUBMITTED)
+    {
+        dirty_entry.state = ST_D_WRITTEN;
+    }
+    else if (dirty_entry.state == ST_DEL_SUBMITTED)
+    {
+        dirty_entry.state = ST_DEL_WRITTEN;
+    } // any other state is left unchanged
+    // Acknowledge write without sync: the op reports its full length as the result while the entry is only in a *_WRITTEN state
+    op->retval = op->len;
+    FINISH_OP(op);
+}
+
 int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
 {
    auto dirty_it = dirty_db.find((obj_ver_id){
        .oid = op->oid,
        .version = op->version,
    });
-    assert(dirty_it != dirty_db.end());
    blockstore_journal_check_t space_check(this);
    if (!space_check.check_available(op, 1, sizeof(journal_entry_del), 0))
    {
        return 0;
    }
-    io_uring_sqe *sqe = NULL;
-    if (immediate_commit != IMMEDIATE_NONE ||
-        (journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del) &&
-        journal.sector_info[journal.cur_sector].dirty)
-    {
-        // Write current journal sector only if it's dirty and full, or in the immediate_commit mode
-        BS_SUBMIT_GET_SQE_DECL(sqe);
-    }
-    auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
+    BS_SUBMIT_GET_ONLY_SQE(sqe); // declares sqe (it is used below without any other declaration); NOTE(review): presumably also returns early when no SQE is available - confirm against the macro definition
    // Prepare journal sector write
-    if (immediate_commit == IMMEDIATE_NONE)
-    {
-        if (sqe)
-        {
-            prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
-            PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
-            PRIV(op)->pending_ops++;
-        }
-        else
-        {
-            PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
-        }
-    }
-    // Pre-fill journal entry
    journal_entry_del *je = (journal_entry_del*)
        prefill_single_journal_entry(journal, JE_DELETE, sizeof(struct journal_entry_del));
    dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
    journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
 #ifdef BLOCKSTORE_DEBUG
-    printf(
-        "journal offset %08lx is used by %lu:%lu v%lu (%lu refs)\n",
-        dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
-        journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
-    );
+    printf("journal offset %lu is used by %lu:%lu v%lu\n", dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
 #endif
    je->oid = op->oid;
    je->version = op->version;
    je->crc32 = je_crc32((journal_entry*)je);
    journal.crc32_last = je->crc32;
+    auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); }; // completion callback: hands the ring event for this op to handle_write_event()
+    prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb); // the delete always submits a write of the current journal sector
+    PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector; // stored 1-based: release_journal_sectors() treats 0 as "none"
+    PRIV(op)->pending_ops = 1; // exactly one journal sector write is outstanding for this op
    dirty_it->second.state = ST_DEL_SUBMITTED;
-    if (immediate_commit != IMMEDIATE_NONE)
-    {
-        prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
-        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
-        PRIV(op)->pending_ops++;
-        // Remember small write as unsynced
-        unsynced_small_writes.push_back((obj_ver_id){
-            .oid = op->oid,
-            .version = op->version,
-        });
-    }
-    if (!PRIV(op)->pending_ops)
-    {
-        PRIV(op)->op_state = 4;
-        continue_write(op);
-    }
-    else
-    {
-        PRIV(op)->op_state = 3;
-    }
+    // Remember the delete as unsynced: its (oid, version) is appended to unsynced_small_writes
+    unsynced_small_writes.push_back((obj_ver_id){
+        .oid = op->oid,
+        .version = op->version,
+    });
    return 1;
 }
--- a/cluster_client.cpp
+++ b/cluster_client.cpp
@@ -1,349 +0,0 @@
-#include "cluster_client.h"
-
-cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
-{
-    this->ringloop = ringloop;
-    this->tfd = tfd;
-
-    msgr.tfd = tfd;
-    msgr.ringloop = ringloop;
-    msgr.repeer_pgs = [this](osd_num_t peer_osd)
-    {
-        // peer_osd just connected or dropped connection
-        if (msgr.osd_peer_fds.find(peer_osd) != msgr.osd_peer_fds.end())
-        {
-            // really connected :)
-            continue_ops();
-        }
-    };
-
-    st_cli.tfd = tfd;
-    st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
-    st_cli.on_change_osd_state_hook = [this](uint64_t peer_osd) { on_change_osd_state_hook(peer_osd); };
-    st_cli.on_change_hook = [this](json11::Json::object & changes) { on_change_hook(changes); };
-    st_cli.on_load_pgs_hook = [this](bool success) { on_load_pgs_hook(success); };
-
-    log_level = config["log_level"].int64_value();
-    st_cli.parse_config(config);
-    st_cli.load_global_config();
-}
-
-void cluster_client_t::continue_ops()
-{
-    for (auto op_it = unsent_ops.begin(); op_it != unsent_ops.end(); )
-    {
-        cluster_op_t *op = *op_it;
-        if (op->needs_reslice && !op->sent_count)
-        {
-            op->parts.clear();
-            op->done_count = 0;
-            op->needs_reslice = false;
-        }
-        if (!op->parts.size())
-        {
-            unsent_ops.erase(op_it++);
-            execute(op);
-            continue;
-        }
-        if (!op->needs_reslice)
-        {
-            for (auto & op_part: op->parts)
-            {
-                if (!op_part.sent && !op_part.done)
-                {
-                    try_send(op, &op_part);
-                }
-            }
-            if (op->sent_count == op->parts.size() - op->done_count)
-            {
-                unsent_ops.erase(op_it++);
-                sent_ops.insert(op);
-            }
-            else
-                op_it++;
-        }
-        else
-            op_it++;
-    }
-}
-
-static uint32_t is_power_of_two(uint64_t value)
-{
-    uint32_t l = 0;
-    while (value > 1)
-    {
-        if (value & 1)
-        {
-            return 64;
-        }
-        value = value >> 1;
-        l++;
-    }
-    return l;
-}
-
-void cluster_client_t::on_load_config_hook(json11::Json::object & config)
-{
-    bs_block_size = config["block_size"].uint64_value();
-    bs_disk_alignment = config["disk_alignment"].uint64_value();
-    bs_bitmap_granularity = config["bitmap_granularity"].uint64_value();
-    if (!bs_block_size)
-        bs_block_size = DEFAULT_BLOCK_SIZE;
-    if (!bs_disk_alignment)
-        bs_disk_alignment = DEFAULT_DISK_ALIGNMENT;
-    if (!bs_bitmap_granularity)
-        bs_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
-    {
-        uint32_t block_order;
-        if ((block_order = is_power_of_two(bs_block_size)) >= 64 || bs_block_size < MIN_BLOCK_SIZE || bs_block_size >= MAX_BLOCK_SIZE)
-            throw std::runtime_error("Bad block size");
-    }
-    if (config.find("pg_stripe_size") != config.end())
-    {
-        pg_stripe_size = config["pg_stripe_size"].uint64_value();
-        if (!pg_stripe_size)
-            pg_stripe_size = DEFAULT_PG_STRIPE_SIZE;
-    }
-    if (config["immediate_commit"] == "all")
-    {
-        // Cluster-wide immediate_commit mode
-        immediate_commit = true;
-    }
-    msgr.peer_connect_interval = config["peer_connect_interval"].uint64_value();
-    if (!msgr.peer_connect_interval)
-        msgr.peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
-    msgr.peer_connect_timeout = config["peer_connect_timeout"].uint64_value();
-    if (!msgr.peer_connect_timeout)
-        msgr.peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
-}
-
-void cluster_client_t::on_load_pgs_hook(bool success)
-{
-    if (success)
-    {
-        pg_count = st_cli.pg_config.size();
-        continue_ops();
-    }
-}
-
-void cluster_client_t::on_change_hook(json11::Json::object & changes)
-{
-    if (pg_count != st_cli.pg_config.size())
-    {
-        // At this point, all operations should be suspended
-        // And they need to be resliced!
-        for (auto op: unsent_ops)
-        {
-            op->needs_reslice = true;
-        }
-        for (auto op: sent_ops)
-        {
-            op->needs_reslice = true;
-        }
-        pg_count = st_cli.pg_config.size();
-    }
-    continue_ops();
-}
-
-void cluster_client_t::on_change_osd_state_hook(uint64_t peer_osd)
-{
-    if (msgr.wanted_peers.find(peer_osd) != msgr.wanted_peers.end())
-    {
-        msgr.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
-    }
-}
-
-// FIXME: Implement OSD_OP_SYNC for immediate_commit == false
-void cluster_client_t::execute(cluster_op_t *op)
-{
-    if (op->opcode == OSD_OP_SYNC && immediate_commit)
-    {
-        // Syncs are not required in the immediate_commit mode
-        op->retval = 0;
-        std::function<void(cluster_op_t*)>(op->callback)(op);
-        return;
-    }
-    if (op->opcode != OSD_OP_READ && op->opcode != OSD_OP_OUT || !op->inode || !op->len ||
-        op->offset % bs_disk_alignment || op->len % bs_disk_alignment)
-    {
-        op->retval = -EINVAL;
-        std::function<void(cluster_op_t*)>(op->callback)(op);
-        return;
-    }
-    if (!pg_stripe_size)
-    {
-        // Config is not loaded yet
-        unsent_ops.insert(op);
-        return;
-    }
-    if (op->opcode == OSD_OP_WRITE && !immediate_commit)
-    {
-        // Copy operation
-        cluster_op_t *op_copy = new cluster_op_t();
-        op_copy->opcode = op->opcode;
-        op_copy->inode = op->inode;
-        op_copy->offset = op->offset;
-        op_copy->len = op->len;
-        op_copy->buf = malloc(op->len);
-        memcpy(op_copy->buf, op->buf, op->len);
-        unsynced_ops.push_back(op_copy);
-        unsynced_bytes += op->len;
-        if (inmemory_commit)
-        {
-            // Immediately acknowledge write and continue with the copy
-            op->retval = op->len;
-            std::function<void(cluster_op_t*)>(op->callback)(op);
-            op = op_copy;
-        }
-        if (unsynced_bytes >= inmemory_dirty_limit)
-        {
-            // Push an extra SYNC operation
-        }
-    }
-    // Slice the request into individual object stripe requests
-    // Primary OSDs still operate individual stripes, but their size is multiplied by PG minsize in case of EC
-    uint64_t pg_block_size = bs_block_size * pg_part_count;
-    uint64_t first_stripe = (op->offset / pg_block_size) * pg_block_size;
-    uint64_t last_stripe = ((op->offset + op->len + pg_block_size - 1) / pg_block_size - 1) * pg_block_size;
-    int part_count = 0;
-    for (uint64_t stripe = first_stripe; stripe <= last_stripe; stripe += pg_block_size)
-    {
-        if (op->offset < (stripe+pg_block_size) && (op->offset+op->len) > stripe)
-        {
-            part_count++;
-        }
-    }
-    op->parts.resize(part_count);
-    bool resend = false;
-    int i = 0;
-    for (uint64_t stripe = first_stripe; stripe <= last_stripe; stripe += pg_block_size)
-    {
-        uint64_t stripe_end = stripe + pg_block_size;
-        if (op->offset < stripe_end && (op->offset+op->len) > stripe)
-        {
-            pg_num_t pg_num = (op->inode + stripe/pg_stripe_size) % pg_count + 1;
-            op->parts[i] = {
-                .parent = op,
-                .offset = op->offset < stripe ? stripe : op->offset,
-                .len = (uint32_t)((op->offset+op->len) > stripe_end ? pg_block_size : op->offset+op->len-stripe),
-                .pg_num = pg_num,
-                .buf = op->buf + (op->offset < stripe ? stripe-op->offset : 0),
-                .sent = false,
-                .done = false,
-            };
-            if (!try_send(op, &op->parts[i]))
-            {
-                // Part needs to be sent later
-                resend = true;
-            }
-            i++;
-        }
-    }
-    if (resend)
-    {
-        unsent_ops.insert(op);
-    }
-    else
-    {
-        sent_ops.insert(op);
-    }
-}
-
-bool cluster_client_t::try_send(cluster_op_t *op, cluster_op_part_t *part)
-{
-    auto pg_it = st_cli.pg_config.find(part->pg_num);
-    if (pg_it != st_cli.pg_config.end() &&
-        !pg_it->second.pause && pg_it->second.cur_primary)
-    {
-        osd_num_t primary_osd = pg_it->second.cur_primary;
-        auto peer_it = msgr.osd_peer_fds.find(primary_osd);
-        if (peer_it != msgr.osd_peer_fds.end())
-        {
-            int peer_fd = peer_it->second;
-            part->osd_num = primary_osd;
-            part->sent = true;
-            op->sent_count++;
-            part->op = {
-                .op_type = OSD_OP_OUT,
-                .peer_fd = peer_fd,
-                .req = { .rw = {
-                    .header = {
-                        .magic = SECONDARY_OSD_OP_MAGIC,
-                        .id = op_id++,
-                        .opcode = op->opcode,
-                    },
-                    .inode = op->inode,
-                    .offset = part->offset,
-                    .len = part->len,
-                } },
-                .callback = [this, part](osd_op_t *op_part)
-                {
-                    handle_op_part(part);
-                },
-            };
-            part->op.send_list.push_back(part->op.req.buf, OSD_PACKET_SIZE);
-            if (op->opcode == OSD_OP_WRITE)
-            {
-                part->op.send_list.push_back(part->buf, part->len);
-            }
-            else
-            {
-                part->op.buf = part->buf;
-            }
-            msgr.outbox_push(&part->op);
-            return true;
-        }
-        else if (msgr.wanted_peers.find(primary_osd) == msgr.wanted_peers.end())
-        {
-            msgr.connect_peer(primary_osd, st_cli.peer_states[primary_osd]);
-        }
-    }
-    return false;
-}
-
-void cluster_client_t::handle_op_part(cluster_op_part_t *part)
-{
-    cluster_op_t *op = part->parent;
-    part->sent = false;
-    op->sent_count--;
-    part->op.buf = NULL;
-    if (part->op.reply.hdr.retval != part->op.req.rw.len)
-    {
-        // Operation failed, retry
-        printf(
-            "Operation part failed on OSD %lu: retval=%ld (expected %u), reconnecting\n",
-            part->osd_num, part->op.reply.hdr.retval, part->op.req.rw.len
-        );
-        msgr.stop_client(part->op.peer_fd);
-        if (op->sent_count == op->parts.size() - op->done_count - 1)
-        {
-            // Resend later when OSDs come up
-            // FIXME: Check for different types of errors
-            // FIXME: Repeat operations after a small timeout, for the case when OSD is coming up
-            sent_ops.erase(op);
-            unsent_ops.insert(op);
-        }
-        if (op->sent_count == 0 && op->needs_reslice)
-        {
-            // PG count has changed, reslice the operation
-            unsent_ops.erase(op);
-            op->parts.clear();
-            op->done_count = 0;
-            op->needs_reslice = false;
-            execute(op);
-        }
-    }
-    else
-    {
-        // OK
-        part->done = true;
-        op->done_count++;
-        if (op->done_count >= op->parts.size())
-        {
-            // Finished!
-            sent_ops.erase(op);
-            op->retval = op->len;
-            std::function<void(cluster_op_t*)>(op->callback)(op);
-        }
-    }
-}
--- a/cluster_client.h
+++ b/cluster_client.h
@@ -1,80 +0,0 @@
-#pragma once
-
-#include "messenger.h"
-#include "etcd_state_client.h"
-
-#define MIN_BLOCK_SIZE 4*1024
-#define MAX_BLOCK_SIZE 128*1024*1024
-#define DEFAULT_BLOCK_SIZE 128*1024
-#define DEFAULT_PG_STRIPE_SIZE 4*1024*1024
-#define DEFAULT_DISK_ALIGNMENT 4096
-#define DEFAULT_BITMAP_GRANULARITY 4096
-
-struct cluster_op_t;
-
-struct cluster_op_part_t
-{
-    cluster_op_t *parent;
-    uint64_t offset;
-    uint32_t len;
-    pg_num_t pg_num;
-    osd_num_t osd_num;
-    void *buf;
-    bool sent;
-    bool done;
-    osd_op_t op;
-};
-
-struct cluster_op_t
-{
-    uint64_t opcode; // OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC
-    uint64_t inode;
-    uint64_t offset;
-    uint64_t len;
-    int retval;
-    void *buf;
-    std::function<void(cluster_op_t*)> callback;
-protected:
-    bool needs_reslice = false;
-    int sent_count = 0, done_count = 0;
-    std::vector<cluster_op_part_t> parts;
-    friend class cluster_client_t;
-};
-
-class cluster_client_t
-{
-    timerfd_manager_t *tfd;
-    ring_loop_t *ringloop;
-
-    uint64_t pg_part_count = 2;
-    uint64_t pg_stripe_size = 0;
-    uint64_t bs_block_size = 0;
-    uint64_t bs_disk_alignment = 0;
-    uint64_t bs_bitmap_granularity = 0;
-    uint64_t pg_count = 0;
-    bool immediate_commit = false;
-    bool inmemory_commit = false;
-    uint64_t inmemory_dirty_limit = 32*1024*1024;
-    int log_level;
-
-    uint64_t op_id = 1;
-    etcd_state_client_t st_cli;
-    osd_messenger_t msgr;
-    std::set<cluster_op_t*> sent_ops, unsent_ops;
-    // unsynced operations are copied in memory to allow replay when cluster isn't in the immediate_commit mode
-    std::vector<cluster_op_t*> unsynced_ops;
-    uint64_t unsynced_bytes = 0;
-
-public:
-    cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
-    void execute(cluster_op_t *op);
-
-protected:
-    void continue_ops();
-    void on_load_config_hook(json11::Json::object & cfg);
-    void on_load_pgs_hook(bool success);
-    void on_change_hook(json11::Json::object & changes);
-    void on_change_osd_state_hook(uint64_t peer_osd);
-    bool try_send(cluster_op_t *op, cluster_op_part_t *part);
-    void handle_op_part(cluster_op_part_t *part);
-};
--- a/dump_journal.cpp
+++ b/dump_journal.cpp
@@ -1,165 +0,0 @@
-#define _LARGEFILE64_SOURCE
-#include <sys/types.h>
-#include <sys/ioctl.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <stdint.h>
-#include <malloc.h>
-#include <linux/fs.h>
-#include <string.h>
-#include <errno.h>
-#include <assert.h>
-#include <stdio.h>
-
-#include "blockstore_impl.h"
-#include "crc32c.h"
-
-struct journal_dump_t
-{
-    char *journal_device;
-    uint32_t journal_block;
-    uint64_t journal_offset;
-    uint64_t journal_len;
-    uint64_t journal_pos;
-    int fd;
-
-    void dump_block(void *buf);
-};
-
-int main(int argc, char *argv[])
-{
-    if (argc < 5)
-    {
-        printf("USAGE: %s <journal_file> <journal_block_size> <offset> <size>\n", argv[0]);
-        return 1;
-    }
-    journal_dump_t self;
-    self.journal_device = argv[1];
-    self.journal_block = strtoul(argv[2], NULL, 10);
-    self.journal_offset = strtoull(argv[3], NULL, 10);
-    self.journal_len = strtoull(argv[4], NULL, 10);
-    if (self.journal_block < MEM_ALIGNMENT || (self.journal_block % MEM_ALIGNMENT) ||
-        self.journal_block > 128*1024)
-    {
-        printf("Invalid journal block size\n");
-        return 1;
-    }
-    self.fd = open(self.journal_device, O_DIRECT|O_RDONLY);
-    if (self.fd == -1)
-    {
-        printf("Failed to open journal\n");
-        return 1;
-    }
-    void *data = memalign(MEM_ALIGNMENT, self.journal_block);
-    self.journal_pos = 0;
-    while (self.journal_pos < self.journal_len)
-    {
-        int r = pread(self.fd, data, self.journal_block, self.journal_offset+self.journal_pos);
-        assert(r == self.journal_block);
-        uint64_t s;
-        for (s = 0; s < self.journal_block; s += 8)
-        {
-            if (*((uint64_t*)(data+s)) != 0)
-                break;
-        }
-        if (s == self.journal_block)
-        {
-            printf("offset %08lx: zeroes\n", self.journal_pos);
-            self.journal_pos += self.journal_block;
-        }
-        else if (((journal_entry*)data)->magic == JOURNAL_MAGIC)
-        {
-            printf("offset %08lx:\n", self.journal_pos);
-            self.dump_block(data);
-        }
-        else
-        {
-            printf("offset %08lx: no magic in the beginning, looks like random data (pattern=%lx)\n", self.journal_pos, *((uint64_t*)data));
-            self.journal_pos += self.journal_block;
-        }
-    }
-    free(data);
-    close(self.fd);
-    return 0;
-}
-
-void journal_dump_t::dump_block(void *buf)
-{
-    uint32_t pos = 0;
-    journal_pos += journal_block;
-    int entry = 0;
-    bool wrapped = false;
-    while (pos < journal_block)
-    {
-        journal_entry *je = (journal_entry*)(buf + pos);
-        if (je->magic != JOURNAL_MAGIC || je->type < JE_START || je->type > JE_DELETE)
-        {
-            break;
-        }
-        const char *crc32_valid = je_crc32(je) == je->crc32 ? "(valid)" : "(invalid)";
-        printf("entry % 3d: crc32=%08x %s prev=%08x ", entry, je->crc32, crc32_valid, je->crc32_prev);
-        if (je->type == JE_START)
-        {
-            printf("je_start start=%08lx\n", je->start.journal_start);
-        }
-        else if (je->type == JE_SMALL_WRITE)
-        {
-            printf(
-                "je_small_write oid=%lu:%lu ver=%lu offset=%u len=%u loc=%08lx",
-                je->small_write.oid.inode, je->small_write.oid.stripe,
-                je->small_write.version, je->small_write.offset, je->small_write.len,
-                je->small_write.data_offset
-            );
-            if (journal_pos + je->small_write.len > journal_len)
-            {
-                // data continues from the beginning of the journal
-                journal_pos = journal_block;
-                wrapped = true;
-            }
-            if (journal_pos != je->small_write.data_offset)
-            {
-                printf(" (mismatched, calculated = %lu)", journal_pos);
-            }
-            journal_pos += je->small_write.len;
-            if (journal_pos >= journal_len)
-            {
-                journal_pos = journal_block;
-                wrapped = true;
-            }
-            uint32_t data_crc32 = 0;
-            void *data = memalign(MEM_ALIGNMENT, je->small_write.len);
-            assert(pread(fd, data, je->small_write.len, journal_offset+je->small_write.data_offset) == je->small_write.len);
-            data_crc32 = crc32c(0, data, je->small_write.len);
-            free(data);
-            printf(
-                " data_crc32=%08x%s", je->small_write.crc32_data,
-                (data_crc32 != je->small_write.crc32_data) ? " (invalid)" : " (valid)"
-            );
-            printf("\n");
-        }
-        else if (je->type == JE_BIG_WRITE)
-        {
-            printf("je_big_write oid=%lu:%lu ver=%lu loc=%08lx\n", je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location);
-        }
-        else if (je->type == JE_STABLE)
-        {
-            printf("je_stable oid=%lu:%lu ver=%lu\n", je->stable.oid.inode, je->stable.oid.stripe, je->stable.version);
-        }
-        else if (je->type == JE_ROLLBACK)
-        {
-            printf("je_rollback oid=%lu:%lu ver=%lu\n", je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version);
-        }
-        else if (je->type == JE_DELETE)
-        {
-            printf("je_delete oid=%lu:%lu ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version);
-        }
-        pos += je->size;
-        entry++;
-    }
-    if (wrapped)
-    {
-        journal_pos = journal_len;
-    }
-}
--- a/epoll_manager.cpp
+++ b/epoll_manager.cpp
@@ -1,87 +0,0 @@
-#include <sys/epoll.h>
-#include <sys/poll.h>
-#include <unistd.h>
-
-#include "epoll_manager.h"
-
-#define MAX_EPOLL_EVENTS 64
-
-epoll_manager_t::epoll_manager_t(ring_loop_t *ringloop)
-{
-    this->ringloop = ringloop;
-
-    epoll_fd = epoll_create(1);
-    if (epoll_fd < 0)
-    {
-        throw std::runtime_error(std::string("epoll_create: ") + strerror(errno));
-    }
-
-    tfd = new timerfd_manager_t([this](int fd, std::function<void(int, int)> handler) { set_fd_handler(fd, handler); });
-
-    handle_epoll_events();
-}
-
-epoll_manager_t::~epoll_manager_t()
-{
-    if (tfd)
-    {
-        delete tfd;
-        tfd = NULL;
-    }
-    close(epoll_fd);
-}
-
-void epoll_manager_t::set_fd_handler(int fd, std::function<void(int, int)> handler)
-{
-    if (handler != NULL)
-    {
-        bool exists = epoll_handlers.find(fd) != epoll_handlers.end();
-        epoll_event ev;
-        ev.data.fd = fd;
-        ev.events = EPOLLOUT | EPOLLIN | EPOLLRDHUP | EPOLLET;
-        if (epoll_ctl(epoll_fd, exists ? EPOLL_CTL_MOD : EPOLL_CTL_ADD, fd, &ev) < 0)
-        {
-            throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
-        }
-        epoll_handlers[fd] = handler;
-    }
-    else
-    {
-        if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, NULL) < 0 && errno != ENOENT)
-        {
-            throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
-        }
-        epoll_handlers.erase(fd);
-    }
-}
-
-void epoll_manager_t::handle_epoll_events()
-{
-    io_uring_sqe *sqe = ringloop->get_sqe();
-    if (!sqe)
-    {
-        throw std::runtime_error("can't get SQE, will fall out of sync with EPOLLET");
-    }
-    ring_data_t *data = ((ring_data_t*)sqe->user_data);
-    my_uring_prep_poll_add(sqe, epoll_fd, POLLIN);
-    data->callback = [this](ring_data_t *data)
-    {
-        if (data->res < 0)
-        {
-            throw std::runtime_error(std::string("epoll failed: ") + strerror(-data->res));
-        }
-        handle_epoll_events();
-    };
-    ringloop->submit();
-    int nfds;
-    epoll_event events[MAX_EPOLL_EVENTS];
-    do
-    {
-        nfds = epoll_wait(epoll_fd, events, MAX_EPOLL_EVENTS, 0);
-        for (int i = 0; i < nfds; i++)
-        {
-            auto & cb = epoll_handlers[events[i].data.fd];
-            cb(events[i].data.fd, events[i].events);
-        }
-    } while (nfds == MAX_EPOLL_EVENTS);
-}
--- a/epoll_manager.h
+++ b/epoll_manager.h
@@ -1,20 +0,0 @@
-#pragma once
-
-#include <map>
-
-#include "ringloop.h"
-#include "timerfd_manager.h"
-
-class epoll_manager_t
-{
-    int epoll_fd;
-    ring_loop_t *ringloop;
-    std::map<int, std::function<void(int, int)>> epoll_handlers;
-public:
-    epoll_manager_t(ring_loop_t *ringloop);
-    ~epoll_manager_t();
-    void set_fd_handler(int fd, std::function<void(int, int)> handler);
-    void handle_epoll_events();
-
-    timerfd_manager_t *tfd;
-};
--- a/etcd_state_client.cpp
+++ b/etcd_state_client.cpp
@@ -1,424 +0,0 @@
-#include "osd_ops.h"
-#include "pg_states.h"
-#include "etcd_state_client.h"
-#include "http_client.h"
-#include "base64.h"
-
-json_kv_t etcd_state_client_t::parse_etcd_kv(const json11::Json & kv_json)
-{
-    json_kv_t kv;
-    kv.key = base64_decode(kv_json["key"].string_value());
-    std::string json_err, json_text = base64_decode(kv_json["value"].string_value());
-    kv.value = json_text == "" ? json11::Json() : json11::Json::parse(json_text, json_err);
-    if (json_err != "")
-    {
-        printf("Bad JSON in etcd key %s: %s (value: %s)\n", kv.key.c_str(), json_err.c_str(), json_text.c_str());
-        kv.key = "";
-    }
-    return kv;
-}
-
-void etcd_state_client_t::etcd_txn(json11::Json txn, int timeout, std::function<void(std::string, json11::Json)> callback)
-{
-    etcd_call("/kv/txn", txn, timeout, callback);
-}
-
-void etcd_state_client_t::etcd_call(std::string api, json11::Json payload, int timeout, std::function<void(std::string, json11::Json)> callback)
-{
-    std::string etcd_address = etcd_addresses[rand() % etcd_addresses.size()];
-    std::string etcd_api_path;
-    int pos = etcd_address.find('/');
-    if (pos >= 0)
-    {
-        etcd_api_path = etcd_address.substr(pos);
-        etcd_address = etcd_address.substr(0, pos);
-    }
-    std::string req = payload.dump();
-    req = "POST "+etcd_api_path+api+" HTTP/1.1\r\n"
-        "Host: "+etcd_address+"\r\n"
-        "Content-Type: application/json\r\n"
-        "Content-Length: "+std::to_string(req.size())+"\r\n"
-        "Connection: close\r\n"
-        "\r\n"+req;
-    http_request_json(tfd, etcd_address, req, timeout, callback);
-}
-
-void etcd_state_client_t::parse_config(json11::Json & config)
-{
-    this->etcd_addresses.clear();
-    if (config["etcd_address"].is_string())
-    {
-        std::string ea = config["etcd_address"].string_value();
-        while (1)
-        {
-            int pos = ea.find(',');
-            std::string addr = pos >= 0 ? ea.substr(0, pos) : ea;
-            if (addr.length() > 0)
-            {
-                if (addr.find('/') < 0)
-                    addr += "/v3";
-                this->etcd_addresses.push_back(addr);
-            }
-            if (pos >= 0)
-                ea = ea.substr(pos+1);
-            else
-                break;
-        }
-    }
-    else if (config["etcd_address"].array_items().size())
-    {
-        for (auto & ea: config["etcd_address"].array_items())
-        {
-            std::string addr = ea.string_value();
-            if (addr != "")
-            {
-                if (addr.find('/') < 0)
-                    addr += "/v3";
-                this->etcd_addresses.push_back(addr);
-            }
-        }
-    }
-    this->etcd_prefix = config["etcd_prefix"].string_value();
-    if (this->etcd_prefix == "")
-    {
-        this->etcd_prefix = "/microceph";
-    }
-    else if (this->etcd_prefix[0] != '/')
-    {
-        this->etcd_prefix = "/"+this->etcd_prefix;
-    }
-    this->log_level = config["log_level"].int64_value();
-}
-
-void etcd_state_client_t::start_etcd_watcher()
-{
-    std::string etcd_address = etcd_addresses[rand() % etcd_addresses.size()];
-    std::string etcd_api_path;
-    int pos = etcd_address.find('/');
-    if (pos >= 0)
-    {
-        etcd_api_path = etcd_address.substr(pos);
-        etcd_address = etcd_address.substr(0, pos);
-    }
-    etcd_watches_initialised = 0;
-    etcd_watch_ws = open_websocket(tfd, etcd_address, etcd_api_path+"/watch", ETCD_SLOW_TIMEOUT, [this](const http_response_t *msg)
-    {
-        if (msg->body.length())
-        {
-            std::string json_err;
-            json11::Json data = json11::Json::parse(msg->body, json_err);
-            if (json_err != "")
-            {
-                printf("Bad JSON in etcd event: %s, ignoring event\n", json_err.c_str());
-            }
-            else
-            {
-                if (data["result"]["created"].bool_value())
-                {
-                    etcd_watches_initialised++;
-                }
-                if (etcd_watches_initialised == 4)
-                {
-                    etcd_watch_revision = data["result"]["header"]["revision"].uint64_value();
-                }
-                // First gather all changes into a hash to remove multiple overwrites
-                json11::Json::object changes;
-                for (auto & ev: data["result"]["events"].array_items())
-                {
-                    auto kv = parse_etcd_kv(ev["kv"]);
-                    if (kv.key != "")
-                    {
-                        changes[kv.key] = kv.value;
-                    }
-                }
-                for (auto & kv: changes)
-                {
-                    if (this->log_level > 0)
-                    {
-                        printf("Incoming event: %s -> %s\n", kv.first.c_str(), kv.second.dump().c_str());
-                    }
-                    parse_state(kv.first, kv.second);
-                }
-                // React to changes
-                if (on_change_hook != NULL)
-                {
-                    on_change_hook(changes);
-                }
-            }
-        }
-        if (msg->eof)
-        {
-            etcd_watch_ws = NULL;
-            if (etcd_watches_initialised == 0)
-            {
-                // Connection not established, retry in <ETCD_SLOW_TIMEOUT>
-                tfd->set_timer(ETCD_SLOW_TIMEOUT, false, [this](int)
-                {
-                    start_etcd_watcher();
-                });
-            }
-            else
-            {
-                // Connection was live, retry immediately
-                start_etcd_watcher();
-            }
-        }
-    });
-    etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
-        { "create_request", json11::Json::object {
-            { "key", base64_encode(etcd_prefix+"/config/") },
-            { "range_end", base64_encode(etcd_prefix+"/config0") },
-            { "start_revision", etcd_watch_revision+1 },
-            { "watch_id", ETCD_CONFIG_WATCH_ID },
-        } }
-    }).dump());
-    etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
-        { "create_request", json11::Json::object {
-            { "key", base64_encode(etcd_prefix+"/osd/state/") },
-            { "range_end", base64_encode(etcd_prefix+"/osd/state0") },
-            { "start_revision", etcd_watch_revision+1 },
-            { "watch_id", ETCD_OSD_STATE_WATCH_ID },
-        } }
-    }).dump());
-    etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
-        { "create_request", json11::Json::object {
-            { "key", base64_encode(etcd_prefix+"/pg/state/") },
-            { "range_end", base64_encode(etcd_prefix+"/pg/state0") },
-            { "start_revision", etcd_watch_revision+1 },
-            { "watch_id", ETCD_PG_STATE_WATCH_ID },
-        } }
-    }).dump());
-    etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
-        { "create_request", json11::Json::object {
-            { "key", base64_encode(etcd_prefix+"/pg/history/") },
-            { "range_end", base64_encode(etcd_prefix+"/pg/history0") },
-            { "start_revision", etcd_watch_revision+1 },
-            { "watch_id", ETCD_PG_HISTORY_WATCH_ID },
-        } }
-    }).dump());
-}
-
-void etcd_state_client_t::load_global_config()
-{
-    etcd_call("/kv/range", json11::Json::object {
-        { "key", base64_encode(etcd_prefix+"/config/global") }
-    }, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json data)
-    {
-        if (err != "")
-        {
-            printf("Error reading OSD configuration from etcd: %s\n", err.c_str());
-            tfd->set_timer(ETCD_SLOW_TIMEOUT, false, [this](int timer_id)
-            {
-                load_global_config();
-            });
-            return;
-        }
-        if (!etcd_watch_revision)
-        {
-            etcd_watch_revision = data["header"]["revision"].uint64_value();
-        }
-        json11::Json::object global_config;
-        if (data["kvs"].array_items().size() > 0)
-        {
-            auto kv = parse_etcd_kv(data["kvs"][0]);
-            if (kv.value.is_object())
-            {
-                global_config = kv.value.object_items();
-            }
-        }
-        on_load_config_hook(global_config);
-    });
-}
-
-void etcd_state_client_t::load_pgs()
-{
-    json11::Json::array txn = {
-        json11::Json::object {
-            { "request_range", json11::Json::object {
-                { "key", base64_encode(etcd_prefix+"/config/pgs") },
-            } }
-        },
-        json11::Json::object {
-            { "request_range", json11::Json::object {
-                { "key", base64_encode(etcd_prefix+"/pg/history/") },
-                { "range_end", base64_encode(etcd_prefix+"/pg/history0") },
-            } }
-        },
-        json11::Json::object {
-            { "request_range", json11::Json::object {
-                { "key", base64_encode(etcd_prefix+"/pg/state/") },
-                { "range_end", base64_encode(etcd_prefix+"/pg/state0") },
-            } }
-        },
-        json11::Json::object {
-            { "request_range", json11::Json::object {
-                { "key", base64_encode(etcd_prefix+"/osd/state/") },
-                { "range_end", base64_encode(etcd_prefix+"/osd/state0") },
-            } }
-        },
-    };
-    json11::Json::object req = { { "success", txn } };
-    json11::Json checks = load_pgs_checks_hook != NULL ? load_pgs_checks_hook() : json11::Json();
-    if (checks.array_items().size() > 0)
-    {
-        req["compare"] = checks;
-    }
-    etcd_txn(req, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json data)
-    {
-        if (err != "")
-        {
-            printf("Error loading PGs from etcd: %s\n", err.c_str());
-            tfd->set_timer(ETCD_SLOW_TIMEOUT, false, [this](int timer_id)
-            {
-                load_pgs();
-            });
-            return;
-        }
-        if (!data["succeeded"].bool_value())
-        {
-            on_load_pgs_hook(false);
-            return;
-        }
-        for (auto & res: data["responses"].array_items())
-        {
-            for (auto & kv_json: res["response_range"]["kvs"].array_items())
-            {
-                auto kv = parse_etcd_kv(kv_json);
-                parse_state(kv.key, kv.value);
-            }
-        }
-        on_load_pgs_hook(true);
-    });
-}
-
-void etcd_state_client_t::parse_state(const std::string & key, const json11::Json & value)
-{
-    if (key == etcd_prefix+"/config/pgs")
-    {
-        for (auto & pg_item: this->pg_config)
-        {
-            pg_item.second.exists = false;
-        }
-        for (auto & pg_item: value["items"].object_items())
-        {
-            pg_num_t pg_num = stoull_full(pg_item.first);
-            if (!pg_num)
-            {
-                printf("Bad key in PG configuration: %s (must be a number), skipped\n", pg_item.first.c_str());
-                continue;
-            }
-            this->pg_config[pg_num].exists = true;
-            this->pg_config[pg_num].pause = pg_item.second["pause"].bool_value();
-            this->pg_config[pg_num].primary = pg_item.second["primary"].uint64_value();
-            this->pg_config[pg_num].target_set.clear();
-            for (auto pg_osd: pg_item.second["osd_set"].array_items())
-            {
-                this->pg_config[pg_num].target_set.push_back(pg_osd.uint64_value());
-            }
-            if (this->pg_config[pg_num].target_set.size() != 3)
-            {
-                printf("Bad PG %u config format: incorrect osd_set = %s\n", pg_num, pg_item.second["osd_set"].dump().c_str());
-                this->pg_config[pg_num].target_set.resize(3);
-                this->pg_config[pg_num].pause = true;
-            }
-        }
-    }
-    else if (key.substr(0, etcd_prefix.length()+12) == etcd_prefix+"/pg/history/")
-    {
-        // <etcd_prefix>/pg/history/%d
-        pg_num_t pg_num = stoull_full(key.substr(etcd_prefix.length()+12));
-        if (!pg_num)
-        {
-            printf("Bad etcd key %s, ignoring\n", key.c_str());
-        }
-        else
-        {
-            auto & pg_cfg = this->pg_config[pg_num];
-            pg_cfg.target_history.clear();
-            pg_cfg.all_peers.clear();
-            // Refuse to start PG if any set of the <osd_sets> has no live OSDs
-            for (auto hist_item: value["osd_sets"].array_items())
-            {
-                std::vector<osd_num_t> history_set;
-                for (auto pg_osd: hist_item.array_items())
-                {
-                    history_set.push_back(pg_osd.uint64_value());
-                }
-                pg_cfg.target_history.push_back(history_set);
-            }
-            // Include these additional OSDs when peering the PG
-            for (auto pg_osd: value["all_peers"].array_items())
-            {
-                pg_cfg.all_peers.push_back(pg_osd.uint64_value());
-            }
-        }
-    }
-    else if (key.substr(0, etcd_prefix.length()+10) == etcd_prefix+"/pg/state/")
-    {
-        // <etcd_prefix>/pg/state/%d
-        pg_num_t pg_num = stoull_full(key.substr(etcd_prefix.length()+10));
-        if (!pg_num)
-        {
-            printf("Bad etcd key %s, ignoring\n", key.c_str());
-        }
-        else if (value.is_null())
-        {
-            this->pg_config[pg_num].cur_primary = 0;
-            this->pg_config[pg_num].cur_state = 0;
-        }
-        else
-        {
-            osd_num_t cur_primary = value["primary"].uint64_value();
-            int state = 0;
-            for (auto & e: value["state"].array_items())
-            {
-                int i;
-                for (i = 0; i < pg_state_bit_count; i++)
-                {
-                    if (e.string_value() == pg_state_names[i])
-                    {
-                        state = state | pg_state_bits[i];
-                        break;
-                    }
-                }
-                if (i >= pg_state_bit_count)
-                {
-                    printf("Unexpected PG %u state keyword in etcd: %s\n", pg_num, e.dump().c_str());
-                    return;
-                }
-            }
-            if (!cur_primary || !value["state"].is_array() || !state ||
-                (state & PG_OFFLINE) && state != PG_OFFLINE ||
-                (state & PG_PEERING) && state != PG_PEERING ||
-                (state & PG_INCOMPLETE) && state != PG_INCOMPLETE)
-            {
-                printf("Unexpected PG %u state in etcd: primary=%lu, state=%s\n", pg_num, cur_primary, value["state"].dump().c_str());
-                return;
-            }
-            this->pg_config[pg_num].cur_primary = cur_primary;
-            this->pg_config[pg_num].cur_state = state;
-        }
-    }
-    else if (key.substr(0, etcd_prefix.length()+11) == etcd_prefix+"/osd/state/")
-    {
-        // <etcd_prefix>/osd/state/%d
-        osd_num_t peer_osd = std::stoull(key.substr(etcd_prefix.length()+11));
-        if (peer_osd > 0)
-        {
-            if (value.is_object() && value["state"] == "up" &&
-                value["addresses"].is_array() &&
-                value["port"].int64_value() > 0 && value["port"].int64_value() < 65536)
-            {
-                this->peer_states[peer_osd] = value;
-            }
-            else
-            {
-                this->peer_states.erase(peer_osd);
-            }
-            if (on_change_osd_state_hook != NULL)
-            {
-                on_change_osd_state_hook(peer_osd);
-            }
-        }
-    }
-}
--- a/etcd_state_client.h
+++ b/etcd_state_client.h
@@ -1,61 +0,0 @@
-#pragma once
-
-#include "osd_id.h"
-#include "http_client.h"
-#include "timerfd_manager.h"
-
-#define ETCD_CONFIG_WATCH_ID 1
-#define ETCD_PG_STATE_WATCH_ID 2
-#define ETCD_PG_HISTORY_WATCH_ID 3
-#define ETCD_OSD_STATE_WATCH_ID 4
-
-#define MAX_ETCD_ATTEMPTS 5
-#define ETCD_SLOW_TIMEOUT 5000
-#define ETCD_QUICK_TIMEOUT 1000
-
-struct pg_config_t
-{
-    bool exists;
-    osd_num_t primary;
-    std::vector<osd_num_t> target_set;
-    std::vector<std::vector<osd_num_t>> target_history;
-    std::vector<osd_num_t> all_peers;
-    bool pause;
-    osd_num_t cur_primary;
-    int cur_state;
-};
-
-struct json_kv_t
-{
-    std::string key;
-    json11::Json value;
-};
-
-struct etcd_state_client_t
-{
-    std::vector<std::string> etcd_addresses;
-    std::string etcd_prefix;
-    int log_level = 0;
-    timerfd_manager_t *tfd = NULL;
-
-    int etcd_watches_initialised = 0;
-    uint64_t etcd_watch_revision = 0;
-    websocket_t *etcd_watch_ws = NULL;
-    std::map<pg_num_t, pg_config_t> pg_config;
-    std::map<osd_num_t, json11::Json> peer_states;
-
-    std::function<void(json11::Json::object &)> on_change_hook;
-    std::function<void(json11::Json::object &)> on_load_config_hook;
-    std::function<json11::Json()> load_pgs_checks_hook;
-    std::function<void(bool)> on_load_pgs_hook;
-    std::function<void(uint64_t)> on_change_osd_state_hook;
-
-    json_kv_t parse_etcd_kv(const json11::Json & kv_json);
-    void etcd_call(std::string api, json11::Json payload, int timeout, std::function<void(std::string, json11::Json)> callback);
-    void etcd_txn(json11::Json txn, int timeout, std::function<void(std::string, json11::Json)> callback);
-    void start_etcd_watcher();
-    void load_global_config();
-    void load_pgs();
-    void parse_state(const std::string & key, const json11::Json & value);
-    void parse_config(json11::Json & config);
-};
--- a/fio_cluster.cpp
+++ b/fio_cluster.cpp
@@ -1,298 +0,0 @@
-// FIO engine to test cluster I/O
-//
-// Random write:
-//
-// fio -thread -ioengine=./libfio_cluster.so -name=test -bs=4k -direct=1 -fsync=16 -iodepth=16 -rw=randwrite \
-//     -etcd=127.0.0.1:2379 [-etcd_prefix=/microceph] -size=1000M
-//
-// Linear write:
-//
-// fio -thread -ioengine=./libfio_cluster.so -name=test -bs=128k -direct=1 -fsync=32 -iodepth=32 -rw=write \
-//     -etcd=127.0.0.1:2379 [-etcd_prefix=/microceph] -size=1000M
-//
-// Random read (run with -iodepth=32 or -iodepth=1):
-//
-// fio -thread -ioengine=./libfio_cluster.so -name=test -bs=4k -direct=1 -iodepth=32 -rw=randread \
-//     -etcd=127.0.0.1:2379 [-etcd_prefix=/microceph] -size=1000M
-
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <netinet/in.h>
-#include <netinet/tcp.h>
-
-#include <vector>
-#include <unordered_map>
-
-#include "epoll_manager.h"
-#include "cluster_client.h"
-extern "C" {
-#define CONFIG_HAVE_GETTID
-#define CONFIG_PWRITEV2
-#include "fio/fio.h"
-#include "fio/optgroup.h"
-}
-
-struct sec_data
-{
-    ring_loop_t *ringloop = NULL;
-    epoll_manager_t *epmgr = NULL;
-    cluster_client_t *cli = NULL;
-    bool last_sync = false;
-    /* The list of completed io_u structs. */
-    std::vector<io_u*> completed;
-    uint64_t op_n = 0, inflight = 0;
-    bool trace = false;
-};
-
-struct sec_options
-{
-    int __pad;
-    char *etcd_host = NULL;
-    char *etcd_prefix = NULL;
-    int inode = 0;
-    int trace = 0;
-};
-
-static struct fio_option options[] = {
-    {
-        .name   = "etcd",
-        .lname  = "etcd address",
-        .type   = FIO_OPT_STR_STORE,
-        .off1   = offsetof(struct sec_options, etcd_host),
-        .help   = "etcd address in the form HOST:PORT[/PATH]",
-        .category = FIO_OPT_C_ENGINE,
-        .group  = FIO_OPT_G_FILENAME,
-    },
-    {
-        .name   = "etcd",
-        .lname  = "etcd key prefix",
-        .type   = FIO_OPT_STR_STORE,
-        .off1   = offsetof(struct sec_options, etcd_prefix),
-        .help   = "etcd key prefix, by default /microceph",
-        .category = FIO_OPT_C_ENGINE,
-        .group  = FIO_OPT_G_FILENAME,
-    },
-    {
-        .name   = "inode",
-        .lname  = "inode to run tests on",
-        .type   = FIO_OPT_INT,
-        .off1   = offsetof(struct sec_options, inode),
-        .help   = "inode to run tests on (1 by default)",
-        .category = FIO_OPT_C_ENGINE,
-        .group  = FIO_OPT_G_FILENAME,
-    },
-    {
-        .name   = "osd_trace",
-        .lname  = "OSD trace",
-        .type   = FIO_OPT_BOOL,
-        .off1   = offsetof(struct sec_options, trace),
-        .help   = "Trace OSD operations",
-        .def    = "0",
-        .category = FIO_OPT_C_ENGINE,
-        .group  = FIO_OPT_G_FILENAME,
-    },
-    {
-        .name = NULL,
-    },
-};
-
-static int sec_setup(struct thread_data *td)
-{
-    sec_data *bsd;
-
-    bsd = new sec_data;
-    if (!bsd)
-    {
-        td_verror(td, errno, "calloc");
-        return 1;
-    }
-    td->io_ops_data = bsd;
-
-    if (!td->files_index)
-    {
-        add_file(td, "osd_cluster", 0, 0);
-        td->o.nr_files = td->o.nr_files ? : 1;
-        td->o.open_files++;
-    }
-
-    return 0;
-}
-
-static void sec_cleanup(struct thread_data *td)
-{
-    sec_data *bsd = (sec_data*)td->io_ops_data;
-    if (bsd)
-    {
-        delete bsd->cli;
-        delete bsd->epmgr;
-        delete bsd->ringloop;
-        bsd->cli = NULL;
-        bsd->epmgr = NULL;
-        bsd->ringloop = NULL;
-    }
-}
-
-/* Connect to the server from each thread. */
-static int sec_init(struct thread_data *td)
-{
-    sec_options *o = (sec_options*)td->eo;
-    sec_data *bsd = (sec_data*)td->io_ops_data;
-
-    json11::Json cfg = json11::Json::object {
-        { "etcd_address", std::string(o->etcd_host) },
-        { "etcd_prefix", std::string(o->etcd_prefix ? o->etcd_prefix : "/microceph") },
-    };
-
-    bsd->ringloop = new ring_loop_t(512);
-    bsd->epmgr = new epoll_manager_t(bsd->ringloop);
-    bsd->cli = new cluster_client_t(bsd->ringloop, bsd->epmgr->tfd, cfg);
-
-    bsd->trace = o->trace ? true : false;
-
-    return 0;
-}
-
-/* Begin read or write request. */
-static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
-{
-    sec_options *opt = (sec_options*)td->eo;
-    sec_data *bsd = (sec_data*)td->io_ops_data;
-    int n = bsd->op_n;
-
-    fio_ro_check(td, io);
-    if (io->ddir == DDIR_SYNC && bsd->last_sync)
-    {
-        return FIO_Q_COMPLETED;
-    }
-
-    io->engine_data = bsd;
-    cluster_op_t *op = new cluster_op_t;
-
-    switch (io->ddir)
-    {
-    case DDIR_READ:
-        op->opcode = OSD_OP_READ;
-        op->inode = opt->inode;
-        op->offset = io->offset;
-        op->len = io->xfer_buflen;
-        op->buf = io->xfer_buf;
-        bsd->last_sync = false;
-        break;
-    case DDIR_WRITE:
-        op->opcode = OSD_OP_WRITE;
-        op->inode = opt->inode;
-        op->offset = io->offset;
-        op->len = io->xfer_buflen;
-        op->buf = io->xfer_buf;
-        bsd->last_sync = false;
-        break;
-    case DDIR_SYNC:
-        op->opcode = OSD_OP_SYNC;
-        bsd->last_sync = true;
-        break;
-    default:
-        io->error = EINVAL;
-        return FIO_Q_COMPLETED;
-    }
-
-    op->callback = [io, n](cluster_op_t *op)
-    {
-        io->error = op->retval < 0 ? -op->retval : 0;
-        sec_data *bsd = (sec_data*)io->engine_data;
-        bsd->inflight--;
-        bsd->completed.push_back(io);
-        if (bsd->trace)
-        {
-            printf("--- %s n=%d retval=%d\n", io->ddir == DDIR_READ ? "READ" :
-                (io->ddir == DDIR_WRITE ? "WRITE" : "SYNC"), n, op->retval);
-        }
-        delete op;
-    };
-
-    if (opt->trace)
-    {
-        printf("+++ %s # %d\n", io->ddir == DDIR_READ ? "READ" :
-            (io->ddir == DDIR_WRITE ? "WRITE" : "SYNC"), n);
-    }
-
-    io->error = 0;
-    bsd->inflight++;
-    bsd->op_n++;
-    bsd->cli->execute(op);
-
-    if (io->error != 0)
-        return FIO_Q_COMPLETED;
-    return FIO_Q_QUEUED;
-}
-
-static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int max, const struct timespec *t)
-{
-    sec_data *bsd = (sec_data*)td->io_ops_data;
-    while (true)
-    {
-        bsd->ringloop->loop();
-        if (bsd->completed.size() >= min)
-            break;
-        bsd->ringloop->wait();
-    }
-    return bsd->completed.size();
-}
-
-static struct io_u *sec_event(struct thread_data *td, int event)
-{
-    sec_data *bsd = (sec_data*)td->io_ops_data;
-    if (bsd->completed.size() == 0)
-        return NULL;
-    /* FIXME We ignore the event number and assume fio calls us exactly once for [0..nr_events-1] */
-    struct io_u *ev = bsd->completed.back();
-    bsd->completed.pop_back();
-    return ev;
-}
-
-static int sec_io_u_init(struct thread_data *td, struct io_u *io)
-{
-    io->engine_data = NULL;
-    return 0;
-}
-
-static void sec_io_u_free(struct thread_data *td, struct io_u *io)
-{
-}
-
-static int sec_open_file(struct thread_data *td, struct fio_file *f)
-{
-    return 0;
-}
-
-static int sec_invalidate(struct thread_data *td, struct fio_file *f)
-{
-    return 0;
-}
-
-struct ioengine_ops ioengine = {
-    .name               = "microceph_cluster",
-    .version            = FIO_IOOPS_VERSION,
-    .flags              = FIO_MEMALIGN | FIO_DISKLESSIO | FIO_NOEXTEND,
-    .setup              = sec_setup,
-    .init               = sec_init,
-    .queue              = sec_queue,
-    .getevents          = sec_getevents,
-    .event              = sec_event,
-    .cleanup            = sec_cleanup,
-    .open_file          = sec_open_file,
-    .invalidate         = sec_invalidate,
-    .io_u_init          = sec_io_u_init,
-    .io_u_free          = sec_io_u_free,
-    .option_struct_size = sizeof(struct sec_options),
-    .options            = options,
-};
-
-static void fio_init fio_sec_register(void)
-{
-    register_ioengine(&ioengine);
-}
-
-static void fio_exit fio_sec_unregister(void)
-{
-    unregister_ioengine(&ioengine);
-}
--- a/fio_engine.cpp
+++ b/fio_engine.cpp
@@ -23,7 +23,6 @@

 #include "blockstore.h"
 extern "C" {
-#define CONFIG_HAVE_GETTID
 #define CONFIG_PWRITEV2
 #include "fio/fio.h"
 #include "fio/optgroup.h"
@@ -101,7 +100,7 @@ static void bs_cleanup(struct thread_data *td)
                bsd->ringloop->loop();
                if (bsd->bs->is_safe_to_stop())
                    goto safe;
-            } while (bsd->ringloop->has_work());
+            } while (bsd->ringloop->get_loop_again());
            bsd->ringloop->wait();
        }
    safe:
--- a/fio_sec_osd.cpp
+++ b/fio_sec_osd.cpp
@@ -5,7 +5,7 @@
 // Random write:
 //
 // fio -thread -ioengine=./libfio_sec_osd.so -name=test -bs=4k -direct=1 -fsync=16 -iodepth=16 -rw=randwrite \
-//     -host=127.0.0.1 -port=11203 [-block_size_order=17] [-single_primary=1] -size=1000M
+//     -host=127.0.0.1 -port=11203 [-single_primary=1] -size=1000M
 //
 // Linear write:
 //
@@ -28,7 +28,6 @@
 #include "rw_blocking.h"
 #include "osd_ops.h"
 extern "C" {
-#define CONFIG_HAVE_GETTID
 #define CONFIG_PWRITEV2
 #include "fio/fio.h"
 #include "fio/optgroup.h"
@@ -53,7 +52,6 @@ struct sec_options
    int port = 0;
    int single_primary = 0;
    int trace = 0;
-    int block_order = 17;
 };

 static struct fio_option options[] = {
@@ -75,15 +73,6 @@ static struct fio_option options[] = {
        .category = FIO_OPT_C_ENGINE,
        .group  = FIO_OPT_G_FILENAME,
    },
-    {
-        .name   = "block_size_order",
-        .lname  = "Blockstore block size order",
-        .type   = FIO_OPT_INT,
-        .off1   = offsetof(struct sec_options, block_order),
-        .help   = "Blockstore block size order (size = 2^order)",
-        .category = FIO_OPT_C_ENGINE,
-        .group  = FIO_OPT_G_FILENAME,
-    },
    {
        .name   = "single_primary",
        .lname  = "Single Primary",
@@ -150,8 +139,6 @@ static int sec_init(struct thread_data *td)
 {
    sec_options *o = (sec_options*)td->eo;
    sec_data *bsd = (sec_data*)td->io_ops_data;
-    bsd->block_order = o->block_order == 0 ? 17 : o->block_order;
-    bsd->block_size = 1 << o->block_order;

    struct sockaddr_in addr;
    int r;
--- a/http_client.cpp
+++ b/http_client.cpp
@@ -1,680 +0,0 @@
-#include <netinet/tcp.h>
-#include <sys/epoll.h>
-
-#include <net/if.h>
-#include <arpa/inet.h>
-#include <ifaddrs.h>
-
-#include <ctype.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <string.h>
-
-#include "json11/json11.hpp"
-#include "http_client.h"
-#include "timerfd_manager.h"
-
-#define READ_BUFFER_SIZE 9000
-
-static int extract_port(std::string & host);
-static std::string strtolower(const std::string & in);
-static std::string trim(const std::string & in);
-static std::string ws_format_frame(int type, uint64_t size);
-static bool ws_parse_frame(std::string & buf, int & type, std::string & res);
-
-// FIXME: Use keepalive
-struct http_co_t
-{
-    timerfd_manager_t *tfd;
-
-    int request_timeout = 0;
-    std::string host;
-    std::string request;
-    std::string ws_outbox;
-    std::string response;
-    bool want_streaming;
-
-    http_response_t parsed;
-    uint64_t target_response_size = 0;
-
-    int state = 0;
-    int peer_fd = -1;
-    int timeout_id = -1;
-    int epoll_events = 0;
-    int sent = 0;
-    std::vector<char> rbuf;
-    iovec read_iov, send_iov;
-    msghdr read_msg = { 0 }, send_msg = { 0 };
-
-    std::function<void(const http_response_t*)> callback;
-
-    websocket_t ws;
-
-    int onstack = 0;
-    bool ended = false;
-
-    ~http_co_t();
-    inline void stackin() { onstack++; }
-    inline void stackout() { onstack--; if (!onstack && ended) end(); }
-    inline void end() { ended = true; if (!onstack) { delete this; } }
-    void start_connection();
-    void handle_events();
-    void handle_connect_result();
-    void submit_read();
-    void submit_send();
-    bool handle_read();
-    void post_message(int type, const std::string & msg);
-};
-
-#define HTTP_CO_CONNECTING 1
-#define HTTP_CO_SENDING_REQUEST 2
-#define HTTP_CO_REQUEST_SENT 3
-#define HTTP_CO_HEADERS_RECEIVED 4
-#define HTTP_CO_WEBSOCKET 5
-#define HTTP_CO_CHUNKED 6
-
-#define DEFAULT_TIMEOUT 5000
-
-void http_request(timerfd_manager_t *tfd, const std::string & host, const std::string & request,
-    const http_options_t & options, std::function<void(const http_response_t *response)> callback)
-{
-    http_co_t *handler = new http_co_t();
-    handler->request_timeout = options.timeout < 0 ? 0 : (options.timeout == 0 ? DEFAULT_TIMEOUT : options.timeout);
-    handler->want_streaming = options.want_streaming;
-    handler->tfd = tfd;
-    handler->host = host;
-    handler->request = request;
-    handler->callback = callback;
-    handler->ws.co = handler;
-    handler->start_connection();
-}
-
-void http_request_json(timerfd_manager_t *tfd, const std::string & host, const std::string & request,
-    int timeout, std::function<void(std::string, json11::Json r)> callback)
-{
-    http_request(tfd, host, request, { .timeout = timeout }, [callback](const http_response_t* res)
-    {
-        if (res->error_code != 0)
-        {
-            callback("Error code: "+std::to_string(res->error_code)+" ("+std::string(strerror(res->error_code))+")", json11::Json());
-            return;
-        }
-        if (res->status_code != 200)
-        {
-            callback("HTTP "+std::to_string(res->status_code)+" "+res->status_line+" body: "+trim(res->body), json11::Json());
-            return;
-        }
-        std::string json_err;
-        json11::Json data = json11::Json::parse(res->body, json_err);
-        if (json_err != "")
-        {
-            callback("Bad JSON: "+json_err+" (response: "+trim(res->body)+")", json11::Json());
-            return;
-        }
-        callback(std::string(), data);
-    });
-}
-
-websocket_t* open_websocket(timerfd_manager_t *tfd, const std::string & host, const std::string & path,
-    int timeout, std::function<void(const http_response_t *msg)> callback)
-{
-    std::string request = "GET "+path+" HTTP/1.1\r\n"
-        "Host: "+host+"\r\n"
-        "Upgrade: websocket\r\n"
-        "Connection: upgrade\r\n"
-        "Sec-WebSocket-Key: x3JJHMbDL1EzLkh9GBhXDw==\r\n"
-        "Sec-WebSocket-Version: 13\r\n"
-        "\r\n";
-    http_co_t *handler = new http_co_t();
-    handler->request_timeout = timeout < 0 ? -1 : (timeout == 0 ? DEFAULT_TIMEOUT : timeout);
-    handler->want_streaming = false;
-    handler->tfd = tfd;
-    handler->host = host;
-    handler->request = request;
-    handler->callback = callback;
-    handler->ws.co = handler;
-    handler->start_connection();
-    return &handler->ws;
-}
-
-void websocket_t::post_message(int type, const std::string & msg)
-{
-    co->post_message(type, msg);
-}
-
-void websocket_t::close()
-{
-    co->end();
-}
-
-http_co_t::~http_co_t()
-{
-    if (timeout_id >= 0)
-    {
-        tfd->clear_timer(timeout_id);
-        timeout_id = -1;
-    }
-    if (peer_fd >= 0)
-    {
-        tfd->set_fd_handler(peer_fd, NULL);
-        close(peer_fd);
-        peer_fd = -1;
-    }
-    if (parsed.headers["transfer-encoding"] == "chunked")
-    {
-        int prev = 0, pos = 0;
-        while ((pos = response.find("\r\n", prev)) >= prev)
-        {
-            uint64_t len = strtoull(response.c_str()+prev, NULL, 16);
-            parsed.body += response.substr(pos+2, len);
-            prev = pos+2+len+2;
-        }
-    }
-    else
-    {
-        std::swap(parsed.body, response);
-    }
-    parsed.eof = true;
-    callback(&parsed);
-}
-
-void http_co_t::start_connection()
-{
-    stackin();
-    int port = extract_port(host);
-    struct sockaddr_in addr;
-    int r;
-    if ((r = inet_pton(AF_INET, host.c_str(), &addr.sin_addr)) != 1)
-    {
-        parsed.error_code = ENXIO;
-        stackout();
-        end();
-        return;
-    }
-    addr.sin_family = AF_INET;
-    addr.sin_port = htons(port ? port : 80);
-    peer_fd = socket(AF_INET, SOCK_STREAM, 0);
-    if (peer_fd < 0)
-    {
-        parsed.error_code = errno;
-        stackout();
-        end();
-        return;
-    }
-    fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
-    if (request_timeout > 0)
-    {
-        timeout_id = tfd->set_timer(request_timeout, false, [this](int timer_id)
-        {
-            if (response.length() == 0)
-            {
-                parsed.error_code = ETIME;
-            }
-            end();
-        });
-    }
-    epoll_events = 0;
-    // Finally call connect
-    r = ::connect(peer_fd, (sockaddr*)&addr, sizeof(addr));
-    if (r < 0 && errno != EINPROGRESS)
-    {
-        parsed.error_code = errno;
-        stackout();
-        end();
-        return;
-    }
-    tfd->set_fd_handler(peer_fd, [this](int peer_fd, int epoll_events)
-    {
-        this->epoll_events |= epoll_events;
-        handle_events();
-    });
-    state = HTTP_CO_CONNECTING;
-    stackout();
-}
-
-void http_co_t::handle_events()
-{
-    stackin();
-    while (epoll_events)
-    {
-        if (state == HTTP_CO_CONNECTING)
-        {
-            handle_connect_result();
-        }
-        else
-        {
-            epoll_events &= ~EPOLLOUT;
-            if (epoll_events & EPOLLIN)
-            {
-                submit_read();
-            }
-            else if (epoll_events & (EPOLLRDHUP|EPOLLERR))
-            {
-                end();
-                break;
-            }
-        }
-    }
-    stackout();
-}
-
-void http_co_t::handle_connect_result()
-{
-    stackin();
-    int result = 0;
-    socklen_t result_len = sizeof(result);
-    if (getsockopt(peer_fd, SOL_SOCKET, SO_ERROR, &result, &result_len) < 0)
-    {
-        result = errno;
-    }
-    if (result != 0)
-    {
-        parsed.error_code = result;
-        stackout();
-        end();
-        return;
-    }
-    int one = 1;
-    setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
-    state = HTTP_CO_SENDING_REQUEST;
-    submit_send();
-    stackout();
-}
-
-void http_co_t::submit_read()
-{
-    stackin();
-    int res;
-    if (rbuf.size() != READ_BUFFER_SIZE)
-    {
-        rbuf.resize(READ_BUFFER_SIZE);
-    }
-    read_iov = { .iov_base = rbuf.data(), .iov_len = READ_BUFFER_SIZE };
-    read_msg.msg_iov = &read_iov;
-    read_msg.msg_iovlen = 1;
-    res = recvmsg(peer_fd, &read_msg, 0);
-    if (res < 0)
-    {
-        res = -errno;
-    }
-    if (res == -EAGAIN || res == 0)
-    {
-        epoll_events = epoll_events & ~EPOLLIN;
-    }
-    else if (res < 0)
-    {
-        end();
-    }
-    else if (res > 0)
-    {
-        response += std::string(rbuf.data(), res);
-        handle_read();
-    }
-    stackout();
-}
-
-void http_co_t::submit_send()
-{
-    stackin();
-    int res;
-again:
-    if (sent < request.size())
-    {
-        send_iov = (iovec){ .iov_base = (void*)(request.c_str()+sent), .iov_len = request.size()-sent };
-        send_msg.msg_iov = &send_iov;
-        send_msg.msg_iovlen = 1;
-        res = sendmsg(peer_fd, &send_msg, MSG_NOSIGNAL);
-        if (res < 0)
-        {
-            res = -errno;
-        }
-        if (res == -EAGAIN)
-        {
-            res = 0;
-        }
-        else if (res < 0)
-        {
-            stackout();
-            end();
-            return;
-        }
-        sent += res;
-        if (state == HTTP_CO_SENDING_REQUEST)
-        {
-            if (sent >= request.size())
-            {
-                state = HTTP_CO_REQUEST_SENT;
-            }
-            else
-                goto again;
-        }
-        else if (state == HTTP_CO_WEBSOCKET)
-        {
-            request = request.substr(sent);
-            sent = 0;
-            goto again;
-        }
-    }
-    stackout();
-}
-
-bool http_co_t::handle_read()
-{
-    stackin();
-    if (state == HTTP_CO_REQUEST_SENT)
-    {
-        int pos = response.find("\r\n\r\n");
-        if (pos >= 0)
-        {
-            if (timeout_id >= 0)
-            {
-                tfd->clear_timer(timeout_id);
-                timeout_id = -1;
-            }
-            state = HTTP_CO_HEADERS_RECEIVED;
-            parse_http_headers(response, &parsed);
-            if (parsed.status_code == 101 &&
-                parsed.headers.find("sec-websocket-accept") != parsed.headers.end() &&
-                parsed.headers["upgrade"] == "websocket" &&
-                parsed.headers["connection"] == "upgrade")
-            {
-                // Don't care about validating the key
-                state = HTTP_CO_WEBSOCKET;
-                request = ws_outbox;
-                ws_outbox = "";
-                sent = 0;
-                submit_send();
-            }
-            else if (parsed.headers["transfer-encoding"] == "chunked")
-            {
-                state = HTTP_CO_CHUNKED;
-            }
-            else if (parsed.headers["connection"] != "close")
-            {
-                target_response_size = stoull_full(parsed.headers["content-length"]);
-                if (!target_response_size)
-                {
-                    // Sorry, unsupported response
-                    stackout();
-                    end();
-                    return false;
-                }
-            }
-        }
-    }
-    if (state == HTTP_CO_HEADERS_RECEIVED && target_response_size > 0 && response.size() >= target_response_size)
-    {
-        stackout();
-        end();
-        return false;
-    }
-    if (state == HTTP_CO_CHUNKED && response.size() > 0)
-    {
-        int prev = 0, pos = 0;
-        while ((pos = response.find("\r\n", prev)) >= prev)
-        {
-            uint64_t len = strtoull(response.c_str()+prev, NULL, 16);
-            if (!len)
-            {
-                // Zero length chunk indicates EOF
-                parsed.eof = true;
-                break;
-            }
-            if (response.size() < pos+2+len+2)
-            {
-                break;
-            }
-            parsed.body += response.substr(pos+2, len);
-            prev = pos+2+len+2;
-        }
-        if (prev > 0)
-        {
-            response = response.substr(prev);
-        }
-        if (parsed.eof)
-        {
-            stackout();
-            end();
-            return false;
-        }
-        if (want_streaming && parsed.body.size() > 0)
-        {
-            callback(&parsed);
-            parsed.body = "";
-        }
-    }
-    if (state == HTTP_CO_WEBSOCKET && response.size() > 0)
-    {
-        while (ws_parse_frame(response, parsed.ws_msg_type, parsed.body))
-        {
-            callback(&parsed);
-            parsed.body = "";
-        }
-    }
-    stackout();
-    return true;
-}
-
-void http_co_t::post_message(int type, const std::string & msg)
-{
-    stackin();
-    if (state == HTTP_CO_WEBSOCKET)
-    {
-        request += ws_format_frame(type, msg.size());
-        request += msg;
-        submit_send();
-    }
-    else
-    {
-        ws_outbox += ws_format_frame(type, msg.size());
-        ws_outbox += msg;
-    }
-    stackout();
-}
-
-uint64_t stoull_full(const std::string & str, int base)
-{
-    if (isspace(str[0]))
-    {
-        return 0;
-    }
-    char *end = NULL;
-    uint64_t r = strtoull(str.c_str(), &end, base);
-    if (end != str.c_str()+str.length())
-    {
-        return 0;
-    }
-    return r;
-}
-
-void parse_http_headers(std::string & res, http_response_t *parsed)
-{
-    int pos = res.find("\r\n");
-    pos = pos < 0 ? res.length() : pos+2;
-    std::string status_line = res.substr(0, pos);
-    int http_version;
-    char *status_text = NULL;
-    sscanf(status_line.c_str(), "HTTP/1.%d %d %ms", &http_version, &parsed->status_code, &status_text);
-    if (status_text)
-    {
-        parsed->status_line = status_text;
-        // %ms = allocate a buffer
-        free(status_text);
-        status_text = NULL;
-    }
-    int prev = pos;
-    while ((pos = res.find("\r\n", prev)) >= prev)
-    {
-        if (pos == prev)
-        {
-            res = res.substr(pos+2);
-            break;
-        }
-        std::string header = res.substr(prev, pos-prev);
-        int p2 = header.find(":");
-        if (p2 >= 0)
-        {
-            std::string key = strtolower(header.substr(0, p2));
-            int p3 = p2+1;
-            while (p3 < header.length() && isblank(header[p3]))
-                p3++;
-            parsed->headers[key] = key == "connection" || key == "upgrade" || key == "transfer-encoding"
-                ? strtolower(header.substr(p3)) : header.substr(p3);
-        }
-        prev = pos+2;
-    }
-}
-
-static std::string ws_format_frame(int type, uint64_t size)
-{
-    // Always zero mask
-    std::string res;
-    int p = 0;
-    res.resize(2 + (size >= 126 ? 2 : 0) + (size >= 65536 ? 6 : 0) + /*mask*/4);
-    res[p++] = 0x80 | type;
-    if (size < 126)
-        res[p++] = size | /*mask*/0x80;
-    else if (size < 65536)
-    {
-        res[p++] = 126 | /*mask*/0x80;
-        res[p++] = (size >> 8) & 0xFF;
-        res[p++] = (size >> 0) & 0xFF;
-    }
-    else
-    {
-        res[p++] = 127 | /*mask*/0x80;
-        res[p++] = (size >> 56) & 0xFF;
-        res[p++] = (size >> 48) & 0xFF;
-        res[p++] = (size >> 40) & 0xFF;
-        res[p++] = (size >> 32) & 0xFF;
-        res[p++] = (size >> 24) & 0xFF;
-        res[p++] = (size >> 16) & 0xFF;
-        res[p++] = (size >>  8) & 0xFF;
-        res[p++] = (size >>  0) & 0xFF;
-    }
-    res[p++] = 0;
-    res[p++] = 0;
-    res[p++] = 0;
-    res[p++] = 0;
-    return res;
-}
-
-static bool ws_parse_frame(std::string & buf, int & type, std::string & res)
-{
-    uint64_t hdr = 2;
-    if (buf.size() < hdr)
-    {
-        return false;
-    }
-    type = buf[0] & ~0x80;
-    bool mask = !!(buf[1] & 0x80);
-    hdr += mask ? 4 : 0;
-    uint64_t len = ((uint8_t)buf[1] & ~0x80);
-    if (len == 126)
-    {
-        hdr += 2;
-        if (buf.size() < hdr)
-        {
-            return false;
-        }
-        len = ((uint64_t)(uint8_t)buf[2] << 8) | ((uint64_t)(uint8_t)buf[3] << 0);
-    }
-    else if (len == 127)
-    {
-        hdr += 8;
-        if (buf.size() < hdr)
-        {
-            return false;
-        }
-        len = ((uint64_t)(uint8_t)buf[2] << 56) |
-            ((uint64_t)(uint8_t)buf[3] << 48) |
-            ((uint64_t)(uint8_t)buf[4] << 40) |
-            ((uint64_t)(uint8_t)buf[5] << 32) |
-            ((uint64_t)(uint8_t)buf[6] << 24) |
-            ((uint64_t)(uint8_t)buf[7] << 16) |
-            ((uint64_t)(uint8_t)buf[8] << 8) |
-            ((uint64_t)(uint8_t)buf[9] << 0);
-    }
-    if (buf.size() < hdr+len)
-    {
-        return false;
-    }
-    if (mask)
-    {
-        for (int i = 0; i < len; i++)
-            buf[hdr+i] ^= buf[hdr-4+(i & 3)];
-    }
-    res += buf.substr(hdr, len);
-    buf = buf.substr(hdr+len);
-    return true;
-}
-
-std::vector<std::string> getifaddr_list(bool include_v6)
-{
-    std::vector<std::string> addresses;
-    ifaddrs *list, *ifa;
-    if (getifaddrs(&list) == -1)
-    {
-        throw std::runtime_error(std::string("getifaddrs: ") + strerror(errno));
-    }
-    for (ifa = list; ifa != NULL; ifa = ifa->ifa_next)
-    {
-        if (!ifa->ifa_addr)
-        {
-            continue;
-        }
-        int family = ifa->ifa_addr->sa_family;
-        if ((family == AF_INET || family == AF_INET6 && include_v6) &&
-            (ifa->ifa_flags & (IFF_UP | IFF_RUNNING | IFF_LOOPBACK)) == (IFF_UP | IFF_RUNNING))
-        {
-            void *addr_ptr;
-            if (family == AF_INET)
-                addr_ptr = &((sockaddr_in *)ifa->ifa_addr)->sin_addr;
-            else
-                addr_ptr = &((sockaddr_in6 *)ifa->ifa_addr)->sin6_addr;
-            char addr[INET6_ADDRSTRLEN];
-            if (!inet_ntop(family, addr_ptr, addr, INET6_ADDRSTRLEN))
-            {
-                throw std::runtime_error(std::string("inet_ntop: ") + strerror(errno));
-            }
-            addresses.push_back(std::string(addr));
-        }
-    }
-    freeifaddrs(list);
-    return addresses;
-}
-
-static int extract_port(std::string & host)
-{
-    int port = 0;
-    int pos = 0;
-    if ((pos = host.find(':')) >= 0)
-    {
-        port = strtoull(host.c_str() + pos + 1, NULL, 10);
-        if (port >= 0x10000)
-        {
-            port = 0;
-        }
-        host = host.substr(0, pos);
-    }
-    return port;
-}
-
-static std::string strtolower(const std::string & in)
-{
-    std::string s = in;
-    for (int i = 0; i < s.length(); i++)
-    {
-        s[i] = tolower(s[i]);
-    }
-    return s;
-}
-
-static std::string trim(const std::string & in)
-{
-    int begin = in.find_first_not_of(" \n\r\t");
-    if (begin == -1)
-        return "";
-    int end = in.find_last_not_of(" \n\r\t");
-    return in.substr(begin, end+1-begin);
-}
--- a/http_client.h
+++ b/http_client.h
@@ -1,56 +0,0 @@
-#pragma once
-#include <string>
-#include <vector>
-#include <map>
-#include <functional>
-#include "json11/json11.hpp"
-
-#define WS_CONTINUATION 0
-#define WS_TEXT 1
-#define WS_BINARY 2
-#define WS_CLOSE 8
-#define WS_PING 9
-#define WS_PONG 10
-
-class timerfd_manager_t;
-
-struct http_options_t
-{
-    int timeout;
-    bool want_streaming;
-};
-
-struct http_response_t
-{
-    bool eof = false;
-    int error_code = 0;
-    int status_code = 0;
-    std::string status_line;
-    std::map<std::string, std::string> headers;
-    int ws_msg_type = -1;
-    std::string body;
-};
-
-struct http_co_t;
-
-struct websocket_t
-{
-    http_co_t *co;
-    void post_message(int type, const std::string & msg);
-    void close();
-};
-
-void parse_http_headers(std::string & res, http_response_t *parsed);
-
-std::vector<std::string> getifaddr_list(bool include_v6 = false);
-
-uint64_t stoull_full(const std::string & str, int base = 10);
-
-void http_request(timerfd_manager_t *tfd, const std::string & host, const std::string & request,
-    const http_options_t & options, std::function<void(const http_response_t *response)> callback);
-
-void http_request_json(timerfd_manager_t *tfd, const std::string & host, const std::string & request,
-    int timeout, std::function<void(std::string, json11::Json r)> callback);
-
-websocket_t* open_websocket(timerfd_manager_t *tfd, const std::string & host, const std::string & path,
-    int timeout, std::function<void(const http_response_t *msg)> callback);
--- a/lp/lp-optimizer.js
+++ b/lp/lp-optimizer.js
@@ -1,521 +0,0 @@
-// Data distribution optimizer using linear programming (lp_solve)
-
-const child_process = require('child_process');
-
-const NO_OSD = 'Z';
-
-async function lp_solve(text)
-{
-    const cp = child_process.spawn('lp_solve');
-    let stdout = '', stderr = '', finish_cb;
-    cp.stdout.on('data', buf => stdout += buf.toString());
-    cp.stderr.on('data', buf => stderr += buf.toString());
-    cp.on('exit', () => finish_cb && finish_cb());
-    cp.stdin.write(text);
-    cp.stdin.end();
-    if (cp.exitCode == null)
-    {
-        await new Promise(ok => finish_cb = ok);
-    }
-    if (!stdout.trim())
-    {
-        return null;
-    }
-    let score = 0;
-    let vars = {};
-    for (const line of stdout.split(/\n/))
-    {
-        let m = /^(^Value of objective function: ([\d\.]+)|Actual values of the variables:)\s*$/.exec(line);
-        if (m)
-        {
-            if (m[2])
-            {
-                score = m[2];
-            }
-            continue;
-        }
-        else if (/This problem is (infeasible|unbounded)/.exec(line))
-        {
-            return null;
-        }
-        let [ k, v ] = line.trim().split(/\s+/, 2);
-        if (v)
-        {
-            vars[k] = v;
-        }
-    }
-    return { score, vars };
-}
-
-async function optimize_initial(osd_tree, pg_count, max_combinations)
-{
-    max_combinations = max_combinations || 10000;
-    const all_weights = Object.assign({}, ...Object.values(osd_tree));
-    const total_weight = Object.values(all_weights).reduce((a, c) => Number(a) + Number(c), 0);
-    let all_pgs = all_combinations(osd_tree, null, true);
-    if (all_pgs.length > max_combinations)
-    {
-        const prob = max_combinations/all_pgs.length;
-        all_pgs = all_pgs.filter(pg => Math.random() < prob);
-    }
-    const pg_per_osd = {};
-    for (const pg of all_pgs)
-    {
-        for (const osd of pg)
-        {
-            pg_per_osd[osd] = pg_per_osd[osd] || [];
-            pg_per_osd[osd].push("pg_"+pg.join("_"));
-        }
-    }
-    const pg_size = Math.min(Object.keys(osd_tree).length, 3);
-    let lp = '';
-    lp += "max: "+all_pgs.map(pg => 'pg_'+pg.join('_')).join(' + ')+";\n";
-    for (const osd in pg_per_osd)
-    {
-        if (osd !== NO_OSD)
-        {
-            let osd_pg_count = all_weights[osd]/total_weight*pg_size*pg_count;
-            lp += pg_per_osd[osd].join(' + ')+' <= '+osd_pg_count+';\n';
-        }
-    }
-    for (const pg of all_pgs)
-    {
-        lp += 'pg_'+pg.join('_')+" >= 0;\n";
-    }
-    lp += "sec "+all_pgs.map(pg => 'pg_'+pg.join('_')).join(', ')+";\n";
-    const lp_result = await lp_solve(lp);
-    if (!lp_result)
-    {
-        throw new Error('Problem is infeasible or unbounded - is it a bug?');
-    }
-    const int_pgs = make_int_pgs(lp_result.vars, pg_count);
-    const eff = pg_list_space_efficiency(int_pgs, all_weights);
-    return { score: lp_result.score, weights: lp_result.vars, int_pgs, space: eff*pg_size, total_space: total_weight };
-}
-
-function make_int_pgs(weights, pg_count)
-{
-    const total_weight = Object.values(weights).reduce((a, c) => Number(a) + Number(c), 0);
-    let int_pgs = [];
-    let pg_left = pg_count;
-    let weight_left = total_weight;
-    for (const pg_name in weights)
-    {
-        let n = Math.round(weights[pg_name] / weight_left * pg_left);
-        for (let i = 0; i < n; i++)
-        {
-            int_pgs.push(pg_name.substr(3).split('_'));
-        }
-        weight_left -= weights[pg_name];
-        pg_left -= n;
-    }
-    return int_pgs;
-}
-
-// Try to minimize data movement
-async function optimize_change(prev_int_pgs, osd_tree, max_combinations)
-{
-    max_combinations = max_combinations || 10000;
-    const pg_size = Math.min(Object.keys(osd_tree).length, 3);
-    const pg_count = prev_int_pgs.length;
-    const prev_weights = {};
-    const prev_pg_per_osd = {};
-    for (const pg of prev_int_pgs)
-    {
-        const pg_name = 'pg_'+pg.join('_');
-        prev_weights[pg_name] = (prev_weights[pg_name]||0) + 1;
-        for (const osd of pg)
-        {
-            prev_pg_per_osd[osd] = prev_pg_per_osd[osd] || [];
-            prev_pg_per_osd[osd].push(pg_name);
-        }
-    }
-    // Get all combinations
-    let all_pgs = all_combinations(osd_tree, null, true);
-    if (all_pgs.length > max_combinations)
-    {
-        const intersecting = all_pgs.filter(pg => prev_weights['pg_'+pg.join('_')]);
-        if (intersecting.length > max_combinations)
-        {
-            const prob = max_combinations/intersecting.length;
-            all_pgs = intersecting.filter(pg => Math.random() < prob);
-        }
-        else
-        {
-            const prob = (max_combinations-intersecting.length)/all_pgs.length;
-            all_pgs = all_pgs.filter(pg => Math.random() < prob || prev_weights['pg_'+pg.join('_')]);
-        }
-    }
-    const pg_per_osd = {};
-    for (const pg of all_pgs)
-    {
-        const pg_name = 'pg_'+pg.join('_');
-        for (const osd of pg)
-        {
-            pg_per_osd[osd] = pg_per_osd[osd] || [];
-            pg_per_osd[osd].push(pg_name);
-        }
-    }
-    // Penalize PGs based on their similarity to old PGs
-    const intersect = {};
-    for (const pg_name in prev_weights)
-    {
-        const pg = pg_name.substr(3).split(/_/);
-        intersect[pg[0]+'::'] = intersect[':'+pg[1]+':'] = intersect['::'+pg[2]] = 2;
-        intersect[pg[0]+'::'+pg[2]] = intersect[':'+pg[1]+':'+pg[2]] = intersect[pg[0]+':'+pg[1]+':'] = 1;
-    }
-    const move_weights = {};
-    for (const pg of all_pgs)
-    {
-        move_weights['pg_'+pg.join('_')] =
-            intersect[pg[0]+'::'+pg[2]] || intersect[':'+pg[1]+':'+pg[2]] || intersect[pg[0]+':'+pg[1]+':'] ||
-            intersect[pg[0]+'::'] || intersect[':'+pg[1]+':'] || intersect['::'+pg[2]] ||
-            3;
-    }
-    // Calculate total weight - old PG weights
-    const all_pg_names = all_pgs.map(pg => 'pg_'+pg.join('_'));
-    const all_weights = Object.assign({}, ...Object.values(osd_tree));
-    const total_weight = Object.values(all_weights).reduce((a, c) => Number(a) + Number(c), 0);
-    // Generate the LP problem
-    let lp = '';
-    lp += 'max: '+all_pg_names.map(pg_name => (
-        prev_weights[pg_name] ? `${4-move_weights[pg_name]}*add_${pg_name} - 4*del_${pg_name}` : `${4-move_weights[pg_name]}*${pg_name}`
-    )).join(' + ')+';\n';
-    for (const osd in pg_per_osd)
-    {
-        if (osd !== NO_OSD)
-        {
-            const osd_sum = (pg_per_osd[osd]||[]).map(pg_name => prev_weights[pg_name] ? `add_${pg_name} - del_${pg_name}` : pg_name).join(' + ');
-            const rm_osd_pg_count = (prev_pg_per_osd[osd]||[]).filter(old_pg_name => move_weights[old_pg_name]).length;
-            let osd_pg_count = all_weights[osd]*3/total_weight*pg_count - rm_osd_pg_count;
-            lp += osd_sum + ' <= ' + osd_pg_count + ';\n';
-        }
-    }
-    let pg_vars = [];
-    for (const pg_name of all_pg_names)
-    {
-        if (prev_weights[pg_name])
-        {
-            pg_vars.push(`add_${pg_name}`, `del_${pg_name}`);
-            // Can't add or remove less than zero
-            lp += `add_${pg_name} >= 0;\n`;
-            lp += `del_${pg_name} >= 0;\n`;
-            // Can't remove more than the PG already has
-            lp += `add_${pg_name} - del_${pg_name} >= -${prev_weights[pg_name]};\n`;
-        }
-        else
-        {
-            pg_vars.push(pg_name);
-            lp += `${pg_name} >= 0;\n`;
-        }
-    }
-    lp += 'sec '+pg_vars.join(', ')+';\n';
-    // Solve it
-    const lp_result = await lp_solve(lp);
-    if (!lp_result)
-    {
-        console.log(lp);
-        throw new Error('Problem is infeasible or unbounded - is it a bug?');
-    }
-    // Generate the new distribution
-    const weights = { ...prev_weights };
-    for (const k in prev_weights)
-    {
-        if (!move_weights[k])
-        {
-            delete weights[k];
-        }
-    }
-    for (const k in lp_result.vars)
-    {
-        if (k.substr(0, 4) === 'add_')
-        {
-            weights[k.substr(4)] = (weights[k.substr(4)] || 0) + Number(lp_result.vars[k]);
-        }
-        else if (k.substr(0, 4) === 'del_')
-        {
-            weights[k.substr(4)] = (weights[k.substr(4)] || 0) - Number(lp_result.vars[k]);
-        }
-        else
-        {
-            weights[k] = Number(lp_result.vars[k]);
-        }
-    }
-    for (const k in weights)
-    {
-        if (!weights[k])
-        {
-            delete weights[k];
-        }
-    }
-    const int_pgs = make_int_pgs(weights, pg_count);
-    // Align them with most similar previous PGs
-    const new_pgs = align_pgs(prev_int_pgs, int_pgs);
-    let differs = 0, osd_differs = 0;
-    for (let i = 0; i < pg_count; i++)
-    {
-        if (new_pgs[i].join('_') != prev_int_pgs[i].join('_'))
-        {
-            differs++;
-        }
-        for (let j = 0; j < 3; j++)
-        {
-            if (new_pgs[i][j] != prev_int_pgs[i][j])
-            {
-                osd_differs++;
-            }
-        }
-    }
-    return {
-        prev_pgs: prev_int_pgs,
-        score: lp_result.score,
-        weights,
-        int_pgs: new_pgs,
-        differs,
-        osd_differs,
-        space: pg_size * pg_list_space_efficiency(new_pgs, all_weights),
-        total_space: total_weight,
-    };
-}
-
-function print_change_stats(retval, detailed)
-{
-    const new_pgs = retval.int_pgs;
-    const prev_int_pgs = retval.prev_pgs;
-    if (prev_int_pgs)
-    {
-        if (detailed)
-        {
-            for (let i = 0; i < new_pgs.length; i++)
-            {
-                if (new_pgs[i].join('_') != prev_int_pgs[i].join('_'))
-                {
-                    console.log("pg "+i+": "+prev_int_pgs[i].join(' ')+" -> "+new_pgs[i].join(' '));
-                }
-            }
-        }
-        console.log(
-            "Data movement: "+retval.differs+" pgs, "+
-            retval.osd_differs+" pg*osds = "+Math.round(retval.osd_differs / prev_int_pgs.length / 3 * 10000)/100+" %"
-        );
-    }
-    console.log(
-        "Total space (raw): "+Math.round(retval.space*100)/100+" TB, space efficiency: "+
-        Math.round(retval.space/(retval.total_space||1)*10000)/100+" %"
-    );
-}
-
-function align_pgs(prev_int_pgs, int_pgs)
-{
-    const aligned_pgs = [];
-    put_aligned_pgs(aligned_pgs, int_pgs, prev_int_pgs, (pg) => [ pg.join(':') ]);
-    put_aligned_pgs(aligned_pgs, int_pgs, prev_int_pgs, (pg) => [ pg[0]+'::'+pg[2], ':'+pg[1]+':'+pg[2], pg[0]+':'+pg[1]+':' ]);
-    put_aligned_pgs(aligned_pgs, int_pgs, prev_int_pgs, (pg) => [ pg[0]+'::', ':'+pg[1]+':', '::'+pg[2] ]);
-    const free_slots = prev_int_pgs.map((pg, i) => !aligned_pgs[i] ? i : null).filter(i => i != null);
-    for (const pg of int_pgs)
-    {
-        if (!free_slots.length)
-        {
-            throw new Error("Can't place unaligned PG");
-        }
-        aligned_pgs[free_slots.shift()] = pg;
-    }
-    return aligned_pgs;
-}
-
-function put_aligned_pgs(aligned_pgs, int_pgs, prev_int_pgs, keygen)
-{
-    let prev_indexes = {};
-    for (let i = 0; i < prev_int_pgs.length; i++)
-    {
-        for (let k of keygen(prev_int_pgs[i]))
-        {
-            prev_indexes[k] = prev_indexes[k] || [];
-            prev_indexes[k].push(i);
-        }
-    }
-    PG: for (let i = int_pgs.length-1; i >= 0; i--)
-    {
-        let pg = int_pgs[i];
-        let keys = keygen(int_pgs[i]);
-        for (let k of keys)
-        {
-            while (prev_indexes[k] && prev_indexes[k].length)
-            {
-                let idx = prev_indexes[k].shift();
-                if (!aligned_pgs[idx])
-                {
-                    aligned_pgs[idx] = pg;
-                    int_pgs.splice(i, 1);
-                    continue PG;
-                }
-            }
-        }
-    }
-}
-
-// Convert multi-level osd_tree = { level: number|string, id?: string, size?: number, children?: osd_tree }[]
-// levels = { string: number }
-// to a two-level osd_tree suitable for all_combinations()
-function flatten_tree(osd_tree, levels, failure_domain_level, osd_level, domains = {}, i = { i: 1 })
-{
-    osd_level = levels[osd_level] || osd_level;
-    failure_domain_level = levels[failure_domain_level] || failure_domain_level;
-    for (const node of osd_tree)
-    {
-        if ((levels[node.level] || node.level) < failure_domain_level)
-        {
-            flatten_tree(node.children||[], levels, failure_domain_level, osd_level, domains, i);
-        }
-        else
-        {
-            domains['dom'+(i.i++)] = extract_osds([ node ], levels, osd_level);
-        }
-    }
-    return domains;
-}
-
-function extract_osds(osd_tree, levels, osd_level, osds = {})
-{
-    for (const node of osd_tree)
-    {
-        if ((levels[node.level] || node.level) >= osd_level)
-        {
-            osds[node.id] = node.size;
-        }
-        else
-        {
-            extract_osds(node.children||[], levels, osd_level, osds);
-        }
-    }
-    return osds;
-}
-
-// FIXME: support different pg_sizes, not just 3
-// osd_tree = { failure_domain1: { osd1: size1, ... }, ... }
-function all_combinations(osd_tree, count, ordered)
-{
-    const hosts = Object.keys(osd_tree).sort();
-    const osds = Object.keys(osd_tree).reduce((a, c) => { a[c] = Object.keys(osd_tree[c]).sort(); return a; }, {});
-    while (hosts.length < 3)
-    {
-        osds[NO_OSD] = [ NO_OSD ];
-        hosts.push(NO_OSD);
-    }
-    let host_idx = [ 0, 1, 2 ];
-    let osd_idx = [ 0, 0, 0 ];
-    const r = [];
-    while (!count || count < 0 || r.length < count)
-    {
-        let inc;
-        if (host_idx[2] != host_idx[1] && host_idx[2] != host_idx[0] && host_idx[1] != host_idx[0])
-        {
-            r.push(host_idx.map((hi, i) => osds[hosts[hi]][osd_idx[i]]));
-            inc = 2;
-            while (inc >= 0)
-            {
-                osd_idx[inc]++;
-                if (osd_idx[inc] >= osds[hosts[host_idx[inc]]].length)
-                {
-                    osd_idx[inc] = 0;
-                    inc--;
-                }
-                else
-                {
-                    break;
-                }
-            }
-        }
-        else
-        {
-            inc = -1;
-        }
-        if (inc < 0)
-        {
-            // no osds left in current host combination, select the next one
-            osd_idx = [ 0, 0, 0 ];
-            host_idx[2]++;
-            if (host_idx[2] >= hosts.length)
-            {
-                host_idx[1]++;
-                host_idx[2] = ordered ? host_idx[1]+1 : 0;
-                if ((ordered ? host_idx[2] : host_idx[1]) >= hosts.length)
-                {
-                    host_idx[0]++;
-                    host_idx[1] = ordered ? host_idx[0]+1 : 0;
-                    host_idx[2] = ordered ? host_idx[1]+1 : 0;
-                    if ((ordered ? host_idx[2] : host_idx[0]) >= hosts.length)
-                    {
-                        break;
-                    }
-                }
-            }
-        }
-    }
-    return r;
-}
-
-function pg_weights_space_efficiency(weights, pg_count, osd_sizes)
-{
-    const per_osd = {};
-    for (const pg_name in weights)
-    {
-        for (const osd of pg_name.substr(3).split(/_/))
-        {
-            per_osd[osd] = (per_osd[osd]||0) + weights[pg_name];
-        }
-    }
-    return pg_per_osd_space_efficiency(per_osd, pg_count, osd_sizes);
-}
-
-function pg_list_space_efficiency(pgs, osd_sizes)
-{
-    const per_osd = {};
-    for (const pg of pgs)
-    {
-        for (const osd of pg)
-        {
-            per_osd[osd] = (per_osd[osd]||0) + 1;
-        }
-    }
-    return pg_per_osd_space_efficiency(per_osd, pgs.length, osd_sizes);
-}
-
-function pg_per_osd_space_efficiency(per_osd, pg_count, osd_sizes)
-{
-    // each PG gets randomly selected in 1/N cases
-    // & there are x PGs per OSD
-    // => an OSD is selected in x/N cases
-    // => total space * x/N <= OSD size
-    // => total space <= OSD size * N/x
-    let space;
-    for (let osd in per_osd)
-    {
-        if (osd in osd_sizes)
-        {
-            const space_estimate = osd_sizes[osd] * pg_count / per_osd[osd];
-            if (space == null || space > space_estimate)
-            {
-                space = space_estimate;
-            }
-        }
-    }
-    return space == null ? 0 : space;
-}
-
-module.exports = {
-    NO_OSD,
-
-    optimize_initial,
-    optimize_change,
-    print_change_stats,
-    pg_weights_space_efficiency,
-    pg_list_space_efficiency,
-    pg_per_osd_space_efficiency,
-    flatten_tree,
-
-    lp_solve,
-    make_int_pgs,
-    align_pgs,
-    all_combinations,
-};
--- a/lp/mon-main.js
+++ b/lp/mon-main.js
@@ -1,22 +0,0 @@
-#!/usr/bin/node
-
-const Mon = require('./mon.js');
-
-const options = {};
-
-for (let i = 2; i < process.argv.length; i++)
-{
-    if (process.argv[i].substr(0, 2) == '--')
-    {
-        options[process.argv[i].substr(2)] = process.argv[i+1];
-        i++;
-    }
-}
-
-if (!options.etcd_url)
-{
-    console.error('USAGE: '+process.argv[0]+' '+process.argv[1]+' --etcd_url "http://127.0.0.1:2379,..." --etcd_prefix "/rage" --etcd_start_timeout 5');
-    process.exit();
-}
-
-new Mon(options).start();
--- a/lp/mon.js
+++ b/lp/mon.js
@@ -1,858 +0,0 @@
-const http = require('http');
-const os = require('os');
-const WebSocket = require('ws');
-const LPOptimizer = require('./lp-optimizer.js');
-const stableStringify = require('./stable-stringify.js');
-
-class Mon
-{
-    static etcd_tree = {
-        config: {
-            global: null,
-            /* placement_tree = {
-                levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... },
-                nodes: { host1: { level: 'host', parent: 'rack1' }, ... },
-                failure_domain: 'host',
-            } */
-            placement_tree: null,
-            osd: {},
-            pgs: {},
-        },
-        osd: {
-            state: {},
-            stats: {},
-        },
-        mon: {
-            master: null,
-        },
-        pg: {
-            change_stamp: null,
-            state: {},
-            stats: {},
-            history: {},
-        },
-    }
-
-    constructor(config)
-    {
-        // FIXME: Maybe prefer local etcd
-        this.etcd_urls = [];
-        for (let url of config.etcd_url.split(/,/))
-        {
-            let scheme = 'http';
-            url = url.trim().replace(/^(https?):\/\//, (m, m1) => { scheme = m1; return ''; });
-            if (!/\/[^\/]/.exec(url))
-                url += '/v3';
-            this.etcd_urls.push(scheme+'://'+url);
-        }
-        this.etcd_prefix = config.etcd_prefix || '/rage';
-        this.etcd_prefix = this.etcd_prefix.replace(/\/\/+/g, '/').replace(/^\/?(.*[^\/])\/?$/, '/$1');
-        this.etcd_start_timeout = (config.etcd_start_timeout || 5) * 1000;
-        this.state = JSON.parse(JSON.stringify(Mon.etcd_tree));
-    }
-
-    async start()
-    {
-        await this.load_config();
-        await this.get_lease();
-        await this.become_master();
-        await this.load_cluster_state();
-        await this.start_watcher();
-        await this.recheck_pgs();
-    }
-
-    async load_config()
-    {
-        const res = await this.etcd_call('/txn', { success: [
-            { requestRange: { key: b64(this.etcd_prefix+'/config/global') } }
-        ] }, this.etcd_start_timeout, -1);
-        this.parse_kv(res.responses[0].response_range.kvs[0]);
-        this.check_config();
-    }
-
-    check_config()
-    {
-        this.config.etcd_mon_timeout = Number(this.config.etcd_mon_timeout) || 0;
-        if (this.config.etcd_mon_timeout <= 0)
-        {
-            this.config.etcd_mon_timeout = 1000;
-        }
-        this.config.etcd_mon_retries = Number(this.config.etcd_mon_retries) || 5;
-        if (this.config.etcd_mon_retries < 0)
-        {
-            this.config.etcd_mon_retries = 0;
-        }
-        this.config.mon_change_timeout = Number(this.config.mon_change_timeout) || 1000;
-        if (this.config.mon_change_timeout < 100)
-        {
-            this.config.mon_change_timeout = 100;
-        }
-        this.config.mon_stats_timeout = Number(this.config.mon_stats_timeout) || 1000;
-        if (this.config.mon_stats_timeout < 100)
-        {
-            this.config.mon_stats_timeout = 100;
-        }
-        // After this number of seconds, a dead OSD will be removed from PG distribution
-        this.config.osd_out_time = Number(this.config.osd_out_time) || 0;
-        if (!this.config.osd_out_time)
-        {
-            this.config.osd_out_time = 30*60; // 30 minutes by default
-        }
-        this.config.max_osd_combinations = Number(this.config.max_osd_combinations) || 10000;
-        if (this.config.max_osd_combinations < 100)
-        {
-            this.config.max_osd_combinations = 100;
-        }
-    }
-
-    async start_watcher(retries)
-    {
-        let retry = 0;
-        if (retries >= 0 && retries < 1)
-        {
-            retries = 1;
-        }
-        while (retries < 0 || retry < retries)
-        {
-            const base = 'ws'+this.etcd_urls[Math.floor(Math.random()*this.etcd_urls.length)].substr(4);
-            const ok = await new Promise((ok, no) =>
-            {
-                const timer_id = setTimeout(() =>
-                {
-                    this.ws.close();
-                    ok(false);
-                }, timeout);
-                this.ws = new WebSocket(base+'/watch');
-                this.ws.on('open', () =>
-                {
-                    if (timer_id)
-                        clearTimeout(timer_id);
-                    ok(true);
-                });
-            });
-            if (!ok)
-            {
-                this.ws = null;
-            }
-            retry++;
-        }
-        if (!this.ws)
-        {
-            this.die('Failed to open etcd watch websocket');
-        }
-        this.ws.send(JSON.stringify({
-            create_request: {
-                key: b64(this.etcd_prefix+'/'),
-                range_end: b64(this.etcd_prefix+'0'),
-                start_revision: ''+this.etcd_watch_revision,
-                watch_id: 1,
-            },
-        }));
-        this.ws.on('message', (msg) =>
-        {
-            let data;
-            try
-            {
-                data = JSON.parse(msg);
-            }
-            catch (e)
-            {
-            }
-            if (!data || !data.result || !data.result.events)
-            {
-                console.error('Garbage received from watch websocket: '+msg);
-            }
-            else
-            {
-                let stats_changed = false, changed = false;
-                console.log('Revision '+data.result.header.revision+' events: ');
-                for (const e of data.result.events)
-                {
-                    this.parse_kv(e.kv);
-                    const key = e.kv.key.substr(this.etcd_prefix.length);
-                    if (key.substr(0, 11) == '/osd/stats/' || key.substr(0, 10) == '/pg/stats/')
-                    {
-                        stats_changed = true;
-                    }
-                    else if (key != '/stats')
-                    {
-                        changed = true;
-                    }
-                    console.log(e);
-                }
-                if (stats_changed)
-                {
-                    this.schedule_update_stats();
-                }
-                if (changed)
-                {
-                    this.schedule_recheck();
-                }
-            }
-        });
-    }
-
-    async get_lease()
-    {
-        const max_ttl = this.config.etcd_mon_ttl + this.config.etcd_mon_timeout/1000*this.config.etcd_mon_retries;
-        const res = await this.etcd_call('/lease/grant', { TTL: max_ttl }, this.config.etcd_mon_timeout, this.config.etcd_mon_retries);
-        this.etcd_lease_id = res.ID;
-        setInterval(async () =>
-        {
-            const res = await this.etcd_call('/lease/keepalive', { ID: this.etcd_lease_id }, this.config.etcd_mon_timeout, this.config.etcd_mon_retries);
-            if (!res.result.TTL)
-            {
-                this.die('Lease expired');
-            }
-        }, config.etcd_mon_timeout);
-    }
-
-    async become_master()
-    {
-        const state = { ip: this.local_ips() };
-        while (1)
-        {
-            const res = await this.etcd_call('/txn', {
-                compare: [ { target: 'CREATE', create_revision: 0, key: b64(this.etcd_prefix+'/mon/master') } ],
-                success: [ { key: b64(this.etcd_prefix+'/mon/master'), value: b64(JSON.stringify(state)), lease: ''+this.etcd_lease_id } ],
-            }, this.etcd_start_timeout, 0);
-            if (!res.succeeded)
-            {
-                await new Promise(ok => setTimeout(ok, this.etcd_start_timeout));
-            }
-        }
-    }
-
-    async load_cluster_state()
-    {
-        const res = await this.etcd_call('/txn', { success: [
-            { requestRange: { key: b64(this.etcd_prefix+'/'), range_end: b64(this.etcd_prefix+'0') } },
-        ] }, this.etcd_start_timeout, -1);
-        this.etcd_watch_revision = BigInt(res.header.revision)+BigInt(1);
-        const data = JSON.parse(JSON.stringify(Mon.etcd_tree));
-        for (const response of res.responses)
-        {
-            for (const kv of response.response_range.kvs)
-            {
-                this.parse_kv(kv);
-            }
-        }
-        this.state = data;
-    }
-
-    all_osds()
-    {
-        return Object.keys(this.state.osd.stats);
-    }
-
-    get_osd_tree()
-    {
-        this.state.config.placement_tree = this.state.config.placement_tree||{};
-        const levels = this.state.config.placement_tree.levels||{};
-        levels.host = levels.host || 100;
-        levels.osd = levels.osd || 101;
-        const tree = { '': { children: [] } };
-        for (const node_id in this.state.config.placement_tree.nodes||{})
-        {
-            const node_cfg = this.state.config.placement_tree.nodes[node_id];
-            if (!node_id || /^\d/.exec(node_id) ||
-                !node_cfg.level || !levels[node_cfg.level])
-            {
-                // All nodes must have non-empty non-numeric IDs and valid levels
-                continue;
-            }
-            tree[node_id] = { id: node_id, level: node_cfg.level, parent: node_cfg.parent, children: [] };
-        }
-        // This requires monitor system time to be in sync with OSD system times (at least to some extent)
-        const down_time = Date.now()/1000 - this.config.osd_out_time;
-        for (const osd_num of this.all_osds().sort((a, b) => a - b))
-        {
-            const stat = this.state.osd.stats[osd_num];
-            if (stat.size && (this.state.osd.state[osd_num] || Number(stat.time) >= down_time))
-            {
-                // Numeric IDs are reserved for OSDs
-                const reweight = this.state.config.osd[osd_num] && Number(this.state.config.osd[osd_num].reweight) || 1;
-                tree[osd_num] = tree[osd_num] || { id: osd_num, parent: stat.host };
-                tree[osd_num].level = 'osd';
-                tree[osd_num].size = reweight * stat.size / 1024 / 1024 / 1024 / 1024; // terabytes
-                delete tree[osd_num].children;
-            }
-        }
-        for (const node_id in tree)
-        {
-            if (node_id === '')
-            {
-                continue;
-            }
-            const node_cfg = tree[node_id];
-            const node_level = levels[node_cfg.level] || node_cfg.level;
-            let parent_level = node_cfg.parent && tree[node_cfg.parent] && tree[node_cfg.parent].children
-                && tree[node_cfg.parent].level;
-            parent_level = parent_level ? (levels[parent_level] || parent_level) : null;
-            // Parent's level must be less than child's; OSDs must be leaves
-            const parent = parent_level && parent_level < node_level ? tree[node_cfg.parent] : '';
-            tree[parent].children.push(tree[node_id]);
-            delete node_cfg.parent;
-        }
-        return LPOptimizer.flatten_tree(tree[''].children, levels, this.state.config.failure_domain, 'osd');
-    }
-
-    async stop_all_pgs()
-    {
-        let has_online = false, paused = true;
-        for (const pg in this.state.config.pgs.items||{})
-        {
-            const cur_state = ((this.state.pg.state[pg]||{}).state||[]).join(',');
-            if (cur_state != '' && cur_state != 'offline')
-            {
-                has_online = true;
-            }
-            if (!this.state.config.pgs.items[pg].pause)
-            {
-                paused = false;
-            }
-        }
-        if (!paused)
-        {
-            console.log('Stopping all PGs before changing PG count');
-            const new_cfg = JSON.parse(JSON.stringify(this.state.config.pgs));
-            for (const pg in new_cfg.items)
-            {
-                new_cfg.items[pg].pause = true;
-            }
-            // Check that no OSDs change their state before we pause PGs
-            // Doing this we make sure that OSDs don't wake up in the middle of our "transaction"
-            // and can't see the old PG configuration
-            const checks = [];
-            for (const osd_num of this.all_osds())
-            {
-                const key = b64(this.etcd_prefix+'/osd/state/'+osd_num);
-                checks.push({ key, target: 'MOD', result: 'LESS', mod_revision: ''+this.etcd_watch_revision });
-            }
-            const res = await this.etcd_call('/txn', {
-                compare: [
-                    { key: b64(this.etcd_prefix+'/mon/master'), target: 'LEASE', lease: ''+this.etcd_lease_id },
-                    { key: b64(this.etcd_prefix+'/config/pgs'), target: 'MOD', mod_revision: ''+this.etcd_watch_revision, result: 'LESS' },
-                    ...checks,
-                ],
-                success: [
-                    { requestPut: { key: b64(this.etcd_prefix+'/config/pgs'), value: b64(JSON.stringify(new_cfg)) } },
-                ],
-            }, this.config.etcd_mon_timeout, 0);
-            if (!res.succeeded)
-            {
-                return false;
-            }
-            this.state.config.pgs = new_cfg;
-        }
-        return !has_online;
-    }
-
-    scale_pg_count(prev_pgs, pg_history, new_pg_count)
-    {
-        const old_pg_count = prev_pgs.length;
-        // Add all possibly intersecting PGs into the history of new PGs
-        if (!(new_pg_count % old_pg_count))
-        {
-            // New PG count is a multiple of the old PG count
-            const mul = (new_pg_count / old_pg_count);
-            for (let i = 0; i < new_pg_count; i++)
-            {
-                const old_i = Math.floor(new_pg_count / mul);
-                pg_history[i] = JSON.parse(JSON.stringify(this.state.pg.history[1+old_i]));
-            }
-        }
-        else if (!(old_pg_count % new_pg_count))
-        {
-            // Old PG count is a multiple of the new PG count
-            const mul = (old_pg_count / new_pg_count);
-            for (let i = 0; i < new_pg_count; i++)
-            {
-                pg_history[i] = {
-                    osd_sets: [],
-                    all_peers: [],
-                };
-                for (let j = 0; j < mul; j++)
-                {
-                    pg_history[i].osd_sets.push(prev_pgs[i*mul]);
-                    const hist = this.state.pg.history[1+i*mul+j];
-                    if (hist && hist.osd_sets && hist.osd_sets.length)
-                    {
-                        Array.prototype.push.apply(pg_history[i].osd_sets, hist.osd_sets);
-                    }
-                    if (hist && hist.all_peers && hist.all_peers.length)
-                    {
-                        Array.prototype.push.apply(pg_history[i].all_peers, hist.all_peers);
-                    }
-                }
-            }
-        }
-        else
-        {
-            // Any PG may intersect with any PG after non-multiple PG count change
-            // So, merge ALL PGs history
-            let all_sets = {};
-            let all_peers = {};
-            for (const pg of prev_pgs)
-            {
-                all_sets[pg.join(' ')] = pg;
-            }
-            for (const pg in this.state.pg.history)
-            {
-                const hist = this.state.pg.history[pg];
-                if (hist && hist.osd_sets)
-                {
-                    for (const pg of hist.osd_sets)
-                    {
-                        all_sets[pg.join(' ')] = pg;
-                    }
-                }
-                if (hist && hist.all_peers)
-                {
-                    for (const osd_num of hist.all_peers)
-                    {
-                        all_peers[osd_num] = Number(osd_num);
-                    }
-                }
-            }
-            all_sets = Object.values(all_sets);
-            all_peers = Object.values(all_peers);
-            for (let i = 0; i < new_pg_count; i++)
-            {
-                pg_history[i] = { osd_sets: all_sets, all_peers };
-            }
-        }
-        // Mark history keys for removed PGs as removed
-        for (let i = new_pg_count; i < old_pg_count; i++)
-        {
-            pg_history[i] = null;
-        }
-        if (old_pg_count < new_pg_count)
-        {
-            for (let i = new_pg_count-1; i >= 0; i--)
-            {
-                prev_pgs[i] = prev_pgs[Math.floor(i/new_pg_count*old_pg_count)];
-            }
-        }
-        else if (old_pg_count > new_pg_count)
-        {
-            for (let i = 0; i < new_pg_count; i++)
-            {
-                prev_pgs[i] = prev_pgs[Math.round(i/new_pg_count*old_pg_count)];
-            }
-            prev_pgs.splice(new_pg_count, old_pg_count-new_pg_count);
-        }
-    }
-
-    async save_new_pgs(prev_pgs, new_pgs, pg_history, tree_hash)
-    {
-        const txn = [], checks = [];
-        const pg_items = {};
-        new_pgs.map((osd_set, i) =>
-        {
-            osd_set = osd_set.map(osd_num => osd_num === LPOptimizer.NO_OSD ? 0 : osd_num);
-            const alive_set = osd_set.filter(osd_num => osd_num);
-            pg_items[i+1] = {
-                osd_set,
-                primary: alive_set.length ? alive_set[Math.floor(Math.random()*alive_set.length)] : 0,
-            };
-            if (prev_pgs[i] && prev_pgs[i].join(' ') != osd_set.join(' '))
-            {
-                pg_history[i] = pg_history[i] || {};
-                pg_history[i].osd_sets = pg_history[i].osd_sets || [];
-                pg_history[i].osd_sets.push(prev_pgs[i]);
-            }
-        });
-        for (let i = 0; i < new_pgs.length || i < prev_pgs.length; i++)
-        {
-            checks.push({
-                key: b64(this.etcd_prefix+'/pg/history/'+(i+1)),
-                target: 'MOD',
-                mod_revision: ''+this.etcd_watch_revision,
-                result: 'LESS',
-            });
-            if (pg_history[i])
-            {
-                txn.push({
-                    requestPut: {
-                        key: b64(this.etcd_prefix+'/pg/history/'+(i+1)),
-                        value: b64(JSON.stringify(pg_history[i])),
-                    },
-                });
-            }
-            else
-            {
-                txn.push({
-                    requestDeleteRange: {
-                        key: b64(this.etcd_prefix+'/pg/history/'+(i+1)),
-                    },
-                });
-            }
-        }
-        this.state.config.pgs = {
-            hash: tree_hash,
-            items: pg_items,
-        };
-        const res = await this.etcd_call('/txn', {
-            compare: [
-                { key: b64(this.etcd_prefix+'/mon/master'), target: 'LEASE', lease: ''+this.etcd_lease_id },
-                { key: b64(this.etcd_prefix+'/config/pgs'), target: 'MOD', mod_revision: ''+this.etcd_watch_revision, result: 'LESS' },
-                ...checks,
-            ],
-            success: [
-                { requestPut: { key: b64(this.etcd_prefix+'/config/pgs'), value: b64(JSON.stringify(this.state.config.pgs)) } },
-                ...txn,
-            ],
-        }, this.config.etcd_mon_timeout, 0);
-        return res.succeeded;
-    }
-
-    async recheck_pgs()
-    {
-        // Take configuration and state, check it against the stored configuration hash
-        // Recalculate PGs and save them to etcd if the configuration is changed
-        const tree_cfg = {
-            osd_tree: this.get_osd_tree(),
-            pg_count: this.config.pg_count || Object.keys(this.state.config.pgs.items||{}).length || 128,
-            max_osd_combinations: this.config.max_osd_combinations,
-        };
-        const tree_hash = sha1hex(stableStringify(tree_cfg));
-        if (this.state.config.pgs.hash != tree_hash)
-        {
-            // Something has changed
-            const prev_pgs = [];
-            for (const pg in this.state.config.pgs.items||{})
-            {
-                prev_pgs[pg-1] = this.state.config.pgs.items[pg].osd_set;
-            }
-            const pg_history = [];
-            const old_pg_count = prev_pgs.length;
-            let optimize_result;
-            if (old_pg_count > 0)
-            {
-                if (old_pg_count != tree_cfg.pg_count)
-                {
-                    // PG count changed. Need to bring all PGs down.
-                    if (!await this.stop_all_pgs())
-                    {
-                        this.schedule_recheck();
-                        return;
-                    }
-                    this.scale_pg_count(prev_pgs, pg_history, new_pg_count);
-                }
-                optimize_result = await LPOptimizer.optimize_change(prev_pgs, tree_cfg.osd_tree, tree_cfg.max_osd_combinations);
-            }
-            else
-            {
-                optimize_result = await LPOptimizer.optimize_initial(tree_cfg.osd_tree, tree_cfg.pg_count, tree_cfg.max_osd_combinations);
-            }
-            if (!await this.save_new_pgs(prev_pgs, optimize_result.int_pgs, pg_history, tree_hash))
-            {
-                console.log('Someone changed PG configuration while we also tried to change it. Retrying in '+this.config.mon_change_timeout+' ms');
-                this.schedule_recheck();
-                return;
-            }
-            console.log('PG configuration successfully changed');
-            if (old_pg_count != optimize_result.int_pgs.length)
-            {
-                console.log(`PG count changed from: ${old_pg_count} to ${optimize_result.int_pgs.length}`);
-            }
-            LPOptimizer.print_change_stats(optimize_result);
-        }
-    }
-
-    schedule_recheck()
-    {
-        if (this.recheck_timer)
-        {
-            clearTimeout(this.recheck_timer);
-            this.recheck_timer = null;
-        }
-        this.recheck_timer = setTimeout(() =>
-        {
-            this.recheck_timer = null;
-            this.recheck_pgs().catch(console.error);
-        }, this.config.mon_change_timeout || 1000);
-    }
-
-    sum_stats()
-    {
-        let overflow = false;
-        this.prev_stats = this.prev_stats || { op_stats: {}, subop_stats: {}, recovery_stats: {} };
-        const op_stats = {}, subop_stats = {}, recovery_stats = {};
-        for (const osd in this.state.osd.stats)
-        {
-            const st = this.state.osd.stats[osd];
-            for (const op in st.op_stats||{})
-            {
-                op_stats[op] = op_stats[op] || { count: 0n, usec: 0n, bytes: 0n };
-                op_stats[op].count += BigInt(st.op_stats.count||0);
-                op_stats[op].usec += BigInt(st.op_stats.usec||0);
-                op_stats[op].bytes += BigInt(st.op_stats.bytes||0);
-            }
-            for (const op in st.subop_stats||{})
-            {
-                subop_stats[op] = subop_stats[op] || { count: 0n, usec: 0n };
-                subop_stats[op].count += BigInt(st.subop_stats.count||0);
-                subop_stats[op].usec += BigInt(st.subop_stats.usec||0);
-            }
-            for (const op in st.recovery_stats||{})
-            {
-                recovery_stats[op] = recovery_stats[op] || { count: 0n, bytes: 0n };
-                recovery_stats[op].count += BigInt(st.recovery_stats.count||0);
-                recovery_stats[op].bytes += BigInt(st.recovery_stats.bytes||0);
-            }
-        }
-        for (const op in op_stats)
-        {
-            if (op_stats[op].count >= 0x10000000000000000n)
-            {
-                if (!this.prev_stats.op_stats[op])
-                {
-                    overflow = true;
-                }
-                else
-                {
-                    op_stats[op].count -= this.prev_stats.op_stats[op].count;
-                    op_stats[op].usec -= this.prev_stats.op_stats[op].usec;
-                    op_stats[op].bytes -= this.prev_stats.op_stats[op].bytes;
-                }
-            }
-        }
-        for (const op in subop_stats)
-        {
-            if (subop_stats[op].count >= 0x10000000000000000n)
-            {
-                if (!this.prev_stats.subop_stats[op])
-                {
-                    overflow = true;
-                }
-                else
-                {
-                    subop_stats[op].count -= this.prev_stats.subop_stats[op].count;
-                    subop_stats[op].usec -= this.prev_stats.subop_stats[op].usec;
-                }
-            }
-        }
-        for (const op in recovery_stats)
-        {
-            if (recovery_stats[op].count >= 0x10000000000000000n)
-            {
-                if (!this.prev_stats.recovery_stats[op])
-                {
-                    overflow = true;
-                }
-                else
-                {
-                    recovery_stats[op].count -= this.prev_stats.recovery_stats[op].count;
-                    recovery_stats[op].bytes -= this.prev_stats.recovery_stats[op].bytes;
-                }
-            }
-        }
-        const object_counts = { object: 0n, clean: 0n, misplaced: 0n, degraded: 0n, incomplete: 0n };
-        for (const pg_num in this.state.pg.stats)
-        {
-            const st = this.state.pg.stats[pg_num];
-            for (const k in object_counts)
-            {
-                if (st[k+'_count'])
-                {
-                    object_counts[k] += BigInt(st[k+'_count']);
-                }
-            }
-        }
-        return (this.prev_stats = { overflow, op_stats, subop_stats, recovery_stats, object_counts });
-    }
-
-    async update_total_stats()
-    {
-        const stats = this.sum_stats();
-        if (!stats.overflow)
-        {
-            // Convert to strings, serialize and save
-            const ser = {};
-            for (const st of [ 'op_stats', 'subop_stats', 'recovery_stats' ])
-            {
-                ser[st] = {};
-                for (const op in stats[st])
-                {
-                    ser[st][op] = {};
-                    for (const k in stats[st][op])
-                    {
-                        ser[st][op][k] = ''+stats[st][op][k];
-                    }
-                }
-            }
-            ser.object_counts = {};
-            for (const k in stats.object_counts)
-            {
-                ser.object_counts[k] = ''+stats.object_counts[k];
-            }
-            await this.etcd_call('/txn', {
-                success: [ { requestPut: { key: b64(this.etcd_prefix+'/stats'), value: b64(JSON.stringify(ser)) } } ],
-            }, this.config.etcd_mon_timeout, 0);
-        }
-    }
-
-    schedule_update_stats()
-    {
-        if (this.stats_timer)
-        {
-            clearTimeout(this.stats_timer);
-            this.stats_timer = null;
-        }
-        this.stats_timer = setTimeout(() =>
-        {
-            this.stats_timer = null;
-            this.update_total_stats().catch(console.error);
-        }, this.config.mon_stats_timeout || 1000);
-    }
-
-    parse_kv(kv)
-    {
-        if (!kv || !kv.key)
-        {
-            return;
-        }
-        kv.key = de64(kv.key);
-        kv.value = kv.value ? JSON.parse(de64(kv.value)) : null;
-        const key = kv.key.substr(this.etcd_prefix.length).replace(/^\/+/, '').split('/');
-        const cur = this.state, orig = Mon.etcd_tree;
-        for (let i = 0; i < key.length-1; i++)
-        {
-            if (!orig[key[i]])
-            {
-                console.log('Bad key in etcd: '+kv.key+' = '+kv.value);
-                return;
-            }
-            orig = orig[key[i]];
-            cur = (cur[key[i]] = cur[key[i]] || {});
-        }
-        if (orig[key.length-1])
-        {
-            console.log('Bad key in etcd: '+kv.key+' = '+kv.value);
-            return;
-        }
-        cur[key[key.length-1]] = kv.value;
-        if (key.join('/') === 'config/global')
-        {
-            this.state.config.global = this.state.config.global || {};
-            this.config = this.state.config.global;
-            this.check_config();
-        }
-    }
-
-    async etcd_call(path, body, timeout, retries)
-    {
-        let retry = 0;
-        if (retries >= 0 && retries < 1)
-        {
-            retries = 1;
-        }
-        while (retries < 0 || retry < retries)
-        {
-            const base = this.etcd_urls[Math.floor(Math.random()*this.etcd_urls.length)];
-            const res = await POST(base+path, body, timeout);
-            if (res.json)
-            {
-                if (res.json.error)
-                {
-                    console.log('etcd returned error: '+res.json.error);
-                    break;
-                }
-                return res.json;
-            }
-            retry++;
-        }
-        this.die();
-    }
-
-    die(err)
-    {
-        // In fact we can just try to rejoin
-        console.fatal(err || 'Cluster connection failed');
-        process.exit(1);
-    }
-
-    local_ips()
-    {
-        const ips = [];
-        const ifaces = os.networkInterfaces();
-        for (const ifname in ifaces)
-        {
-            for (const iface of ifaces[ifname])
-            {
-                if (iface.family == 'IPv4' && !iface.internal)
-                {
-                    ips.push(iface.address);
-                }
-            }
-        }
-        return ips;
-    }
-}
-
-function POST(url, body, timeout)
-{
-    return new Promise((ok, no) =>
-    {
-        const body_text = Buffer.from(JSON.stringify(body));
-        let timer_id = timeout > 0 ? setTimeout(() =>
-        {
-            if (req)
-                req.abort();
-            req = null;
-            ok({ error: 'timeout' });
-        }, timeout) : null;
-        let req = http.request(url, { method: 'POST', headers: {
-            'Content-Type': 'application/json',
-            'Content-Length': body_text,
-        } }, (res) =>
-        {
-            if (!req)
-            {
-                return;
-            }
-            clearTimeout(timer_id);
-            if (res.statusCode != 200)
-            {
-                ok({ error: res.statusCode, response: res });
-                return;
-            }
-            let res_body = '';
-            res.setEncoding('utf8');
-            res.on('data', chunk => { res_body += chunk });
-            res.on('end', () =>
-            {
-                try
-                {
-                    res_body = JSON.parse(res_body);
-                    ok({ response: res, json: res_body });
-                }
-                catch (e)
-                {
-                    ok({ error: e, response: res, body: res_body });
-                }
-            });
-        });
-        req.write(body_text);
-        req.end();
-    });
-}
-
-function b64(str)
-{
-    return Buffer.from(str).toString('base64');
-}
-
-function de64(str)
-{
-    return Buffer.from(str, 'base64').toString();
-}
-
-function sha1hex(str)
-{
-    const hash = crypto.createHash('sha1');
-    hash.update(str);
-    return hash.digest('hex');
-}
--- a/lp/package.json
+++ b/lp/package.json
@@ -1,14 +0,0 @@
-{
-  "name": "rage-mon",
-  "version": "1.0.0",
-  "description": "RAGE storage monitor service",
-  "main": "mon.js",
-  "scripts": {
-    "test": "echo \"Error: no test specified\" && exit 1"
-  },
-  "author": "Vitaliy Filippov",
-  "license": "UNLICENSED",
-  "dependencies": {
-    "ws": "^7.2.5"
-  }
-}
--- a/lp/test-optimize-undersized.js
+++ b/lp/test-optimize-undersized.js
@@ -1,71 +0,0 @@
-const LPOptimizer = require('./lp-optimizer.js');
-
-const crush_tree = [
-    { level: 1, children: [
-        { level: 2, children: [
-            { level: 3, id: 1, size: 3 },
-            { level: 3, id: 2, size: 3 },
-        ] },
-        { level: 2, children: [
-            { level: 3, id: 3, size: 3 },
-            { level: 3, id: 4, size: 3 },
-        ] },
-    ] },
-    { level: 1, children: [
-        { level: 2, children: [
-            { level: 3, id: 5, size: 3 },
-            { level: 3, id: 6, size: 3 },
-        ] },
-        { level: 2, children: [
-            { level: 3, id: 7, size: 3 },
-            { level: 3, id: 8, size: 3 },
-        ] },
-    ] },
-    { level: 1, children: [
-        { level: 2, children: [
-            { level: 3, id: 9, size: 3 },
-            { level: 3, id: 10, size: 3 },
-        ] },
-        { level: 2, children: [
-            { level: 3, id: 11, size: 3 },
-            { level: 3, id: 12, size: 3 },
-        ] },
-    ] },
-];
-
-const osd_tree = LPOptimizer.flatten_tree(crush_tree, {}, 1, 3);
-console.log(osd_tree);
-
-async function run()
-{
-    const cur_tree = {};
-    console.log('Empty tree:');
-    let res = await LPOptimizer.optimize_initial(cur_tree, 256);
-    LPOptimizer.print_change_stats(res, false);
-    console.log('\nAdding 1st failure domain:');
-    cur_tree['dom1'] = osd_tree['dom1'];
-    res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree);
-    LPOptimizer.print_change_stats(res, false);
-    console.log('\nAdding 2nd failure domain:');
-    cur_tree['dom2'] = osd_tree['dom2'];
-    res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree);
-    LPOptimizer.print_change_stats(res, false);
-    console.log('\nAdding 3rd failure domain:');
-    cur_tree['dom3'] = osd_tree['dom3'];
-    res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree);
-    LPOptimizer.print_change_stats(res, false);
-    console.log('\nRemoving 3rd failure domain:');
-    delete cur_tree['dom3'];
-    res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree);
-    LPOptimizer.print_change_stats(res, false);
-    console.log('\nRemoving 2nd failure domain:');
-    delete cur_tree['dom2'];
-    res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree);
-    LPOptimizer.print_change_stats(res, false);
-    console.log('\nRemoving 1st failure domain:');
-    delete cur_tree['dom1'];
-    res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree);
-    LPOptimizer.print_change_stats(res, false);
-}
-
-run().catch(console.error);
--- a/lp/test-optimize.js
+++ b/lp/test-optimize.js
@@ -1,94 +0,0 @@
-const LPOptimizer = require('./lp-optimizer.js');
-
-const osd_tree = {
-    100: {
-        7: 3.63869,
-    },
-    300: {
-        10: 3.46089,
-        11: 3.46089,
-        12: 3.46089,
-    },
-    400: {
-        1: 3.49309,
-        2: 3.49309,
-        3: 3.49309,
-    },
-    500: {
-        4: 3.58498,
-//        8: 3.58589,
-        9: 3.63869,
-    },
-    600: {
-        5: 3.63869,
-        6: 3.63869,
-    },
-/*    100: {
-        1: 2.72800,
-    },
-    200: {
-        2: 2.72900,
-    },
-    300: {
-        3: 1.87000,
-    },
-    400: {
-        4: 1.87000,
-    },
-    500: {
-        5: 3.63869,
-    },*/
-};
-
-const crush_tree = [
-    { level: 1, children: [
-        { level: 2, children: [
-            { level: 3, id: 1, size: 3 },
-            { level: 3, id: 2, size: 2 },
-        ] },
-        { level: 2, children: [
-            { level: 3, id: 3, size: 4 },
-            { level: 3, id: 4, size: 4 },
-        ] },
-    ] },
-    { level: 1, children: [
-        { level: 2, children: [
-            { level: 3, id: 5, size: 4 },
-            { level: 3, id: 6, size: 1 },
-        ] },
-        { level: 2, children: [
-            { level: 3, id: 7, size: 3 },
-            { level: 3, id: 8, size: 5 },
-        ] },
-    ] },
-    { level: 1, children: [
-        { level: 2, children: [
-            { level: 3, id: 9, size: 5 },
-            { level: 3, id: 10, size: 2 },
-        ] },
-        { level: 2, children: [
-            { level: 3, id: 11, size: 3 },
-            { level: 3, id: 12, size: 3 },
-        ] },
-    ] },
-];
-
-async function run()
-{
-    // Test: add 1 OSD of almost the same size. Ideal data movement could be 1/12 = 8.33%. Actual is ~13%
-    // Space efficiency is ~99.5% in both cases.
-    let res = await LPOptimizer.optimize_initial(osd_tree, 256);
-    LPOptimizer.print_change_stats(res, false);
-    console.log('adding osd.8');
-    osd_tree[500][8] = 3.58589;
-    res = await LPOptimizer.optimize_change(res.int_pgs, osd_tree);
-    LPOptimizer.print_change_stats(res, false);
-    console.log('removing osd.8');
-    delete osd_tree[500][8];
-    res = await LPOptimizer.optimize_change(res.int_pgs, osd_tree);
-    LPOptimizer.print_change_stats(res, false);
-    res = await LPOptimizer.optimize_initial(LPOptimizer.flatten_tree(crush_tree, {}, 1, 3), 256);
-    LPOptimizer.print_change_stats(res, false);
-}
-
-run().catch(console.error);
--- a/messenger.cpp
+++ b/messenger.cpp
@@ -1,398 +0,0 @@
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/socket.h>
-#include <sys/epoll.h>
-#include <netinet/tcp.h>
-
-#include "messenger.h"
-
-osd_op_t::~osd_op_t()
-{
-    assert(!bs_op);
-    assert(!op_data);
-    if (rmw_buf)
-    {
-        free(rmw_buf);
-    }
-    if (buf)
-    {
-        // Note: reusing osd_op_t WILL currently lead to memory leaks
-        // So we don't reuse it, but free it every time
-        free(buf);
-    }
-}
-
-void osd_messenger_t::connect_peer(uint64_t peer_osd, json11::Json peer_state)
-{
-    if (wanted_peers.find(peer_osd) == wanted_peers.end())
-    {
-        wanted_peers[peer_osd] = (osd_wanted_peer_t){
-            .address_list = peer_state["addresses"],
-            .port = (int)peer_state["port"].int64_value(),
-        };
-    }
-    else
-    {
-        wanted_peers[peer_osd].address_list = peer_state["addresses"];
-        wanted_peers[peer_osd].port = (int)peer_state["port"].int64_value();
-    }
-    wanted_peers[peer_osd].address_changed = true;
-    if (!wanted_peers[peer_osd].connecting &&
-        (time(NULL) - wanted_peers[peer_osd].last_connect_attempt) >= peer_connect_interval)
-    {
-        try_connect_peer(peer_osd);
-    }
-}
-
-void osd_messenger_t::try_connect_peer(uint64_t peer_osd)
-{
-    auto wp_it = wanted_peers.find(peer_osd);
-    if (wp_it == wanted_peers.end())
-    {
-        return;
-    }
-    if (osd_peer_fds.find(peer_osd) != osd_peer_fds.end())
-    {
-        wanted_peers.erase(peer_osd);
-        return;
-    }
-    auto & wp = wp_it->second;
-    if (wp.address_index >= wp.address_list.array_items().size())
-    {
-        return;
-    }
-    wp.cur_addr = wp.address_list[wp.address_index].string_value();
-    wp.cur_port = wp.port;
-    try_connect_peer_addr(peer_osd, wp.cur_addr.c_str(), wp.cur_port);
-}
-
-void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port)
-{
-    struct sockaddr_in addr;
-    int r;
-    if ((r = inet_pton(AF_INET, peer_host, &addr.sin_addr)) != 1)
-    {
-        on_connect_peer(peer_osd, -EINVAL);
-        return;
-    }
-    addr.sin_family = AF_INET;
-    addr.sin_port = htons(peer_port ? peer_port : 11203);
-    int peer_fd = socket(AF_INET, SOCK_STREAM, 0);
-    if (peer_fd < 0)
-    {
-        on_connect_peer(peer_osd, -errno);
-        return;
-    }
-    fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
-    int timeout_id = -1;
-    if (peer_connect_timeout > 0)
-    {
-        timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, peer_fd](int timer_id)
-        {
-            osd_num_t peer_osd = clients[peer_fd].osd_num;
-            stop_client(peer_fd);
-            on_connect_peer(peer_osd, -EIO);
-            return;
-        });
-    }
-    r = connect(peer_fd, (sockaddr*)&addr, sizeof(addr));
-    if (r < 0 && errno != EINPROGRESS)
-    {
-        close(peer_fd);
-        on_connect_peer(peer_osd, -errno);
-        return;
-    }
-    assert(peer_osd != this->osd_num);
-    clients[peer_fd] = (osd_client_t){
-        .peer_addr = addr,
-        .peer_port = peer_port,
-        .peer_fd = peer_fd,
-        .peer_state = PEER_CONNECTING,
-        .connect_timeout_id = timeout_id,
-        .osd_num = peer_osd,
-        .in_buf = malloc(receive_buffer_size),
-    };
-    tfd->set_fd_handler(peer_fd, [this](int peer_fd, int epoll_events)
-    {
-        // Either OUT (connected) or HUP
-        handle_connect_epoll(peer_fd);
-    });
-}
-
-void osd_messenger_t::handle_connect_epoll(int peer_fd)
-{
-    auto & cl = clients[peer_fd];
-    if (cl.connect_timeout_id >= 0)
-    {
-        tfd->clear_timer(cl.connect_timeout_id);
-        cl.connect_timeout_id = -1;
-    }
-    osd_num_t peer_osd = cl.osd_num;
-    int result = 0;
-    socklen_t result_len = sizeof(result);
-    if (getsockopt(peer_fd, SOL_SOCKET, SO_ERROR, &result, &result_len) < 0)
-    {
-        result = errno;
-    }
-    if (result != 0)
-    {
-        stop_client(peer_fd);
-        on_connect_peer(peer_osd, -result);
-        return;
-    }
-    int one = 1;
-    setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
-    cl.peer_state = PEER_CONNECTED;
-    // FIXME Disable EPOLLOUT on this fd
-    tfd->set_fd_handler(peer_fd, [this](int peer_fd, int epoll_events)
-    {
-        handle_peer_epoll(peer_fd, epoll_events);
-    });
-    // Check OSD number
-    check_peer_config(cl);
-}
-
-void osd_messenger_t::handle_peer_epoll(int peer_fd, int epoll_events)
-{
-    // Mark client as ready (i.e. some data is available)
-    if (epoll_events & EPOLLRDHUP)
-    {
-        // Stop client
-        printf("[OSD %lu] client %d disconnected\n", this->osd_num, peer_fd);
-        stop_client(peer_fd);
-    }
-    else if (epoll_events & EPOLLIN)
-    {
-        // Mark client as ready (i.e. some data is available)
-        auto & cl = clients[peer_fd];
-        cl.read_ready++;
-        if (cl.read_ready == 1)
-        {
-            read_ready_clients.push_back(cl.peer_fd);
-            ringloop->wakeup();
-        }
-    }
-}
-
-void osd_messenger_t::on_connect_peer(osd_num_t peer_osd, int peer_fd)
-{
-    auto & wp = wanted_peers.at(peer_osd);
-    wp.connecting = false;
-    if (peer_fd < 0)
-    {
-        printf("Failed to connect to peer OSD %lu address %s port %d: %s\n", peer_osd, wp.cur_addr.c_str(), wp.cur_port, strerror(-peer_fd));
-        if (wp.address_changed)
-        {
-            wp.address_changed = false;
-            wp.address_index = 0;
-            try_connect_peer(peer_osd);
-        }
-        else if (wp.address_index < wp.address_list.array_items().size()-1)
-        {
-            // Try other addresses
-            wp.address_index++;
-            try_connect_peer(peer_osd);
-        }
-        else
-        {
-            // Retry again in <peer_connect_interval> seconds
-            wp.last_connect_attempt = time(NULL);
-            wp.address_index = 0;
-            tfd->set_timer(1000*peer_connect_interval, false, [this, peer_osd](int)
-            {
-                try_connect_peer(peer_osd);
-            });
-        }
-        return;
-    }
-    printf("Connected with peer OSD %lu (fd %d)\n", peer_osd, peer_fd);
-    wanted_peers.erase(peer_osd);
-    repeer_pgs(peer_osd);
-}
-
-void osd_messenger_t::check_peer_config(osd_client_t & cl)
-{
-    osd_op_t *op = new osd_op_t();
-    op->op_type = OSD_OP_OUT;
-    op->send_list.push_back(op->req.buf, OSD_PACKET_SIZE);
-    op->peer_fd = cl.peer_fd;
-    op->req = {
-        .show_conf = {
-            .header = {
-                .magic = SECONDARY_OSD_OP_MAGIC,
-                .id = this->next_subop_id++,
-                .opcode = OSD_OP_SHOW_CONFIG,
-            },
-        },
-    };
-    op->callback = [this](osd_op_t *op)
-    {
-        osd_client_t & cl = clients[op->peer_fd];
-        std::string json_err;
-        json11::Json config;
-        bool err = false;
-        if (op->reply.hdr.retval < 0)
-        {
-            err = true;
-            printf("Failed to get config from OSD %lu (retval=%ld), disconnecting peer\n", cl.osd_num, op->reply.hdr.retval);
-        }
-        else
-        {
-            config = json11::Json::parse(std::string((char*)op->buf), json_err);
-            if (json_err != "")
-            {
-                err = true;
-                printf("Failed to get config from OSD %lu: bad JSON: %s, disconnecting peer\n", cl.osd_num, json_err.c_str());
-            }
-            else if (config["osd_num"].uint64_value() != cl.osd_num)
-            {
-                err = true;
-                printf("Connected to OSD %lu instead of OSD %lu, peer state is outdated, disconnecting peer\n", config["osd_num"].uint64_value(), cl.osd_num);
-                on_connect_peer(cl.osd_num, -1);
-            }
-        }
-        if (err)
-        {
-            stop_client(op->peer_fd);
-            delete op;
-            return;
-        }
-        osd_peer_fds[cl.osd_num] = cl.peer_fd;
-        on_connect_peer(cl.osd_num, cl.peer_fd);
-        delete op;
-    };
-    outbox_push(op);
-}
-
-void osd_messenger_t::cancel_osd_ops(osd_client_t & cl)
-{
-    for (auto p: cl.sent_ops)
-    {
-        cancel_op(p.second);
-    }
-    cl.sent_ops.clear();
-    for (auto op: cl.outbox)
-    {
-        cancel_op(op);
-    }
-    cl.outbox.clear();
-    if (cl.write_op)
-    {
-        cancel_op(cl.write_op);
-        cl.write_op = NULL;
-    }
-}
-
-void osd_messenger_t::cancel_op(osd_op_t *op)
-{
-    if (op->op_type == OSD_OP_OUT)
-    {
-        op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
-        op->reply.hdr.id = op->req.hdr.id;
-        op->reply.hdr.opcode = op->req.hdr.opcode;
-        op->reply.hdr.retval = -EPIPE;
-        // Copy lambda to be unaffected by `delete op`
-        std::function<void(osd_op_t*)>(op->callback)(op);
-    }
-    else
-    {
-        // This function is only called in stop_client(), so it's fine to destroy the operation
-        delete op;
-    }
-}
-
-void osd_messenger_t::stop_client(int peer_fd)
-{
-    assert(peer_fd != 0);
-    auto it = clients.find(peer_fd);
-    if (it == clients.end())
-    {
-        return;
-    }
-    uint64_t repeer_osd = 0;
-    osd_client_t cl = it->second;
-    if (cl.peer_state == PEER_CONNECTED)
-    {
-        if (cl.osd_num)
-        {
-            // Reload configuration from etcd when the connection is dropped
-            printf("[OSD %lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl.osd_num);
-            repeer_osd = cl.osd_num;
-        }
-        else
-        {
-            printf("[OSD %lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
-        }
-    }
-    clients.erase(it);
-    tfd->set_fd_handler(peer_fd, NULL);
-    if (cl.osd_num)
-    {
-        osd_peer_fds.erase(cl.osd_num);
-        // Cancel outbound operations
-        cancel_osd_ops(cl);
-    }
-    if (cl.read_op)
-    {
-        delete cl.read_op;
-        cl.read_op = NULL;
-    }
-    for (auto rit = read_ready_clients.begin(); rit != read_ready_clients.end(); rit++)
-    {
-        if (*rit == peer_fd)
-        {
-            read_ready_clients.erase(rit);
-            break;
-        }
-    }
-    for (auto wit = write_ready_clients.begin(); wit != write_ready_clients.end(); wit++)
-    {
-        if (*wit == peer_fd)
-        {
-            write_ready_clients.erase(wit);
-            break;
-        }
-    }
-    free(cl.in_buf);
-    close(peer_fd);
-    if (repeer_osd)
-    {
-        repeer_pgs(repeer_osd);
-    }
-}
-
-void osd_messenger_t::accept_connections(int listen_fd)
-{
-    // Accept new connections
-    sockaddr_in addr;
-    socklen_t peer_addr_size = sizeof(addr);
-    int peer_fd;
-    while ((peer_fd = accept(listen_fd, (sockaddr*)&addr, &peer_addr_size)) >= 0)
-    {
-        assert(peer_fd != 0);
-        char peer_str[256];
-        printf("[OSD %lu] new client %d: connection from %s port %d\n", this->osd_num, peer_fd,
-            inet_ntop(AF_INET, &addr.sin_addr, peer_str, 256), ntohs(addr.sin_port));
-        fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
-        int one = 1;
-        setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
-        clients[peer_fd] = {
-            .peer_addr = addr,
-            .peer_port = ntohs(addr.sin_port),
-            .peer_fd = peer_fd,
-            .peer_state = PEER_CONNECTED,
-            .in_buf = malloc(receive_buffer_size),
-        };
-        // Add FD to epoll
-        tfd->set_fd_handler(peer_fd, [this](int peer_fd, int epoll_events)
-        {
-            handle_peer_epoll(peer_fd, epoll_events);
-        });
-        // Try to accept next connection
-        peer_addr_size = sizeof(addr);
-    }
-    if (peer_fd == -1 && errno != EAGAIN)
-    {
-        throw std::runtime_error(std::string("accept: ") + strerror(errno));
-    }
-}
--- a/messenger.h
+++ b/messenger.h
@@ -1,213 +0,0 @@
-#pragma once
-
-#include <sys/types.h>
-#include <stdint.h>
-#include <arpa/inet.h>
-#include <malloc.h>
-
-#include <set>
-#include <map>
-#include <deque>
-#include <vector>
-
-#include "json11/json11.hpp"
-#include "osd_ops.h"
-#include "timerfd_manager.h"
-#include "ringloop.h"
-
-#define OSD_OP_IN 0
-#define OSD_OP_OUT 1
-
-#define CL_READ_HDR 1
-#define CL_READ_DATA 2
-#define CL_READ_REPLY_DATA 3
-#define CL_WRITE_READY 1
-#define CL_WRITE_REPLY 2
-#define OSD_OP_INLINE_BUF_COUNT 16
-
-#define PEER_CONNECTING 1
-#define PEER_CONNECTED 2
-
-#define DEFAULT_PEER_CONNECT_INTERVAL 5
-#define DEFAULT_PEER_CONNECT_TIMEOUT 5
-
-struct osd_op_buf_list_t
-{
-    int count = 0, alloc = 0, sent = 0;
-    iovec *buf = NULL;
-    iovec inline_buf[OSD_OP_INLINE_BUF_COUNT];
-
-    ~osd_op_buf_list_t()
-    {
-        if (buf && buf != inline_buf)
-        {
-            free(buf);
-        }
-    }
-
-    inline iovec* get_iovec()
-    {
-        return (buf ? buf : inline_buf) + sent;
-    }
-
-    inline int get_size()
-    {
-        return count - sent;
-    }
-
-    inline void push_back(void *nbuf, size_t len)
-    {
-        if (count >= alloc)
-        {
-            if (!alloc)
-            {
-                alloc = OSD_OP_INLINE_BUF_COUNT;
-                buf = inline_buf;
-            }
-            else if (buf == inline_buf)
-            {
-                int old = alloc;
-                alloc = ((alloc/16)*16 + 1);
-                buf = (iovec*)malloc(sizeof(iovec) * alloc);
-                memcpy(buf, inline_buf, sizeof(iovec)*old);
-            }
-            else
-            {
-                alloc = ((alloc/16)*16 + 1);
-                buf = (iovec*)realloc(buf, sizeof(iovec) * alloc);
-            }
-        }
-        buf[count++] = { .iov_base = nbuf, .iov_len = len };
-    }
-};
-
-struct blockstore_op_t;
-
-struct osd_primary_op_data_t;
-
-struct osd_op_t
-{
-    timespec tv_begin;
-    uint64_t op_type = OSD_OP_IN;
-    int peer_fd;
-    osd_any_op_t req;
-    osd_any_reply_t reply;
-    blockstore_op_t *bs_op = NULL;
-    void *buf = NULL;
-    void *rmw_buf = NULL;
-    osd_primary_op_data_t* op_data = NULL;
-    std::function<void(osd_op_t*)> callback;
-
-    osd_op_buf_list_t send_list;
-
-    ~osd_op_t();
-};
-
-struct osd_client_t
-{
-    sockaddr_in peer_addr;
-    int peer_port;
-    int peer_fd;
-    int peer_state;
-    int connect_timeout_id = -1;
-    osd_num_t osd_num = 0;
-
-    void *in_buf = NULL;
-
-    // Read state
-    int read_ready = 0;
-    osd_op_t *read_op = NULL;
-    int read_reply_id = 0;
-    iovec read_iov;
-    msghdr read_msg;
-    void *read_buf = NULL;
-    int read_remaining = 0;
-    int read_state = 0;
-
-    // Incoming operations
-    std::vector<osd_op_t*> received_ops;
-
-    // Outbound operations
-    std::deque<osd_op_t*> outbox;
-    std::map<int, osd_op_t*> sent_ops;
-
-    // PGs dirtied by this client's primary-writes (FIXME to drop the connection)
-    std::set<pg_num_t> dirty_pgs;
-
-    // Write state
-    osd_op_t *write_op = NULL;
-    msghdr write_msg;
-    int write_state = 0;
-};
-
-struct osd_wanted_peer_t
-{
-    json11::Json address_list;
-    int port;
-    time_t last_connect_attempt;
-    bool connecting, address_changed;
-    int address_index;
-    std::string cur_addr;
-    int cur_port;
-};
-
-struct osd_op_stats_t
-{
-    uint64_t op_stat_sum[OSD_OP_MAX+1] = { 0 };
-    uint64_t op_stat_count[OSD_OP_MAX+1] = { 0 };
-    uint64_t op_stat_bytes[OSD_OP_MAX+1] = { 0 };
-    uint64_t subop_stat_sum[OSD_OP_MAX+1] = { 0 };
-    uint64_t subop_stat_count[OSD_OP_MAX+1] = { 0 };
-};
-
-struct osd_messenger_t
-{
-    timerfd_manager_t *tfd;
-    ring_loop_t *ringloop;
-
-    // osd_num_t is only for logging and asserts
-    osd_num_t osd_num;
-    int receive_buffer_size = 9000;
-    int peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
-    int peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
-    int log_level = 0;
-
-    std::map<osd_num_t, osd_wanted_peer_t> wanted_peers;
-    std::map<uint64_t, int> osd_peer_fds;
-    uint64_t next_subop_id = 1;
-
-    std::map<int, osd_client_t> clients;
-    std::vector<int> read_ready_clients;
-    std::vector<int> write_ready_clients;
-
-    // op statistics
-    osd_op_stats_t stats;
-
-public:
-    void connect_peer(uint64_t osd_num, json11::Json peer_state);
-    void stop_client(int peer_fd);
-    void outbox_push(osd_op_t *cur_op);
-    std::function<void(osd_op_t*)> exec_op;
-    std::function<void(osd_num_t)> repeer_pgs;
-    void handle_peer_epoll(int peer_fd, int epoll_events);
-    void read_requests();
-    void send_replies();
-    void accept_connections(int listen_fd);
-
-protected:
-    void try_connect_peer(uint64_t osd_num);
-    void try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port);
-    void handle_connect_epoll(int peer_fd);
-    void on_connect_peer(osd_num_t peer_osd, int peer_fd);
-    void check_peer_config(osd_client_t & cl);
-    void cancel_osd_ops(osd_client_t & cl);
-    void cancel_op(osd_op_t *op);
-
-    bool try_send(osd_client_t & cl);
-    void handle_send(int result, int peer_fd);
-
-    bool handle_read(int result, int peer_fd);
-    void handle_finished_read(osd_client_t & cl);
-    void handle_op_hdr(osd_client_t *cl);
-    void handle_reply_hdr(osd_client_t *cl);
-};
--- a/msgr_receive.cpp
+++ b/msgr_receive.cpp
@@ -1,270 +0,0 @@
-#include "messenger.h"
-
-void osd_messenger_t::read_requests()
-{
-    while (read_ready_clients.size() > 0)
-    {
-        int peer_fd = read_ready_clients[0];
-        auto & cl = clients[peer_fd];
-        if (!cl.read_op || cl.read_remaining < receive_buffer_size)
-        {
-            cl.read_iov.iov_base = cl.in_buf;
-            cl.read_iov.iov_len = receive_buffer_size;
-        }
-        else
-        {
-            cl.read_iov.iov_base = cl.read_buf;
-            cl.read_iov.iov_len = cl.read_remaining;
-        }
-        cl.read_msg.msg_iov = &cl.read_iov;
-        cl.read_msg.msg_iovlen = 1;
-        read_ready_clients.erase(read_ready_clients.begin(), read_ready_clients.begin() + 1);
-        int result = recvmsg(peer_fd, &cl.read_msg, 0);
-        if (result < 0)
-        {
-            result = -errno;
-        }
-        handle_read(result, peer_fd);
-    }
-}
-
-bool osd_messenger_t::handle_read(int result, int peer_fd)
-{
-    auto cl_it = clients.find(peer_fd);
-    if (cl_it != clients.end())
-    {
-        auto & cl = cl_it->second;
-        if (result < 0 && result != -EAGAIN)
-        {
-            // this is a client socket, so don't panic. just disconnect it
-            printf("Client %d socket read error: %d (%s). Disconnecting client\n", peer_fd, -result, strerror(-result));
-            stop_client(peer_fd);
-            return false;
-        }
-        if (result == -EAGAIN || result < cl.read_iov.iov_len)
-        {
-            cl.read_ready--;
-            if (cl.read_ready > 0)
-                read_ready_clients.push_back(peer_fd);
-        }
-        else
-        {
-            read_ready_clients.push_back(peer_fd);
-        }
-        if (result > 0)
-        {
-            if (cl.read_iov.iov_base == cl.in_buf)
-            {
-                // Compose operation(s) from the buffer
-                int remain = result;
-                void *curbuf = cl.in_buf;
-                while (remain > 0)
-                {
-                    if (!cl.read_op)
-                    {
-                        cl.read_op = new osd_op_t;
-                        cl.read_op->peer_fd = peer_fd;
-                        cl.read_op->op_type = OSD_OP_IN;
-                        cl.read_buf = cl.read_op->req.buf;
-                        cl.read_remaining = OSD_PACKET_SIZE;
-                        cl.read_state = CL_READ_HDR;
-                    }
-                    if (cl.read_remaining > remain)
-                    {
-                        memcpy(cl.read_buf, curbuf, remain);
-                        cl.read_remaining -= remain;
-                        cl.read_buf += remain;
-                        remain = 0;
-                        if (cl.read_remaining <= 0)
-                            handle_finished_read(cl);
-                    }
-                    else
-                    {
-                        memcpy(cl.read_buf, curbuf, cl.read_remaining);
-                        curbuf += cl.read_remaining;
-                        remain -= cl.read_remaining;
-                        cl.read_remaining = 0;
-                        cl.read_buf = NULL;
-                        handle_finished_read(cl);
-                    }
-                }
-            }
-            else
-            {
-                // Long data
-                cl.read_remaining -= result;
-                cl.read_buf += result;
-                if (cl.read_remaining <= 0)
-                {
-                    handle_finished_read(cl);
-                }
-            }
-            if (result >= cl.read_iov.iov_len)
-            {
-                return true;
-            }
-        }
-    }
-    return false;
-}
-
-void osd_messenger_t::handle_finished_read(osd_client_t & cl)
-{
-    if (cl.read_state == CL_READ_HDR)
-    {
-        if (cl.read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC)
-            handle_reply_hdr(&cl);
-        else
-            handle_op_hdr(&cl);
-    }
-    else if (cl.read_state == CL_READ_DATA)
-    {
-        // Operation is ready
-        cl.received_ops.push_back(cl.read_op);
-        exec_op(cl.read_op);
-        cl.read_op = NULL;
-        cl.read_state = 0;
-    }
-    else if (cl.read_state == CL_READ_REPLY_DATA)
-    {
-        // Reply is ready
-        auto req_it = cl.sent_ops.find(cl.read_reply_id);
-        osd_op_t *request = req_it->second;
-        cl.sent_ops.erase(req_it);
-        cl.read_reply_id = 0;
-        delete cl.read_op;
-        cl.read_op = NULL;
-        cl.read_state = 0;
-        // Measure subop latency
-        timespec tv_end;
-        clock_gettime(CLOCK_REALTIME, &tv_end);
-        stats.subop_stat_count[request->req.hdr.opcode]++;
-        if (!stats.subop_stat_count[request->req.hdr.opcode])
-        {
-            stats.subop_stat_count[request->req.hdr.opcode]++;
-            stats.subop_stat_sum[request->req.hdr.opcode] = 0;
-        }
-        stats.subop_stat_sum[request->req.hdr.opcode] += (
-            (tv_end.tv_sec - request->tv_begin.tv_sec)*1000000 +
-            (tv_end.tv_nsec - request->tv_begin.tv_nsec)/1000
-        );
-        request->callback(request);
-    }
-    else
-    {
-        assert(0);
-    }
-}
-
-void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
-{
-    osd_op_t *cur_op = cl->read_op;
-    if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_READ)
-    {
-        if (cur_op->req.sec_rw.len > 0)
-            cur_op->buf = memalign(MEM_ALIGNMENT, cur_op->req.sec_rw.len);
-        cl->read_remaining = 0;
-    }
-    else if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
-    {
-        if (cur_op->req.sec_rw.len > 0)
-            cur_op->buf = memalign(MEM_ALIGNMENT, cur_op->req.sec_rw.len);
-        cl->read_remaining = cur_op->req.sec_rw.len;
-    }
-    else if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_STABILIZE ||
-        cur_op->req.hdr.opcode == OSD_OP_SECONDARY_ROLLBACK)
-    {
-        if (cur_op->req.sec_stab.len > 0)
-            cur_op->buf = memalign(MEM_ALIGNMENT, cur_op->req.sec_stab.len);
-        cl->read_remaining = cur_op->req.sec_stab.len;
-    }
-    else if (cur_op->req.hdr.opcode == OSD_OP_READ)
-    {
-        if (cur_op->req.rw.len > 0)
-            cur_op->buf = memalign(MEM_ALIGNMENT, cur_op->req.rw.len);
-        cl->read_remaining = 0;
-    }
-    else if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
-    {
-        if (cur_op->req.rw.len > 0)
-            cur_op->buf = memalign(MEM_ALIGNMENT, cur_op->req.rw.len);
-        cl->read_remaining = cur_op->req.rw.len;
-    }
-    if (cl->read_remaining > 0)
-    {
-        // Read data
-        cl->read_buf = cur_op->buf;
-        cl->read_state = CL_READ_DATA;
-    }
-    else
-    {
-        // Operation is ready
-        cl->read_op = NULL;
-        cl->read_state = 0;
-        cl->received_ops.push_back(cur_op);
-        exec_op(cur_op);
-    }
-}
-
-void osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
-{
-    osd_op_t *cur_op = cl->read_op;
-    auto req_it = cl->sent_ops.find(cur_op->req.hdr.id);
-    if (req_it == cl->sent_ops.end())
-    {
-        // Command out of sync. Drop connection
-        printf("Client %d command out of sync: id %lu\n", cl->peer_fd, cur_op->req.hdr.id);
-        stop_client(cl->peer_fd);
-        return;
-    }
-    osd_op_t *op = req_it->second;
-    memcpy(op->reply.buf, cur_op->req.buf, OSD_PACKET_SIZE);
-    if ((op->reply.hdr.opcode == OSD_OP_SECONDARY_READ || op->reply.hdr.opcode == OSD_OP_READ) &&
-        op->reply.hdr.retval > 0)
-    {
-        // Read data. In this case we assume that the buffer is preallocated by the caller (!)
-        assert(op->buf);
-        cl->read_state = CL_READ_REPLY_DATA;
-        cl->read_reply_id = op->req.hdr.id;
-        cl->read_buf = op->buf;
-        cl->read_remaining = op->reply.hdr.retval;
-    }
-    else if (op->reply.hdr.opcode == OSD_OP_SECONDARY_LIST && op->reply.hdr.retval > 0)
-    {
-        op->buf = memalign(MEM_ALIGNMENT, sizeof(obj_ver_id) * op->reply.hdr.retval);
-        cl->read_state = CL_READ_REPLY_DATA;
-        cl->read_reply_id = op->req.hdr.id;
-        cl->read_buf = op->buf;
-        cl->read_remaining = sizeof(obj_ver_id) * op->reply.hdr.retval;
-    }
-    else if (op->reply.hdr.opcode == OSD_OP_SHOW_CONFIG && op->reply.hdr.retval > 0)
-    {
-        op->buf = malloc(op->reply.hdr.retval);
-        cl->read_state = CL_READ_REPLY_DATA;
-        cl->read_reply_id = op->req.hdr.id;
-        cl->read_buf = op->buf;
-        cl->read_remaining = op->reply.hdr.retval;
-    }
-    else
-    {
-        delete cl->read_op;
-        cl->read_state = 0;
-        cl->read_op = NULL;
-        cl->sent_ops.erase(req_it);
-        // Measure subop latency
-        timespec tv_end;
-        clock_gettime(CLOCK_REALTIME, &tv_end);
-        stats.subop_stat_count[op->req.hdr.opcode]++;
-        if (!stats.subop_stat_count[op->req.hdr.opcode])
-        {
-            stats.subop_stat_count[op->req.hdr.opcode]++;
-            stats.subop_stat_sum[op->req.hdr.opcode] = 0;
-        }
-        stats.subop_stat_sum[op->req.hdr.opcode] += (
-            (tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
-            (tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000
-        );
-        // Copy lambda to be unaffected by `delete op`
-        std::function<void(osd_op_t*)>(op->callback)(op);
-    }
-}
--- a/msgr_send.cpp
+++ b/msgr_send.cpp
@@ -1,149 +0,0 @@
-#include "messenger.h"
-
-void osd_messenger_t::outbox_push(osd_op_t *cur_op)
-{
-    assert(cur_op->peer_fd);
-    auto & cl = clients.at(cur_op->peer_fd);
-    if (cur_op->op_type == OSD_OP_OUT)
-    {
-        clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
-    }
-    else
-    {
-        // Check that operation actually belongs to this client
-        bool found = false;
-        for (auto it = cl.received_ops.begin(); it != cl.received_ops.end(); it++)
-        {
-            if (*it == cur_op)
-            {
-                found = true;
-                cl.received_ops.erase(it, it+1);
-                break;
-            }
-        }
-        if (!found)
-        {
-            delete cur_op;
-            return;
-        }
-    }
-    cl.outbox.push_back(cur_op);
-    if (cl.write_op || cl.outbox.size() > 1 || !try_send(cl))
-    {
-        if (cl.write_state == 0)
-        {
-            cl.write_state = CL_WRITE_READY;
-            write_ready_clients.push_back(cur_op->peer_fd);
-        }
-        ringloop->wakeup();
-    }
-}
-
-bool osd_messenger_t::try_send(osd_client_t & cl)
-{
-    int peer_fd = cl.peer_fd;
-    if (!cl.write_op)
-    {
-        // pick next command
-        cl.write_op = cl.outbox.front();
-        cl.outbox.pop_front();
-        cl.write_state = CL_WRITE_REPLY;
-        if (cl.write_op->op_type == OSD_OP_IN)
-        {
-            // Measure execution latency
-            timespec tv_end;
-            clock_gettime(CLOCK_REALTIME, &tv_end);
-            stats.op_stat_count[cl.write_op->req.hdr.opcode]++;
-            if (!stats.op_stat_count[cl.write_op->req.hdr.opcode])
-            {
-                stats.op_stat_count[cl.write_op->req.hdr.opcode]++;
-                stats.op_stat_sum[cl.write_op->req.hdr.opcode] = 0;
-                stats.op_stat_bytes[cl.write_op->req.hdr.opcode] = 0;
-            }
-            stats.op_stat_sum[cl.write_op->req.hdr.opcode] += (
-                (tv_end.tv_sec - cl.write_op->tv_begin.tv_sec)*1000000 +
-                (tv_end.tv_nsec - cl.write_op->tv_begin.tv_nsec)/1000
-            );
-            if (cl.write_op->req.hdr.opcode == OSD_OP_READ ||
-                cl.write_op->req.hdr.opcode == OSD_OP_WRITE)
-            {
-                stats.op_stat_bytes[cl.write_op->req.hdr.opcode] += cl.write_op->req.rw.len;
-            }
-            else if (cl.write_op->req.hdr.opcode == OSD_OP_SECONDARY_READ ||
-                cl.write_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
-            {
-                stats.op_stat_bytes[cl.write_op->req.hdr.opcode] += cl.write_op->req.sec_rw.len;
-            }
-        }
-    }
-    cl.write_msg.msg_iov = cl.write_op->send_list.get_iovec();
-    cl.write_msg.msg_iovlen = cl.write_op->send_list.get_size();
-    int result = sendmsg(peer_fd, &cl.write_msg, MSG_NOSIGNAL);
-    if (result < 0)
-        result = -errno;
-    handle_send(result, peer_fd);
-    return true;
-}
-
-void osd_messenger_t::send_replies()
-{
-    while (write_ready_clients.size() > 0)
-    {
-        auto & cl = clients[write_ready_clients[0]];
-        write_ready_clients.erase(write_ready_clients.begin(), write_ready_clients.begin() + 1);
-        try_send(cl);
-    }
-}
-
-void osd_messenger_t::handle_send(int result, int peer_fd)
-{
-    auto cl_it = clients.find(peer_fd);
-    if (cl_it != clients.end())
-    {
-        auto & cl = cl_it->second;
-        if (result < 0 && result != -EAGAIN)
-        {
-            // this is a client socket, so don't panic. just disconnect it
-            printf("Client %d socket write error: %d (%s). Disconnecting client\n", peer_fd, -result, strerror(-result));
-            stop_client(peer_fd);
-            return;
-        }
-        if (result >= 0)
-        {
-            osd_op_t *cur_op = cl.write_op;
-            while (result > 0 && cur_op->send_list.sent < cur_op->send_list.count)
-            {
-                iovec & iov = cur_op->send_list.buf[cur_op->send_list.sent];
-                if (iov.iov_len <= result)
-                {
-                    result -= iov.iov_len;
-                    cur_op->send_list.sent++;
-                }
-                else
-                {
-                    iov.iov_len -= result;
-                    iov.iov_base += result;
-                    break;
-                }
-            }
-            if (cur_op->send_list.sent >= cur_op->send_list.count)
-            {
-                // Done
-                if (cur_op->op_type == OSD_OP_IN)
-                {
-                    delete cur_op;
-                }
-                else
-                {
-                    cl.sent_ops[cl.write_op->req.hdr.id] = cl.write_op;
-                }
-                cl.write_op = NULL;
-                cl.write_state = cl.outbox.size() > 0 ? CL_WRITE_READY : 0;
-            }
-        }
-        if (cl.write_state != 0)
-        {
-            write_ready_clients.push_back(peer_fd);
-        }
-    }
-}
--- a/osd.cpp
+++ b/osd.cpp
@@ -7,9 +7,7 @@

 #include "osd.h"

-#define MAX_EPOLL_EVENTS 64
-
-const char* osd_op_names[] = {
+static const char* osd_op_names[] = {
    "",
    "read",
    "write",
@@ -23,7 +21,6 @@ const char* osd_op_names[] = {
    "primary_read",
    "primary_write",
    "primary_sync",
-    "primary_delete",
 };

 osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop)
@@ -31,110 +28,50 @@ osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringlo
    this->config = config;
    this->bs = bs;
    this->ringloop = ringloop;
-
+    this->tick_tfd = new timerfd_interval(ringloop, 3, [this]()
+    {
+        for (int i = 0; i <= OSD_OP_MAX; i++)
+        {
+            if (op_stat_count[i] != 0)
+            {
+                printf("avg latency for op %d (%s): %ld us\n", i, osd_op_names[i], op_stat_sum[i]/op_stat_count[i]);
+                op_stat_count[i] = 0;
+                op_stat_sum[i] = 0;
+            }
+        }
+        for (int i = 0; i <= OSD_OP_MAX; i++)
+        {
+            if (subop_stat_count[i] != 0)
+            {
+                printf("avg latency for subop %d (%s): %ld us\n", i, osd_op_names[i], subop_stat_sum[i]/subop_stat_count[i]);
+                subop_stat_count[i] = 0;
+                subop_stat_sum[i] = 0;
+            }
+        }
+        if (send_stat_count != 0)
+        {
+            printf("avg latency to send stabilize subop: %ld us\n", send_stat_sum/send_stat_count);
+            send_stat_count = 0;
+            send_stat_sum = 0;
+        }
+    });
    this->bs_block_size = bs->get_block_size();
    // FIXME: use bitmap granularity instead
    this->bs_disk_alignment = bs->get_disk_alignment();

-    parse_config(config);
-
-    epoll_fd = epoll_create(1);
-    if (epoll_fd < 0)
-    {
-        throw std::runtime_error(std::string("epoll_create: ") + strerror(errno));
-    }
-
-    this->tfd = new timerfd_manager_t([this](int fd, std::function<void(int, int)> handler) { set_fd_handler(fd, handler); });
-    this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
-    {
-        print_stats();
-    });
-
-    c_cli.tfd = this->tfd;
-    c_cli.ringloop = this->ringloop;
-    c_cli.exec_op = [this](osd_op_t *op) { exec_op(op); };
-    c_cli.repeer_pgs = [this](osd_num_t peer_osd) { repeer_pgs(peer_osd); };
-
-    init_cluster();
-
-    consumer.loop = [this]() { loop(); };
-    ringloop->register_consumer(&consumer);
-}
-
-osd_t::~osd_t()
-{
-    if (tfd)
-    {
-        delete tfd;
-        tfd = NULL;
-    }
-    ringloop->unregister_consumer(&consumer);
-    close(epoll_fd);
-    close(listen_fd);
-}
-
-void osd_t::parse_config(blockstore_config_t & config)
-{
-    // Initial startup configuration
-    json11::Json json_config = json11::Json(config);
-    st_cli.parse_config(json_config);
-    etcd_report_interval = strtoull(config["etcd_report_interval"].c_str(), NULL, 10);
-    if (etcd_report_interval <= 0)
-        etcd_report_interval = 30;
-    osd_num = strtoull(config["osd_num"].c_str(), NULL, 10);
-    if (!osd_num)
-        throw std::runtime_error("osd_num is required in the configuration");
-    c_cli.osd_num = osd_num;
-    run_primary = config["run_primary"] != "false" && config["run_primary"] != "0" && config["run_primary"] != "no";
-    // Cluster configuration
    bind_address = config["bind_address"];
    if (bind_address == "")
        bind_address = "0.0.0.0";
-    bind_port = stoull_full(config["bind_port"]);
-    if (bind_port <= 0 || bind_port > 65535)
-        bind_port = 0;
-    if (config["immediate_commit"] == "all")
-        immediate_commit = IMMEDIATE_ALL;
-    else if (config["immediate_commit"] == "small")
-        immediate_commit = IMMEDIATE_SMALL;
-    if (config.find("autosync_interval") != config.end())
-    {
-        autosync_interval = strtoull(config["autosync_interval"].c_str(), NULL, 10);
-        if (autosync_interval > MAX_AUTOSYNC_INTERVAL)
-            autosync_interval = DEFAULT_AUTOSYNC_INTERVAL;
-    }
-    if (config.find("client_queue_depth") != config.end())
-    {
-        client_queue_depth = strtoull(config["client_queue_depth"].c_str(), NULL, 10);
-        if (client_queue_depth < 128)
-            client_queue_depth = 128;
-    }
-    if (config.find("pg_stripe_size") != config.end())
-    {
-        pg_stripe_size = strtoull(config["pg_stripe_size"].c_str(), NULL, 10);
-        if (!pg_stripe_size || !bs_block_size || pg_stripe_size < bs_block_size || (pg_stripe_size % bs_block_size) != 0)
-            pg_stripe_size = DEFAULT_PG_STRIPE_SIZE;
-    }
-    recovery_queue_depth = strtoull(config["recovery_queue_depth"].c_str(), NULL, 10);
-    if (recovery_queue_depth < 1 || recovery_queue_depth > MAX_RECOVERY_QUEUE)
-        recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
-    if (config["readonly"] == "true" || config["readonly"] == "1" || config["readonly"] == "yes")
-        readonly = true;
-    print_stats_interval = strtoull(config["print_stats_interval"].c_str(), NULL, 10);
-    if (!print_stats_interval)
-        print_stats_interval = 3;
-    c_cli.peer_connect_interval = strtoull(config["peer_connect_interval"].c_str(), NULL, 10);
-    if (!c_cli.peer_connect_interval)
-        c_cli.peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
-    c_cli.peer_connect_timeout = strtoull(config["peer_connect_timeout"].c_str(), NULL, 10);
-    if (!c_cli.peer_connect_timeout)
-        c_cli.peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
-    log_level = strtoull(config["log_level"].c_str(), NULL, 10);
-    c_cli.log_level = log_level;
-}
+    bind_port = strtoull(config["bind_port"].c_str(), NULL, 10);
+    if (!bind_port || bind_port > 65535)
+        bind_port = 11203;
+    osd_num = strtoull(config["osd_num"].c_str(), NULL, 10);
+    if (!osd_num)
+        throw std::runtime_error("osd_num is required in the configuration");
+    run_primary = config["run_primary"] == "true" || config["run_primary"] == "1" || config["run_primary"] == "yes";
+    if (run_primary)
+        init_primary();

-void osd_t::bind_socket()
-{
    listen_fd = socket(AF_INET, SOCK_STREAM, 0);
    if (listen_fd < 0)
    {
@@ -151,27 +88,13 @@ void osd_t::bind_socket()
        throw std::runtime_error("bind address "+bind_address+(r == 0 ? " is not valid" : ": no ipv4 support"));
    }
    addr.sin_family = AF_INET;
-
    addr.sin_port = htons(bind_port);
+
    if (bind(listen_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
    {
        close(listen_fd);
        throw std::runtime_error(std::string("bind: ") + strerror(errno));
    }
-    if (bind_port == 0)
-    {
-        socklen_t len = sizeof(addr);
-        if (getsockname(listen_fd, (sockaddr *)&addr, &len) == -1)
-        {
-            close(listen_fd);
-            throw std::runtime_error(std::string("getsockname: ") + strerror(errno));
-        }
-        listening_port = ntohs(addr.sin_port);
-    }
-    else
-    {
-        listening_port = bind_port;
-    }

    if (listen(listen_fd, listen_backlog) < 0)
    {
@@ -181,6 +104,13 @@ void osd_t::bind_socket()

    fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);

+    epoll_fd = epoll_create(1);
+    if (epoll_fd < 0)
+    {
+        close(listen_fd);
+        throw std::runtime_error(std::string("epoll_create: ") + strerror(errno));
+    }
+
    epoll_event ev;
    ev.data.fd = listen_fd;
    ev.events = EPOLLIN | EPOLLET;
@@ -190,6 +120,39 @@ void osd_t::bind_socket()
        close(epoll_fd);
        throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
    }
+
+    consumer.loop = [this]() { loop(); };
+    ringloop->register_consumer(consumer);
+}
+
+osd_t::~osd_t() // Tear down the OSD: drop the stats timer, detach from the ring loop, close the owned descriptors
+{
+    delete tick_tfd; // periodic latency-stats timer allocated in the constructor
+    ringloop->unregister_consumer(consumer); // undoes the register_consumer() call made at the end of the constructor
+    close(epoll_fd); // epoll instance created in the constructor
+    close(listen_fd); // listening TCP socket created in the constructor
+}
+
+osd_op_t::~osd_op_t()
+{
+    // Release everything this operation owns. No null checks are needed:
+    // `delete` on a null pointer and free(NULL) are both defined to do
+    // nothing, so unset members are simply skipped.
+
+    // bs_op is released with delete
+    delete bs_op;
+
+    // op_data is released with free()
+    free(op_data);
+
+    // rmw_buf is released with free()
+    free(rmw_buf);
+
+    // buf is released with free().
+    // Note: reusing osd_op_t WILL currently lead to memory leaks,
+    // so an osd_op_t is never reused; it is destroyed (and buf freed)
+    // every time.
+    free(buf);
 }

 bool osd_t::shutdown()
@@ -210,33 +173,8 @@ void osd_t::loop()
        wait_state = 1;
    }
    handle_peers();
-    c_cli.read_requests();
-    c_cli.send_replies();
-    ringloop->submit();
-}
-
-void osd_t::set_fd_handler(int fd, std::function<void(int, int)> handler)
-{
-    if (handler != NULL)
-    {
-        bool exists = epoll_handlers.find(fd) != epoll_handlers.end();
-        epoll_event ev;
-        ev.data.fd = fd;
-        ev.events = EPOLLOUT | EPOLLIN | EPOLLRDHUP | EPOLLET;
-        if (epoll_ctl(epoll_fd, exists ? EPOLL_CTL_MOD : EPOLL_CTL_ADD, fd, &ev) < 0)
-        {
-            throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
-        }
-        epoll_handlers[fd] = handler;
-    }
-    else
-    {
-        if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, NULL) < 0 && errno != ENOENT)
-        {
-            throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
-        }
-        epoll_handlers.erase(fd);
-    }
+    read_requests();
+    send_replies();
 }

 void osd_t::handle_epoll_events()
@@ -265,12 +203,63 @@ restart:
    {
        if (events[i].data.fd == listen_fd)
        {
-            c_cli.accept_connections(listen_fd);
+            // Accept connections until accept() fails (listen_fd is non-blocking and edge-triggered, so drain it fully)
+            sockaddr_in addr;
+            socklen_t peer_addr_size = sizeof(addr);
+            int peer_fd;
+            while ((peer_fd = accept(listen_fd, (sockaddr*)&addr, &peer_addr_size)) >= 0)
+            {
+                char peer_str[256];
+                printf("osd: new client %d: connection from %s port %d\n", peer_fd, inet_ntop(AF_INET, &addr.sin_addr, peer_str, 256), ntohs(addr.sin_port));
+                fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK); // fix: extend the accepted socket's OWN flags, not listen_fd's
+                int one = 1;
+                setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one)); // disable Nagle on the client socket
+                clients[peer_fd] = {
+                    .peer_addr = addr,
+                    .peer_port = ntohs(addr.sin_port),
+                    .peer_fd = peer_fd,
+                    .peer_state = PEER_CONNECTED,
+                };
+                // Register the client socket in epoll: edge-triggered readability plus peer hangup
+                epoll_event ev;
+                ev.data.fd = peer_fd;
+                ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
+                if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, peer_fd, &ev) < 0)
+                {
+                    throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
+                }
+                // accept() overwrites peer_addr_size with the actual address length, so reset it before the next call
+                peer_addr_size = sizeof(addr);
+            }
+            if (peer_fd == -1 && errno != EAGAIN) // EAGAIN only means the pending-connection queue is drained
+            {
+                throw std::runtime_error(std::string("accept: ") + strerror(errno));
+            }
        }
        else
        {
-            auto & cb = epoll_handlers[events[i].data.fd];
-            cb(events[i].data.fd, events[i].events);
+            auto & cl = clients[events[i].data.fd];
+            if (cl.peer_state == PEER_CONNECTING)
+            {
+                // Either OUT (connected) or HUP
+                handle_connect_result(cl.peer_fd);
+            }
+            else if (events[i].events & EPOLLRDHUP)
+            {
+                // Stop client
+                printf("osd: client %d disconnected\n", cl.peer_fd);
+                stop_client(cl.peer_fd);
+            }
+            else
+            {
+                // Mark client as ready (i.e. some data is available)
+                cl.read_ready++;
+                if (cl.read_ready == 1)
+                {
+                    read_ready_clients.push_back(cl.peer_fd);
+                    ringloop->wakeup();
+                }
+            }
        }
    }
    if (nfds == MAX_EPOLL_EVENTS)
@@ -279,6 +268,85 @@ restart:
    }
 }

+void osd_t::cancel_osd_ops(osd_client_t & cl)
+{
+    // Cancel every operation attached to this client, in a fixed order:
+    // 1) entries of sent_ops, 2) entries of outbox, 3) the current write_op
+    for (auto & sent_entry: cl.sent_ops)
+        cancel_op(sent_entry.second);
+    cl.sent_ops.clear();
+    for (auto & queued_op: cl.outbox)
+        cancel_op(queued_op);
+    cl.outbox.clear();
+    auto writing_op = cl.write_op;
+    if (writing_op)
+    {
+        // The member is reset only after cancel_op() has returned
+        cancel_op(writing_op);
+        cl.write_op = NULL;
+    }
+}
+
+void osd_t::cancel_op(osd_op_t *op)
+{
+    if (op->op_type != OSD_OP_OUT)
+    {
+        delete op; // not an outbound op: no callback is invoked, the op is just destroyed
+        return;
+    }
+    // Outbound op: fill in a reply that echoes the request and carries -EPIPE, then run the callback
+    auto & reply_hdr = op->reply.hdr;
+    reply_hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
+    reply_hdr.id = op->req.hdr.id;
+    reply_hdr.opcode = op->req.hdr.opcode;
+    reply_hdr.retval = -EPIPE;
+    op->callback(op);
+}
+
+void osd_t::stop_client(int peer_fd) // Disconnect one client: remove it from epoll, cancel its ops, drop all bookkeeping, close the socket
+{
+    auto it = clients.find(peer_fd);
+    if (it == clients.end())
+    {
+        return; // unknown fd: nothing to stop
+    }
+    auto & cl = it->second;
+    if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, peer_fd, NULL) < 0)
+    {
+        throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno)); // NOTE(review): thrown before any cleanup below has run
+    }
+    if (cl.osd_num) // non-zero osd_num: presumably this connection belongs to a peer OSD - TODO confirm
+    {
+        // Cancel the peer's sent, queued and currently-written operations (see cancel_osd_ops)
+        cancel_osd_ops(cl);
+        osd_peer_fds.erase(cl.osd_num); // forget the osd_num -> fd mapping
+        repeer_pgs(cl.osd_num, false);
+        peering_state |= OSD_PEERING_PEERS; // NOTE(review): presumably makes the main loop revisit peer connections - confirm
+    }
+    if (cl.read_op)
+    {
+        delete cl.read_op; // op attached to the client's read side, if any
+    }
+    for (auto rit = read_ready_clients.begin(); rit != read_ready_clients.end(); rit++) // drop the fd from the read-ready list (first match only)
+    {
+        if (*rit == peer_fd)
+        {
+            read_ready_clients.erase(rit);
+            break; // rit is invalid after erase(), so leave the loop immediately
+        }
+    }
+    for (auto wit = write_ready_clients.begin(); wit != write_ready_clients.end(); wit++) // same for the write-ready list
+    {
+        if (*wit == peer_fd)
+        {
+            write_ready_clients.erase(wit);
+            break; // wit is invalid after erase(), so leave the loop immediately
+        }
+    }
+    clients.erase(it); // `cl` refers to the erased element and must not be used past this point
+    close(peer_fd);
+}
+
 void osd_t::exec_op(osd_op_t *cur_op)
 {
    clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
@@ -288,29 +356,23 @@ void osd_t::exec_op(osd_op_t *cur_op)
        delete cur_op;
        return;
    }
-    inflight_ops++;
    cur_op->send_list.push_back(cur_op->reply.buf, OSD_PACKET_SIZE);
    if (cur_op->req.hdr.magic != SECONDARY_OSD_OP_MAGIC ||
        cur_op->req.hdr.opcode < OSD_OP_MIN || cur_op->req.hdr.opcode > OSD_OP_MAX ||
        (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_READ || cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE) &&
-        (cur_op->req.sec_rw.len > OSD_RW_MAX || cur_op->req.sec_rw.len % bs_disk_alignment || cur_op->req.sec_rw.offset % bs_disk_alignment) ||
-        (cur_op->req.hdr.opcode == OSD_OP_READ || cur_op->req.hdr.opcode == OSD_OP_WRITE || cur_op->req.hdr.opcode == OSD_OP_DELETE) &&
-        (cur_op->req.rw.len > OSD_RW_MAX || cur_op->req.rw.len % bs_disk_alignment || cur_op->req.rw.offset % bs_disk_alignment))
+        (cur_op->req.sec_rw.len > OSD_RW_MAX || cur_op->req.sec_rw.len % OSD_RW_ALIGN || cur_op->req.sec_rw.offset % OSD_RW_ALIGN) ||
+        (cur_op->req.hdr.opcode == OSD_OP_READ || cur_op->req.hdr.opcode == OSD_OP_WRITE) &&
+        (cur_op->req.rw.len > OSD_RW_MAX || cur_op->req.rw.len % OSD_RW_ALIGN || cur_op->req.rw.offset % OSD_RW_ALIGN))
    {
        // Bad command
-        finish_op(cur_op, -EINVAL);
-        return;
-    }
-    if (readonly &&
-        cur_op->req.hdr.opcode != OSD_OP_SECONDARY_READ &&
-        cur_op->req.hdr.opcode != OSD_OP_SECONDARY_LIST &&
-        cur_op->req.hdr.opcode != OSD_OP_READ &&
-        cur_op->req.hdr.opcode != OSD_OP_SHOW_CONFIG)
-    {
-        // Readonly mode
-        finish_op(cur_op, -EROFS);
+        cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
+        cur_op->reply.hdr.id = cur_op->req.hdr.id;
+        cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
+        cur_op->reply.hdr.retval = -EINVAL;
+        outbox_push(this->clients[cur_op->peer_fd], cur_op);
        return;
    }
+    inflight_ops++;
    if (cur_op->req.hdr.opcode == OSD_OP_TEST_SYNC_STAB_ALL)
    {
        exec_sync_stab_all(cur_op);
@@ -331,84 +393,8 @@ void osd_t::exec_op(osd_op_t *cur_op)
    {
        continue_primary_sync(cur_op);
    }
-    else if (cur_op->req.hdr.opcode == OSD_OP_DELETE)
-    {
-        continue_primary_del(cur_op);
-    }
    else
    {
        exec_secondary(cur_op);
    }
 }
-
-void osd_t::reset_stats()
-{
-    c_cli.stats = { 0 };
-    prev_stats = { 0 };
-    memset(recovery_stat_count, 0, sizeof(recovery_stat_count));
-    memset(recovery_stat_bytes, 0, sizeof(recovery_stat_bytes));
-}
-
-void osd_t::print_stats()
-{
-    for (int i = 0; i <= OSD_OP_MAX; i++)
-    {
-        if (c_cli.stats.op_stat_count[i] != prev_stats.op_stat_count[i])
-        {
-            uint64_t avg = (c_cli.stats.op_stat_sum[i] - prev_stats.op_stat_sum[i])/(c_cli.stats.op_stat_count[i] - prev_stats.op_stat_count[i]);
-            uint64_t bw = (c_cli.stats.op_stat_bytes[i] - prev_stats.op_stat_bytes[i]) / print_stats_interval;
-            if (c_cli.stats.op_stat_bytes[i] != 0)
-            {
-                printf(
-                    "[OSD %lu] avg latency for op %d (%s): %lu us, B/W: %.2f %s\n", osd_num, i, osd_op_names[i], avg,
-                    (bw > 1024*1024*1024 ? bw/1024.0/1024/1024 : (bw > 1024*1024 ? bw/1024.0/1024 : bw/1024.0)),
-                    (bw > 1024*1024*1024 ? "GB/s" : (bw > 1024*1024 ? "MB/s" : "KB/s"))
-                );
-            }
-            else
-            {
-                printf("[OSD %lu] avg latency for op %d (%s): %lu us\n", osd_num, i, osd_op_names[i], avg);
-            }
-            prev_stats.op_stat_count[i] = c_cli.stats.op_stat_count[i];
-            prev_stats.op_stat_sum[i] = c_cli.stats.op_stat_sum[i];
-            prev_stats.op_stat_bytes[i] = c_cli.stats.op_stat_bytes[i];
-        }
-    }
-    for (int i = 0; i <= OSD_OP_MAX; i++)
-    {
-        if (c_cli.stats.subop_stat_count[i] != prev_stats.subop_stat_count[i])
-        {
-            uint64_t avg = (c_cli.stats.subop_stat_sum[i] - prev_stats.subop_stat_sum[i])/(c_cli.stats.subop_stat_count[i] - prev_stats.subop_stat_count[i]);
-            printf("[OSD %lu] avg latency for subop %d (%s): %ld us\n", osd_num, i, osd_op_names[i], avg);
-            prev_stats.subop_stat_count[i] = c_cli.stats.subop_stat_count[i];
-            prev_stats.subop_stat_sum[i] = c_cli.stats.subop_stat_sum[i];
-        }
-    }
-    for (int i = 0; i < 2; i++)
-    {
-        if (recovery_stat_count[0][i] != recovery_stat_count[1][i])
-        {
-            uint64_t bw = (recovery_stat_bytes[0][i] - recovery_stat_bytes[1][i]) / print_stats_interval;
-            printf(
-                "[OSD %lu] %s recovery: %.1f op/s, B/W: %.2f %s\n", osd_num, recovery_stat_names[i],
-                (recovery_stat_count[0][i] - recovery_stat_count[1][i]) * 1.0 / print_stats_interval,
-                (bw > 1024*1024*1024 ? bw/1024.0/1024/1024 : (bw > 1024*1024 ? bw/1024.0/1024 : bw/1024.0)),
-                (bw > 1024*1024*1024 ? "GB/s" : (bw > 1024*1024 ? "MB/s" : "KB/s"))
-            );
-            recovery_stat_count[1][i] = recovery_stat_count[0][i];
-            recovery_stat_bytes[1][i] = recovery_stat_bytes[0][i];
-        }
-    }
-    if (incomplete_objects > 0)
-    {
-        printf("[OSD %lu] %lu object(s) incomplete\n", osd_num, incomplete_objects);
-    }
-    if (degraded_objects > 0)
-    {
-        printf("[OSD %lu] %lu object(s) degraded\n", osd_num, degraded_objects);
-    }
-    if (misplaced_objects > 0)
-    {
-        printf("[OSD %lu] %lu object(s) misplaced\n", osd_num, misplaced_objects);
-    }
-}
--- a/osd.h
+++ b/osd.h
@@ -15,29 +15,144 @@

 #include "blockstore.h"
 #include "ringloop.h"
-#include "timerfd_manager.h"
+#include "timerfd_interval.h"
+#include "osd_ops.h"
 #include "osd_peering_pg.h"
-#include "messenger.h"
-#include "etcd_state_client.h"

-#define OSD_LOADING_PGS 0x01
-#define OSD_PEERING_PGS 0x04
-#define OSD_FLUSHING_PGS 0x08
-#define OSD_RECOVERING 0x10
+#include "sparsepp/sparsepp/spp.h"

-#define IMMEDIATE_NONE 0
-#define IMMEDIATE_SMALL 1
-#define IMMEDIATE_ALL 2
+#define OSD_OP_IN 0
+#define OSD_OP_OUT 1

-#define MAX_AUTOSYNC_INTERVAL 3600
-#define DEFAULT_AUTOSYNC_INTERVAL 5
-#define MAX_RECOVERY_QUEUE 2048
-#define DEFAULT_RECOVERY_QUEUE 4
-#define DEFAULT_PG_STRIPE_SIZE 4*1024*1024 // 4 MB by default
+#define CL_READ_OP 1
+#define CL_READ_DATA 2
+#define CL_READ_REPLY_DATA 3
+#define CL_WRITE_READY 1
+#define CL_WRITE_REPLY 2
+#define MAX_EPOLL_EVENTS 64
+#define OSD_OP_INLINE_BUF_COUNT 16
+
+#define PEER_CONNECTING 1
+#define PEER_CONNECTED 2
+#define OSD_PEERING_PEERS 1
+#define OSD_PEERING_PGS 2

 //#define OSD_STUB

-extern const char* osd_op_names[];
+struct osd_op_buf_list_t // Growable iovec list; the first OSD_OP_INLINE_BUF_COUNT entries live in inline_buf
+{
+    int count = 0, alloc = 0, sent = 0; // used entries / allocated entries / leading entries skipped by get_iovec()
+    iovec *buf = NULL; // NULL until the first push_back(), then inline_buf or a heap array
+    iovec inline_buf[OSD_OP_INLINE_BUF_COUNT];
+    // Frees the heap array if push_back() ever had to allocate one
+    ~osd_op_buf_list_t()
+    {
+        if (buf && buf != inline_buf)
+        {
+            free(buf);
+        }
+    }
+    // Returns a pointer to the first entry past the `sent` ones
+    inline iovec* get_iovec()
+    {
+        return (buf ? buf : inline_buf) + sent;
+    }
+    // Returns the number of entries past the `sent` ones
+    inline int get_size()
+    {
+        return count - sent;
+    }
+    // Appends { nbuf, len }; when storage is full it grows to the next multiple of 16 entries
+    inline void push_back(void *nbuf, size_t len)
+    {
+        if (count >= alloc)
+        {
+            if (!alloc)
+            {
+                alloc = OSD_OP_INLINE_BUF_COUNT;
+                buf = inline_buf;
+            }
+            else if (buf == inline_buf)
+            {
+                int old = alloc;
+                alloc = ((alloc/16) + 1)*16; // was (alloc/16)*16 + 1, which stalled at 17 and overflowed the array
+                buf = (iovec*)malloc(sizeof(iovec) * alloc);
+                memcpy(buf, inline_buf, sizeof(iovec)*old);
+            }
+            else
+            {
+                alloc = ((alloc/16) + 1)*16; // always strictly larger than the previous alloc
+                buf = (iovec*)realloc(buf, sizeof(iovec) * alloc);
+            }
+        }
+        buf[count++] = { .iov_base = nbuf, .iov_len = len };
+    }
+};
+
+struct osd_primary_op_data_t;
+
+struct osd_op_t // Holds the request, reply, buffers and completion callback of one OSD operation
+{
+    timespec tv_begin; // NOTE(review): presumably the time the operation started — confirm
+    timespec tv_send; // NOTE(review): presumably the time the operation was sent — confirm
+    int op_type = OSD_OP_IN; // OSD_OP_IN (default) or OSD_OP_OUT
+    int peer_fd; // has no default initializer
+    osd_any_op_t req;
+    osd_any_reply_t reply;
+    blockstore_op_t *bs_op = NULL;
+    void *buf = NULL;
+    void *rmw_buf = NULL;
+    osd_primary_op_data_t* op_data = NULL; // type is only forward-declared in this header
+    std::function<void(osd_op_t*)> callback;
+
+    osd_op_buf_list_t send_list; // iovec list belonging to this operation
+
+    ~osd_op_t(); // declared only; the body is not in this header
+};
+
+struct osd_peer_def_t // Number, address and port of one peer OSD (return type of osd_t::parse_peer())
+{
+    osd_num_t osd_num = 0;
+    std::string addr;
+    int port = 0;
+    time_t last_connect_attempt = 0; // NOTE(review): presumably used to pace reconnect attempts — confirm
+};
+
+struct osd_client_t // State of one socket connection; osd_t keeps these in its `clients` map
+{
+    sockaddr_in peer_addr; // peer_addr, peer_port, peer_fd and peer_state have no default initializers
+    int peer_port;
+    int peer_fd;
+    int peer_state; // NOTE(review): presumably PEER_CONNECTING or PEER_CONNECTED — confirm
+    std::function<void(osd_num_t, int)> connect_callback;
+    osd_num_t osd_num = 0;
+
+    // Read state
+    int read_ready = 0;
+    osd_op_t *read_op = NULL;
+    int read_reply_id = 0;
+    iovec read_iov;
+    msghdr read_msg;
+    void *read_buf = NULL;
+    int read_remaining = 0;
+    int read_state = 0; // NOTE(review): presumably 0 or one of the CL_READ_* values — confirm
+
+    // Outbound operations sent to this client (which is probably an OSD peer)
+    std::map<int, osd_op_t*> sent_ops;
+
+    // Outbound messages (replies or requests)
+    std::deque<osd_op_t*> outbox;
+
+    // PGs dirtied by this client's primary-writes
+    std::set<pg_num_t> dirty_pgs;
+
+    // Write state
+    osd_op_t *write_op = NULL;
+    msghdr write_msg;
+    int write_state = 0; // NOTE(review): presumably 0 or one of the CL_WRITE_* values — confirm
+};
+
+struct osd_rmw_stripe_t;

 struct osd_object_id_t
 {
@@ -45,58 +160,26 @@ struct osd_object_id_t
    object_id oid;
 };

-struct osd_recovery_op_t
-{
-    int st = 0;
-    bool degraded = false;
-    pg_num_t pg_num = 0;
-    object_id oid = { 0 };
-    osd_op_t *osd_op = NULL;
-};
-
 class osd_t
 {
    // config

-    blockstore_config_t config;
-    int etcd_report_interval = 30;
-
-    bool readonly = false;
    osd_num_t osd_num = 1; // OSD numbers start with 1
    bool run_primary = false;
+    std::vector<osd_peer_def_t> peers;
+    blockstore_config_t config;
    std::string bind_address;
    int bind_port, listen_backlog;
-    // FIXME: Implement client queue depth limit
    int client_queue_depth = 128;
    bool allow_test_ops = true;
-    int print_stats_interval = 3;
-    int immediate_commit = IMMEDIATE_NONE;
-    int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // sync every 5 seconds
-    int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
-    int log_level = 0;

-    // cluster state
+    // peer OSDs

-    etcd_state_client_t st_cli;
-    osd_messenger_t c_cli;
-    int etcd_failed_attempts = 0;
-    std::string etcd_lease_id;
-    json11::Json self_state;
-    bool loading_peer_config = false;
-    std::set<pg_num_t> pg_state_dirty;
-    bool pg_config_applied = false;
-    bool etcd_reporting_pg_state = false;
-    bool etcd_reporting_stats = false;
-
-    // peers and PGs
-
-    std::map<pg_num_t, pg_t> pgs;
-    std::set<pg_num_t> dirty_pgs;
-    uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0;
+    std::map<uint64_t, int> osd_peer_fds;
+    std::vector<pg_t> pgs;
    int peering_state = 0;
    unsigned pg_count = 0;
-    std::map<object_id, osd_recovery_op_t> recovery_ops;
-    osd_op_t *autosync_op = NULL;
+    uint64_t next_subop_id = 1;

    // Unstable writes
    std::map<osd_object_id_t, uint64_t> unstable_writes;
@@ -108,73 +191,53 @@ class osd_t
    int inflight_ops = 0;
    blockstore_t *bs;
    uint32_t bs_block_size, bs_disk_alignment;
-    uint64_t pg_stripe_size = DEFAULT_PG_STRIPE_SIZE;
+    uint64_t parity_block_size = 4*1024*1024; // 4 MB by default
    ring_loop_t *ringloop;
-    timerfd_manager_t *tfd = NULL;
+    timerfd_interval *tick_tfd;

    int wait_state = 0;
    int epoll_fd = 0;
-    int listening_port = 0;
    int listen_fd = 0;
    ring_consumer_t consumer;
-    std::map<int, std::function<void(int, int)>> epoll_handlers;

-    // op statistics
-    osd_op_stats_t prev_stats;
-    const char* recovery_stat_names[2] = { "degraded", "misplaced" };
-    uint64_t recovery_stat_count[2][2] = { 0 };
-    uint64_t recovery_stat_bytes[2][2] = { 0 };
+    std::unordered_map<int,osd_client_t> clients;
+    std::vector<int> read_ready_clients;
+    std::vector<int> write_ready_clients;
+    uint64_t op_stat_sum[OSD_OP_MAX+1] = { 0 };
+    uint64_t op_stat_count[OSD_OP_MAX+1] = { 0 };
+    uint64_t subop_stat_sum[OSD_OP_MAX+1] = { 0 };
+    uint64_t subop_stat_count[OSD_OP_MAX+1] = { 0 };
+    uint64_t send_stat_sum = 0;
+    uint64_t send_stat_count = 0;

-    // cluster connection
-    void parse_config(blockstore_config_t & config);
-    void init_cluster();
-    void on_change_osd_state_hook(uint64_t osd_num);
-    void on_change_etcd_state_hook(json11::Json::object & changes);
-    void on_load_config_hook(json11::Json::object & changes);
-    json11::Json on_load_pgs_checks_hook();
-    void on_load_pgs_hook(bool success);
-    void bind_socket();
-    void acquire_lease();
-    json11::Json get_osd_state();
-    void create_osd_state();
-    void renew_lease();
-    void print_stats();
-    void reset_stats();
-    json11::Json get_statistics();
-    void report_statistics();
-    void report_pg_state(pg_t & pg);
-    void report_pg_states();
-    void apply_pg_count();
-    void apply_pg_config();
+    // methods

    // event loop, socket read/write
    void loop();
-    void set_fd_handler(int fd, std::function<void(int, int)> handler);
    void handle_epoll_events();
+    void read_requests();
+    void handle_read(ring_data_t *data, int peer_fd);
+    void handle_op_hdr(osd_client_t *cl);
+    void handle_reply_hdr(osd_client_t *cl);
+    bool try_send(osd_client_t & cl);
+    void send_replies();
+    void handle_send(ring_data_t *data, int peer_fd);
+    void outbox_push(osd_client_t & cl, osd_op_t *op);

    // peer handling (primary OSD logic)
-    void parse_test_peer(std::string peer);
+    void connect_peer(osd_num_t osd_num, const char *peer_host, int peer_port, std::function<void(osd_num_t, int)> callback);
+    void handle_connect_result(int peer_fd);
+    void cancel_osd_ops(osd_client_t & cl);
+    void cancel_op(osd_op_t *op);
+    void stop_client(int peer_fd);
+    osd_peer_def_t parse_peer(std::string peer);
+    void init_primary();
    void handle_peers();
-    void repeer_pgs(osd_num_t osd_num);
-    void start_pg_peering(pg_num_t pg_num);
-    void submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *ps);
-    void submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps);
-    void discard_list_subop(osd_op_t *list_op);
-    bool stop_pg(pg_num_t pg_num);
-    void finish_stop_pg(pg_t & pg);
-
-    // flushing, recovery and backfill
-    void submit_pg_flush_ops(pg_num_t pg_num);
-    void handle_flush_op(bool rollback, pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t peer_osd, int retval);
-    void submit_flush_op(pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data);
-    bool pick_next_recovery(osd_recovery_op_t &op);
-    void submit_recovery_op(osd_recovery_op_t *op);
-    bool continue_recovery();
-    pg_osd_set_state_t* change_osd_set(pg_osd_set_state_t *st, pg_t *pg);
+    void repeer_pgs(osd_num_t osd_num, bool is_connected);
+    void start_pg_peering(int i);

    // op execution
    void exec_op(osd_op_t *cur_op);
-    void finish_op(osd_op_t *cur_op, int retval);

    // secondary ops
    void exec_sync_stab_all(osd_op_t *cur_op);
@@ -183,34 +246,18 @@ class osd_t
    void secondary_op_callback(osd_op_t *cur_op);

    // primary ops
-    void autosync();
    bool prepare_primary_rw(osd_op_t *cur_op);
    void continue_primary_read(osd_op_t *cur_op);
    void continue_primary_write(osd_op_t *cur_op);
-    void cancel_primary_write(osd_op_t *cur_op);
    void continue_primary_sync(osd_op_t *cur_op);
-    void continue_primary_del(osd_op_t *cur_op);
-    bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
-    void remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg);
-    bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
-    void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);
-    void handle_primary_bs_subop(osd_op_t *subop);
-    void add_bs_subop_stats(osd_op_t *subop);
-    void pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid, int retval);
+    void finish_primary_op(osd_op_t *cur_op, int retval);
+    void handle_primary_subop(osd_op_t *cur_op, int ok, uint64_t version);
    void submit_primary_subops(int submit_type, int read_pg_size, const uint64_t* osd_set, osd_op_t *cur_op);
-    void submit_primary_del_subops(osd_op_t *cur_op, uint64_t *cur_set, pg_osd_set_t & loc_set);
    void submit_primary_sync_subops(osd_op_t *cur_op);
    void submit_primary_stab_subops(osd_op_t *cur_op);
-
-    inline pg_num_t map_to_pg(object_id oid)
-    {
-        return (oid.inode + oid.stripe / pg_stripe_size) % pg_count + 1;
-    }
-
 public:
    osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop);
    ~osd_t();
-    void force_stop(int exitcode);
    bool shutdown();
 };

--- a/osd_client.cpp
+++ b/osd_client.cpp
@@ -0,0 +1,40 @@
+void slice()
+{
+    // Slice the request into blockstore requests to individual objects
+    // Primary OSD still operates individual stripes, except they're twice the size of the blockstore's stripe.
+    std::vector read_parts;
+    int block = bs->get_block_size();
+    uint64_t stripe1 = cur_op->req.rw.offset / block / 2;
+    uint64_t stripe2 = (cur_op->req.rw.offset + cur_op->req.rw.len + block*2 - 1) / block / 2 - 1;
+    for (uint64_t s = stripe1; s <= stripe2; s++)
+    {
+        uint64_t start = s == stripe1 ? cur_op->req.rw.offset - stripe1*block*2 : 0;
+        uint64_t end = s == stripe2 ? cur_op->req.rw.offset + cur_op->req.rw.len - stripe2*block*2 : block*2;
+        if (start < block)
+        {
+            read_parts.push_back({
+                .role = 1,
+                .oid = {
+                    .inode = cur_op->req.rw.inode,
+                    .stripe = (s << STRIPE_ROLE_BITS) | 1,
+                },
+                .version = UINT64_MAX,
+                .offset = start,
+                .len = (block < end ? block : end) - start,
+            });
+        }
+        if (end > block)
+        {
+            read_parts.push_back({
+                .role = 2,
+                .oid = {
+                    .inode = cur_op->req.rw.inode,
+                    .stripe = (s << STRIPE_ROLE_BITS) | 2,
+                },
+                .version = UINT64_MAX,
+                .offset = (start > block ? start-block : 0),
+                .len = end - (start > block ? start-block : 0),
+            });
+        }
+    }
+}
--- a/osd_cluster.cpp
+++ b/osd_cluster.cpp
@@ -1,746 +0,0 @@
-#include "osd.h"
-#include "base64.h"
-#include "etcd_state_client.h"
-
-// Startup sequence:
-//   Start etcd watcher -> Load global OSD configuration -> Bind socket -> Acquire lease -> Report&lock OSD state
-//   -> Load PG config -> Report&lock PG states -> Load peers -> Connect to peers -> Peer PGs
-// Event handling
-//   Wait for PG changes -> Start/Stop PGs when requested
-//   Peer connection is lost -> Reload connection data -> Try to reconnect
-void osd_t::init_cluster()
-{
-    if (!st_cli.etcd_addresses.size())
-    {
-        if (run_primary)
-        {
-            // Test version of clustering code with 1 PG and 2 peers
-            // Example: peers = 2:127.0.0.1:11204,3:127.0.0.1:11205
-            std::string peerstr = config["peers"];
-            while (peerstr.size())
-            {
-                int pos = peerstr.find(',');
-                parse_test_peer(pos < 0 ? peerstr : peerstr.substr(0, pos));
-                peerstr = pos < 0 ? std::string("") : peerstr.substr(pos+1);
-            }
-            if (st_cli.peer_states.size() < 2)
-            {
-                throw std::runtime_error("run_primary requires at least 2 peers");
-            }
-            pgs[1] = (pg_t){
-                .state = PG_PEERING,
-                .pg_cursize = 0,
-                .pg_num = 1,
-                .target_set = { 1, 2, 3 },
-                .cur_set = { 0, 0, 0 },
-            };
-            report_pg_state(pgs[1]);
-            pg_count = 1;
-        }
-        bind_socket();
-    }
-    else
-    {
-        st_cli.tfd = tfd;
-        st_cli.log_level = log_level;
-        st_cli.on_change_osd_state_hook = [this](uint64_t peer_osd) { on_change_osd_state_hook(peer_osd); };
-        st_cli.on_change_hook = [this](json11::Json::object & changes) { on_change_etcd_state_hook(changes); };
-        st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
-        st_cli.load_pgs_checks_hook = [this]() { return on_load_pgs_checks_hook(); };
-        st_cli.on_load_pgs_hook = [this](bool success) { on_load_pgs_hook(success); };
-        peering_state = OSD_LOADING_PGS;
-        st_cli.load_global_config();
-    }
-    if (run_primary && autosync_interval > 0)
-    {
-        this->tfd->set_timer(autosync_interval*1000, true, [this](int timer_id)
-        {
-            autosync();
-        });
-    }
-}
-
-void osd_t::parse_test_peer(std::string peer)
-{
-    // OSD_NUM:IP:PORT
-    int pos1 = peer.find(':');
-    int pos2 = peer.find(':', pos1+1);
-    if (pos1 < 0 || pos2 < 0)
-        throw new std::runtime_error("OSD peer string must be in the form OSD_NUM:IP:PORT");
-    std::string addr = peer.substr(pos1+1, pos2-pos1-1);
-    std::string osd_num_str = peer.substr(0, pos1);
-    std::string port_str = peer.substr(pos2+1);
-    osd_num_t peer_osd = strtoull(osd_num_str.c_str(), NULL, 10);
-    if (!peer_osd)
-        throw new std::runtime_error("Could not parse OSD peer osd_num");
-    else if (st_cli.peer_states.find(peer_osd) != st_cli.peer_states.end())
-        throw std::runtime_error("Same osd number "+std::to_string(peer_osd)+" specified twice in peers");
-    int port = strtoull(port_str.c_str(), NULL, 10);
-    if (!port)
-        throw new std::runtime_error("Could not parse OSD peer port");
-    st_cli.peer_states[peer_osd] = json11::Json::object {
-        { "state", "up" },
-        { "addresses", json11::Json::array { addr } },
-        { "port", port },
-    };
-    c_cli.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
-}
-
-json11::Json osd_t::get_osd_state()
-{
-    std::vector<char> hostname;
-    hostname.resize(1024);
-    while (gethostname(hostname.data(), hostname.size()) < 0 && errno == ENAMETOOLONG)
-        hostname.resize(hostname.size()+1024);
-    hostname.resize(strnlen(hostname.data(), hostname.size()));
-    json11::Json::object st;
-    st["state"] = "up";
-    if (bind_address != "0.0.0.0")
-        st["addresses"] = json11::Json::array { bind_address };
-    else
-        st["addresses"] = getifaddr_list();
-    st["host"] = std::string(hostname.data(), hostname.size());
-    st["port"] = listening_port;
-    st["primary_enabled"] = run_primary;
-    st["blockstore_enabled"] = bs ? true : false;
-    return st;
-}
-
-json11::Json osd_t::get_statistics()
-{
-    json11::Json::object st;
-    timespec ts;
-    clock_gettime(CLOCK_REALTIME, &ts);
-    char time_str[50] = { 0 };
-    sprintf(time_str, "%ld.%03ld", ts.tv_sec, ts.tv_nsec/1000000);
-    st["time"] = time_str;
-    st["blockstore_ready"] = bs->is_started();
-    if (bs)
-    {
-        st["size"] = bs->get_block_count() * bs->get_block_size();
-        st["free"] = bs->get_free_block_count() * bs->get_block_size();
-    }
-    st["host"] = self_state["host"];
-    json11::Json::object op_stats, subop_stats;
-    for (int i = 0; i <= OSD_OP_MAX; i++)
-    {
-        op_stats[osd_op_names[i]] = json11::Json::object {
-            { "count", c_cli.stats.op_stat_count[i] },
-            { "usec", c_cli.stats.op_stat_sum[i] },
-            { "bytes", c_cli.stats.op_stat_bytes[i] },
-        };
-    }
-    for (int i = 0; i <= OSD_OP_MAX; i++)
-    {
-        subop_stats[osd_op_names[i]] = json11::Json::object {
-            { "count", c_cli.stats.subop_stat_count[i] },
-            { "usec", c_cli.stats.subop_stat_sum[i] },
-        };
-    }
-    st["op_stats"] = op_stats;
-    st["subop_stats"] = subop_stats;
-    st["recovery_stats"] = json11::Json::object {
-        { recovery_stat_names[0], json11::Json::object {
-            { "count", recovery_stat_count[0][0] },
-            { "bytes", recovery_stat_bytes[0][0] },
-        } },
-        { recovery_stat_names[1], json11::Json::object {
-            { "count", recovery_stat_count[0][1] },
-            { "bytes", recovery_stat_bytes[0][1] },
-        } },
-    };
-    return st;
-}
-
-void osd_t::report_statistics()
-{
-    if (etcd_reporting_stats)
-    {
-        return;
-    }
-    etcd_reporting_stats = true;
-    json11::Json::array txn = { json11::Json::object {
-        { "request_put", json11::Json::object {
-            { "key", base64_encode(st_cli.etcd_prefix+"/osd/stats/"+std::to_string(osd_num)) },
-            { "value", base64_encode(get_statistics().dump()) },
-        } }
-    } };
-    for (auto & p: pgs)
-    {
-        auto & pg = p.second;
-        if (pg.state & (PG_OFFLINE | PG_STARTING))
-        {
-            // Don't report statistics for offline PGs
-            continue;
-        }
-        json11::Json::object pg_stats;
-        pg_stats["object_count"] = pg.total_count;
-        pg_stats["clean_count"] = pg.clean_count;
-        pg_stats["misplaced_count"] = pg.misplaced_objects.size();
-        pg_stats["degraded_count"] = pg.degraded_objects.size();
-        pg_stats["incomplete_count"] = pg.incomplete_objects.size();
-        pg_stats["write_osd_set"] = pg.cur_set;
-        txn.push_back(json11::Json::object {
-            { "request_put", json11::Json::object {
-                { "key", base64_encode(st_cli.etcd_prefix+"/pg/stats/"+std::to_string(pg.pg_num)) },
-                { "value", base64_encode(json11::Json(pg_stats).dump()) },
-            } }
-        });
-    }
-    st_cli.etcd_txn(json11::Json::object { { "success", txn } }, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json res)
-    {
-        etcd_reporting_stats = false;
-        if (err != "")
-        {
-            printf("[OSD %lu] Error reporting state to etcd: %s\n", this->osd_num, err.c_str());
-            // Retry indefinitely
-            tfd->set_timer(ETCD_SLOW_TIMEOUT, false, [this](int timer_id)
-            {
-                report_statistics();
-            });
-        }
-        else if (res["error"].string_value() != "")
-        {
-            printf("[OSD %lu] Error reporting state to etcd: %s\n", this->osd_num, res["error"].string_value().c_str());
-            force_stop(1);
-        }
-    });
-}
-
-void osd_t::on_change_osd_state_hook(uint64_t peer_osd)
-{
-    if (c_cli.wanted_peers.find(peer_osd) != c_cli.wanted_peers.end())
-    {
-        c_cli.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
-    }
-}
-
-void osd_t::on_change_etcd_state_hook(json11::Json::object & changes)
-{
-    // FIXME apply config changes in runtime (maybe, some)
-    apply_pg_count();
-    apply_pg_config();
-}
-
-void osd_t::on_load_config_hook(json11::Json::object & global_config)
-{
-    blockstore_config_t osd_config = this->config;
-    for (auto & cfg_var: global_config)
-    {
-        if (this->config.find(cfg_var.first) == this->config.end())
-        {
-            if (cfg_var.second.is_string())
-            {
-                osd_config[cfg_var.first] = cfg_var.second.string_value();
-            }
-            else
-            {
-                osd_config[cfg_var.first] = cfg_var.second.dump();
-            }
-        }
-    }
-    parse_config(osd_config);
-    bind_socket();
-    st_cli.start_etcd_watcher();
-    acquire_lease();
-}
-
-// Acquire lease
-void osd_t::acquire_lease()
-{
-    // Maximum lease TTL is (report interval) + retries * (timeout + repeat interval)
-    st_cli.etcd_call("/lease/grant", json11::Json::object {
-        { "TTL", etcd_report_interval+(MAX_ETCD_ATTEMPTS*(2*ETCD_QUICK_TIMEOUT)+999)/1000 }
-    }, ETCD_QUICK_TIMEOUT, [this](std::string err, json11::Json data)
-    {
-        if (err != "" || data["ID"].string_value() == "")
-        {
-            printf("Error acquiring a lease from etcd: %s\n", err.c_str());
-            tfd->set_timer(ETCD_QUICK_TIMEOUT, false, [this](int timer_id)
-            {
-                acquire_lease();
-            });
-            return;
-        }
-        etcd_lease_id = data["ID"].string_value();
-        create_osd_state();
-    });
-    printf("[OSD %lu] reporting to etcd at %s every %d seconds\n", this->osd_num, config["etcd_address"].c_str(), etcd_report_interval);
-    tfd->set_timer(etcd_report_interval*1000, true, [this](int timer_id)
-    {
-        renew_lease();
-    });
-}
-
-// Report "up" state once, then keep it alive using the lease
-// Do it first to allow "monitors" check it when moving PGs
-void osd_t::create_osd_state()
-{
-    std::string state_key = base64_encode(st_cli.etcd_prefix+"/osd/state/"+std::to_string(osd_num));
-    self_state = get_osd_state();
-    st_cli.etcd_txn(json11::Json::object {
-        // Check that the state key does not exist
-        { "compare", json11::Json::array {
-            json11::Json::object {
-                { "target", "CREATE" },
-                { "create_revision", 0 },
-                { "key", state_key },
-            }
-        } },
-        { "success", json11::Json::array {
-            json11::Json::object {
-                { "request_put", json11::Json::object {
-                    { "key", state_key },
-                    { "value", base64_encode(self_state.dump()) },
-                    { "lease", etcd_lease_id },
-                } }
-            },
-        } },
-        { "failure", json11::Json::array {
-            json11::Json::object {
-                { "request_range", json11::Json::object {
-                    { "key", state_key },
-                } }
-            },
-        } },
-    }, ETCD_QUICK_TIMEOUT, [this](std::string err, json11::Json data)
-    {
-        if (err != "")
-        {
-            etcd_failed_attempts++;
-            printf("Error creating OSD state key: %s\n", err.c_str());
-            if (etcd_failed_attempts > MAX_ETCD_ATTEMPTS)
-            {
-                // Die
-                throw std::runtime_error("Cluster connection failed");
-            }
-            // Retry
-            tfd->set_timer(ETCD_QUICK_TIMEOUT, false, [this](int timer_id)
-            {
-                create_osd_state();
-            });
-            return;
-        }
-        if (!data["succeeded"].bool_value())
-        {
-            // OSD is already up
-            auto kv = st_cli.parse_etcd_kv(data["responses"][0]["response_range"]["kvs"][0]);
-            printf("Key %s already exists in etcd, OSD %lu is still up\n", kv.key.c_str(), this->osd_num);
-            int64_t port = kv.value["port"].int64_value();
-            for (auto & addr: kv.value["addresses"].array_items())
-            {
-                printf("  listening at: %s:%ld\n", addr.string_value().c_str(), port);
-            }
-            force_stop(0);
-            return;
-        }
-        if (run_primary)
-        {
-            st_cli.load_pgs();
-        }
-    });
-}
-
-// Renew lease
-void osd_t::renew_lease()
-{
-    st_cli.etcd_call("/lease/keepalive", json11::Json::object {
-        { "ID", etcd_lease_id }
-    }, ETCD_QUICK_TIMEOUT, [this](std::string err, json11::Json data)
-    {
-        if (err == "" && data["result"]["TTL"].string_value() == "")
-        {
-            // Die
-            throw std::runtime_error("etcd lease has expired");
-        }
-        if (err != "")
-        {
-            etcd_failed_attempts++;
-            printf("Error renewing etcd lease: %s\n", err.c_str());
-            if (etcd_failed_attempts > MAX_ETCD_ATTEMPTS)
-            {
-                // Die
-                throw std::runtime_error("Cluster connection failed");
-            }
-            // Retry
-            tfd->set_timer(ETCD_QUICK_TIMEOUT, false, [this](int timer_id)
-            {
-                renew_lease();
-            });
-        }
-        else
-        {
-            etcd_failed_attempts = 0;
-            report_statistics();
-        }
-    });
-}
-
-void osd_t::force_stop(int exitcode)
-{
-    if (etcd_lease_id != "")
-    {
-        st_cli.etcd_call("/kv/lease/revoke", json11::Json::object {
-            { "ID", etcd_lease_id }
-        }, ETCD_QUICK_TIMEOUT, [this, exitcode](std::string err, json11::Json data)
-        {
-            if (err != "")
-            {
-                printf("Error revoking etcd lease: %s\n", err.c_str());
-            }
-            printf("[OSD %lu] Force stopping\n", this->osd_num);
-            exit(exitcode);
-        });
-    }
-    else
-    {
-        printf("[OSD %lu] Force stopping\n", this->osd_num);
-        exit(exitcode);
-    }
-}
-
-json11::Json osd_t::on_load_pgs_checks_hook()
-{
-    assert(this->pgs.size() == 0);
-    json11::Json::array checks = {
-        json11::Json::object {
-            { "target", "LEASE" },
-            { "lease", etcd_lease_id },
-            { "key", base64_encode(st_cli.etcd_prefix+"/osd/state/"+std::to_string(osd_num)) },
-        }
-    };
-    return checks;
-}
-
-void osd_t::on_load_pgs_hook(bool success)
-{
-    if (!success)
-    {
-        printf("Error loading PGs from etcd: lease expired\n");
-        force_stop(1);
-    }
-    else
-    {
-        peering_state &= ~OSD_LOADING_PGS;
-        apply_pg_count();
-        apply_pg_config();
-    }
-}
-
-void osd_t::apply_pg_count()
-{
-    pg_num_t pg_count = st_cli.pg_config.size();
-    if (pg_count > 0 && (st_cli.pg_config.begin()->first != 1 || std::prev(st_cli.pg_config.end())->first != pg_count))
-    {
-        printf("Invalid PG configuration: PG numbers don't cover the whole 1..%d range\n", pg_count);
-        force_stop(1);
-        return;
-    }
-    if (this->pg_count != 0 && this->pg_count != pg_count)
-    {
-        // Check that all PGs are offline. It is not allowed to change PG count when any PGs are online
-        // The external tool must wait for all PGs to come down before changing PG count
-        // If it doesn't wait, a restarted OSD may apply the new count immediately which will lead to bugs
-        // So an OSD just dies if it detects PG count change while there are active PGs
-        int still_active = 0;
-        for (auto & kv: pgs)
-        {
-            if (kv.second.state & PG_ACTIVE)
-            {
-                still_active++;
-            }
-        }
-        if (still_active > 0)
-        {
-            printf("[OSD %lu] PG count change detected, but %d PG(s) are still active. This is not allowed. Exiting\n", this->osd_num, still_active);
-            force_stop(1);
-            return;
-        }
-    }
-    this->pg_count = pg_count;
-}
-
-void osd_t::apply_pg_config()
-{
-    bool all_applied = true;
-    for (auto & kv: st_cli.pg_config)
-    {
-        pg_num_t pg_num = kv.first;
-        auto & pg_cfg = kv.second;
-        bool take = pg_cfg.exists && pg_cfg.primary == this->osd_num &&
-            !pg_cfg.pause && (!pg_cfg.cur_primary || pg_cfg.cur_primary == this->osd_num);
-        bool currently_taken = this->pgs.find(pg_num) != this->pgs.end() &&
-            this->pgs[pg_num].state != PG_OFFLINE;
-        if (currently_taken && !take)
-        {
-            // Stop this PG
-            stop_pg(pg_num);
-        }
-        else if (take)
-        {
-            // Take this PG
-            std::set<osd_num_t> all_peers;
-            for (osd_num_t pg_osd: pg_cfg.target_set)
-            {
-                if (pg_osd != 0)
-                {
-                    all_peers.insert(pg_osd);
-                }
-            }
-            for (osd_num_t pg_osd: pg_cfg.all_peers)
-            {
-                if (pg_osd != 0)
-                {
-                    all_peers.insert(pg_osd);
-                }
-            }
-            for (auto & hist_item: pg_cfg.target_history)
-            {
-                for (auto pg_osd: hist_item)
-                {
-                    if (pg_osd != 0)
-                    {
-                        all_peers.insert(pg_osd);
-                    }
-                }
-            }
-            if (currently_taken)
-            {
-                if (this->pgs[pg_num].state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING))
-                {
-                    if (this->pgs[pg_num].target_set == pg_cfg.target_set)
-                    {
-                        // No change in osd_set; history changes are ignored
-                        continue;
-                    }
-                    else
-                    {
-                        // Stop PG, reapply change after stopping
-                        stop_pg(pg_num);
-                        all_applied = false;
-                        continue;
-                    }
-                }
-                else if (this->pgs[pg_num].state & PG_STOPPING)
-                {
-                    // Reapply change after stopping
-                    all_applied = false;
-                    continue;
-                }
-                else if (this->pgs[pg_num].state & PG_STARTING)
-                {
-                    if (pg_cfg.cur_primary == this->osd_num)
-                    {
-                        // PG locked, continue
-                    }
-                    else
-                    {
-                        // Reapply change after locking the PG
-                        all_applied = false;
-                        continue;
-                    }
-                }
-                else
-                {
-                    throw std::runtime_error("Unexpected PG "+std::to_string(pg_num)+" state: "+std::to_string(this->pgs[pg_num].state));
-                }
-            }
-            this->pgs[pg_num] = (pg_t){
-                .state = pg_cfg.cur_primary == this->osd_num ? PG_PEERING : PG_STARTING,
-                .pg_cursize = 0,
-                .pg_num = pg_num,
-                .target_history = pg_cfg.target_history,
-                .all_peers = std::vector<osd_num_t>(all_peers.begin(), all_peers.end()),
-                .target_set = pg_cfg.target_set,
-            };
-            this->pg_state_dirty.insert(pg_num);
-            this->pgs[pg_num].print_state();
-            if (pg_cfg.cur_primary == this->osd_num)
-            {
-                // Add peers
-                for (auto pg_osd: all_peers)
-                {
-                    if (pg_osd != this->osd_num && c_cli.osd_peer_fds.find(pg_osd) == c_cli.osd_peer_fds.end())
-                    {
-                        c_cli.connect_peer(pg_osd, st_cli.peer_states[pg_osd]);
-                    }
-                }
-                start_pg_peering(pg_num);
-            }
-            else
-            {
-                // Reapply change after locking the PG
-                all_applied = false;
-            }
-        }
-    }
-    report_pg_states();
-    this->pg_config_applied = all_applied;
-}
-
-void osd_t::report_pg_states()
-{
-    if (etcd_reporting_pg_state || !this->pg_state_dirty.size() || !st_cli.etcd_addresses.size())
-    {
-        return;
-    }
-    etcd_reporting_pg_state = true;
-    std::vector<std::pair<pg_num_t,bool>> reporting_pgs;
-    json11::Json::array checks;
-    json11::Json::array success;
-    json11::Json::array failure;
-    for (auto it = pg_state_dirty.begin(); it != pg_state_dirty.end(); it++)
-    {
-        auto pg_it = this->pgs.find(*it);
-        if (pg_it == this->pgs.end())
-        {
-            continue;
-        }
-        auto & pg = pg_it->second;
-        reporting_pgs.push_back({ pg.pg_num, pg.history_changed });
-        std::string state_key_base64 = base64_encode(st_cli.etcd_prefix+"/pg/state/"+std::to_string(pg.pg_num));
-        if (pg.state == PG_STARTING)
-        {
-            // Check that the PG key does not exist
-            // Failed check indicates an unsuccessful PG lock attempt in this case
-            checks.push_back(json11::Json::object {
-                { "target", "VERSION" },
-                { "version", 0 },
-                { "key", state_key_base64 },
-            });
-        }
-        else
-        {
-            // Check that the key is ours
-            // Failed check indicates success for OFFLINE pgs (PG lock is already deleted)
-            // and an unexpected race condition for started pgs (PG lock is held by someone else)
-            checks.push_back(json11::Json::object {
-                { "target", "LEASE" },
-                { "lease", etcd_lease_id },
-                { "key", state_key_base64 },
-            });
-        }
-        if (pg.state == PG_OFFLINE)
-        {
-            success.push_back(json11::Json::object {
-                { "request_delete_range", json11::Json::object {
-                    { "key", state_key_base64 },
-                } }
-            });
-        }
-        else
-        {
-            json11::Json::array pg_state_keywords;
-            for (int i = 0; i < pg_state_bit_count; i++)
-            {
-                if (pg.state & pg_state_bits[i])
-                {
-                    pg_state_keywords.push_back(pg_state_names[i]);
-                }
-            }
-            success.push_back(json11::Json::object {
-                { "request_put", json11::Json::object {
-                    { "key", base64_encode(st_cli.etcd_prefix+"/pg/state/"+std::to_string(pg.pg_num)) },
-                    { "value", base64_encode(json11::Json(json11::Json::object {
-                        { "primary", this->osd_num },
-                        { "state", pg_state_keywords },
-                        { "peers", pg.cur_peers },
-                    }).dump()) },
-                    { "lease", etcd_lease_id },
-                } }
-            });
-            if (pg.history_changed)
-            {
-                pg.history_changed = false;
-                if (pg.state == PG_ACTIVE)
-                {
-                    success.push_back(json11::Json::object {
-                        { "request_delete_range", json11::Json::object {
-                            { "key", base64_encode(st_cli.etcd_prefix+"/pg/history/"+std::to_string(pg.pg_num)) },
-                        } }
-                    });
-                }
-                else if (pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD))
-                {
-                    success.push_back(json11::Json::object {
-                        { "request_put", json11::Json::object {
-                            { "key", base64_encode(st_cli.etcd_prefix+"/pg/history/"+std::to_string(pg.pg_num)) },
-                            { "value", base64_encode(json11::Json(json11::Json::object {
-                                { "all_peers", pg.all_peers },
-                            }).dump()) },
-                        } }
-                    });
-                }
-            }
-        }
-        failure.push_back(json11::Json::object {
-            { "request_range", json11::Json::object {
-                { "key", state_key_base64 },
-            } }
-        });
-    }
-    pg_state_dirty.clear();
-    st_cli.etcd_txn(json11::Json::object {
-        { "compare", checks }, { "success", success }, { "failure", failure }
-    }, ETCD_QUICK_TIMEOUT, [this, reporting_pgs](std::string err, json11::Json data)
-    {
-        etcd_reporting_pg_state = false;
-        if (!data["succeeded"].bool_value())
-        {
-            // One of PG state updates failed, put dirty flags back
-            for (auto pp: reporting_pgs)
-            {
-                this->pg_state_dirty.insert(pp.first);
-                if (pp.second)
-                {
-                    auto pg_it = this->pgs.find(pp.first);
-                    if (pg_it != this->pgs.end())
-                    {
-                        pg_it->second.history_changed = true;
-                    }
-                }
-            }
-            for (auto & res: data["responses"].array_items())
-            {
-                if (res["kvs"].array_items().size())
-                {
-                    auto kv = st_cli.parse_etcd_kv(res["kvs"][0]);
-                    pg_num_t pg_num = stoull_full(kv.key.substr(st_cli.etcd_prefix.length()+10));
-                    auto pg_it = pgs.find(pg_num);
-                    if (pg_it != pgs.end() && pg_it->second.state != PG_OFFLINE && pg_it->second.state != PG_STARTING)
-                    {
-                        // Live PG state update failed
-                        printf("Failed to report state of PG %u which is live. Race condition detected, exiting\n", pg_num);
-                        force_stop(1);
-                        return;
-                    }
-                }
-            }
-            // Retry after a short pause (hope we'll get some updates and update PG states accordingly)
-            tfd->set_timer(500, false, [this](int) { report_pg_states(); });
-        }
-        else
-        {
-            // Success. We'll get our changes back via the watcher and react to them
-            for (auto pp: reporting_pgs)
-            {
-                auto pg_it = this->pgs.find(pp.first);
-                if (pg_it != this->pgs.end())
-                {
-                    if (pg_it->second.state == PG_OFFLINE)
-                    {
-                        // Remove offline PGs after reporting their state
-                        this->pgs.erase(pg_it);
-                    }
-                }
-            }
-            // Push other PG state updates, if any
-            report_pg_states();
-            if (!this->pg_state_dirty.size())
-            {
-                // Update statistics
-                report_statistics();
-            }
-        }
-    });
-}
--- a/osd_flush.cpp
+++ b/osd_flush.cpp
@@ -1,300 +0,0 @@
-#include "osd.h"
-
-#define FLUSH_BATCH 512
-
-void osd_t::submit_pg_flush_ops(pg_num_t pg_num)
-{
-    pg_t & pg = pgs[pg_num];
-    pg_flush_batch_t *fb = new pg_flush_batch_t();
-    pg.flush_batch = fb;
-    auto it = pg.flush_actions.begin(), prev_it = pg.flush_actions.begin();
-    bool first = true;
-    while (it != pg.flush_actions.end())
-    {
-        if (!first && (it->first.oid.inode != prev_it->first.oid.inode ||
-            (it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK)) &&
-            fb->rollback_lists[it->first.osd_num].size() >= FLUSH_BATCH ||
-            fb->stable_lists[it->first.osd_num].size() >= FLUSH_BATCH)
-        {
-            // Stop only at the object boundary
-            break;
-        }
-        it->second.submitted = true;
-        if (it->second.rollback)
-        {
-            fb->flush_objects++;
-            fb->rollback_lists[it->first.osd_num].push_back((obj_ver_id){
-                .oid = it->first.oid,
-                .version = it->second.rollback_to,
-            });
-        }
-        if (it->second.make_stable)
-        {
-            fb->flush_objects++;
-            fb->stable_lists[it->first.osd_num].push_back((obj_ver_id){
-                .oid = it->first.oid,
-                .version = it->second.stable_to,
-            });
-        }
-        prev_it = it;
-        first = false;
-        it++;
-    }
-    for (auto & l: fb->rollback_lists)
-    {
-        if (l.second.size() > 0)
-        {
-            fb->flush_ops++;
-            submit_flush_op(pg.pg_num, fb, true, l.first, l.second.size(), l.second.data());
-        }
-    }
-    for (auto & l: fb->stable_lists)
-    {
-        if (l.second.size() > 0)
-        {
-            fb->flush_ops++;
-            submit_flush_op(pg.pg_num, fb, false, l.first, l.second.size(), l.second.data());
-        }
-    }
-}
-
-void osd_t::handle_flush_op(bool rollback, pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t peer_osd, int retval)
-{
-    if (pgs.find(pg_num) == pgs.end() || pgs[pg_num].flush_batch != fb)
-    {
-        // Throw the result away
-        return;
-    }
-    if (retval != 0)
-    {
-        if (peer_osd == this->osd_num)
-        {
-            throw std::runtime_error(
-                std::string(rollback
-                    ? "Error while doing local rollback operation: "
-                    : "Error while doing local stabilize operation: "
-                ) + strerror(-retval)
-            );
-        }
-        else
-        {
-            printf("Error while doing flush on OSD %lu: %d (%s)\n", osd_num, retval, strerror(-retval));
-            auto fd_it = c_cli.osd_peer_fds.find(peer_osd);
-            if (fd_it != c_cli.osd_peer_fds.end())
-            {
-                c_cli.stop_client(fd_it->second);
-            }
-            return;
-        }
-    }
-    fb->flush_done++;
-    if (fb->flush_done == fb->flush_ops)
-    {
-        // This flush batch is done
-        std::vector<osd_op_t*> continue_ops;
-        auto & pg = pgs[pg_num];
-        auto it = pg.flush_actions.begin(), prev_it = it;
-        auto erase_start = it;
-        while (1)
-        {
-            if (it == pg.flush_actions.end() ||
-                it->first.oid.inode != prev_it->first.oid.inode ||
-                (it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK))
-            {
-                pg.ver_override.erase((object_id){
-                    .inode = prev_it->first.oid.inode,
-                    .stripe = (prev_it->first.oid.stripe & ~STRIPE_MASK),
-                });
-                auto wr_it = pg.write_queue.find((object_id){
-                    .inode = prev_it->first.oid.inode,
-                    .stripe = (prev_it->first.oid.stripe & ~STRIPE_MASK),
-                });
-                if (wr_it != pg.write_queue.end())
-                {
-                    continue_ops.push_back(wr_it->second);
-                    pg.write_queue.erase(wr_it);
-                }
-            }
-            if ((it == pg.flush_actions.end() || !it->second.submitted) &&
-                erase_start != it)
-            {
-                pg.flush_actions.erase(erase_start, it);
-            }
-            if (it == pg.flush_actions.end())
-            {
-                break;
-            }
-            prev_it = it;
-            if (!it->second.submitted)
-            {
-                it++;
-                erase_start = it;
-            }
-            else
-            {
-                it++;
-            }
-        }
-        delete fb;
-        pg.flush_batch = NULL;
-        if (!pg.flush_actions.size())
-        {
-            pg.state = pg.state & ~PG_HAS_UNCLEAN;
-            report_pg_state(pg);
-        }
-        for (osd_op_t *op: continue_ops)
-        {
-            continue_primary_write(op);
-        }
-        if (pg.inflight == 0 && (pg.state & PG_STOPPING))
-        {
-            finish_stop_pg(pg);
-        }
-    }
-}
-
-void osd_t::submit_flush_op(pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data)
-{
-    osd_op_t *op = new osd_op_t();
-    // Copy buffer so it gets freed along with the operation
-    op->buf = malloc(sizeof(obj_ver_id) * count);
-    memcpy(op->buf, data, sizeof(obj_ver_id) * count);
-    if (peer_osd == this->osd_num)
-    {
-        // local
-        clock_gettime(CLOCK_REALTIME, &op->tv_begin);
-        op->bs_op = new blockstore_op_t({
-            .opcode = (uint64_t)(rollback ? BS_OP_ROLLBACK : BS_OP_STABLE),
-            .callback = [this, op, pg_num, fb](blockstore_op_t *bs_op)
-            {
-                add_bs_subop_stats(op);
-                handle_flush_op(bs_op->opcode == BS_OP_ROLLBACK, pg_num, fb, this->osd_num, bs_op->retval);
-                delete op->bs_op;
-                op->bs_op = NULL;
-                delete op;
-            },
-            .len = (uint32_t)count,
-            .buf = op->buf,
-        });
-        bs->enqueue_op(op->bs_op);
-    }
-    else
-    {
-        // Peer
-        int peer_fd = c_cli.osd_peer_fds[peer_osd];
-        op->op_type = OSD_OP_OUT;
-        op->send_list.push_back(op->req.buf, OSD_PACKET_SIZE);
-        op->send_list.push_back(op->buf, count * sizeof(obj_ver_id));
-        op->peer_fd = peer_fd;
-        op->req = {
-            .sec_stab = {
-                .header = {
-                    .magic = SECONDARY_OSD_OP_MAGIC,
-                    .id = c_cli.next_subop_id++,
-                    .opcode = (uint64_t)(rollback ? OSD_OP_SECONDARY_ROLLBACK : OSD_OP_SECONDARY_STABILIZE),
-                },
-                .len = count * sizeof(obj_ver_id),
-            },
-        };
-        op->callback = [this, pg_num, fb, peer_osd](osd_op_t *op)
-        {
-            handle_flush_op(op->req.hdr.opcode == OSD_OP_SECONDARY_ROLLBACK, pg_num, fb, peer_osd, op->reply.hdr.retval);
-            delete op;
-        };
-        c_cli.outbox_push(op);
-    }
-}
-
-bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
-{
-    for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
-    {
-        if ((pg_it->second.state & (PG_ACTIVE | PG_HAS_DEGRADED)) == (PG_ACTIVE | PG_HAS_DEGRADED))
-        {
-            for (auto obj_it = pg_it->second.degraded_objects.begin(); obj_it != pg_it->second.degraded_objects.end(); obj_it++)
-            {
-                if (recovery_ops.find(obj_it->first) == recovery_ops.end())
-                {
-                    op.degraded = true;
-                    op.pg_num = pg_it->first;
-                    op.oid = obj_it->first;
-                    return true;
-                }
-            }
-        }
-    }
-    for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
-    {
-        if ((pg_it->second.state & (PG_ACTIVE | PG_HAS_MISPLACED)) == (PG_ACTIVE | PG_HAS_MISPLACED))
-        {
-            for (auto obj_it = pg_it->second.misplaced_objects.begin(); obj_it != pg_it->second.misplaced_objects.end(); obj_it++)
-            {
-                if (recovery_ops.find(obj_it->first) == recovery_ops.end())
-                {
-                    op.degraded = false;
-                    op.pg_num = pg_it->first;
-                    op.oid = obj_it->first;
-                    return true;
-                }
-            }
-        }
-    }
-    return false;
-}
-
-void osd_t::submit_recovery_op(osd_recovery_op_t *op)
-{
-    op->osd_op = new osd_op_t();
-    op->osd_op->op_type = OSD_OP_OUT;
-    op->osd_op->req = {
-        .rw = {
-            .header = {
-                .magic = SECONDARY_OSD_OP_MAGIC,
-                .id = 1,
-                .opcode = OSD_OP_WRITE,
-            },
-            .inode = op->oid.inode,
-            .offset = op->oid.stripe,
-            .len = 0,
-        },
-    };
-    op->osd_op->callback = [this, op](osd_op_t *osd_op)
-    {
-        // Don't sync the write, it will be synced by our regular sync coroutine
-        if (osd_op->reply.hdr.retval < 0)
-        {
-            // Error recovering object
-            if (osd_op->reply.hdr.retval == -EPIPE)
-            {
-                // PG is stopped or one of the OSDs is gone, error is harmless
-            }
-            else
-            {
-                throw std::runtime_error("Failed to recover an object");
-            }
-        }
-        // CAREFUL! op = &recovery_ops[op->oid]. Don't access op->* after recovery_ops.erase()
-        op->osd_op = NULL;
-        recovery_ops.erase(op->oid);
-        delete osd_op;
-        continue_recovery();
-    };
-    exec_op(op->osd_op);
-}
-
-// Just trigger write requests for degraded objects. They'll be recovered during writing
-bool osd_t::continue_recovery()
-{
-    while (recovery_ops.size() < recovery_queue_depth)
-    {
-        osd_recovery_op_t op;
-        if (pick_next_recovery(op))
-        {
-            recovery_ops[op.oid] = op;
-            submit_recovery_op(&recovery_ops[op.oid]);
-        }
-        else
-            return false;
-    }
-    return true;
-}
--- a/osd_main.cpp
+++ b/osd_main.cpp
@@ -2,17 +2,8 @@

 #include <signal.h>

-static osd_t *osd = NULL;
-static bool force_stopping = false;
-
-static void handle_sigint(int sig)
+void handle_sigint(int sig)
 {
-    if (osd && !force_stopping)
-    {
-        force_stopping = true;
-        osd->force_stop(0);
-        return;
-    }
    exit(0);
 }

@@ -34,11 +25,9 @@ int main(int narg, char *args[])
        }
    }
    signal(SIGINT, handle_sigint);
-    signal(SIGTERM, handle_sigint);
    ring_loop_t *ringloop = new ring_loop_t(512);
-    // FIXME: Create Blockstore from on-disk superblock config and check it against the OSD cluster config
    blockstore_t *bs = new blockstore_t(config, ringloop);
-    osd = new osd_t(config, bs, ringloop);
+    osd_t *osd = new osd_t(config, bs, ringloop);
    while (1)
    {
        ringloop->loop();
--- a/osd_ops.h
+++ b/osd_ops.h
@@ -22,12 +22,9 @@
 #define OSD_OP_READ                 10
 #define OSD_OP_WRITE                11
 #define OSD_OP_SYNC                 12
-#define OSD_OP_DELETE               13
-#define OSD_OP_MAX                  13
+#define OSD_OP_MAX                  12
 // Alignment & limit for read/write operations
-#ifndef MEM_ALIGNMENT
-#define MEM_ALIGNMENT               512
-#endif
+#define OSD_RW_ALIGN                512
 #define OSD_RW_MAX                  64*1024*1024

 // common request and reply headers
@@ -60,7 +57,6 @@ struct __attribute__((__packed__)) osd_op_secondary_rw_t
    // object
    object_id oid;
    // read/write version (automatic or specific)
-    // FIXME deny values close to UINT64_MAX
    uint64_t version;
    // offset
    uint32_t offset;
@@ -134,7 +130,7 @@ struct __attribute__((__packed__)) osd_op_secondary_list_t
    osd_op_header_t header;
    // placement group total number and total count
    pg_num_t list_pg, pg_count;
-    uint64_t pg_stripe_size;
+    uint64_t parity_block_size;
 };

 struct __attribute__((__packed__)) osd_reply_secondary_list_t
@@ -146,7 +142,6 @@ struct __attribute__((__packed__)) osd_reply_secondary_list_t
 };

 // read or write to the primary OSD (must be within individual stripe)
-// FIXME: allow to return used block bitmap (required for snapshots)
 struct __attribute__((__packed__)) osd_op_rw_t
 {
    osd_op_header_t header;
@@ -174,7 +169,6 @@ struct __attribute__((__packed__)) osd_reply_sync_t
    osd_reply_header_t header;
 };

-// FIXME it would be interesting to try to unify blockstore_op and osd_op formats
 union osd_any_op_t
 {
    osd_op_header_t hdr;
--- a/osd_peering.cpp
+++ b/osd_peering.cpp
@@ -3,212 +3,286 @@

 #include <algorithm>

-#include "base64.h"
 #include "osd.h"

+void osd_t::init_primary()
+{
+    // Parse the comma-separated config["peers"] list into `peers`, then register a single hardcoded PG.
+    // FIXME Hardcode: this initial test version needs at least 2 distinct peers and always creates PG 1
+    std::string peerstr = config["peers"];
+    while (peerstr.size())
+    {
+        int pos = peerstr.find(','); // pos < 0 below is treated as "no comma left" (npos narrowed to int)
+        peers.push_back(parse_peer(pos < 0 ? peerstr : peerstr.substr(0, pos)));
+        peerstr = pos < 0 ? std::string("") : peerstr.substr(pos+1);
+        for (int i = 0; i < peers.size()-1; i++) // compare the entry just added against all earlier ones
+            if (peers[i].osd_num == peers[peers.size()-1].osd_num)
+                throw std::runtime_error("same osd number "+std::to_string(peers[i].osd_num)+" specified twice in peers");
+    }
+    if (peers.size() < 2)
+        throw std::runtime_error("run_primary requires at least 2 peers");
+    pgs.push_back((pg_t){
+        .state = PG_OFFLINE,
+        .pg_cursize = 0,
+        .pg_num = 1,
+        .target_set = { 1, 2, 3 }, // hardcoded OSD numbers, independent of the parsed peers
+        .cur_set = { 1, 0, 0 }, // NOTE(review): presumably 0 means "slot not connected yet" - confirm
+    });
+    pg_count = 1;
+    peering_state = OSD_PEERING_PEERS; // handle_peers() starts connecting to peers while this bit is set
+}
+
+osd_peer_def_t osd_t::parse_peer(std::string peer)
+{
+    // Parse "OSD_NUM:IP:PORT" into an osd_peer_def_t; throws std::runtime_error (by value) on malformed input
+    int pos1 = peer.find(':');
+    int pos2 = peer.find(':', pos1+1);
+    if (pos1 < 0 || pos2 < 0)
+        throw std::runtime_error("OSD peer string must be in the form OSD_NUM:IP:PORT");
+    osd_peer_def_t r;
+    r.addr = peer.substr(pos1+1, pos2-pos1-1);
+    std::string osd_num_str = peer.substr(0, pos1);
+    std::string port_str = peer.substr(pos2+1);
+    r.osd_num = strtoull(osd_num_str.c_str(), NULL, 10);
+    if (!r.osd_num) // strtoull yields 0 when nothing could be parsed, so 0 is rejected as an OSD number
+        throw std::runtime_error("Could not parse OSD peer osd_num");
+    r.port = strtoull(port_str.c_str(), NULL, 10);
+    if (!r.port) // same convention: port 0 is treated as a parse failure
+        throw std::runtime_error("Could not parse OSD peer port");
+    return r;
+}
+
+void osd_t::connect_peer(osd_num_t osd_num, const char *peer_host, int peer_port, std::function<void(osd_num_t, int)> callback)
+{
+    // Start a non-blocking IPv4 connect; immediate failures call callback(osd_num, -errno), else it is kept for handle_connect_result()
+    struct sockaddr_in addr = {}; // zeroed because the whole struct is copied into the client entry below
+    if (inet_pton(AF_INET, peer_host, &addr.sin_addr) != 1)
+    {
+        callback(osd_num, -EINVAL);
+        return;
+    }
+    addr.sin_family = AF_INET;
+    addr.sin_port = htons(peer_port ? peer_port : 11203); // port 0 falls back to 11203
+    int peer_fd = socket(AF_INET, SOCK_STREAM, 0);
+    if (peer_fd < 0)
+    {
+        callback(osd_num, -errno);
+        return;
+    }
+    fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
+    int r = connect(peer_fd, (sockaddr*)&addr, sizeof(addr)) < 0 ? errno : 0; // capture errno right away
+    if (r != 0 && r != EINPROGRESS)
+    {
+        close(peer_fd); // may overwrite errno, which is why the saved copy in r is reported instead
+        callback(osd_num, -r);
+        return;
+    }
+    clients[peer_fd] = (osd_client_t){
+        .peer_addr = addr,
+        .peer_port = peer_port,
+        .peer_fd = peer_fd,
+        .peer_state = PEER_CONNECTING,
+        .connect_callback = callback,
+        .osd_num = osd_num,
+    };
+    osd_peer_fds[osd_num] = peer_fd;
+    // Add FD to epoll (EPOLLOUT for tracking connect() result)
+    epoll_event ev;
+    ev.data.fd = peer_fd;
+    ev.events = EPOLLOUT | EPOLLIN | EPOLLRDHUP | EPOLLET;
+    if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, peer_fd, &ev) < 0)
+    {
+        throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
+    }
+}
+
+void osd_t::handle_connect_result(int peer_fd) // finish a connect started by connect_peer() and invoke its callback
+{
+    auto & cl = clients[peer_fd];
+    osd_num_t osd_num = cl.osd_num;
+    auto callback = cl.connect_callback; // local copy: cl.connect_callback is reset below before the call
+    int result = 0;
+    socklen_t result_len = sizeof(result);
+    if (getsockopt(peer_fd, SOL_SOCKET, SO_ERROR, &result, &result_len) < 0)
+    {
+        result = errno; // getsockopt itself failed; report its error instead
+    }
+    if (result != 0)
+    {
+        stop_client(peer_fd);
+        callback(osd_num, -result); // failure is reported as a negative errno value
+        return;
+    }
+    int one = 1;
+    setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one)); // return value is not checked
+    // Disable EPOLLOUT on this fd
+    cl.connect_callback = NULL;
+    cl.peer_state = PEER_CONNECTED;
+    epoll_event ev;
+    ev.data.fd = peer_fd;
+    ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET;
+    if (epoll_ctl(epoll_fd, EPOLL_CTL_MOD, peer_fd, &ev) < 0)
+    {
+        throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
+    }
+    callback(osd_num, peer_fd); // success is reported with the connected fd
+}
+
 // Peering loop
 void osd_t::handle_peers()
 {
+    if (peering_state & OSD_PEERING_PEERS)
+    {
+        for (int i = 0; i < peers.size(); i++)
+        {
+            if (osd_peer_fds.find(peers[i].osd_num) == osd_peer_fds.end() &&
+                time(NULL) - peers[i].last_connect_attempt > 5) // FIXME hardcode 5
+            {
+                peers[i].last_connect_attempt = time(NULL);
+                connect_peer(peers[i].osd_num, peers[i].addr.c_str(), peers[i].port, [this](osd_num_t osd_num, int peer_fd)
+                {
+                    // FIXME: Check peer config after connecting
+                    if (peer_fd < 0)
+                    {
+                        printf("Failed to connect to peer OSD %lu: %s\n", osd_num, strerror(-peer_fd));
+                        return;
+                    }
+                    printf("Connected with peer OSD %lu (fd %d)\n", clients[peer_fd].osd_num, peer_fd);
+                    int i;
+                    for (i = 0; i < peers.size(); i++)
+                    {
+                        if (osd_peer_fds.find(peers[i].osd_num) == osd_peer_fds.end())
+                            break;
+                    }
+                    if (i >= peers.size())
+                    {
+                        // Connected to all peers
+                        peering_state = peering_state & ~OSD_PEERING_PEERS;
+                    }
+                    repeer_pgs(osd_num, true);
+                });
+            }
+        }
+    }
    if (peering_state & OSD_PEERING_PGS)
    {
-        bool still = false;
-        for (auto & p: pgs)
+        bool still_doing_pgs = false;
+        for (int i = 0; i < pgs.size(); i++)
        {
-            if (p.second.state == PG_PEERING)
+            if (pgs[i].state == PG_PEERING)
            {
-                if (!p.second.peering_state->list_ops.size())
+                if (!pgs[i].peering_state->list_ops.size())
                {
-                    p.second.calc_object_states(log_level);
-                    report_pg_state(p.second);
-                    incomplete_objects += p.second.incomplete_objects.size();
-                    misplaced_objects += p.second.misplaced_objects.size();
-                    // FIXME: degraded objects may currently include misplaced, too! Report them separately?
-                    degraded_objects += p.second.degraded_objects.size();
-                    if ((p.second.state & (PG_ACTIVE | PG_HAS_UNCLEAN)) == (PG_ACTIVE | PG_HAS_UNCLEAN))
-                        peering_state = peering_state | OSD_FLUSHING_PGS;
-                    else
-                        peering_state = peering_state | OSD_RECOVERING;
+                    pgs[i].calc_object_states();
                }
                else
                {
-                    still = true;
+                    still_doing_pgs = true;
                }
            }
        }
-        if (!still)
+        if (!still_doing_pgs)
        {
            // Done all PGs
            peering_state = peering_state & ~OSD_PEERING_PGS;
        }
    }
-    if ((peering_state & OSD_FLUSHING_PGS) && !readonly)
-    {
-        bool still = false;
-        for (auto & p: pgs)
-        {
-            if ((p.second.state & (PG_ACTIVE | PG_HAS_UNCLEAN)) == (PG_ACTIVE | PG_HAS_UNCLEAN))
-            {
-                if (!p.second.flush_batch)
-                {
-                    submit_pg_flush_ops(p.first);
-                }
-                still = true;
-            }
-        }
-        if (!still)
-        {
-            peering_state = peering_state & ~OSD_FLUSHING_PGS | OSD_RECOVERING;
-        }
-    }
-    if ((peering_state & OSD_RECOVERING) && !readonly)
-    {
-        if (!continue_recovery())
-        {
-            peering_state = peering_state & ~OSD_RECOVERING;
-        }
-    }
 }

-void osd_t::repeer_pgs(osd_num_t peer_osd)
+void osd_t::repeer_pgs(osd_num_t osd_num, bool is_connected)
 {
    // Re-peer affected PGs
-    for (auto & p: pgs)
+    // FIXME: We shouldn't rely just on target_set. Other OSDs may also contain PG data.
+    osd_num_t real_osd = (is_connected ? osd_num : 0);
+    for (int i = 0; i < pgs.size(); i++)
    {
        bool repeer = false;
-        if (p.second.state & (PG_PEERING | PG_ACTIVE | PG_INCOMPLETE))
+        for (int r = 0; r < pgs[i].target_set.size(); r++)
        {
-            for (osd_num_t pg_osd: p.second.all_peers)
+            if (pgs[i].target_set[r] == osd_num &&
+                pgs[i].cur_set[r] != real_osd)
            {
-                if (pg_osd == peer_osd)
-                {
-                    repeer = true;
-                    break;
-                }
-            }
-            if (repeer)
-            {
-                // Repeer this pg
-                printf("[PG %u] Repeer because of OSD %lu\n", p.second.pg_num, peer_osd);
-                start_pg_peering(p.second.pg_num);
+                pgs[i].cur_set[r] = real_osd;
+                repeer = true;
+                break;
            }
        }
+        if (repeer)
+        {
+            // Repeer this pg
+            printf("Repeer PG %d because of OSD %lu\n", i, osd_num);
+            start_pg_peering(i);
+            peering_state |= OSD_PEERING_PGS;
+        }
    }
 }

 // Repeer on each connect/disconnect peer event
-void osd_t::start_pg_peering(pg_num_t pg_num)
+void osd_t::start_pg_peering(int pg_idx)
 {
-    auto & pg = pgs[pg_num];
+    auto & pg = pgs[pg_idx];
    pg.state = PG_PEERING;
-    this->peering_state |= OSD_PEERING_PGS;
-    report_pg_state(pg);
-    // Reset PG state
-    pg.cur_peers.clear();
    pg.state_dict.clear();
-    incomplete_objects -= pg.incomplete_objects.size();
-    misplaced_objects -= pg.misplaced_objects.size();
-    degraded_objects -= pg.degraded_objects.size();
-    pg.incomplete_objects.clear();
-    pg.misplaced_objects.clear();
-    pg.degraded_objects.clear();
-    pg.flush_actions.clear();
+    pg.obj_states.clear();
    pg.ver_override.clear();
-    if (pg.flush_batch)
-    {
-        delete pg.flush_batch;
-    }
-    pg.flush_batch = NULL;
-    for (auto p: pg.write_queue)
-    {
-        cancel_primary_write(p.second);
-    }
-    pg.write_queue.clear();
-    for (auto it = unstable_writes.begin(); it != unstable_writes.end(); )
-    {
-        // Forget this PG's unstable writes
-        pg_num_t n = (it->first.oid.inode + it->first.oid.stripe / pg_stripe_size) % pg_count + 1;
-        if (n == pg.pg_num)
-            unstable_writes.erase(it++);
-        else
-            it++;
-    }
-    dirty_pgs.erase(pg.pg_num);
-    // Calculate current write OSD set
    pg.pg_cursize = 0;
-    pg.cur_set.resize(pg.target_set.size());
-    pg.cur_loc_set.clear();
-    for (int role = 0; role < pg.target_set.size(); role++)
+    for (int role = 0; role < pg.cur_set.size(); role++)
    {
-        pg.cur_set[role] = pg.target_set[role] == this->osd_num ||
-            c_cli.osd_peer_fds.find(pg.target_set[role]) != c_cli.osd_peer_fds.end() ? pg.target_set[role] : 0;
        if (pg.cur_set[role] != 0)
        {
            pg.pg_cursize++;
-            pg.cur_loc_set.push_back({
-                .role = (uint64_t)role,
-                .osd_num = pg.cur_set[role],
-                .outdated = false,
-            });
-        }
-    }
-    if (pg.target_history.size())
-    {
-        // Refuse to start PG if no peers are available from any of the historical OSD sets
-        // (PG history is kept up to the latest active+clean state)
-        for (auto & history_set: pg.target_history)
-        {
-            bool found = false;
-            for (auto history_osd: history_set)
-            {
-                if (history_osd != 0 && c_cli.osd_peer_fds.find(history_osd) != c_cli.osd_peer_fds.end())
-                {
-                    found = true;
-                    break;
-                }
-            }
-            if (!found)
-            {
-                pg.state = PG_INCOMPLETE;
-                report_pg_state(pg);
-            }
        }
    }
    if (pg.pg_cursize < pg.pg_minsize)
    {
        pg.state = PG_INCOMPLETE;
-        report_pg_state(pg);
    }
-    std::set<osd_num_t> cur_peers;
-    for (auto pg_osd: pg.all_peers)
-    {
-        if (pg_osd == this->osd_num || c_cli.osd_peer_fds.find(pg_osd) != c_cli.osd_peer_fds.end())
-        {
-            cur_peers.insert(pg_osd);
-        }
-        else if (c_cli.wanted_peers.find(pg_osd) == c_cli.wanted_peers.end())
-        {
-            c_cli.connect_peer(pg_osd, st_cli.peer_states[pg_osd]);
-        }
-    }
-    pg.cur_peers.insert(pg.cur_peers.begin(), cur_peers.begin(), cur_peers.end());
    if (pg.peering_state)
    {
-        // Adjust the peering operation that's still in progress - discard unneeded results
-        for (auto it = pg.peering_state->list_ops.begin(); it != pg.peering_state->list_ops.end();)
+        // Adjust the peering operation that's still in progress
+        for (auto it = pg.peering_state->list_ops.begin(); it != pg.peering_state->list_ops.end(); it++)
        {
-            if (pg.state == PG_INCOMPLETE || cur_peers.find(it->first) == cur_peers.end())
+            int role;
+            for (role = 0; role < pg.cur_set.size(); role++)
+            {
+                if (pg.cur_set[role] == it->first)
+                    break;
+            }
+            if (pg.state == PG_INCOMPLETE || role >= pg.cur_set.size())
            {
                // Discard the result after completion, which, chances are, will be unsuccessful
-                discard_list_subop(it->second);
+                auto list_op = it->second;
+                if (list_op->peer_fd == 0)
+                {
+                    // Self
+                    list_op->bs_op->callback = [list_op](blockstore_op_t *bs_op)
+                    {
+                        if (list_op->bs_op->buf)
+                            free(list_op->bs_op->buf);
+                        delete list_op;
+                    };
+                }
+                else
+                {
+                    // Peer
+                    list_op->callback = [](osd_op_t *list_op)
+                    {
+                        delete list_op;
+                    };
+                }
                pg.peering_state->list_ops.erase(it);
                it = pg.peering_state->list_ops.begin();
            }
-            else
-                it++;
        }
-        for (auto it = pg.peering_state->list_results.begin(); it != pg.peering_state->list_results.end();)
+        for (auto it = pg.peering_state->list_results.begin(); it != pg.peering_state->list_results.end(); it++)
        {
-            if (pg.state == PG_INCOMPLETE || cur_peers.find(it->first) == cur_peers.end())
+            int role;
+            for (role = 0; role < pg.cur_set.size(); role++)
+            {
+                if (pg.cur_set[role] == it->first)
+                    break;
+            }
+            if (pg.state == PG_INCOMPLETE || role >= pg.cur_set.size())
            {
                if (it->second.buf)
                {
@@ -217,8 +291,6 @@ void osd_t::start_pg_peering(pg_num_t pg_num)
                pg.peering_state->list_results.erase(it);
                it = pg.peering_state->list_results.begin();
            }
-            else
-                it++;
        }
    }
    if (pg.state == PG_INCOMPLETE)
@@ -228,300 +300,107 @@ void osd_t::start_pg_peering(pg_num_t pg_num)
            delete pg.peering_state;
            pg.peering_state = NULL;
        }
+        printf("PG %d is incomplete\n", pg.pg_num);
        return;
    }
    if (!pg.peering_state)
    {
        pg.peering_state = new pg_peering_state_t();
-        pg.peering_state->pg_num = pg.pg_num;
    }
-    for (osd_num_t peer_osd: cur_peers)
+    auto ps = pg.peering_state;
+    for (int role = 0; role < pg.cur_set.size(); role++)
    {
-        if (pg.peering_state->list_ops.find(peer_osd) != pg.peering_state->list_ops.end() ||
-            pg.peering_state->list_results.find(peer_osd) != pg.peering_state->list_results.end())
+        osd_num_t role_osd = pg.cur_set[role];
+        if (!role_osd)
        {
            continue;
        }
-        submit_sync_and_list_subop(peer_osd, pg.peering_state);
+        if (ps->list_ops.find(role_osd) != ps->list_ops.end() ||
+            ps->list_results.find(role_osd) != ps->list_results.end())
+        {
+            continue;
+        }
+        if (role_osd == this->osd_num)
+        {
+            // Self
+            osd_op_t *op = new osd_op_t();
+            op->op_type = 0;
+            op->peer_fd = 0;
+            op->bs_op = new blockstore_op_t();
+            op->bs_op->opcode = BS_OP_LIST;
+            op->bs_op->oid.stripe = parity_block_size;
+            op->bs_op->len = pg_count,
+            op->bs_op->offset = pg.pg_num-1,
+            op->bs_op->callback = [ps, op, role_osd](blockstore_op_t *bs_op)
+            {
+                if (op->bs_op->retval < 0)
+                {
+                    throw std::runtime_error("local OP_LIST failed");
+                }
+                printf(
+                    "Got object list from OSD %lu (local): %d object versions (%lu of them stable)\n",
+                    role_osd, bs_op->retval, bs_op->version
+                );
+                ps->list_results[role_osd] = {
+                    .buf = (obj_ver_id*)op->bs_op->buf,
+                    .total_count = (uint64_t)op->bs_op->retval,
+                    .stable_count = op->bs_op->version,
+                };
+                ps->list_done++;
+                ps->list_ops.erase(role_osd);
+                delete op;
+            };
+            bs->enqueue_op(op->bs_op);
+            ps->list_ops[role_osd] = op;
+        }
+        else
+        {
+            // Peer
+            auto & cl = clients[osd_peer_fds[role_osd]];
+            osd_op_t *op = new osd_op_t();
+            op->op_type = OSD_OP_OUT;
+            op->send_list.push_back(op->req.buf, OSD_PACKET_SIZE);
+            op->peer_fd = cl.peer_fd;
+            op->req = {
+                .sec_list = {
+                    .header = {
+                        .magic = SECONDARY_OSD_OP_MAGIC,
+                        .id = this->next_subop_id++,
+                        .opcode = OSD_OP_SECONDARY_LIST,
+                    },
+                    .list_pg = pg.pg_num,
+                    .pg_count = pg_count,
+                    .parity_block_size = parity_block_size,
+                },
+            };
+            op->callback = [this, ps, role_osd](osd_op_t *op)
+            {
+                if (op->reply.hdr.retval < 0)
+                {
+                    printf("Failed to get object list from OSD %lu (retval=%ld), disconnecting peer\n", role_osd, op->reply.hdr.retval);
+                    ps->list_ops.erase(role_osd);
+                    stop_client(op->peer_fd);
+                    delete op;
+                    return;
+                }
+                printf(
+                    "Got object list from OSD %lu: %ld object versions (%lu of them stable)\n",
+                    role_osd, op->reply.hdr.retval, op->reply.sec_list.stable_count
+                );
+                ps->list_results[role_osd] = {
+                    .buf = (obj_ver_id*)op->buf,
+                    .total_count = (uint64_t)op->reply.hdr.retval,
+                    .stable_count = op->reply.sec_list.stable_count,
+                };
+                // set op->buf to NULL so it doesn't get freed
+                op->buf = NULL;
+                ps->list_done++;
+                ps->list_ops.erase(role_osd);
+                delete op;
+            };
+            outbox_push(cl, op);
+            ps->list_ops[role_osd] = op;
+        }
    }
    ringloop->wakeup();
 }
-
-void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
-{
-    // Sync before listing, if not readonly
-    if (readonly)
-    {
-        submit_list_subop(role_osd, ps);
-    }
-    else if (role_osd == this->osd_num)
-    {
-        // Self
-        osd_op_t *op = new osd_op_t();
-        op->op_type = 0;
-        op->peer_fd = 0;
-        clock_gettime(CLOCK_REALTIME, &op->tv_begin);
-        op->bs_op = new blockstore_op_t();
-        op->bs_op->opcode = BS_OP_SYNC;
-        op->bs_op->callback = [this, ps, op, role_osd](blockstore_op_t *bs_op)
-        {
-            if (bs_op->retval < 0)
-            {
-                printf("Local OP_SYNC failed: %d (%s)\n", bs_op->retval, strerror(-bs_op->retval));
-                force_stop(1);
-                return;
-            }
-            add_bs_subop_stats(op);
-            delete op->bs_op;
-            op->bs_op = NULL;
-            delete op;
-            ps->list_ops.erase(role_osd);
-            submit_list_subop(role_osd, ps);
-        };
-        bs->enqueue_op(op->bs_op);
-        ps->list_ops[role_osd] = op;
-    }
-    else
-    {
-        // Peer
-        auto & cl = c_cli.clients.at(c_cli.osd_peer_fds[role_osd]);
-        osd_op_t *op = new osd_op_t();
-        op->op_type = OSD_OP_OUT;
-        op->send_list.push_back(op->req.buf, OSD_PACKET_SIZE);
-        op->peer_fd = cl.peer_fd;
-        op->req = {
-            .sec_sync = {
-                .header = {
-                    .magic = SECONDARY_OSD_OP_MAGIC,
-                    .id = c_cli.next_subop_id++,
-                    .opcode = OSD_OP_SECONDARY_SYNC,
-                },
-            },
-        };
-        op->callback = [this, ps, role_osd](osd_op_t *op)
-        {
-            if (op->reply.hdr.retval < 0)
-            {
-                // FIXME: Mark peer as failed and don't reconnect immediately after dropping the connection
-                printf("Failed to sync OSD %lu: %ld (%s), disconnecting peer\n", role_osd, op->reply.hdr.retval, strerror(-op->reply.hdr.retval));
-                ps->list_ops.erase(role_osd);
-                c_cli.stop_client(op->peer_fd);
-                delete op;
-                return;
-            }
-            delete op;
-            ps->list_ops.erase(role_osd);
-            submit_list_subop(role_osd, ps);
-        };
-        c_cli.outbox_push(op);
-        ps->list_ops[role_osd] = op;
-    }
-}
-
-void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
-{
-    if (role_osd == this->osd_num)
-    {
-        // Self
-        osd_op_t *op = new osd_op_t();
-        op->op_type = 0;
-        op->peer_fd = 0;
-        clock_gettime(CLOCK_REALTIME, &op->tv_begin);
-        op->bs_op = new blockstore_op_t();
-        op->bs_op->opcode = BS_OP_LIST;
-        op->bs_op->oid.stripe = pg_stripe_size;
-        op->bs_op->len = pg_count;
-        op->bs_op->offset = ps->pg_num-1;
-        op->bs_op->callback = [this, ps, op, role_osd](blockstore_op_t *bs_op)
-        {
-            if (op->bs_op->retval < 0)
-            {
-                throw std::runtime_error("local OP_LIST failed");
-            }
-            add_bs_subop_stats(op);
-            printf(
-                "[PG %u] Got object list from OSD %lu (local): %d object versions (%lu of them stable)\n",
-                ps->pg_num, role_osd, bs_op->retval, bs_op->version
-            );
-            ps->list_results[role_osd] = {
-                .buf = (obj_ver_id*)op->bs_op->buf,
-                .total_count = (uint64_t)op->bs_op->retval,
-                .stable_count = op->bs_op->version,
-            };
-            ps->list_ops.erase(role_osd);
-            delete op->bs_op;
-            op->bs_op = NULL;
-            delete op;
-        };
-        bs->enqueue_op(op->bs_op);
-        ps->list_ops[role_osd] = op;
-    }
-    else
-    {
-        // Peer
-        osd_op_t *op = new osd_op_t();
-        op->op_type = OSD_OP_OUT;
-        op->send_list.push_back(op->req.buf, OSD_PACKET_SIZE);
-        op->peer_fd = c_cli.osd_peer_fds[role_osd];
-        op->req = {
-            .sec_list = {
-                .header = {
-                    .magic = SECONDARY_OSD_OP_MAGIC,
-                    .id = c_cli.next_subop_id++,
-                    .opcode = OSD_OP_SECONDARY_LIST,
-                },
-                .list_pg = ps->pg_num,
-                .pg_count = pg_count,
-                .pg_stripe_size = pg_stripe_size,
-            },
-        };
-        op->callback = [this, ps, role_osd](osd_op_t *op)
-        {
-            if (op->reply.hdr.retval < 0)
-            {
-                printf("Failed to get object list from OSD %lu (retval=%ld), disconnecting peer\n", role_osd, op->reply.hdr.retval);
-                ps->list_ops.erase(role_osd);
-                c_cli.stop_client(op->peer_fd);
-                delete op;
-                return;
-            }
-            printf(
-                "[PG %u] Got object list from OSD %lu: %ld object versions (%lu of them stable)\n",
-                ps->pg_num, role_osd, op->reply.hdr.retval, op->reply.sec_list.stable_count
-            );
-            ps->list_results[role_osd] = {
-                .buf = (obj_ver_id*)op->buf,
-                .total_count = (uint64_t)op->reply.hdr.retval,
-                .stable_count = op->reply.sec_list.stable_count,
-            };
-            // set op->buf to NULL so it doesn't get freed
-            op->buf = NULL;
-            ps->list_ops.erase(role_osd);
-            delete op;
-        };
-        c_cli.outbox_push(op);
-        ps->list_ops[role_osd] = op;
-    }
-}
-
-void osd_t::discard_list_subop(osd_op_t *list_op)
-{
-    if (list_op->peer_fd == 0)
-    {
-        // Self
-        list_op->bs_op->callback = [list_op](blockstore_op_t *bs_op)
-        {
-            if (list_op->bs_op->buf)
-                free(list_op->bs_op->buf);
-            delete list_op->bs_op;
-            list_op->bs_op = NULL;
-            delete list_op;
-        };
-    }
-    else
-    {
-        // Peer
-        list_op->callback = [](osd_op_t *list_op)
-        {
-            delete list_op;
-        };
-    }
-}
-
-bool osd_t::stop_pg(pg_num_t pg_num)
-{
-    auto pg_it = pgs.find(pg_num);
-    if (pg_it == pgs.end())
-    {
-        return false;
-    }
-    auto & pg = pg_it->second;
-    if (pg.peering_state)
-    {
-        // Stop peering
-        for (auto it = pg.peering_state->list_ops.begin(); it != pg.peering_state->list_ops.end();)
-        {
-            discard_list_subop(it->second);
-        }
-        for (auto it = pg.peering_state->list_results.begin(); it != pg.peering_state->list_results.end();)
-        {
-            if (it->second.buf)
-            {
-                free(it->second.buf);
-            }
-        }
-        delete pg.peering_state;
-        pg.peering_state = NULL;
-    }
-    if (!(pg.state & PG_ACTIVE))
-    {
-        return false;
-    }
-    pg.state = pg.state & ~PG_ACTIVE | PG_STOPPING;
-    if (pg.inflight == 0 && !pg.flush_batch)
-    {
-        finish_stop_pg(pg);
-    }
-    else
-    {
-        report_pg_state(pg);
-    }
-    return true;
-}
-
-void osd_t::finish_stop_pg(pg_t & pg)
-{
-    pg.state = PG_OFFLINE;
-    report_pg_state(pg);
-}
-
-void osd_t::report_pg_state(pg_t & pg)
-{
-    pg.print_state();
-    this->pg_state_dirty.insert(pg.pg_num);
-    if (pg.state == PG_ACTIVE && (pg.target_history.size() > 0 || pg.all_peers.size() > pg.target_set.size()))
-    {
-        // Clear history of active+clean PGs
-        pg.history_changed = true;
-        pg.target_history.clear();
-        pg.all_peers = pg.target_set;
-        pg.cur_peers = pg.target_set;
-    }
-    else if (pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD))
-    {
-        // Clear history of active+left_on_dead PGs, but leave dead OSDs in all_peers
-        pg.history_changed = true;
-        pg.target_history.clear();
-        std::set<osd_num_t> dead_peers;
-        for (auto pg_osd: pg.all_peers)
-        {
-            dead_peers.insert(pg_osd);
-        }
-        for (auto pg_osd: pg.cur_peers)
-        {
-            dead_peers.erase(pg_osd);
-        }
-        for (auto pg_osd: pg.target_set)
-        {
-            if (pg_osd)
-            {
-                dead_peers.insert(pg_osd);
-            }
-        }
-        pg.all_peers.clear();
-        pg.all_peers.insert(pg.all_peers.begin(), dead_peers.begin(), dead_peers.end());
-        pg.cur_peers.clear();
-        for (auto pg_osd: pg.target_set)
-        {
-            if (pg_osd)
-            {
-                pg.cur_peers.push_back(pg_osd);
-            }
-        }
-    }
-    if (pg.state == PG_OFFLINE && !this->pg_config_applied)
-    {
-        apply_pg_config();
-    }
-    report_pg_states();
-}
--- a/osd_peering_pg.cpp
+++ b/osd_peering_pg.cpp
@@ -1,360 +1,159 @@
 #include "osd_peering_pg.h"

-struct obj_ver_role
+void pg_t::remember_object(pg_obj_state_check_t &st, std::vector<obj_ver_role> &all)
 {
-    object_id oid;
-    uint64_t version;
-    uint64_t osd_num;
-    bool is_stable;
-};
-
-inline bool operator < (const obj_ver_role & a, const obj_ver_role & b)
-{
-    // ORDER BY inode ASC, stripe & ~STRIPE_MASK ASC, version DESC, role ASC, osd_num ASC
-    return a.oid.inode < b.oid.inode || a.oid.inode == b.oid.inode && (
-        (a.oid.stripe & ~STRIPE_MASK) < (b.oid.stripe & ~STRIPE_MASK) ||
-        (a.oid.stripe & ~STRIPE_MASK) == (b.oid.stripe & ~STRIPE_MASK) && (
-            a.version > b.version ||
-            a.version == b.version && (
-                a.oid.stripe < b.oid.stripe ||
-                a.oid.stripe == b.oid.stripe && a.osd_num < b.osd_num
-            )
-        )
-    );
-}
-
-struct obj_piece_ver_t
-{
-    uint64_t max_ver = 0;
-    uint64_t stable_ver = 0;
-    uint64_t max_target = 0;
-};
-
-struct pg_obj_state_check_t
-{
-    pg_t *pg;
-    std::vector<obj_ver_role> list;
-    int list_pos;
-    int obj_start = 0, obj_end = 0, ver_start = 0, ver_end = 0;
-    object_id oid = { 0 };
-    uint64_t max_ver = 0;
-    uint64_t last_ver = 0;
-    uint64_t target_ver = 0;
-    uint64_t n_copies = 0, has_roles = 0, n_roles = 0, n_stable = 0, n_mismatched = 0;
-    uint64_t n_unstable = 0, n_buggy = 0;
-    pg_osd_set_t osd_set;
-    int log_level;
-
-    void walk();
-    void start_object();
-    void handle_version();
-    void finish_object();
-};
-
-void pg_obj_state_check_t::walk()
-{
-    pg->clean_count = 0;
-    pg->total_count = 0;
-    pg->state = 0;
-    for (list_pos = 0; list_pos < list.size(); list_pos++)
-    {
-        if (oid.inode != list[list_pos].oid.inode ||
-            oid.stripe != (list[list_pos].oid.stripe & ~STRIPE_MASK))
-        {
-            if (oid.inode != 0)
-            {
-                finish_object();
-            }
-            start_object();
-        }
-        handle_version();
-    }
-    if (oid.inode != 0)
-    {
-        finish_object();
-    }
-    if (pg->pg_cursize < pg->pg_size)
-    {
-        pg->state |= PG_DEGRADED;
-    }
-    pg->state |= PG_ACTIVE;
-    if (pg->state == PG_ACTIVE && pg->cur_peers.size() < pg->all_peers.size())
-    {
-        pg->state |= PG_LEFT_ON_DEAD;
-    }
-}
-
-void pg_obj_state_check_t::start_object()
-{
-    obj_start = list_pos;
-    oid = { .inode = list[list_pos].oid.inode, .stripe = list[list_pos].oid.stripe & ~STRIPE_MASK };
-    last_ver = max_ver = list[list_pos].version;
-    target_ver = 0;
-    ver_start = list_pos;
-    has_roles = n_copies = n_roles = n_stable = n_mismatched = 0;
-    n_unstable = n_buggy = 0;
-}
-
-void pg_obj_state_check_t::handle_version()
-{
-    if (!target_ver && last_ver != list[list_pos].version && (n_stable > 0 || n_roles >= pg->pg_minsize))
-    {
-        // Version is either stable or recoverable
-        target_ver = last_ver;
-        ver_end = list_pos;
-    }
-    if (!target_ver)
-    {
-        if (last_ver != list[list_pos].version)
-        {
-            ver_start = list_pos;
-            has_roles = n_copies = n_roles = n_stable = n_mismatched = 0;
-            last_ver = list[list_pos].version;
-        }
-        int replica = (list[list_pos].oid.stripe & STRIPE_MASK);
-        n_copies++;
-        if (replica >= pg->pg_size)
-        {
-            n_buggy++;
-        }
-        else
-        {
-            if (list[list_pos].is_stable)
-            {
-                n_stable++;
-            }
-            if (pg->cur_set[replica] != list[list_pos].osd_num)
-            {
-                n_mismatched++;
-            }
-            if (!(has_roles & (1 << replica)))
-            {
-                has_roles = has_roles | (1 << replica);
-                n_roles++;
-            }
-        }
-    }
-    if (!list[list_pos].is_stable)
-    {
-        n_unstable++;
-    }
-}
-
-void pg_obj_state_check_t::finish_object()
-{
-    if (!target_ver && (n_stable > 0 || n_roles >= pg->pg_minsize))
-    {
-        // Version is either stable or recoverable
-        target_ver = last_ver;
-        ver_end = list_pos;
-    }
-    obj_end = list_pos;
+    auto & pg = *this;
    // Remember the decision
    uint64_t state = 0;
-    if (n_buggy > 0)
+    if (st.n_roles == pg.pg_cursize)
    {
-        state = OBJ_BUGGY;
-        // FIXME: bring pg offline
-        throw std::runtime_error("buggy object state");
-    }
-    if (n_unstable > 0)
-    {
-        pg->state |= PG_HAS_UNCLEAN;
-        std::unordered_map<obj_piece_id_t, obj_piece_ver_t> pieces;
-        for (int i = obj_start; i < obj_end; i++)
-        {
-            auto & pcs = pieces[(obj_piece_id_t){ .oid = list[i].oid, .osd_num = list[i].osd_num }];
-            if (!pcs.max_ver)
-            {
-                pcs.max_ver = list[i].version;
-            }
-            if (list[i].is_stable && !pcs.stable_ver)
-            {
-                pcs.stable_ver = list[i].version;
-            }
-            if (list[i].version <= target_ver && !pcs.max_target)
-            {
-                pcs.max_target = list[i].version;
-            }
-        }
-        for (auto pp: pieces)
-        {
-            auto & pcs = pp.second;
-            if (pcs.stable_ver < pcs.max_ver)
-            {
-                auto & act = pg->flush_actions[pp.first];
-                // osd_set doesn't include rollback/stable states, so don't include them in the state code either
-                if (pcs.max_ver > target_ver)
-                {
-                    act.rollback = true;
-                    act.rollback_to = pcs.max_target;
-                }
-                if (pcs.stable_ver < (pcs.max_ver > target_ver ? pcs.max_target : pcs.max_ver))
-                {
-                    act.make_stable = true;
-                    act.stable_to = pcs.max_ver > target_ver ? pcs.max_target : pcs.max_ver;
-                }
-            }
-        }
-    }
-    if (!target_ver)
-    {
-        return;
-    }
-    if (n_roles < pg->pg_minsize)
-    {
-        if (log_level > 1)
-        {
-            printf("Object is incomplete: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
-        }
-        state = OBJ_INCOMPLETE;
-        pg->state = pg->state | PG_HAS_INCOMPLETE;
-    }
-    else if (n_roles < pg->pg_cursize)
-    {
-        if (log_level > 1)
-        {
-            printf("Object is degraded: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
-        }
-        state = OBJ_DEGRADED;
-        pg->state = pg->state | PG_HAS_DEGRADED;
-    }
-    if (n_mismatched > 0)
-    {
-        if (n_roles >= pg->pg_cursize && log_level > 1)
-        {
-            printf("Object is misplaced: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
-        }
-        state |= OBJ_MISPLACED;
-        pg->state = pg->state | PG_HAS_MISPLACED;
-    }
-    if (log_level > 1 && (n_roles < pg->pg_cursize || n_mismatched > 0))
-    {
-        if (log_level > 2)
-        {
-            for (int i = obj_start; i < obj_end; i++)
-            {
-                printf("v%lu present on: osd %lu, role %ld%s\n", list[i].version, list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
-            }
-        }
+        if (st.n_matched == pg.pg_cursize)
+            state = OBJ_CLEAN;
        else
        {
-            for (int i = ver_start; i < ver_end; i++)
-            {
-                printf("Target version present on: osd %lu, role %ld%s\n", list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
-            }
+            state = OBJ_MISPLACED;
+            pg.state = pg.state | PG_HAS_MISPLACED;
        }
    }
-    pg->total_count++;
-    if (state != 0 || ver_end < obj_end)
+    else if (st.n_roles < pg.pg_minsize)
    {
-        osd_set.clear();
-        for (int i = ver_start; i < ver_end; i++)
-        {
-            osd_set.push_back((pg_obj_loc_t){
-                .role = (list[i].oid.stripe & STRIPE_MASK),
-                .osd_num = list[i].osd_num,
-                .outdated = false,
-            });
-        }
-    }
-    if (ver_end < obj_end)
-    {
-        // Check for outdated versions not present in the current target OSD set
-        for (int i = ver_end; i < obj_end; i++)
-        {
-            int j;
-            for (j = 0; j < osd_set.size(); j++)
-            {
-                if (osd_set[j].osd_num == list[i].osd_num)
-                {
-                    break;
-                }
-            }
-            if (j >= osd_set.size() && pg->cur_set[list[i].oid.stripe & STRIPE_MASK] != list[i].osd_num)
-            {
-                osd_set.push_back((pg_obj_loc_t){
-                    .role = (list[i].oid.stripe & STRIPE_MASK),
-                    .osd_num = list[i].osd_num,
-                    .outdated = true,
-                });
-                state |= OBJ_MISPLACED;
-                pg->state = pg->state | PG_HAS_MISPLACED;
-            }
-        }
-    }
-    if (target_ver < max_ver)
-    {
-        pg->ver_override[oid] = target_ver;
-    }
-    if (state == 0)
-    {
-        pg->clean_count++;
+        printf("Object is unfound: inode=%lu stripe=%lu version=%lu/%lu\n", st.oid.inode, st.oid.stripe, st.target_ver, st.max_ver);
+        state = OBJ_INCOMPLETE;
+        pg.state = pg.state | PG_HAS_UNFOUND;
    }
    else
    {
-        auto it = pg->state_dict.find(osd_set);
-        if (it == pg->state_dict.end())
+        printf("Object is degraded: inode=%lu stripe=%lu version=%lu/%lu\n", st.oid.inode, st.oid.stripe, st.target_ver, st.max_ver);
+        state = OBJ_DEGRADED;
+        pg.state = pg.state | PG_HAS_DEGRADED;
+    }
+    if (st.n_copies > pg.pg_size)
+    {
+        state |= OBJ_OVERCOPIED;
+        pg.state = pg.state | PG_HAS_UNCLEAN;
+    }
+    if (st.n_stable < st.n_copies)
+    {
+        state |= OBJ_NEEDS_STABLE;
+        pg.state = pg.state | PG_HAS_UNCLEAN;
+    }
+    if (st.target_ver < st.max_ver || st.has_old_unstable)
+    {
+        state |= OBJ_NEEDS_ROLLBACK;
+        pg.state = pg.state | PG_HAS_UNCLEAN;
+        pg.ver_override[st.oid] = st.target_ver;
+    }
+    if (st.is_buggy)
+    {
+        state |= OBJ_BUGGY;
+        // FIXME: bring pg offline
+        throw std::runtime_error("buggy object state");
+    }
+    if (state != OBJ_CLEAN)
+    {
+        st.osd_set.clear();
+        for (int i = st.ver_start; i < st.ver_end; i++)
+        {
+            st.osd_set.push_back((pg_obj_loc_t){
+                .role = (all[i].oid.stripe & STRIPE_MASK),
+                .osd_num = all[i].osd_num,
+                .stable = all[i].is_stable,
+            });
+        }
+        std::sort(st.osd_set.begin(), st.osd_set.end());
+        auto it = pg.state_dict.find(st.osd_set);
+        if (it == pg.state_dict.end())
        {
            std::vector<uint64_t> read_target;
-            read_target.resize(pg->pg_size);
-            for (int i = 0; i < pg->pg_size; i++)
+            read_target.resize(pg.pg_size);
+            for (int i = 0; i < pg.pg_size; i++)
            {
                read_target[i] = 0;
            }
-            for (auto & o: osd_set)
+            for (auto & o: st.osd_set)
            {
-                if (!o.outdated)
-                {
-                    read_target[o.role] = o.osd_num;
-                }
+                read_target[o.role] = o.osd_num;
            }
-            pg->state_dict[osd_set] = {
+            pg.state_dict[st.osd_set] = {
                .read_target = read_target,
-                .osd_set = osd_set,
+                .osd_set = st.osd_set,
                .state = state,
                .object_count = 1,
            };
-            it = pg->state_dict.find(osd_set);
+            it = pg.state_dict.find(st.osd_set);
        }
        else
        {
            it->second.object_count++;
        }
-        if (state & OBJ_INCOMPLETE)
+        pg.obj_states[st.oid] = &it->second;
+        if (st.target_ver < st.max_ver)
        {
-            pg->incomplete_objects[oid] = &it->second;
+            pg.ver_override[st.oid] = st.target_ver;
        }
-        else if (state & OBJ_DEGRADED)
+        if (state & (OBJ_NEEDS_ROLLBACK | OBJ_NEEDS_STABLE))
        {
-            pg->degraded_objects[oid] = &it->second;
-        }
-        else
-        {
-            pg->misplaced_objects[oid] = &it->second;
+            spp::sparse_hash_map<obj_piece_id_t, obj_piece_ver_t> pieces;
+            for (int i = st.obj_start; i < st.obj_end; i++)
+            {
+                auto & pcs = pieces[(obj_piece_id_t){ .oid = all[i].oid, .osd_num = all[i].osd_num }];
+                if (!pcs.max_ver)
+                {
+                    pcs.max_ver = all[i].version;
+                }
+                if (all[i].is_stable && !pcs.stable_ver)
+                {
+                    pcs.stable_ver = all[i].version;
+                }
+            }
+            for (auto pp: pieces)
+            {
+                auto & pcs = pp.second;
+                if (pcs.stable_ver < pcs.max_ver)
+                {
+                    auto & act = obj_stab_actions[pp.first];
+                    if (pcs.max_ver > st.target_ver)
+                    {
+                        act.rollback = true;
+                        act.rollback_to = st.target_ver;
+                    }
+                    else if (pcs.max_ver < st.target_ver && pcs.stable_ver < pcs.max_ver)
+                    {
+                        act.rollback = true;
+                        act.rollback_to = pcs.stable_ver;
+                    }
+                    if (pcs.max_ver >= st.target_ver && pcs.stable_ver < st.target_ver)
+                    {
+                        act.make_stable = true;
+                        act.stable_to = st.target_ver;
+                    }
+                }
+            }
        }
    }
+    else
+        pg.clean_count++;
+    pg.total_count++;
 }

 // FIXME: Write at least some tests for this function
-void pg_t::calc_object_states(int log_level)
+void pg_t::calc_object_states()
 {
+    auto & pg = *this;
    // Copy all object lists into one array
-    pg_obj_state_check_t st;
-    st.log_level = log_level;
-    st.pg = this;
-    auto ps = peering_state;
+    std::vector<obj_ver_role> all;
+    auto ps = pg.peering_state;
    for (auto it: ps->list_results)
    {
        auto nstab = it.second.stable_count;
        auto n = it.second.total_count;
        auto osd_num = it.first;
-        uint64_t start = st.list.size();
-        st.list.resize(start + n);
+        uint64_t start = all.size();
+        all.resize(start + n);
        obj_ver_id *ov = it.second.buf;
        for (uint64_t i = 0; i < n; i++, ov++)
        {
-            st.list[start+i] = {
+            all[start+i] = {
                .oid = ov->oid,
                .version = ov->version,
                .osd_num = osd_num,
@@ -366,26 +165,101 @@ void pg_t::calc_object_states(int log_level)
    }
    ps->list_results.clear();
    // Sort
-    std::sort(st.list.begin(), st.list.end());
+    std::sort(all.begin(), all.end());
    // Walk over it and check object states
-    st.walk();
-}
-
-void pg_t::print_state()
-{
+    pg.clean_count = 0;
+    pg.total_count = 0;
+    pg.state = 0;
+    int replica = 0;
+    pg_obj_state_check_t st;
+    for (int i = 0; i < all.size(); i++)
+    {
+        if (st.oid.inode != all[i].oid.inode ||
+            st.oid.stripe != (all[i].oid.stripe & ~STRIPE_MASK))
+        {
+            if (st.oid.inode != 0)
+            {
+                // Remember object state
+                st.obj_end = st.ver_end = i;
+                remember_object(st, all);
+            }
+            st.obj_start = st.ver_start = i;
+            st.oid = { .inode = all[i].oid.inode, .stripe = all[i].oid.stripe & ~STRIPE_MASK };
+            st.max_ver = st.target_ver = all[i].version;
+            st.has_roles = st.n_copies = st.n_roles = st.n_stable = st.n_matched = 0;
+            st.is_buggy = st.has_old_unstable = false;
+        }
+        else if (st.target_ver != all[i].version)
+        {
+            if (st.n_stable > 0 || st.n_roles >= pg.pg_minsize)
+            {
+                // Last processed version is either recoverable or stable, choose it as target and skip previous versions
+                st.ver_end = i;
+                i++;
+                while (i < all.size() && st.oid.inode == all[i].oid.inode &&
+                    st.oid.stripe == (all[i].oid.stripe & ~STRIPE_MASK))
+                {
+                    if (!all[i].is_stable)
+                    {
+                        st.has_old_unstable = true;
+                    }
+                    i++;
+                }
+                st.obj_end = i;
+                i--;
+                continue;
+            }
+            else
+            {
+                // Last processed version is unstable and unrecoverable
+                // We'll know that because target_ver < max_ver
+                st.ver_start = i;
+                st.target_ver = all[i].version;
+                st.has_roles = st.n_copies = st.n_roles = st.n_stable = st.n_matched = 0;
+            }
+        }
+        replica = (all[i].oid.stripe & STRIPE_MASK);
+        st.n_copies++;
+        if (replica >= pg.pg_size)
+        {
+            // FIXME In the future, check it against the PG epoch number to handle replication factor/scheme changes
+            st.is_buggy = true;
+        }
+        else
+        {
+            if (all[i].is_stable)
+            {
+                st.n_stable++;
+            }
+            if (pg.cur_set[replica] == all[i].osd_num)
+            {
+                st.n_matched++;
+            }
+            if (!(st.has_roles & (1 << replica)))
+            {
+                st.has_roles = st.has_roles | (1 << replica);
+                st.n_roles++;
+            }
+        }
+    }
+    if (st.oid.inode != 0)
+    {
+        // Remember object state
+        st.obj_end = st.ver_end = all.size();
+        remember_object(st, all);
+    }
+    if (pg.pg_cursize < pg.pg_size)
+    {
+        pg.state = pg.state | PG_DEGRADED;
+    }
    printf(
-        "[PG %u] is %s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pg_num,
-        (state & PG_STARTING) ? "starting" : "",
-        (state & PG_OFFLINE) ? "offline" : "",
-        (state & PG_PEERING) ? "peering" : "",
-        (state & PG_INCOMPLETE) ? "incomplete" : "",
-        (state & PG_ACTIVE) ? "active" : "",
-        (state & PG_STOPPING) ? "stopping" : "",
-        (state & PG_DEGRADED) ? " + degraded" : "",
-        (state & PG_HAS_INCOMPLETE) ? " + has_incomplete" : "",
-        (state & PG_HAS_DEGRADED) ? " + has_degraded" : "",
-        (state & PG_HAS_MISPLACED) ? " + has_misplaced" : "",
-        (state & PG_HAS_UNCLEAN) ? " + has_unclean" : "",
-        total_count
+        "PG %u is active%s%s%s%s%s (%lu objects)\n", pg.pg_num,
+        (pg.state & PG_DEGRADED) ? " + degraded" : "",
+        (pg.state & PG_HAS_UNFOUND) ? " + has_unfound" : "",
+        (pg.state & PG_HAS_DEGRADED) ? " + has_degraded" : "",
+        (pg.state & PG_HAS_MISPLACED) ? " + has_misplaced" : "",
+        (pg.state & PG_HAS_UNCLEAN) ? " + has_unclean" : "",
+        pg.total_count
    );
+    pg.state = pg.state | PG_ACTIVE;
 }
--- a/osd_peering_pg.h
+++ b/osd_peering_pg.h
@@ -1,19 +1,43 @@
 #include <map>
-#include <unordered_map>
 #include <vector>
 #include <algorithm>

-#include "cpp-btree/btree_map.h"
-
 #include "object_id.h"
 #include "osd_ops.h"
-#include "pg_states.h"
+
+#include "sparsepp/sparsepp/spp.h"
+
+// Placement group states
+// Exactly one of these:
+#define PG_OFFLINE (1<<0)
+#define PG_PEERING (1<<1)
+#define PG_INCOMPLETE (1<<2)
+#define PG_ACTIVE (1<<3)
+// Plus any of these:
+#define PG_DEGRADED (1<<4)
+#define PG_HAS_UNFOUND (1<<5)
+#define PG_HAS_DEGRADED (1<<6)
+#define PG_HAS_MISPLACED (1<<7)
+#define PG_HAS_UNCLEAN (1<<8)
+
+// FIXME: Safe default that doesn't depend on parity_block_size or pg_parity_size
+#define STRIPE_MASK ((uint64_t)4096 - 1)
+
+// OSD object states
+#define OBJ_CLEAN 0x01
+#define OBJ_MISPLACED 0x02
+#define OBJ_DEGRADED 0x03
+#define OBJ_INCOMPLETE 0x04
+#define OBJ_NEEDS_STABLE 0x10000
+#define OBJ_NEEDS_ROLLBACK 0x20000
+#define OBJ_OVERCOPIED 0x40000
+#define OBJ_BUGGY 0x80000

 struct pg_obj_loc_t
 {
    uint64_t role;
    osd_num_t osd_num;
-    bool outdated;
+    bool stable;
 };

 typedef std::vector<pg_obj_loc_t> pg_osd_set_t;
@@ -40,9 +64,28 @@ struct osd_op_t;
 struct pg_peering_state_t
 {
    // osd_num -> list result
-    std::unordered_map<osd_num_t, osd_op_t*> list_ops;
-    std::unordered_map<osd_num_t, pg_list_result_t> list_results;
-    pg_num_t pg_num = 0;
+    spp::sparse_hash_map<osd_num_t, osd_op_t*> list_ops;
+    spp::sparse_hash_map<osd_num_t, pg_list_result_t> list_results;
+    int list_done = 0;
+};
+
+struct pg_obj_state_check_t
+{
+    int obj_start = 0, obj_end = 0, ver_start = 0, ver_end = 0;
+    object_id oid = { 0 };
+    uint64_t max_ver = 0;
+    uint64_t target_ver = 0;
+    uint64_t n_copies = 0, has_roles = 0, n_roles = 0, n_stable = 0, n_matched = 0;
+    bool is_buggy = false, has_old_unstable = false;
+    pg_osd_set_t osd_set;
+};
+
+struct obj_ver_role
+{
+    object_id oid;
+    uint64_t version;
+    uint64_t osd_num;
+    bool is_stable;
 };

 struct obj_piece_id_t
@@ -51,63 +94,60 @@ struct obj_piece_id_t
    uint64_t osd_num;
 };

-struct flush_action_t
+struct obj_piece_ver_t
+{
+    uint64_t max_ver = 0;
+    uint64_t stable_ver = 0;
+};
+
+struct obj_stab_action_t
 {
    bool rollback = false, make_stable = false;
    uint64_t stable_to = 0, rollback_to = 0;
-    bool submitted = false;
-};
-
-struct pg_flush_batch_t
-{
-    std::map<osd_num_t, std::vector<obj_ver_id>> rollback_lists;
-    std::map<osd_num_t, std::vector<obj_ver_id>> stable_lists;
-    int flush_ops = 0, flush_done = 0;
-    int flush_objects = 0;
 };

 struct pg_t
 {
-    int state = 0;
+    int state;
    uint64_t pg_cursize = 3, pg_size = 3, pg_minsize = 2;
    pg_num_t pg_num;
    uint64_t clean_count = 0, total_count = 0;
-    // target history and all potential peers
-    std::vector<std::vector<osd_num_t>> target_history;
-    std::vector<osd_num_t> all_peers;
-    bool history_changed = false;
-    // peer list from the last peering event
-    std::vector<osd_num_t> cur_peers;
    // target_set is the "correct" peer OSD set for this PG
    std::vector<osd_num_t> target_set;
    // cur_set is the current set of connected peer OSDs for this PG
    // cur_set = (role => osd_num or UINT64_MAX if missing). role numbers begin with zero
    std::vector<osd_num_t> cur_set;
-    // same thing in state_dict-like format
-    pg_osd_set_t cur_loc_set;
    // moved object map. by default, each object is considered to reside on the cur_set.
    // this map stores all objects that differ.
    // it may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario
    // which is up to ~192 MB per 1 TB in the worst case scenario
    std::map<pg_osd_set_t, pg_osd_set_state_t> state_dict;
-    btree::btree_map<object_id, pg_osd_set_state_t*> incomplete_objects, misplaced_objects, degraded_objects;
-    std::map<obj_piece_id_t, flush_action_t> flush_actions;
-    btree::btree_map<object_id, uint64_t> ver_override;
+    spp::sparse_hash_map<object_id, pg_osd_set_state_t*> obj_states;
+    std::map<obj_piece_id_t, obj_stab_action_t> obj_stab_actions;
+    spp::sparse_hash_map<object_id, uint64_t> ver_override;
    pg_peering_state_t *peering_state = NULL;
-    pg_flush_batch_t *flush_batch = NULL;

-    int inflight = 0; // including write_queue
    std::multimap<object_id, osd_op_t*> write_queue;

-    void calc_object_states(int log_level);
-    void print_state();
+    void calc_object_states();
+    void remember_object(pg_obj_state_check_t &st, std::vector<obj_ver_role> &all);
 };

 inline bool operator < (const pg_obj_loc_t &a, const pg_obj_loc_t &b)
 {
-    return a.outdated < b.outdated ||
-        a.outdated == b.outdated && a.role < b.role ||
-        a.outdated == b.outdated && a.role == b.role && a.osd_num < b.osd_num;
+    return a.role < b.role || a.role == b.role && a.osd_num < b.osd_num ||
+        a.role == b.role && a.osd_num == b.osd_num && a.stable < b.stable;
+}
+
+inline bool operator < (const obj_ver_role & a, const obj_ver_role & b)
+{
+    // ORDER BY inode ASC, stripe & ~STRIPE_MASK ASC, version DESC, osd_num ASC
+    return a.oid.inode < b.oid.inode || a.oid.inode == b.oid.inode && (
+        (a.oid.stripe & ~STRIPE_MASK) < (b.oid.stripe & ~STRIPE_MASK) ||
+        (a.oid.stripe & ~STRIPE_MASK) == (b.oid.stripe & ~STRIPE_MASK) && (
+            a.version > b.version || a.version == b.version && a.osd_num < b.osd_num
+        )
+    );
 }

 inline bool operator == (const obj_piece_id_t & a, const obj_piece_id_t & b)
@@ -132,6 +172,7 @@ namespace std
                // Copy-pasted from spp::hash_combine()
                seed ^= (e.role + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
                seed ^= (e.osd_num + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
+                seed ^= ((e.stable ? 1 : 0) + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
            }
            return seed;
        }
--- a/osd_peering_pg_test.cpp
+++ b/osd_peering_pg_test.cpp
@@ -1,54 +0,0 @@
-#define _LARGEFILE64_SOURCE
-
-#include "osd_peering_pg.h"
-#define STRIPE_SHIFT 12
-
-/**
- * TODO tests for object & pg state calculation.
- *
- * 1) pg=1,2,3. objects:
- *    v1=1s,2s,3s -> clean
- *    v1=1s,2s,3 v2=1s,2s,_ -> degraded + needs_rollback
- *    v1=1s,2s,_ -> degraded
- *    v1=1s,2s,3s v2=1,6,_ -> degraded + needs_stabilize
- *    v1=2s,1s,3s -> misplaced
- *    v1=4,5,6 -> misplaced + needs_stabilize
- *    v1=1s,2s,6s -> misplaced
- * 2) ...
- */
-int main(int argc, char *argv[])
-{
-    pg_t pg = {
-        .state = PG_PEERING,
-        .pg_num = 1,
-        .target_set = { 1, 2, 3 },
-        .cur_set = { 1, 2, 3 },
-        .peering_state = new pg_peering_state_t(),
-    };
-    for (uint64_t osd_num = 1; osd_num <= 3; osd_num++)
-    {
-        pg_list_result_t r = {
-            .buf = (obj_ver_id*)malloc(sizeof(obj_ver_id) * 1024*1024*8),
-            .total_count = 1024*1024*8,
-            .stable_count = (uint64_t)(1024*1024*8 - (osd_num == 1 ? 10 : 0)),
-        };
-        for (uint64_t i = 0; i < r.total_count; i++)
-        {
-            r.buf[i] = {
-                .oid = {
-                    .inode = 1,
-                    .stripe = (i << STRIPE_SHIFT) | (osd_num-1),
-                },
-                .version = (uint64_t)(osd_num == 1 && i >= r.total_count - 10 ? 2 : 1),
-            };
-        }
-        pg.peering_state->list_results[osd_num] = r;
-    }
-    pg.calc_object_states(0);
-    printf("deviation variants=%ld clean=%lu\n", pg.state_dict.size(), pg.clean_count);
-    for (auto it: pg.state_dict)
-    {
-        printf("dev: state=%lx\n", it.second.state);
-    }
-    return 0;
-}
--- a/osd_primary.cpp
+++ b/osd_primary.cpp
@@ -1,81 +1,98 @@
-#include "osd_primary.h"
+#include "osd.h"
+#include "osd_rmw.h"
+
+#define SUBMIT_READ 0
+#define SUBMIT_RMW_READ 1
+#define SUBMIT_WRITE 2

 // read: read directly or read paired stripe(s), reconstruct, return
-// write: read paired stripe(s), reconstruct, modify, calculate parity, write
+// write: read paired stripe(s), modify, write
 //
 // nuance: take care to read the same version from paired stripes!
 // to do so, we remember "last readable" version until a write request completes
 // and we postpone other write requests to the same stripe until completion of previous ones
 //
-// sync: sync peers, get unstable versions, stabilize them
+// sync: sync peers, get unstable versions from somewhere, stabilize them
+
+struct unstable_osd_num_t
+{
+    osd_num_t osd_num;
+    int start, len;
+};
+
+struct osd_primary_op_data_t
+{
+    int st = 0;
+    pg_num_t pg_num;
+    object_id oid;
+    uint64_t target_ver;
+    uint64_t fact_ver = 0;
+    int n_subops = 0, done = 0, errors = 0;
+    int degraded = 0, pg_size, pg_minsize;
+    osd_rmw_stripe_t *stripes;
+    osd_op_t *subops = NULL;
+    // Used by sync only. FIXME: these pointers require explicit freeing
+    std::vector<unstable_osd_num_t> *unstable_write_osds = NULL;
+    obj_ver_id *unstable_writes = NULL;
+};
+
+void osd_t::finish_primary_op(osd_op_t *cur_op, int retval)
+{
+    // FIXME add separate magic number
+    auto cl_it = clients.find(cur_op->peer_fd);
+    if (cl_it != clients.end())
+    {
+        cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
+        cur_op->reply.hdr.id = cur_op->req.hdr.id;
+        cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
+        cur_op->reply.hdr.retval = retval;
+        outbox_push(cl_it->second, cur_op);
+    }
+    else
+    {
+        delete cur_op;
+    }
+}

 bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
 {
    // PG number is calculated from the offset
    // Our EC scheme stores data in fixed chunks equal to (K*block size)
-    // K = pg_minsize and will be a property of the inode. Not it's hardcoded (FIXME)
-    uint64_t pg_block_size = bs_block_size * 2;
-    object_id oid = {
-        .inode = cur_op->req.rw.inode,
-        // oid.stripe = starting offset of the parity stripe
-        .stripe = (cur_op->req.rw.offset/pg_block_size)*pg_block_size,
-    };
-    pg_num_t pg_num = (cur_op->req.rw.inode + oid.stripe/pg_stripe_size) % pg_count + 1;
-    auto pg_it = pgs.find(pg_num);
-    if (pg_it == pgs.end() || !(pg_it->second.state & PG_ACTIVE))
+    // But we must not use K in the process of calculating the PG number
+    // So we calculate the PG number using a separate setting which should be per-inode (FIXME)
+    // FIXME Real pg_num should equal the below expression + 1
+    pg_num_t pg_num = (cur_op->req.rw.inode + cur_op->req.rw.offset / parity_block_size) % pg_count;
+    // FIXME: Postpone operations in inactive PGs
+    if (pg_num > pgs.size() || !(pgs[pg_num].state & PG_ACTIVE))
    {
-        // This OSD is not primary for this PG or the PG is inactive
-        finish_op(cur_op, -EPIPE);
+        finish_primary_op(cur_op, -EINVAL);
        return false;
    }
-    if ((cur_op->req.rw.offset + cur_op->req.rw.len) > (oid.stripe + pg_block_size) ||
+    uint64_t pg_parity_size = bs_block_size * pgs[pg_num].pg_minsize;
+    object_id oid = {
+        .inode = cur_op->req.rw.inode,
+        // oid.stripe = starting offset of the parity stripe, so it can be mapped back to the PG
+        .stripe = (cur_op->req.rw.offset / parity_block_size) * parity_block_size +
+            ((cur_op->req.rw.offset % parity_block_size) / pg_parity_size) * pg_parity_size
+    };
+    if ((cur_op->req.rw.offset + cur_op->req.rw.len) > (oid.stripe + pg_parity_size) ||
        (cur_op->req.rw.offset % bs_disk_alignment) != 0 ||
        (cur_op->req.rw.len % bs_disk_alignment) != 0)
    {
-        finish_op(cur_op, -EINVAL);
+        finish_primary_op(cur_op, -EINVAL);
        return false;
    }
    osd_primary_op_data_t *op_data = (osd_primary_op_data_t*)calloc(
-        sizeof(osd_primary_op_data_t) + sizeof(osd_rmw_stripe_t) * pg_it->second.pg_size, 1
+        sizeof(osd_primary_op_data_t) + sizeof(osd_rmw_stripe_t) * pgs[pg_num].pg_size, 1
    );
    op_data->pg_num = pg_num;
    op_data->oid = oid;
    op_data->stripes = ((osd_rmw_stripe_t*)(op_data+1));
    cur_op->op_data = op_data;
-    split_stripes(pg_it->second.pg_minsize, bs_block_size, (uint32_t)(cur_op->req.rw.offset - oid.stripe), cur_op->req.rw.len, op_data->stripes);
-    pg_it->second.inflight++;
+    split_stripes(pgs[pg_num].pg_minsize, bs_block_size, (uint32_t)(cur_op->req.rw.offset - oid.stripe), cur_op->req.rw.len, op_data->stripes);
    return true;
 }

-static uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state)
-{
-    if (!(pg.state & (PG_HAS_INCOMPLETE | PG_HAS_DEGRADED | PG_HAS_MISPLACED)))
-    {
-        *object_state = NULL;
-        return def;
-    }
-    auto st_it = pg.incomplete_objects.find(oid);
-    if (st_it != pg.incomplete_objects.end())
-    {
-        *object_state = st_it->second;
-        return st_it->second->read_target.data();
-    }
-    st_it = pg.degraded_objects.find(oid);
-    if (st_it != pg.degraded_objects.end())
-    {
-        *object_state = st_it->second;
-        return st_it->second->read_target.data();
-    }
-    st_it = pg.misplaced_objects.find(oid);
-    if (st_it != pg.misplaced_objects.end())
-    {
-        *object_state = st_it->second;
-        return st_it->second->read_target.data();
-    }
-    *object_state = NULL;
-    return def;
-}
-
 void osd_t::continue_primary_read(osd_op_t *cur_op)
 {
    if (!cur_op->op_data && !prepare_primary_rw(cur_op))
@@ -106,10 +123,14 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
        else
        {
            // PG may be degraded or have misplaced objects
-            uint64_t* cur_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
+            auto st_it = pg.obj_states.find(op_data->oid);
+            uint64_t* cur_set = (st_it != pg.obj_states.end()
+                ? st_it->second->read_target.data()
+                : pg.cur_set.data());
            if (extend_missing_stripes(op_data->stripes, cur_set, pg.pg_minsize, pg.pg_size) < 0)
            {
-                finish_op(cur_op, -EIO);
+                free(op_data);
+                finish_primary_op(cur_op, -EIO);
                return;
            }
            // Submit reads
@@ -126,7 +147,9 @@ resume_1:
 resume_2:
    if (op_data->errors > 0)
    {
-        finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
+        free(op_data);
+        cur_op->op_data = NULL;
+        finish_primary_op(cur_op, -EIO);
        return;
    }
    if (op_data->degraded)
@@ -150,34 +173,143 @@ resume_2:
            }
        }
    }
-    finish_op(cur_op, cur_op->req.rw.len);
+    free(op_data);
+    cur_op->op_data = NULL;
+    finish_primary_op(cur_op, cur_op->req.rw.len);
 }

-bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
+void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t* osd_set, osd_op_t *cur_op)
+{
+    bool w = submit_type == SUBMIT_WRITE;
+    osd_primary_op_data_t *op_data = cur_op->op_data;
+    osd_rmw_stripe_t *stripes = op_data->stripes;
+    // Allocate subops
+    int n_subops = 0, zero_read = -1;
+    for (int role = 0; role < pg_size; role++)
+    {
+        if (osd_set[role] == this->osd_num || osd_set[role] != 0 && zero_read == -1)
+        {
+            zero_read = role;
+        }
+        if (osd_set[role] != 0 && (w || stripes[role].read_end != 0))
+        {
+            n_subops++;
+        }
+    }
+    if (!n_subops && submit_type == SUBMIT_RMW_READ)
+    {
+        n_subops = 1;
+    }
+    else
+    {
+        zero_read = -1;
+    }
+    osd_op_t *subops = new osd_op_t[n_subops];
+    op_data->done = op_data->errors = 0;
+    op_data->n_subops = n_subops;
+    op_data->subops = subops;
+    int subop = 0;
+    for (int role = 0; role < pg_size; role++)
+    {
+        // We always submit zero-length writes to all replicas, even if the stripe is not modified
+        if (!(w || stripes[role].read_end != 0 || zero_read == role))
+        {
+            continue;
+        }
+        osd_num_t role_osd_num = osd_set[role];
+        if (role_osd_num != 0)
+        {
+            if (role_osd_num == this->osd_num)
+            {
+                subops[subop].bs_op = new blockstore_op_t({
+                    .opcode = (uint64_t)(w ? BS_OP_WRITE : BS_OP_READ),
+                    .callback = [cur_op, this](blockstore_op_t *subop)
+                    {
+                        handle_primary_subop(cur_op, subop->retval == subop->len, subop->version);
+                    },
+                    .oid = {
+                        .inode = op_data->oid.inode,
+                        .stripe = op_data->oid.stripe | role,
+                    },
+                    .version = w ? 0 : (submit_type == SUBMIT_RMW_READ ? UINT64_MAX : op_data->target_ver),
+                    .offset = w ? stripes[role].write_start : stripes[role].read_start,
+                    .len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
+                    .buf = w ? stripes[role].write_buf : stripes[role].read_buf,
+                });
+                bs->enqueue_op(subops[subop].bs_op);
+            }
+            else
+            {
+                subops[subop].op_type = OSD_OP_OUT;
+                subops[subop].send_list.push_back(subops[subop].req.buf, OSD_PACKET_SIZE);
+                subops[subop].peer_fd = this->osd_peer_fds.at(role_osd_num);
+                subops[subop].req.sec_rw = {
+                    .header = {
+                        .magic = SECONDARY_OSD_OP_MAGIC,
+                        .id = this->next_subop_id++,
+                        .opcode = (uint64_t)(w ? OSD_OP_SECONDARY_WRITE : OSD_OP_SECONDARY_READ),
+                    },
+                    .oid = {
+                        .inode = op_data->oid.inode,
+                        .stripe = op_data->oid.stripe | role,
+                    },
+                    .version = w ? 0 : (submit_type == SUBMIT_RMW_READ ? UINT64_MAX : op_data->target_ver),
+                    .offset = w ? stripes[role].write_start : stripes[role].read_start,
+                    .len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
+                };
+                subops[subop].buf = w ? stripes[role].write_buf : stripes[role].read_buf;
+                if (w && stripes[role].write_end > 0)
+                {
+                    subops[subop].send_list.push_back(stripes[role].write_buf, stripes[role].write_end - stripes[role].write_start);
+                }
+                subops[subop].callback = [cur_op, this](osd_op_t *subop)
+                {
+                    // Reset buf (it points to a stripe buffer) so it doesn't get freed
+                    subop->buf = NULL;
+                    handle_primary_subop(cur_op, subop->reply.hdr.retval == subop->req.sec_rw.len, subop->reply.sec_rw.version);
+                };
+                outbox_push(clients[subops[subop].peer_fd], &subops[subop]);
+            }
+            subop++;
+        }
+    }
+}
+
+void osd_t::handle_primary_subop(osd_op_t *cur_op, int ok, uint64_t version)
 {
    osd_primary_op_data_t *op_data = cur_op->op_data;
-    // Check if actions are pending for this object
-    auto act_it = pg.flush_actions.lower_bound((obj_piece_id_t){
-        .oid = op_data->oid,
-        .osd_num = 0,
-    });
-    if (act_it != pg.flush_actions.end() &&
-        act_it->first.oid.inode == op_data->oid.inode &&
-        (act_it->first.oid.stripe & ~STRIPE_MASK) == op_data->oid.stripe)
+    op_data->fact_ver = version;
+    if (!ok)
    {
-        pg.write_queue.emplace(op_data->oid, cur_op);
-        return false;
+        // FIXME: Handle errors
+        op_data->errors++;
    }
-    // Check if there are other write requests to the same object
-    auto vo_it = pg.write_queue.find(op_data->oid);
-    if (vo_it != pg.write_queue.end())
+    else
    {
-        op_data->st = 1;
-        pg.write_queue.emplace(op_data->oid, cur_op);
-        return false;
+        op_data->done++;
+    }
+    if ((op_data->errors + op_data->done) >= op_data->n_subops)
+    {
+        delete[] op_data->subops;
+        op_data->subops = NULL;
+        op_data->st++;
+        if (cur_op->req.hdr.opcode == OSD_OP_READ)
+        {
+            continue_primary_read(cur_op);
+        }
+        else if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
+        {
+            continue_primary_write(cur_op);
+        }
+        else if (cur_op->req.hdr.opcode == OSD_OP_SYNC)
+        {
+            continue_primary_sync(cur_op);
+        }
+        else
+        {
+            throw std::runtime_error("BUG: unknown opcode");
+        }
    }
-    pg.write_queue.emplace(op_data->oid, cur_op);
-    return true;
 }

 void osd_t::continue_primary_write(osd_op_t *cur_op)
@@ -187,115 +319,89 @@ void osd_t::continue_primary_write(osd_op_t *cur_op)
        return;
    }
    osd_primary_op_data_t *op_data = cur_op->op_data;
+    // FIXME: Handle operation cancel
    auto & pg = pgs[op_data->pg_num];
    if (op_data->st == 1)      goto resume_1;
    else if (op_data->st == 2) goto resume_2;
    else if (op_data->st == 3) goto resume_3;
    else if (op_data->st == 4) goto resume_4;
    else if (op_data->st == 5) goto resume_5;
-    else if (op_data->st == 6) goto resume_6;
-    else if (op_data->st == 7) goto resume_7;
-    else if (op_data->st == 8) goto resume_8;
-    else if (op_data->st == 9) goto resume_9;
    assert(op_data->st == 0);
-    if (!check_write_queue(cur_op, pg))
+    // Check if actions are pending for this object
    {
-        return;
+        auto act_it = pg.obj_stab_actions.lower_bound((obj_piece_id_t){
+            .oid = op_data->oid,
+            .osd_num = 0,
+        });
+        if (act_it != pg.obj_stab_actions.end() &&
+            act_it->first.oid.inode == op_data->oid.inode &&
+            (act_it->first.oid.stripe & ~STRIPE_MASK) == op_data->oid.stripe)
+        {
+            // FIXME postpone the request until actions are done
+            free(op_data);
+            finish_primary_op(cur_op, -EIO);
+            return;
+        }
+    }
+    // Check if there are other write requests to the same object
+    {
+        auto vo_it = pg.write_queue.find(op_data->oid);
+        if (vo_it != pg.write_queue.end())
+        {
+            op_data->st = 1;
+            pg.write_queue.emplace(op_data->oid, cur_op);
+            return;
+        }
+        pg.write_queue.emplace(op_data->oid, cur_op);
    }
 resume_1:
-    // Determine blocks to read and write
-    // Missing chunks are allowed to be overwritten even in incomplete objects
-    // FIXME: Allow to do small writes to the old (degraded/misplaced) OSD set for the lower performance impact
-    op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
-    cur_op->rmw_buf = calc_rmw(cur_op->buf, op_data->stripes, op_data->prev_set,
-        pg.pg_size, pg.pg_minsize, pg.pg_cursize, pg.cur_set.data(), bs_block_size);
+    // Determine blocks to read
+    cur_op->rmw_buf = calc_rmw_reads(cur_op->buf, op_data->stripes, pg.cur_set.data(), pg.pg_size, pg.pg_minsize, pg.pg_cursize);
    // Read required blocks
-    submit_primary_subops(SUBMIT_RMW_READ, pg.pg_size, op_data->prev_set, cur_op);
+    submit_primary_subops(SUBMIT_RMW_READ, pg.pg_size, pg.cur_set.data(), cur_op);
 resume_2:
    op_data->st = 2;
    return;
 resume_3:
-    if (op_data->errors > 0)
-    {
-        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
-        return;
-    }
    // Save version override for parallel reads
    pg.ver_override[op_data->oid] = op_data->fact_ver;
-    // Recover missing stripes, calculate parity
-    calc_rmw_parity(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size);
+    // Calculate parity
+    calc_rmw_parity(op_data->stripes, pg.pg_size);
    // Send writes
    submit_primary_subops(SUBMIT_WRITE, pg.pg_size, pg.cur_set.data(), cur_op);
 resume_4:
    op_data->st = 4;
    return;
 resume_5:
-    if (op_data->errors > 0)
-    {
-        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
-        return;
-    }
-    if (op_data->fact_ver == 1)
-    {
-        // Object is created
-        pg.clean_count++;
-        pg.total_count++;
-    }
-    if (op_data->object_state)
+    // Remember version as unstable
+    osd_num_t *osd_set = pg.cur_set.data();
+    for (int role = 0; role < pg.pg_size; role++)
    {
+        if (osd_set[role] != 0)
        {
-            int recovery_type = op_data->object_state->state & (OBJ_DEGRADED|OBJ_INCOMPLETE) ? 0 : 1;
-            recovery_stat_count[0][recovery_type]++;
-            if (!recovery_stat_count[0][recovery_type])
-            {
-                recovery_stat_count[0][recovery_type]++;
-                recovery_stat_bytes[0][recovery_type] = 0;
-            }
-            for (int role = 0; role < pg.pg_size; role++)
-            {
-                recovery_stat_bytes[0][recovery_type] += op_data->stripes[role].write_end - op_data->stripes[role].write_start;
-            }
+            this->unstable_writes[(osd_object_id_t){
+                .osd_num = osd_set[role],
+                .oid = {
+                    .inode = op_data->oid.inode,
+                    .stripe = op_data->oid.stripe | role,
+                },
+            }] = op_data->fact_ver;
        }
-        if (op_data->object_state->state & OBJ_MISPLACED)
-        {
-            // Remove extra chunks
-            submit_primary_del_subops(cur_op, pg.cur_set.data(), op_data->object_state->osd_set);
-            if (op_data->n_subops > 0)
-            {
-resume_8:
-                op_data->st = 8;
-                return;
-resume_9:
-                if (op_data->errors > 0)
-                {
-                    pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
-                    return;
-                }
-            }
-        }
-        // Clear object state
-        remove_object_from_state(op_data->oid, op_data->object_state, pg);
-        pg.clean_count++;
    }
+    // Remember PG as dirty to drop the connection when PG goes offline
+    // (this is required because of the "lazy sync")
+    this->clients[cur_op->peer_fd].dirty_pgs.insert(op_data->pg_num);
    // Remove version override
    pg.ver_override.erase(op_data->oid);
-    // FIXME: Check for immediate_commit == IMMEDIATE_SMALL
-resume_6:
-resume_7:
-    if (!remember_unstable_write(cur_op, pg, pg.cur_loc_set, 6))
-    {
-        return;
-    }
-    object_id oid = op_data->oid;
-    finish_op(cur_op, cur_op->req.rw.len);
+    finish_primary_op(cur_op, cur_op->req.rw.len);
    // Continue other write operations to the same object
-    auto next_it = pg.write_queue.find(oid);
-    auto this_it = next_it;
-    if (this_it != pg.write_queue.end() && this_it->second == cur_op)
    {
+        auto next_it = pg.write_queue.find(op_data->oid);
+        auto this_it = next_it;
        next_it++;
        pg.write_queue.erase(this_it);
-        if (next_it != pg.write_queue.end() && next_it->first == oid)
+        if (next_it != pg.write_queue.end() &&
+            next_it->first == op_data->oid)
        {
            osd_op_t *next_op = next_it->second;
            continue_primary_write(next_op);
@@ -303,98 +409,27 @@ resume_7:
    }
 }

-bool osd_t::remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state)
-{
-    osd_primary_op_data_t *op_data = cur_op->op_data;
-    if (op_data->st == base_state)
-    {
-        goto resume_6;
-    }
-    else if (op_data->st == base_state+1)
-    {
-        goto resume_7;
-    }
-    if (immediate_commit == IMMEDIATE_ALL)
-    {
-        op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
-        op_data->unstable_writes = new obj_ver_id[loc_set.size()];
-        {
-            int last_start = 0;
-            for (auto & chunk: loc_set)
-            {
-                op_data->unstable_writes[last_start] = (obj_ver_id){
-                    .oid = {
-                        .inode = op_data->oid.inode,
-                        .stripe = op_data->oid.stripe | chunk.role,
-                    },
-                    .version = op_data->fact_ver,
-                };
-                op_data->unstable_write_osds->push_back((unstable_osd_num_t){
-                    .osd_num = chunk.osd_num,
-                    .start = last_start,
-                    .len = 1,
-                });
-                last_start++;
-            }
-        }
-        submit_primary_stab_subops(cur_op);
-resume_6:
-        op_data->st = 6;
-        return false;
-resume_7:
-        // FIXME: Free those in the destructor?
-        delete op_data->unstable_write_osds;
-        delete[] op_data->unstable_writes;
-        op_data->unstable_writes = NULL;
-        op_data->unstable_write_osds = NULL;
-        if (op_data->errors > 0)
-        {
-            pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
-            return false;
-        }
-    }
-    else
-    {
-        // Remember version as unstable
-        for (auto & chunk: loc_set)
-        {
-            this->unstable_writes[(osd_object_id_t){
-                .osd_num = chunk.osd_num,
-                .oid = {
-                    .inode = op_data->oid.inode,
-                    .stripe = op_data->oid.stripe | chunk.role,
-                },
-            }] = op_data->fact_ver;
-        }
-        // Remember PG as dirty to drop the connection when PG goes offline
-        // (this is required because of the "lazy sync")
-        c_cli.clients[cur_op->peer_fd].dirty_pgs.insert(op_data->pg_num);
-        dirty_pgs.insert(op_data->pg_num);
-    }
-    return true;
-}
-
 // Save and clear unstable_writes -> SYNC all -> STABLE all
+// FIXME: Run regular automatic syncs based on the number of unstable writes and/or system time
 void osd_t::continue_primary_sync(osd_op_t *cur_op)
 {
    if (!cur_op->op_data)
    {
        cur_op->op_data = (osd_primary_op_data_t*)calloc(sizeof(osd_primary_op_data_t), 1);
    }
-    osd_primary_op_data_t *op_data = cur_op->op_data;
-    if (op_data->st == 1)      goto resume_1;
-    else if (op_data->st == 2) goto resume_2;
-    else if (op_data->st == 3) goto resume_3;
-    else if (op_data->st == 4) goto resume_4;
-    else if (op_data->st == 5) goto resume_5;
-    else if (op_data->st == 6) goto resume_6;
-    assert(op_data->st == 0);
+    if (cur_op->op_data->st == 1)      goto resume_1;
+    else if (cur_op->op_data->st == 2) goto resume_2;
+    else if (cur_op->op_data->st == 3) goto resume_3;
+    else if (cur_op->op_data->st == 4) goto resume_4;
+    else if (cur_op->op_data->st == 5) goto resume_5;
+    else if (cur_op->op_data->st == 6) goto resume_6;
+    assert(cur_op->op_data->st == 0);
    if (syncs_in_progress.size() > 0)
    {
        // Wait for previous syncs, if any
        // FIXME: We may try to execute the current one in parallel, like in Blockstore, but I'm not sure if it matters at all
        syncs_in_progress.push_back(cur_op);
-        op_data->st = 1;
+        cur_op->op_data->st = 1;
 resume_1:
        return;
    }
@@ -403,28 +438,27 @@ resume_1:
        syncs_in_progress.push_back(cur_op);
    }
 resume_2:
+    // FIXME: Handle operation cancel
    if (unstable_writes.size() == 0)
    {
        // Nothing to sync
        goto finish;
    }
    // Save and clear unstable_writes
-    // In theory it is possible to do in on a per-client basis, but this seems to be an unnecessary complication
-    // It would be cool not to copy these here at all, but someone has to deduplicate them by object IDs anyway
+    // FIXME: It is possible to do this on a per-client basis
+    // It would be cool not to copy them here at all, but someone has to deduplicate them by object IDs anyway
+    cur_op->op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
+    cur_op->op_data->unstable_writes = new obj_ver_id[unstable_writes.size()];
    {
-        op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
-        op_data->unstable_writes = new obj_ver_id[this->unstable_writes.size()];
-        op_data->dirty_pgs = new pg_num_t[dirty_pgs.size()];
-        op_data->dirty_pg_count = dirty_pgs.size();
        osd_num_t last_osd = 0;
        int last_start = 0, last_end = 0;
-        for (auto it = this->unstable_writes.begin(); it != this->unstable_writes.end(); it++)
+        for (auto it = unstable_writes.begin(); it != unstable_writes.end(); it++)
        {
            if (last_osd != it->first.osd_num)
            {
                if (last_osd != 0)
                {
-                    op_data->unstable_write_osds->push_back((unstable_osd_num_t){
+                    cur_op->op_data->unstable_write_osds->push_back((unstable_osd_num_t){
                        .osd_num = last_osd,
                        .start = last_start,
                        .len = last_end - last_start,
@@ -433,7 +467,7 @@ resume_2:
                last_osd = it->first.osd_num;
                last_start = last_end;
            }
-            op_data->unstable_writes[last_end] = (obj_ver_id){
+            cur_op->op_data->unstable_writes[last_end] = (obj_ver_id){
                .oid = it->first.oid,
                .version = it->second,
            };
@@ -441,226 +475,129 @@ resume_2:
        }
        if (last_osd != 0)
        {
-            op_data->unstable_write_osds->push_back((unstable_osd_num_t){
+            cur_op->op_data->unstable_write_osds->push_back((unstable_osd_num_t){
                .osd_num = last_osd,
                .start = last_start,
                .len = last_end - last_start,
            });
        }
-        int dpg = 0;
-        for (auto dirty_pg_num: dirty_pgs)
-        {
-            pgs[dirty_pg_num].inflight++;
-            op_data->dirty_pgs[dpg++] = dirty_pg_num;
-        }
-        dirty_pgs.clear();
-        this->unstable_writes.clear();
    }
-    if (immediate_commit != IMMEDIATE_ALL)
-    {
-        // SYNC
-        submit_primary_sync_subops(cur_op);
+    unstable_writes.clear();
+    // SYNC
+    submit_primary_sync_subops(cur_op);
 resume_3:
-        op_data->st = 3;
-        return;
+    cur_op->op_data->st = 3;
+    return;
 resume_4:
-        if (op_data->errors > 0)
-        {
-            goto resume_6;
-        }
-    }
    // Stabilize version sets
    submit_primary_stab_subops(cur_op);
 resume_5:
-    op_data->st = 5;
+    cur_op->op_data->st = 5;
    return;
 resume_6:
-    if (op_data->errors > 0)
-    {
-        // Return objects back into the unstable write set
-        for (auto unstable_osd: *(op_data->unstable_write_osds))
-        {
-            for (int i = 0; i < unstable_osd.len; i++)
-            {
-                // Except those from peered PGs
-                auto & w = op_data->unstable_writes[i];
-                pg_num_t wpg = map_to_pg(w.oid);
-                if (pgs[wpg].state & PG_ACTIVE)
-                {
-                    uint64_t & dest = this->unstable_writes[(osd_object_id_t){
-                        .osd_num = unstable_osd.osd_num,
-                        .oid = w.oid,
-                    }];
-                    dest = dest < w.version ? w.version : dest;
-                    dirty_pgs.insert(wpg);
-                }
-            }
-        }
-    }
-    for (int i = 0; i < op_data->dirty_pg_count; i++)
-    {
-        auto & pg = pgs.at(op_data->dirty_pgs[i]);
-        pg.inflight--;
-        if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
-        {
-            finish_stop_pg(pg);
-        }
-    }
-    // FIXME: Free those in the destructor?
-    delete op_data->dirty_pgs;
-    delete op_data->unstable_write_osds;
-    delete[] op_data->unstable_writes;
-    op_data->unstable_writes = NULL;
-    op_data->unstable_write_osds = NULL;
-    if (op_data->errors > 0)
-    {
-        finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
-    }
-    else
-    {
+    // FIXME: Free them correctly (via a destructor or so)
+    delete cur_op->op_data->unstable_write_osds;
+    delete[] cur_op->op_data->unstable_writes;
+    cur_op->op_data->unstable_writes = NULL;
+    cur_op->op_data->unstable_write_osds = NULL;
 finish:
-        if (cur_op->peer_fd)
-        {
-            auto it = c_cli.clients.find(cur_op->peer_fd);
-            if (it != c_cli.clients.end())
-                it->second.dirty_pgs.clear();
-        }
-        finish_op(cur_op, 0);
-    }
    assert(syncs_in_progress.front() == cur_op);
    syncs_in_progress.pop_front();
+    finish_primary_op(cur_op, 0);
    if (syncs_in_progress.size() > 0)
    {
        cur_op = syncs_in_progress.front();
-        op_data = cur_op->op_data;
-        op_data->st++;
+        cur_op->op_data->st++;
        goto resume_2;
    }
 }

-// Decrement pg_osd_set_state_t's object_count and change PG state accordingly
-void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t & pg)
+void osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
 {
-    if (object_state->state & OBJ_INCOMPLETE)
+    osd_primary_op_data_t *op_data = cur_op->op_data;
+    int n_osds = op_data->unstable_write_osds->size();
+    osd_op_t *subops = new osd_op_t[n_osds];
+    op_data->done = op_data->errors = 0;
+    op_data->n_subops = n_osds;
+    op_data->subops = subops;
+    for (int i = 0; i < n_osds; i++)
    {
-        // Successful write means that object is not incomplete anymore
-        this->incomplete_objects--;
-        pg.incomplete_objects.erase(oid);
-        if (!pg.incomplete_objects.size())
+        osd_num_t sync_osd = (*(op_data->unstable_write_osds))[i].osd_num;
+        if (sync_osd == this->osd_num)
        {
-            pg.state = pg.state & ~PG_HAS_INCOMPLETE;
-            report_pg_state(pg);
+            subops[i].bs_op = new blockstore_op_t({
+                .opcode = BS_OP_SYNC,
+                .callback = [cur_op, this](blockstore_op_t *subop)
+                {
+                    handle_primary_subop(cur_op, subop->retval == 0, 0);
+                },
+            });
+            bs->enqueue_op(subops[i].bs_op);
        }
-    }
-    else if (object_state->state & OBJ_DEGRADED)
-    {
-        this->degraded_objects--;
-        pg.degraded_objects.erase(oid);
-        if (!pg.degraded_objects.size())
+        else
        {
-            pg.state = pg.state & ~PG_HAS_DEGRADED;
-            report_pg_state(pg);
+            subops[i].op_type = OSD_OP_OUT;
+            subops[i].send_list.push_back(subops[i].req.buf, OSD_PACKET_SIZE);
+            subops[i].peer_fd = osd_peer_fds.at(sync_osd);
+            subops[i].req.sec_sync = {
+                .header = {
+                    .magic = SECONDARY_OSD_OP_MAGIC,
+                    .id = this->next_subop_id++,
+                    .opcode = OSD_OP_SECONDARY_SYNC,
+                },
+            };
+            subops[i].callback = [cur_op, this](osd_op_t *subop)
+            {
+                handle_primary_subop(cur_op, subop->reply.hdr.retval == 0, 0);
+            };
+            outbox_push(clients[subops[i].peer_fd], &subops[i]);
        }
    }
-    else if (object_state->state & OBJ_MISPLACED)
-    {
-        this->misplaced_objects--;
-        pg.misplaced_objects.erase(oid);
-        if (!pg.misplaced_objects.size())
-        {
-            pg.state = pg.state & ~PG_HAS_MISPLACED;
-            report_pg_state(pg);
-        }
-    }
-    else
-    {
-        throw std::runtime_error("BUG: Invalid object state: "+std::to_string(object_state->state));
-    }
-    object_state->object_count--;
-    if (!object_state->object_count)
-    {
-        pg.state_dict.erase(object_state->osd_set);
-    }
 }

-void osd_t::continue_primary_del(osd_op_t *cur_op)
+void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
 {
-    if (!cur_op->op_data && !prepare_primary_rw(cur_op))
-    {
-        return;
-    }
    osd_primary_op_data_t *op_data = cur_op->op_data;
-    auto & pg = pgs[op_data->pg_num];
-    if (op_data->st == 1)      goto resume_1;
-    else if (op_data->st == 2) goto resume_2;
-    else if (op_data->st == 3) goto resume_3;
-    else if (op_data->st == 4) goto resume_4;
-    else if (op_data->st == 5) goto resume_5;
-    assert(op_data->st == 0);
-    // Delete is forbidden even in active PGs if they're also degraded or have previous dead OSDs
-    if (pg.state & (PG_DEGRADED | PG_LEFT_ON_DEAD))
+    int n_osds = op_data->unstable_write_osds->size();
+    osd_op_t *subops = new osd_op_t[n_osds];
+    op_data->done = op_data->errors = 0;
+    op_data->n_subops = n_osds;
+    op_data->subops = subops;
+    for (int i = 0; i < n_osds; i++)
    {
-        finish_op(cur_op, -EBUSY);
-        return;
-    }
-    if (!check_write_queue(cur_op, pg))
-    {
-        return;
-    }
-resume_1:
-    // Determine which OSDs contain this object and delete it
-    op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
-    // Submit 1 read to determine the actual version number
-    submit_primary_subops(SUBMIT_RMW_READ, pg.pg_size, op_data->prev_set, cur_op);
-resume_2:
-    op_data->st = 2;
-    return;
-resume_3:
-    if (op_data->errors > 0)
-    {
-        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
-        return;
-    }
-    // Save version override for parallel reads
-    pg.ver_override[op_data->oid] = op_data->fact_ver;
-    // Submit deletes
-    op_data->fact_ver++;
-    submit_primary_del_subops(cur_op, NULL, op_data->object_state ? op_data->object_state->osd_set : pg.cur_loc_set);
-resume_4:
-    op_data->st = 4;
-    return;
-resume_5:
-    if (op_data->errors > 0)
-    {
-        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
-        return;
-    }
-    // Remove version override
-    pg.ver_override.erase(op_data->oid);
-    // Adjust PG stats after "instant stabilize", because we need object_state above
-    if (!op_data->object_state)
-    {
-        pg.clean_count--;
-    }
-    else
-    {
-        remove_object_from_state(op_data->oid, op_data->object_state, pg);
-    }
-    pg.total_count--;
-    object_id oid = op_data->oid;
-    finish_op(cur_op, cur_op->req.rw.len);
-    // Continue other write operations to the same object
-    auto next_it = pg.write_queue.find(oid);
-    auto this_it = next_it;
-    if (this_it != pg.write_queue.end() && this_it->second == cur_op)
-    {
-        next_it++;
-        pg.write_queue.erase(this_it);
-        if (next_it != pg.write_queue.end() &&
-            next_it->first == oid)
+        auto & stab_osd = (*(op_data->unstable_write_osds))[i];
+        if (stab_osd.osd_num == this->osd_num)
        {
-            osd_op_t *next_op = next_it->second;
-            continue_primary_write(next_op);
+            subops[i].bs_op = new blockstore_op_t({
+                .opcode = BS_OP_STABLE,
+                .callback = [cur_op, this](blockstore_op_t *subop)
+                {
+                    handle_primary_subop(cur_op, subop->retval == 0, 0);
+                },
+                .len = (uint32_t)stab_osd.len,
+                .buf = (void*)(op_data->unstable_writes + stab_osd.start),
+            });
+            bs->enqueue_op(subops[i].bs_op);
+        }
+        else
+        {
+            subops[i].op_type = OSD_OP_OUT;
+            subops[i].send_list.push_back(subops[i].req.buf, OSD_PACKET_SIZE);
+            subops[i].peer_fd = osd_peer_fds.at(stab_osd.osd_num);
+            subops[i].req.sec_stab = {
+                .header = {
+                    .magic = SECONDARY_OSD_OP_MAGIC,
+                    .id = this->next_subop_id++,
+                    .opcode = OSD_OP_SECONDARY_STABILIZE,
+                },
+                .len = (uint64_t)(stab_osd.len * sizeof(obj_ver_id)),
+            };
+            subops[i].send_list.push_back(op_data->unstable_writes + stab_osd.start, stab_osd.len * sizeof(obj_ver_id));
+            subops[i].callback = [cur_op, this](osd_op_t *subop)
+            {
+                handle_primary_subop(cur_op, subop->reply.hdr.retval == 0, 0);
+            };
+            outbox_push(clients[subops[i].peer_fd], &subops[i]);
        }
    }
 }
--- a/osd_primary.h
+++ b/osd_primary.h
@@ -1,35 +0,0 @@
-#pragma once
-
-#include "osd.h"
-#include "osd_rmw.h"
-
-#define SUBMIT_READ 0
-#define SUBMIT_RMW_READ 1
-#define SUBMIT_WRITE 2
-
-struct unstable_osd_num_t
-{
-    osd_num_t osd_num;
-    int start, len;
-};
-
-struct osd_primary_op_data_t
-{
-    int st = 0;
-    pg_num_t pg_num;
-    object_id oid;
-    uint64_t target_ver;
-    uint64_t fact_ver = 0;
-    int n_subops = 0, done = 0, errors = 0, epipe = 0;
-    int degraded = 0, pg_size, pg_minsize;
-    osd_rmw_stripe_t *stripes;
-    osd_op_t *subops = NULL;
-    uint64_t *prev_set = NULL;
-    pg_osd_set_state_t *object_state = NULL;
-
-    // for sync. oops, requires freeing
-    std::vector<unstable_osd_num_t> *unstable_write_osds = NULL;
-    pg_num_t *dirty_pgs = NULL;
-    int dirty_pg_count = 0;
-    obj_ver_id *unstable_writes = NULL;
-};
--- a/osd_primary_subops.cpp
+++ b/osd_primary_subops.cpp
@@ -1,551 +0,0 @@
-#include "osd_primary.h"
-
-void osd_t::autosync()
-{
-    // FIXME Autosync based on the number of unstable writes to prevent
-    // "journal_sector_buffer_count is too low for this batch" errors
-    if (immediate_commit != IMMEDIATE_ALL && !autosync_op)
-    {
-        autosync_op = new osd_op_t();
-        autosync_op->op_type = OSD_OP_IN;
-        autosync_op->req = {
-            .sync = {
-                .header = {
-                    .magic = SECONDARY_OSD_OP_MAGIC,
-                    .id = 1,
-                    .opcode = OSD_OP_SYNC,
-                },
-            },
-        };
-        autosync_op->callback = [this](osd_op_t *op)
-        {
-            if (op->reply.hdr.retval < 0)
-            {
-                printf("Warning: automatic sync resulted in an error: %ld (%s)\n", -op->reply.hdr.retval, strerror(-op->reply.hdr.retval));
-            }
-            delete autosync_op;
-            autosync_op = NULL;
-        };
-        exec_op(autosync_op);
-    }
-}
-
-void osd_t::finish_op(osd_op_t *cur_op, int retval)
-{
-    inflight_ops--;
-    if (cur_op->op_data)
-    {
-        if (cur_op->op_data->pg_num > 0)
-        {
-            auto & pg = pgs[cur_op->op_data->pg_num];
-            pg.inflight--;
-            assert(pg.inflight >= 0);
-            if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
-            {
-                finish_stop_pg(pg);
-            }
-        }
-        assert(!cur_op->op_data->subops);
-        assert(!cur_op->op_data->unstable_write_osds);
-        assert(!cur_op->op_data->unstable_writes);
-        assert(!cur_op->op_data->dirty_pgs);
-        free(cur_op->op_data);
-        cur_op->op_data = NULL;
-    }
-    if (!cur_op->peer_fd)
-    {
-        // Copy lambda to be unaffected by `delete op`
-        std::function<void(osd_op_t*)>(cur_op->callback)(cur_op);
-    }
-    else
-    {
-        // FIXME add separate magic number
-        auto cl_it = c_cli.clients.find(cur_op->peer_fd);
-        if (cl_it != c_cli.clients.end())
-        {
-            cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
-            cur_op->reply.hdr.id = cur_op->req.hdr.id;
-            cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
-            cur_op->reply.hdr.retval = retval;
-            c_cli.outbox_push(cur_op);
-        }
-        else
-        {
-            delete cur_op;
-        }
-    }
-}
-
-void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t* osd_set, osd_op_t *cur_op)
-{
-    bool w = submit_type == SUBMIT_WRITE;
-    osd_primary_op_data_t *op_data = cur_op->op_data;
-    osd_rmw_stripe_t *stripes = op_data->stripes;
-    // Allocate subops
-    int n_subops = 0, zero_read = -1;
-    for (int role = 0; role < pg_size; role++)
-    {
-        if (osd_set[role] == this->osd_num || osd_set[role] != 0 && zero_read == -1)
-        {
-            zero_read = role;
-        }
-        if (osd_set[role] != 0 && (w || stripes[role].read_end != 0))
-        {
-            n_subops++;
-        }
-    }
-    if (!n_subops && submit_type == SUBMIT_RMW_READ)
-    {
-        n_subops = 1;
-    }
-    else
-    {
-        zero_read = -1;
-    }
-    uint64_t op_version = w ? op_data->fact_ver+1 : (submit_type == SUBMIT_RMW_READ ? UINT64_MAX : op_data->target_ver);
-    osd_op_t *subops = new osd_op_t[n_subops];
-    op_data->fact_ver = 0;
-    op_data->done = op_data->errors = 0;
-    op_data->n_subops = n_subops;
-    op_data->subops = subops;
-    int i = 0;
-    for (int role = 0; role < pg_size; role++)
-    {
-        // We always submit zero-length writes to all replicas, even if the stripe is not modified
-        if (!(w || stripes[role].read_end != 0 || zero_read == role))
-        {
-            continue;
-        }
-        osd_num_t role_osd_num = osd_set[role];
-        if (role_osd_num != 0)
-        {
-            if (role_osd_num == this->osd_num)
-            {
-                clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
-                subops[i].op_type = (uint64_t)cur_op;
-                subops[i].bs_op = new blockstore_op_t({
-                    .opcode = (uint64_t)(w ? BS_OP_WRITE : BS_OP_READ),
-                    .callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
-                    {
-                        handle_primary_bs_subop(subop);
-                    },
-                    .oid = {
-                        .inode = op_data->oid.inode,
-                        .stripe = op_data->oid.stripe | role,
-                    },
-                    .version = op_version,
-                    .offset = w ? stripes[role].write_start : stripes[role].read_start,
-                    .len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
-                    .buf = w ? stripes[role].write_buf : stripes[role].read_buf,
-                });
-#ifdef OSD_DEBUG
-                printf(
-                    "Submit %s to local: %lu:%lu v%lu %u-%u\n", w ? "write" : "read",
-                    op_data->oid.inode, op_data->oid.stripe | role, op_version,
-                    subops[i].bs_op->offset, subops[i].bs_op->len
-                );
-#endif
-                bs->enqueue_op(subops[i].bs_op);
-            }
-            else
-            {
-                subops[i].op_type = OSD_OP_OUT;
-                subops[i].send_list.push_back(subops[i].req.buf, OSD_PACKET_SIZE);
-                subops[i].peer_fd = c_cli.osd_peer_fds.at(role_osd_num);
-                subops[i].req.sec_rw = {
-                    .header = {
-                        .magic = SECONDARY_OSD_OP_MAGIC,
-                        .id = c_cli.next_subop_id++,
-                        .opcode = (uint64_t)(w ? OSD_OP_SECONDARY_WRITE : OSD_OP_SECONDARY_READ),
-                    },
-                    .oid = {
-                        .inode = op_data->oid.inode,
-                        .stripe = op_data->oid.stripe | role,
-                    },
-                    .version = op_version,
-                    .offset = w ? stripes[role].write_start : stripes[role].read_start,
-                    .len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
-                };
-#ifdef OSD_DEBUG
-                printf(
-                    "Submit %s to osd %lu: %lu:%lu v%lu %u-%u\n", w ? "write" : "read", role_osd_num,
-                    op_data->oid.inode, op_data->oid.stripe | role, op_version,
-                    subops[i].req.sec_rw.offset, subops[i].req.sec_rw.len
-                );
-#endif
-                subops[i].buf = w ? stripes[role].write_buf : stripes[role].read_buf;
-                if (w && stripes[role].write_end > 0)
-                {
-                    subops[i].send_list.push_back(stripes[role].write_buf, stripes[role].write_end - stripes[role].write_start);
-                }
-                subops[i].callback = [cur_op, this](osd_op_t *subop)
-                {
-                    int fail_fd = subop->req.hdr.opcode == OSD_OP_SECONDARY_WRITE &&
-                        subop->reply.hdr.retval != subop->req.sec_rw.len ? subop->peer_fd : -1;
-                    // so it doesn't get freed
-                    subop->buf = NULL;
-                    handle_primary_subop(subop, cur_op);
-                    if (fail_fd >= 0)
-                    {
-                        // write operation failed, drop the connection
-                        c_cli.stop_client(fail_fd);
-                    }
-                };
-                c_cli.outbox_push(&subops[i]);
-            }
-            i++;
-        }
-    }
-}
-
-static uint64_t bs_op_to_osd_op[] = {
-    0,
-    OSD_OP_SECONDARY_READ,      // BS_OP_READ
-    OSD_OP_SECONDARY_WRITE,     // BS_OP_WRITE
-    OSD_OP_SECONDARY_SYNC,      // BS_OP_SYNC
-    OSD_OP_SECONDARY_STABILIZE, // BS_OP_STABLE
-    OSD_OP_SECONDARY_DELETE,    // BS_OP_DELETE
-    OSD_OP_SECONDARY_LIST,      // BS_OP_LIST
-    OSD_OP_SECONDARY_ROLLBACK,  // BS_OP_ROLLBACK
-    OSD_OP_TEST_SYNC_STAB_ALL,  // BS_OP_SYNC_STAB_ALL
-};
-
-void osd_t::handle_primary_bs_subop(osd_op_t *subop)
-{
-    osd_op_t *cur_op = (osd_op_t*)subop->op_type;
-    blockstore_op_t *bs_op = subop->bs_op;
-    int expected = bs_op->opcode == BS_OP_READ || bs_op->opcode == BS_OP_WRITE ? bs_op->len : 0;
-    if (bs_op->retval != expected && bs_op->opcode != BS_OP_READ)
-    {
-        // die
-        throw std::runtime_error(
-            "local blockstore modification failed (opcode = "+std::to_string(bs_op->opcode)+
-            " retval = "+std::to_string(bs_op->retval)+")"
-        );
-    }
-    add_bs_subop_stats(subop);
-    subop->req.hdr.opcode = bs_op_to_osd_op[bs_op->opcode];
-    subop->reply.hdr.retval = bs_op->retval;
-    if (bs_op->opcode == BS_OP_READ || bs_op->opcode == BS_OP_WRITE)
-    {
-        subop->req.sec_rw.len = bs_op->len;
-        subop->reply.sec_rw.version = bs_op->version;
-    }
-    delete bs_op;
-    subop->bs_op = NULL;
-    handle_primary_subop(subop, cur_op);
-}
-
-void osd_t::add_bs_subop_stats(osd_op_t *subop)
-{
-    // Include local blockstore ops in statistics
-    uint64_t opcode = bs_op_to_osd_op[subop->bs_op->opcode];
-    timespec tv_end;
-    clock_gettime(CLOCK_REALTIME, &tv_end);
-    c_cli.stats.op_stat_count[opcode]++;
-    if (!c_cli.stats.op_stat_count[opcode])
-    {
-        c_cli.stats.op_stat_count[opcode] = 1;
-        c_cli.stats.op_stat_sum[opcode] = 0;
-        c_cli.stats.op_stat_bytes[opcode] = 0;
-    }
-    c_cli.stats.op_stat_sum[opcode] += (
-        (tv_end.tv_sec - subop->tv_begin.tv_sec)*1000000 +
-        (tv_end.tv_nsec - subop->tv_begin.tv_nsec)/1000
-    );
-    if (opcode == OSD_OP_SECONDARY_READ || opcode == OSD_OP_SECONDARY_WRITE)
-    {
-        c_cli.stats.op_stat_bytes[opcode] += subop->bs_op->len;
-    }
-}
-
-void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
-{
-    uint64_t opcode = subop->req.hdr.opcode;
-    int retval = subop->reply.hdr.retval;
-    int expected = opcode == OSD_OP_SECONDARY_READ || opcode == OSD_OP_SECONDARY_WRITE
-        ? subop->req.sec_rw.len : 0;
-    osd_primary_op_data_t *op_data = cur_op->op_data;
-    if (retval != expected)
-    {
-        printf("%s subop failed: retval = %d (expected %d)\n", osd_op_names[opcode], retval, expected);
-        if (retval == -EPIPE)
-        {
-            op_data->epipe++;
-        }
-        op_data->errors++;
-    }
-    else
-    {
-        op_data->done++;
-        if (opcode == OSD_OP_SECONDARY_READ || opcode == OSD_OP_SECONDARY_WRITE)
-        {
-            uint64_t version = subop->reply.sec_rw.version;
-#ifdef OSD_DEBUG
-            uint64_t peer_osd = c_cli.clients.find(subop->peer_fd) != c_cli.clients.end()
-                ? c_cli.clients[subop->peer_fd].osd_num : osd_num;
-            printf("subop %lu from osd %lu: version = %lu\n", opcode, peer_osd, version);
-#endif
-            if (op_data->fact_ver != 0 && op_data->fact_ver != version)
-            {
-                throw std::runtime_error(
-                    "different fact_versions returned from "+std::string(osd_op_names[opcode])+
-                    " subops: "+std::to_string(version)+" vs "+std::to_string(op_data->fact_ver)
-                );
-            }
-            op_data->fact_ver = version;
-        }
-    }
-    if ((op_data->errors + op_data->done) >= op_data->n_subops)
-    {
-        delete[] op_data->subops;
-        op_data->subops = NULL;
-        op_data->st++;
-        if (cur_op->req.hdr.opcode == OSD_OP_READ)
-        {
-            continue_primary_read(cur_op);
-        }
-        else if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
-        {
-            continue_primary_write(cur_op);
-        }
-        else if (cur_op->req.hdr.opcode == OSD_OP_SYNC)
-        {
-            continue_primary_sync(cur_op);
-        }
-        else if (cur_op->req.hdr.opcode == OSD_OP_DELETE)
-        {
-            continue_primary_del(cur_op);
-        }
-        else
-        {
-            throw std::runtime_error("BUG: unknown opcode");
-        }
-    }
-}
-
-void osd_t::cancel_primary_write(osd_op_t *cur_op)
-{
-    if (cur_op->op_data && cur_op->op_data->subops)
-    {
-        // Primary-write operation is waiting for subops, subops
-        // are sent to peer OSDs, so we can't just throw them away.
-        // Mark them with an extra EPIPE.
-        cur_op->op_data->errors++;
-        cur_op->op_data->epipe++;
-        cur_op->op_data->done--; // Caution: `done` must be signed because may become -1 here
-    }
-    else
-    {
-        finish_op(cur_op, -EPIPE);
-    }
-}
-
-void osd_t::submit_primary_del_subops(osd_op_t *cur_op, uint64_t *cur_set, pg_osd_set_t & loc_set)
-{
-    osd_primary_op_data_t *op_data = cur_op->op_data;
-    int extra_chunks = 0;
-    for (auto & chunk: loc_set)
-    {
-        if (!cur_set || chunk.osd_num != cur_set[chunk.role])
-        {
-            extra_chunks++;
-        }
-    }
-    op_data->n_subops = extra_chunks;
-    op_data->done = op_data->errors = 0;
-    if (!extra_chunks)
-    {
-        return;
-    }
-    osd_op_t *subops = new osd_op_t[extra_chunks];
-    op_data->subops = subops;
-    int i = 0;
-    for (auto & chunk: loc_set)
-    {
-        if (!cur_set || chunk.osd_num != cur_set[chunk.role])
-        {
-            if (chunk.osd_num == this->osd_num)
-            {
-                clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
-                subops[i].op_type = (uint64_t)cur_op;
-                subops[i].bs_op = new blockstore_op_t({
-                    .opcode = BS_OP_DELETE,
-                    .callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
-                    {
-                        handle_primary_bs_subop(subop);
-                    },
-                    .oid = {
-                        .inode = op_data->oid.inode,
-                        .stripe = op_data->oid.stripe | chunk.role,
-                    },
-                    // Same version as write
-                    .version = op_data->fact_ver,
-                });
-                bs->enqueue_op(subops[i].bs_op);
-            }
-            else
-            {
-                subops[i].op_type = OSD_OP_OUT;
-                subops[i].send_list.push_back(subops[i].req.buf, OSD_PACKET_SIZE);
-                subops[i].peer_fd = c_cli.osd_peer_fds.at(chunk.osd_num);
-                subops[i].req.sec_del = {
-                    .header = {
-                        .magic = SECONDARY_OSD_OP_MAGIC,
-                        .id = c_cli.next_subop_id++,
-                        .opcode = OSD_OP_SECONDARY_DELETE,
-                    },
-                    .oid = {
-                        .inode = op_data->oid.inode,
-                        .stripe = op_data->oid.stripe | chunk.role,
-                    },
-                    // Same version as write
-                    .version = op_data->fact_ver,
-                };
-                subops[i].callback = [cur_op, this](osd_op_t *subop)
-                {
-                    int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
-                    handle_primary_subop(subop, cur_op);
-                    if (fail_fd >= 0)
-                    {
-                        // delete operation failed, drop the connection
-                        c_cli.stop_client(fail_fd);
-                    }
-                };
-                c_cli.outbox_push(&subops[i]);
-            }
-            i++;
-        }
-    }
-}
-
-void osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
-{
-    osd_primary_op_data_t *op_data = cur_op->op_data;
-    int n_osds = op_data->unstable_write_osds->size();
-    osd_op_t *subops = new osd_op_t[n_osds];
-    op_data->done = op_data->errors = 0;
-    op_data->n_subops = n_osds;
-    op_data->subops = subops;
-    for (int i = 0; i < n_osds; i++)
-    {
-        osd_num_t sync_osd = (*(op_data->unstable_write_osds))[i].osd_num;
-        if (sync_osd == this->osd_num)
-        {
-            clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
-            subops[i].op_type = (uint64_t)cur_op;
-            subops[i].bs_op = new blockstore_op_t({
-                .opcode = BS_OP_SYNC,
-                .callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
-                {
-                    handle_primary_bs_subop(subop);
-                },
-            });
-            bs->enqueue_op(subops[i].bs_op);
-        }
-        else
-        {
-            subops[i].op_type = OSD_OP_OUT;
-            subops[i].send_list.push_back(subops[i].req.buf, OSD_PACKET_SIZE);
-            subops[i].peer_fd = c_cli.osd_peer_fds.at(sync_osd);
-            subops[i].req.sec_sync = {
-                .header = {
-                    .magic = SECONDARY_OSD_OP_MAGIC,
-                    .id = c_cli.next_subop_id++,
-                    .opcode = OSD_OP_SECONDARY_SYNC,
-                },
-            };
-            subops[i].callback = [cur_op, this](osd_op_t *subop)
-            {
-                int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
-                handle_primary_subop(subop, cur_op);
-                if (fail_fd >= 0)
-                {
-                    // sync operation failed, drop the connection
-                    c_cli.stop_client(fail_fd);
-                }
-            };
-            c_cli.outbox_push(&subops[i]);
-        }
-    }
-}
-
-void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
-{
-    osd_primary_op_data_t *op_data = cur_op->op_data;
-    int n_osds = op_data->unstable_write_osds->size();
-    osd_op_t *subops = new osd_op_t[n_osds];
-    op_data->done = op_data->errors = 0;
-    op_data->n_subops = n_osds;
-    op_data->subops = subops;
-    for (int i = 0; i < n_osds; i++)
-    {
-        auto & stab_osd = (*(op_data->unstable_write_osds))[i];
-        if (stab_osd.osd_num == this->osd_num)
-        {
-            clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
-            subops[i].op_type = (uint64_t)cur_op;
-            subops[i].bs_op = new blockstore_op_t({
-                .opcode = BS_OP_STABLE,
-                .callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
-                {
-                    handle_primary_bs_subop(subop);
-                },
-                .len = (uint32_t)stab_osd.len,
-                .buf = (void*)(op_data->unstable_writes + stab_osd.start),
-            });
-            bs->enqueue_op(subops[i].bs_op);
-        }
-        else
-        {
-            subops[i].op_type = OSD_OP_OUT;
-            subops[i].send_list.push_back(subops[i].req.buf, OSD_PACKET_SIZE);
-            subops[i].peer_fd = c_cli.osd_peer_fds.at(stab_osd.osd_num);
-            subops[i].req.sec_stab = {
-                .header = {
-                    .magic = SECONDARY_OSD_OP_MAGIC,
-                    .id = c_cli.next_subop_id++,
-                    .opcode = OSD_OP_SECONDARY_STABILIZE,
-                },
-                .len = (uint64_t)(stab_osd.len * sizeof(obj_ver_id)),
-            };
-            subops[i].send_list.push_back(op_data->unstable_writes + stab_osd.start, stab_osd.len * sizeof(obj_ver_id));
-            subops[i].callback = [cur_op, this](osd_op_t *subop)
-            {
-                int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
-                handle_primary_subop(subop, cur_op);
-                if (fail_fd >= 0)
-                {
-                    // sync operation failed, drop the connection
-                    c_cli.stop_client(fail_fd);
-                }
-            };
-            c_cli.outbox_push(&subops[i]);
-        }
-    }
-}
-
-void osd_t::pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid, int retval)
-{
-    auto st_it = pg.write_queue.find(oid), it = st_it;
-    finish_op(first_op, retval);
-    if (it != pg.write_queue.end() && it->second == first_op)
-    {
-        it++;
-    }
-    else
-    {
-        // Write queue doesn't match the first operation.
-        // first_op is a leftover operation from the previous peering of the same PG.
-        return;
-    }
-    while (it != pg.write_queue.end() && it->first == oid)
-    {
-        finish_op(it->second, retval);
-        it++;
-    }
-    if (st_it != it)
-    {
-        pg.write_queue.erase(st_it, it);
-    }
-}
--- a/osd_receive.cpp
+++ b/osd_receive.cpp
@@ -0,0 +1,204 @@
+#include "osd.h"
+
+void osd_t::read_requests() // submits one recvmsg to the ring for every fd queued in read_ready_clients
+{
+    for (int i = 0; i < read_ready_clients.size(); i++)
+    {
+        int peer_fd = read_ready_clients[i];
+        auto & cl = clients[peer_fd]; // NOTE(review): operator[] would create an entry if peer_fd is no longer in clients — confirm stop_client() also purges read_ready_clients
+        io_uring_sqe* sqe = ringloop->get_sqe();
+        if (!sqe)
+        {
+            read_ready_clients.erase(read_ready_clients.begin(), read_ready_clients.begin() + i); // no free SQE: drop the entries already submitted, keep the rest queued
+            return;
+        }
+        ring_data_t* data = ((ring_data_t*)sqe->user_data); // presumably get_sqe() pre-fills user_data with a ring_data_t slot — confirm
+        if (!cl.read_buf)
+        {
+            // No read is in progress for this client, so the next bytes on the socket start a new packet:
+            // either a new command or a reply to a previously sent command. Read its fixed-size header first
+            if (!cl.read_op)
+            {
+                cl.read_op = new osd_op_t;
+                cl.read_op->peer_fd = peer_fd;
+            }
+            cl.read_op->op_type = OSD_OP_IN;
+            cl.read_buf = &cl.read_op->req.buf;
+            cl.read_remaining = OSD_PACKET_SIZE;
+            cl.read_state = CL_READ_OP;
+        }
+        cl.read_iov.iov_base = cl.read_buf; // when a partial read is pending this resumes at the current position
+        cl.read_iov.iov_len = cl.read_remaining;
+        cl.read_msg.msg_iov = &cl.read_iov;
+        cl.read_msg.msg_iovlen = 1;
+        data->callback = [this, peer_fd](ring_data_t *data) { handle_read(data, peer_fd); }; // completion is routed to handle_read() with the fd captured by value
+        my_uring_prep_recvmsg(sqe, peer_fd, &cl.read_msg, 0);
+    }
+    read_ready_clients.clear(); // every queued fd got a request; handle_read() re-queues the ones that should be read again
+}
+
+void osd_t::handle_read(ring_data_t *data, int peer_fd) // completion handler for the recvmsg queued by read_requests(); data->res is the result
+{
+    auto cl_it = clients.find(peer_fd);
+    if (cl_it != clients.end()) // completions for fds that are no longer in clients are ignored
+    {
+        auto & cl = cl_it->second;
+        if (data->res == -EAGAIN)
+        {
+            cl.read_ready--; // nothing to read right now: consume one readiness notification
+            if (cl.read_ready > 0)
+                read_ready_clients.push_back(peer_fd);
+            return;
+        }
+        else if (data->res < 0)
+        {
+            // this is a client socket, so don't panic. just disconnect it
+            printf("Client %d socket read error: %d (%s). Disconnecting client\n", peer_fd, -data->res, strerror(-data->res));
+            stop_client(peer_fd);
+            return;
+        }
+        read_ready_clients.push_back(peer_fd); // keep reading this socket until it reports EAGAIN or an error
+        if (data->res > 0)
+        {
+            cl.read_remaining -= data->res;
+            cl.read_buf += data->res;
+            if (cl.read_remaining <= 0)
+            {
+                cl.read_buf = NULL; // the current header or payload is complete; read_requests() starts a new header when read_buf is NULL
+                if (cl.read_state == CL_READ_OP)
+                {
+                    if (cl.read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC)
+                    {
+                        handle_reply_hdr(&cl);
+                    }
+                    else
+                    {
+                        handle_op_hdr(&cl);
+                    }
+                }
+                else if (cl.read_state == CL_READ_DATA)
+                {
+                    osd_op_t *ready_op = cl.read_op; // the operation and its payload are fully received
+                    cl.read_op = NULL; // detach from the client before exec_op(), same order as handle_op_hdr(): cl is not touched after the call
+                    cl.read_state = 0;
+                    exec_op(ready_op);
+                }
+                else if (cl.read_state == CL_READ_REPLY_DATA)
+                {
+                    // Reply payload is complete: match it with the request recorded by handle_reply_hdr()
+                    auto req_it = cl.sent_ops.find(cl.read_reply_id); // NOTE(review): not checked against end() — relies on the request staying in sent_ops while its payload is read
+                    osd_op_t *request = req_it->second;
+                    cl.sent_ops.erase(req_it);
+                    cl.read_reply_id = 0;
+                    cl.read_state = 0;
+                    // Measure subop latency
+                    timespec tv_end;
+                    clock_gettime(CLOCK_REALTIME, &tv_end);
+                    subop_stat_count[request->req.hdr.opcode]++;
+                    subop_stat_sum[request->req.hdr.opcode] += (
+                        (tv_end.tv_sec - request->tv_begin.tv_sec)*1000000 +
+                        (tv_end.tv_nsec - request->tv_begin.tv_nsec)/1000
+                    );
+                    request->callback(request);
+                }
+            }
+        }
+    }
+}
+
+void osd_t::handle_op_hdr(osd_client_t *cl) // called by handle_read() once a full request header is in cl->read_op
+{
+    osd_op_t *cur_op = cl->read_op;
+    if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_READ)
+    {
+        if (cur_op->req.sec_rw.len > 0)
+            cur_op->buf = memalign(512, cur_op->req.sec_rw.len); // NOTE(review): len comes from the peer and the allocation result is not checked
+        cl->read_remaining = 0; // a read request carries no payload after the header
+    }
+    else if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
+    {
+        if (cur_op->req.sec_rw.len > 0)
+            cur_op->buf = memalign(512, cur_op->req.sec_rw.len);
+        cl->read_remaining = cur_op->req.sec_rw.len; // the data to write follows the header
+    }
+    else if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_STABILIZE ||
+        cur_op->req.hdr.opcode == OSD_OP_SECONDARY_ROLLBACK)
+    {
+        if (cur_op->req.sec_stab.len > 0)
+            cur_op->buf = memalign(512, cur_op->req.sec_stab.len);
+        cl->read_remaining = cur_op->req.sec_stab.len; // sec_stab.len bytes of payload follow the header
+    }
+    else if (cur_op->req.hdr.opcode == OSD_OP_READ)
+    {
+        if (cur_op->req.rw.len > 0)
+            cur_op->buf = memalign(512, cur_op->req.rw.len);
+        cl->read_remaining = 0; // as for secondary reads: a buffer is allocated, nothing more is received
+    }
+    else if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
+    {
+        if (cur_op->req.rw.len > 0)
+            cur_op->buf = memalign(512, cur_op->req.rw.len);
+        cl->read_remaining = cur_op->req.rw.len;
+    }
+    if (cl->read_remaining > 0) // other opcodes leave read_remaining as handle_read() left it (<= 0), so they execute immediately
+    {
+        // Payload follows: keep the op attached to the client and receive into its buffer
+        cl->read_buf = cur_op->buf;
+        cl->read_state = CL_READ_DATA;
+    }
+    else
+    {
+        // Operation is ready: detach it from the client first, then execute it
+        cl->read_op = NULL;
+        cl->read_state = 0;
+        exec_op(cur_op);
+    }
+}
+
+void osd_t::handle_reply_hdr(osd_client_t *cl) // called by handle_read() when the received header carries SECONDARY_OSD_REPLY_MAGIC
+{
+    osd_op_t *cur_op = cl->read_op; // only used as the receive buffer here; it stays attached to the client
+    auto req_it = cl->sent_ops.find(cur_op->req.hdr.id);
+    if (req_it == cl->sent_ops.end())
+    {
+        // No sent request has this id: the reply stream is out of sync. Drop connection
+        printf("Client %d command out of sync: id %lu\n", cl->peer_fd, cur_op->req.hdr.id);
+        stop_client(cl->peer_fd);
+        return;
+    }
+    osd_op_t *op = req_it->second;
+    memcpy(op->reply.buf, cur_op->req.buf, OSD_PACKET_SIZE); // copy the reply header into the original request
+    if (op->reply.hdr.opcode == OSD_OP_SECONDARY_READ &&
+        op->reply.hdr.retval > 0)
+    {
+        // Read data. In this case we assume that the buffer is preallocated by the caller (!); NOTE(review): retval is not checked against its size
+        assert(op->buf);
+        cl->read_state = CL_READ_REPLY_DATA;
+        cl->read_reply_id = op->req.hdr.id;
+        cl->read_buf = op->buf;
+        cl->read_remaining = op->reply.hdr.retval; // retval is used as the payload length in bytes
+    }
+    else if (op->reply.hdr.opcode == OSD_OP_SECONDARY_LIST &&
+        op->reply.hdr.retval > 0)
+    {
+        op->buf = memalign(512, sizeof(obj_ver_id) * op->reply.hdr.retval); // retval is used as a count of obj_ver_id entries
+        cl->read_state = CL_READ_REPLY_DATA;
+        cl->read_reply_id = op->req.hdr.id;
+        cl->read_buf = op->buf;
+        cl->read_remaining = sizeof(obj_ver_id) * op->reply.hdr.retval;
+    }
+    else
+    {
+        cl->read_state = 0; // no payload follows: the request completes right away
+        cl->sent_ops.erase(req_it);
+        // Measure subop latency
+        timespec tv_end;
+        clock_gettime(CLOCK_REALTIME, &tv_end);
+        subop_stat_count[op->req.hdr.opcode]++;
+        subop_stat_sum[op->req.hdr.opcode] += (
+            (tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
+            (tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000
+        );
+        op->callback(op);
+    }
+}
--- a/osd_rmw.cpp
+++ b/osd_rmw.cpp
@@ -1,5 +1,4 @@
 #include <malloc.h>
-#include <string.h>
 #include <assert.h>
 #include "xor.h"
 #include "osd_rmw.h"
@@ -56,11 +55,6 @@ static inline void cover_read(uint32_t start, uint32_t end, osd_rmw_stripe_t & s

 void split_stripes(uint64_t pg_minsize, uint32_t bs_block_size, uint32_t start, uint32_t end, osd_rmw_stripe_t *stripes)
 {
-    if (end == 0)
-    {
-        // Zero length request - offset doesn't matter
-        return;
-    }
    end = start+end;
    for (int role = 0; role < pg_minsize; role++)
    {
@@ -85,21 +79,18 @@ void reconstruct_stripe(osd_rmw_stripe_t *stripes, int pg_size, int role)
            }
            else if (prev >= 0)
            {
-                assert(stripes[role].read_start >= stripes[prev].read_start &&
-                    stripes[role].read_start >= stripes[other].read_start);
                memxor(
-                    stripes[prev].read_buf + (stripes[role].read_start - stripes[prev].read_start),
-                    stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start),
+                    stripes[prev].read_buf + (stripes[prev].read_start - stripes[role].read_start),
+                    stripes[other].read_buf + (stripes[other].read_start - stripes[other].read_start),
                    stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start
                );
                prev = -1;
            }
            else
            {
-                assert(stripes[role].read_start >= stripes[other].read_start);
                memxor(
                    stripes[role].read_buf,
-                    stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start),
+                    stripes[other].read_buf + (stripes[other].read_start - stripes[role].read_start),
                    stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start
                );
            }
@@ -165,11 +156,10 @@ void* alloc_read_buffer(osd_rmw_stripe_t *stripes, int read_pg_size, uint64_t ad
    return buf;
 }

-void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_set,
-    uint64_t pg_size, uint64_t pg_minsize, uint64_t pg_cursize, uint64_t *write_osd_set, uint64_t chunk_size)
+void* calc_rmw_reads(void *write_buf, osd_rmw_stripe_t *stripes, uint64_t *osd_set, uint64_t pg_size, uint64_t pg_minsize, uint64_t pg_cursize)
 {
    // Generic parity modification (read-modify-write) algorithm
-    // Read -> Reconstruct missing chunks -> Calc parity chunks -> Write
+    // Reconstruct -> Read -> Calc parity -> Write
    // Now we always read continuous ranges. This means that an update of the beginning
    // of one data stripe and the end of another will lead to a read of full paired stripes.
    // FIXME: (Maybe) read small individual ranges in that case instead.
@@ -184,90 +174,64 @@ void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_
            stripes[role].write_end = stripes[role].req_end;
        }
    }
-    int write_parity = 0;
+    for (int role = 0; role < pg_minsize; role++)
+    {
+        cover_read(start, end, stripes[role]);
+    }
+    int has_parity = 0;
    for (int role = pg_minsize; role < pg_size; role++)
    {
-        if (write_osd_set[role] != 0)
+        if (osd_set[role] != 0)
        {
-            write_parity = 1;
+            has_parity++;
            stripes[role].write_start = start;
            stripes[role].write_end = end;
        }
-    }
-    if (write_parity)
-    {
-        for (int role = 0; role < pg_minsize; role++)
-        {
-            cover_read(start, end, stripes[role]);
-        }
-    }
-    if (write_osd_set != read_osd_set)
-    {
-        pg_cursize = 0;
-        // Object is degraded/misplaced and will be moved to <write_osd_set>
-        for (int role = 0; role < pg_size; role++)
-        {
-            if (write_osd_set[role] != read_osd_set[role])
-            {
-                // FIXME: For EC more than 2+1: handle case when write_osd_set == 0 and read_osd_set != 0
-                // We need to get data for any moved / recovered chunk
-                // And we need a continuous write buffer so we'll only optimize
-                // for the case when the whole chunk is ovewritten in the request
-                if (stripes[role].req_start != 0 ||
-                    stripes[role].req_end != chunk_size)
-                {
-                    stripes[role].read_start = 0;
-                    stripes[role].read_end = chunk_size;
-                    // Warning: We don't modify write_start/write_end here, we do it in calc_rmw_parity()
-                }
-            }
-            if (read_osd_set[role] != 0)
-            {
-                pg_cursize++;
-            }
-        }
+        else
+            stripes[role].missing = true;
    }
    if (pg_cursize < pg_size)
    {
-        // Some stripe(s) are missing, so we need to read parity
-        for (int role = 0; role < pg_size; role++)
+        if (has_parity == 0)
        {
-            if (read_osd_set[role] == 0)
+            // Parity is missing, we don't need to read anything
+            for (int role = 0; role < pg_minsize; role++)
            {
-                stripes[role].missing = true;
-                if (stripes[role].read_end != 0)
+                stripes[role].read_end = 0;
+            }
+        }
+        else
+        {
+            // Other stripe(s) are missing
+            for (int role = 0; role < pg_minsize; role++)
+            {
+                if (osd_set[role] == 0 && stripes[role].read_end != 0)
                {
-                    int found = 0;
-                    for (int r2 = 0; r2 < pg_size && found < pg_minsize; r2++)
+                    stripes[role].missing = true;
+                    for (int r2 = 0; r2 < pg_size; r2++)
                    {
-                        // Read the non-covered range of <role> from at least <minsize> other stripes to reconstruct it
-                        if (read_osd_set[r2] != 0)
+                        // Read the non-covered range of <role> from all other stripes to reconstruct it
+                        if (r2 != role && osd_set[r2] != 0)
                        {
                            extend_read(stripes[role].read_start, stripes[role].read_end, stripes[r2]);
-                            found++;
                        }
                    }
-                    if (found < pg_minsize)
-                    {
-                        // FIXME Object is incomplete - refuse partial overwrite
-                        assert(0);
-                    }
                }
            }
        }
    }
    // Allocate read buffers
-    void *rmw_buf = alloc_read_buffer(stripes, pg_size, (write_parity ? pg_size-pg_minsize : 0) * (end - start));
-    // Position write buffers
+    void *rmw_buf = alloc_read_buffer(stripes, pg_size, has_parity * (end - start));
+    // Position parity & write buffers
    uint64_t buf_pos = 0, in_pos = 0;
    for (int role = 0; role < pg_size; role++)
    {
        if (stripes[role].req_end != 0)
        {
-            stripes[role].write_buf = request_buf + in_pos;
+            stripes[role].write_buf = write_buf + in_pos;
            in_pos += stripes[role].req_end - stripes[role].req_start;
        }
-        else if (role >= pg_minsize && write_osd_set[role] != 0 && end != 0)
+        else if (role >= pg_minsize && osd_set[role] != 0)
        {
            stripes[role].write_buf = rmw_buf + buf_pos;
            buf_pos += end - start;
@@ -357,9 +321,13 @@ static void xor_multiple_buffers(buf_len_t *xor1, int n1, buf_len_t *xor2, int n
    }
 }

-void calc_rmw_parity(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size)
+void calc_rmw_parity(osd_rmw_stripe_t *stripes, int pg_size)
 {
-    int pg_minsize = pg_size-1;
+    if (stripes[pg_size-1].missing)
+    {
+        // Parity OSD is unavailable
+        return;
+    }
    for (int role = 0; role < pg_size; role++)
    {
        if (stripes[role].read_end != 0 && stripes[role].missing)
@@ -369,82 +337,31 @@ void calc_rmw_parity(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_osd_
            break;
        }
    }
-    uint32_t start = 0, end = 0;
-    if (!stripes[pg_minsize].missing || write_osd_set != read_osd_set)
+    // Calculate new parity (EC k+1)
+    int parity = pg_size-1, prev = -2;
+    auto wr_end = stripes[parity].write_end;
+    auto wr_start = stripes[parity].write_start;
+    for (int other = 0; other < pg_size-1; other++)
    {
-        for (int role = 0; role < pg_minsize; role++)
+        if (prev == -2)
        {
-            if (stripes[role].req_end != 0)
-            {
-                start = !end || stripes[role].req_start < start ? stripes[role].req_start : start;
-                end = std::max(stripes[role].req_end, end);
-            }
+            prev = other;
        }
-    }
-    if (write_osd_set != read_osd_set)
-    {
-        for (int role = 0; role < pg_minsize; role++)
+        else
        {
-            if (write_osd_set[role] != read_osd_set[role] &&
-                (stripes[role].req_start != 0 || stripes[role].req_end != chunk_size))
+            int n1 = 0, n2 = 0;
+            buf_len_t xor1[3], xor2[3];
+            if (prev == -1)
            {
-                // FIXME again, handle case when write_osd_set[role] is 0
-                // Copy modified chunk into the read buffer to write it back
-                memcpy(
-                    stripes[role].read_buf + stripes[role].req_start,
-                    stripes[role].write_buf,
-                    stripes[role].req_end - stripes[role].req_start
-                );
-                stripes[role].write_buf = stripes[role].read_buf;
-                stripes[role].write_start = 0;
-                stripes[role].write_end = chunk_size;
-            }
-        }
-    }
-    if (!stripes[pg_minsize].missing && end != 0)
-    {
-        // Calculate new parity (EC k+1)
-        int parity = pg_minsize, prev = -2;
-        for (int other = 0; other < pg_minsize; other++)
-        {
-            if (prev == -2)
-            {
-                prev = other;
+                xor1[n1++] = { .buf = stripes[parity].write_buf, .len = wr_end-wr_start };
            }
            else
            {
-                int n1 = 0, n2 = 0;
-                buf_len_t xor1[3], xor2[3];
-                if (prev == -1)
-                {
-                    xor1[n1++] = { .buf = stripes[parity].write_buf, .len = end-start };
-                }
-                else
-                {
-                    get_old_new_buffers(stripes[prev], start, end, xor1, n1);
-                    prev = -1;
-                }
-                get_old_new_buffers(stripes[other], start, end, xor2, n2);
-                xor_multiple_buffers(xor1, n1, xor2, n2, stripes[parity].write_buf, end-start);
-            }
-        }
-    }
-    if (write_osd_set != read_osd_set)
-    {
-        for (int role = pg_minsize; role < pg_size; role++)
-        {
-            if (write_osd_set[role] != read_osd_set[role] && (start != 0 || end != chunk_size))
-            {
-                // Copy new parity into the read buffer to write it back
-                memcpy(
-                    stripes[role].read_buf + start,
-                    stripes[role].write_buf,
-                    end - start
-                );
-                stripes[role].write_buf = stripes[role].read_buf;
-                stripes[role].write_start = 0;
-                stripes[role].write_end = chunk_size;
+                get_old_new_buffers(stripes[prev], wr_start, wr_end, xor1, n1);
+                prev = -1;
            }
+            get_old_new_buffers(stripes[other], wr_start, wr_end, xor2, n2);
+            xor_multiple_buffers(xor1, n1, xor2, n2, stripes[parity].write_buf, wr_end-wr_start);
        }
    }
 }
--- a/osd_rmw.h
+++ b/osd_rmw.h
@@ -31,7 +31,6 @@ int extend_missing_stripes(osd_rmw_stripe_t *stripes, osd_num_t *osd_set, int mi

 void* alloc_read_buffer(osd_rmw_stripe_t *stripes, int read_pg_size, uint64_t add_size);

-void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_set,
-    uint64_t pg_size, uint64_t pg_minsize, uint64_t pg_cursize, uint64_t *write_osd_set, uint64_t chunk_size);
+void* calc_rmw_reads(void *write_buf, osd_rmw_stripe_t *stripes, uint64_t *osd_set, uint64_t pg_size, uint64_t pg_minsize, uint64_t pg_cursize);

-void calc_rmw_parity(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size);
+void calc_rmw_parity(osd_rmw_stripe_t *stripes, int pg_size);
--- a/osd_rmw_test.cpp
+++ b/osd_rmw_test.cpp
@@ -2,147 +2,16 @@
 #include "osd_rmw.cpp"
 #include "test_pattern.h"

-void dump_stripes(osd_rmw_stripe_t *stripes, int pg_size);
-void test1();
-void test4();
-void test5();
-void test6();
-void test7();
-void test8();
-void test9();
-
-/***
-
-Cases:
-
-1. split(offset=128K-4K, len=8K)
-   = [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 0 ] ]
-
-2. read(offset=128K-4K, len=8K, osd_set=[1,0,3])
-   = { read: [ [ 0, 128K ], [ 0, 4K ], [ 0, 4K ] ] }
-
-3. cover_read(0, 128K, { req: [ 128K-4K, 4K ] })
-   = { read: [ 0, 128K-4K ] }
-
-4. write(offset=128K-4K, len=8K, osd_set=[1,0,3])
-   = {
-     read: [ [ 0, 128K ], [ 4K, 128K ], [ 4K, 128K ] ],
-     write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
-     input buffer: [ write0, write1 ],
-     rmw buffer: [ write2, read0, read1, read2 ],
-   }
-   + check write2 buffer
-
-5. write(offset=0, len=128K+64K, osd_set=[1,0,3])
-   = {
-     req: [ [ 0, 128K ], [ 0, 64K ], [ 0, 0 ] ],
-     read: [ [ 64K, 128K ], [ 64K, 128K ], [ 64K, 128K ] ],
-     write: [ [ 0, 128K ], [ 0, 64K ], [ 0, 128K ] ],
-     input buffer: [ write0, write1 ],
-     rmw buffer: [ write2, read0, read1, read2 ],
-   }
-
-6. write(offset=0, len=128K+64K, osd_set=[1,2,3])
-   = {
-     req: [ [ 0, 128K ], [ 0, 64K ], [ 0, 0 ] ],
-     read: [ [ 0, 0 ], [ 64K, 128K ], [ 0, 0 ] ],
-     write: [ [ 0, 128K ], [ 0, 64K ], [ 0, 128K ] ],
-     input buffer: [ write0, write1 ],
-     rmw buffer: [ write2, read1 ],
-   }
-
-7. calc_rmw(offset=128K-4K, len=8K, osd_set=[1,0,3], write_set=[1,2,3])
-   = {
-     read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
-     write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
-     input buffer: [ write0, write1 ],
-     rmw buffer: [ write2, read0, read1, read2 ],
-   }
-   then, after calc_rmw_parity(): {
-     write: [ [ 128K-4K, 128K ], [ 0, 128K ], [ 0, 128K ] ],
-     write1==read1,
-   }
-   + check write1 buffer
-   + check write2 buffer
-
-8. calc_rmw(offset=0, len=128K+4K, osd_set=[0,2,3], write_set=[1,2,3])
-   = {
-     read: [ [ 0, 0 ], [ 4K, 128K ], [ 0, 0 ] ],
-     write: [ [ 0, 128K ], [ 0, 4K ], [ 0, 128K ] ],
-     input buffer: [ write0, write1 ],
-     rmw buffer: [ write2, read1 ],
-   }
-   + check write2 buffer
-
-9. object recovery case:
-   calc_rmw(offset=0, len=0, read_osd_set=[0,2,3], write_osd_set=[1,2,3])
-   = {
-     read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
-     write: [ [ 0, 0 ], [ 0, 0 ], [ 0, 0 ] ],
-     input buffer: NULL,
-     rmw buffer: [ read0, read1, read2 ],
-   }
-   then, after calc_rmw_parity(): {
-     write: [ [ 0, 128K ], [ 0, 0 ], [ 0, 0 ] ],
-     write0==read0,
-   }
-   + check write0 buffer
-
-***/
-
 int main(int narg, char *args[])
-{
-    // Test 1
-    test1();
-    // Test 4
-    test4();
-    // Test 5
-    test5();
-    // Test 6
-    test6();
-    // Test 7
-    test7();
-    // Test 8
-    test8();
-    // Test 9
-    test9();
-    // End
-    printf("all ok\n");
-    return 0;
-}
-
-void dump_stripes(osd_rmw_stripe_t *stripes, int pg_size)
-{
-    printf("request");
-    for (int i = 0; i < pg_size; i++)
-    {
-        printf(" {%uK-%uK}", stripes[i].req_start/1024, stripes[i].req_end/1024);
-    }
-    printf("\n");
-    printf("read");
-    for (int i = 0; i < pg_size; i++)
-    {
-        printf(" {%uK-%uK}", stripes[i].read_start/1024, stripes[i].read_end/1024);
-    }
-    printf("\n");
-    printf("write");
-    for (int i = 0; i < pg_size; i++)
-    {
-        printf(" {%uK-%uK}", stripes[i].write_start/1024, stripes[i].write_end/1024);
-    }
-    printf("\n");
-}
-
-void test1()
 {
    osd_num_t osd_set[3] = { 1, 0, 3 };
    osd_rmw_stripe_t stripes[3] = { 0 };
-    // Test 1.1
+    // Test 1
    split_stripes(2, 128*1024, 128*1024-4096, 8192, stripes);
    assert(stripes[0].req_start == 128*1024-4096 && stripes[0].req_end == 128*1024);
    assert(stripes[1].req_start == 0 && stripes[1].req_end == 4096);
    assert(stripes[2].req_end == 0);
-    // Test 1.2
+    // Test 2
    for (int i = 0; i < 3; i++)
    {
        stripes[i].read_start = stripes[i].req_start;
@@ -151,26 +20,18 @@ void test1()
    assert(extend_missing_stripes(stripes, osd_set, 2, 3) == 0);
    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
    assert(stripes[2].read_start == 0 && stripes[2].read_end == 4096);
-    // Test 1.3
+    // Test 3
    stripes[0] = { .req_start = 128*1024-4096, .req_end = 128*1024 };
    cover_read(0, 128*1024, stripes[0]);
    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024-4096);
-}
-
-void test4()
-{
-    osd_num_t osd_set[3] = { 1, 0, 3 };
-    osd_rmw_stripe_t stripes[3] = { 0 };
    // Test 4.1
+    memset(stripes, 0, sizeof(stripes));
    split_stripes(2, 128*1024, 128*1024-4096, 8192, stripes);
    void* write_buf = malloc(8192);
-    void* rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 2, osd_set, 128*1024);
+    void* rmw_buf = calc_rmw_reads(write_buf, stripes, osd_set, 3, 2, 2);
    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
    assert(stripes[1].read_start == 4096 && stripes[1].read_end == 128*1024);
    assert(stripes[2].read_start == 4096 && stripes[2].read_end == 128*1024);
-    assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
-    assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
-    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
    assert(stripes[0].read_buf == rmw_buf+128*1024);
    assert(stripes[1].read_buf == rmw_buf+128*1024*2);
    assert(stripes[2].read_buf == rmw_buf+128*1024*3-4096);
@@ -182,32 +43,24 @@ void test4()
    set_pattern(stripes[0].read_buf, 128*1024, PATTERN1); // old data
    set_pattern(stripes[1].read_buf, 128*1024-4096, UINT64_MAX); // didn't read it, it's missing
    set_pattern(stripes[2].read_buf, 128*1024-4096, 0); // old parity = 0
-    calc_rmw_parity(stripes, 3, osd_set, osd_set, 128*1024);
+    calc_rmw_parity(stripes, 3);
    check_pattern(stripes[2].write_buf, 4096, PATTERN0^PATTERN1); // new parity
    check_pattern(stripes[2].write_buf+4096, 128*1024-4096*2, 0); // new parity
    check_pattern(stripes[2].write_buf+128*1024-4096, 4096, PATTERN0^PATTERN1); // new parity
    free(rmw_buf);
    free(write_buf);
-}
-
-void test5()
-{
-    osd_num_t osd_set[3] = { 1, 0, 3 };
-    osd_rmw_stripe_t stripes[3] = { 0 };
    // Test 5.1
+    memset(stripes, 0, sizeof(stripes));
    split_stripes(2, 128*1024, 0, 64*1024*3, stripes);
    assert(stripes[0].req_start == 0 && stripes[0].req_end == 128*1024);
    assert(stripes[1].req_start == 0 && stripes[1].req_end == 64*1024);
    assert(stripes[2].req_end == 0);
    // Test 5.2
-    void *write_buf = malloc(64*1024*3);
-    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 2, osd_set, 128*1024);
+    write_buf = malloc(64*1024*3);
+    rmw_buf = calc_rmw_reads(write_buf, stripes, osd_set, 3, 2, 2);
    assert(stripes[0].read_start == 64*1024 && stripes[0].read_end == 128*1024);
    assert(stripes[1].read_start == 64*1024 && stripes[1].read_end == 128*1024);
    assert(stripes[2].read_start == 64*1024 && stripes[2].read_end == 128*1024);
-    assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
-    assert(stripes[1].write_start == 0 && stripes[1].write_end == 64*1024);
-    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
    assert(stripes[0].read_buf == rmw_buf+128*1024);
    assert(stripes[1].read_buf == rmw_buf+64*3*1024);
    assert(stripes[2].read_buf == rmw_buf+64*4*1024);
@@ -216,22 +69,15 @@ void test5()
    assert(stripes[2].write_buf == rmw_buf);
    free(rmw_buf);
    free(write_buf);
-}
-
-void test6()
-{
-    osd_num_t osd_set[3] = { 1, 2, 3 };
-    osd_rmw_stripe_t stripes[3] = { 0 };
    // Test 6.1
+    memset(stripes, 0, sizeof(stripes));
    split_stripes(2, 128*1024, 0, 64*1024*3, stripes);
-    void *write_buf = malloc(64*1024*3);
-    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, osd_set, 128*1024);
+    osd_set[1] = 2;
+    write_buf = malloc(64*1024*3);
+    rmw_buf = calc_rmw_reads(write_buf, stripes, osd_set, 3, 2, 3);
    assert(stripes[0].read_end == 0);
    assert(stripes[1].read_start == 64*1024 && stripes[1].read_end == 128*1024);
    assert(stripes[2].read_end == 0);
-    assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
-    assert(stripes[1].write_start == 0 && stripes[1].write_end == 64*1024);
-    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
    assert(stripes[0].read_buf == 0);
    assert(stripes[1].read_buf == rmw_buf+128*1024);
    assert(stripes[2].read_buf == 0);
@@ -240,121 +86,8 @@ void test6()
    assert(stripes[2].write_buf == rmw_buf);
    free(rmw_buf);
    free(write_buf);
-}
-
-void test7()
-{
-    osd_num_t osd_set[3] = { 1, 0, 3 };
-    osd_num_t write_osd_set[3] = { 1, 2, 3 };
-    osd_rmw_stripe_t stripes[3] = { 0 };
-    // Test 7.1
-    split_stripes(2, 128*1024, 128*1024-4096, 8192, stripes);
-    void *write_buf = malloc(8192);
-    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 2, write_osd_set, 128*1024);
-    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
-    assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
-    assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
-    assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
-    assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
-    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
-    assert(stripes[0].read_buf == rmw_buf+128*1024);
-    assert(stripes[1].read_buf == rmw_buf+128*1024*2);
-    assert(stripes[2].read_buf == rmw_buf+128*1024*3);
-    assert(stripes[0].write_buf == write_buf);
-    assert(stripes[1].write_buf == write_buf+4096);
-    assert(stripes[2].write_buf == rmw_buf);
-    // Test 7.2
-    set_pattern(write_buf, 8192, PATTERN0);
-    set_pattern(stripes[0].read_buf, 128*1024, PATTERN1); // old data
-    set_pattern(stripes[1].read_buf, 128*1024, UINT64_MAX); // didn't read it, it's missing
-    set_pattern(stripes[2].read_buf, 128*1024, 0); // old parity = 0
-    calc_rmw_parity(stripes, 3, osd_set, write_osd_set, 128*1024);
-    assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
-    assert(stripes[1].write_start == 0 && stripes[1].write_end == 128*1024);
-    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
-    assert(stripes[1].write_buf == stripes[1].read_buf);
-    check_pattern(stripes[1].write_buf, 4096, PATTERN0);
-    check_pattern(stripes[1].write_buf+4096, 128*1024-4096, PATTERN1);
-    check_pattern(stripes[2].write_buf, 4096, PATTERN0^PATTERN1); // new parity
-    check_pattern(stripes[2].write_buf+4096, 128*1024-4096*2, 0); // new parity
-    check_pattern(stripes[2].write_buf+128*1024-4096, 4096, PATTERN0^PATTERN1); // new parity
-    free(rmw_buf);
-    free(write_buf);
-}
-
-void test8()
-{
-    osd_num_t osd_set[3] = { 0, 2, 3 };
-    osd_num_t write_osd_set[3] = { 1, 2, 3 };
-    osd_rmw_stripe_t stripes[3] = { 0 };
-    // Test 8.1
-    split_stripes(2, 128*1024, 0, 128*1024+4096, stripes);
-    void *write_buf = malloc(128*1024+4096);
-    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 2, write_osd_set, 128*1024);
-    assert(stripes[0].read_start == 0 && stripes[0].read_end == 0);
-    assert(stripes[1].read_start == 4096 && stripes[1].read_end == 128*1024);
-    assert(stripes[2].read_start == 0 && stripes[2].read_end == 0);
-    assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
-    assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
-    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
-    assert(stripes[0].read_buf == NULL);
-    assert(stripes[1].read_buf == rmw_buf+128*1024);
-    assert(stripes[2].read_buf == NULL);
-    assert(stripes[0].write_buf == write_buf);
-    assert(stripes[1].write_buf == write_buf+128*1024);
-    assert(stripes[2].write_buf == rmw_buf);
-    // Test 8.2
-    set_pattern(write_buf, 128*1024+4096, PATTERN0);
-    set_pattern(stripes[1].read_buf, 128*1024-4096, PATTERN1);
-    calc_rmw_parity(stripes, 3, osd_set, write_osd_set, 128*1024);
-    assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024); // recheck again
-    assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);     // recheck again
-    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024); // recheck again
-    assert(stripes[0].write_buf == write_buf);                               // recheck again
-    assert(stripes[1].write_buf == write_buf+128*1024);                      // recheck again
-    assert(stripes[2].write_buf == rmw_buf);                                 // recheck again
-    check_pattern(stripes[2].write_buf, 4096, 0); // new parity
-    check_pattern(stripes[2].write_buf+4096, 128*1024-4096, PATTERN0^PATTERN1); // new parity
-    free(rmw_buf);
-    free(write_buf);
-}
-
-void test9()
-{
-    osd_num_t osd_set[3] = { 0, 2, 3 };
-    osd_num_t write_osd_set[3] = { 1, 2, 3 };
-    osd_rmw_stripe_t stripes[3] = { 0 };
-    // Test 9.0
-    split_stripes(2, 128*1024, 64*1024, 0, stripes);
-    assert(stripes[0].req_start == 0 && stripes[0].req_end == 0);
-    assert(stripes[1].req_start == 0 && stripes[1].req_end == 0);
-    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
-    // Test 9.1
-    void *write_buf = NULL;
-    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024);
-    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
-    assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
-    assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
-    assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
-    assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
-    assert(stripes[2].write_start == 0 && stripes[2].write_end == 0);
-    assert(stripes[0].read_buf == rmw_buf);
-    assert(stripes[1].read_buf == rmw_buf+128*1024);
-    assert(stripes[2].read_buf == rmw_buf+128*1024*2);
-    assert(stripes[0].write_buf == NULL);
-    assert(stripes[1].write_buf == NULL);
-    assert(stripes[2].write_buf == NULL);
-    // Test 8.2
-    set_pattern(stripes[1].read_buf, 128*1024, 0);
-    set_pattern(stripes[2].read_buf, 128*1024, PATTERN1);
-    calc_rmw_parity(stripes, 3, osd_set, write_osd_set, 128*1024);
-    assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
-    assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
-    assert(stripes[2].write_start == 0 && stripes[2].write_end == 0);
-    assert(stripes[0].write_buf == rmw_buf);
-    assert(stripes[1].write_buf == NULL);
-    assert(stripes[2].write_buf == NULL);
-    check_pattern(stripes[0].read_buf, 128*1024, PATTERN1);
-    check_pattern(stripes[0].write_buf, 128*1024, PATTERN1);
-    free(rmw_buf);
+    osd_set[1] = 0;
+    // End
+    printf("all ok\n");
+    return 0;
 }
--- a/osd_secondary.cpp
+++ b/osd_secondary.cpp
@@ -4,34 +4,45 @@

 void osd_t::secondary_op_callback(osd_op_t *op)
 {
-    if (op->req.hdr.opcode == OSD_OP_SECONDARY_READ ||
-        op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
+    inflight_ops--;
+    auto cl_it = clients.find(op->peer_fd);
+    if (cl_it != clients.end())
    {
-        op->reply.sec_rw.version = op->bs_op->version;
-    }
-    else if (op->req.hdr.opcode == OSD_OP_SECONDARY_DELETE)
-    {
-        op->reply.sec_del.version = op->bs_op->version;
-    }
-    if (op->req.hdr.opcode == OSD_OP_SECONDARY_READ &&
-        op->bs_op->retval > 0)
-    {
-        op->send_list.push_back(op->buf, op->bs_op->retval);
-    }
-    else if (op->req.hdr.opcode == OSD_OP_SECONDARY_LIST)
-    {
-        // allocated by blockstore
-        op->buf = op->bs_op->buf;
-        if (op->bs_op->retval > 0)
+        op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
+        op->reply.hdr.id = op->req.hdr.id;
+        op->reply.hdr.opcode = op->req.hdr.opcode;
+        op->reply.hdr.retval = op->bs_op->retval;
+        if (op->req.hdr.opcode == OSD_OP_SECONDARY_READ ||
+            op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
        {
-            op->send_list.push_back(op->buf, op->bs_op->retval * sizeof(obj_ver_id));
+            op->reply.sec_rw.version = op->bs_op->version;
        }
-        op->reply.sec_list.stable_count = op->bs_op->version;
+        else if (op->req.hdr.opcode == OSD_OP_SECONDARY_DELETE)
+        {
+            op->reply.sec_del.version = op->bs_op->version;
+        }
+        if (op->req.hdr.opcode == OSD_OP_SECONDARY_READ &&
+            op->reply.hdr.retval > 0)
+        {
+            op->send_list.push_back(op->buf, op->reply.hdr.retval);
+        }
+        else if (op->req.hdr.opcode == OSD_OP_SECONDARY_LIST)
+        {
+            // allocated by blockstore
+            op->buf = op->bs_op->buf;
+            if (op->reply.hdr.retval > 0)
+            {
+                op->send_list.push_back(op->buf, op->reply.hdr.retval * sizeof(obj_ver_id));
+            }
+            op->reply.sec_list.stable_count = op->bs_op->version;
+        }
+        auto & cl = cl_it->second;
+        outbox_push(cl, op);
+    }
+    else
+    {
+        delete op;
    }
-    int retval = op->bs_op->retval;
-    delete op->bs_op;
-    op->bs_op = NULL;
-    finish_op(op, retval);
 }

 void osd_t::exec_secondary(osd_op_t *cur_op)
@@ -84,7 +95,7 @@ void osd_t::exec_secondary(osd_op_t *cur_op)
            secondary_op_callback(cur_op);
            return;
        }
-        cur_op->bs_op->oid.stripe = cur_op->req.sec_list.pg_stripe_size;
+        cur_op->bs_op->oid.stripe = cur_op->req.sec_list.parity_block_size;
        cur_op->bs_op->len = cur_op->req.sec_list.pg_count;
        cur_op->bs_op->offset = cur_op->req.sec_list.list_pg - 1;
 #ifdef OSD_STUB
@@ -103,10 +114,15 @@ void osd_t::exec_show_config(osd_op_t *cur_op)
 {
    // FIXME: Send the real config, not its source
    std::string cfg_str = json11::Json(config).dump();
+    cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
+    cur_op->reply.hdr.id = cur_op->req.hdr.id;
+    cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
+    cur_op->reply.hdr.retval = cfg_str.size()+1;
    cur_op->buf = malloc(cfg_str.size()+1);
    memcpy(cur_op->buf, cfg_str.c_str(), cfg_str.size()+1);
-    cur_op->send_list.push_back(cur_op->buf, cfg_str.size()+1);
-    finish_op(cur_op, cfg_str.size()+1);
+    auto & cl = clients[cur_op->peer_fd];
+    cur_op->send_list.push_back(cur_op->buf, cur_op->reply.hdr.retval);
+    outbox_push(cl, cur_op);
 }

 void osd_t::exec_sync_stab_all(osd_op_t *cur_op)
--- a/osd_send.cpp
+++ b/osd_send.cpp
@@ -0,0 +1,131 @@
+#include "osd.h"
+
+void osd_t::outbox_push(osd_client_t & cl, osd_op_t *cur_op)
+{
+    assert(cur_op->peer_fd);
+    if (cur_op->op_type == OSD_OP_OUT)
+    {
+        clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
+    }
+    cl.outbox.push_back(cur_op);
+    if (cl.write_op || cl.outbox.size() > 1 || !try_send(cl))
+    {
+        if (cl.write_state == 0)
+        {
+            cl.write_state = CL_WRITE_READY;
+            write_ready_clients.push_back(cur_op->peer_fd);
+        }
+        ringloop->wakeup();
+    }
+}
+
+bool osd_t::try_send(osd_client_t & cl)
+{
+    int peer_fd = cl.peer_fd;
+    io_uring_sqe* sqe = ringloop->get_sqe();
+    if (!sqe)
+    {
+        return false;
+    }
+    ring_data_t* data = ((ring_data_t*)sqe->user_data);
+    if (!cl.write_op)
+    {
+        // pick next command
+        cl.write_op = cl.outbox.front();
+        cl.outbox.pop_front();
+        cl.write_state = CL_WRITE_REPLY;
+        clock_gettime(CLOCK_REALTIME, &cl.write_op->tv_send);
+        if (cl.write_op->op_type == OSD_OP_IN)
+        {
+            // Measure execution latency
+            timespec tv_end = cl.write_op->tv_send;
+            op_stat_count[cl.write_op->req.hdr.opcode]++;
+            op_stat_sum[cl.write_op->req.hdr.opcode] += (
+                (tv_end.tv_sec - cl.write_op->tv_begin.tv_sec)*1000000 +
+                (tv_end.tv_nsec - cl.write_op->tv_begin.tv_nsec)/1000
+            );
+        }
+    }
+    cl.write_msg.msg_iov = cl.write_op->send_list.get_iovec();
+    cl.write_msg.msg_iovlen = cl.write_op->send_list.get_size();
+    data->callback = [this, peer_fd](ring_data_t *data) { handle_send(data, peer_fd); };
+    my_uring_prep_sendmsg(sqe, peer_fd, &cl.write_msg, 0);
+    return true;
+}
+
+void osd_t::send_replies()
+{
+    for (int i = 0; i < write_ready_clients.size(); i++)
+    {
+        int peer_fd = write_ready_clients[i];
+        if (!try_send(clients[peer_fd]))
+        {
+            write_ready_clients.erase(write_ready_clients.begin(), write_ready_clients.begin() + i);
+            return;
+        }
+    }
+    write_ready_clients.clear();
+}
+
+void osd_t::handle_send(ring_data_t *data, int peer_fd)
+{
+    auto cl_it = clients.find(peer_fd);
+    if (cl_it != clients.end())
+    {
+        auto & cl = cl_it->second;
+        if (data->res < 0 && data->res != -EAGAIN)
+        {
+            // this is a client socket, so don't panic. just disconnect it
+            printf("Client %d socket write error: %d (%s). Disconnecting client\n", peer_fd, -data->res, strerror(-data->res));
+            stop_client(peer_fd);
+            return;
+        }
+        if (data->res >= 0)
+        {
+            osd_op_t *cur_op = cl.write_op;
+            while (data->res > 0 && cur_op->send_list.sent < cur_op->send_list.count)
+            {
+                iovec & iov = cur_op->send_list.buf[cur_op->send_list.sent];
+                if (iov.iov_len <= data->res)
+                {
+                    data->res -= iov.iov_len;
+                    cur_op->send_list.sent++;
+                }
+                else
+                {
+                    iov.iov_len -= data->res;
+                    iov.iov_base += data->res;
+                    break;
+                }
+            }
+            if (cur_op->send_list.sent >= cur_op->send_list.count)
+            {
+                // Done
+                if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_STABILIZE)
+                {
+                    timespec tv_end;
+                    clock_gettime(CLOCK_REALTIME, &tv_end);
+                    send_stat_count++;
+                    send_stat_sum += (
+                        (tv_end.tv_sec - cl.write_op->tv_send.tv_sec)*1000000 +
+                        (tv_end.tv_nsec - cl.write_op->tv_send.tv_nsec)/1000
+                    );
+                }
+                if (cur_op->op_type == OSD_OP_IN)
+                {
+                    delete cur_op;
+                }
+                else
+                {
+                    cl.sent_ops[cl.write_op->req.hdr.id] = cl.write_op;
+                }
+                cl.write_op = NULL;
+                cl.write_state = cl.outbox.size() > 0 ? CL_WRITE_READY : 0;
+            }
+        }
+        if (cl.write_state != 0)
+        {
+            write_ready_clients.push_back(peer_fd);
+        }
+    }
+}
--- a/osd_test.cpp
+++ b/osd_test.cpp
@@ -19,8 +19,6 @@

 int connect_osd(const char *osd_address, int osd_port);

-uint64_t test_read(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t version, uint64_t offset, uint64_t len);
-
 uint64_t test_write(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t version, uint64_t pattern);

 void* test_primary_read(int connect_fd, uint64_t inode, uint64_t offset, uint64_t len);
@@ -31,8 +29,6 @@ void test_primary_sync(int connect_fd);

 void test_sync_stab_all(int connect_fd);

-void test_list_stab(int connect_fd);
-
 int main0(int narg, char *args[])
 {
    int connect_fd;
@@ -98,16 +94,7 @@ int main2(int narg, char *args[])
    return 0;
 }

-int main3(int narg, char *args[])
-{
-    int connect_fd;
-    connect_fd = connect_osd("127.0.0.1", 11203);
-    test_list_stab(connect_fd);
-    close(connect_fd);
-    return 0;
-}
-
-int main4(int narg, char *args[])
+int main(int narg, char *args[])
 {
    int connect_fd;
    // Cluster write (sync not implemented yet)
@@ -119,15 +106,6 @@ int main4(int narg, char *args[])
    return 0;
 }

-int main(int narg, char *args[])
-{
-    int connect_fd;
-    connect_fd = connect_osd("192.168.7.2", 43051);
-    test_read(connect_fd, 1, 1039663104, UINT64_MAX, 0, 128*1024);
-    close(connect_fd);
-    return 0;
-}
-
 int connect_osd(const char *osd_address, int osd_port)
 {
    struct sockaddr_in addr;
@@ -170,7 +148,7 @@ bool check_reply(int r, osd_any_op_t & op, osd_any_reply_t & reply, int expected
        printf("bad reply: magic, id or opcode does not match request\n");
        return false;
    }
-    if (expected >= 0 && reply.hdr.retval != expected)
+    if (reply.hdr.retval != expected)
    {
        printf("operation failed, retval=%ld\n", reply.hdr.retval);
        return false;
@@ -178,66 +156,6 @@ bool check_reply(int r, osd_any_op_t & op, osd_any_reply_t & reply, int expected
    return true;
 }

-uint64_t test_read(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t version, uint64_t offset, uint64_t len)
-{
-    osd_any_op_t op;
-    osd_any_reply_t reply;
-    op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
-    op.hdr.id = 1;
-    op.hdr.opcode = OSD_OP_SECONDARY_READ;
-    op.sec_rw.oid = {
-        .inode = inode,
-        .stripe = stripe,
-    };
-    op.sec_rw.version = version;
-    op.sec_rw.offset = offset;
-    op.sec_rw.len = len;
-    void *data = memalign(MEM_ALIGNMENT, op.sec_rw.len);
-    write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
-    int r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
-    if (!check_reply(r, op, reply, op.sec_rw.len))
-    {
-        free(data);
-        return 0;
-    }
-    r = read_blocking(connect_fd, data, len);
-    if (r != len)
-    {
-        free(data);
-        perror("read data");
-        return 0;
-    }
-    free(data);
-    printf("Read %lu:%lu v%lu = v%lu\n", inode, stripe, version, reply.sec_rw.version);
-    op.hdr.opcode = OSD_OP_SECONDARY_LIST;
-    op.sec_list.list_pg = 1;
-    op.sec_list.pg_count = 1;
-    op.sec_list.pg_stripe_size = 4*1024*1024;
-    write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
-    r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
-    if (reply.hdr.retval < 0 || !check_reply(r, op, reply, reply.hdr.retval))
-    {
-        return 0;
-    }
-    data = memalign(MEM_ALIGNMENT, sizeof(obj_ver_id)*reply.hdr.retval);
-    r = read_blocking(connect_fd, data, sizeof(obj_ver_id)*reply.hdr.retval);
-    if (r != sizeof(obj_ver_id)*reply.hdr.retval)
-    {
-        free(data);
-        perror("read data");
-        return 0;
-    }
-    obj_ver_id *ov = (obj_ver_id*)data;
-    for (int i = 0; i < reply.hdr.retval; i++)
-    {
-        if (ov[i].oid.inode == inode && (ov[i].oid.stripe & ~(4096-1)) == (stripe & ~(4096-1)))
-        {
-            printf("list: %lu:%lu v%lu stable=%d\n", ov[i].oid.inode, ov[i].oid.stripe, ov[i].version, i < reply.sec_list.stable_count ? 1 : 0);
-        }
-    }
-    return 0;
-}
-
 uint64_t test_write(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t version, uint64_t pattern)
 {
    osd_any_op_t op;
@@ -252,7 +170,7 @@ uint64_t test_write(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t ve
    op.sec_rw.version = version;
    op.sec_rw.offset = 0;
    op.sec_rw.len = 128*1024;
-    void *data = memalign(MEM_ALIGNMENT, op.sec_rw.len);
+    void *data = memalign(512, op.sec_rw.len);
    for (int i = 0; i < (op.sec_rw.len)/sizeof(uint64_t); i++)
        ((uint64_t*)data)[i] = pattern;
    write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
@@ -287,7 +205,7 @@ void* test_primary_read(int connect_fd, uint64_t inode, uint64_t offset, uint64_
    op.rw.inode = inode;
    op.rw.offset = offset;
    op.rw.len = len;
-    void *data = memalign(MEM_ALIGNMENT, len);
+    void *data = memalign(512, len);
    write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
    int r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
    if (!check_reply(r, op, reply, len))
@@ -315,7 +233,7 @@ void test_primary_write(int connect_fd, uint64_t inode, uint64_t offset, uint64_
    op.rw.inode = inode;
    op.rw.offset = offset;
    op.rw.len = len;
-    void *data = memalign(MEM_ALIGNMENT, len);
+    void *data = memalign(512, len);
    set_pattern(data, len, pattern);
    write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
    write_blocking(connect_fd, data, len);
@@ -347,40 +265,3 @@ void test_sync_stab_all(int connect_fd)
    int r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
    assert(check_reply(r, op, reply, 0));
 }
-
-void test_list_stab(int connect_fd)
-{
-    osd_any_op_t op;
-    osd_any_reply_t reply;
-    op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
-    op.hdr.id = 1;
-    op.hdr.opcode = OSD_OP_SECONDARY_LIST;
-    op.sec_list.pg_count = 0;
-    assert(write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE) == OSD_PACKET_SIZE);
-    int r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
-    assert(check_reply(r, op, reply, -1));
-    int total_count = reply.hdr.retval;
-    int stable_count = reply.sec_list.stable_count;
-    obj_ver_id *data = (obj_ver_id*)malloc(total_count * sizeof(obj_ver_id));
-    assert(data);
-    assert(read_blocking(connect_fd, data, total_count * sizeof(obj_ver_id)) == (total_count * sizeof(obj_ver_id)));
-    int last_start = stable_count;
-    for (int i = stable_count; i <= total_count; i++)
-    {
-        // Stabilize in portions of 32 entries
-        if (i - last_start >= 32 || i == total_count)
-        {
-            op.hdr.opcode = OSD_OP_SECONDARY_STABILIZE;
-            op.sec_stab.len = sizeof(obj_ver_id) * (i - last_start);
-            assert(write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE) == OSD_PACKET_SIZE);
-            assert(write_blocking(connect_fd, data + last_start, op.sec_stab.len) == op.sec_stab.len);
-            r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
-            assert(check_reply(r, op, reply, 0));
-            last_start = i;
-        }
-    }
-    obj_ver_id *data2 = (obj_ver_id*)malloc(sizeof(obj_ver_id) * 32);
-    assert(data2);
-    free(data2);
-    free(data);
-}
--- a/pg_states.cpp
+++ b/pg_states.cpp
@@ -1,33 +0,0 @@
-#include "pg_states.h"
-
-const int pg_state_bit_count = 13;
-
-const int pg_state_bits[13] = {
-    PG_STARTING,
-    PG_PEERING,
-    PG_INCOMPLETE,
-    PG_ACTIVE,
-    PG_STOPPING,
-    PG_OFFLINE,
-    PG_DEGRADED,
-    PG_HAS_INCOMPLETE,
-    PG_HAS_DEGRADED,
-    PG_HAS_MISPLACED,
-    PG_HAS_UNCLEAN,
-    PG_LEFT_ON_DEAD,
-};
-
-const char *pg_state_names[13] = {
-    "starting",
-    "peering",
-    "incomplete",
-    "active",
-    "stopping",
-    "offline",
-    "degraded",
-    "has_incomplete",
-    "has_degraded",
-    "has_misplaced",
-    "has_unclean",
-    "left_on_dead",
-};
--- a/pg_states.h
+++ b/pg_states.h
@@ -1,33 +0,0 @@
-#pragma once
-
-// Placement group states
-// STARTING -> [acquire lock] -> PEERING -> INCOMPLETE|ACTIVE -> STOPPING -> OFFLINE -> [release lock]
-// Exactly one of these:
-#define PG_STARTING (1<<0)
-#define PG_PEERING (1<<1)
-#define PG_INCOMPLETE (1<<2)
-#define PG_ACTIVE (1<<3)
-#define PG_STOPPING (1<<4)
-#define PG_OFFLINE (1<<5)
-// Plus any of these:
-#define PG_DEGRADED (1<<6)
-#define PG_HAS_INCOMPLETE (1<<7)
-#define PG_HAS_DEGRADED (1<<8)
-#define PG_HAS_MISPLACED (1<<9)
-#define PG_HAS_UNCLEAN (1<<10)
-#define PG_LEFT_ON_DEAD (1<<11)
-
-// FIXME: Safe default that doesn't depend on pg_stripe_size or pg_block_size
-#define STRIPE_MASK ((uint64_t)4096 - 1)
-
-// OSD object states
-#define OBJ_DEGRADED 0x02
-#define OBJ_INCOMPLETE 0x04
-#define OBJ_MISPLACED 0x08
-#define OBJ_NEEDS_STABLE 0x10000
-#define OBJ_NEEDS_ROLLBACK 0x20000
-#define OBJ_BUGGY 0x80000
-
-extern const int pg_state_bits[];
-extern const char *pg_state_names[];
-extern const int pg_state_bit_count;
--- a/ringloop.cpp
+++ b/ringloop.cpp
@@ -18,7 +18,6 @@ ring_loop_t::ring_loop_t(int qd)
    {
        free_ring_data[i] = i;
    }
-    wait_sqe_id = 1;
 }

 ring_loop_t::~ring_loop_t()
@@ -28,10 +27,11 @@ ring_loop_t::~ring_loop_t()
    io_uring_queue_exit(&ring);
 }

-void ring_loop_t::register_consumer(ring_consumer_t *consumer)
+int ring_loop_t::register_consumer(ring_consumer_t & consumer)
 {
-    unregister_consumer(consumer);
+    consumer.number = consumers.size();
    consumers.push_back(consumer);
+    return consumer.number;
 }

 void ring_loop_t::wakeup()
@@ -39,15 +39,12 @@ void ring_loop_t::wakeup()
    loop_again = true;
 }

-void ring_loop_t::unregister_consumer(ring_consumer_t *consumer)
+void ring_loop_t::unregister_consumer(ring_consumer_t & consumer)
 {
-    for (int i = 0; i < consumers.size(); i++)
+    if (consumer.number >= 0 && consumer.number < consumers.size())
    {
-        if (consumers[i] == consumer)
-        {
-            consumers.erase(consumers.begin()+i, consumers.begin()+i+1);
-            break;
-        }
+        consumers[consumer.number].loop = NULL;
+        consumer.number = -1;
    }
 }

@@ -65,17 +62,12 @@ void ring_loop_t::loop()
        free_ring_data[free_ring_data_ptr++] = d - ring_datas;
        io_uring_cqe_seen(&ring, cqe);
    }
-    while (get_sqe_queue.size() > 0)
-    {
-        (get_sqe_queue[0].second)();
-        get_sqe_queue.erase(get_sqe_queue.begin());
-    }
    do
    {
        loop_again = false;
        for (int i = 0; i < consumers.size(); i++)
        {
-            consumers[i]->loop();
+            consumers[i].loop();
        }
    } while (loop_again);
 }
--- a/ringloop.h
+++ b/ringloop.h
@@ -113,24 +113,23 @@ struct ring_data_t

 struct ring_consumer_t
 {
+    int number;
    std::function<void(void)> loop;
 };

 class ring_loop_t
 {
-    std::vector<std::pair<int,std::function<void()>>> get_sqe_queue;
-    std::vector<ring_consumer_t*> consumers;
+    std::vector<ring_consumer_t> consumers;
    struct ring_data_t *ring_datas;
    int *free_ring_data;
-    int wait_sqe_id;
    unsigned free_ring_data_ptr;
    bool loop_again;
    struct io_uring ring;
 public:
    ring_loop_t(int qd);
    ~ring_loop_t();
-    void register_consumer(ring_consumer_t *consumer);
-    void unregister_consumer(ring_consumer_t *consumer);
+    int register_consumer(ring_consumer_t & consumer);
+    void unregister_consumer(ring_consumer_t & consumer);

    inline struct io_uring_sqe* get_sqe()
    {
@@ -141,35 +140,19 @@ public:
            io_uring_sqe_set_data(sqe, ring_datas + free_ring_data[--free_ring_data_ptr]);
        return sqe;
    }
-    inline int wait_sqe(std::function<void()> cb)
-    {
-        get_sqe_queue.push_back({ wait_sqe_id, cb });
-        return wait_sqe_id++;
-    }
-    inline void cancel_wait_sqe(int wait_id)
-    {
-        for (int i = 0; i < get_sqe_queue.size(); i++)
-        {
-            if (get_sqe_queue[i].first == wait_id)
-            {
-                get_sqe_queue.erase(get_sqe_queue.begin()+i, get_sqe_queue.begin()+i+1);
-            }
-        }
-    }
    inline int submit()
    {
        return io_uring_submit(&ring);
    }
    inline int wait()
    {
-        struct io_uring_cqe *cqe;
-        return io_uring_wait_cqe(&ring, &cqe);
+        return io_uring_submit_and_wait(&ring, 1);
    }
    inline unsigned space_left()
    {
        return free_ring_data_ptr;
    }
-    inline bool has_work()
+    inline bool get_loop_again()
    {
        return loop_again;
    }
--- a/rw_blocking.cpp
+++ b/rw_blocking.cpp
@@ -51,40 +51,6 @@ int write_blocking(int fd, void *write_buf, size_t remaining)
    return done;
 }

-int readv_blocking(int fd, iovec *iov, int iovcnt)
-{
-    int v = 0;
-    int done = 0;
-    while (v < iovcnt)
-    {
-        ssize_t r = readv(fd, iov, iovcnt);
-        if (r < 0)
-        {
-            if (errno != EAGAIN && errno != EPIPE)
-            {
-                perror("writev");
-                exit(1);
-            }
-            continue;
-        }
-        while (v < iovcnt)
-        {
-            if (iov[v].iov_len > r)
-            {
-                iov[v].iov_len -= r;
-                iov[v].iov_base += r;
-                break;
-            }
-            else
-            {
-                v++;
-            }
-        }
-        done += r;
-    }
-    return done;
-}
-
 int writev_blocking(int fd, iovec *iov, int iovcnt)
 {
    int v = 0;
--- a/rw_blocking.h
+++ b/rw_blocking.h
@@ -5,5 +5,4 @@

 int read_blocking(int fd, void *read_buf, size_t remaining);
 int write_blocking(int fd, void *write_buf, size_t remaining);
-int readv_blocking(int fd, iovec *iov, int iovcnt);
 int writev_blocking(int fd, iovec *iov, int iovcnt);
--- a/stub_bench.cpp
+++ b/stub_bench.cpp
@@ -25,37 +25,20 @@ int connect_stub(const char *server_address, int server_port);

 void run_bench(int peer_fd);

-static uint64_t read_sum = 0, read_count = 0;
 static uint64_t write_sum = 0, write_count = 0;
 static uint64_t sync_sum = 0, sync_count = 0;

 void handle_sigint(int sig)
 {
-    printf("4k randread: %lu us avg\n", read_count ? read_sum/read_count : 0);
-    printf("4k randwrite: %lu us avg\n", write_count ? write_sum/write_count : 0);
-    printf("sync: %lu us avg\n", sync_count ? sync_sum/sync_count : 0);
+    printf("4k randwrite: %lu us avg\n", write_count ? write_sum/write_count : 0); // print 0 if no write finished yet: avoids division by zero on early Ctrl-C
+    printf("sync: %lu us avg\n", sync_count ? sync_sum/sync_count : 0); // same guard for the sync counter
     exit(0);
 }

 int main(int narg, char *args[])
 {
-    if (narg < 2)
-    {
-        printf("USAGE: %s SERVER_IP [PORT]\n", args[0]);
-        return 1;
-    }
-    int port = 11203;
-    if (narg >= 3)
-    {
-        port = atoi(args[2]);
-        if (port <= 0 || port >= 65536)
-        {
-            printf("Bad port number\n");
-            return 1;
-        }
-    }
    signal(SIGINT, handle_sigint);
-    int peer_fd = connect_stub(args[1], port);
+    int peer_fd = connect_stub("127.0.0.1", 11203);
    run_bench(peer_fd);
    close(peer_fd);
    return 0;
@@ -115,37 +98,10 @@ void run_bench(int peer_fd)
    osd_any_reply_t reply;
    void *buf = NULL;
    int r;
-    iovec iov[2];
    timespec tv_begin, tv_end;
    clock_gettime(CLOCK_REALTIME, &tv_begin);
    while (1)
    {
-        // read
-        op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
-        op.hdr.id = 1;
-        op.hdr.opcode = OSD_OP_SECONDARY_READ;
-        op.sec_rw.oid.inode = 3;
-        op.sec_rw.oid.stripe = (rand() << 17) % (1 << 29); // 512 MB
-        op.sec_rw.version = 0;
-        op.sec_rw.len = 4096;
-        op.sec_rw.offset = (rand() * op.sec_rw.len) % (1 << 17);
-        r = write_blocking(peer_fd, op.buf, OSD_PACKET_SIZE) == OSD_PACKET_SIZE;
-        if (!r)
-            break;
-        buf = malloc(op.sec_rw.len);
-        iov[0] = { reply.buf, OSD_PACKET_SIZE };
-        iov[1] = { buf, op.sec_rw.len };
-        r = readv_blocking(peer_fd, iov, 2) == (OSD_PACKET_SIZE + op.sec_rw.len);
-        free(buf);
-        if (!r || !check_reply(OSD_PACKET_SIZE, op, reply, op.sec_rw.len))
-            break;
-        clock_gettime(CLOCK_REALTIME, &tv_end);
-        read_count++;
-        read_sum += (
-            (tv_end.tv_sec - tv_begin.tv_sec)*1000000 +
-            tv_end.tv_nsec/1000 - tv_begin.tv_nsec/1000
-        );
-        tv_begin = tv_end;
        // write
        op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
        op.hdr.id = 1;
@@ -157,9 +113,9 @@ void run_bench(int peer_fd)
        op.sec_rw.offset = (rand() * op.sec_rw.len) % (1 << 17);
        buf = malloc(op.sec_rw.len);
        memset(buf, rand() % 255, op.sec_rw.len);
-        iov[0] = { op.buf, OSD_PACKET_SIZE };
-        iov[1] = { buf, op.sec_rw.len };
-        r = writev_blocking(peer_fd, iov, 2) == (OSD_PACKET_SIZE + op.sec_rw.len);
+        r = write_blocking(peer_fd, op.buf, OSD_PACKET_SIZE) == OSD_PACKET_SIZE;
+        if (r)
+            r = write_blocking(peer_fd, buf, op.sec_rw.len) == op.sec_rw.len;
        free(buf);
        if (!r)
            break;
@@ -172,7 +128,6 @@ void run_bench(int peer_fd)
            (tv_end.tv_sec - tv_begin.tv_sec)*1000000 +
            tv_end.tv_nsec/1000 - tv_begin.tv_nsec/1000
        );
-        tv_begin = tv_end;
        // sync/stab
        op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
        op.hdr.id = 1;
@@ -183,12 +138,11 @@ void run_bench(int peer_fd)
        r = read_blocking(peer_fd, reply.buf, OSD_PACKET_SIZE);
        if (!check_reply(r, op, reply, 0))
            break;
-        clock_gettime(CLOCK_REALTIME, &tv_end);
+        clock_gettime(CLOCK_REALTIME, &tv_begin);
        sync_count++;
        sync_sum += (
-            (tv_end.tv_sec - tv_begin.tv_sec)*1000000 +
-            tv_end.tv_nsec/1000 - tv_begin.tv_nsec/1000
+            (tv_begin.tv_sec - tv_end.tv_sec)*1000000 +
+            tv_begin.tv_nsec/1000 - tv_end.tv_nsec/1000
        );
-        tv_begin = tv_end;
    }
 }
--- a/stub_uring_osd.cpp
+++ b/stub_uring_osd.cpp
@@ -1,129 +0,0 @@
-/**
- * Stub "OSD" implemented on top of osd_messenger to test & compare
- * network performance with sync read/write and io_uring
- */
-
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <netinet/in.h>
-#include <netinet/tcp.h>
-#include <arpa/inet.h>
-#include <string.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <errno.h>
-#include <stdlib.h>
-
-#include <stdexcept>
-
-#include "ringloop.h"
-#include "epoll_manager.h"
-#include "messenger.h"
-
-int bind_stub(const char *bind_address, int bind_port);
-
-void stub_exec_op(osd_messenger_t *msgr, osd_op_t *op);
-
-int main(int narg, char *args[])
-{
-    ring_consumer_t looper;
-    ring_loop_t *ringloop = new ring_loop_t(512);
-    epoll_manager_t *epmgr = new epoll_manager_t(ringloop);
-    osd_messenger_t *msgr = new osd_messenger_t();
-    msgr->osd_num = 1351;
-    msgr->tfd = epmgr->tfd;
-    msgr->ringloop = ringloop;
-    msgr->repeer_pgs = [](osd_num_t) {};
-    msgr->exec_op = [msgr](osd_op_t *op) { stub_exec_op(msgr, op); };
-    // Accept new connections
-    int listen_fd = bind_stub("0.0.0.0", 11203);
-    epmgr->set_fd_handler(listen_fd, [listen_fd, msgr](int fd, int events)
-    {
-        msgr->accept_connections(listen_fd);
-    });
-    looper.loop = [msgr, ringloop]()
-    {
-        msgr->read_requests();
-        msgr->send_replies();
-        ringloop->submit();
-    };
-    ringloop->register_consumer(&looper);
-    printf("stub_uring_osd: waiting for clients\n");
-    while (true)
-    {
-        ringloop->loop();
-        ringloop->wait();
-    }
-    delete msgr;
-    delete epmgr;
-    delete ringloop;
-    return 0;
-}
-
-int bind_stub(const char *bind_address, int bind_port)
-{
-    int listen_backlog = 128;
-
-    int listen_fd = socket(AF_INET, SOCK_STREAM, 0);
-    if (listen_fd < 0)
-    {
-        throw std::runtime_error(std::string("socket: ") + strerror(errno));
-    }
-    int enable = 1;
-    setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable));
-
-    sockaddr_in addr;
-    int r;
-    if ((r = inet_pton(AF_INET, bind_address, &addr.sin_addr)) != 1)
-    {
-        close(listen_fd);
-        throw std::runtime_error("bind address "+std::string(bind_address)+(r == 0 ? " is not valid" : ": no ipv4 support"));
-    }
-    addr.sin_family = AF_INET;
-    addr.sin_port = htons(bind_port);
-
-    if (bind(listen_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
-    {
-        close(listen_fd);
-        throw std::runtime_error(std::string("bind: ") + strerror(errno));
-    }
-
-    if (listen(listen_fd, listen_backlog) < 0)
-    {
-        close(listen_fd);
-        throw std::runtime_error(std::string("listen: ") + strerror(errno));
-    }
-
-    fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
-
-    return listen_fd;
-}
-
-void stub_exec_op(osd_messenger_t *msgr, osd_op_t *op)
-{
-    op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
-    op->reply.hdr.id = op->req.hdr.id;
-    op->reply.hdr.opcode = op->req.hdr.opcode;
-    op->send_list.push_back(op->reply.buf, OSD_PACKET_SIZE);
-    if (op->req.hdr.opcode == OSD_OP_SECONDARY_READ)
-    {
-        op->reply.hdr.retval = op->req.sec_rw.len;
-        op->buf = malloc(op->req.sec_rw.len);
-        op->send_list.push_back(op->buf, op->req.sec_rw.len);
-    }
-    else if (op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
-    {
-        op->reply.hdr.retval = op->req.sec_rw.len;
-    }
-    else if (op->req.hdr.opcode == OSD_OP_TEST_SYNC_STAB_ALL)
-    {
-        op->reply.hdr.retval = 0;
-    }
-    else
-    {
-        printf("client %d: unsupported stub opcode: %lu\n", op->peer_fd, op->req.hdr.opcode);
-        op->reply.hdr.retval = -EINVAL;
-    }
-    msgr->outbox_push(op);
-}
--- a/test.cpp
+++ b/test.cpp
@@ -13,7 +13,6 @@
 #include <assert.h>
 #include <stdio.h>
 #include <liburing.h>
-#include <math.h>

 #include <sys/socket.h>
 #include <sys/epoll.h>
@@ -62,6 +61,24 @@ static void test_write(struct io_uring *ring, int fd)
    free(buf);
 }

+class obj_ver_hash
+{
+public:
+    size_t operator()(const obj_ver_id &s) const
+    {
+        size_t seed = 0;
+        spp::hash_combine(seed, s.oid.inode);
+        spp::hash_combine(seed, s.oid.stripe);
+        spp::hash_combine(seed, s.version);
+        return seed;
+    }
+};
+
+inline bool operator == (const obj_ver_id & a, const obj_ver_id & b)
+{
+    return a.oid == b.oid && a.version == b.version;
+}
+
 int main00(int argc, char *argv[])
 {
    // queue with random removal: vector is best :D
@@ -153,9 +170,9 @@ int main0(int argc, char *argv[])
    // btree_map 5M entries monotone -> 0.458s, random -> 5.429s
    // absl::btree_map 5M entries random -> 5.09s
    // sparse_hash_map 5M entries -> 2.193s, random -> 2.586s
-    btree::btree_map<obj_ver_id, dirty_entry> dirty_db;
+    //btree::btree_map<obj_ver_id, dirty_entry> dirty_db;
    //std::map<obj_ver_id, dirty_entry> dirty_db;
-    //spp::sparse_hash_map<obj_ver_id, dirty_entry, obj_ver_hash> dirty_db;
+    spp::sparse_hash_map<obj_ver_id, dirty_entry, obj_ver_hash> dirty_db;
    for (int i = 0; i < 5000000; i++)
    {
        dirty_db[(obj_ver_id){
@@ -165,7 +182,7 @@ int main0(int argc, char *argv[])
            },
            .version = 1,
        }] = (dirty_entry){
-            .state = ST_D_SYNCED,
+            .state = ST_D_META_SYNCED,
            .flags = 0,
            .location = (uint64_t)i << 17,
            .offset = 0,
@@ -320,253 +337,87 @@ int main04(int argc, char *argv[])
    return 0;
 }

-uint64_t jumphash(uint64_t key, int count)
+int main05(int argc, char *argv[])
 {
-    uint64_t b = 0;
-    uint64_t seed = key;
-    for (int j = 1; j < count; j++)
+    // FIXME extract this into a test
+    pg_t pg = {
+        .state = PG_PEERING,
+        .pg_num = 1,
+        .target_set = { 1, 2, 3 },
+        .cur_set = { 1, 2, 3 },
+        .peering_state = new pg_peering_state_t(),
+    };
+    for (uint64_t osd_num = 1; osd_num <= 3; osd_num++)
    {
-        seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
-        if (seed < (UINT64_MAX / (j+1)))
+        pg_list_result_t r = {
+            .buf = (obj_ver_id*)malloc(sizeof(obj_ver_id) * 1024*1024*8),
+            .total_count = 1024*1024*8,
+            .stable_count = (uint64_t)(1024*1024*8 - (osd_num == 1 ? 10 : 0)),
+        };
+        for (uint64_t i = 0; i < r.total_count; i++)
        {
-            b = j;
+            r.buf[i] = {
+                .oid = {
+                    .inode = 1,
+                    .stripe = (i << STRIPE_SHIFT) | (osd_num-1),
+                },
+                .version = (uint64_t)(osd_num == 1 && i >= r.total_count - 10 ? 2 : 1),
+            };
        }
+        pg.peering_state->list_results[osd_num] = r;
    }
-    return b;
-}
-
-void jumphash_prepare(int count, uint64_t *out_weights, uint64_t *in_weights)
-{
-    if (count <= 0)
+    pg.calc_object_states();
+    printf("deviation variants=%ld clean=%lu\n", pg.state_dict.size(), pg.clean_count);
+    for (auto it: pg.state_dict)
    {
-        return;
-    }
-    uint64_t total_weight = in_weights[0];
-    out_weights[0] = UINT64_MAX;
-    for (int j = 1; j < count; j++)
-    {
-        total_weight += in_weights[j];
-        out_weights[j] = UINT64_MAX / total_weight * in_weights[j];
-    }
-}
-
-uint64_t jumphash_weights(uint64_t key, int count, uint64_t *prepared_weights)
-{
-    uint64_t b = 0;
-    uint64_t seed = key;
-    for (int j = 1; j < count; j++)
-    {
-        seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
-        if (seed < prepared_weights[j])
-        {
-            b = j;
-        }
-    }
-    return b;
-}
-
-void jumphash3(uint64_t key, int count, uint64_t *weights, uint64_t *r)
-{
-    r[0] = 0;
-    r[1] = 1;
-    r[2] = 2;
-    uint64_t total_weight = weights[0]+weights[1]+weights[2];
-    uint64_t seed = key;
-    for (int j = 3; j < count; j++)
-    {
-        seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
-        total_weight += weights[j];
-        if (seed < UINT64_MAX*1.0*weights[j]/total_weight)
-            r[0] = j;
-        else
-        {
-            seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
-            if (seed < UINT64_MAX*1.0*weights[j]/total_weight)
-                r[1] = j;
-            else
-            {
-                seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
-                if (seed < UINT64_MAX*1.0*weights[j]/total_weight)
-                    r[2] = j;
-            }
-        }
-    }
-}
-
-uint64_t crush(uint64_t key, int count, uint64_t *weights)
-{
-    uint64_t b = 0;
-    uint64_t seed = 0;
-    uint64_t max = 0;
-    for (int j = 0; j < count; j++)
-    {
-        seed = (key + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
-        seed ^= (j + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
-        seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
-        seed = -log(((double)seed) / (1ul << 32) / (1ul << 32)) * weights[j];
-        if (seed > max)
-        {
-            max = seed;
-            b = j;
-        }
-    }
-    return b;
-}
-
-void crush3(uint64_t key, int count, uint64_t *weights, uint64_t *r, uint64_t total_weight)
-{
-    uint64_t seed = 0;
-    uint64_t max = 0;
-    for (int k1 = 0; k1 < count; k1++)
-    {
-        for (int k2 = k1+1; k2 < count; k2++)
-        {
-            if (k2 == k1)
-            {
-                continue;
-            }
-            for (int k3 = k2+1; k3 < count; k3++)
-            {
-                if (k3 == k1 || k3 == k2)
-                {
-                    continue;
-                }
-                seed = (key + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
-                seed ^= (k1 + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
-                seed ^= (k2 + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
-                seed ^= (k3 + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
-                seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
-                //seed = ((double)seed) / (1ul << 32) / (1ul << 32) * (weights[k1] + weights[k2] + weights[k3]);
-                seed = ((double)seed) / (1ul << 32) / (1ul << 32) * (1 -
-                    (1 - 1.0*weights[k1]/total_weight)*
-                    (1 - 1.0*weights[k2]/total_weight)*
-                    (1 - 1.0*weights[k3]/total_weight)
-                ) * UINT64_MAX;
-                if (seed > max)
-                {
-                    r[0] = k1;
-                    r[1] = k2;
-                    r[2] = k3;
-                    max = seed;
-                }
-            }
-        }
+        printf("dev: state=%lx\n", it.second.state);
    }
+    return 0;
 }

 int main(int argc, char *argv[])
 {
-    int host_count = 6;
-    uint64_t host_weights[] = {
-        34609*3,
-        34931*3,
-        35850+36387+35859,
-        36387,
-        36387*2,
-        36387,
-    };
-    /*int osd_count[] = { 3, 3, 3, 1, 2 };
-    uint64_t osd_weights[][3] = {
-        { 34609, 34609, 34609 },
-        { 34931, 34931, 34931 },
-        { 35850, 36387, 35859 },
-        { 36387 },
-        { 36387, 36387 },
-    };*/
-    uint64_t total_weight = 0;
-    for (int i = 0; i < host_count; i++)
+    timeval fill_start, fill_end, filter_end;
+    spp::sparse_hash_map<object_id, clean_entry> clean_db;
+    //std::map<object_id, clean_entry> clean_db;
+    //btree::btree_map<object_id, clean_entry> clean_db;
+    gettimeofday(&fill_start, NULL);
+    printf("filling\n");
+    uint64_t total = 1024*1024*8*4;
+    clean_db.resize(total);
+    for (uint64_t i = 0; i < total; i++)
    {
-        total_weight += host_weights[i];
+        clean_db[(object_id){
+            .inode = 1,
+            //.stripe = (i << STRIPE_SHIFT),
+            .stripe = (((367*i) % total) << STRIPE_SHIFT),
+        }] = (clean_entry){
+            .version = 1,
+            .location = i << DEFAULT_ORDER,
+        };
    }
-    uint64_t host_weights_prepared[host_count];
-    jumphash_prepare(host_count, host_weights_prepared, host_weights);
-    uint64_t total_pgs[host_count] = { 0 };
-    int pg_count = 256;
-    double uniformity[pg_count] = { 0 };
-    for (uint64_t pg = 1; pg <= pg_count; pg++)
+    gettimeofday(&fill_end, NULL);
+    // no resize():
+    // spp = 17.87s (seq), 41.81s (rand), 3.29s (seq+resize), 8.3s (rand+resize), ~1.3G RAM in all cases
+    // std::unordered_map = 6.14 sec, ~2.3G RAM
+    // std::map = 13 sec (seq), 5.54 sec (rand), ~2.5G RAM
+    // cpp-btree = 2.47 sec (seq) ~1.2G RAM, 20.6 sec (pseudo-random 367*i % total) ~1.5G RAM
+    printf("filled %.2f sec\n", (fill_end.tv_sec - fill_start.tv_sec) + (fill_end.tv_usec - fill_start.tv_usec) / 1000000.0);
+    for (int pg = 0; pg < 100; pg++)
    {
-        uint64_t r[3];
-
-/*
-        // Select first host
-        //r[0] = jumphash_weights(pg, host_count, host_weights_prepared);
-        r[0] = crush(pg, host_count, host_weights);
-        // Select second host
-        uint64_t seed = pg;
-        r[1] = r[0];
-        while (r[1] == r[0])
-        {
-            seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
-            //r[1] = jumphash_weights(seed, host_count, host_weights_prepared);
-            r[1] = crush(seed, host_count, host_weights);
-        }
-        // Select third host
-        seed = pg;
-        r[2] = r[0];
-        while (r[2] == r[0] || r[2] == r[1])
-        {
-            seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
-            //r[2] = jumphash_weights(seed, host_count, host_weights_prepared);
-            r[2] = crush(seed, host_count, host_weights);
-        }
-*/
-
-/*
-        // Select second host
-        uint64_t host_weights1[host_count];
-        for (int i = 0; i < r[0]; i++)
-            host_weights1[i] = host_weights[i];
-        for (int i = r[0]+1; i < host_count; i++)
-            host_weights1[i-1] = host_weights[i];
-        r[1] = crush(pg, host_count-1, host_weights1);
-        // Select third host
-        for (int i = r[1]+1; i < host_count-1; i++)
-            host_weights1[i-1] = host_weights[i];
-        r[2] = crush(pg, host_count-2, host_weights1);
-        // Transform numbers
-        r[2] = r[2] >= r[1] ? 1+r[2] : r[2];
-        r[2] = r[2] >= r[0] ? 1+r[2] : r[2];
-        r[1] = r[1] >= r[0] ? 1+r[1] : r[1];
-*/
-
-        crush3(pg, host_count, host_weights, r, total_weight);
-        uint64_t shift = (2862933555777941757ull*pg + 3037000493ull) % host_count;
-        if (shift == 1)
-        {
-            uint64_t tmp;
-            tmp = r[0];
-            r[0] = r[1];
-            r[1] = r[2];
-            r[2] = tmp;
-        }
-        else if (shift == 2)
-        {
-            uint64_t tmp;
-            tmp = r[0];
-            r[0] = r[2];
-            r[2] = r[1];
-            r[1] = tmp;
-        }
-
-        total_pgs[r[0]]++;
-        total_pgs[r[1]]++;
-        total_pgs[r[2]]++;
-
-        double u = 0;
-        for (int i = 0; i < host_count; i++)
-        {
-            double d = abs(1 - total_pgs[i]/3.0/pg * total_weight/host_weights[i]);
-            u += d;
-        }
-        uniformity[pg-1] = u/host_count;
-
-        printf("pg %lu: hosts %lu, %lu, %lu ; avg deviation = %.2f\n", pg, r[0], r[1], r[2], u/host_count);
+        obj_ver_id* buf1 = (obj_ver_id*)malloc(sizeof(obj_ver_id) * ((total+99)/100));
+        int j = 0;
+        for (auto it: clean_db)
+            if ((it.first % 100) == pg)
+                buf1[j++] = { .oid = it.first, .version = it.second.version };
+        free(buf1);
+        printf("filtered %d\n", j);
    }
-    printf("total PGs: ");
-    for (int i = 0; i < host_count; i++)
-    {
-        printf(i > 0 ? ", %lu (%.2f)" : "%lu (%.2f)", total_pgs[i], total_pgs[i]/3.0/pg_count * total_weight/host_weights[i]);
-    }
-    printf("\n");
+    gettimeofday(&filter_end, NULL);
+    // spp = 42.15 sec / 60 sec (rand)
+    // std::unordered_map = 43.7 sec
+    // std::map = 156.13 sec
+    // cpp-btree = 21.87 sec (seq), 44.33 sec (rand)
+    printf("100 times filter %.2f sec\n", (filter_end.tv_sec - fill_end.tv_sec) + (filter_end.tv_usec - fill_end.tv_usec) / 1000000.0);
    return 0;
 }
--- a/test_blockstore.cpp
+++ b/test_blockstore.cpp
@@ -115,7 +115,7 @@ int main(int narg, char *args[])
        }
    };

-    ringloop->register_consumer(&main_cons);
+    ringloop->register_consumer(main_cons);
    while (1)
    {
        ringloop->loop();
--- a/timerfd_interval.cpp
+++ b/timerfd_interval.cpp
@@ -20,14 +20,14 @@ timerfd_interval::timerfd_interval(ring_loop_t *ringloop, int seconds, std::func
        throw std::runtime_error(std::string("timerfd_settime: ") + strerror(errno));
    }
    consumer.loop = [this]() { loop(); };
-    ringloop->register_consumer(&consumer);
+    ringloop->register_consumer(consumer);
    this->ringloop = ringloop;
    this->callback = cb;
 }

 timerfd_interval::~timerfd_interval()
 {
-    ringloop->unregister_consumer(&consumer);
+    ringloop->unregister_consumer(consumer);
    close(timerfd);
 }

--- a/timerfd_interval.h
+++ b/timerfd_interval.h
@@ -6,6 +6,7 @@ class timerfd_interval
 {
    int wait_state;
    int timerfd;
+    int status;
    ring_loop_t *ringloop;
    ring_consumer_t consumer;
    std::function<void(void)> callback;
--- a/timerfd_manager.cpp
+++ b/timerfd_manager.cpp
@@ -1,159 +0,0 @@
-#include <sys/timerfd.h>
-#include <sys/poll.h>
-#include <sys/epoll.h>
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-#include "timerfd_manager.h"
-
-timerfd_manager_t::timerfd_manager_t(std::function<void(int, std::function<void(int, int)>)> set_fd_handler)
-{
-    this->set_fd_handler = set_fd_handler;
-    wait_state = 0;
-    timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
-    if (timerfd < 0)
-    {
-        throw std::runtime_error(std::string("timerfd_create: ") + strerror(errno));
-    }
-    set_fd_handler(timerfd, [this](int fd, int events)
-    {
-        handle_readable();
-    });
-}
-
-timerfd_manager_t::~timerfd_manager_t()
-{
-    set_fd_handler(timerfd, NULL);
-    close(timerfd);
-}
-
-void timerfd_manager_t::inc_timer(timerfd_timer_t & t)
-{
-    t.next.tv_sec += t.millis/1000;
-    t.next.tv_nsec += (t.millis%1000)*1000000;
-    if (t.next.tv_nsec > 1000000000)
-    {
-        t.next.tv_sec++;
-        t.next.tv_nsec -= 1000000000;
-    }
-}
-
-int timerfd_manager_t::set_timer(uint64_t millis, bool repeat, std::function<void(int)> callback)
-{
-    int timer_id = id++;
-    timespec start;
-    clock_gettime(CLOCK_MONOTONIC, &start);
-    timers.push_back({
-        .id = timer_id,
-        .millis = millis,
-        .start = start,
-        .next = start,
-        .repeat = repeat,
-        .callback = callback,
-    });
-    inc_timer(timers[timers.size()-1]);
-    set_nearest();
-    return timer_id;
-}
-
-void timerfd_manager_t::clear_timer(int timer_id)
-{
-    for (int i = 0; i < timers.size(); i++)
-    {
-        if (timers[i].id == timer_id)
-        {
-            timers.erase(timers.begin()+i, timers.begin()+i+1);
-            if (nearest == i)
-            {
-                nearest = -1;
-                wait_state = wait_state & ~1;
-            }
-            else if (nearest > i)
-            {
-                nearest--;
-            }
-            set_nearest();
-            break;
-        }
-    }
-}
-
-void timerfd_manager_t::set_nearest()
-{
-again:
-    if (!timers.size())
-    {
-        nearest = -1;
-        itimerspec exp = { 0 };
-        if (timerfd_settime(timerfd, 0, &exp, NULL))
-        {
-            throw std::runtime_error(std::string("timerfd_settime: ") + strerror(errno));
-        }
-        wait_state = wait_state & ~1;
-    }
-    else
-    {
-        nearest = 0;
-        for (int i = 1; i < timers.size(); i++)
-        {
-            if (timers[i].next.tv_sec < timers[nearest].next.tv_sec ||
-                timers[i].next.tv_sec == timers[nearest].next.tv_sec &&
-                timers[i].next.tv_nsec < timers[nearest].next.tv_nsec)
-            {
-                nearest = i;
-            }
-        }
-        timespec now;
-        clock_gettime(CLOCK_MONOTONIC, &now);
-        itimerspec exp = {
-            .it_interval = { 0 },
-            .it_value = timers[nearest].next,
-        };
-        exp.it_value.tv_sec -= now.tv_sec;
-        exp.it_value.tv_nsec -= now.tv_nsec;
-        if (exp.it_value.tv_nsec < 0)
-        {
-            exp.it_value.tv_sec--;
-            exp.it_value.tv_nsec += 1000000000;
-        }
-        if (exp.it_value.tv_sec < 0 || !exp.it_value.tv_sec && !exp.it_value.tv_nsec)
-        {
-            // It already happened
-            trigger_nearest();
-            goto again;
-        }
-        if (timerfd_settime(timerfd, 0, &exp, NULL))
-        {
-            throw std::runtime_error(std::string("timerfd_settime: ") + strerror(errno));
-        }
-        wait_state = wait_state | 1;
-    }
-}
-
-void timerfd_manager_t::handle_readable()
-{
-    uint64_t n;
-    size_t res = read(timerfd, &n, 8);
-    if (res == 8 && nearest >= 0)
-    {
-        trigger_nearest();
-    }
-    wait_state = 0;
-    set_nearest();
-}
-
-void timerfd_manager_t::trigger_nearest()
-{
-    int nearest_id = timers[nearest].id;
-    auto cb = timers[nearest].callback;
-    if (timers[nearest].repeat)
-    {
-        inc_timer(timers[nearest]);
-    }
-    else
-    {
-        timers.erase(timers.begin()+nearest, timers.begin()+nearest+1);
-    }
-    cb(nearest_id);
-    nearest = -1;
-}
--- a/timerfd_manager.h
+++ b/timerfd_manager.h
@@ -1,35 +0,0 @@
-#pragma once
-
-#include <time.h>
-#include <vector>
-#include <functional>
-
-struct timerfd_timer_t
-{
-    int id;
-    uint64_t millis;
-    timespec start, next;
-    bool repeat;
-    std::function<void(int)> callback;
-};
-
-class timerfd_manager_t
-{
-    int wait_state = 0;
-    int timerfd;
-    int nearest = -1;
-    int id = 1;
-    std::vector<timerfd_timer_t> timers;
-
-    void inc_timer(timerfd_timer_t & t);
-    void set_nearest();
-    void trigger_nearest();
-    void handle_readable();
-public:
-    std::function<void(int, std::function<void(int, int)>)> set_fd_handler;
-
-    timerfd_manager_t(std::function<void(int, std::function<void(int, int)>)> set_fd_handler);
-    ~timerfd_manager_t();
-    int set_timer(uint64_t millis, bool repeat, std::function<void(int)> callback);
-    void clear_timer(int timer_id);
-};