TRACE

EPOLLLT
Submit
2020-05-28 12:41:08 +03:00 · 2020-05-28 12:41:08 +03:00 · 2020-05-28 12:41:08 +03:00 · 2020-05-27 13:55:25 +03:00 · 2020-05-27 10:58:40 +03:00 · 2020-05-26 22:11:30 +03:00
63 changed files with 8812 additions and 1990 deletions
--- a/46
+++ b/46
@@ -1,8 +1,8 @@
 BLOCKSTORE_OBJS := allocator.o blockstore.o blockstore_impl.o blockstore_init.o blockstore_open.o blockstore_journal.o blockstore_read.o \
-	blockstore_write.o blockstore_sync.o blockstore_stable.o blockstore_rollback.o blockstore_flush.o crc32c.o ringloop.o timerfd_interval.o
+	blockstore_write.o blockstore_sync.o blockstore_stable.o blockstore_rollback.o blockstore_flush.o crc32c.o ringloop.o
 # -fsanitize=address
 CXXFLAGS := -g -O3 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fPIC -fdiagnostics-color=always
-all: $(BLOCKSTORE_OBJS) libfio_blockstore.so osd libfio_sec_osd.so test_blockstore stub_osd stub_bench osd_test
+all: $(BLOCKSTORE_OBJS) libfio_blockstore.so osd libfio_sec_osd.so stub_osd stub_bench osd_test dump_journal
 clean:
 	rm -f *.o

@@ -16,16 +16,24 @@ ringloop.o: ringloop.cpp ringloop.h
 	g++ $(CXXFLAGS) -c -o $@ $<
 timerfd_interval.o: timerfd_interval.cpp timerfd_interval.h ringloop.h
 	g++ $(CXXFLAGS) -c -o $@ $<
-
-%.o: %.cpp allocator.h blockstore_flush.h blockstore.h blockstore_impl.h blockstore_init.h blockstore_journal.h crc32c.h ringloop.h timerfd_interval.h object_id.h
+timerfd_manager.o: timerfd_manager.cpp timerfd_manager.h ringloop.h
 	g++ $(CXXFLAGS) -c -o $@ $<

+%.o: %.cpp allocator.h blockstore_flush.h blockstore.h blockstore_impl.h blockstore_init.h blockstore_journal.h crc32c.h ringloop.h object_id.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+dump_journal: dump_journal.cpp crc32c.o blockstore_journal.h
+	g++ $(CXXFLAGS) -o $@ $< crc32c.o
+
 libblockstore.so: $(BLOCKSTORE_OBJS)
 	g++ $(CXXFLAGS) -o libblockstore.so -shared $(BLOCKSTORE_OBJS) -ltcmalloc_minimal -luring
 libfio_blockstore.so: ./libblockstore.so fio_engine.cpp json11.o
 	g++ $(CXXFLAGS) -shared -o libfio_blockstore.so fio_engine.cpp json11.o ./libblockstore.so -ltcmalloc_minimal -luring

-OSD_OBJS := osd.o osd_secondary.o osd_receive.o osd_send.o osd_peering.o osd_peering_pg.o osd_primary.o osd_rmw.o json11.o timerfd_interval.o
+OSD_OBJS := osd.o osd_secondary.o osd_receive.o osd_send.o osd_peering.o osd_flush.o osd_peering_pg.o \
+	osd_primary.o osd_primary_subops.o etcd_state_client.o cluster_client.o osd_cluster.o http_client.o pg_states.o \
+	osd_rmw.o json11.o base64.o timerfd_manager.o
+base64.o: base64.cpp base64.h
+	g++ $(CXXFLAGS) -c -o $@ $<
 osd_secondary.o: osd_secondary.cpp osd.h osd_ops.h ringloop.h
 	g++ $(CXXFLAGS) -c -o $@ $<
 osd_receive.o: osd_receive.cpp osd.h osd_ops.h ringloop.h
@@ -34,15 +42,29 @@ osd_send.o: osd_send.cpp osd.h osd_ops.h ringloop.h
 	g++ $(CXXFLAGS) -c -o $@ $<
 osd_peering.o: osd_peering.cpp osd.h osd_ops.h osd_peering_pg.h ringloop.h
 	g++ $(CXXFLAGS) -c -o $@ $<
-osd_peering_pg.o: osd_peering_pg.cpp object_id.h osd_peering_pg.h
+osd_cluster.o: osd_cluster.cpp osd.h osd_ops.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+http_client.o: http_client.cpp http_client.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+etcd_state_client.o: etcd_state_client.cpp etcd_state_client.h http_client.h pg_states.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+cluster_client.o: cluster_client.cpp cluster_client.h osd_ops.h timerfd_manager.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_flush.o: osd_flush.cpp osd.h osd_ops.h osd_peering_pg.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_peering_pg.o: osd_peering_pg.cpp object_id.h osd_peering_pg.h pg_states.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+pg_states.o: pg_states.cpp pg_states.h
 	g++ $(CXXFLAGS) -c -o $@ $<
 osd_rmw.o: osd_rmw.cpp osd_rmw.h xor.h
 	g++ $(CXXFLAGS) -c -o $@ $<
 osd_rmw_test: osd_rmw_test.cpp osd_rmw.cpp osd_rmw.h xor.h
 	g++ $(CXXFLAGS) -o $@ $<
-osd_primary.o: osd_primary.cpp osd.h osd_ops.h osd_peering_pg.h xor.h ringloop.h
+osd_primary.o: osd_primary.cpp osd_primary.h osd_rmw.h osd.h osd_ops.h osd_peering_pg.h xor.h ringloop.h
 	g++ $(CXXFLAGS) -c -o $@ $<
-osd.o: osd.cpp osd.h osd_ops.h osd_peering_pg.h ringloop.h
+osd_primary_subops.o: osd_primary_subops.cpp osd_primary.h osd_rmw.h osd.h osd_ops.h osd_peering_pg.h xor.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd.o: osd.cpp osd.h http_client.h osd_ops.h osd_peering_pg.h ringloop.h
 	g++ $(CXXFLAGS) -c -o $@ $<
 osd: ./libblockstore.so osd_main.cpp osd.h osd_ops.h $(OSD_OBJS)
 	g++ $(CXXFLAGS) -o osd osd_main.cpp $(OSD_OBJS) ./libblockstore.so -ltcmalloc_minimal -luring
@@ -54,13 +76,15 @@ rw_blocking.o: rw_blocking.cpp rw_blocking.h
 	g++ $(CXXFLAGS) -c -o $@ $<
 osd_test: osd_test.cpp osd_ops.h rw_blocking.o
 	g++ $(CXXFLAGS) -o osd_test osd_test.cpp rw_blocking.o -ltcmalloc_minimal
+osd_peering_pg_test: osd_peering_pg_test.cpp osd_peering_pg.o
+	g++ $(CXXFLAGS) -o $@ $< osd_peering_pg.o -ltcmalloc_minimal

 libfio_sec_osd.so: fio_sec_osd.cpp osd_ops.h rw_blocking.o
 	g++ $(CXXFLAGS) -ltcmalloc_minimal -shared -o libfio_sec_osd.so fio_sec_osd.cpp rw_blocking.o -luring

-test_blockstore: ./libblockstore.so test_blockstore.cpp
-	g++ $(CXXFLAGS) -o test_blockstore test_blockstore.cpp ./libblockstore.so -ltcmalloc_minimal -luring
+test_blockstore: ./libblockstore.so test_blockstore.cpp timerfd_interval.o
+	g++ $(CXXFLAGS) -o test_blockstore test_blockstore.cpp timerfd_interval.o ./libblockstore.so -ltcmalloc_minimal -luring
 test: test.cpp osd_peering_pg.o
-	g++ $(CXXFLAGS) -o test test.cpp osd_peering_pg.o -luring
+	g++ $(CXXFLAGS) -o test test.cpp osd_peering_pg.o -luring -lm
 test_allocator: test_allocator.cpp allocator.o
 	g++ $(CXXFLAGS) -o test_allocator test_allocator.cpp allocator.o
--- a/base64.cpp
+++ b/base64.cpp
@@ -0,0 +1,64 @@
+#include "base64.h"
+
+// Standard base64 alphabet: the character at index i encodes the 6-bit value i
+static const char base64_alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+// Encode the bytes of <in> as base64 text, padded with '=' to a multiple of 4 characters
+std::string base64_encode(const std::string &in)
+{
+    std::string out;
+    // val accumulates input bits, (valb + 6) is the number of bits in it that are not emitted yet
+    unsigned val = 0;
+    int valb = -6;
+    for (unsigned char c: in)
+    {
+        val = (val << 8) + c;
+        valb += 8;
+        while (valb >= 0)
+        {
+            out.push_back(base64_alphabet[(val>>valb) & 0x3F]);
+            valb -= 6;
+        }
+    }
+    // Emit the remaining 2 or 4 bits, left-aligned in the last 6-bit group
+    if (valb > -6)
+        out.push_back(base64_alphabet[((val<<8)>>(valb+8)) & 0x3F]);
+    while (out.size() % 4)
+        out.push_back('=');
+    return out;
+}
+
+// Reverse lookup table: byte -> 6-bit value, or -1 for bytes outside of the alphabet.
+// Explicitly signed because plain char is unsigned on some platforms (for example, ARM Linux);
+// there the (T[c] == -1) check would never match and invalid bytes would be decoded as data.
+// Filled lazily on the first base64_decode() call; T[0] == 0 means "not filled yet"
+static signed char T[256] = { 0 };
+
+// Decode base64 text. Decoding stops at the first byte outside of the alphabet (including '=' padding)
+std::string base64_decode(const std::string &in)
+{
+    std::string out;
+    if (T[0] == 0)
+    {
+        for (int i = 0; i < 256; i++)
+            T[i] = -1;
+        for (int i = 0; i < 64; i++)
+            T[(unsigned char)(base64_alphabet[i])] = i;
+    }
+    // (valb + 8) is the number of accumulated bits in val that are not emitted yet
+    unsigned val = 0;
+    int valb = -8;
+    for (unsigned char c: in)
+    {
+        if (T[c] == -1)
+            break;
+        val = (val<<6) + T[c];
+        valb += 6;
+        if (valb >= 0)
+        {
+            out.push_back(char((val >> valb) & 0xFF));
+            valb -= 8;
+        }
+    }
+    return out;
+}
--- a/base64.h
+++ b/base64.h
@@ -0,0 +1,5 @@
+#pragma once
+#include <string>
+
+std::string base64_encode(const std::string &in);
+std::string base64_decode(const std::string &in);
--- a/blockstore.cpp
+++ b/blockstore.cpp
@@ -55,6 +55,11 @@ uint64_t blockstore_t::get_block_count()
    return impl->get_block_count();
 }

+uint64_t blockstore_t::get_free_block_count()
+{
+    return impl->get_free_block_count();
+}
+
 uint32_t blockstore_t::get_disk_alignment()
 {
    return impl->get_disk_alignment();
--- a/blockstore.h
+++ b/blockstore.h
@@ -15,7 +15,9 @@

 // Memory alignment for direct I/O (usually 512 bytes)
 // All other alignments must be a multiple of this one
+#ifndef MEM_ALIGNMENT
 #define MEM_ALIGNMENT 512
+#endif

 // Default block size is 128 KB, current allowed range is 4K - 128M
 #define DEFAULT_ORDER 17
@@ -50,6 +52,7 @@ Input:
  - version == 0: read the last stable version,
  - version == UINT64_MAX: read the last version,
  - otherwise: read the newest version that is <= the specified version
+  - reads aren't guaranteed to return data from previous unfinished writes
  For writes:
  - if version == 0, a new version is assigned automatically
  - if version != 0, it is assigned for the new write if possible, otherwise -EINVAL is returned
@@ -92,7 +95,7 @@ Input:
 - buf = pre-allocated obj_ver_id array <len> units long

 Output:
- retval = 0 or negative error number (-EINVAL)
+- retval = 0 or negative error number (-EINVAL, -ENOENT if no such version or -EBUSY if not synced)

 ## BS_OP_SYNC_STAB_ALL

@@ -175,6 +178,7 @@ public:
    // FIXME rename to object_size
    uint32_t get_block_size();
    uint64_t get_block_count();
+    uint64_t get_free_block_count();

    uint32_t get_disk_alignment();
 };
--- a/blockstore_flush.cpp
+++ b/blockstore_flush.cpp
@@ -4,8 +4,10 @@ journal_flusher_t::journal_flusher_t(int flusher_count, blockstore_impl_t *bs)
 {
    this->bs = bs;
    this->flusher_count = flusher_count;
+    dequeuing = false;
    active_flushers = 0;
-    sync_threshold = flusher_count == 1 ? 1 : flusher_count/2;
+    syncing_flushers = 0;
+    sync_threshold = bs->journal_block_size / sizeof(journal_entry_stable);
    journal_trim_interval = sync_threshold;
    journal_trim_counter = 0;
    journal_superblock = bs->journal.inmemory ? bs->journal.buffer : memalign(MEM_ALIGNMENT, bs->journal_block_size);
@@ -55,17 +57,13 @@ journal_flusher_t::~journal_flusher_t()

 bool journal_flusher_t::is_active()
 {
-    return active_flushers > 0 || start_forced && flush_queue.size() > 0 || flush_queue.size() >= sync_threshold;
+    return active_flushers > 0 || dequeuing;
 }

 void journal_flusher_t::loop()
 {
-    for (int i = 0; i < flusher_count; i++)
+    for (int i = 0; (active_flushers > 0 || dequeuing) && i < flusher_count; i++)
    {
-        if (!active_flushers && (start_forced ? !flush_queue.size() : (flush_queue.size() < sync_threshold)))
-        {
-            return;
-        }
        co[i].loop();
    }
 }
@@ -83,6 +81,11 @@ void journal_flusher_t::enqueue_flush(obj_ver_id ov)
        flush_versions[ov.oid] = ov.version;
        flush_queue.push_back(ov.oid);
    }
+    if (!dequeuing && flush_queue.size() >= sync_threshold)
+    {
+        dequeuing = true;
+        bs->ringloop->wakeup();
+    }
 }

 void journal_flusher_t::unshift_flush(obj_ver_id ov)
@@ -98,16 +101,26 @@ void journal_flusher_t::unshift_flush(obj_ver_id ov)
        flush_versions[ov.oid] = ov.version;
        flush_queue.push_front(ov.oid);
    }
+    if (!dequeuing && flush_queue.size() >= sync_threshold)
+    {
+        dequeuing = true;
+        bs->ringloop->wakeup();
+    }
 }

 void journal_flusher_t::force_start()
 {
-    start_forced = true;
+    dequeuing = true;
    bs->ringloop->wakeup();
 }

 #define await_sqe(label) \
    resume_##label:\
+        {\
+            timespec now;\
+            clock_gettime(CLOCK_REALTIME, &now);\
+            printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);\
+        }\
        sqe = bs->get_sqe();\
        if (!sqe)\
        {\
@@ -116,6 +129,7 @@ void journal_flusher_t::force_start()
        }\
        data = ((ring_data_t*)sqe->user_data);

+// FIXME: Implement batch flushing
 bool journal_flusher_co::loop()
 {
    // This is much better than implementing the whole function as an FSM
@@ -155,10 +169,9 @@ bool journal_flusher_co::loop()
    else if (wait_state == 18)
        goto resume_18;
 resume_0:
-    if (!flusher->flush_queue.size() ||
-        !flusher->start_forced && !flusher->active_flushers && flusher->flush_queue.size() < flusher->sync_threshold)
+    if (!flusher->flush_queue.size() || !flusher->dequeuing)
    {
-        flusher->start_forced = false;
+        flusher->dequeuing = false;
        wait_state = 0;
        return true;
    }
@@ -169,6 +182,16 @@ resume_0:
    dirty_end = bs->dirty_db.find(cur);
    if (dirty_end != bs->dirty_db.end())
    {
+        if (dirty_end->second.journal_sector >= bs->journal.dirty_start &&
+            (bs->journal.dirty_start >= bs->journal.used_start ||
+            dirty_end->second.journal_sector < bs->journal.used_start))
+        {
+            // We can't flush journal sectors that are still written to
+            flusher->enqueue_flush(cur);
+            flusher->dequeuing = false;
+            wait_state = 0;
+            return true;
+        }
        repeat_it = flusher->sync_to_repeat.find(cur.oid);
        if (repeat_it != flusher->sync_to_repeat.end())
        {
@@ -331,6 +354,7 @@ resume_1:
        else
        {
            clean_disk_entry *new_entry = (clean_disk_entry*)(meta_new.buf + meta_new.pos*bs->clean_entry_size);
+            assert(new_entry->oid.inode == 0 || new_entry->oid == cur.oid);
            new_entry->oid = cur.oid;
            new_entry->version = cur.version;
            if (!bs->inmemory_meta)
@@ -632,7 +656,8 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
        });
    sync_found:
        cur_sync->ready_count++;
-        if (cur_sync->ready_count >= flusher->sync_threshold || !flusher->flush_queue.size())
+        flusher->syncing_flushers++;
+        if (flusher->syncing_flushers >= flusher->flusher_count || !flusher->flush_queue.size())
        {
            // Sync batch is ready. Do it.
            await_sqe(0);
@@ -658,6 +683,7 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
            wait_state = 2;
            return false;
        }
+        flusher->syncing_flushers--;
        cur_sync->ready_count--;
        if (cur_sync->ready_count == 0)
        {
--- a/blockstore_flush.h
+++ b/blockstore_flush.h
@@ -46,7 +46,7 @@ class journal_flusher_co
    std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;

    bool skip_copy, has_delete, has_empty;
-    spp::sparse_hash_map<object_id, clean_entry>::iterator clean_it;
+    blockstore_clean_db_t::iterator clean_it;
    std::vector<copy_buffer_t> v;
    std::vector<copy_buffer_t>::iterator it;
    int copy_count;
@@ -73,7 +73,7 @@ public:
 // Journal flusher itself
 class journal_flusher_t
 {
-    bool start_forced = false;
+    bool dequeuing;
    int flusher_count;
    int sync_threshold;
    journal_flusher_co *co;
@@ -84,6 +84,7 @@ class journal_flusher_t
    void* journal_superblock;

    int active_flushers;
+    int syncing_flushers;
    std::list<flusher_sync_t> syncs;
    std::map<object_id, uint64_t> sync_to_repeat;

--- a/blockstore_impl.cpp
+++ b/blockstore_impl.cpp
@@ -5,7 +5,7 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
    assert(sizeof(blockstore_op_private_t) <= BS_OP_PRIVATE_DATA_SIZE);
    this->ringloop = ringloop;
    ring_consumer.loop = [this]() { loop(); };
-    ringloop->register_consumer(ring_consumer);
+    ringloop->register_consumer(&ring_consumer);
    initialized = 0;
    zero_object = (uint8_t*)memalign(MEM_ALIGNMENT, block_size);
    data_fd = meta_fd = journal.fd = -1;
@@ -36,7 +36,7 @@ blockstore_impl_t::~blockstore_impl_t()
    delete data_alloc;
    delete flusher;
    free(zero_object);
-    ringloop->unregister_consumer(ring_consumer);
+    ringloop->unregister_consumer(&ring_consumer);
    if (data_fd >= 0)
        close(data_fd);
    if (meta_fd >= 0 && meta_fd != data_fd)
@@ -98,10 +98,19 @@ void blockstore_impl_t::loop()
    {
        // try to submit ops
        unsigned initial_ring_space = ringloop->space_left();
+        // FIXME: rework this "sync polling"
        auto cur_sync = in_progress_syncs.begin();
        while (cur_sync != in_progress_syncs.end())
        {
-            continue_sync(*cur_sync++);
+            if (continue_sync(*cur_sync) != 2)
+            {
+                // List is unmodified
+                cur_sync++;
+            }
+            else
+            {
+                cur_sync = in_progress_syncs.begin();
+            }
        }
        auto cur = submit_queue.begin();
        int has_writes = 0;
@@ -136,7 +145,7 @@ void blockstore_impl_t::loop()
            }
            unsigned ring_space = ringloop->space_left();
            unsigned prev_sqe_pos = ringloop->save();
-            int dequeue_op = 0;
+            bool dequeue_op = false;
            if (op->opcode == BS_OP_READ)
            {
                dequeue_op = dequeue_read(op);
@@ -166,16 +175,33 @@ void blockstore_impl_t::loop()
            }
            else if (op->opcode == BS_OP_STABLE)
            {
+                if (has_writes == 2)
+                {
+                    // Don't submit additional flushes before completing previous LISTs
+                    break;
+                }
                dequeue_op = dequeue_stable(op);
            }
            else if (op->opcode == BS_OP_ROLLBACK)
            {
+                if (has_writes == 2)
+                {
+                    // Don't submit additional flushes before completing previous LISTs
+                    break;
+                }
                dequeue_op = dequeue_rollback(op);
            }
            else if (op->opcode == BS_OP_LIST)
            {
-                process_list(op);
-                dequeue_op = true;
+                // Block LIST operation by previous modifications,
+                // so it always returns a consistent state snapshot
+                if (has_writes == 2 || inflight_writes > 0)
+                    has_writes = 2;
+                else
+                {
+                    process_list(op);
+                    dequeue_op = true;
+                }
            }
            if (dequeue_op)
            {
@@ -196,11 +222,16 @@ void blockstore_impl_t::loop()
        {
            flusher->loop();
        }
+        int ret = ringloop->submit();
+        if (ret < 0)
+        {
+            throw std::runtime_error(std::string("io_uring_submit: ") + strerror(-ret));
+        }
        if ((initial_ring_space - ringloop->space_left()) > 0)
        {
            live = true;
        }
-        queue_stall = !live && !ringloop->get_loop_again();
+        queue_stall = !live && !ringloop->has_work();
        live = false;
    }
 }
@@ -244,19 +275,6 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
        }
        PRIV(op)->wait_for = 0;
    }
-    else if (PRIV(op)->wait_for == WAIT_IN_FLIGHT)
-    {
-        auto dirty_it = dirty_db.find((obj_ver_id){
-            .oid = op->oid,
-            .version = PRIV(op)->wait_detail,
-        });
-        if (dirty_it != dirty_db.end() && IS_IN_FLIGHT(dirty_it->second.state))
-        {
-            // do not submit
-            return;
-        }
-        PRIV(op)->wait_for = 0;
-    }
    else if (PRIV(op)->wait_for == WAIT_JOURNAL)
    {
        if (journal.used_start == PRIV(op)->wait_detail)
@@ -299,12 +317,12 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
            op->len > block_size-op->offset ||
            (op->len % disk_alignment)
        )) ||
-        readonly && op->opcode != BS_OP_READ ||
+        readonly && op->opcode != BS_OP_READ && op->opcode != BS_OP_LIST ||
        first && op->opcode == BS_OP_WRITE)
    {
        // Basic verification not passed
        op->retval = -EINVAL;
-        op->callback(op);
+        std::function<void (blockstore_op_t*)>(op->callback)(op);
        return;
    }
    if (op->opcode == BS_OP_SYNC_STAB_ALL)
@@ -347,19 +365,19 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
    }
    if (op->opcode == BS_OP_WRITE && !enqueue_write(op))
    {
-        op->callback(op);
+        std::function<void (blockstore_op_t*)>(op->callback)(op);
        return;
    }
-    if (0 && op->opcode == BS_OP_SYNC && immediate_commit)
+    if (op->opcode == BS_OP_SYNC && immediate_commit == IMMEDIATE_ALL)
    {
        op->retval = 0;
-        op->callback(op);
+        std::function<void (blockstore_op_t*)>(op->callback)(op);
        return;
    }
    // Call constructor without allocating memory. We'll call destructor before returning op back
    new ((void*)op->private_data) blockstore_op_private_t;
    PRIV(op)->wait_for = 0;
-    PRIV(op)->sync_state = 0;
+    PRIV(op)->op_state = 0;
    PRIV(op)->pending_ops = 0;
    if (!first)
    {
@@ -377,8 +395,8 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
    // Count objects
    uint32_t list_pg = op->offset;
    uint32_t pg_count = op->len;
-    uint64_t parity_block_size = op->oid.stripe;
-    if (pg_count != 0 && (parity_block_size < MIN_BLOCK_SIZE || list_pg >= pg_count))
+    uint64_t pg_stripe_size = op->oid.stripe;
+    if (pg_count != 0 && (pg_stripe_size < MIN_BLOCK_SIZE || list_pg >= pg_count))
    {
        op->retval = -EINVAL;
        FINISH_OP(op);
@@ -389,7 +407,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
    {
        for (auto it = clean_db.begin(); it != clean_db.end(); it++)
        {
-            uint32_t pg = (it->first.inode + it->first.stripe / parity_block_size) % pg_count;
+            uint32_t pg = (it->first.inode + it->first.stripe / pg_stripe_size) % pg_count;
            if (pg == list_pg)
            {
                stable_count++;
@@ -403,7 +421,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
    uint64_t total_count = stable_count;
    for (auto it = dirty_db.begin(); it != dirty_db.end(); it++)
    {
-        if (!pg_count || ((it->first.oid.inode + it->first.oid.stripe / parity_block_size) % pg_count) == list_pg)
+        if (!pg_count || ((it->first.oid.inode + it->first.oid.stripe / pg_stripe_size) % pg_count) == list_pg)
        {
            if (IS_STABLE(it->second.state))
            {
@@ -426,7 +444,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
    int i = 0;
    for (auto it = clean_db.begin(); it != clean_db.end(); it++)
    {
-        if (!pg_count || ((it->first.inode + it->first.stripe / parity_block_size) % pg_count) == list_pg)
+        if (!pg_count || ((it->first.inode + it->first.stripe / pg_stripe_size) % pg_count) == list_pg)
        {
            vers[i++] = {
                .oid = it->first,
@@ -437,7 +455,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
    int j = stable_count;
    for (auto it = dirty_db.begin(); it != dirty_db.end(); it++)
    {
-        if (!pg_count || ((it->first.oid.inode + it->first.oid.stripe / parity_block_size) % pg_count) == list_pg)
+        if (!pg_count || ((it->first.oid.inode + it->first.oid.stripe / pg_stripe_size) % pg_count) == list_pg)
        {
            if (IS_STABLE(it->second.state))
            {
--- a/blockstore_impl.h
+++ b/blockstore_impl.h
@@ -1,7 +1,6 @@
 #pragma once

 #include "blockstore.h"
-#include "timerfd_interval.h"

 #include <sys/types.h>
 #include <sys/ioctl.h>
@@ -16,7 +15,7 @@
 #include <deque>
 #include <new>

-#include "sparsepp/sparsepp/spp.h"
+#include "cpp-btree/btree_map.h"

 #include "allocator.h"

@@ -25,17 +24,17 @@
 // States are not stored on disk. Instead, they're deduced from the journal
 // FIXME: Rename to BS_ST_*

-#define ST_J_IN_FLIGHT 1
-#define ST_J_SUBMITTED 2
-#define ST_J_WRITTEN 3
-#define ST_J_SYNCED 4
-#define ST_J_STABLE 5
+#define ST_J_WAIT_BIG 1
+#define ST_J_IN_FLIGHT 2
+#define ST_J_SUBMITTED 3
+#define ST_J_WRITTEN 4
+#define ST_J_SYNCED 5
+#define ST_J_STABLE 6

 #define ST_D_IN_FLIGHT 15
 #define ST_D_SUBMITTED 16
 #define ST_D_WRITTEN 17
-#define ST_D_META_WRITTEN 19
-#define ST_D_META_SYNCED 20
+#define ST_D_SYNCED 20
 #define ST_D_STABLE 21

 #define ST_DEL_IN_FLIGHT 31
@@ -46,19 +45,28 @@

 #define ST_CURRENT 48

-#define IS_IN_FLIGHT(st) (st == ST_J_IN_FLIGHT || st == ST_D_IN_FLIGHT || st == ST_DEL_IN_FLIGHT || st == ST_J_SUBMITTED || st == ST_D_SUBMITTED || st == ST_DEL_SUBMITTED)
+#define IMMEDIATE_NONE 0
+#define IMMEDIATE_SMALL 1
+#define IMMEDIATE_ALL 2
+
+#define IS_IN_FLIGHT(st) (st == ST_J_WAIT_BIG || st == ST_J_IN_FLIGHT || st == ST_D_IN_FLIGHT || st == ST_DEL_IN_FLIGHT || st == ST_J_SUBMITTED || st == ST_D_SUBMITTED || st == ST_DEL_SUBMITTED)
 #define IS_STABLE(st) (st == ST_J_STABLE || st == ST_D_STABLE || st == ST_DEL_STABLE || st == ST_CURRENT)
-#define IS_SYNCED(st) (IS_STABLE(st) || st == ST_J_SYNCED || st == ST_D_META_SYNCED || st == ST_DEL_SYNCED)
-#define IS_JOURNAL(st) (st >= ST_J_SUBMITTED && st <= ST_J_STABLE)
-#define IS_BIG_WRITE(st) (st >= ST_D_SUBMITTED && st <= ST_D_STABLE)
-#define IS_DELETE(st) (st >= ST_DEL_SUBMITTED && st <= ST_DEL_STABLE)
-#define IS_UNSYNCED(st) (st >= ST_J_SUBMITTED && st <= ST_J_WRITTEN || st >= ST_D_SUBMITTED && st <= ST_D_META_WRITTEN || st >= ST_DEL_SUBMITTED && st <= ST_DEL_WRITTEN)
+#define IS_SYNCED(st) (IS_STABLE(st) || st == ST_J_SYNCED || st == ST_D_SYNCED || st == ST_DEL_SYNCED)
+#define IS_JOURNAL(st) (st >= ST_J_WAIT_BIG && st <= ST_J_STABLE)
+#define IS_BIG_WRITE(st) (st >= ST_D_IN_FLIGHT && st <= ST_D_STABLE)
+#define IS_DELETE(st) (st >= ST_DEL_IN_FLIGHT && st <= ST_DEL_STABLE)
+#define IS_UNSYNCED(st) (st >= ST_J_WAIT_BIG && st <= ST_J_WRITTEN || st >= ST_D_IN_FLIGHT && st <= ST_D_WRITTEN|| st >= ST_DEL_IN_FLIGHT && st <= ST_DEL_WRITTEN)

 #define BS_SUBMIT_GET_SQE(sqe, data) \
    BS_SUBMIT_GET_ONLY_SQE(sqe); \
    struct ring_data_t *data = ((ring_data_t*)sqe->user_data)

 #define BS_SUBMIT_GET_ONLY_SQE(sqe) \
+        {\
+            timespec now;\
+            clock_gettime(CLOCK_REALTIME, &now);\
+            printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);\
+        }\
    struct io_uring_sqe *sqe = get_sqe();\
    if (!sqe)\
    {\
@@ -68,6 +76,11 @@
    }

 #define BS_SUBMIT_GET_SQE_DECL(sqe) \
+        {\
+            timespec now;\
+            clock_gettime(CLOCK_REALTIME, &now);\
+            printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);\
+        }\
    sqe = get_sqe();\
    if (!sqe)\
    {\
@@ -124,8 +137,6 @@ struct __attribute__((__packed__)) dirty_entry

 // Suspend operation until there are more free SQEs
 #define WAIT_SQE 1
-// Suspend operation until version <wait_detail> of object <oid> is written
-#define WAIT_IN_FLIGHT 2
 // Suspend operation until there are <wait_detail> bytes of free space in the journal on disk
 #define WAIT_JOURNAL 3
 // Suspend operation until the next journal sector buffer is free
@@ -139,7 +150,7 @@ struct fulfill_read_t
 };

 #define PRIV(op) ((blockstore_op_private_t*)(op)->private_data)
-#define FINISH_OP(op) PRIV(op)->~blockstore_op_private_t(); op->callback(op)
+#define FINISH_OP(op) PRIV(op)->~blockstore_op_private_t(); std::function<void (blockstore_op_t*)>(op->callback)(op)

 struct blockstore_op_private_t
 {
@@ -147,12 +158,13 @@ struct blockstore_op_private_t
    int wait_for;
    uint64_t wait_detail;
    int pending_ops;
+    int op_state;

    // Read
    std::vector<fulfill_read_t> read_vec;

    // Sync, write
-    uint64_t min_used_journal_sector, max_used_journal_sector;
+    uint64_t min_flushed_journal_sector, max_flushed_journal_sector;

    // Write
    struct iovec iov_zerofill[3];
@@ -161,9 +173,13 @@ struct blockstore_op_private_t
    std::vector<obj_ver_id> sync_big_writes, sync_small_writes;
    int sync_small_checked, sync_big_checked;
    std::list<blockstore_op_t*>::iterator in_progress_ptr;
-    int sync_state, prev_sync_count;
+    int prev_sync_count;
 };

+// https://github.com/algorithm-ninja/cpp-btree
+// https://github.com/greg7mdp/sparsepp/ was used previously, but it was TERRIBLY slow after resizing
+// with sparsepp, random reads dropped to ~700 iops very fast with just as much as ~32k objects in the DB
+typedef btree::btree_map<object_id, clean_entry> blockstore_clean_db_t;
 typedef std::map<obj_ver_id, dirty_entry> blockstore_dirty_db_t;

 #include "blockstore_init.h"
@@ -177,29 +193,30 @@ class blockstore_impl_t
    uint32_t block_size;
    uint64_t meta_offset;
    uint64_t data_offset;
-    uint64_t cfg_journal_size;
+    uint64_t cfg_journal_size, cfg_data_size;
    // Required write alignment and journal/metadata/data areas' location alignment
-    uint32_t disk_alignment = 512;
+    uint32_t disk_alignment = 4096;
    // Journal block size - minimum_io_size of the journal device is the best choice
-    uint64_t journal_block_size = 512;
+    uint64_t journal_block_size = 4096;
    // Metadata block size - minimum_io_size of the metadata device is the best choice
-    uint64_t meta_block_size = 512;
+    uint64_t meta_block_size = 4096;
    // Sparse write tracking granularity. 4 KB is a good choice. Must be a multiple of disk_alignment
    uint64_t bitmap_granularity = 4096;
    bool readonly = false;
+    // By default, Blockstore locks all opened devices exclusively. This option can be used to disable locking
+    bool disable_flock = false;
    // It is safe to disable fsync() if drive write cache is writethrough
    bool disable_data_fsync = false, disable_meta_fsync = false, disable_journal_fsync = false;
    // Enable if you want every operation to be executed with an "implicit fsync"
-    // FIXME Not implemented yet
-    bool immediate_commit = false;
+    // Suitable only for server SSDs with capacitors, requires disabled data and journal fsyncs
+    int immediate_commit = IMMEDIATE_NONE;
    bool inmemory_meta = false;
    int flusher_count;
    /******* END OF OPTIONS *******/

    struct ring_consumer_t ring_consumer;

-    // Another option is https://github.com/algorithm-ninja/cpp-btree
-    spp::sparse_hash_map<object_id, clean_entry> clean_db;
+    blockstore_clean_db_t clean_db;
    uint8_t *clean_bitmap = NULL;
    blockstore_dirty_db_t dirty_db;
    std::list<blockstore_op_t*> submit_queue; // FIXME: funny thing is that vector is better here
@@ -224,6 +241,7 @@ class blockstore_impl_t

    bool live = false, queue_stall = false;
    ring_loop_t *ringloop;
+    int inflight_writes = 0;

    bool stop_sync_submitted;

@@ -264,7 +282,7 @@ class blockstore_impl_t
    bool enqueue_write(blockstore_op_t *op);
    int dequeue_write(blockstore_op_t *op);
    int dequeue_del(blockstore_op_t *op);
-    void ack_write(blockstore_op_t *op);
+    int continue_write(blockstore_op_t *op);
    void release_journal_sectors(blockstore_op_t *op);
    void handle_write_event(ring_data_t *data, blockstore_op_t *op);

@@ -277,11 +295,13 @@ class blockstore_impl_t

    // Stabilize
    int dequeue_stable(blockstore_op_t *op);
+    int continue_stable(blockstore_op_t *op);
    void handle_stable_event(ring_data_t *data, blockstore_op_t *op);
    void stabilize_object(object_id oid, uint64_t max_ver);

    // Rollback
    int dequeue_rollback(blockstore_op_t *op);
+    int continue_rollback(blockstore_op_t *op);
    void handle_rollback_event(ring_data_t *data, blockstore_op_t *op);
    void erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc);

@@ -316,5 +336,6 @@ public:

    inline uint32_t get_block_size() { return block_size; }
    inline uint64_t get_block_count() { return block_count; }
+    inline uint64_t get_free_block_count() { return data_alloc->get_free_count(); }
    inline uint32_t get_disk_alignment() { return disk_alignment; }
 };
--- a/blockstore_init.cpp
+++ b/blockstore_init.cpp
@@ -402,6 +402,7 @@ resume_1:
    }
    // Trim journal on start so we don't stall when all entries are older
    bs->journal.trim();
+    bs->journal.dirty_start = bs->journal.next_free;
    printf(
        "Journal entries loaded: %lu, free journal space: %lu bytes (%lu..%lu is used), free blocks: %lu / %lu\n",
        entries_loaded,
@@ -439,7 +440,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
        {
            journal_entry *je = (journal_entry*)(buf + proc_pos - done_pos + pos);
            if (je->magic != JOURNAL_MAGIC || je_crc32(je) != je->crc32 ||
-                je->type < JE_SMALL_WRITE || je->type > JE_DELETE || started && je->crc32_prev != crc32_last)
+                je->type < JE_MIN || je->type > JE_MAX || started && je->crc32_prev != crc32_last)
            {
                if (pos == 0)
                {
@@ -509,7 +510,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                if (data_crc32 != je->small_write.crc32_data)
                {
                    // journal entry is corrupt, stop here
-                    // interesting thing is that we must clear the corrupt entry if we're not readonly
+                    // interesting thing is that we must clear the corrupt entry if we're not readonly,
+                    // because we don't write next entries in the same journal block
+                    printf("Journal entry data is corrupt (data crc32 %x != %x)\n", data_crc32, je->small_write.crc32_data);
                    memset(buf + proc_pos - done_pos + pos, 0, bs->journal.block_size - pos);
                    bs->journal.next_free = prev_free;
                    init_write_buf = buf + proc_pos - done_pos;
@@ -518,7 +521,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                }
                auto clean_it = bs->clean_db.find(je->small_write.oid);
                if (clean_it == bs->clean_db.end() ||
-                    clean_it->second.version < je->big_write.version)
+                    clean_it->second.version < je->small_write.version)
                {
                    obj_ver_id ov = {
                        .oid = je->small_write.oid,
@@ -555,7 +558,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                        .version = je->big_write.version,
                    };
                    bs->dirty_db.emplace(ov, (dirty_entry){
-                        .state = ST_D_META_SYNCED,
+                        .state = ST_D_SYNCED,
                        .flags = 0,
                        .location = je->big_write.location,
                        .offset = je->big_write.offset,
@@ -592,7 +595,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                {
                    while (1)
                    {
-                        it->second.state = (it->second.state == ST_D_META_SYNCED
+                        it->second.state = (it->second.state == ST_D_SYNCED
                            ? ST_D_STABLE
                            : (it->second.state == ST_DEL_SYNCED ? ST_DEL_STABLE : ST_J_STABLE));
                        if (it == bs->dirty_db.begin())
--- a/blockstore_journal.cpp
+++ b/blockstore_journal.cpp
@@ -6,18 +6,24 @@ blockstore_journal_check_t::blockstore_journal_check_t(blockstore_impl_t *bs)
    sectors_required = 0;
    next_pos = bs->journal.next_free;
    next_sector = bs->journal.cur_sector;
+    first_sector = -1;
    next_in_pos = bs->journal.in_sector_pos;
    right_dir = next_pos >= bs->journal.used_start;
 }

 // Check if we can write <required> entries of <size> bytes and <data_after> data bytes after them to the journal
-int blockstore_journal_check_t::check_available(blockstore_op_t *op, int required, int size, int data_after)
+int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries_required, int size, int data_after)
 {
+    int required = entries_required;
    while (1)
    {
        int fits = (bs->journal.block_size - next_in_pos) / size;
        if (fits > 0)
        {
+            if (first_sector == -1)
+            {
+                first_sector = next_sector;
+            }
            required -= fits;
            next_in_pos += fits * size;
            sectors_required++;
@@ -38,19 +44,40 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int require
            right_dir = false;
        }
        next_in_pos = 0;
-        if (bs->journal.sector_info[next_sector].usage_count > 0 ||
-            bs->journal.sector_info[next_sector].dirty)
+        next_sector = ((next_sector + 1) % bs->journal.sector_count);
+        if (next_sector == first_sector)
        {
-            next_sector = ((next_sector + 1) % bs->journal.sector_count);
+            // next_sector may wrap when all sectors are flushed and the incoming batch is too big
+            // This is an error condition, we can't wait for anything in this case
+            throw std::runtime_error(
+                "Blockstore journal_sector_buffer_count="+std::to_string(bs->journal.sector_count)+
+                " is too small for a batch of "+std::to_string(entries_required)+" entries of "+std::to_string(size)+" bytes"
+            );
        }
        if (bs->journal.sector_info[next_sector].usage_count > 0 ||
            bs->journal.sector_info[next_sector].dirty)
        {
            // No memory buffer available. Wait for it.
-#ifdef BLOCKSTORE_DEBUG
-            printf("next journal buffer %d is still dirty=%d used=%d\n", next_sector,
-                bs->journal.sector_info[next_sector].dirty, bs->journal.sector_info[next_sector].usage_count);
-#endif
+            int used = 0, dirty = 0;
+            for (int i = 0; i < bs->journal.sector_count; i++)
+            {
+                if (bs->journal.sector_info[i].dirty)
+                {
+                    dirty++;
+                    used++;
+                }
+                if (bs->journal.sector_info[i].usage_count > 0)
+                {
+                    used++;
+                }
+            }
+            // In fact, it's even more rare than "ran out of journal space", so print a warning
+            printf(
+                "Ran out of journal sector buffers: %d/%lu buffers used (%d dirty), next buffer (%ld) is %s and flushed %lu times\n",
+                used, bs->journal.sector_count, dirty, next_sector,
+                bs->journal.sector_info[next_sector].dirty ? "dirty" : "not dirty",
+                bs->journal.sector_info[next_sector].usage_count
+            );
            PRIV(op)->wait_for = WAIT_JOURNAL_BUFFER;
            return 0;
        }
@@ -91,6 +118,11 @@ journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type,
        {
            // Also select next sector buffer in memory
            journal.cur_sector = ((journal.cur_sector + 1) % journal.sector_count);
+            assert(!journal.sector_info[journal.cur_sector].usage_count);
+        }
+        else
+        {
+            journal.dirty_start = journal.next_free;
        }
        journal.sector_info[journal.cur_sector].offset = journal.next_free;
        journal.in_sector_pos = 0;
@@ -148,7 +180,7 @@ bool journal_t::trim()
    auto journal_used_it = used_sectors.lower_bound(used_start);
 #ifdef BLOCKSTORE_DEBUG
    printf(
-        "Trimming journal (used_start=%lu, next_free=%lu, first_used=%lu, usage_count=%lu)\n",
+        "Trimming journal (used_start=%08lx, next_free=%08lx, first_used=%08lx, usage_count=%08lx)\n",
        used_start, next_free,
        journal_used_it == used_sectors.end() ? 0 : journal_used_it->first,
        journal_used_it == used_sectors.end() ? 0 : journal_used_it->second
@@ -180,7 +212,7 @@ bool journal_t::trim()
        return false;
    }
 #ifdef BLOCKSTORE_DEBUG
-    printf("Journal trimmed to %lu (next_free=%lu)\n", used_start, next_free);
+    printf("Journal trimmed to %08lx (next_free=%08lx)\n", used_start, next_free);
 #endif
    return true;
 }
--- a/blockstore_journal.h
+++ b/blockstore_journal.h
@@ -12,12 +12,14 @@
 // Journal entries
 // Journal entries are linked to each other by their crc32 value
 // The journal is almost a blockchain, because object versions constantly increase
+#define JE_MIN         0x01 // lowest valid entry type: journal loading rejects entries with type < JE_MIN
 #define JE_START       0x01
 #define JE_SMALL_WRITE 0x02
 #define JE_BIG_WRITE   0x03
 #define JE_STABLE      0x04
 #define JE_DELETE      0x05
 #define JE_ROLLBACK    0x06
+#define JE_MAX         0x06 // highest valid entry type: keep equal to the last JE_* value above when adding types

 // crc32c comes first to ease calculation and is equal to crc32()
 struct __attribute__((__packed__)) journal_entry_start
@@ -135,10 +137,14 @@ struct journal_t
    bool inmemory = false;
    void *buffer = NULL;

-    uint64_t block_size = 512;
+    uint64_t block_size;
    uint64_t offset, len;
+    // Next free block offset
    uint64_t next_free = 0;
+    // First occupied block offset
    uint64_t used_start = 0;
+    // End of the last block not used for writing anymore
+    uint64_t dirty_start = 0;
    uint32_t crc32_last = 0;

    // Current sector(s) used for writing
@@ -160,7 +166,7 @@ struct blockstore_journal_check_t
 {
    blockstore_impl_t *bs;
    uint64_t next_pos, next_sector, next_in_pos;
-    int sectors_required;
+    int sectors_required, first_sector;
    bool right_dir; // writing to the end or the beginning of the ring buffer

    blockstore_journal_check_t(blockstore_impl_t *bs);
--- a/blockstore_open.cpp
+++ b/blockstore_open.cpp
@@ -1,3 +1,4 @@
+#include <sys/file.h>
 #include "blockstore_impl.h"

 static uint32_t is_power_of_two(uint64_t value)
@@ -34,10 +35,23 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
    {
        disable_journal_fsync = true;
    }
+    if (config["disable_device_lock"] == "true" || config["disable_device_lock"] == "1" || config["disable_device_lock"] == "yes")
+    {
+        disable_flock = true;
+    }
+    if (config["immediate_commit"] == "all")
+    {
+        immediate_commit = IMMEDIATE_ALL;
+    }
+    else if (config["immediate_commit"] == "small")
+    {
+        immediate_commit = IMMEDIATE_SMALL;
+    }
    metadata_buf_size = strtoull(config["meta_buf_size"].c_str(), NULL, 10);
    cfg_journal_size = strtoull(config["journal_size"].c_str(), NULL, 10);
    data_device = config["data_device"];
    data_offset = strtoull(config["data_offset"].c_str(), NULL, 10);
+    cfg_data_size = strtoull(config["data_size"].c_str(), NULL, 10);
    meta_device = config["meta_device"];
    meta_offset = strtoull(config["meta_offset"].c_str(), NULL, 10);
    block_size = strtoull(config["block_size"].c_str(), NULL, 10);
@@ -66,7 +80,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
    }
    if (!disk_alignment)
    {
-        disk_alignment = 512;
+        disk_alignment = 4096;
    }
    else if (disk_alignment % MEM_ALIGNMENT)
    {
@@ -74,7 +88,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
    }
    if (!journal_block_size)
    {
-        journal_block_size = 512;
+        journal_block_size = 4096;
    }
    else if (journal_block_size % MEM_ALIGNMENT)
    {
@@ -82,7 +96,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
    }
    if (!meta_block_size)
    {
-        meta_block_size = 512;
+        meta_block_size = 4096;
    }
    else if (meta_block_size % MEM_ALIGNMENT)
    {
@@ -128,6 +142,22 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
    {
        metadata_buf_size = 4*1024*1024;
    }
+    if (meta_device == "")
+    {
+        disable_meta_fsync = disable_data_fsync;
+    }
+    if (journal_device == "")
+    {
+        disable_journal_fsync = disable_meta_fsync;
+    }
+    if (immediate_commit != IMMEDIATE_NONE && !disable_journal_fsync)
+    {
+        throw std::runtime_error("immediate_commit requires disable_journal_fsync");
+    }
+    if (immediate_commit == IMMEDIATE_ALL && !disable_data_fsync)
+    {
+        throw std::runtime_error("immediate_commit=all requires disable_journal_fsync and disable_data_fsync");
+    }
    // init some fields
    clean_entry_bitmap_size = block_size / bitmap_granularity / 8;
    clean_entry_size = sizeof(clean_disk_entry) + clean_entry_bitmap_size;
@@ -151,6 +181,15 @@ void blockstore_impl_t::calc_lengths()
        data_len = data_len < journal.offset-data_offset
            ? data_len : journal.offset-data_offset;
    }
+    if (cfg_data_size != 0)
+    {
+        if (data_len < cfg_data_size)
+        {
+            throw std::runtime_error("Data area ("+std::to_string(data_len)+
+                " bytes) is less than configured size ("+std::to_string(cfg_data_size)+" bytes)");
+        }
+        data_len = cfg_data_size;
+    }
    // meta
    meta_area = (meta_fd == data_fd ? data_size : meta_size) - meta_offset;
    if (meta_fd == data_fd && meta_offset <= data_offset)
@@ -252,6 +291,10 @@ void blockstore_impl_t::open_data()
    {
        throw std::runtime_error("data_offset exceeds device size = "+std::to_string(data_size));
    }
+    if (!disable_flock && flock(data_fd, LOCK_EX|LOCK_NB) != 0)
+    {
+        throw std::runtime_error(std::string("Failed to lock data device: ") + strerror(errno));
+    }
 }

 void blockstore_impl_t::open_meta()
@@ -269,11 +312,14 @@ void blockstore_impl_t::open_meta()
        {
            throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(meta_size));
        }
+        if (!disable_flock && flock(meta_fd, LOCK_EX|LOCK_NB) != 0)
+        {
+            throw std::runtime_error(std::string("Failed to lock metadata device: ") + strerror(errno));
+        }
    }
    else
    {
        meta_fd = data_fd;
-        disable_meta_fsync = disable_data_fsync;
        meta_size = 0;
        if (meta_offset >= data_size)
        {
@@ -291,12 +337,15 @@ void blockstore_impl_t::open_journal()
        {
            throw std::runtime_error("Failed to open journal device");
        }
-        check_size(journal.fd, &journal.device_size, "metadata device");
+        check_size(journal.fd, &journal.device_size, "journal device");
+        if (!disable_flock && flock(journal.fd, LOCK_EX|LOCK_NB) != 0)
+        {
+            throw std::runtime_error(std::string("Failed to lock journal device: ") + strerror(errno));
+        }
    }
    else
    {
        journal.fd = meta_fd;
-        disable_journal_fsync = disable_meta_fsync;
        journal.device_size = 0;
        if (journal.offset >= data_size)
        {
--- a/blockstore_read.cpp
+++ b/blockstore_read.cpp
@@ -8,12 +8,10 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_
        // Zero-length version - skip
        return 1;
    }
-    if (IS_IN_FLIGHT(item_state))
+    else if (IS_IN_FLIGHT(item_state))
    {
-        // Pause until it's written somewhere
-        PRIV(op)->wait_for = WAIT_IN_FLIGHT;
-        PRIV(op)->wait_detail = item_version;
-        return 0;
+        // Write not finished yet - skip
+        return 1;
    }
    else if (IS_DELETE(item_state))
    {
@@ -133,63 +131,66 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
            dirty_it--;
        }
    }
-    if (clean_it != clean_db.end() && fulfilled < read_op->len)
+    if (clean_it != clean_db.end())
    {
        if (!result_version)
        {
            result_version = clean_it->second.version;
        }
-        if (!clean_entry_bitmap_size)
+        if (fulfilled < read_op->len)
        {
-            if (!fulfill_read(read_op, fulfilled, 0, block_size, ST_CURRENT, 0, clean_it->second.location))
+            if (!clean_entry_bitmap_size)
            {
-                // need to wait. undo added requests, don't dequeue op
-                PRIV(read_op)->read_vec.clear();
-                return 0;
-            }
-        }
-        else
-        {
-            uint64_t meta_loc = clean_it->second.location >> block_order;
-            uint8_t *clean_entry_bitmap;
-            if (inmemory_meta)
-            {
-                uint64_t sector = (meta_loc / (meta_block_size / clean_entry_size)) * meta_block_size;
-                uint64_t pos = (meta_loc % (meta_block_size / clean_entry_size));
-                clean_entry_bitmap = (uint8_t*)(metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry));
+                if (!fulfill_read(read_op, fulfilled, 0, block_size, ST_CURRENT, 0, clean_it->second.location))
+                {
+                    // need to wait. undo added requests, don't dequeue op
+                    PRIV(read_op)->read_vec.clear();
+                    return 0;
+                }
            }
            else
            {
-                clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*clean_entry_bitmap_size);
-            }
-            uint64_t bmp_start = 0, bmp_end = 0, bmp_size = block_size/bitmap_granularity;
-            while (bmp_start < bmp_size)
-            {
-                while (!(clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7))) && bmp_end < bmp_size)
+                uint64_t meta_loc = clean_it->second.location >> block_order;
+                uint8_t *clean_entry_bitmap;
+                if (inmemory_meta)
                {
-                    bmp_end++;
+                    uint64_t sector = (meta_loc / (meta_block_size / clean_entry_size)) * meta_block_size;
+                    uint64_t pos = (meta_loc % (meta_block_size / clean_entry_size));
+                    clean_entry_bitmap = (uint8_t*)(metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry));
                }
-                if (bmp_end > bmp_start)
+                else
                {
-                    // fill with zeroes
-                    fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
-                        bmp_end * bitmap_granularity, ST_DEL_STABLE, 0, 0);
+                    clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*clean_entry_bitmap_size);
                }
-                bmp_start = bmp_end;
-                while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size)
+                uint64_t bmp_start = 0, bmp_end = 0, bmp_size = block_size/bitmap_granularity;
+                while (bmp_start < bmp_size)
                {
-                    bmp_end++;
-                }
-                if (bmp_end > bmp_start)
-                {
-                    if (!fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
-                        bmp_end * bitmap_granularity, ST_CURRENT, 0, clean_it->second.location + bmp_start * bitmap_granularity))
+                    while (!(clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7))) && bmp_end < bmp_size)
                    {
-                        // need to wait. undo added requests, don't dequeue op
-                        PRIV(read_op)->read_vec.clear();
-                        return 0;
+                        bmp_end++;
+                    }
+                    if (bmp_end > bmp_start)
+                    {
+                        // fill with zeroes
+                        fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
+                            bmp_end * bitmap_granularity, ST_DEL_STABLE, 0, 0);
                    }
                    bmp_start = bmp_end;
+                    while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size)
+                    {
+                        bmp_end++;
+                    }
+                    if (bmp_end > bmp_start)
+                    {
+                        if (!fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
+                            bmp_end * bitmap_granularity, ST_CURRENT, 0, clean_it->second.location + bmp_start * bitmap_granularity))
+                        {
+                            // need to wait. undo added requests, don't dequeue op
+                            PRIV(read_op)->read_vec.clear();
+                            return 0;
+                        }
+                        bmp_start = bmp_end;
+                    }
                }
            }
        }
--- a/blockstore_rollback.cpp
+++ b/blockstore_rollback.cpp
@@ -2,6 +2,10 @@

 int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
 {
+    if (PRIV(op)->op_state)
+    {
+        return continue_rollback(op);
+    }
    obj_ver_id* v;
    int i, todo = op->len;
    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
@@ -14,8 +18,13 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
        });
        if (dirty_it == dirty_db.begin())
        {
+            if (v->version == 0)
+            {
+                // Already rolled back
+                // FIXME Skip this object version
+            }
        bad_op:
-            op->retval = -EINVAL;
+            op->retval = -ENOENT;
            FINISH_OP(op);
            return 1;
        }
@@ -31,7 +40,9 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
                if (!IS_SYNCED(dirty_it->second.state) ||
                    IS_STABLE(dirty_it->second.state))
                {
-                    goto bad_op;
+                    op->retval = -EBUSY;
+                    FINISH_OP(op);
+                    return 1;
                }
                if (dirty_it == dirty_db.begin())
                {
@@ -60,7 +71,7 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
        journal.sector_info[journal.cur_sector].dirty)
    {
        if (cur_sector == -1)
-            PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
+            PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
        cur_sector = journal.cur_sector;
        prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
    }
@@ -103,13 +114,79 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
        if (cur_sector != journal.cur_sector)
        {
            if (cur_sector == -1)
-                PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
+                PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
            cur_sector = journal.cur_sector;
            prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
        }
    }
-    PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
+    PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
    PRIV(op)->pending_ops = s;
+    PRIV(op)->op_state = 1;
+    inflight_writes++;
+    return 1;
+}
+
+// Resume a rollback operation after dequeue_rollback() has submitted its journal sector writes.
+//
+// The operation is a resumable state machine driven by PRIV(op)->op_state:
+//   1 - journal sector writes submitted by dequeue_rollback(), still in flight
+//   2 - journal sector writes completed (handle_rollback_event() incremented the state)
+//   3 - journal sectors released, journal fsync not submitted yet
+//   4 - journal fsync submitted, still in flight
+//   5 - journal fsync completed (handle_rollback_event() incremented the state)
+//
+// Returns 0 when no submission queue entry is available; the op must then be retried
+// (handle_rollback_event() pushes it back to the front of submit_queue and
+// dequeue_rollback() forwards it here again). Returns 1 in every other case,
+// including the "still in flight" states 1 and 4 where there is nothing to do.
+int blockstore_impl_t::continue_rollback(blockstore_op_t *op)
+{
+    if (PRIV(op)->op_state == 2)
+        goto resume_2;
+    else if (PRIV(op)->op_state == 3)
+        goto resume_3;
+    else if (PRIV(op)->op_state == 5)
+        goto resume_5;
+    else
+        return 1;
+resume_2:
+    // Release used journal sectors
+    release_journal_sectors(op);
+    // Remember that the release is done: if get_sqe() fails below, the retry must
+    // re-enter at resume_3 instead of releasing the same sectors once more.
+    // NOTE(review): release_journal_sectors() is not visible here; confirm whether
+    // a repeated call was harmful or merely redundant.
+    PRIV(op)->op_state = 3;
+resume_3:
+    if (!disable_journal_fsync)
+    {
+        io_uring_sqe *sqe = get_sqe();
+        if (!sqe)
+        {
+            // No free submission queue entry, ask the caller to retry later
+            return 0;
+        }
+        ring_data_t *data = ((ring_data_t*)sqe->user_data);
+        my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
+        // handle_rollback_event() compares data->res with data->iov.iov_len,
+        // so a zeroed iov makes an fsync result of 0 count as success
+        data->iov = { 0 };
+        data->callback = [this, op](ring_data_t *data) { handle_rollback_event(data, op); };
+        // Reset the sector range recorded by dequeue_rollback()
+        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
+        PRIV(op)->pending_ops = 1;
+        PRIV(op)->op_state = 4;
+        return 1;
+    }
+resume_5:
+    obj_ver_id* v;
+    int i;
+    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
+    {
+        // Erase dirty_db entries
+        // rm_end is the first entry past all versions of this object
+        auto rm_end = dirty_db.lower_bound((obj_ver_id){
+            .oid = v->oid,
+            .version = UINT64_MAX,
+        });
+        auto rm_start = rm_end;
+        assert(rm_start != dirty_db.begin());
+        rm_start--;
+        // Walk backwards over versions newer than the rollback target
+        while (1)
+        {
+            if (rm_start->first.oid != v->oid || rm_start->first.version <= v->version)
+            {
+                // Stepped onto an entry that must be kept, so the range starts right after it
+                rm_start++;
+                break;
+            }
+            if (rm_start == dirty_db.begin())
+                break;
+            rm_start--;
+        }
+        erase_dirty(rm_start, rm_end, UINT64_MAX);
+    }
+    journal.trim();
+    // Balances the inflight_writes++ done by dequeue_rollback()
+    inflight_writes--;
+    // Acknowledge op
+    op->retval = 0;
+    FINISH_OP(op);
     return 1;
 }

@@ -118,6 +195,7 @@ void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t
    live = true;
    if (data->res != data->iov.iov_len)
    {
+        inflight_writes--;
        throw std::runtime_error(
            "write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
            "). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
@@ -126,37 +204,11 @@ void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t
    PRIV(op)->pending_ops--;
    if (PRIV(op)->pending_ops == 0)
    {
-        // Release used journal sectors
-        release_journal_sectors(op);
-        obj_ver_id* v;
-        int i;
-        for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
+        PRIV(op)->op_state++;
+        if (!continue_rollback(op))
        {
-            // Erase dirty_db entries
-            auto rm_end = dirty_db.lower_bound((obj_ver_id){
-                .oid = v->oid,
-                .version = UINT64_MAX,
-            });
-            rm_end--;
-            auto rm_start = rm_end;
-            while (1)
-            {
-                if (rm_end->first.oid != v->oid)
-                    break;
-                else if (rm_end->first.version <= v->version)
-                    break;
-                rm_start = rm_end;
-                if (rm_end == dirty_db.begin())
-                    break;
-                rm_end--;
-            }
-            if (rm_end != rm_start)
-                erase_dirty(rm_start, rm_end, UINT64_MAX);
+            submit_queue.push_front(op);
        }
-        journal.trim();
-        // Acknowledge op
-        op->retval = 0;
-        FINISH_OP(op);
    }
 }

--- a/blockstore_stable.cpp
+++ b/blockstore_stable.cpp
@@ -40,6 +40,10 @@

 int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
 {
+    if (PRIV(op)->op_state)
+    {
+        return continue_stable(op);
+    }
    obj_ver_id* v;
    int i, todo = 0;
    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
@@ -51,7 +55,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
            if (clean_it == clean_db.end() || clean_it->second.version < v->version)
            {
                // No such object version
-                op->retval = -EINVAL;
+                op->retval = -ENOENT;
                FINISH_OP(op);
                return 1;
            }
@@ -63,7 +67,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
        else if (IS_UNSYNCED(dirty_it->second.state))
        {
            // Object not synced yet. Caller must sync it first
-            op->retval = EAGAIN;
+            op->retval = -EBUSY;
            FINISH_OP(op);
            return 1;
        }
@@ -98,12 +102,13 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
        journal.sector_info[journal.cur_sector].dirty)
    {
        if (cur_sector == -1)
-            PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
+            PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
        cur_sector = journal.cur_sector;
        prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
    }
    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
    {
+        // FIXME: Only stabilize versions that aren't stable yet
        auto unstab_it = unstable_writes.find(v->oid);
        if (unstab_it != unstable_writes.end() &&
            unstab_it->second <= v->version)
@@ -120,13 +125,101 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
        if (cur_sector != journal.cur_sector)
        {
            if (cur_sector == -1)
-                PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
+                PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
            cur_sector = journal.cur_sector;
            prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
        }
    }
-    PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
+    PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
    PRIV(op)->pending_ops = s;
+    PRIV(op)->op_state = 1;
+    inflight_writes++;
+    return 1;
+}
+
+// Resume a stabilize operation after dequeue_stable() has submitted its journal sector writes.
+//
+// The operation is a resumable state machine driven by PRIV(op)->op_state:
+//   1 - journal sector writes submitted by dequeue_stable(), still in flight
+//   2 - journal sector writes completed (handle_stable_event() incremented the state)
+//   3 - journal sectors released, journal fsync not submitted yet
+//   4 - journal fsync submitted, still in flight
+//   5 - journal fsync completed (handle_stable_event() incremented the state)
+//
+// Returns 0 when no submission queue entry is available; the op must then be retried
+// (handle_stable_event() pushes it back to the front of submit_queue and
+// dequeue_stable() forwards it here again). Returns 1 in every other case,
+// including the "still in flight" states 1 and 4 where there is nothing to do.
+int blockstore_impl_t::continue_stable(blockstore_op_t *op)
+{
+    if (PRIV(op)->op_state == 2)
+        goto resume_2;
+    else if (PRIV(op)->op_state == 3)
+        goto resume_3;
+    else if (PRIV(op)->op_state == 5)
+        goto resume_5;
+    else
+        return 1;
+resume_2:
+    // Release used journal sectors
+    release_journal_sectors(op);
+    // Remember that the release is done: if get_sqe() fails below, the retry must
+    // re-enter at resume_3 instead of releasing the same sectors once more.
+    // NOTE(review): release_journal_sectors() is not visible here; confirm whether
+    // a repeated call was harmful or merely redundant.
+    PRIV(op)->op_state = 3;
+resume_3:
+    if (!disable_journal_fsync)
+    {
+#ifdef BLOCKSTORE_DEBUG
+        // Timing trace for SQE acquisition, compiled in only for debug builds
+        {
+            timespec now;
+            clock_gettime(CLOCK_REALTIME, &now);
+            printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
+        }
+#endif
+        io_uring_sqe *sqe = get_sqe();
+        if (!sqe)
+        {
+            // No free submission queue entry, ask the caller to retry later
+            return 0;
+        }
+        ring_data_t *data = ((ring_data_t*)sqe->user_data);
+        my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
+        // handle_stable_event() compares data->res with data->iov.iov_len,
+        // so a zeroed iov makes an fsync result of 0 count as success
+        data->iov = { 0 };
+        data->callback = [this, op](ring_data_t *data) { handle_stable_event(data, op); };
+        // Reset the sector range recorded by dequeue_stable()
+        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
+        PRIV(op)->pending_ops = 1;
+        PRIV(op)->op_state = 4;
+        return 1;
+    }
+resume_5:
+    // Mark dirty_db entries as stable, acknowledge op completion
+    obj_ver_id* v;
+    int i;
+    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
+    {
+        // Mark all dirty_db entries up to op->version as stable
+        auto dirty_it = dirty_db.find(*v);
+        if (dirty_it != dirty_db.end())
+        {
+            // Walk backwards from the requested version over older versions of the same object
+            while (1)
+            {
+                if (dirty_it->second.state == ST_J_SYNCED)
+                {
+                    dirty_it->second.state = ST_J_STABLE;
+                }
+                else if (dirty_it->second.state == ST_D_SYNCED)
+                {
+                    dirty_it->second.state = ST_D_STABLE;
+                }
+                else if (dirty_it->second.state == ST_DEL_SYNCED)
+                {
+                    dirty_it->second.state = ST_DEL_STABLE;
+                }
+                else if (IS_STABLE(dirty_it->second.state))
+                {
+                    // Reached a version that is already stable, stop here
+                    break;
+                }
+                if (dirty_it == dirty_db.begin())
+                {
+                    break;
+                }
+                dirty_it--;
+                if (dirty_it->first.oid != v->oid)
+                {
+                    // Previous entry belongs to another object
+                    break;
+                }
+            }
+#ifdef BLOCKSTORE_DEBUG
+            printf("enqueue_flush %lu:%lu v%lu\n", v->oid.inode, v->oid.stripe, v->version);
+#endif
+            flusher->enqueue_flush(*v);
+        }
+    }
+    // Balances the inflight_writes++ done by dequeue_stable()
+    inflight_writes--;
+    // Acknowledge op
+    op->retval = 0;
+    FINISH_OP(op);
     return 1;
 }

@@ -135,6 +228,7 @@ void blockstore_impl_t::handle_stable_event(ring_data_t *data, blockstore_op_t *
    live = true;
    if (data->res != data->iov.iov_len)
    {
+        inflight_writes--;
        throw std::runtime_error(
            "write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
            "). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
@@ -143,53 +237,10 @@ void blockstore_impl_t::handle_stable_event(ring_data_t *data, blockstore_op_t *
    PRIV(op)->pending_ops--;
    if (PRIV(op)->pending_ops == 0)
    {
-        // Release used journal sectors
-        release_journal_sectors(op);
-        // Mark dirty_db entries as stable, acknowledge op completion
-        obj_ver_id* v;
-        int i;
-        for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
+        PRIV(op)->op_state++;
+        if (!continue_stable(op))
        {
-            // Mark all dirty_db entries up to op->version as stable
-            auto dirty_it = dirty_db.find(*v);
-            if (dirty_it != dirty_db.end())
-            {
-                while (1)
-                {
-                    if (dirty_it->second.state == ST_J_SYNCED)
-                    {
-                        dirty_it->second.state = ST_J_STABLE;
-                    }
-                    else if (dirty_it->second.state == ST_D_META_SYNCED)
-                    {
-                        dirty_it->second.state = ST_D_STABLE;
-                    }
-                    else if (dirty_it->second.state == ST_DEL_SYNCED)
-                    {
-                        dirty_it->second.state = ST_DEL_STABLE;
-                    }
-                    else if (IS_STABLE(dirty_it->second.state))
-                    {
-                        break;
-                    }
-                    if (dirty_it == dirty_db.begin())
-                    {
-                        break;
-                    }
-                    dirty_it--;
-                    if (dirty_it->first.oid != v->oid)
-                    {
-                        break;
-                    }
-                }
-#ifdef BLOCKSTORE_DEBUG
-                printf("enqueue_flush %lu:%lu v%lu\n", v->oid.inode, v->oid.stripe, v->version);
-#endif
-                flusher->enqueue_flush(*v);
-            }
+            submit_queue.push_front(op);
        }
-        // Acknowledge op
-        op->retval = 0;
-        FINISH_OP(op);
    }
 }
--- a/blockstore_sync.cpp
+++ b/blockstore_sync.cpp
@@ -11,7 +11,7 @@

 int blockstore_impl_t::dequeue_sync(blockstore_op_t *op)
 {
-    if (PRIV(op)->sync_state == 0)
+    if (PRIV(op)->op_state == 0)
    {
        stop_sync_submitted = false;
        PRIV(op)->sync_big_writes.swap(unsynced_big_writes);
@@ -21,11 +21,11 @@ int blockstore_impl_t::dequeue_sync(blockstore_op_t *op)
        unsynced_big_writes.clear();
        unsynced_small_writes.clear();
        if (PRIV(op)->sync_big_writes.size() > 0)
-            PRIV(op)->sync_state = SYNC_HAS_BIG;
+            PRIV(op)->op_state = SYNC_HAS_BIG;
        else if (PRIV(op)->sync_small_writes.size() > 0)
-            PRIV(op)->sync_state = SYNC_HAS_SMALL;
+            PRIV(op)->op_state = SYNC_HAS_SMALL;
        else
-            PRIV(op)->sync_state = SYNC_DONE;
+            PRIV(op)->op_state = SYNC_DONE;
        // Always add sync to in_progress_syncs because we clear unsynced_big_writes and unsynced_small_writes
        PRIV(op)->prev_sync_count = in_progress_syncs.size();
        PRIV(op)->in_progress_ptr = in_progress_syncs.insert(in_progress_syncs.end(), op);
@@ -38,7 +38,7 @@ int blockstore_impl_t::dequeue_sync(blockstore_op_t *op)
 int blockstore_impl_t::continue_sync(blockstore_op_t *op)
 {
    auto cb = [this, op](ring_data_t *data) { handle_sync_event(data, op); };
-    if (PRIV(op)->sync_state == SYNC_HAS_SMALL)
+    if (PRIV(op)->op_state == SYNC_HAS_SMALL)
    {
        // No big writes, just fsync the journal
        for (; PRIV(op)->sync_small_checked < PRIV(op)->sync_small_writes.size(); PRIV(op)->sync_small_checked++)
@@ -54,17 +54,17 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
            // Write out the last journal sector if it happens to be dirty
            BS_SUBMIT_GET_ONLY_SQE(sqe);
            prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
-            PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
+            PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
            PRIV(op)->pending_ops = 1;
-            PRIV(op)->sync_state = SYNC_JOURNAL_WRITE_SENT;
+            PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT;
            return 1;
        }
        else
        {
-            PRIV(op)->sync_state = SYNC_JOURNAL_WRITE_DONE;
+            PRIV(op)->op_state = SYNC_JOURNAL_WRITE_DONE;
        }
    }
-    if (PRIV(op)->sync_state == SYNC_HAS_BIG)
+    if (PRIV(op)->op_state == SYNC_HAS_BIG)
    {
        for (; PRIV(op)->sync_big_checked < PRIV(op)->sync_big_writes.size(); PRIV(op)->sync_big_checked++)
        {
@@ -81,17 +81,17 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
            my_uring_prep_fsync(sqe, data_fd, IORING_FSYNC_DATASYNC);
            data->iov = { 0 };
            data->callback = cb;
-            PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
+            PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
            PRIV(op)->pending_ops = 1;
-            PRIV(op)->sync_state = SYNC_DATA_SYNC_SENT;
+            PRIV(op)->op_state = SYNC_DATA_SYNC_SENT;
            return 1;
        }
        else
        {
-            PRIV(op)->sync_state = SYNC_DATA_SYNC_DONE;
+            PRIV(op)->op_state = SYNC_DATA_SYNC_DONE;
        }
    }
-    if (PRIV(op)->sync_state == SYNC_DATA_SYNC_DONE)
+    if (PRIV(op)->op_state == SYNC_DATA_SYNC_DONE)
    {
        for (; PRIV(op)->sync_small_checked < PRIV(op)->sync_small_writes.size(); PRIV(op)->sync_small_checked++)
        {
@@ -121,7 +121,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
            journal.sector_info[journal.cur_sector].dirty)
        {
            if (cur_sector == -1)
-                PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
+                PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
            cur_sector = journal.cur_sector;
            prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
        }
@@ -146,17 +146,17 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
            if (cur_sector != journal.cur_sector)
            {
                if (cur_sector == -1)
-                    PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
+                    PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
                cur_sector = journal.cur_sector;
                prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
            }
        }
-        PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
+        PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
        PRIV(op)->pending_ops = s;
-        PRIV(op)->sync_state = SYNC_JOURNAL_WRITE_SENT;
+        PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT;
        return 1;
    }
-    if (PRIV(op)->sync_state == SYNC_JOURNAL_WRITE_DONE)
+    if (PRIV(op)->op_state == SYNC_JOURNAL_WRITE_DONE)
    {
        if (!disable_journal_fsync)
        {
@@ -165,17 +165,17 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
            data->iov = { 0 };
            data->callback = cb;
            PRIV(op)->pending_ops = 1;
-            PRIV(op)->sync_state = SYNC_JOURNAL_SYNC_SENT;
+            PRIV(op)->op_state = SYNC_JOURNAL_SYNC_SENT;
            return 1;
        }
        else
        {
-            PRIV(op)->sync_state = SYNC_DONE;
+            PRIV(op)->op_state = SYNC_DONE;
        }
    }
-    if (PRIV(op)->sync_state == SYNC_DONE)
+    if (PRIV(op)->op_state == SYNC_DONE)
    {
-        ack_sync(op);
+        return ack_sync(op);
    }
    return 1;
 }
@@ -196,17 +196,17 @@ void blockstore_impl_t::handle_sync_event(ring_data_t *data, blockstore_op_t *op
        // Release used journal sectors
        release_journal_sectors(op);
        // Handle states
-        if (PRIV(op)->sync_state == SYNC_DATA_SYNC_SENT)
+        if (PRIV(op)->op_state == SYNC_DATA_SYNC_SENT)
        {
-            PRIV(op)->sync_state = SYNC_DATA_SYNC_DONE;
+            PRIV(op)->op_state = SYNC_DATA_SYNC_DONE;
        }
-        else if (PRIV(op)->sync_state == SYNC_JOURNAL_WRITE_SENT)
+        else if (PRIV(op)->op_state == SYNC_JOURNAL_WRITE_SENT)
        {
-            PRIV(op)->sync_state = SYNC_JOURNAL_WRITE_DONE;
+            PRIV(op)->op_state = SYNC_JOURNAL_WRITE_DONE;
        }
-        else if (PRIV(op)->sync_state == SYNC_JOURNAL_SYNC_SENT)
+        else if (PRIV(op)->op_state == SYNC_JOURNAL_SYNC_SENT)
        {
-            PRIV(op)->sync_state = SYNC_DONE;
+            PRIV(op)->op_state = SYNC_DONE;
            ack_sync(op);
        }
        else
@@ -218,7 +218,7 @@ void blockstore_impl_t::handle_sync_event(ring_data_t *data, blockstore_op_t *op

 int blockstore_impl_t::ack_sync(blockstore_op_t *op)
 {
-    if (PRIV(op)->sync_state == SYNC_DONE && PRIV(op)->prev_sync_count == 0)
+    if (PRIV(op)->op_state == SYNC_DONE && PRIV(op)->prev_sync_count == 0)
    {
        // Remove dependency of subsequent syncs
        auto it = PRIV(op)->in_progress_ptr;
@@ -230,14 +230,14 @@ int blockstore_impl_t::ack_sync(blockstore_op_t *op)
        {
            auto & next_sync = *it++;
            PRIV(next_sync)->prev_sync_count -= done_syncs;
-            if (PRIV(next_sync)->prev_sync_count == 0 && PRIV(next_sync)->sync_state == SYNC_DONE)
+            if (PRIV(next_sync)->prev_sync_count == 0 && PRIV(next_sync)->op_state == SYNC_DONE)
            {
                done_syncs++;
                // Acknowledge next_sync
                ack_one_sync(next_sync);
            }
        }
-        return 1;
+        return 2;
    }
    return 0;
 }
@@ -252,7 +252,17 @@ void blockstore_impl_t::ack_one_sync(blockstore_op_t *op)
 #endif
        auto & unstab = unstable_writes[it->oid];
        unstab = unstab < it->version ? it->version : unstab;
-        dirty_db[*it].state = ST_D_META_SYNCED;
+        auto dirty_it = dirty_db.find(*it);
+        dirty_it->second.state = ST_D_SYNCED;
+        dirty_it++;
+        while (dirty_it != dirty_db.end() && dirty_it->first.oid == it->oid)
+        {
+            if (dirty_it->second.state == ST_J_WAIT_BIG)
+            {
+                dirty_it->second.state = ST_J_IN_FLIGHT;
+            }
+            dirty_it++;
+        }
    }
    for (auto it = PRIV(op)->sync_small_writes.begin(); it != PRIV(op)->sync_small_writes.end(); it++)
    {
--- a/blockstore_write.cpp
+++ b/blockstore_write.cpp
@@ -4,6 +4,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
 {
    // Check or assign version number
    bool found = false, deleted = false, is_del = (op->opcode == BS_OP_DELETE);
+    bool is_inflight_big = false;
    uint64_t version = 1;
    if (dirty_db.size() > 0)
    {
@@ -17,6 +18,9 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
            found = true;
            version = dirty_it->first.version + 1;
            deleted = IS_DELETE(dirty_it->second.state);
+            is_inflight_big = dirty_it->second.state >= ST_D_IN_FLIGHT &&
+                dirty_it->second.state < ST_D_SYNCED ||
+                dirty_it->second.state == ST_J_WAIT_BIG;
        }
    }
    if (!found)
@@ -38,7 +42,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
    else if (op->version < version)
    {
        // Invalid version requested
-        op->retval = -EINVAL;
+        op->retval = -EEXIST;
        return false;
    }
    if (deleted && is_del)
@@ -47,10 +51,26 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
        op->retval = 0;
        return false;
    }
-    // Immediately add the operation into dirty_db, so subsequent reads could see it
+    if (is_inflight_big && !is_del && !deleted && op->len < block_size &&
+        immediate_commit != IMMEDIATE_ALL)
+    {
+        // Issue an additional sync so that the previous big write can reach the journal
+        blockstore_op_t *sync_op = new blockstore_op_t;
+        sync_op->opcode = BS_OP_SYNC;
+        sync_op->callback = [this, op](blockstore_op_t *sync_op)
+        {
+            delete sync_op;
+        };
+        enqueue_op(sync_op);
+    }
 #ifdef BLOCKSTORE_DEBUG
-    printf("%s %lu:%lu v%lu\n", is_del ? "Delete" : "Write", op->oid.inode, op->oid.stripe, op->version);
+    if (is_del)
+        printf("Delete %lu:%lu v%lu\n", op->oid.inode, op->oid.stripe, op->version);
+    else
+        printf("Write %lu:%lu v%lu offset=%u len=%u\n", op->oid.inode, op->oid.stripe, op->version, op->offset, op->len);
 #endif
+    // No strict need to add it into dirty_db here, it's just left
+    // from the previous implementation where reads waited for writes
    dirty_db.emplace((obj_ver_id){
        .oid = op->oid,
        .version = op->version,
@@ -58,7 +78,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
        .state = (uint32_t)(
            is_del
                ? ST_DEL_IN_FLIGHT
-                : (op->len == block_size || deleted ? ST_D_IN_FLIGHT : ST_J_IN_FLIGHT)
+                : (op->len == block_size || deleted ? ST_D_IN_FLIGHT : (is_inflight_big ? ST_J_WAIT_BIG : ST_J_IN_FLIGHT))
        ),
        .flags = 0,
        .location = 0,
@@ -72,11 +92,19 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
 // First step of the write algorithm: dequeue operation and submit initial write(s)
 int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
 {
+    if (PRIV(op)->op_state)
+    {
+        return continue_write(op);
+    }
    auto dirty_it = dirty_db.find((obj_ver_id){
        .oid = op->oid,
        .version = op->version,
    });
-    if (dirty_it->second.state == ST_D_IN_FLIGHT)
+    if (dirty_it->second.state == ST_J_WAIT_BIG)
+    {
+        return 0;
+    }
+    else if (dirty_it->second.state == ST_D_IN_FLIGHT)
    {
        blockstore_journal_check_t space_check(this);
        if (!space_check.check_available(op, unsynced_big_writes.size() + 1, sizeof(journal_entry_big_write), JOURNAL_STABILIZE_RESERVATION))
@@ -125,12 +153,20 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
            sqe, data_fd, PRIV(op)->iov_zerofill, vcnt, data_offset + (loc << block_order) + op->offset - stripe_offset
        );
        PRIV(op)->pending_ops = 1;
-        PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
-        // Remember big write as unsynced
-        unsynced_big_writes.push_back((obj_ver_id){
-            .oid = op->oid,
-            .version = op->version,
-        });
+        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
+        if (immediate_commit != IMMEDIATE_ALL)
+        {
+            // Remember big write as unsynced
+            unsynced_big_writes.push_back((obj_ver_id){
+                .oid = op->oid,
+                .version = op->version,
+            });
+            PRIV(op)->op_state = 3;
+        }
+        else
+        {
+            PRIV(op)->op_state = 1;
+        }
    }
    else
    {
@@ -144,10 +180,11 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        }
        // There is sufficient space. Get SQE(s)
        struct io_uring_sqe *sqe1 = NULL;
-        if ((journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_small_write) &&
+        if (immediate_commit != IMMEDIATE_NONE ||
+            (journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_small_write) &&
            journal.sector_info[journal.cur_sector].dirty)
        {
-            // Write current journal sector only if it's dirty and full
+            // Write current journal sector only if it's dirty and full, or in the immediate_commit mode
            BS_SUBMIT_GET_SQE_DECL(sqe1);
        }
        struct io_uring_sqe *sqe2 = NULL;
@@ -157,16 +194,18 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        }
        // Got SQEs. Prepare previous journal sector write if required
        auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
-        if (sqe1)
+        if (immediate_commit == IMMEDIATE_NONE)
        {
-            prepare_journal_sector_write(journal, journal.cur_sector, sqe1, cb);
-            // FIXME rename to min/max _flushing
-            PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
-            PRIV(op)->pending_ops++;
-        }
-        else
-        {
-            PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
+            if (sqe1)
+            {
+                prepare_journal_sector_write(journal, journal.cur_sector, sqe1, cb);
+                PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
+                PRIV(op)->pending_ops++;
+            }
+            else
+            {
+                PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
+            }
        }
        // Then pre-fill journal entry
        journal_entry_small_write *je = (journal_entry_small_write*)
@@ -186,6 +225,12 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        je->crc32_data = crc32c(0, op->buf, op->len);
        je->crc32 = je_crc32((journal_entry*)je);
        journal.crc32_last = je->crc32;
+        if (immediate_commit != IMMEDIATE_NONE)
+        {
+            prepare_journal_sector_write(journal, journal.cur_sector, sqe1, cb);
+            PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
+            PRIV(op)->pending_ops++;
+        }
        if (op->len > 0)
        {
            // Prepare journal data write
@@ -213,16 +258,120 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        {
            journal.next_free = journal_block_size;
        }
-        // Remember small write as unsynced
-        unsynced_small_writes.push_back((obj_ver_id){
-            .oid = op->oid,
-            .version = op->version,
-        });
+        if (immediate_commit == IMMEDIATE_NONE)
+        {
+            // Remember small write as unsynced
+            unsynced_small_writes.push_back((obj_ver_id){
+                .oid = op->oid,
+                .version = op->version,
+            });
+        }
        if (!PRIV(op)->pending_ops)
        {
-            ack_write(op);
+            PRIV(op)->op_state = 4;
+            continue_write(op);
+        }
+        else
+        {
+            PRIV(op)->op_state = 3;
        }
    }
+    inflight_writes++;
+    return 1;
+}
+
+int blockstore_impl_t::continue_write(blockstore_op_t *op) // Resumes a write at op_state 2 or 4; returns 0 only when no SQE is available
+{
+    io_uring_sqe *sqe = NULL;
+    journal_entry_big_write *je;
+    auto dirty_it = dirty_db.find((obj_ver_id){ // NOTE(review): dereferenced below without an end() check - presumably the entry always exists; confirm
+        .oid = op->oid,
+        .version = op->version,
+    });
+    if (PRIV(op)->op_state == 2)
+        goto resume_2;
+    else if (PRIV(op)->op_state == 4)
+        goto resume_4;
+    else
+        return 1; // any other state: nothing to do here
+resume_2:
+    // Only for the immediate_commit mode: prepare and submit big_write journal entry
+        { // NOTE(review): timing trace printed unconditionally (not under BLOCKSTORE_DEBUG) - confirm it is intended
+            timespec now;
+            clock_gettime(CLOCK_REALTIME, &now);
+            printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
+        }
+    sqe = get_sqe();
+    if (!sqe)
+    {
+        return 0; // no SQE: op_state stays 2, so the next call resumes at resume_2
+    }
+    je = (journal_entry_big_write*)prefill_single_journal_entry(journal, JE_BIG_WRITE, sizeof(journal_entry_big_write));
+    dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset; // remember which journal sector holds the entry
+    journal.sector_info[journal.cur_sector].dirty = false; // presumably because the sector write is prepared right below - confirm
+    journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++; // one more entry uses this sector offset
+#ifdef BLOCKSTORE_DEBUG
+    printf("journal offset %lu is used by %lu:%lu v%lu\n", journal.sector_info[journal.cur_sector].offset, op->oid.inode, op->oid.stripe, op->version);
+#endif
+    je->oid = op->oid;
+    je->version = op->version;
+    je->offset = op->offset;
+    je->len = op->len;
+    je->location = dirty_it->second.location;
+    je->crc32 = je_crc32((journal_entry*)je); // computed last, after all other fields are filled
+    journal.crc32_last = je->crc32;
+    prepare_journal_sector_write(journal, journal.cur_sector, sqe,
+        [this, op](ring_data_t *data) { handle_write_event(data, op); });
+    PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector; // 1-based sector number
+    PRIV(op)->pending_ops = 1;
+    PRIV(op)->op_state = 3; // handle_write_event increments op_state once pending_ops drops to 0
+    return 1;
+resume_4:
+    // Switch object state
+        { // NOTE(review): timing trace printed unconditionally (not under BLOCKSTORE_DEBUG) - confirm it is intended
+            timespec now;
+            clock_gettime(CLOCK_REALTIME, &now);
+            printf("write_done %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
+        }
+#ifdef BLOCKSTORE_DEBUG
+    printf("Ack write %lu:%lu v%lu = %d\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
+#endif
+    bool imm = dirty_it->second.state == ST_D_SUBMITTED // ST_D_SUBMITTED entries count as immediate only with IMMEDIATE_ALL,
+        ? (immediate_commit == IMMEDIATE_ALL)
+        : (immediate_commit != IMMEDIATE_NONE); // all other entries with any mode except IMMEDIATE_NONE
+    if (imm)
+    {
+        auto & unstab = unstable_writes[op->oid];
+        unstab = unstab < op->version ? op->version : unstab; // raise the recorded unstable version to at least op->version
+    }
+    if (dirty_it->second.state == ST_J_SUBMITTED)
+    {
+        dirty_it->second.state = imm ? ST_J_SYNCED : ST_J_WRITTEN;
+    }
+    else if (dirty_it->second.state == ST_D_SUBMITTED)
+    {
+        dirty_it->second.state = imm ? ST_D_SYNCED : ST_D_WRITTEN;
+    }
+    else if (dirty_it->second.state == ST_DEL_SUBMITTED)
+    {
+        dirty_it->second.state = imm ? ST_DEL_SYNCED : ST_DEL_WRITTEN;
+    }
+    if (immediate_commit == IMMEDIATE_ALL)
+    {
+        dirty_it++; // walk the following dirty_db entries of the same object
+        while (dirty_it != dirty_db.end() && dirty_it->first.oid == op->oid)
+        {
+            if (dirty_it->second.state == ST_J_WAIT_BIG)
+            {
+                dirty_it->second.state = ST_J_IN_FLIGHT; // dequeue_write returns 0 for entries that are still ST_J_WAIT_BIG
+            }
+            dirty_it++;
+        }
+    }
+    inflight_writes--;
+    // Acknowledge write
+    op->retval = op->len;
+    FINISH_OP(op);
     return 1;
 }

@@ -231,6 +380,7 @@ void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *o
    live = true;
    if (data->res != data->iov.iov_len)
    {
+        inflight_writes--;
        // FIXME: our state becomes corrupted after a write error. maybe do something better than just die
        throw std::runtime_error(
            "write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
@@ -241,55 +391,42 @@ void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *o
    if (PRIV(op)->pending_ops == 0)
    {
        release_journal_sectors(op);
-        ack_write(op);
+        PRIV(op)->op_state++;
+        if (!continue_write(op))
+        {
+            submit_queue.push_front(op);
+        }
    }
 }

 void blockstore_impl_t::release_journal_sectors(blockstore_op_t *op)
 {
-    // Release used journal sectors
-    if (PRIV(op)->min_used_journal_sector > 0 &&
-        PRIV(op)->max_used_journal_sector > 0)
+    // Drop this op's references to the journal sectors it flushed (sector numbers are 1-based, 0 means none)
+    if (PRIV(op)->min_flushed_journal_sector > 0 &&
+        PRIV(op)->max_flushed_journal_sector > 0)
     {
-        uint64_t s = PRIV(op)->min_used_journal_sector;
+        uint64_t s = PRIV(op)->min_flushed_journal_sector; // walks from min to max, wrapping at journal.sector_count
         while (1)
         {
             journal.sector_info[s-1].usage_count--;
-            if (s == PRIV(op)->max_used_journal_sector)
+            if (s != (1+journal.cur_sector) && journal.sector_info[s-1].usage_count == 0) // unreferenced and not the current sector
+            {
+                // We know for sure that we won't write into this sector anymore
+                uint64_t new_ds = journal.sector_info[s-1].offset + journal.block_size; // sector offset plus one journal block
+                if ((journal.dirty_start + (journal.dirty_start >= journal.used_start ? 0 : journal.len)) < // offsets below used_start are
+                    (new_ds + (new_ds >= journal.used_start ? 0 : journal.len)))                            // shifted by journal.len first
+                {
+                    journal.dirty_start = new_ds; // dirty_start only ever moves forward in that ordering
+                }
+            }
+            if (s == PRIV(op)->max_flushed_journal_sector) // last sector of the range has been handled
                 break;
             s = 1 + s % journal.sector_count;
         }
-        PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
+        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0; // a repeated call is then a no-op
     }
 }

-void blockstore_impl_t::ack_write(blockstore_op_t *op)
-{
-    // Switch object state
-    auto & dirty_entry = dirty_db[(obj_ver_id){
-        .oid = op->oid,
-        .version = op->version,
-    }];
-#ifdef BLOCKSTORE_DEBUG
-    printf("Ack write %lu:%lu v%lu = %d\n", op->oid.inode, op->oid.stripe, op->version, dirty_entry.state);
-#endif
-    if (dirty_entry.state == ST_J_SUBMITTED)
-    {
-        dirty_entry.state = ST_J_WRITTEN;
-    }
-    else if (dirty_entry.state == ST_D_SUBMITTED)
-    {
-        dirty_entry.state = ST_D_WRITTEN;
-    }
-    else if (dirty_entry.state == ST_DEL_SUBMITTED)
-    {
-        dirty_entry.state = ST_DEL_WRITTEN;
-    }
-    // Acknowledge write without sync
-    op->retval = op->len;
-    FINISH_OP(op);
-}
-
 int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
 {
    auto dirty_it = dirty_db.find((obj_ver_id){
@@ -301,8 +438,30 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
    {
        return 0;
    }
-    BS_SUBMIT_GET_ONLY_SQE(sqe);
+    io_uring_sqe *sqe = NULL;
+    if (immediate_commit != IMMEDIATE_NONE ||
+        (journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del) &&
+        journal.sector_info[journal.cur_sector].dirty)
+    {
+        // Write current journal sector only if it's dirty and full, or in the immediate_commit mode
+        BS_SUBMIT_GET_SQE_DECL(sqe);
+    }
+    auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
    // Prepare journal sector write
+    if (immediate_commit == IMMEDIATE_NONE)
+    {
+        if (sqe)
+        {
+            prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
+            PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
+            PRIV(op)->pending_ops++;
+        }
+        else
+        {
+            PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
+        }
+    }
+    // Pre-fill journal entry
    journal_entry_del *je = (journal_entry_del*)
        prefill_single_journal_entry(journal, JE_DELETE, sizeof(struct journal_entry_del));
    dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
@@ -314,15 +473,26 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
    je->version = op->version;
    je->crc32 = je_crc32((journal_entry*)je);
    journal.crc32_last = je->crc32;
-    auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
-    prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
-    PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
-    PRIV(op)->pending_ops = 1;
    dirty_it->second.state = ST_DEL_SUBMITTED;
-    // Remember small write as unsynced
-    unsynced_small_writes.push_back((obj_ver_id){
-        .oid = op->oid,
-        .version = op->version,
-    });
+    if (immediate_commit != IMMEDIATE_NONE)
+    {
+        prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb); // immediate commit: write the entry's sector right away
+        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
+        PRIV(op)->pending_ops++;
+    }
+    else
+    {
+        // Not committed immediately: remember the delete as unsynced (same condition as small writes in dequeue_write)
+        unsynced_small_writes.push_back((obj_ver_id){ .oid = op->oid, .version = op->version });
+    }
+    if (!PRIV(op)->pending_ops)
+    {
+        PRIV(op)->op_state = 4;
+        continue_write(op);
+    }
+    else
+    {
+        PRIV(op)->op_state = 3;
+    }
    return 1;
 }
--- a/cluster_client.cpp
+++ b/cluster_client.cpp
@@ -0,0 +1,357 @@
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <sys/epoll.h>
+#include <netinet/tcp.h>
+
+#include "cluster_client.h"
+
+// Destroys the operation and releases the buffers it owns.
+// The blockstore sub-operation has to be detached beforehand (bs_op must be NULL).
+osd_op_t::~osd_op_t()
+{
+    assert(!bs_op);
+    // free(NULL) is a no-op, so the pointers need no individual NULL checks
+    free(op_data);
+    free(rmw_buf);
+    // Note: reusing osd_op_t WILL currently lead to memory leaks
+    // So we don't reuse it, but free it every time
+    free(buf);
+}
+
+// Remember (or refresh) the address list and port of a peer OSD we want to be connected to,
+// then start a connection attempt unless one is marked as running or the previous attempt
+// was made less than <peer_connect_interval> seconds ago.
+void cluster_client_t::connect_peer(uint64_t peer_osd, json11::Json address_list, int port)
+{
+    auto wanted_it = wanted_peers.find(peer_osd);
+    if (wanted_it == wanted_peers.end())
+    {
+        // New peer: all fields except the address list and port start zeroed
+        wanted_peers[peer_osd] = (osd_wanted_peer_t){
+            .address_list = address_list,
+            .port = port,
+        };
+        wanted_it = wanted_peers.find(peer_osd);
+    }
+    else
+    {
+        wanted_it->second.address_list = address_list;
+        wanted_it->second.port = port;
+    }
+    osd_wanted_peer_t & peer = wanted_it->second;
+    peer.address_changed = true;
+    if (peer.connecting)
+    {
+        return;
+    }
+    if ((time(NULL) - peer.last_connect_attempt) >= peer_connect_interval)
+    {
+        try_connect_peer(peer_osd);
+    }
+}
+
+// Start a connection attempt to <peer_osd> using the address selected by address_index.
+// Does nothing when the peer is not wanted or address_index is past the end of the list;
+// a peer that already has a connection is just removed from wanted_peers.
+// NOTE(review): the <connecting> flag of the wanted peer is not set here - confirm who is
+// supposed to raise it, on_connect_peer() only ever clears it.
+void cluster_client_t::try_connect_peer(uint64_t peer_osd)
+{
+    auto wanted_it = wanted_peers.find(peer_osd);
+    if (wanted_it == wanted_peers.end())
+    {
+        return;
+    }
+    if (osd_peer_fds.count(peer_osd) > 0)
+    {
+        // Already connected, nothing is wanted anymore
+        wanted_peers.erase(wanted_it);
+        return;
+    }
+    osd_wanted_peer_t & peer = wanted_it->second;
+    const auto & addresses = peer.address_list.array_items();
+    if (peer.address_index >= addresses.size())
+    {
+        return;
+    }
+    peer.cur_addr = addresses[peer.address_index].string_value();
+    peer.cur_port = peer.port;
+    try_connect_peer_addr(peer_osd, peer.cur_addr.c_str(), peer.cur_port);
+}
+
+// Starts a non-blocking TCP connection to <peer_host>:<peer_port> (port 11203 when
+// peer_port is 0) on behalf of peer OSD <peer_osd>.
+// Every failure that is detected synchronously is reported through on_connect_peer()
+// with a negative errno value. On success the socket is registered in <clients> in
+// PEER_CONNECTING state and handle_connect_epoll() is installed as its fd handler.
+void cluster_client_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port)
+{
+    // Zero the whole structure so that no uninitialised padding is copied into the client entry
+    struct sockaddr_in addr = {};
+    int r;
+    if ((r = inet_pton(AF_INET, peer_host, &addr.sin_addr)) != 1)
+    {
+        on_connect_peer(peer_osd, -EINVAL);
+        return;
+    }
+    addr.sin_family = AF_INET;
+    addr.sin_port = htons(peer_port ? peer_port : 11203);
+    int peer_fd = socket(AF_INET, SOCK_STREAM, 0);
+    if (peer_fd < 0)
+    {
+        on_connect_peer(peer_osd, -errno);
+        return;
+    }
+    fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
+    r = connect(peer_fd, (sockaddr*)&addr, sizeof(addr));
+    if (r < 0 && errno != EINPROGRESS)
+    {
+        // Save errno first: close() is allowed to overwrite it
+        int connect_errno = errno;
+        close(peer_fd);
+        on_connect_peer(peer_osd, -connect_errno);
+        return;
+    }
+    assert(peer_osd != this->osd_num);
+    // Arm the timeout only after connect() was accepted. Arming it earlier left a stray
+    // timer behind when connect() failed immediately, and that timer later fired for
+    // a closed descriptor which had no entry in <clients>.
+    int timeout_id = -1;
+    if (peer_connect_timeout > 0)
+    {
+        timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, peer_fd](int timer_id)
+        {
+            osd_num_t peer_osd = clients[peer_fd].osd_num;
+            stop_client(peer_fd);
+            on_connect_peer(peer_osd, -EIO);
+            return;
+        });
+    }
+    clients[peer_fd] = (osd_client_t){
+        .peer_addr = addr,
+        .peer_port = peer_port,
+        .peer_fd = peer_fd,
+        .peer_state = PEER_CONNECTING,
+        .connect_timeout_id = timeout_id,
+        .osd_num = peer_osd,
+        .in_buf = malloc(receive_buffer_size),
+    };
+    tfd->set_fd_handler(peer_fd, true, [this](int peer_fd, int epoll_events)
+    {
+        // Either OUT (connected) or HUP
+        handle_connect_epoll(peer_fd);
+    });
+}
+
+// Fd handler of a socket in PEER_CONNECTING state: finishes the non-blocking connect.
+// On error the client is stopped and the failure is passed to on_connect_peer();
+// otherwise the socket is switched to the regular peer handler and the peer's
+// configuration is requested.
+void cluster_client_t::handle_connect_epoll(int peer_fd)
+{
+    osd_client_t & client = clients[peer_fd];
+    // The attempt has completed one way or another, so the timeout is not needed anymore
+    if (client.connect_timeout_id >= 0)
+    {
+        tfd->clear_timer(client.connect_timeout_id);
+        client.connect_timeout_id = -1;
+    }
+    osd_num_t peer_osd = client.osd_num;
+    // Fetch the outcome of the asynchronous connect()
+    int so_error = 0;
+    socklen_t so_error_len = sizeof(so_error);
+    if (getsockopt(peer_fd, SOL_SOCKET, SO_ERROR, &so_error, &so_error_len) < 0)
+    {
+        so_error = errno;
+    }
+    if (so_error != 0)
+    {
+        stop_client(peer_fd);
+        on_connect_peer(peer_osd, -so_error);
+        return;
+    }
+    int nodelay = 1;
+    setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &nodelay, sizeof(nodelay));
+    client.peer_state = PEER_CONNECTED;
+    tfd->set_fd_handler(peer_fd, false, [this](int fd, int events)
+    {
+        handle_peer_epoll(fd, events);
+    });
+    // Check OSD number
+    check_peer_config(client);
+}
+
+// Fd handler of a connected peer: stops the client when the peer has hung up,
+// otherwise queues it for reading when input is available.
+void cluster_client_t::handle_peer_epoll(int peer_fd, int epoll_events)
+{
+    if (epoll_events & EPOLLRDHUP)
+    {
+        // Stop client
+        printf("[OSD %lu] client %d disconnected\n", this->osd_num, peer_fd);
+        stop_client(peer_fd);
+        return;
+    }
+    if (!(epoll_events & EPOLLIN))
+    {
+        return;
+    }
+    // Mark client as ready (i.e. some data is available)
+    osd_client_t & client = clients[peer_fd];
+    if (++client.read_ready == 1)
+    {
+        // First pending event: put the client into the read queue and wake the loop up
+        read_ready_clients.push_back(client.peer_fd);
+        ringloop->wakeup();
+    }
+}
+
+// Completion of a connection attempt to <peer_osd>.
+// <peer_fd> is the connected socket on success or a negative errno value on failure.
+// After a failure the next address is tried, or the first one again if the address list
+// was updated in the meantime; when the list is exhausted the whole cycle is restarted
+// in <peer_connect_interval> seconds.
+void cluster_client_t::on_connect_peer(osd_num_t peer_osd, int peer_fd)
+{
+    osd_wanted_peer_t & peer = wanted_peers.at(peer_osd);
+    peer.connecting = false;
+    if (peer_fd >= 0)
+    {
+        printf("Connected with peer OSD %lu (fd %d)\n", peer_osd, peer_fd);
+        wanted_peers.erase(peer_osd);
+        repeer_pgs(peer_osd);
+        return;
+    }
+    printf("Failed to connect to peer OSD %lu address %s port %d: %s\n", peer_osd, peer.cur_addr.c_str(), peer.cur_port, strerror(-peer_fd));
+    if (peer.address_changed)
+    {
+        // The list was replaced while we were connecting - start over from its beginning
+        peer.address_changed = false;
+        peer.address_index = 0;
+        try_connect_peer(peer_osd);
+    }
+    else if (peer.address_index < peer.address_list.array_items().size()-1)
+    {
+        // Try other addresses
+        peer.address_index++;
+        try_connect_peer(peer_osd);
+    }
+    else
+    {
+        // Retry again in <peer_connect_interval> seconds
+        peer.last_connect_attempt = time(NULL);
+        peer.address_index = 0;
+        tfd->set_timer(1000*peer_connect_interval, false, [this, peer_osd](int)
+        {
+            try_connect_peer(peer_osd);
+        });
+    }
+}
+
+// Sends an OSD_OP_SHOW_CONFIG request to a freshly connected peer. When the reply arrives,
+// the callback checks that the peer reports the OSD number we intended to connect to and
+// either registers the connection in osd_peer_fds or drops it.
+void cluster_client_t::check_peer_config(osd_client_t & cl)
+{
+    osd_op_t *op = new osd_op_t();
+    op->op_type = OSD_OP_OUT;
+    // The packet is sent straight from op->req, which is filled in right below
+    op->send_list.push_back(op->req.buf, OSD_PACKET_SIZE);
+    op->peer_fd = cl.peer_fd;
+    op->req = {
+        .show_conf = {
+            .header = {
+                .magic = SECONDARY_OSD_OP_MAGIC,
+                .id = this->next_subop_id++,
+                .opcode = OSD_OP_SHOW_CONFIG,
+            },
+        },
+    };
+    op->callback = [this](osd_op_t *op)
+    {
+        // NOTE(review): when this callback runs through cancel_out_op() from stop_client(),
+        // the entry has already been erased from <clients>, so operator[] creates a fresh
+        // one here - confirm this is intended
+        osd_client_t & cl = clients[op->peer_fd];
+        std::string json_err;
+        json11::Json config;
+        bool err = false;
+        if (op->reply.hdr.retval < 0)
+        {
+            err = true;
+            printf("Failed to get config from OSD %lu (retval=%ld), disconnecting peer\n", cl.osd_num, op->reply.hdr.retval);
+        }
+        else
+        {
+            // presumably op->buf holds the NUL-terminated JSON text of the reply - TODO confirm
+            config = json11::Json::parse(std::string((char*)op->buf), json_err);
+            if (json_err != "")
+            {
+                err = true;
+                printf("Failed to get config from OSD %lu: bad JSON: %s, disconnecting peer\n", cl.osd_num, json_err.c_str());
+            }
+            else if (config["osd_num"].uint64_value() != cl.osd_num)
+            {
+                err = true;
+                printf("Connected to OSD %lu instead of OSD %lu, peer state is outdated, disconnecting peer\n", config["osd_num"].uint64_value(), cl.osd_num);
+                // Only this error branch reports the failure to on_connect_peer()
+                on_connect_peer(cl.osd_num, -1);
+            }
+        }
+        if (err)
+        {
+            stop_client(op->peer_fd);
+            delete op;
+            return;
+        }
+        // The peer is who we expected: make the connection usable for this OSD number
+        osd_peer_fds[cl.osd_num] = cl.peer_fd;
+        on_connect_peer(cl.osd_num, cl.peer_fd);
+        delete op;
+    };
+    outbox_push(op);
+}
+
+// Cancels every outbound operation of the client: first the ones already sent,
+// then the ones still queued in the outbox, and finally the one being written.
+void cluster_client_t::cancel_osd_ops(osd_client_t & cl)
+{
+    for (auto sent_it = cl.sent_ops.begin(); sent_it != cl.sent_ops.end(); sent_it++)
+    {
+        cancel_out_op(sent_it->second);
+    }
+    cl.sent_ops.clear();
+    for (auto queued_it = cl.outbox.begin(); queued_it != cl.outbox.end(); queued_it++)
+    {
+        cancel_out_op(*queued_it);
+    }
+    cl.outbox.clear();
+    if (!cl.write_op)
+    {
+        return;
+    }
+    cancel_out_op(cl.write_op);
+    cl.write_op = NULL;
+}
+
+// Completes an outbound operation with -EPIPE: fills in a reply header that mirrors
+// the request and invokes the operation's callback.
+void cluster_client_t::cancel_out_op(osd_op_t *op)
+{
+    auto & reply_hdr = op->reply.hdr;
+    reply_hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
+    reply_hdr.id = op->req.hdr.id;
+    reply_hdr.opcode = op->req.hdr.opcode;
+    reply_hdr.retval = -EPIPE;
+    // Copy lambda to be unaffected by `delete op`
+    std::function<void(osd_op_t*)> finish = op->callback;
+    finish(op);
+}
+
+// Drops the connection on <peer_fd>: removes the client from all bookkeeping structures,
+// cancels its outbound operations, frees its buffers, closes the socket and, if it was
+// a connected OSD peer, asks for re-peering of that OSD. Unknown descriptors are ignored.
+void cluster_client_t::stop_client(int peer_fd)
+{
+    assert(peer_fd != 0);
+    auto it = clients.find(peer_fd);
+    if (it == clients.end())
+    {
+        return;
+    }
+    uint64_t repeer_osd = 0;
+    // Work on a copy: the map entry is erased before the rest of the cleanup runs
+    osd_client_t cl = it->second;
+    if (cl.peer_state == PEER_CONNECTED)
+    {
+        if (cl.osd_num)
+        {
+            // Reload configuration from etcd when the connection is dropped
+            printf("[OSD %lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl.osd_num);
+            repeer_osd = cl.osd_num;
+        }
+        else
+        {
+            printf("[OSD %lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
+        }
+    }
+    // The entry is removed first, so callbacks invoked below do not find this client anymore
+    clients.erase(it);
+    tfd->set_fd_handler(peer_fd, false, NULL);
+    if (cl.osd_num)
+    {
+        osd_peer_fds.erase(cl.osd_num);
+        // Cancel outbound operations
+        cancel_osd_ops(cl);
+    }
+    if (cl.read_op)
+    {
+        delete cl.read_op;
+        cl.read_op = NULL;
+    }
+    // Remove the descriptor from the read queue (only the first occurrence is erased)
+    for (auto rit = read_ready_clients.begin(); rit != read_ready_clients.end(); rit++)
+    {
+        if (*rit == peer_fd)
+        {
+            read_ready_clients.erase(rit);
+            break;
+        }
+    }
+    // Same for the write queue
+    for (auto wit = write_ready_clients.begin(); wit != write_ready_clients.end(); wit++)
+    {
+        if (*wit == peer_fd)
+        {
+            write_ready_clients.erase(wit);
+            break;
+        }
+    }
+    free(cl.in_buf);
+    assert(peer_fd != 0);
+    close(peer_fd);
+    // Re-peering is requested last, after the connection is completely torn down
+    if (repeer_osd)
+    {
+        repeer_pgs(repeer_osd);
+    }
+}
--- a/cluster_client.h
+++ b/cluster_client.h
@@ -0,0 +1,209 @@
+#pragma once
+
+#include <sys/types.h>
+#include <stdint.h>
+#include <arpa/inet.h>
+#include <malloc.h>
+
+#include <set>
+#include <map>
+#include <deque>
+#include <vector>
+
+#include "json11/json11.hpp"
+#include "osd_ops.h"
+#include "timerfd_manager.h"
+#include "ringloop.h"
+
+#define OSD_OP_IN 0
+#define OSD_OP_OUT 1
+
+#define CL_READ_HDR 1
+#define CL_READ_DATA 2
+#define CL_READ_REPLY_DATA 3
+#define CL_WRITE_READY 1
+#define CL_WRITE_REPLY 2
+#define MAX_EPOLL_EVENTS 64
+#define OSD_OP_INLINE_BUF_COUNT 16
+
+#define PEER_CONNECTING 1
+#define PEER_CONNECTED 2
+
+// Growable list of iovec entries describing the buffers of an outgoing message.
+// The first OSD_OP_INLINE_BUF_COUNT entries live inside the structure itself (inline_buf);
+// when more are pushed the list moves to a heap array that grows in steps of 16 entries.
+// <sent> counts the entries at the beginning of the list that are already consumed.
+struct osd_op_buf_list_t
+{
+    int count = 0, alloc = 0, sent = 0;
+    iovec *buf = NULL;
+    iovec inline_buf[OSD_OP_INLINE_BUF_COUNT];
+
+    ~osd_op_buf_list_t()
+    {
+        // Only a heap-allocated array has to be released
+        if (buf && buf != inline_buf)
+        {
+            free(buf);
+        }
+    }
+
+    // Returns the first entry which is not consumed yet
+    inline iovec* get_iovec()
+    {
+        return (buf ? buf : inline_buf) + sent;
+    }
+
+    // Returns the number of entries which are not consumed yet
+    inline int get_size()
+    {
+        return count - sent;
+    }
+
+    // Appends a buffer of <len> bytes starting at <nbuf> to the list
+    inline void push_back(void *nbuf, size_t len)
+    {
+        if (count >= alloc)
+        {
+            if (!alloc)
+            {
+                // First use: start with the inline storage
+                alloc = OSD_OP_INLINE_BUF_COUNT;
+                buf = inline_buf;
+            }
+            else
+            {
+                // Grow to the next multiple of 16. The previous formula, (alloc/16)*16 + 1,
+                // turned 16 into 17 and then 17 into 17 again, so the list stopped growing
+                // and the next push_back() wrote past the end of the array.
+                int new_alloc = (alloc/16 + 1)*16;
+                if (buf == inline_buf)
+                {
+                    // Move from the inline storage to the heap
+                    buf = (iovec*)malloc(sizeof(iovec) * new_alloc);
+                    memcpy(buf, inline_buf, sizeof(iovec)*alloc);
+                }
+                else
+                {
+                    buf = (iovec*)realloc(buf, sizeof(iovec) * new_alloc);
+                }
+                alloc = new_alloc;
+            }
+        }
+        buf[count++] = { .iov_base = nbuf, .iov_len = len };
+    }
+};
+
+struct blockstore_op_t;
+
+struct osd_primary_op_data_t;
+
+// A single OSD operation together with its request/reply packets and buffers
+struct osd_op_t
+{
+    timespec tv_begin;
+    // OSD_OP_IN or OSD_OP_OUT
+    uint64_t op_type = OSD_OP_IN;
+    // Socket of the client this operation belongs to
+    int peer_fd;
+    osd_any_op_t req;
+    osd_any_reply_t reply;
+    // Must be NULL again by the time the operation is destroyed (asserted in the destructor)
+    blockstore_op_t *bs_op = NULL;
+    // buf, rmw_buf and op_data are released with free() by the destructor
+    void *buf = NULL;
+    void *rmw_buf = NULL;
+    osd_primary_op_data_t* op_data = NULL;
+    // Completion handler; it receives the operation itself
+    std::function<void(osd_op_t*)> callback;
+
+    // Buffers to be sent for this operation
+    osd_op_buf_list_t send_list;
+
+    ~osd_op_t();
+};
+
+// State of one socket connection, stored in cluster_client_t::clients by descriptor
+struct osd_client_t
+{
+    sockaddr_in peer_addr;
+    int peer_port;
+    int peer_fd;
+    // PEER_CONNECTING or PEER_CONNECTED
+    int peer_state;
+    // Timer which aborts a connection attempt that takes too long, -1 when not armed
+    int connect_timeout_id = -1;
+    // Number of the OSD on the other side; 0 is treated as a regular (non-OSD) client
+    osd_num_t osd_num = 0;
+
+    // Receive buffer, released with free() when the client is stopped
+    void *in_buf = NULL;
+
+    // Read state
+    // Incremented on every EPOLLIN event of the socket
+    int read_ready = 0;
+    osd_op_t *read_op = NULL;
+    int read_reply_id = 0;
+    iovec read_iov;
+    msghdr read_msg;
+    void *read_buf = NULL;
+    int read_remaining = 0;
+    int read_state = 0;
+
+    // Outbound operations sent to this peer
+    std::map<int, osd_op_t*> sent_ops;
+
+    // Outbound messages (replies or requests)
+    std::deque<osd_op_t*> outbox;
+
+    // PGs dirtied by this client's primary-writes (FIXME to drop the connection)
+    std::set<pg_num_t> dirty_pgs;
+
+    // Write state
+    osd_op_t *write_op = NULL;
+    msghdr write_msg;
+    int write_state = 0;
+};
+
+// A peer OSD we want to be connected to, with the progress of the connection attempts
+struct osd_wanted_peer_t
+{
+    // JSON array of address strings; they are tried one after another
+    json11::Json address_list;
+    int port;
+    // time() of the moment the whole address list was last exhausted without success
+    time_t last_connect_attempt;
+    // address_changed is raised whenever the address list or port is set again
+    bool connecting, address_changed;
+    // Index in address_list of the address to try next
+    int address_index;
+    // Address and port of the attempt in progress (used in the failure message)
+    std::string cur_addr;
+    int cur_port;
+};
+
+// Operation statistics; every array has OSD_OP_MAX+1 zero-initialised slots
+// (presumably indexed by operation opcode - TODO confirm)
+struct osd_op_stats_t
+{
+    uint64_t op_stat_sum[OSD_OP_MAX+1] = { 0 };
+    uint64_t op_stat_count[OSD_OP_MAX+1] = { 0 };
+    uint64_t op_stat_bytes[OSD_OP_MAX+1] = { 0 };
+    uint64_t subop_stat_sum[OSD_OP_MAX+1] = { 0 };
+    uint64_t subop_stat_count[OSD_OP_MAX+1] = { 0 };
+};
+
+// Manages socket connections with peer OSDs and clients: connecting, config check,
+// sending and receiving of operations
+struct cluster_client_t
+{
+    // Supplies timers and fd event handlers
+    timerfd_manager_t *tfd;
+    ring_loop_t *ringloop;
+
+    // osd_num_t is only for logging and asserts
+    osd_num_t osd_num;
+    // Size of the receive buffer allocated for every connection, in bytes
+    int receive_buffer_size = 9000;
+    // Seconds to wait before going through the address list of a peer again
+    int peer_connect_interval = 5;
+    // Seconds after which a pending connection attempt is aborted; 0 or less disables the timeout
+    int peer_connect_timeout = 5;
+    int log_level = 0;
+
+    // Peers we are trying to connect to, by OSD number
+    std::map<osd_num_t, osd_wanted_peer_t> wanted_peers;
+    // Socket descriptors of connected peers whose OSD number is verified, by OSD number
+    std::map<uint64_t, int> osd_peer_fds;
+    // Identifier for the next outgoing request
+    uint64_t next_subop_id = 1;
+
+    // All connections by socket descriptor
+    std::map<int, osd_client_t> clients;
+    // Descriptors queued for reading and writing
+    std::vector<int> read_ready_clients;
+    std::vector<int> write_ready_clients;
+
+    // op statistics
+    osd_op_stats_t stats;
+
+    // public
+    void connect_peer(uint64_t osd_num, json11::Json address_list, int port);
+    void stop_client(int peer_fd);
+    void outbox_push(osd_op_t *cur_op);
+    std::function<void(osd_op_t*)> exec_op;
+    // Called with the OSD number when a connection with a peer OSD is established or lost
+    std::function<void(osd_num_t)> repeer_pgs;
+
+    // private
+    void try_connect_peer(uint64_t osd_num);
+    void try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port);
+    void handle_connect_epoll(int peer_fd);
+    void handle_peer_epoll(int peer_fd, int epoll_events);
+    // peer_fd is the connected socket or a negative errno value
+    void on_connect_peer(osd_num_t peer_osd, int peer_fd);
+    void check_peer_config(osd_client_t & cl);
+    void cancel_osd_ops(osd_client_t & cl);
+    void cancel_out_op(osd_op_t *op);
+
+    bool try_send(osd_client_t & cl);
+    void send_replies();
+    void handle_send(ring_data_t *data, int peer_fd);
+
+    void read_requests();
+    void handle_read(ring_data_t *data, int peer_fd);
+    void handle_finished_read(osd_client_t & cl);
+    void handle_op_hdr(osd_client_t *cl);
+    void handle_reply_hdr(osd_client_t *cl);
+};
--- a/dump_journal.cpp
+++ b/dump_journal.cpp
@@ -0,0 +1,165 @@
+#define _LARGEFILE64_SOURCE
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <malloc.h>
+#include <linux/fs.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <stdio.h>
+
+#include "blockstore_impl.h"
+#include "crc32c.h"
+
+// State of the journal dump tool
+struct journal_dump_t
+{
+    // Path of the file or device which contains the journal
+    char *journal_device;
+    // Journal block size in bytes
+    uint32_t journal_block;
+    // Byte offset of the journal inside the device
+    uint64_t journal_offset;
+    // Length of the journal in bytes
+    uint64_t journal_len;
+    // Current position, relative to journal_offset
+    uint64_t journal_pos;
+    // Descriptor of the opened journal device
+    int fd;
+
+    // Prints all entries of one journal block and advances journal_pos
+    void dump_block(void *buf);
+};
+
+// Journal dump tool entry point.
+// Arguments: <journal_file> <journal_block_size> <offset> <size>.
+// Walks the journal block by block and prints, for every block, whether it is zeroed,
+// holds journal entries (which are dumped) or holds something without the journal magic.
+// Returns 0 on success and 1 on bad arguments, allocation failure or read error.
+int main(int argc, char *argv[])
+{
+    if (argc < 5)
+    {
+        printf("USAGE: %s <journal_file> <journal_block_size> <offset> <size>\n", argv[0]);
+        return 1;
+    }
+    journal_dump_t self;
+    self.journal_device = argv[1];
+    self.journal_block = strtoul(argv[2], NULL, 10);
+    self.journal_offset = strtoull(argv[3], NULL, 10);
+    self.journal_len = strtoull(argv[4], NULL, 10);
+    if (self.journal_block < MEM_ALIGNMENT || (self.journal_block % MEM_ALIGNMENT) ||
+        self.journal_block > 128*1024)
+    {
+        printf("Invalid journal block size\n");
+        return 1;
+    }
+    self.fd = open(self.journal_device, O_DIRECT|O_RDONLY);
+    if (self.fd == -1)
+    {
+        printf("Failed to open journal\n");
+        return 1;
+    }
+    // O_DIRECT requires an aligned buffer
+    void *data = memalign(MEM_ALIGNMENT, self.journal_block);
+    if (!data)
+    {
+        printf("Failed to allocate memory\n");
+        close(self.fd);
+        return 1;
+    }
+    int exit_code = 0;
+    self.journal_pos = 0;
+    while (self.journal_pos < self.journal_len)
+    {
+        ssize_t r = pread(self.fd, data, self.journal_block, self.journal_offset+self.journal_pos);
+        if (r != (ssize_t)self.journal_block)
+        {
+            // Report the problem instead of relying on assert(), which prints nothing
+            // useful and is compiled out with NDEBUG
+            printf(
+                "Failed to read journal block at offset %08lx: %s\n", self.journal_pos,
+                r < 0 ? strerror(errno) : "short read"
+            );
+            exit_code = 1;
+            break;
+        }
+        // Look for the first non-zero 8-byte word of the block
+        uint64_t s;
+        for (s = 0; s < self.journal_block; s += 8)
+        {
+            if (*((uint64_t*)((uint8_t*)data+s)) != 0)
+                break;
+        }
+        if (s == self.journal_block)
+        {
+            printf("offset %08lx: zeroes\n", self.journal_pos);
+            self.journal_pos += self.journal_block;
+        }
+        else if (((journal_entry*)data)->magic == JOURNAL_MAGIC)
+        {
+            printf("offset %08lx:\n", self.journal_pos);
+            // dump_block() advances journal_pos itself
+            self.dump_block(data);
+        }
+        else
+        {
+            printf("offset %08lx: no magic in the beginning, looks like random data (pattern=%lx)\n", self.journal_pos, *((uint64_t*)data));
+            self.journal_pos += self.journal_block;
+        }
+    }
+    free(data);
+    close(self.fd);
+    return exit_code;
+}
+
+// Prints every journal entry found in the block <buf> (journal_block bytes).
+// journal_pos is moved past the block and then past the data of every small write,
+// because small write data is expected to follow the block which describes it.
+// Parsing of the block stops at the first entry with a bad magic, unknown type or zero size.
+void journal_dump_t::dump_block(void *buf)
+{
+    uint32_t pos = 0;
+    journal_pos += journal_block;
+    int entry = 0;
+    bool wrapped = false;
+    while (pos < journal_block)
+    {
+        journal_entry *je = (journal_entry*)((uint8_t*)buf + pos);
+        // A zero-sized entry would never advance <pos> and would loop forever
+        if (je->magic != JOURNAL_MAGIC || je->type < JE_START || je->type > JE_DELETE || je->size == 0)
+        {
+            break;
+        }
+        const char *crc32_valid = je_crc32(je) == je->crc32 ? "(valid)" : "(invalid)";
+        printf("entry % 3d: crc32=%08x %s prev=%08x ", entry, je->crc32, crc32_valid, je->crc32_prev);
+        if (je->type == JE_START)
+        {
+            printf("je_start start=%08lx\n", je->start.journal_start);
+        }
+        else if (je->type == JE_SMALL_WRITE)
+        {
+            printf(
+                "je_small_write oid=%lu:%lu ver=%lu offset=%u len=%u loc=%08lx",
+                je->small_write.oid.inode, je->small_write.oid.stripe,
+                je->small_write.version, je->small_write.offset, je->small_write.len,
+                je->small_write.data_offset
+            );
+            if (journal_pos + je->small_write.len > journal_len)
+            {
+                // data continues from the beginning of the journal
+                journal_pos = journal_block;
+                wrapped = true;
+            }
+            if (journal_pos != je->small_write.data_offset)
+            {
+                printf(" (mismatched, calculated = %lu)", journal_pos);
+            }
+            journal_pos += je->small_write.len;
+            if (journal_pos >= journal_len)
+            {
+                journal_pos = journal_block;
+                wrapped = true;
+            }
+            // Read the data and verify its checksum. The read is done outside of assert()
+            // so that it still happens when assertions are compiled out with NDEBUG.
+            void *data = memalign(MEM_ALIGNMENT, je->small_write.len);
+            ssize_t data_read = -1;
+            if (data)
+            {
+                data_read = pread(fd, data, je->small_write.len, journal_offset+je->small_write.data_offset);
+            }
+            if (data_read == (ssize_t)je->small_write.len)
+            {
+                uint32_t data_crc32 = crc32c(0, data, je->small_write.len);
+                printf(
+                    " data_crc32=%08x%s", je->small_write.crc32_data,
+                    (data_crc32 != je->small_write.crc32_data) ? " (invalid)" : " (valid)"
+                );
+            }
+            else
+            {
+                printf(" data_crc32=%08x (failed to read data)", je->small_write.crc32_data);
+            }
+            free(data);
+            printf("\n");
+        }
+        else if (je->type == JE_BIG_WRITE)
+        {
+            printf("je_big_write oid=%lu:%lu ver=%lu loc=%08lx\n", je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location);
+        }
+        else if (je->type == JE_STABLE)
+        {
+            printf("je_stable oid=%lu:%lu ver=%lu\n", je->stable.oid.inode, je->stable.oid.stripe, je->stable.version);
+        }
+        else if (je->type == JE_ROLLBACK)
+        {
+            printf("je_rollback oid=%lu:%lu ver=%lu\n", je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version);
+        }
+        else if (je->type == JE_DELETE)
+        {
+            printf("je_delete oid=%lu:%lu ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version);
+        }
+        pos += je->size;
+        entry++;
+    }
+    if (wrapped)
+    {
+        // The data wrapped around the end of the journal, so there is nothing left to scan
+        journal_pos = journal_len;
+    }
+}
--- a/etcd_state_client.cpp
+++ b/etcd_state_client.cpp
@@ -0,0 +1,374 @@
+#include "osd_ops.h"
+#include "pg_states.h"
+#include "etcd_state_client.h"
+#include "http_client.h"
+#include "base64.h"
+
+// Decodes one key-value pair of an etcd response: both key and value are base64-encoded
+// and the value is JSON text. An empty value yields a null JSON value. When the value
+// is not valid JSON the problem is logged and the returned key is cleared.
+json_kv_t etcd_state_client_t::parse_etcd_kv(const json11::Json & kv_json)
+{
+    json_kv_t result;
+    result.key = base64_decode(kv_json["key"].string_value());
+    std::string parse_error;
+    std::string raw_value = base64_decode(kv_json["value"].string_value());
+    if (raw_value != "")
+    {
+        result.value = json11::Json::parse(raw_value, parse_error);
+    }
+    if (parse_error != "")
+    {
+        printf("Bad JSON in etcd key %s: %s (value: %s)\n", result.key.c_str(), parse_error.c_str(), raw_value.c_str());
+        result.key = "";
+    }
+    return result;
+}
+
+// Runs an etcd transaction: shortcut for etcd_call() on the "/kv/txn" API
+void etcd_state_client_t::etcd_txn(json11::Json txn, int timeout, std::function<void(std::string, json11::Json)> callback)
+{
+    const char *txn_api = "/kv/txn";
+    etcd_call(txn_api, txn, timeout, callback);
+}
+
+// Sends a JSON API request to a randomly chosen etcd address.
+//   api      - API path appended to the path part of the etcd address, e.g. "/kv/txn"
+//   payload  - request body, serialised as JSON
+//   timeout  - passed through to http_request_json()
+//   callback - receives an error string (empty on success) and the decoded response
+// When no etcd address is configured the callback is invoked right away with an error.
+void etcd_state_client_t::etcd_call(std::string api, json11::Json payload, int timeout, std::function<void(std::string, json11::Json)> callback)
+{
+    if (etcd_addresses.empty())
+    {
+        // rand() % 0 below would be a division by zero
+        callback("etcd address is not specified", json11::Json());
+        return;
+    }
+    std::string etcd_address = etcd_addresses[rand() % etcd_addresses.size()];
+    std::string etcd_api_path;
+    // Everything starting from the first slash is the API path prefix
+    size_t pos = etcd_address.find('/');
+    if (pos != std::string::npos)
+    {
+        etcd_api_path = etcd_address.substr(pos);
+        etcd_address = etcd_address.substr(0, pos);
+    }
+    std::string req = payload.dump();
+    req = "POST "+etcd_api_path+api+" HTTP/1.1\r\n"
+        "Host: "+etcd_address+"\r\n"
+        "Content-Type: application/json\r\n"
+        "Content-Length: "+std::to_string(req.size())+"\r\n"
+        "Connection: close\r\n"
+        "\r\n"+req;
+    http_request_json(tfd, etcd_address, req, timeout, callback);
+}
+
+// Opens a websocket to the watch API of a randomly chosen etcd address and subscribes to
+// four key ranges: <prefix>/config/, <prefix>/osd/state/, <prefix>/pg/state/ and
+// <prefix>/pg/history/. Incoming events are applied with parse_state() and then reported
+// through on_change_hook. When the connection ends the watcher restarts itself.
+void etcd_state_client_t::start_etcd_watcher()
+{
+    std::string etcd_address = etcd_addresses[rand() % etcd_addresses.size()];
+    std::string etcd_api_path;
+    // Everything starting from the first slash of the address is the API path prefix
+    int pos = etcd_address.find('/');
+    if (pos >= 0)
+    {
+        etcd_api_path = etcd_address.substr(pos);
+        etcd_address = etcd_address.substr(0, pos);
+    }
+    etcd_watches_initialised = 0;
+    etcd_watch_ws = open_websocket(tfd, etcd_address, etcd_api_path+"/watch", ETCD_SLOW_TIMEOUT, [this](const http_response_t *msg)
+    {
+        if (msg->body.length())
+        {
+            std::string json_err;
+            json11::Json data = json11::Json::parse(msg->body, json_err);
+            if (json_err != "")
+            {
+                printf("Bad JSON in etcd event: %s, ignoring event\n", json_err.c_str());
+            }
+            else
+            {
+                // Count the confirmations of the watch requests posted below
+                if (data["result"]["created"].bool_value())
+                {
+                    etcd_watches_initialised++;
+                }
+                // 4 is the number of create_request messages posted below
+                if (etcd_watches_initialised == 4)
+                {
+                    etcd_watch_revision = data["result"]["header"]["revision"].uint64_value();
+                }
+                // First gather all changes into a hash to remove multiple overwrites
+                json11::Json::object changes;
+                for (auto & ev: data["result"]["events"].array_items())
+                {
+                    auto kv = parse_etcd_kv(ev["kv"]);
+                    if (kv.key != "")
+                    {
+                        changes[kv.key] = kv.value;
+                    }
+                }
+                for (auto & kv: changes)
+                {
+                    if (this->log_level > 0)
+                    {
+                        printf("Incoming event: %s -> %s\n", kv.first.c_str(), kv.second.dump().c_str());
+                    }
+                    parse_state(kv.first, kv.second);
+                }
+                // React to changes
+                on_change_hook(changes);
+            }
+        }
+        if (msg->eof)
+        {
+            etcd_watch_ws = NULL;
+            if (etcd_watches_initialised == 0)
+            {
+                // Connection not established, retry in <ETCD_SLOW_TIMEOUT>
+                tfd->set_timer(ETCD_SLOW_TIMEOUT, false, [this](int)
+                {
+                    start_etcd_watcher();
+                });
+            }
+            else
+            {
+                // Connection was live, retry immediately
+                start_etcd_watcher();
+            }
+        }
+    });
+    // NOTE(review): etcd_watch_ws is used without a NULL check - confirm open_websocket() cannot fail
+    // All watches start right after the last revision we have already seen
+    etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
+        { "create_request", json11::Json::object {
+            { "key", base64_encode(etcd_prefix+"/config/") },
+            { "range_end", base64_encode(etcd_prefix+"/config0") },
+            { "start_revision", etcd_watch_revision+1 },
+            { "watch_id", ETCD_CONFIG_WATCH_ID },
+        } }
+    }).dump());
+    etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
+        { "create_request", json11::Json::object {
+            { "key", base64_encode(etcd_prefix+"/osd/state/") },
+            { "range_end", base64_encode(etcd_prefix+"/osd/state0") },
+            { "start_revision", etcd_watch_revision+1 },
+            { "watch_id", ETCD_OSD_STATE_WATCH_ID },
+        } }
+    }).dump());
+    etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
+        { "create_request", json11::Json::object {
+            { "key", base64_encode(etcd_prefix+"/pg/state/") },
+            { "range_end", base64_encode(etcd_prefix+"/pg/state0") },
+            { "start_revision", etcd_watch_revision+1 },
+            { "watch_id", ETCD_PG_STATE_WATCH_ID },
+        } }
+    }).dump());
+    etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
+        { "create_request", json11::Json::object {
+            { "key", base64_encode(etcd_prefix+"/pg/history/") },
+            { "range_end", base64_encode(etcd_prefix+"/pg/history0") },
+            { "start_revision", etcd_watch_revision+1 },
+            { "watch_id", ETCD_PG_HISTORY_WATCH_ID },
+        } }
+    }).dump());
+}
+
+// Reads <etcd_prefix>/config/global from etcd and passes the decoded object (empty when
+// the key is absent or is not a JSON object) to on_load_config_hook.
+// The request is repeated in ETCD_SLOW_TIMEOUT when it fails.
+void etcd_state_client_t::load_global_config()
+{
+    json11::Json::object range_request = {
+        { "key", base64_encode(etcd_prefix+"/config/global") }
+    };
+    auto on_reply = [this](std::string err, json11::Json data)
+    {
+        if (err != "")
+        {
+            printf("Error reading OSD configuration from etcd: %s\n", err.c_str());
+            tfd->set_timer(ETCD_SLOW_TIMEOUT, false, [this](int timer_id)
+            {
+                load_global_config();
+            });
+            return;
+        }
+        // Remember the revision to start watching from, unless it is already known
+        if (!etcd_watch_revision)
+        {
+            etcd_watch_revision = data["header"]["revision"].uint64_value();
+        }
+        json11::Json::object global_config;
+        const auto & kvs = data["kvs"].array_items();
+        if (!kvs.empty())
+        {
+            json_kv_t kv = parse_etcd_kv(kvs[0]);
+            if (kv.value.is_object())
+            {
+                global_config = kv.value.object_items();
+            }
+        }
+        on_load_config_hook(global_config);
+    };
+    etcd_call("/kv/range", range_request, ETCD_SLOW_TIMEOUT, on_reply);
+}
+
+// Loads the PG configuration, PG history, PG states and OSD states from etcd in one
+// transaction, optionally guarded by the comparisons returned from load_pgs_checks_hook.
+// Every returned key is applied with parse_state(); on_load_pgs_hook receives true when
+// the transaction succeeded and false when the comparisons failed.
+// The request is repeated in ETCD_SLOW_TIMEOUT when it fails.
+void etcd_state_client_t::load_pgs()
+{
+    // Builds a range request for a single key (empty <end_key>) or for a key range
+    auto range_request = [this](const std::string & first_key, const std::string & end_key)
+    {
+        json11::Json::object range = {
+            { "key", base64_encode(etcd_prefix+first_key) },
+        };
+        if (end_key != "")
+        {
+            range["range_end"] = base64_encode(etcd_prefix+end_key);
+        }
+        return json11::Json(json11::Json::object {
+            { "request_range", range },
+        });
+    };
+    json11::Json::array txn = {
+        range_request("/config/pgs", ""),
+        range_request("/pg/history/", "/pg/history0"),
+        range_request("/pg/state/", "/pg/state0"),
+        range_request("/osd/state/", "/osd/state0"),
+    };
+    json11::Json::object req = { { "success", txn } };
+    json11::Json checks = load_pgs_checks_hook();
+    if (!checks.array_items().empty())
+    {
+        req["compare"] = checks;
+    }
+    auto on_reply = [this](std::string err, json11::Json data)
+    {
+        if (err != "")
+        {
+            printf("Error loading PGs from etcd: %s\n", err.c_str());
+            tfd->set_timer(ETCD_SLOW_TIMEOUT, false, [this](int timer_id)
+            {
+                load_pgs();
+            });
+            return;
+        }
+        if (!data["succeeded"].bool_value())
+        {
+            on_load_pgs_hook(false);
+            return;
+        }
+        for (auto & response: data["responses"].array_items())
+        {
+            for (auto & kv_json: response["response_range"]["kvs"].array_items())
+            {
+                json_kv_t kv = parse_etcd_kv(kv_json);
+                parse_state(kv.key, kv.value);
+            }
+        }
+        on_load_pgs_hook(true);
+    };
+    etcd_txn(req, ETCD_SLOW_TIMEOUT, on_reply);
+}
+
+// Applies one etcd key-value pair to the local copy of the cluster state.
+// Recognised keys (all under <etcd_prefix>):
+//   /config/pgs       - PG configuration, updates pg_config
+//   /pg/history/<pg>  - OSD set history and extra peers of a PG
+//   /pg/state/<pg>    - current primary and state of a PG
+//   /osd/state/<osd>  - state of an OSD, updates peer_states and calls on_change_osd_state_hook
+// Other keys are ignored; malformed keys and values are logged and skipped.
+void etcd_state_client_t::parse_state(const std::string & key, const json11::Json & value)
+{
+    if (key == etcd_prefix+"/config/pgs")
+    {
+        // Mark everything as missing first; the PGs still present are marked again below
+        for (auto & pg_item: this->pg_config)
+        {
+            pg_item.second.exists = false;
+        }
+        for (auto & pg_item: value["items"].object_items())
+        {
+            pg_num_t pg_num = stoull_full(pg_item.first);
+            if (!pg_num)
+            {
+                printf("Bad key in PG configuration: %s (must be a number), skipped\n", pg_item.first.c_str());
+                continue;
+            }
+            this->pg_config[pg_num].exists = true;
+            this->pg_config[pg_num].pause = pg_item.second["pause"].bool_value();
+            this->pg_config[pg_num].primary = pg_item.second["primary"].uint64_value();
+            this->pg_config[pg_num].target_set.clear();
+            for (auto pg_osd: pg_item.second["osd_set"].array_items())
+            {
+                this->pg_config[pg_num].target_set.push_back(pg_osd.uint64_value());
+            }
+            // Only sets of exactly 3 OSDs are accepted; anything else pauses the PG
+            if (this->pg_config[pg_num].target_set.size() != 3)
+            {
+                printf("Bad PG %u config format: incorrect osd_set = %s\n", pg_num, pg_item.second["osd_set"].dump().c_str());
+                this->pg_config[pg_num].target_set.resize(3);
+                this->pg_config[pg_num].pause = true;
+            }
+        }
+    }
+    else if (key.substr(0, etcd_prefix.length()+12) == etcd_prefix+"/pg/history/")
+    {
+        // <etcd_prefix>/pg/history/%d
+        pg_num_t pg_num = stoull_full(key.substr(etcd_prefix.length()+12));
+        if (!pg_num)
+        {
+            printf("Bad etcd key %s, ignoring\n", key.c_str());
+        }
+        else
+        {
+            auto & pg_cfg = this->pg_config[pg_num];
+            pg_cfg.target_history.clear();
+            pg_cfg.all_peers.clear();
+            // Refuse to start PG if any set of the <osd_sets> has no live OSDs
+            for (auto hist_item: value["osd_sets"].array_items())
+            {
+                std::vector<osd_num_t> history_set;
+                for (auto pg_osd: hist_item.array_items())
+                {
+                    history_set.push_back(pg_osd.uint64_value());
+                }
+                pg_cfg.target_history.push_back(history_set);
+            }
+            // Include these additional OSDs when peering the PG
+            for (auto pg_osd: value["all_peers"].array_items())
+            {
+                pg_cfg.all_peers.push_back(pg_osd.uint64_value());
+            }
+        }
+    }
+    else if (key.substr(0, etcd_prefix.length()+10) == etcd_prefix+"/pg/state/")
+    {
+        // <etcd_prefix>/pg/state/%d
+        pg_num_t pg_num = stoull_full(key.substr(etcd_prefix.length()+10));
+        if (!pg_num)
+        {
+            printf("Bad etcd key %s, ignoring\n", key.c_str());
+        }
+        else if (value.is_null())
+        {
+            // A null value resets the PG to "no primary, no state"
+            this->pg_config[pg_num].cur_primary = 0;
+            this->pg_config[pg_num].cur_state = 0;
+        }
+        else
+        {
+            osd_num_t cur_primary = value["primary"].uint64_value();
+            // Convert the list of state keywords into a bit mask
+            int state = 0;
+            for (auto & e: value["state"].array_items())
+            {
+                int i;
+                for (i = 0; i < pg_state_bit_count; i++)
+                {
+                    if (e.string_value() == pg_state_names[i])
+                    {
+                        state = state | pg_state_bits[i];
+                        break;
+                    }
+                }
+                if (i >= pg_state_bit_count)
+                {
+                    printf("Unexpected PG %u state keyword in etcd: %s\n", pg_num, e.dump().c_str());
+                    return;
+                }
+            }
+            // offline, peering and incomplete may not be combined with any other state
+            if (!cur_primary || !value["state"].is_array() || !state ||
+                (state & PG_OFFLINE) && state != PG_OFFLINE ||
+                (state & PG_PEERING) && state != PG_PEERING ||
+                (state & PG_INCOMPLETE) && state != PG_INCOMPLETE)
+            {
+                printf("Unexpected PG %u state in etcd: primary=%lu, state=%s\n", pg_num, cur_primary, value["state"].dump().c_str());
+                return;
+            }
+            this->pg_config[pg_num].cur_primary = cur_primary;
+            this->pg_config[pg_num].cur_state = state;
+        }
+    }
+    else if (key.substr(0, etcd_prefix.length()+11) == etcd_prefix+"/osd/state/")
+    {
+        // <etcd_prefix>/osd/state/%d
+        // Use the same helper as the branches above: std::stoull() throws on a key
+        // without a number, and nothing here would catch that exception
+        osd_num_t peer_osd = stoull_full(key.substr(etcd_prefix.length()+11));
+        if (peer_osd > 0)
+        {
+            // Only OSDs which are up and have an address list and a valid port are kept
+            if (value.is_object() && value["state"] == "up" &&
+                value["addresses"].is_array() &&
+                value["port"].int64_value() > 0 && value["port"].int64_value() < 65536)
+            {
+                this->peer_states[peer_osd] = value;
+            }
+            else
+            {
+                this->peer_states.erase(peer_osd);
+            }
+            if (on_change_osd_state_hook != NULL)
+            {
+                on_change_osd_state_hook(peer_osd);
+            }
+        }
+    }
+}
--- a/etcd_state_client.h
+++ b/etcd_state_client.h
@@ -0,0 +1,59 @@
+#pragma once
+
+#include "http_client.h"
+#include "timerfd_manager.h"
+
+#define ETCD_CONFIG_WATCH_ID 1
+#define ETCD_PG_STATE_WATCH_ID 2
+#define ETCD_PG_HISTORY_WATCH_ID 3
+#define ETCD_OSD_STATE_WATCH_ID 4
+
+#define MAX_ETCD_ATTEMPTS 5
+#define ETCD_SLOW_TIMEOUT 5000
+#define ETCD_QUICK_TIMEOUT 1000
+
+// State of one placement group, stored in etcd_state_client_t::pg_config keyed by PG number.
+// NOTE(review): except where commented, field meanings are only implied by their names - confirm in the .cpp.
+struct pg_config_t
+{
+    bool exists;
+    osd_num_t primary;
+    std::vector<osd_num_t> target_set;
+    std::vector<std::vector<osd_num_t>> target_history;
+    std::vector<osd_num_t> all_peers;
+    bool pause;
+    // Primary OSD and state bitmask (OR of pg_state_bits values) taken from the PG state record in etcd
+    osd_num_t cur_primary;
+    int cur_state;
+};
+
+// A key together with its JSON value; this is the return type of etcd_state_client_t::parse_etcd_kv()
+struct json_kv_t
+{
+    std::string key;
+    json11::Json value;
+};
+
+// Holds etcd connection settings, the cached PG configuration and peer OSD states,
+// and the hooks that are invoked when that cached state is loaded or changes.
+// NOTE(review): the methods are implemented in etcd_state_client.cpp; only part of
+// parse_state() is visible from this header's diff context, so most notes below are hedged.
+struct etcd_state_client_t
+{
+    std::vector<std::string> etcd_addresses;
+    // Common prefix of all keys; "<etcd_prefix>/osd/state/<osd_num>" is one of the parsed key patterns
+    std::string etcd_prefix;
+    int log_level = 0;
+    timerfd_manager_t *tfd = NULL;
+
+    int etcd_watches_initialised = 0;
+    uint64_t etcd_watch_revision = 0;
+    websocket_t *etcd_watch_ws = NULL;
+    // PG number -> PG record; cur_primary/cur_state are updated while parsing PG state keys
+    std::map<pg_num_t, pg_config_t> pg_config;
+    // OSD number -> last accepted OSD state object. An entry is kept only while the value is an
+    // object with state "up", an "addresses" array and a port in 1..65535; otherwise it is erased
+    std::map<osd_num_t, json11::Json> peer_states;
+
+    std::function<void(json11::Json::object &)> on_change_hook;
+    std::function<void(json11::Json::object &)> on_load_config_hook;
+    std::function<json11::Json()> load_pgs_checks_hook;
+    std::function<void(bool)> on_load_pgs_hook;
+    // Called with the OSD number after its peer_states entry has been stored or erased (skipped when unset)
+    std::function<void(uint64_t)> on_change_osd_state_hook;
+
+    json_kv_t parse_etcd_kv(const json11::Json & kv_json);
+    void etcd_call(std::string api, json11::Json payload, int timeout, std::function<void(std::string, json11::Json)> callback);
+    void etcd_txn(json11::Json txn, int timeout, std::function<void(std::string, json11::Json)> callback);
+    void start_etcd_watcher();
+    void load_global_config();
+    void load_pgs();
+    void parse_state(const std::string & key, const json11::Json & value);
+};
--- a/fio_engine.cpp
+++ b/fio_engine.cpp
@@ -23,6 +23,7 @@

 #include "blockstore.h"
 extern "C" {
+#define CONFIG_HAVE_GETTID
 #define CONFIG_PWRITEV2
 #include "fio/fio.h"
 #include "fio/optgroup.h"
@@ -100,7 +101,7 @@ static void bs_cleanup(struct thread_data *td)
                bsd->ringloop->loop();
                if (bsd->bs->is_safe_to_stop())
                    goto safe;
-            } while (bsd->ringloop->get_loop_again());
+            } while (bsd->ringloop->has_work());
            bsd->ringloop->wait();
        }
    safe:
--- a/fio_sec_osd.cpp
+++ b/fio_sec_osd.cpp
@@ -28,6 +28,7 @@
 #include "rw_blocking.h"
 #include "osd_ops.h"
 extern "C" {
+#define CONFIG_HAVE_GETTID
 #define CONFIG_PWRITEV2
 #include "fio/fio.h"
 #include "fio/optgroup.h"
--- a/http_client.cpp
+++ b/http_client.cpp
@@ -0,0 +1,657 @@
+#include <netinet/tcp.h>
+#include <sys/epoll.h>
+
+#include <net/if.h>
+#include <arpa/inet.h>
+#include <ifaddrs.h>
+
+#include <ctype.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+
+#include "json11/json11.hpp"
+#include "http_client.h"
+#include "timerfd_manager.h"
+
+#define READ_BUFFER_SIZE 9000
+
+static int extract_port(std::string & host);
+static std::string strtolower(const std::string & in);
+static std::string trim(const std::string & in);
+static std::string ws_format_frame(int type, uint64_t size);
+static bool ws_parse_frame(std::string & buf, int & type, std::string & res);
+
+// FIXME: Use keepalive
+// State of one HTTP or WebSocket exchange over a single non-blocking socket.
+// Instances are heap-allocated by http_request()/open_websocket() and delete themselves
+// when the exchange ends; the destructor delivers the final response to `callback`.
+struct http_co_t
+{
+    // Provides the timers and fd readiness callbacks used by this object
+    timerfd_manager_t *tfd;
+
+    // Passed to tfd->set_timer() when > 0 (presumably milliseconds - TODO confirm); <= 0 means no timer
+    int request_timeout = 0;
+    // "address[:port]" on input; start_connection() strips the port part
+    std::string host;
+    // Bytes to send: the HTTP request first, then queued outgoing frames in websocket state
+    std::string request;
+    // WebSocket frames posted before the handshake completed
+    std::string ws_outbox;
+    // Received bytes that have not been consumed yet
+    std::string response;
+    bool want_streaming;
+
+    // Response handed to `callback`
+    http_response_t parsed;
+    // Expected body size taken from Content-Length; 0 when not used
+    uint64_t target_response_size = 0;
+
+    // One of HTTP_CO_*; 0 before the connection is started
+    int state = 0;
+    int peer_fd = -1;
+    int timeout_id = -1;
+    // Readiness flags accumulated from the fd handler and not yet acted upon
+    int epoll_events = 0;
+    // Number of bytes of `request` already written to the socket
+    int sent = 0;
+    std::vector<char> rbuf;
+    iovec read_iov, send_iov;
+    msghdr read_msg = { 0 }, send_msg = { 0 };
+
+    std::function<void(const http_response_t*)> callback;
+
+    // Handle returned by open_websocket(); ws.co points back to this object
+    websocket_t ws;
+
+    ~http_co_t();
+    void start_connection();
+    void handle_connect_result();
+    void submit_read();
+    void submit_send();
+    bool handle_read();
+    void post_message(int type, const std::string & msg);
+};
+
+#define HTTP_CO_CONNECTING 1
+#define HTTP_CO_SENDING_REQUEST 2
+#define HTTP_CO_REQUEST_SENT 3
+#define HTTP_CO_HEADERS_RECEIVED 4
+#define HTTP_CO_WEBSOCKET 5
+#define HTTP_CO_CHUNKED 6
+
+#define DEFAULT_TIMEOUT 5000
+
+// Start an asynchronous HTTP request: <request> is sent verbatim to <host> and <callback>
+// receives the parsed response (possibly several times when options.want_streaming is set).
+// options.timeout < 0 disables the timer, 0 selects DEFAULT_TIMEOUT.
+void http_request(timerfd_manager_t *tfd, const std::string & host, const std::string & request,
+    const http_options_t & options, std::function<void(const http_response_t *response)> callback)
+{
+    int effective_timeout = options.timeout;
+    if (effective_timeout < 0)
+        effective_timeout = 0;
+    else if (effective_timeout == 0)
+        effective_timeout = DEFAULT_TIMEOUT;
+    // The coroutine object owns itself and is deleted when the exchange ends
+    http_co_t *co = new http_co_t();
+    co->tfd = tfd;
+    co->host = host;
+    co->request = request;
+    co->request_timeout = effective_timeout;
+    co->want_streaming = options.want_streaming;
+    co->callback = callback;
+    co->ws.co = co;
+    co->start_connection();
+}
+
+// Run an HTTP request and decode the response body as JSON.
+// <callback> gets an empty error string and the parsed document on success, or a
+// human-readable error and a null Json on socket errors, non-200 statuses and bad JSON.
+void http_request_json(timerfd_manager_t *tfd, const std::string & host, const std::string & request,
+    int timeout, std::function<void(std::string, json11::Json r)> callback)
+{
+    http_options_t options = {};
+    options.timeout = timeout;
+    http_request(tfd, host, request, options, [callback](const http_response_t* res)
+    {
+        std::string error;
+        json11::Json data;
+        if (res->error_code != 0)
+        {
+            error = "Error code: "+std::to_string(res->error_code)+" ("+std::string(strerror(res->error_code))+")";
+        }
+        else if (res->status_code != 200)
+        {
+            error = "HTTP "+std::to_string(res->status_code)+" "+res->status_line+" body: "+trim(res->body);
+        }
+        else
+        {
+            std::string json_err;
+            data = json11::Json::parse(res->body, json_err);
+            if (json_err != "")
+            {
+                error = "Bad JSON: "+json_err+" (response: "+trim(res->body)+")";
+                data = json11::Json();
+            }
+        }
+        callback(error, data);
+    });
+}
+
+// Open a WebSocket connection to <host> at <path>. <callback> is invoked for every
+// received frame and once more when the connection object is destroyed.
+// timeout < 0 disables the timer, 0 selects DEFAULT_TIMEOUT.
+// NOTE(review): start_connection() deletes the handler when it fails synchronously,
+// so the returned pointer may already be dangling in that case - confirm how callers cope.
+websocket_t* open_websocket(timerfd_manager_t *tfd, const std::string & host, const std::string & path,
+    int timeout, std::function<void(const http_response_t *msg)> callback)
+{
+    // Fixed handshake key: the accept header of the reply is not validated anyway
+    std::string handshake;
+    handshake += "GET "+path+" HTTP/1.1\r\n";
+    handshake += "Host: "+host+"\r\n";
+    handshake += "Upgrade: websocket\r\n";
+    handshake += "Connection: upgrade\r\n";
+    handshake += "Sec-WebSocket-Key: x3JJHMbDL1EzLkh9GBhXDw==\r\n";
+    handshake += "Sec-WebSocket-Version: 13\r\n";
+    handshake += "\r\n";
+    int effective_timeout = timeout;
+    if (timeout < 0)
+        effective_timeout = -1;
+    else if (timeout == 0)
+        effective_timeout = DEFAULT_TIMEOUT;
+    http_co_t *co = new http_co_t();
+    co->tfd = tfd;
+    co->host = host;
+    co->request = handshake;
+    co->request_timeout = effective_timeout;
+    co->want_streaming = false;
+    co->callback = callback;
+    co->ws.co = co;
+    co->start_connection();
+    return &co->ws;
+}
+
+// Forward a message of the given WS_* type to the owning connection object,
+// which sends it right away or queues it until the handshake is done
+void websocket_t::post_message(int type, const std::string & msg)
+{
+    co->post_message(type, msg);
+}
+
+// Destroy the owning connection object. This websocket_t is a member of that object,
+// so the handle itself is gone after the call; the destructor also fires the callback
+void websocket_t::close()
+{
+    delete co;
+}
+
+// Releases the timer and the socket, moves whatever is left in the receive buffer
+// into parsed.body and reports the final result (eof = true) to the callback.
+http_co_t::~http_co_t()
+{
+    if (timeout_id >= 0)
+    {
+        tfd->clear_timer(timeout_id);
+        timeout_id = -1;
+    }
+    if (peer_fd >= 0)
+    {
+        tfd->set_fd_handler(peer_fd, false, NULL);
+        close(peer_fd);
+        peer_fd = -1;
+    }
+    if (parsed.headers["transfer-encoding"] == "chunked")
+    {
+        // Decode the chunks still sitting in the buffer. Offsets are kept in size_t and the
+        // chunk length is checked against the remaining data, so a huge length sent by the
+        // server cannot wrap the offset and make us read outside of the buffer.
+        size_t prev = 0, pos = 0;
+        while (prev < response.size() && (pos = response.find("\r\n", prev)) != std::string::npos)
+        {
+            uint64_t len = strtoull(response.c_str()+prev, NULL, 16);
+            // substr() clamps to the end of the buffer, so a truncated last chunk is kept as is
+            parsed.body += response.substr(pos+2, len);
+            if (len >= response.size()-pos-2)
+            {
+                break;
+            }
+            prev = pos+2+len+2;
+        }
+    }
+    else
+    {
+        std::swap(parsed.body, response);
+    }
+    parsed.eof = true;
+    callback(&parsed);
+}
+
+// Parses the target address, creates a non-blocking socket, arms the optional request
+// timer and starts an asynchronous connect(). On every failure path the object deletes
+// itself, which reports parsed.error_code to the callback through the destructor.
+void http_co_t::start_connection()
+{
+    // extract_port() also removes the ":port" suffix from host
+    int port = extract_port(host);
+    struct sockaddr_in addr;
+    int r;
+    // Only literal IPv4 addresses are accepted here, no name resolution is done
+    if ((r = inet_pton(AF_INET, host.c_str(), &addr.sin_addr)) != 1)
+    {
+        parsed.error_code = ENXIO;
+        // FIXME 'delete this' is ugly...
+        delete this;
+        return;
+    }
+    addr.sin_family = AF_INET;
+    // Port 80 is used when the host string carried no (valid) port
+    addr.sin_port = htons(port ? port : 80);
+    peer_fd = socket(AF_INET, SOCK_STREAM, 0);
+    if (peer_fd < 0)
+    {
+        parsed.error_code = errno;
+        delete this;
+        return;
+    }
+    fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
+    if (request_timeout > 0)
+    {
+        timeout_id = tfd->set_timer(request_timeout, false, [this](int timer_id)
+        {
+            // ETIME is reported only when nothing has been received at all
+            if (response.length() == 0)
+            {
+                parsed.error_code = ETIME;
+            }
+            delete this;
+        });
+    }
+    // The handler must be in place before connect() so that its completion is not missed
+    tfd->set_fd_handler(peer_fd, true, [this](int peer_fd, int epoll_events)
+    {
+        this->epoll_events |= epoll_events;
+        handle_connect_result();
+    });
+    epoll_events = 0;
+    // Finally call connect
+    r = ::connect(peer_fd, (sockaddr*)&addr, sizeof(addr));
+    // EINPROGRESS is the normal result for a non-blocking socket
+    if (r < 0 && errno != EINPROGRESS)
+    {
+        parsed.error_code = errno;
+        delete this;
+        return;
+    }
+    state = HTTP_CO_CONNECTING;
+}
+
+// Called from the fd handler installed by start_connection(). Checks the outcome of the
+// asynchronous connect() via SO_ERROR; on success installs the read handler and starts
+// sending the request, on failure deletes the object (which reports the error).
+void http_co_t::handle_connect_result()
+{
+    if (epoll_events & (EPOLLOUT | EPOLLERR))
+    {
+        int result = 0;
+        socklen_t result_len = sizeof(result);
+        if (getsockopt(peer_fd, SOL_SOCKET, SO_ERROR, &result, &result_len) < 0)
+        {
+            result = errno;
+        }
+        if (result != 0)
+        {
+            parsed.error_code = result;
+            delete this;
+            return;
+        }
+        int one = 1;
+        setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
+        // Replace the connect handler with the data handler.
+        // NOTE(review): the second argument changes from true to false here - presumably it
+        // selects write-readiness notifications; confirm against timerfd_manager_t
+        tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
+        {
+            this->epoll_events |= epoll_events;
+            if (this->epoll_events & EPOLLIN)
+            {
+                // submit_read() itself handles a hangup that arrives together with data
+                submit_read();
+            }
+            else if (this->epoll_events & (EPOLLRDHUP|EPOLLERR))
+            {
+                delete this;
+            }
+        });
+        state = HTTP_CO_SENDING_REQUEST;
+        submit_send();
+    }
+    else
+    {
+        // Any other event while connecting is treated as a failure
+        delete this;
+    }
+}
+
+// Drains the socket into `response`, lets handle_read() process the data and
+// destroys the object when the peer has closed the connection or an error occurred.
+void http_co_t::submit_read()
+{
+    int res;
+    // Set when recvmsg() reports an orderly shutdown by the peer
+    bool peer_closed = false;
+again:
+    if (rbuf.size() != READ_BUFFER_SIZE)
+    {
+        rbuf.resize(READ_BUFFER_SIZE);
+    }
+    read_iov = { .iov_base = rbuf.data(), .iov_len = READ_BUFFER_SIZE };
+    read_msg.msg_iov = &read_iov;
+    read_msg.msg_iovlen = 1;
+    epoll_events = epoll_events & ~EPOLLIN;
+    res = recvmsg(peer_fd, &read_msg, 0);
+    if (res < 0)
+    {
+        res = -errno;
+    }
+    else if (res == 0)
+    {
+        // A zero-length read on a stream socket means EOF. It must not be mistaken for
+        // "no data yet" (EAGAIN), otherwise a closed connection is never torn down
+        // unless epoll happens to report EPOLLRDHUP as well.
+        peer_closed = true;
+    }
+    if (res == -EAGAIN)
+    {
+        res = 0;
+    }
+    if (res < 0)
+    {
+        delete this;
+        return;
+    }
+    response += std::string(rbuf.data(), res);
+    if (res == READ_BUFFER_SIZE)
+    {
+        // The buffer was filled completely, there may be more data to read
+        goto again;
+    }
+    if (!handle_read())
+    {
+        // handle_read() has already destroyed the object
+        return;
+    }
+    if (peer_closed || (epoll_events & (EPOLLRDHUP|EPOLLERR)))
+    {
+        delete this;
+        return;
+    }
+}
+
+// Writes the unsent part of `request` to the socket. While sending the HTTP request the
+// state advances to HTTP_CO_REQUEST_SENT once everything is written; in websocket state
+// the sent prefix is removed from `request` after every write.
+// NOTE(review): EAGAIN is turned into "0 bytes sent" and followed by `goto again` in both
+// states, so a full socket buffer is retried in a tight loop instead of waiting for
+// write readiness - confirm whether that is acceptable for the expected payload sizes.
+void http_co_t::submit_send()
+{
+    int res;
+again:
+    if (sent < request.size())
+    {
+        send_iov = (iovec){ .iov_base = (void*)(request.c_str()+sent), .iov_len = request.size()-sent };
+        send_msg.msg_iov = &send_iov;
+        send_msg.msg_iovlen = 1;
+        res = sendmsg(peer_fd, &send_msg, 0);
+        if (res < 0)
+        {
+            res = -errno;
+        }
+        if (res == -EAGAIN)
+        {
+            res = 0;
+        }
+        else if (res < 0)
+        {
+            // Any other error destroys the object, which reports the result to the callback
+            delete this;
+            return;
+        }
+        sent += res;
+        if (state == HTTP_CO_SENDING_REQUEST)
+        {
+            if (sent >= request.size())
+                state = HTTP_CO_REQUEST_SENT;
+            else
+                goto again;
+        }
+        else if (state == HTTP_CO_WEBSOCKET)
+        {
+            // Drop what has been written so that post_message() can keep appending frames
+            request = request.substr(sent);
+            sent = 0;
+            goto again;
+        }
+    }
+}
+
+// Processes the data accumulated in `response` according to the current state:
+// parses the headers, switches to websocket / chunked / content-length mode, decodes
+// chunks and websocket frames and invokes the callback for streamed data.
+// Returns false when the object has been destroyed and must not be touched anymore.
+bool http_co_t::handle_read()
+{
+    if (state == HTTP_CO_REQUEST_SENT)
+    {
+        int pos = response.find("\r\n\r\n");
+        if (pos >= 0)
+        {
+            // The timer is dropped as soon as the complete header block has arrived
+            if (timeout_id >= 0)
+            {
+                tfd->clear_timer(timeout_id);
+                timeout_id = -1;
+            }
+            state = HTTP_CO_HEADERS_RECEIVED;
+            // Also removes the header block from `response`
+            parse_http_headers(response, &parsed);
+            if (parsed.status_code == 101 &&
+                parsed.headers.find("sec-websocket-accept") != parsed.headers.end() &&
+                parsed.headers["upgrade"] == "websocket" &&
+                parsed.headers["connection"] == "upgrade")
+            {
+                // Don't care about validating the key
+                state = HTTP_CO_WEBSOCKET;
+                // Flush the messages posted before the handshake was complete
+                request = ws_outbox;
+                ws_outbox = "";
+                sent = 0;
+                submit_send();
+            }
+            else if (parsed.headers["transfer-encoding"] == "chunked")
+            {
+                state = HTTP_CO_CHUNKED;
+            }
+            else if (parsed.headers["connection"] != "close")
+            {
+                target_response_size = stoull_full(parsed.headers["content-length"]);
+                if (!target_response_size)
+                {
+                    // Sorry, unsupported response
+                    delete this;
+                    return false;
+                }
+            }
+        }
+    }
+    if (state == HTTP_CO_HEADERS_RECEIVED && target_response_size > 0 && response.size() >= target_response_size)
+    {
+        // The whole body is here, the destructor hands it over to the callback
+        delete this;
+        return false;
+    }
+    if (state == HTTP_CO_CHUNKED && response.size() > 0)
+    {
+        int prev = 0, pos = 0;
+        while ((pos = response.find("\r\n", prev)) >= prev)
+        {
+            uint64_t len = strtoull(response.c_str()+prev, NULL, 16);
+            if (!len)
+            {
+                // Zero length chunk indicates EOF
+                parsed.eof = true;
+                break;
+            }
+            // Wait for more data unless the chunk and its trailing CRLF are complete.
+            // The comparison avoids adding to len: a huge chunk size sent by the server
+            // must not wrap around and pass the check.
+            if (len > response.size() || response.size()-len < (uint64_t)pos+4)
+            {
+                break;
+            }
+            parsed.body += response.substr(pos+2, len);
+            prev = pos+2+len+2;
+        }
+        if (prev > 0)
+        {
+            // Keep only the not yet decoded tail
+            response = response.substr(prev);
+        }
+        if (parsed.eof)
+        {
+            delete this;
+            return false;
+        }
+        if (want_streaming && parsed.body.size() > 0)
+        {
+            callback(&parsed);
+            parsed.body = "";
+        }
+    }
+    if (state == HTTP_CO_WEBSOCKET && response.size() > 0)
+    {
+        // One callback per complete frame
+        while (ws_parse_frame(response, parsed.ws_msg_type, parsed.body))
+        {
+            callback(&parsed);
+            parsed.body = "";
+        }
+    }
+    return true;
+}
+
+// Queue a websocket message of the given type. Once the handshake is done the frame goes
+// to the send buffer and is written immediately; before that it is parked in ws_outbox.
+void http_co_t::post_message(int type, const std::string & msg)
+{
+    const bool connected = state == HTTP_CO_WEBSOCKET;
+    std::string & queue = connected ? request : ws_outbox;
+    queue += ws_format_frame(type, msg.size());
+    queue += msg;
+    if (connected)
+    {
+        // May destroy the object, so nothing is touched afterwards
+        submit_send();
+    }
+}
+
+// Strict string-to-unsigned conversion: returns the parsed value only when the WHOLE
+// string is a number in the given base, and 0 otherwise (empty string, leading
+// whitespace, leading minus sign, trailing garbage).
+uint64_t stoull_full(const std::string & str, int base)
+{
+    // strtoull() silently skips leading whitespace and accepts a minus sign (the result
+    // wraps around to a huge value), so both are rejected up front.
+    // The cast is required because isspace() is undefined for negative char values.
+    if (str.empty() || isspace((unsigned char)str[0]) || str[0] == '-')
+    {
+        return 0;
+    }
+    char *end = NULL;
+    uint64_t r = strtoull(str.c_str(), &end, base);
+    // Trailing characters (including embedded zero bytes) make the input invalid
+    if (end != str.c_str()+str.length())
+    {
+        return 0;
+    }
+    return r;
+}
+
+// Parses the status line and the header block at the beginning of <res> into <parsed>.
+// Header names are lowercased; the values of Connection, Upgrade and Transfer-Encoding
+// are lowercased too. When the empty line terminating the headers is found, everything
+// up to and including it is removed from <res>, so only the body remains.
+void parse_http_headers(std::string & res, http_response_t *parsed)
+{
+    int pos = res.find("\r\n");
+    pos = pos < 0 ? res.length() : pos+2;
+    std::string status_line = res.substr(0, pos);
+    int http_version = 0;
+    char *status_text = NULL;
+    // %m = allocate a buffer. The scanset takes the whole reason phrase up to the end
+    // of the line ("Not Found"), whereas %ms would stop at the first space ("Not")
+    sscanf(status_line.c_str(), "HTTP/1.%d %d %m[^\r\n]", &http_version, &parsed->status_code, &status_text);
+    if (status_text)
+    {
+        parsed->status_line = status_text;
+        free(status_text);
+        status_text = NULL;
+    }
+    int prev = pos;
+    while ((pos = res.find("\r\n", prev)) >= prev)
+    {
+        if (pos == prev)
+        {
+            // Empty line: end of headers, keep only the body
+            res = res.substr(pos+2);
+            break;
+        }
+        std::string header = res.substr(prev, pos-prev);
+        int p2 = header.find(":");
+        if (p2 >= 0)
+        {
+            std::string key = strtolower(header.substr(0, p2));
+            // Skip blanks between the colon and the value
+            // (the cast is required: isblank() is undefined for negative char values)
+            int p3 = p2+1;
+            while (p3 < header.length() && isblank((unsigned char)header[p3]))
+                p3++;
+            parsed->headers[key] = key == "connection" || key == "upgrade" || key == "transfer-encoding"
+                ? strtolower(header.substr(p3)) : header.substr(p3);
+        }
+        prev = pos+2;
+    }
+}
+
+// Build the header of a client-to-server websocket frame: FIN bit plus <type>,
+// the payload length in its 7 bit / 16 bit / 64 bit form with the mask bit set,
+// followed by an all-zero masking key (so the payload can be appended unchanged).
+static std::string ws_format_frame(int type, uint64_t size)
+{
+    std::string frame;
+    frame.push_back((char)(0x80 | type));
+    if (size < 126)
+    {
+        frame.push_back((char)(size | /*mask*/0x80));
+    }
+    else
+    {
+        // 126 announces a 2-byte length, 127 an 8-byte length, both big-endian
+        const bool is_short = size < 65536;
+        frame.push_back((char)((is_short ? 126 : 127) | /*mask*/0x80));
+        for (int shift = is_short ? 8 : 56; shift >= 0; shift -= 8)
+        {
+            frame.push_back((char)((size >> shift) & 0xFF));
+        }
+    }
+    // Always zero mask
+    frame.append(4, '\0');
+    return frame;
+}
+
+// Tries to extract one websocket frame from the beginning of <buf>.
+// Returns false when <buf> does not hold a complete frame yet (<buf> and <res> are left
+// untouched, <type> may already be set). Otherwise stores the opcode in <type>, appends
+// the unmasked payload to <res>, removes the frame from <buf> and returns true.
+static bool ws_parse_frame(std::string & buf, int & type, std::string & res)
+{
+    uint64_t hdr = 2;
+    if (buf.size() < hdr)
+    {
+        return false;
+    }
+    // The opcode is the low 4 bits of the first byte. The byte has to be read as unsigned:
+    // with a signed char, 0x81 & ~0x80 evaluates to -255 instead of 1.
+    type = (uint8_t)buf[0] & 0x0F;
+    bool mask = !!(buf[1] & 0x80);
+    // The masking key follows the (extended) length, so hdr-4 points at it in the end
+    hdr += mask ? 4 : 0;
+    uint64_t len = ((uint8_t)buf[1] & ~0x80);
+    if (len == 126)
+    {
+        // 16 bit big-endian length
+        hdr += 2;
+        if (buf.size() < hdr)
+        {
+            return false;
+        }
+        len = ((uint64_t)(uint8_t)buf[2] << 8) | ((uint64_t)(uint8_t)buf[3] << 0);
+    }
+    else if (len == 127)
+    {
+        // 64 bit big-endian length
+        hdr += 8;
+        if (buf.size() < hdr)
+        {
+            return false;
+        }
+        len = ((uint64_t)(uint8_t)buf[2] << 56) |
+            ((uint64_t)(uint8_t)buf[3] << 48) |
+            ((uint64_t)(uint8_t)buf[4] << 40) |
+            ((uint64_t)(uint8_t)buf[5] << 32) |
+            ((uint64_t)(uint8_t)buf[6] << 24) |
+            ((uint64_t)(uint8_t)buf[7] << 16) |
+            ((uint64_t)(uint8_t)buf[8] << 8) |
+            ((uint64_t)(uint8_t)buf[9] << 0);
+    }
+    // Written without hdr+len so that a huge announced length cannot overflow the sum
+    if (buf.size() < hdr || len > buf.size()-hdr)
+    {
+        return false;
+    }
+    if (mask)
+    {
+        for (uint64_t i = 0; i < len; i++)
+            buf[hdr+i] ^= buf[hdr-4+(i & 3)];
+    }
+    res += buf.substr(hdr, len);
+    buf = buf.substr(hdr+len);
+    return true;
+}
+
+// Returns the textual addresses of all interfaces that are up and running and are not
+// loopback. IPv4 addresses are always included, IPv6 ones only when include_v6 is set.
+// Throws std::runtime_error when getifaddrs() or inet_ntop() fails.
+std::vector<std::string> getifaddr_list(bool include_v6)
+{
+    std::vector<std::string> addresses;
+    ifaddrs *list, *ifa;
+    if (getifaddrs(&list) == -1)
+    {
+        throw std::runtime_error(std::string("getifaddrs: ") + strerror(errno));
+    }
+    for (ifa = list; ifa != NULL; ifa = ifa->ifa_next)
+    {
+        if (!ifa->ifa_addr)
+        {
+            continue;
+        }
+        int family = ifa->ifa_addr->sa_family;
+        if ((family == AF_INET || family == AF_INET6 && include_v6) &&
+            (ifa->ifa_flags & (IFF_UP | IFF_RUNNING | IFF_LOOPBACK)) == (IFF_UP | IFF_RUNNING))
+        {
+            void *addr_ptr;
+            if (family == AF_INET)
+                addr_ptr = &((sockaddr_in *)ifa->ifa_addr)->sin_addr;
+            else
+                addr_ptr = &((sockaddr_in6 *)ifa->ifa_addr)->sin6_addr;
+            char addr[INET6_ADDRSTRLEN];
+            if (!inet_ntop(family, addr_ptr, addr, INET6_ADDRSTRLEN))
+            {
+                // Release the list before throwing, otherwise it is leaked.
+                // errno is saved first because freeifaddrs() may change it.
+                int err = errno;
+                freeifaddrs(list);
+                throw std::runtime_error(std::string("inet_ntop: ") + strerror(err));
+            }
+            addresses.push_back(std::string(addr));
+        }
+    }
+    freeifaddrs(list);
+    return addresses;
+}
+
+// Splits "address:port" in place: <host> keeps the part before the first colon and the
+// port is returned. Returns 0 when there is no colon or the port is not within 0..65535.
+static int extract_port(std::string & host)
+{
+    int port = 0;
+    size_t pos = host.find(':');
+    if (pos != std::string::npos)
+    {
+        // Check the range on the full-width value: narrowing to int first would let
+        // out-of-range or negative input (e.g. "4294967376", "-1") pass as a valid port
+        unsigned long long parsed = strtoull(host.c_str() + pos + 1, NULL, 10);
+        if (parsed < 0x10000)
+        {
+            port = (int)parsed;
+        }
+        host = host.substr(0, pos);
+    }
+    return port;
+}
+
+// Returns a copy of <in> with every character converted by tolower()
+static std::string strtolower(const std::string & in)
+{
+    std::string s = in;
+    for (size_t i = 0; i < s.length(); i++)
+    {
+        // tolower() is undefined for negative values, so bytes >= 0x80 must go through unsigned char
+        s[i] = (char)tolower((unsigned char)s[i]);
+    }
+    return s;
+}
+
+// Returns <in> without leading and trailing spaces, tabs, CR and LF characters
+static std::string trim(const std::string & in)
+{
+    static const char *const blanks = " \n\r\t";
+    const size_t first = in.find_first_not_of(blanks);
+    if (first == std::string::npos)
+    {
+        // Empty or whitespace only
+        return "";
+    }
+    const size_t last = in.find_last_not_of(blanks);
+    return in.substr(first, last-first+1);
+}
--- a/http_client.h
+++ b/http_client.h
@@ -0,0 +1,56 @@
+#pragma once
+#include <string>
+#include <vector>
+#include <map>
+#include <functional>
+#include "json11/json11.hpp"
+
+#define WS_CONTINUATION 0
+#define WS_TEXT 1
+#define WS_BINARY 2
+#define WS_CLOSE 8
+#define WS_PING 9
+#define WS_PONG 10
+
+class timerfd_manager_t;
+
+// Options accepted by http_request()
+struct http_options_t
+{
+    // < 0: no timeout, 0: use the default timeout, > 0: explicit timeout
+    int timeout;
+    // Deliver chunked response data to the callback as it arrives instead of only at the end
+    bool want_streaming;
+};
+
+// Response (or websocket message) passed to the callbacks of the HTTP client
+struct http_response_t
+{
+    // Set on the final callback invocation, made when the connection object is destroyed
+    bool eof = false;
+    // errno-style code of a connection failure, 0 if there was none
+    int error_code = 0;
+    int status_code = 0;
+    // Status text that follows the status code in the first response line
+    std::string status_line;
+    // Header names are stored lowercased
+    std::map<std::string, std::string> headers;
+    // Type of the received websocket frame, -1 until one has been parsed
+    int ws_msg_type = -1;
+    std::string body;
+};
+
+struct http_co_t;
+
+// Handle of a websocket connection as returned by open_websocket()
+struct websocket_t
+{
+    // Connection object that implements this websocket
+    http_co_t *co;
+    // Send a message of one of the WS_* types, or queue it until the handshake is complete
+    void post_message(int type, const std::string & msg);
+    // Destroy the connection object; the handle must not be used afterwards
+    void close();
+};
+
+void parse_http_headers(std::string & res, http_response_t *parsed);
+
+std::vector<std::string> getifaddr_list(bool include_v6 = false);
+
+uint64_t stoull_full(const std::string & str, int base = 10);
+
+void http_request(timerfd_manager_t *tfd, const std::string & host, const std::string & request,
+    const http_options_t & options, std::function<void(const http_response_t *response)> callback);
+
+void http_request_json(timerfd_manager_t *tfd, const std::string & host, const std::string & request,
+    int timeout, std::function<void(std::string, json11::Json r)> callback);
+
+websocket_t* open_websocket(timerfd_manager_t *tfd, const std::string & host, const std::string & path,
+    int timeout, std::function<void(const http_response_t *msg)> callback);
--- a/lp/lp-optimizer.js
+++ b/lp/lp-optimizer.js
@@ -0,0 +1,521 @@
+// Data distribution optimizer using linear programming (lp_solve)
+
+const child_process = require('child_process');
+
+const NO_OSD = 'Z';
+
+// Runs the external `lp_solve` program on the LP model <text> and parses its output.
+// Returns { score, vars } where vars maps variable names to their values (as strings),
+// or null when there is no output or the problem is infeasible/unbounded.
+// Rejects when the process cannot be started or its stdin cannot be written.
+async function lp_solve(text)
+{
+    const cp = child_process.spawn('lp_solve');
+    let stdout = '', stderr = '';
+    cp.stdout.on('data', buf => stdout += buf.toString());
+    cp.stderr.on('data', buf => stderr += buf.toString());
+    const finished = new Promise((ok, no) =>
+    {
+        // 'close' fires only after the stdio streams have ended, whereas 'exit' may fire
+        // while output is still buffered, which would truncate the parsed result
+        cp.on('close', () => ok());
+        // Spawn failures (for example lp_solve is not installed) and broken pipes
+        // are reported to the caller instead of being left as unhandled 'error' events
+        cp.on('error', no);
+        cp.stdin.on('error', no);
+    });
+    cp.stdin.write(text);
+    cp.stdin.end();
+    await finished;
+    if (!stdout.trim())
+    {
+        return null;
+    }
+    let score = 0;
+    let vars = {};
+    for (const line of stdout.split(/\n/))
+    {
+        // Header lines: the objective value is remembered, the variables caption is skipped
+        let m = /^(^Value of objective function: ([\d\.]+)|Actual values of the variables:)\s*$/.exec(line);
+        if (m)
+        {
+            if (m[2])
+            {
+                score = m[2];
+            }
+            continue;
+        }
+        else if (/This problem is (infeasible|unbounded)/.exec(line))
+        {
+            return null;
+        }
+        // Every other line with two columns is "<variable> <value>"
+        let [ k, v ] = line.trim().split(/\s+/, 2);
+        if (v)
+        {
+            vars[k] = v;
+        }
+    }
+    return { score, vars };
+}
+
+// Computes an initial PG distribution for <osd_tree> (its values are merged into one
+// OSD -> weight map) with <pg_count> PGs by solving an LP problem: maximize the total
+// number of PGs while every OSD gets at most its weight-proportional share.
+// When there are more than <max_combinations> (default 10000) candidate OSD combinations,
+// a random subset of them is used.
+async function optimize_initial(osd_tree, pg_count, max_combinations)
+{
+    const limit = max_combinations || 10000;
+    const all_weights = Object.assign({}, ...Object.values(osd_tree));
+    let total_weight = 0;
+    for (const w of Object.values(all_weights))
+    {
+        total_weight += Number(w);
+    }
+    let all_pgs = all_combinations(osd_tree, null, true);
+    if (all_pgs.length > limit)
+    {
+        const keep_prob = limit/all_pgs.length;
+        all_pgs = all_pgs.filter(() => Math.random() < keep_prob);
+    }
+    // LP variable name of every candidate PG and, per OSD, the variables it takes part in
+    const pg_names = [];
+    const pg_per_osd = {};
+    for (const pg of all_pgs)
+    {
+        const pg_name = 'pg_'+pg.join('_');
+        pg_names.push(pg_name);
+        for (const osd of pg)
+        {
+            (pg_per_osd[osd] = pg_per_osd[osd] || []).push(pg_name);
+        }
+    }
+    const pg_size = Math.min(Object.keys(osd_tree).length, 3);
+    const lp_lines = [ `max: ${pg_names.join(' + ')};` ];
+    for (const osd in pg_per_osd)
+    {
+        if (osd === NO_OSD)
+        {
+            continue;
+        }
+        const osd_pg_count = all_weights[osd]/total_weight*pg_size*pg_count;
+        lp_lines.push(`${pg_per_osd[osd].join(' + ')} <= ${osd_pg_count};`);
+    }
+    for (const pg_name of pg_names)
+    {
+        lp_lines.push(`${pg_name} >= 0;`);
+    }
+    lp_lines.push(`sec ${pg_names.join(', ')};`);
+    const lp_result = await lp_solve(lp_lines.join('\n')+'\n');
+    if (!lp_result)
+    {
+        throw new Error('Problem is infeasible or unbounded - is it a bug?');
+    }
+    const int_pgs = make_int_pgs(lp_result.vars, pg_count);
+    const eff = pg_list_space_efficiency(int_pgs, all_weights);
+    return {
+        score: lp_result.score,
+        weights: lp_result.vars,
+        int_pgs,
+        space: eff*pg_size,
+        total_space: total_weight,
+    };
+}
+
+// Turns fractional PG weights ({ 'pg_<osd>_<osd>_...': weight }) into a list of
+// PGs (arrays of OSD names). Each PG gets a rounded share of the PGs still to be
+// handed out, proportional to its part of the weight that is still left.
+function make_int_pgs(weights, pg_count)
+{
+    let weight_left = 0;
+    for (const w of Object.values(weights))
+    {
+        weight_left += Number(w);
+    }
+    let pg_left = pg_count;
+    const result = [];
+    for (const pg_name in weights)
+    {
+        const weight = Number(weights[pg_name]);
+        const copies = Math.round(weight / weight_left * pg_left);
+        let added = 0;
+        while (added < copies)
+        {
+            // Drop the 'pg_' prefix; every copy is a separate array
+            result.push(pg_name.slice(3).split('_'));
+            added++;
+        }
+        weight_left -= weight;
+        pg_left -= copies;
+    }
+    return result;
+}
+
+// Try to minimize data movement
+async function optimize_change(prev_int_pgs, osd_tree, max_combinations)
+{
+    max_combinations = max_combinations || 10000;
+    const pg_size = Math.min(Object.keys(osd_tree).length, 3);
+    const pg_count = prev_int_pgs.length;
+    const prev_weights = {};
+    const prev_pg_per_osd = {};
+    for (const pg of prev_int_pgs)
+    {
+        const pg_name = 'pg_'+pg.join('_');
+        prev_weights[pg_name] = (prev_weights[pg_name]||0) + 1;
+        for (const osd of pg)
+        {
+            prev_pg_per_osd[osd] = prev_pg_per_osd[osd] || [];
+            prev_pg_per_osd[osd].push(pg_name);
+        }
+    }
+    // Get all combinations
+    let all_pgs = all_combinations(osd_tree, null, true);
+    if (all_pgs.length > max_combinations)
+    {
+        const intersecting = all_pgs.filter(pg => prev_weights['pg_'+pg.join('_')]);
+        if (intersecting.length > max_combinations)
+        {
+            const prob = max_combinations/intersecting.length;
+            all_pgs = intersecting.filter(pg => Math.random() < prob);
+        }
+        else
+        {
+            const prob = (max_combinations-intersecting.length)/all_pgs.length;
+            all_pgs = all_pgs.filter(pg => Math.random() < prob || prev_weights['pg_'+pg.join('_')]);
+        }
+    }
+    const pg_per_osd = {};
+    for (const pg of all_pgs)
+    {
+        const pg_name = 'pg_'+pg.join('_');
+        for (const osd of pg)
+        {
+            pg_per_osd[osd] = pg_per_osd[osd] || [];
+            pg_per_osd[osd].push(pg_name);
+        }
+    }
+    // Penalize PGs based on their similarity to old PGs
+    const intersect = {};
+    for (const pg_name in prev_weights)
+    {
+        const pg = pg_name.substr(3).split(/_/);
+        intersect[pg[0]+'::'] = intersect[':'+pg[1]+':'] = intersect['::'+pg[2]] = 2;
+        intersect[pg[0]+'::'+pg[2]] = intersect[':'+pg[1]+':'+pg[2]] = intersect[pg[0]+':'+pg[1]+':'] = 1;
+    }
+    const move_weights = {};
+    for (const pg of all_pgs)
+    {
+        move_weights['pg_'+pg.join('_')] =
+            intersect[pg[0]+'::'+pg[2]] || intersect[':'+pg[1]+':'+pg[2]] || intersect[pg[0]+':'+pg[1]+':'] ||
+            intersect[pg[0]+'::'] || intersect[':'+pg[1]+':'] || intersect['::'+pg[2]] ||
+            3;
+    }
+    // Calculate total weight - old PG weights
+    const all_pg_names = all_pgs.map(pg => 'pg_'+pg.join('_'));
+    const all_weights = Object.assign({}, ...Object.values(osd_tree));
+    const total_weight = Object.values(all_weights).reduce((a, c) => Number(a) + Number(c), 0);
+    // Generate the LP problem
+    let lp = '';
+    lp += 'max: '+all_pg_names.map(pg_name => (
+        prev_weights[pg_name] ? `${4-move_weights[pg_name]}*add_${pg_name} - 4*del_${pg_name}` : `${4-move_weights[pg_name]}*${pg_name}`
+    )).join(' + ')+';\n';
+    for (const osd in pg_per_osd)
+    {
+        if (osd !== NO_OSD)
+        {
+            const osd_sum = (pg_per_osd[osd]||[]).map(pg_name => prev_weights[pg_name] ? `add_${pg_name} - del_${pg_name}` : pg_name).join(' + ');
+            const rm_osd_pg_count = (prev_pg_per_osd[osd]||[]).filter(old_pg_name => move_weights[old_pg_name]).length;
+            let osd_pg_count = all_weights[osd]*3/total_weight*pg_count - rm_osd_pg_count;
+            lp += osd_sum + ' <= ' + osd_pg_count + ';\n';
+        }
+    }
+    let pg_vars = [];
+    for (const pg_name of all_pg_names)
+    {
+        if (prev_weights[pg_name])
+        {
+            pg_vars.push(`add_${pg_name}`, `del_${pg_name}`);
+            // Can't add or remove less than zero
+            lp += `add_${pg_name} >= 0;\n`;
+            lp += `del_${pg_name} >= 0;\n`;
+            // Can't remove more than the PG already has
+            lp += `add_${pg_name} - del_${pg_name} >= -${prev_weights[pg_name]};\n`;
+        }
+        else
+        {
+            pg_vars.push(pg_name);
+            lp += `${pg_name} >= 0;\n`;
+        }
+    }
+    lp += 'sec '+pg_vars.join(', ')+';\n';
+    // Solve it
+    const lp_result = await lp_solve(lp);
+    if (!lp_result)
+    {
+        console.log(lp);
+        throw new Error('Problem is infeasible or unbounded - is it a bug?');
+    }
+    // Generate the new distribution
+    const weights = { ...prev_weights };
+    for (const k in prev_weights)
+    {
+        if (!move_weights[k])
+        {
+            delete weights[k];
+        }
+    }
+    for (const k in lp_result.vars)
+    {
+        if (k.substr(0, 4) === 'add_')
+        {
+            weights[k.substr(4)] = (weights[k.substr(4)] || 0) + Number(lp_result.vars[k]);
+        }
+        else if (k.substr(0, 4) === 'del_')
+        {
+            weights[k.substr(4)] = (weights[k.substr(4)] || 0) - Number(lp_result.vars[k]);
+        }
+        else
+        {
+            weights[k] = Number(lp_result.vars[k]);
+        }
+    }
+    for (const k in weights)
+    {
+        if (!weights[k])
+        {
+            delete weights[k];
+        }
+    }
+    const int_pgs = make_int_pgs(weights, pg_count);
+    // Align them with most similar previous PGs
+    const new_pgs = align_pgs(prev_int_pgs, int_pgs);
+    let differs = 0, osd_differs = 0;
+    for (let i = 0; i < pg_count; i++)
+    {
+        if (new_pgs[i].join('_') != prev_int_pgs[i].join('_'))
+        {
+            differs++;
+        }
+        for (let j = 0; j < 3; j++)
+        {
+            if (new_pgs[i][j] != prev_int_pgs[i][j])
+            {
+                osd_differs++;
+            }
+        }
+    }
+    return {
+        prev_pgs: prev_int_pgs,
+        score: lp_result.score,
+        weights,
+        int_pgs: new_pgs,
+        differs,
+        osd_differs,
+        space: pg_size * pg_list_space_efficiency(new_pgs, all_weights),
+        total_space: total_weight,
+    };
+}
+
+// Prints statistics of an optimize_initial()/optimize_change() result to the console:
+// the changed PGs (only when <detailed> is set), the amount of data movement (only when
+// the result has prev_pgs) and the resulting raw space and space efficiency.
+function print_change_stats(retval, detailed)
+{
+    const { int_pgs: new_pgs, prev_pgs: prev_int_pgs } = retval;
+    // Rounds to two decimal places
+    const round2 = (value) => Math.round(value*100)/100;
+    if (prev_int_pgs)
+    {
+        for (let i = 0; detailed && i < new_pgs.length; i++)
+        {
+            const now = new_pgs[i], before = prev_int_pgs[i];
+            if (now.join('_') != before.join('_'))
+            {
+                console.log(`pg ${i}: ${before.join(' ')} -> ${now.join(' ')}`);
+            }
+        }
+        const moved_percent = Math.round(retval.osd_differs / prev_int_pgs.length / 3 * 10000)/100;
+        console.log(`Data movement: ${retval.differs} pgs, ${retval.osd_differs} pg*osds = ${moved_percent} %`);
+    }
+    const efficiency = Math.round(retval.space/(retval.total_space||1)*10000)/100;
+    console.log(`Total space (raw): ${round2(retval.space)} TB, space efficiency: ${efficiency} %`);
+}
+
+function align_pgs(prev_int_pgs, int_pgs)
+{
+    const aligned_pgs = [];
+    put_aligned_pgs(aligned_pgs, int_pgs, prev_int_pgs, (pg) => [ pg.join(':') ]);
+    put_aligned_pgs(aligned_pgs, int_pgs, prev_int_pgs, (pg) => [ pg[0]+'::'+pg[2], ':'+pg[1]+':'+pg[2], pg[0]+':'+pg[1]+':' ]);
+    put_aligned_pgs(aligned_pgs, int_pgs, prev_int_pgs, (pg) => [ pg[0]+'::', ':'+pg[1]+':', '::'+pg[2] ]);
+    const free_slots = prev_int_pgs.map((pg, i) => !aligned_pgs[i] ? i : null).filter(i => i != null);
+    for (const pg of int_pgs)
+    {
+        if (!free_slots.length)
+        {
+            throw new Error("Can't place unaligned PG");
+        }
+        aligned_pgs[free_slots.shift()] = pg;
+    }
+    return aligned_pgs;
+}
+
+// Match new PGs to previous PGs that produce at least one common key (as generated by keygen).
+// A matched PG is stored into aligned_pgs at the previous PG's index and removed from int_pgs in place.
+// keygen(pg) must return an array of string keys.
+function put_aligned_pgs(aligned_pgs, int_pgs, prev_int_pgs, keygen)
+{
+    // key => list of previous PG indexes producing that key
+    let prev_indexes = {};
+    for (let i = 0; i < prev_int_pgs.length; i++)
+    {
+        for (let k of keygen(prev_int_pgs[i]))
+        {
+            prev_indexes[k] = prev_indexes[k] || [];
+            prev_indexes[k].push(i);
+        }
+    }
+    // Iterate backwards so that splice() doesn't shift elements that are not visited yet
+    PG: for (let i = int_pgs.length-1; i >= 0; i--)
+    {
+        let pg = int_pgs[i];
+        let keys = keygen(int_pgs[i]);
+        for (let k of keys)
+        {
+            // Consume candidate slots one by one; slots that are already occupied are just dropped
+            while (prev_indexes[k] && prev_indexes[k].length)
+            {
+                let idx = prev_indexes[k].shift();
+                if (!aligned_pgs[idx])
+                {
+                    aligned_pgs[idx] = pg;
+                    int_pgs.splice(i, 1);
+                    continue PG;
+                }
+            }
+        }
+    }
+}
+
+// Convert multi-level osd_tree = { level: number|string, id?: string, size?: number, children?: osd_tree }[]
+// levels = { string: number }
+// to a two-level osd_tree suitable for all_combinations()
+function flatten_tree(osd_tree, levels, failure_domain_level, osd_level, domains = {}, i = { i: 1 })
+{
+    osd_level = levels[osd_level] || osd_level;
+    failure_domain_level = levels[failure_domain_level] || failure_domain_level;
+    for (const node of osd_tree)
+    {
+        if ((levels[node.level] || node.level) < failure_domain_level)
+        {
+            flatten_tree(node.children||[], levels, failure_domain_level, osd_level, domains, i);
+        }
+        else
+        {
+            domains['dom'+(i.i++)] = extract_osds([ node ], levels, osd_level);
+        }
+    }
+    return domains;
+}
+
+function extract_osds(osd_tree, levels, osd_level, osds = {})
+{
+    for (const node of osd_tree)
+    {
+        if ((levels[node.level] || node.level) >= osd_level)
+        {
+            osds[node.id] = node.size;
+        }
+        else
+        {
+            extract_osds(node.children||[], levels, osd_level, osds);
+        }
+    }
+    return osds;
+}
+
+// FIXME: support different pg_sizes, not just 3
+// osd_tree = { failure_domain1: { osd1: size1, ... }, ... }
+// Enumerate 3-OSD combinations where each OSD comes from a different failure domain.
+// count: maximum number of combinations to return; falsy or negative means no limit.
+// ordered: when true, domain indexes are kept strictly increasing, so each set of
+// domains is visited once; otherwise all orderings of domains are visited.
+function all_combinations(osd_tree, count, ordered)
+{
+    const hosts = Object.keys(osd_tree).sort();
+    const osds = Object.keys(osd_tree).reduce((a, c) => { a[c] = Object.keys(osd_tree[c]).sort(); return a; }, {});
+    // Pad with NO_OSD pseudo-domains when there are less than 3 failure domains
+    while (hosts.length < 3)
+    {
+        osds[NO_OSD] = [ NO_OSD ];
+        hosts.push(NO_OSD);
+    }
+    // host_idx: current domain index per position; osd_idx: current OSD index inside each domain
+    let host_idx = [ 0, 1, 2 ];
+    let osd_idx = [ 0, 0, 0 ];
+    const r = [];
+    while (!count || count < 0 || r.length < count)
+    {
+        let inc;
+        if (host_idx[2] != host_idx[1] && host_idx[2] != host_idx[0] && host_idx[1] != host_idx[0])
+        {
+            r.push(host_idx.map((hi, i) => osds[hosts[hi]][osd_idx[i]]));
+            // Advance osd_idx like an odometer, rightmost position first
+            inc = 2;
+            while (inc >= 0)
+            {
+                osd_idx[inc]++;
+                if (osd_idx[inc] >= osds[hosts[host_idx[inc]]].length)
+                {
+                    osd_idx[inc] = 0;
+                    inc--;
+                }
+                else
+                {
+                    break;
+                }
+            }
+        }
+        else
+        {
+            // Domain indexes collide - skip this domain combination entirely
+            inc = -1;
+        }
+        if (inc < 0)
+        {
+            // no osds left in current host combination, select the next one
+            osd_idx = [ 0, 0, 0 ];
+            host_idx[2]++;
+            if (host_idx[2] >= hosts.length)
+            {
+                host_idx[1]++;
+                host_idx[2] = ordered ? host_idx[1]+1 : 0;
+                if ((ordered ? host_idx[2] : host_idx[1]) >= hosts.length)
+                {
+                    host_idx[0]++;
+                    host_idx[1] = ordered ? host_idx[0]+1 : 0;
+                    host_idx[2] = ordered ? host_idx[1]+1 : 0;
+                    if ((ordered ? host_idx[2] : host_idx[0]) >= hosts.length)
+                    {
+                        // All domain combinations are exhausted
+                        break;
+                    }
+                }
+            }
+        }
+    }
+    return r;
+}
+
+function pg_weights_space_efficiency(weights, pg_count, osd_sizes)
+{
+    const per_osd = {};
+    for (const pg_name in weights)
+    {
+        for (const osd of pg_name.substr(3).split(/_/))
+        {
+            per_osd[osd] = (per_osd[osd]||0) + weights[pg_name];
+        }
+    }
+    return pg_per_osd_space_efficiency(per_osd, pg_count, osd_sizes);
+}
+
+function pg_list_space_efficiency(pgs, osd_sizes)
+{
+    const per_osd = {};
+    for (const pg of pgs)
+    {
+        for (const osd of pg)
+        {
+            per_osd[osd] = (per_osd[osd]||0) + 1;
+        }
+    }
+    return pg_per_osd_space_efficiency(per_osd, pgs.length, osd_sizes);
+}
+
+function pg_per_osd_space_efficiency(per_osd, pg_count, osd_sizes)
+{
+    // each PG gets randomly selected in 1/N cases
+    // & there are x PGs per OSD
+    // => an OSD is selected in x/N cases
+    // => total space * x/N <= OSD size
+    // => total space <= OSD size * N/x
+    let space;
+    for (let osd in per_osd)
+    {
+        if (osd in osd_sizes)
+        {
+            const space_estimate = osd_sizes[osd] * pg_count / per_osd[osd];
+            if (space == null || space > space_estimate)
+            {
+                space = space_estimate;
+            }
+        }
+    }
+    return space == null ? 0 : space;
+}
+
+// Public interface of the module: the NO_OSD constant, the optimizer entry points
+// with their reporting/statistics helpers, and lower-level helper functions.
+module.exports = {
+    NO_OSD,
+
+    optimize_initial,
+    optimize_change,
+    print_change_stats,
+    pg_weights_space_efficiency,
+    pg_list_space_efficiency,
+    pg_per_osd_space_efficiency,
+    flatten_tree,
+
+    lp_solve,
+    make_int_pgs,
+    align_pgs,
+    all_combinations,
+};
--- a/lp/mon-main.js
+++ b/lp/mon-main.js
@@ -0,0 +1,22 @@
+#!/usr/bin/node
+
+const Mon = require('./mon.js');
+
+// Command-line options collected as { name: value }
+const options = {};
+
+// Parse "--name value" pairs: every argument starting with "--" takes
+// the following argument as its value; other arguments are ignored
+for (let i = 2; i < process.argv.length; i++)
+{
+    if (process.argv[i].substr(0, 2) == '--')
+    {
+        options[process.argv[i].substr(2)] = process.argv[i+1];
+        i++;
+    }
+}
+
+// etcd_url is mandatory - print usage and exit without it
+if (!options.etcd_url)
+{
+    console.error('USAGE: '+process.argv[0]+' '+process.argv[1]+' --etcd_url "http://127.0.0.1:2379,..." --etcd_prefix "/rage" --etcd_start_timeout 5');
+    process.exit();
+}
+
+new Mon(options).start();
--- a/lp/mon.js
+++ b/lp/mon.js
@@ -0,0 +1,858 @@
+const http = require('http');
+const os = require('os');
+const WebSocket = require('ws');
+const LPOptimizer = require('./lp-optimizer.js');
+const stableStringify = require('./stable-stringify.js');
+
+class Mon
+{
+    // Template of the state tree. It is deep-copied through JSON to initialize this.state.
+    // NOTE(review): presumably mirrors the etcd key layout under etcd_prefix - confirm against parse_kv()
+    static etcd_tree = {
+        config: {
+            global: null,
+            /* placement_tree = {
+                levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... },
+                nodes: { host1: { level: 'host', parent: 'rack1' }, ... },
+                failure_domain: 'host',
+            } */
+            placement_tree: null,
+            osd: {},
+            pgs: {},
+        },
+        osd: {
+            state: {},
+            stats: {},
+        },
+        mon: {
+            master: null,
+        },
+        pg: {
+            change_stamp: null,
+            state: {},
+            stats: {},
+            history: {},
+        },
+    }
+
+    constructor(config)
+    {
+        // FIXME: Maybe prefer local etcd
+        this.etcd_urls = [];
+        for (let url of config.etcd_url.split(/,/))
+        {
+            let scheme = 'http';
+            url = url.trim().replace(/^(https?):\/\//, (m, m1) => { scheme = m1; return ''; });
+            if (!/\/[^\/]/.exec(url))
+                url += '/v3';
+            this.etcd_urls.push(scheme+'://'+url);
+        }
+        this.etcd_prefix = config.etcd_prefix || '/rage';
+        this.etcd_prefix = this.etcd_prefix.replace(/\/\/+/g, '/').replace(/^\/?(.*[^\/])\/?$/, '/$1');
+        this.etcd_start_timeout = (config.etcd_start_timeout || 5) * 1000;
+        this.state = JSON.parse(JSON.stringify(Mon.etcd_tree));
+    }
+
+    // Startup sequence. Steps run strictly one after another:
+    // load config, get a lease, become master, load cluster state, start the watcher, recheck PGs
+    async start()
+    {
+        await this.load_config();
+        await this.get_lease();
+        await this.become_master();
+        await this.load_cluster_state();
+        await this.start_watcher();
+        await this.recheck_pgs();
+    }
+
+    // Read <etcd_prefix>/config/global from etcd, pass it to parse_kv() and validate the configuration
+    async load_config()
+    {
+        // The last etcd_call argument is -1 here
+        // NOTE(review): presumably this means "retry forever" - confirm against etcd_call()
+        const res = await this.etcd_call('/txn', { success: [
+            { requestRange: { key: b64(this.etcd_prefix+'/config/global') } }
+        ] }, this.etcd_start_timeout, -1);
+        // NOTE(review): this throws if the response contains no kvs (e.g. the key doesn't exist) - confirm
+        this.parse_kv(res.responses[0].response_range.kvs[0]);
+        this.check_config();
+    }
+
+    check_config()
+    {
+        this.config.etcd_mon_timeout = Number(this.config.etcd_mon_timeout) || 0;
+        if (this.config.etcd_mon_timeout <= 0)
+        {
+            this.config.etcd_mon_timeout = 1000;
+        }
+        this.config.etcd_mon_retries = Number(this.config.etcd_mon_retries) || 5;
+        if (this.config.etcd_mon_retries < 0)
+        {
+            this.config.etcd_mon_retries = 0;
+        }
+        this.config.mon_change_timeout = Number(this.config.mon_change_timeout) || 1000;
+        if (this.config.mon_change_timeout < 100)
+        {
+            this.config.mon_change_timeout = 100;
+        }
+        this.config.mon_stats_timeout = Number(this.config.mon_stats_timeout) || 1000;
+        if (this.config.mon_stats_timeout < 100)
+        {
+            this.config.mon_stats_timeout = 100;
+        }
+        // After this number of seconds, a dead OSD will be removed from PG distribution
+        this.config.osd_out_time = Number(this.config.osd_out_time) || 0;
+        if (!this.config.osd_out_time)
+        {
+            this.config.osd_out_time = 30*60; // 30 minutes by default
+        }
+        this.config.max_osd_combinations = Number(this.config.max_osd_combinations) || 10000;
+        if (this.config.max_osd_combinations < 100)
+        {
+            this.config.max_osd_combinations = 100;
+        }
+    }
+
+    async start_watcher(retries)
+    {
+        let retry = 0;
+        if (retries >= 0 && retries < 1)
+        {
+            retries = 1;
+        }
+        while (retries < 0 || retry < retries)
+        {
+            const base = 'ws'+this.etcd_urls[Math.floor(Math.random()*this.etcd_urls.length)].substr(4);
+            const ok = await new Promise((ok, no) =>
+            {
+                const timer_id = setTimeout(() =>
+                {
+                    this.ws.close();
+                    ok(false);
+                }, timeout);
+                this.ws = new WebSocket(base+'/watch');
+                this.ws.on('open', () =>
+                {
+                    if (timer_id)
+                        clearTimeout(timer_id);
+                    ok(true);
+                });
+            });
+            if (!ok)
+            {
+                this.ws = null;
+            }
+            retry++;
+        }
+        if (!this.ws)
+        {
+            this.die('Failed to open etcd watch websocket');
+        }
+        this.ws.send(JSON.stringify({
+            create_request: {
+                key: b64(this.etcd_prefix+'/'),
+                range_end: b64(this.etcd_prefix+'0'),
+                start_revision: ''+this.etcd_watch_revision,
+                watch_id: 1,
+            },
+        }));
+        this.ws.on('message', (msg) =>
+        {
+            let data;
+            try
+            {
+                data = JSON.parse(msg);
+            }
+            catch (e)
+            {
+            }
+            if (!data || !data.result || !data.result.events)
+            {
+                console.error('Garbage received from watch websocket: '+msg);
+            }
+            else
+            {
+                let stats_changed = false, changed = false;
+                console.log('Revision '+data.result.header.revision+' events: ');
+                for (const e of data.result.events)
+                {
+                    this.parse_kv(e.kv);
+                    const key = e.kv.key.substr(this.etcd_prefix.length);
+                    if (key.substr(0, 11) == '/osd/stats/' || key.substr(0, 10) == '/pg/stats/')
+                    {
+                        stats_changed = true;
+                    }
+                    else if (key != '/stats')
+                    {
+                        changed = true;
+                    }
+                    console.log(e);
+                }
+                if (stats_changed)
+                {
+                    this.schedule_update_stats();
+                }
+                if (changed)
+                {
+                    this.schedule_recheck();
+                }
+            }
+        });
+    }
+
+    async get_lease()
+    {
+        const max_ttl = this.config.etcd_mon_ttl + this.config.etcd_mon_timeout/1000*this.config.etcd_mon_retries;
+        const res = await this.etcd_call('/lease/grant', { TTL: max_ttl }, this.config.etcd_mon_timeout, this.config.etcd_mon_retries);
+        this.etcd_lease_id = res.ID;
+        setInterval(async () =>
+        {
+            const res = await this.etcd_call('/lease/keepalive', { ID: this.etcd_lease_id }, this.config.etcd_mon_timeout, this.config.etcd_mon_retries);
+            if (!res.result.TTL)
+            {
+                this.die('Lease expired');
+            }
+        }, config.etcd_mon_timeout);
+    }
+
+    async become_master()
+    {
+        const state = { ip: this.local_ips() };
+        while (1)
+        {
+            const res = await this.etcd_call('/txn', {
+                compare: [ { target: 'CREATE', create_revision: 0, key: b64(this.etcd_prefix+'/mon/master') } ],
+                success: [ { key: b64(this.etcd_prefix+'/mon/master'), value: b64(JSON.stringify(state)), lease: ''+this.etcd_lease_id } ],
+            }, this.etcd_start_timeout, 0);
+            if (!res.succeeded)
+            {
+                await new Promise(ok => setTimeout(ok, this.etcd_start_timeout));
+            }
+        }
+    }
+
+    async load_cluster_state()
+    {
+        const res = await this.etcd_call('/txn', { success: [
+            { requestRange: { key: b64(this.etcd_prefix+'/'), range_end: b64(this.etcd_prefix+'0') } },
+        ] }, this.etcd_start_timeout, -1);
+        this.etcd_watch_revision = BigInt(res.header.revision)+BigInt(1);
+        const data = JSON.parse(JSON.stringify(Mon.etcd_tree));
+        for (const response of res.responses)
+        {
+            for (const kv of response.response_range.kvs)
+            {
+                this.parse_kv(kv);
+            }
+        }
+        this.state = data;
+    }
+
+    all_osds()
+    {
+        return Object.keys(this.state.osd.stats);
+    }
+
+    get_osd_tree()
+    {
+        this.state.config.placement_tree = this.state.config.placement_tree||{};
+        const levels = this.state.config.placement_tree.levels||{};
+        levels.host = levels.host || 100;
+        levels.osd = levels.osd || 101;
+        const tree = { '': { children: [] } };
+        for (const node_id in this.state.config.placement_tree.nodes||{})
+        {
+            const node_cfg = this.state.config.placement_tree.nodes[node_id];
+            if (!node_id || /^\d/.exec(node_id) ||
+                !node_cfg.level || !levels[node_cfg.level])
+            {
+                // All nodes must have non-empty non-numeric IDs and valid levels
+                continue;
+            }
+            tree[node_id] = { id: node_id, level: node_cfg.level, parent: node_cfg.parent, children: [] };
+        }
+        // This requires monitor system time to be in sync with OSD system times (at least to some extent)
+        const down_time = Date.now()/1000 - this.config.osd_out_time;
+        for (const osd_num of this.all_osds().sort((a, b) => a - b))
+        {
+            const stat = this.state.osd.stats[osd_num];
+            if (stat.size && (this.state.osd.state[osd_num] || Number(stat.time) >= down_time))
+            {
+                // Numeric IDs are reserved for OSDs
+                const reweight = this.state.config.osd[osd_num] && Number(this.state.config.osd[osd_num].reweight) || 1;
+                tree[osd_num] = tree[osd_num] || { id: osd_num, parent: stat.host };
+                tree[osd_num].level = 'osd';
+                tree[osd_num].size = reweight * stat.size / 1024 / 1024 / 1024 / 1024; // terabytes
+                delete tree[osd_num].children;
+            }
+        }
+        for (const node_id in tree)
+        {
+            if (node_id === '')
+            {
+                continue;
+            }
+            const node_cfg = tree[node_id];
+            const node_level = levels[node_cfg.level] || node_cfg.level;
+            let parent_level = node_cfg.parent && tree[node_cfg.parent] && tree[node_cfg.parent].children
+                && tree[node_cfg.parent].level;
+            parent_level = parent_level ? (levels[parent_level] || parent_level) : null;
+            // Parent's level must be less than child's; OSDs must be leaves
+            const parent = parent_level && parent_level < node_level ? tree[node_cfg.parent] : '';
+            tree[parent].children.push(tree[node_id]);
+            delete node_cfg.parent;
+        }
+        return LPOptimizer.flatten_tree(tree[''].children, levels, this.state.config.failure_domain, 'osd');
+    }
+
+    // Mark all PGs as paused in <etcd_prefix>/config/pgs (unless all of them are already paused).
+    // Returns false if the transaction failed or if some PG still has a state
+    // other than empty/'offline'; returns true when no PGs are online.
+    async stop_all_pgs()
+    {
+        let has_online = false, paused = true;
+        for (const pg in this.state.config.pgs.items||{})
+        {
+            const cur_state = ((this.state.pg.state[pg]||{}).state||[]).join(',');
+            if (cur_state != '' && cur_state != 'offline')
+            {
+                has_online = true;
+            }
+            if (!this.state.config.pgs.items[pg].pause)
+            {
+                paused = false;
+            }
+        }
+        if (!paused)
+        {
+            console.log('Stopping all PGs before changing PG count');
+            // Work on a deep copy; the local state is only replaced after the transaction succeeds
+            const new_cfg = JSON.parse(JSON.stringify(this.state.config.pgs));
+            for (const pg in new_cfg.items)
+            {
+                new_cfg.items[pg].pause = true;
+            }
+            // Check that no OSDs change their state before we pause PGs
+            // Doing this we make sure that OSDs don't wake up in the middle of our "transaction"
+            // and can't see the old PG configuration
+            const checks = [];
+            for (const osd_num of this.all_osds())
+            {
+                const key = b64(this.etcd_prefix+'/osd/state/'+osd_num);
+                checks.push({ key, target: 'MOD', result: 'LESS', mod_revision: ''+this.etcd_watch_revision });
+            }
+            // The write only happens if we still hold the master key with our lease
+            // and neither config/pgs nor any OSD state was modified at or after etcd_watch_revision
+            const res = await this.etcd_call('/txn', {
+                compare: [
+                    { key: b64(this.etcd_prefix+'/mon/master'), target: 'LEASE', lease: ''+this.etcd_lease_id },
+                    { key: b64(this.etcd_prefix+'/config/pgs'), target: 'MOD', mod_revision: ''+this.etcd_watch_revision, result: 'LESS' },
+                    ...checks,
+                ],
+                success: [
+                    { requestPut: { key: b64(this.etcd_prefix+'/config/pgs'), value: b64(JSON.stringify(new_cfg)) } },
+                ],
+            }, this.config.etcd_mon_timeout, 0);
+            if (!res.succeeded)
+            {
+                return false;
+            }
+            this.state.config.pgs = new_cfg;
+        }
+        return !has_online;
+    }
+
+    scale_pg_count(prev_pgs, pg_history, new_pg_count)
+    {
+        const old_pg_count = prev_pgs.length;
+        // Add all possibly intersecting PGs into the history of new PGs
+        if (!(new_pg_count % old_pg_count))
+        {
+            // New PG count is a multiple of the old PG count
+            const mul = (new_pg_count / old_pg_count);
+            for (let i = 0; i < new_pg_count; i++)
+            {
+                const old_i = Math.floor(new_pg_count / mul);
+                pg_history[i] = JSON.parse(JSON.stringify(this.state.pg.history[1+old_i]));
+            }
+        }
+        else if (!(old_pg_count % new_pg_count))
+        {
+            // Old PG count is a multiple of the new PG count
+            const mul = (old_pg_count / new_pg_count);
+            for (let i = 0; i < new_pg_count; i++)
+            {
+                pg_history[i] = {
+                    osd_sets: [],
+                    all_peers: [],
+                };
+                for (let j = 0; j < mul; j++)
+                {
+                    pg_history[i].osd_sets.push(prev_pgs[i*mul]);
+                    const hist = this.state.pg.history[1+i*mul+j];
+                    if (hist && hist.osd_sets && hist.osd_sets.length)
+                    {
+                        Array.prototype.push.apply(pg_history[i].osd_sets, hist.osd_sets);
+                    }
+                    if (hist && hist.all_peers && hist.all_peers.length)
+                    {
+                        Array.prototype.push.apply(pg_history[i].all_peers, hist.all_peers);
+                    }
+                }
+            }
+        }
+        else
+        {
+            // Any PG may intersect with any PG after non-multiple PG count change
+            // So, merge ALL PGs history
+            let all_sets = {};
+            let all_peers = {};
+            for (const pg of prev_pgs)
+            {
+                all_sets[pg.join(' ')] = pg;
+            }
+            for (const pg in this.state.pg.history)
+            {
+                const hist = this.state.pg.history[pg];
+                if (hist && hist.osd_sets)
+                {
+                    for (const pg of hist.osd_sets)
+                    {
+                        all_sets[pg.join(' ')] = pg;
+                    }
+                }
+                if (hist && hist.all_peers)
+                {
+                    for (const osd_num of hist.all_peers)
+                    {
+                        all_peers[osd_num] = Number(osd_num);
+                    }
+                }
+            }
+            all_sets = Object.values(all_sets);
+            all_peers = Object.values(all_peers);
+            for (let i = 0; i < new_pg_count; i++)
+            {
+                pg_history[i] = { osd_sets: all_sets, all_peers };
+            }
+        }
+        // Mark history keys for removed PGs as removed
+        for (let i = new_pg_count; i < old_pg_count; i++)
+        {
+            pg_history[i] = null;
+        }
+        if (old_pg_count < new_pg_count)
+        {
+            for (let i = new_pg_count-1; i >= 0; i--)
+            {
+                prev_pgs[i] = prev_pgs[Math.floor(i/new_pg_count*old_pg_count)];
+            }
+        }
+        else if (old_pg_count > new_pg_count)
+        {
+            for (let i = 0; i < new_pg_count; i++)
+            {
+                prev_pgs[i] = prev_pgs[Math.round(i/new_pg_count*old_pg_count)];
+            }
+            prev_pgs.splice(new_pg_count, old_pg_count-new_pg_count);
+        }
+    }
+
+    async save_new_pgs(prev_pgs, new_pgs, pg_history, tree_hash)
+    {
+        const txn = [], checks = [];
+        const pg_items = {};
+        new_pgs.map((osd_set, i) =>
+        {
+            osd_set = osd_set.map(osd_num => osd_num === LPOptimizer.NO_OSD ? 0 : osd_num);
+            const alive_set = osd_set.filter(osd_num => osd_num);
+            pg_items[i+1] = {
+                osd_set,
+                primary: alive_set.length ? alive_set[Math.floor(Math.random()*alive_set.length)] : 0,
+            };
+            if (prev_pgs[i] && prev_pgs[i].join(' ') != osd_set.join(' '))
+            {
+                pg_history[i] = pg_history[i] || {};
+                pg_history[i].osd_sets = pg_history[i].osd_sets || [];
+                pg_history[i].osd_sets.push(prev_pgs[i]);
+            }
+        });
+        for (let i = 0; i < new_pgs.length || i < prev_pgs.length; i++)
+        {
+            checks.push({
+                key: b64(this.etcd_prefix+'/pg/history/'+(i+1)),
+                target: 'MOD',
+                mod_revision: ''+this.etcd_watch_revision,
+                result: 'LESS',
+            });
+            if (pg_history[i])
+            {
+                txn.push({
+                    requestPut: {
+                        key: b64(this.etcd_prefix+'/pg/history/'+(i+1)),
+                        value: b64(JSON.stringify(pg_history[i])),
+                    },
+                });
+            }
+            else
+            {
+                txn.push({
+                    requestDeleteRange: {
+                        key: b64(this.etcd_prefix+'/pg/history/'+(i+1)),
+                    },
+                });
+            }
+        }
+        this.state.config.pgs = {
+            hash: tree_hash,
+            items: pg_items,
+        };
+        const res = await this.etcd_call('/txn', {
+            compare: [
+                { key: b64(this.etcd_prefix+'/mon/master'), target: 'LEASE', lease: ''+this.etcd_lease_id },
+                { key: b64(this.etcd_prefix+'/config/pgs'), target: 'MOD', mod_revision: ''+this.etcd_watch_revision, result: 'LESS' },
+                ...checks,
+            ],
+            success: [
+                { requestPut: { key: b64(this.etcd_prefix+'/config/pgs'), value: b64(JSON.stringify(this.state.config.pgs)) } },
+                ...txn,
+            ],
+        }, this.config.etcd_mon_timeout, 0);
+        return res.succeeded;
+    }
+
+    async recheck_pgs()
+    {
+        // Take configuration and state, check it against the stored configuration hash
+        // Recalculate PGs and save them to etcd if the configuration is changed
+        const tree_cfg = {
+            osd_tree: this.get_osd_tree(),
+            pg_count: this.config.pg_count || Object.keys(this.state.config.pgs.items||{}).length || 128,
+            max_osd_combinations: this.config.max_osd_combinations,
+        };
+        const tree_hash = sha1hex(stableStringify(tree_cfg));
+        if (this.state.config.pgs.hash != tree_hash)
+        {
+            // Something has changed
+            const prev_pgs = [];
+            for (const pg in this.state.config.pgs.items||{})
+            {
+                prev_pgs[pg-1] = this.state.config.pgs.items[pg].osd_set;
+            }
+            const pg_history = [];
+            const old_pg_count = prev_pgs.length;
+            let optimize_result;
+            if (old_pg_count > 0)
+            {
+                if (old_pg_count != tree_cfg.pg_count)
+                {
+                    // PG count changed. Need to bring all PGs down.
+                    if (!await this.stop_all_pgs())
+                    {
+                        this.schedule_recheck();
+                        return;
+                    }
+                    this.scale_pg_count(prev_pgs, pg_history, new_pg_count);
+                }
+                optimize_result = await LPOptimizer.optimize_change(prev_pgs, tree_cfg.osd_tree, tree_cfg.max_osd_combinations);
+            }
+            else
+            {
+                optimize_result = await LPOptimizer.optimize_initial(tree_cfg.osd_tree, tree_cfg.pg_count, tree_cfg.max_osd_combinations);
+            }
+            if (!await this.save_new_pgs(prev_pgs, optimize_result.int_pgs, pg_history, tree_hash))
+            {
+                console.log('Someone changed PG configuration while we also tried to change it. Retrying in '+this.config.mon_change_timeout+' ms');
+                this.schedule_recheck();
+                return;
+            }
+            console.log('PG configuration successfully changed');
+            if (old_pg_count != optimize_result.int_pgs.length)
+            {
+                console.log(`PG count changed from: ${old_pg_count} to ${optimize_result.int_pgs.length}`);
+            }
+            LPOptimizer.print_change_stats(optimize_result);
+        }
+    }
+
+    schedule_recheck()
+    {
+        if (this.recheck_timer)
+        {
+            clearTimeout(this.recheck_timer);
+            this.recheck_timer = null;
+        }
+        this.recheck_timer = setTimeout(() =>
+        {
+            this.recheck_timer = null;
+            this.recheck_pgs().catch(console.error);
+        }, this.config.mon_change_timeout || 1000);
+    }
+
+    sum_stats()
+    {
+        let overflow = false;
+        this.prev_stats = this.prev_stats || { op_stats: {}, subop_stats: {}, recovery_stats: {} };
+        const op_stats = {}, subop_stats = {}, recovery_stats = {};
+        for (const osd in this.state.osd.stats)
+        {
+            const st = this.state.osd.stats[osd];
+            for (const op in st.op_stats||{})
+            {
+                op_stats[op] = op_stats[op] || { count: 0n, usec: 0n, bytes: 0n };
+                op_stats[op].count += BigInt(st.op_stats.count||0);
+                op_stats[op].usec += BigInt(st.op_stats.usec||0);
+                op_stats[op].bytes += BigInt(st.op_stats.bytes||0);
+            }
+            for (const op in st.subop_stats||{})
+            {
+                subop_stats[op] = subop_stats[op] || { count: 0n, usec: 0n };
+                subop_stats[op].count += BigInt(st.subop_stats.count||0);
+                subop_stats[op].usec += BigInt(st.subop_stats.usec||0);
+            }
+            for (const op in st.recovery_stats||{})
+            {
+                recovery_stats[op] = recovery_stats[op] || { count: 0n, bytes: 0n };
+                recovery_stats[op].count += BigInt(st.recovery_stats.count||0);
+                recovery_stats[op].bytes += BigInt(st.recovery_stats.bytes||0);
+            }
+        }
+        for (const op in op_stats)
+        {
+            if (op_stats[op].count >= 0x10000000000000000n)
+            {
+                if (!this.prev_stats.op_stats[op])
+                {
+                    overflow = true;
+                }
+                else
+                {
+                    op_stats[op].count -= this.prev_stats.op_stats[op].count;
+                    op_stats[op].usec -= this.prev_stats.op_stats[op].usec;
+                    op_stats[op].bytes -= this.prev_stats.op_stats[op].bytes;
+                }
+            }
+        }
+        for (const op in subop_stats)
+        {
+            if (subop_stats[op].count >= 0x10000000000000000n)
+            {
+                if (!this.prev_stats.subop_stats[op])
+                {
+                    overflow = true;
+                }
+                else
+                {
+                    subop_stats[op].count -= this.prev_stats.subop_stats[op].count;
+                    subop_stats[op].usec -= this.prev_stats.subop_stats[op].usec;
+                }
+            }
+        }
+        for (const op in recovery_stats)
+        {
+            if (recovery_stats[op].count >= 0x10000000000000000n)
+            {
+                if (!this.prev_stats.recovery_stats[op])
+                {
+                    overflow = true;
+                }
+                else
+                {
+                    recovery_stats[op].count -= this.prev_stats.recovery_stats[op].count;
+                    recovery_stats[op].bytes -= this.prev_stats.recovery_stats[op].bytes;
+                }
+            }
+        }
+        const object_counts = { object: 0n, clean: 0n, misplaced: 0n, degraded: 0n, incomplete: 0n };
+        for (const pg_num in this.state.pg.stats)
+        {
+            const st = this.state.pg.stats[pg_num];
+            for (const k in object_counts)
+            {
+                if (st[k+'_count'])
+                {
+                    object_counts[k] += BigInt(st[k+'_count']);
+                }
+            }
+        }
+        return (this.prev_stats = { overflow, op_stats, subop_stats, recovery_stats, object_counts });
+    }
+
+    async update_total_stats() // Aggregate statistics with sum_stats() and store them in etcd under <etcd_prefix>/stats
+    {
+        const stats = this.sum_stats();
+        if (!stats.overflow) // nothing is written when sum_stats() flagged an overflow
+        {
+            // Convert to strings, serialize and save
+            const ser = {};
+            for (const st of [ 'op_stats', 'subop_stats', 'recovery_stats' ])
+            {
+                ser[st] = {};
+                for (const op in stats[st])
+                {
+                    ser[st][op] = {};
+                    for (const k in stats[st][op])
+                    {
+                        ser[st][op][k] = ''+stats[st][op][k]; // counters are BigInt, which JSON.stringify() cannot serialize
+                    }
+                }
+            }
+            ser.object_counts = {};
+            for (const k in stats.object_counts)
+            {
+                ser.object_counts[k] = ''+stats.object_counts[k];
+            }
+            await this.etcd_call('/txn', {
+                success: [ { requestPut: { key: b64(this.etcd_prefix+'/stats'), value: b64(JSON.stringify(ser)) } } ],
+            }, this.config.etcd_mon_timeout, 0); // retries=0 is raised to a single attempt inside etcd_call()
+        }
+    }
+
+    schedule_update_stats() // (Re)arm a one-shot timer that calls update_total_stats()
+    {
+        if (this.stats_timer) // a call while a timer is pending cancels it, so the delay starts over
+        {
+            clearTimeout(this.stats_timer);
+            this.stats_timer = null;
+        }
+        this.stats_timer = setTimeout(() =>
+        {
+            this.stats_timer = null;
+            this.update_total_stats().catch(console.error); // a rejected update is only logged
+        }, this.config.mon_stats_timeout || 1000); // delay in ms; 1000 when mon_stats_timeout is unset or 0
+    }
+
+    parse_kv(kv) // Decode one etcd key/value pair and store the parsed value at the matching path in this.state
+    {
+        if (!kv || !kv.key)
+        {
+            return;
+        }
+        kv.key = de64(kv.key); // key and value are passed through de64(); kv itself is modified in place
+        kv.value = kv.value ? JSON.parse(de64(kv.value)) : null; // a missing/empty value is stored as null
+        const key = kv.key.substr(this.etcd_prefix.length).replace(/^\/+/, '').split('/');
+        let cur = this.state, orig = Mon.etcd_tree; // both are advanced while descending, so they must not be const
+        for (let i = 0; i < key.length-1; i++)
+        {
+            if (!orig[key[i]]) // every segment except the last must exist in Mon.etcd_tree
+            {
+                console.log('Bad key in etcd: '+kv.key+' = '+kv.value);
+                return;
+            }
+            orig = orig[key[i]];
+            cur = (cur[key[i]] = cur[key[i]] || {}); // create intermediate state objects on demand
+        }
+        if (orig[key.length-1]) // NOTE(review): indexes by a number, not by the last key segment - confirm the intended check
+        {
+            console.log('Bad key in etcd: '+kv.key+' = '+kv.value);
+            return;
+        }
+        cur[key[key.length-1]] = kv.value;
+        if (key.join('/') === 'config/global') // the global config key also replaces this.config and is re-validated
+        {
+            this.state.config.global = this.state.config.global || {};
+            this.config = this.state.config.global;
+            this.check_config();
+        }
+    }
+
+    async etcd_call(path, body, timeout, retries) // POST body to path on a randomly chosen etcd URL and return the parsed JSON reply
+    {
+        let retry = 0;
+        if (retries >= 0 && retries < 1) // 0 (or a fraction) still means one attempt
+        {
+            retries = 1;
+        }
+        while (retries < 0 || retry < retries) // negative retries = keep trying without a limit
+        {
+            const base = this.etcd_urls[Math.floor(Math.random()*this.etcd_urls.length)];
+            const res = await POST(base+path, body, timeout);
+            if (res.json)
+            {
+                if (res.json.error)
+                {
+                    console.log('etcd returned error: '+res.json.error);
+                    break; // an error reported by etcd is not retried; control falls through to die()
+                }
+                return res.json;
+            }
+            retry++; // no JSON in the result: try again
+        }
+        this.die(); // reached when attempts are exhausted or etcd answered with an error
+    }
+
+    die(err) // Log err (or a default message) and terminate the process with exit code 1
+    {
+        // In fact we can just try to rejoin
+        console.error(err || 'Cluster connection failed'); // Node's console has no fatal(); calling it threw before exit was reached
+        process.exit(1);
+    }
+
+    local_ips() // Return the addresses of all non-internal IPv4 interfaces reported by os.networkInterfaces()
+    {
+        const ips = [];
+        const ifaces = os.networkInterfaces();
+        for (const ifname in ifaces)
+        {
+            for (const iface of ifaces[ifname]) // one interface name can carry several address entries
+            {
+                if (iface.family == 'IPv4' && !iface.internal) // NOTE(review): some Node.js releases report family as the number 4 - confirm for the target version
+                {
+                    ips.push(iface.address);
+                }
+            }
+        }
+        return ips;
+    }
+}
+
+function POST(url, body, timeout) // POST body as JSON; never rejects: resolves to { json, response } on success, { error, ... } otherwise
+{
+    return new Promise((ok, no) =>
+    {
+        const body_text = Buffer.from(JSON.stringify(body));
+        let timer_id = timeout > 0 ? setTimeout(() => // timeout is in ms; a value <= 0 disables the timer
+        {
+            if (req)
+                req.abort();
+            req = null; // makes a late response callback bail out
+            ok({ error: 'timeout' });
+        }, timeout) : null;
+        let req = http.request(url, { method: 'POST', headers: {
+            'Content-Type': 'application/json',
+            'Content-Length': body_text.length, // byte length of the payload, not the Buffer object itself
+        } }, (res) =>
+        {
+            if (!req) // the timeout already fired and resolved the promise
+            {
+                return;
+            }
+            clearTimeout(timer_id);
+            if (res.statusCode != 200)
+            {
+                ok({ error: res.statusCode, response: res });
+                return;
+            }
+            let res_body = '';
+            res.setEncoding('utf8');
+            res.on('data', chunk => { res_body += chunk });
+            res.on('end', () =>
+            {
+                try
+                {
+                    res_body = JSON.parse(res_body);
+                    ok({ response: res, json: res_body });
+                }
+                catch (e)
+                {
+                    ok({ error: e, response: res, body: res_body }); // unparsable reply: raw text is returned in body
+                }
+            });
+        });
+        req.on('error', (e) => { clearTimeout(timer_id); ok({ error: e }); }); // without a listener a socket error is an uncaught exception
+        req.end(body_text); // sends the payload and finishes the request (same as write() followed by end())
+    });
+}
+
+function b64(str) // Encode str as base64 (Buffer.from() with its default string encoding)
+{
+    return Buffer.from(str).toString('base64');
+}
+
+function de64(str) // Decode a base64 string back to text (inverse of b64())
+{
+    return Buffer.from(str, 'base64').toString();
+}
+
+function sha1hex(str) // Return the SHA-1 digest of str as a hexadecimal string
+{
+    const hash = crypto.createHash('sha1');
+    hash.update(str);
+    return hash.digest('hex');
+}
--- a/lp/package.json
+++ b/lp/package.json
@@ -0,0 +1,14 @@
+{
+  "name": "rage-mon",
+  "version": "1.0.0",
+  "description": "RAGE storage monitor service",
+  "main": "mon.js",
+  "scripts": {
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "author": "Vitaliy Filippov",
+  "license": "UNLICENSED",
+  "dependencies": {
+    "ws": "^7.2.5"
+  }
+}
--- a/lp/test-optimize-undersized.js
+++ b/lp/test-optimize-undersized.js
@@ -0,0 +1,71 @@
+const LPOptimizer = require('./lp-optimizer.js');
+
+const crush_tree = [ // three level-1 nodes, each with two level-2 nodes holding two level-3 leaves (ids 1..12, all size 3)
+    { level: 1, children: [
+        { level: 2, children: [
+            { level: 3, id: 1, size: 3 },
+            { level: 3, id: 2, size: 3 },
+        ] },
+        { level: 2, children: [
+            { level: 3, id: 3, size: 3 },
+            { level: 3, id: 4, size: 3 },
+        ] },
+    ] },
+    { level: 1, children: [
+        { level: 2, children: [
+            { level: 3, id: 5, size: 3 },
+            { level: 3, id: 6, size: 3 },
+        ] },
+        { level: 2, children: [
+            { level: 3, id: 7, size: 3 },
+            { level: 3, id: 8, size: 3 },
+        ] },
+    ] },
+    { level: 1, children: [
+        { level: 2, children: [
+            { level: 3, id: 9, size: 3 },
+            { level: 3, id: 10, size: 3 },
+        ] },
+        { level: 2, children: [
+            { level: 3, id: 11, size: 3 },
+            { level: 3, id: 12, size: 3 },
+        ] },
+    ] },
+];
+
+const osd_tree = LPOptimizer.flatten_tree(crush_tree, {}, 1, 3); // NOTE(review): 1 and 3 look like level numbers from crush_tree - confirm against flatten_tree()
+console.log(osd_tree); // dump the flattened tree for inspection
+
+async function run() // Grow the tree from empty to three groups, then shrink it back, printing change stats after every step
+{
+    const cur_tree = {};
+    console.log('Empty tree:');
+    let res = await LPOptimizer.optimize_initial(cur_tree, 256); // NOTE(review): 256 is presumably the PG count - confirm against lp-optimizer.js
+    LPOptimizer.print_change_stats(res, false);
+    console.log('\nAdding 1st failure domain:');
+    cur_tree['dom1'] = osd_tree['dom1']; // NOTE(review): assumes flatten_tree() names its groups dom1..dom3 - confirm
+    res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree); // every step starts from the previous step's int_pgs
+    LPOptimizer.print_change_stats(res, false);
+    console.log('\nAdding 2nd failure domain:');
+    cur_tree['dom2'] = osd_tree['dom2'];
+    res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree);
+    LPOptimizer.print_change_stats(res, false);
+    console.log('\nAdding 3rd failure domain:');
+    cur_tree['dom3'] = osd_tree['dom3'];
+    res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree);
+    LPOptimizer.print_change_stats(res, false);
+    console.log('\nRemoving 3rd failure domain:');
+    delete cur_tree['dom3'];
+    res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree);
+    LPOptimizer.print_change_stats(res, false);
+    console.log('\nRemoving 2nd failure domain:');
+    delete cur_tree['dom2'];
+    res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree);
+    LPOptimizer.print_change_stats(res, false);
+    console.log('\nRemoving 1st failure domain:');
+    delete cur_tree['dom1']; // the tree is empty again for the final optimize_change() call
+    res = await LPOptimizer.optimize_change(res.int_pgs, cur_tree);
+    LPOptimizer.print_change_stats(res, false);
+}
+
+run().catch(console.error); // a rejected run is only logged
--- a/lp/test-optimize.js
+++ b/lp/test-optimize.js
@@ -0,0 +1,94 @@
+const LPOptimizer = require('./lp-optimizer.js');
+
+const osd_tree = { // group id -> { OSD number: numeric size/weight }; run() adds and removes entry 8 of group 500
+    100: {
+        7: 3.63869,
+    },
+    300: {
+        10: 3.46089,
+        11: 3.46089,
+        12: 3.46089,
+    },
+    400: {
+        1: 3.49309,
+        2: 3.49309,
+        3: 3.49309,
+    },
+    500: {
+        4: 3.58498,
+//        8: 3.58589,
+        9: 3.63869,
+    },
+    600: {
+        5: 3.63869,
+        6: 3.63869,
+    },
+/*    100: {
+        1: 2.72800,
+    },
+    200: {
+        2: 2.72900,
+    },
+    300: {
+        3: 1.87000,
+    },
+    400: {
+        4: 1.87000,
+    },
+    500: {
+        5: 3.63869,
+    },*/
+};
+
+const crush_tree = [ // three level-1 nodes, each with two level-2 nodes holding two level-3 leaves (ids 1..12, sizes 1 to 5)
+    { level: 1, children: [
+        { level: 2, children: [
+            { level: 3, id: 1, size: 3 },
+            { level: 3, id: 2, size: 2 },
+        ] },
+        { level: 2, children: [
+            { level: 3, id: 3, size: 4 },
+            { level: 3, id: 4, size: 4 },
+        ] },
+    ] },
+    { level: 1, children: [
+        { level: 2, children: [
+            { level: 3, id: 5, size: 4 },
+            { level: 3, id: 6, size: 1 },
+        ] },
+        { level: 2, children: [
+            { level: 3, id: 7, size: 3 },
+            { level: 3, id: 8, size: 5 },
+        ] },
+    ] },
+    { level: 1, children: [
+        { level: 2, children: [
+            { level: 3, id: 9, size: 5 },
+            { level: 3, id: 10, size: 2 },
+        ] },
+        { level: 2, children: [
+            { level: 3, id: 11, size: 3 },
+            { level: 3, id: 12, size: 3 },
+        ] },
+    ] },
+];
+
+async function run() // Optimize osd_tree, add then remove osd.8, and finally optimize the flattened crush_tree; stats are printed after each step
+{
+    // Test: add 1 OSD of almost the same size. Ideal data movement could be 1/12 = 8.33%. Actual is ~13%
+    // Space efficiency is ~99.5% in both cases.
+    let res = await LPOptimizer.optimize_initial(osd_tree, 256); // NOTE(review): 256 is presumably the PG count - confirm against lp-optimizer.js
+    LPOptimizer.print_change_stats(res, false);
+    console.log('adding osd.8');
+    osd_tree[500][8] = 3.58589;
+    res = await LPOptimizer.optimize_change(res.int_pgs, osd_tree); // re-optimize starting from the previous int_pgs
+    LPOptimizer.print_change_stats(res, false);
+    console.log('removing osd.8');
+    delete osd_tree[500][8];
+    res = await LPOptimizer.optimize_change(res.int_pgs, osd_tree);
+    LPOptimizer.print_change_stats(res, false);
+    res = await LPOptimizer.optimize_initial(LPOptimizer.flatten_tree(crush_tree, {}, 1, 3), 256); // independent run on the nested tree
+    LPOptimizer.print_change_stats(res, false);
+}
+
+run().catch(console.error); // a rejected run is only logged
--- a/osd.cpp
+++ b/osd.cpp
@@ -7,7 +7,7 @@

 #include "osd.h"

-static const char* osd_op_names[] = {
+const char* osd_op_names[] = {
    "",
    "read",
    "write",
@@ -21,6 +21,7 @@ static const char* osd_op_names[] = {
    "primary_read",
    "primary_write",
    "primary_sync",
+    "primary_delete",
 };

 osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop)
@@ -28,50 +29,131 @@ osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringlo
    this->config = config;
    this->bs = bs;
    this->ringloop = ringloop;
-    this->tick_tfd = new timerfd_interval(ringloop, 3, [this]()
-    {
-        for (int i = 0; i <= OSD_OP_MAX; i++)
-        {
-            if (op_stat_count[i] != 0)
-            {
-                printf("avg latency for op %d (%s): %ld us\n", i, osd_op_names[i], op_stat_sum[i]/op_stat_count[i]);
-                op_stat_count[i] = 0;
-                op_stat_sum[i] = 0;
-            }
-        }
-        for (int i = 0; i <= OSD_OP_MAX; i++)
-        {
-            if (subop_stat_count[i] != 0)
-            {
-                printf("avg latency for subop %d (%s): %ld us\n", i, osd_op_names[i], subop_stat_sum[i]/subop_stat_count[i]);
-                subop_stat_count[i] = 0;
-                subop_stat_sum[i] = 0;
-            }
-        }
-        if (send_stat_count != 0)
-        {
-            printf("avg latency to send stabilize subop: %ld us\n", send_stat_sum/send_stat_count);
-            send_stat_count = 0;
-            send_stat_sum = 0;
-        }
-    });
+
    this->bs_block_size = bs->get_block_size();
    // FIXME: use bitmap granularity instead
    this->bs_disk_alignment = bs->get_disk_alignment();

-    bind_address = config["bind_address"];
-    if (bind_address == "")
-        bind_address = "0.0.0.0";
-    bind_port = strtoull(config["bind_port"].c_str(), NULL, 10);
-    if (!bind_port || bind_port > 65535)
-        bind_port = 11203;
+    parse_config(config);
+
+    epoll_fd = epoll_create(1);
+    if (epoll_fd < 0)
+    {
+        throw std::runtime_error(std::string("epoll_create: ") + strerror(errno));
+    }
+
+    this->tfd = new timerfd_manager_t([this](int fd, bool out, std::function<void(int, int)> handler) { set_fd_handler(fd, out, handler); });
+    this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
+    {
+        print_stats();
+    });
+
+    c_cli.tfd = this->tfd;
+    c_cli.ringloop = this->ringloop;
+    c_cli.exec_op = [this](osd_op_t *op) { exec_op(op); };
+    c_cli.repeer_pgs = [this](osd_num_t peer_osd) { repeer_pgs(peer_osd); };
+
+    init_cluster();
+
+    consumer.loop = [this]() { loop(); };
+    ringloop->register_consumer(&consumer);
+}
+
+osd_t::~osd_t() // Deletes the timer manager, unregisters the ring loop consumer and closes the epoll and listening descriptors
+{
+    if (tfd)
+    {
+        delete tfd;
+        tfd = NULL;
+    }
+    ringloop->unregister_consumer(&consumer);
+    close(epoll_fd);
+    close(listen_fd);
+}
+
+void osd_t::parse_config(blockstore_config_t & config) // Reads OSD settings from config, applying defaults and limits; throws when osd_num is missing or 0
+{
+    int pos;
+    // Initial startup configuration
+    {
+        std::string ea = config["etcd_address"]; // comma-separated list of etcd addresses
+        while (1)
+        {
+            pos = ea.find(','); // relies on npos turning negative when stored in the int pos
+            std::string addr = pos >= 0 ? ea.substr(0, pos) : ea;
+            if (addr.length() > 0)
+            {
+                if (addr.find('/') == std::string::npos) // find() returns an unsigned size_t, so "< 0" could never be true
+                    addr += "/v3";
+                st_cli.etcd_addresses.push_back(addr);
+            }
+            if (pos >= 0)
+                ea = ea.substr(pos+1);
+            else
+                break;
+        }
+    }
+    st_cli.etcd_prefix = config["etcd_prefix"];
+    if (st_cli.etcd_prefix == "")
+        st_cli.etcd_prefix = "/microceph";
+    etcd_report_interval = strtoull(config["etcd_report_interval"].c_str(), NULL, 10);
+    if (etcd_report_interval <= 0)
+        etcd_report_interval = 30;
    osd_num = strtoull(config["osd_num"].c_str(), NULL, 10);
    if (!osd_num)
        throw std::runtime_error("osd_num is required in the configuration");
-    run_primary = config["run_primary"] == "true" || config["run_primary"] == "1" || config["run_primary"] == "yes";
-    if (run_primary)
-        init_primary();
+    c_cli.osd_num = osd_num;
+    run_primary = config["run_primary"] != "false" && config["run_primary"] != "0" && config["run_primary"] != "no"; // on unless explicitly disabled
+    // Cluster configuration
+    bind_address = config["bind_address"];
+    if (bind_address == "")
+        bind_address = "0.0.0.0";
+    bind_port = stoull_full(config["bind_port"]);
+    if (bind_port <= 0 || bind_port > 65535)
+        bind_port = 0; // 0 makes bind_socket() read the actual port back with getsockname()
+    if (config["immediate_commit"] == "all")
+        immediate_commit = IMMEDIATE_ALL;
+    else if (config["immediate_commit"] == "small")
+        immediate_commit = IMMEDIATE_SMALL;
+    if (config.find("autosync_interval") != config.end())
+    {
+        autosync_interval = strtoull(config["autosync_interval"].c_str(), NULL, 10);
+        if (autosync_interval > MAX_AUTOSYNC_INTERVAL) // too large a value falls back to the default
+            autosync_interval = DEFAULT_AUTOSYNC_INTERVAL;
+    }
+    if (config.find("client_queue_depth") != config.end())
+    {
+        client_queue_depth = strtoull(config["client_queue_depth"].c_str(), NULL, 10);
+        if (client_queue_depth < 128) // 128 is the enforced minimum
+            client_queue_depth = 128;
+    }
+    if (config.find("pg_stripe_size") != config.end())
+    {
+        pg_stripe_size = strtoull(config["pg_stripe_size"].c_str(), NULL, 10);
+        if (!pg_stripe_size || !bs_block_size || pg_stripe_size < bs_block_size || (pg_stripe_size % bs_block_size) != 0) // must be a non-zero multiple of the blockstore block size
+            pg_stripe_size = DEFAULT_PG_STRIPE_SIZE;
+    }
+    recovery_queue_depth = strtoull(config["recovery_queue_depth"].c_str(), NULL, 10);
+    if (recovery_queue_depth < 1 || recovery_queue_depth > MAX_RECOVERY_QUEUE)
+        recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
+    if (config["readonly"] == "true" || config["readonly"] == "1" || config["readonly"] == "yes")
+        readonly = true;
+    print_stats_interval = strtoull(config["print_stats_interval"].c_str(), NULL, 10);
+    if (!print_stats_interval)
+        print_stats_interval = 3;
+    c_cli.peer_connect_interval = strtoull(config["peer_connect_interval"].c_str(), NULL, 10);
+    if (!c_cli.peer_connect_interval)
+        c_cli.peer_connect_interval = 5;
+    c_cli.peer_connect_timeout = strtoull(config["peer_connect_timeout"].c_str(), NULL, 10);
+    if (!c_cli.peer_connect_timeout)
+        c_cli.peer_connect_timeout = 5;
+    log_level = strtoull(config["log_level"].c_str(), NULL, 10);
+    st_cli.log_level = log_level; // the same log level is handed to both client objects
+    c_cli.log_level = log_level;
+}

+void osd_t::bind_socket()
+{
    listen_fd = socket(AF_INET, SOCK_STREAM, 0);
    if (listen_fd < 0)
    {
@@ -88,13 +170,27 @@ osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringlo
        throw std::runtime_error("bind address "+bind_address+(r == 0 ? " is not valid" : ": no ipv4 support"));
    }
    addr.sin_family = AF_INET;
-    addr.sin_port = htons(bind_port);

+    addr.sin_port = htons(bind_port);
    if (bind(listen_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
    {
        close(listen_fd);
        throw std::runtime_error(std::string("bind: ") + strerror(errno));
    }
+    if (bind_port == 0)
+    {
+        socklen_t len = sizeof(addr);
+        if (getsockname(listen_fd, (sockaddr *)&addr, &len) == -1)
+        {
+            close(listen_fd);
+            throw std::runtime_error(std::string("getsockname: ") + strerror(errno));
+        }
+        listening_port = ntohs(addr.sin_port);
+    }
+    else
+    {
+        listening_port = bind_port;
+    }

    if (listen(listen_fd, listen_backlog) < 0)
    {
@@ -104,55 +200,15 @@ osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringlo

    fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);

-    epoll_fd = epoll_create(1);
-    if (epoll_fd < 0)
-    {
-        close(listen_fd);
-        throw std::runtime_error(std::string("epoll_create: ") + strerror(errno));
-    }
-
    epoll_event ev;
    ev.data.fd = listen_fd;
-    ev.events = EPOLLIN | EPOLLET;
+    ev.events = EPOLLIN;
    if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, listen_fd, &ev) < 0)
    {
        close(listen_fd);
        close(epoll_fd);
        throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
    }
-
-    consumer.loop = [this]() { loop(); };
-    ringloop->register_consumer(consumer);
-}
-
-osd_t::~osd_t()
-{
-    delete tick_tfd;
-    ringloop->unregister_consumer(consumer);
-    close(epoll_fd);
-    close(listen_fd);
-}
-
-osd_op_t::~osd_op_t()
-{
-    if (bs_op)
-    {
-        delete bs_op;
-    }
-    if (op_data)
-    {
-        free(op_data);
-    }
-    if (rmw_buf)
-    {
-        free(rmw_buf);
-    }
-    if (buf)
-    {
-        // Note: reusing osd_op_t WILL currently lead to memory leaks
-        // So we don't reuse it, but free it every time
-        free(buf);
-    }
 }

 bool osd_t::shutdown()
@@ -173,17 +229,49 @@ void osd_t::loop()
        wait_state = 1;
    }
    handle_peers();
-    read_requests();
-    send_replies();
+    c_cli.read_requests();
+    c_cli.send_replies();
+    ringloop->submit();
+}
+
+void osd_t::set_fd_handler(int fd, bool out, std::function<void(int, int)> handler) // Register, replace or (with an empty handler) remove the epoll callback for fd
+{
+    if (handler != NULL)
+    {
+        bool exists = epoll_handlers.find(fd) != epoll_handlers.end(); // already registered => modify instead of add
+        epoll_event ev;
+        ev.data.fd = fd;
+        ev.events = EPOLLIN | (out ? EPOLLOUT : 0) | EPOLLRDHUP; // EPOLLOUT is requested only when out is set
+        if (epoll_ctl(epoll_fd, exists ? EPOLL_CTL_MOD : EPOLL_CTL_ADD, fd, &ev) < 0)
+        {
+            throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
+        }
+        epoll_handlers[fd] = handler;
+    }
+    else
+    {
+        if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, NULL) < 0 && errno != ENOENT) // fd missing from the epoll set is tolerated
+        {
+            throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
+        }
+        epoll_handlers.erase(fd);
+    }
+}

 void osd_t::handle_epoll_events()
 {
+    wait_state = 0;
+        {
+            timespec now;
+            clock_gettime(CLOCK_REALTIME, &now);
+            printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
+        }
    io_uring_sqe *sqe = ringloop->get_sqe();
    if (!sqe)
    {
-        throw std::runtime_error("can't get SQE, will fall out of sync with EPOLLET");
+        return;
    }
+    wait_state = 1;
    ring_data_t *data = ((ring_data_t*)sqe->user_data);
    my_uring_prep_poll_add(sqe, epoll_fd, POLLIN);
    data->callback = [this](ring_data_t *data)
@@ -194,7 +282,6 @@ void osd_t::handle_epoll_events()
        }
        handle_epoll_events();
    };
-    ringloop->submit();
    int nfds;
    epoll_event events[MAX_EPOLL_EVENTS];
 restart:
@@ -209,25 +296,25 @@ restart:
            int peer_fd;
            while ((peer_fd = accept(listen_fd, (sockaddr*)&addr, &peer_addr_size)) >= 0)
            {
+                assert(peer_fd != 0);
                char peer_str[256];
-                printf("osd: new client %d: connection from %s port %d\n", peer_fd, inet_ntop(AF_INET, &addr.sin_addr, peer_str, 256), ntohs(addr.sin_port));
+                printf("[OSD %lu] new client %d: connection from %s port %d\n", this->osd_num, peer_fd,
+                    inet_ntop(AF_INET, &addr.sin_addr, peer_str, 256), ntohs(addr.sin_port));
                fcntl(peer_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
                int one = 1;
                setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
-                clients[peer_fd] = {
+                c_cli.clients[peer_fd] = {
                    .peer_addr = addr,
                    .peer_port = ntohs(addr.sin_port),
                    .peer_fd = peer_fd,
                    .peer_state = PEER_CONNECTED,
+                    .in_buf = malloc(c_cli.receive_buffer_size),
                };
                // Add FD to epoll
-                epoll_event ev;
-                ev.data.fd = peer_fd;
-                ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
-                if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, peer_fd, &ev) < 0)
+                set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
                {
-                    throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
-                }
+                    c_cli.handle_peer_epoll(peer_fd, epoll_events);
+                });
                // Try to accept next connection
                peer_addr_size = sizeof(addr);
            }
@@ -238,115 +325,17 @@ restart:
        }
        else
        {
-            auto & cl = clients[events[i].data.fd];
-            if (cl.peer_state == PEER_CONNECTING)
-            {
-                // Either OUT (connected) or HUP
-                handle_connect_result(cl.peer_fd);
-            }
-            else if (events[i].events & EPOLLRDHUP)
-            {
-                // Stop client
-                printf("osd: client %d disconnected\n", cl.peer_fd);
-                stop_client(cl.peer_fd);
-            }
-            else
-            {
-                // Mark client as ready (i.e. some data is available)
-                cl.read_ready++;
-                if (cl.read_ready == 1)
-                {
-                    read_ready_clients.push_back(cl.peer_fd);
-                    ringloop->wakeup();
-                }
-            }
+            auto & cb = epoll_handlers[events[i].data.fd];
+            cb(events[i].data.fd, events[i].events);
        }
    }
+    printf("%d events\n", nfds);
    if (nfds == MAX_EPOLL_EVENTS)
    {
        goto restart;
    }
 }

-void osd_t::cancel_osd_ops(osd_client_t & cl)
-{
-    for (auto p: cl.sent_ops)
-    {
-        cancel_op(p.second);
-    }
-    cl.sent_ops.clear();
-    for (auto op: cl.outbox)
-    {
-        cancel_op(op);
-    }
-    cl.outbox.clear();
-    if (cl.write_op)
-    {
-        cancel_op(cl.write_op);
-        cl.write_op = NULL;
-    }
-}
-
-void osd_t::cancel_op(osd_op_t *op)
-{
-    if (op->op_type == OSD_OP_OUT)
-    {
-        op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
-        op->reply.hdr.id = op->req.hdr.id;
-        op->reply.hdr.opcode = op->req.hdr.opcode;
-        op->reply.hdr.retval = -EPIPE;
-        op->callback(op);
-    }
-    else
-    {
-        delete op;
-    }
-}
-
-void osd_t::stop_client(int peer_fd)
-{
-    auto it = clients.find(peer_fd);
-    if (it == clients.end())
-    {
-        return;
-    }
-    auto & cl = it->second;
-    if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, peer_fd, NULL) < 0)
-    {
-        throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
-    }
-    if (cl.osd_num)
-    {
-        // Cancel outbound operations
-        cancel_osd_ops(cl);
-        osd_peer_fds.erase(cl.osd_num);
-        repeer_pgs(cl.osd_num, false);
-        peering_state |= OSD_PEERING_PEERS;
-    }
-    if (cl.read_op)
-    {
-        delete cl.read_op;
-    }
-    for (auto rit = read_ready_clients.begin(); rit != read_ready_clients.end(); rit++)
-    {
-        if (*rit == peer_fd)
-        {
-            read_ready_clients.erase(rit);
-            break;
-        }
-    }
-    for (auto wit = write_ready_clients.begin(); wit != write_ready_clients.end(); wit++)
-    {
-        if (*wit == peer_fd)
-        {
-            write_ready_clients.erase(wit);
-            break;
-        }
-    }
-    clients.erase(it);
-    close(peer_fd);
-}
-
 void osd_t::exec_op(osd_op_t *cur_op)
 {
    clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
@@ -356,23 +345,29 @@ void osd_t::exec_op(osd_op_t *cur_op)
        delete cur_op;
        return;
    }
+    inflight_ops++;
    cur_op->send_list.push_back(cur_op->reply.buf, OSD_PACKET_SIZE);
    if (cur_op->req.hdr.magic != SECONDARY_OSD_OP_MAGIC ||
        cur_op->req.hdr.opcode < OSD_OP_MIN || cur_op->req.hdr.opcode > OSD_OP_MAX ||
        (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_READ || cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE) &&
-        (cur_op->req.sec_rw.len > OSD_RW_MAX || cur_op->req.sec_rw.len % OSD_RW_ALIGN || cur_op->req.sec_rw.offset % OSD_RW_ALIGN) ||
-        (cur_op->req.hdr.opcode == OSD_OP_READ || cur_op->req.hdr.opcode == OSD_OP_WRITE) &&
-        (cur_op->req.rw.len > OSD_RW_MAX || cur_op->req.rw.len % OSD_RW_ALIGN || cur_op->req.rw.offset % OSD_RW_ALIGN))
+        (cur_op->req.sec_rw.len > OSD_RW_MAX || cur_op->req.sec_rw.len % bs_disk_alignment || cur_op->req.sec_rw.offset % bs_disk_alignment) ||
+        (cur_op->req.hdr.opcode == OSD_OP_READ || cur_op->req.hdr.opcode == OSD_OP_WRITE || cur_op->req.hdr.opcode == OSD_OP_DELETE) &&
+        (cur_op->req.rw.len > OSD_RW_MAX || cur_op->req.rw.len % bs_disk_alignment || cur_op->req.rw.offset % bs_disk_alignment))
    {
        // Bad command
-        cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
-        cur_op->reply.hdr.id = cur_op->req.hdr.id;
-        cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
-        cur_op->reply.hdr.retval = -EINVAL;
-        outbox_push(this->clients[cur_op->peer_fd], cur_op);
+        finish_op(cur_op, -EINVAL);
+        return;
+    }
+    if (readonly &&
+        cur_op->req.hdr.opcode != OSD_OP_SECONDARY_READ &&
+        cur_op->req.hdr.opcode != OSD_OP_SECONDARY_LIST &&
+        cur_op->req.hdr.opcode != OSD_OP_READ &&
+        cur_op->req.hdr.opcode != OSD_OP_SHOW_CONFIG)
+    {
+        // Readonly mode
+        finish_op(cur_op, -EROFS);
        return;
    }
-    inflight_ops++;
    if (cur_op->req.hdr.opcode == OSD_OP_TEST_SYNC_STAB_ALL)
    {
        exec_sync_stab_all(cur_op);
@@ -393,8 +388,84 @@ void osd_t::exec_op(osd_op_t *cur_op)
    {
        continue_primary_sync(cur_op);
    }
+    else if (cur_op->req.hdr.opcode == OSD_OP_DELETE)
+    {
+        continue_primary_del(cur_op);
+    }
    else
    {
        exec_secondary(cur_op);
    }
 }
+
+void osd_t::reset_stats() // Zeroes the current op statistics, the snapshot print_stats() compares against, and the recovery counters
+{
+    c_cli.stats = { 0 };
+    prev_stats = { 0 };
+    memset(recovery_stat_count, 0, sizeof(recovery_stat_count));
+    memset(recovery_stat_bytes, 0, sizeof(recovery_stat_bytes));
+}
+
+void osd_t::print_stats() // Prints per-op and per-subop average latency, recovery rates and object state counters accumulated since the previous call
+{
+    for (int i = 0; i <= OSD_OP_MAX; i++)
+    {
+        if (c_cli.stats.op_stat_count[i] != prev_stats.op_stat_count[i]) // only ops that ran since the last snapshot
+        {
+            uint64_t avg = (c_cli.stats.op_stat_sum[i] - prev_stats.op_stat_sum[i])/(c_cli.stats.op_stat_count[i] - prev_stats.op_stat_count[i]);
+            uint64_t bw = (c_cli.stats.op_stat_bytes[i] - prev_stats.op_stat_bytes[i]) / print_stats_interval;
+            if (c_cli.stats.op_stat_bytes[i] != 0) // bandwidth is shown only for ops that have ever moved bytes
+            {
+                printf(
+                    "[OSD %lu] avg latency for op %d (%s): %lu us, B/W: %.2f %s\n", osd_num, i, osd_op_names[i], avg,
+                    (bw > 1024*1024*1024 ? bw/1024.0/1024/1024 : (bw > 1024*1024 ? bw/1024.0/1024 : bw/1024.0)),
+                    (bw > 1024*1024*1024 ? "GB/s" : (bw > 1024*1024 ? "MB/s" : "KB/s"))
+                );
+            }
+            else
+            {
+                printf("[OSD %lu] avg latency for op %d (%s): %lu us\n", osd_num, i, osd_op_names[i], avg);
+            }
+            prev_stats.op_stat_count[i] = c_cli.stats.op_stat_count[i]; // advance the snapshot for this op
+            prev_stats.op_stat_sum[i] = c_cli.stats.op_stat_sum[i];
+            prev_stats.op_stat_bytes[i] = c_cli.stats.op_stat_bytes[i];
+        }
+    }
+    for (int i = 0; i <= OSD_OP_MAX; i++)
+    {
+        if (c_cli.stats.subop_stat_count[i] != prev_stats.subop_stat_count[i])
+        {
+            uint64_t avg = (c_cli.stats.subop_stat_sum[i] - prev_stats.subop_stat_sum[i])/(c_cli.stats.subop_stat_count[i] - prev_stats.subop_stat_count[i]);
+            printf("[OSD %lu] avg latency for subop %d (%s): %lu us\n", osd_num, i, osd_op_names[i], avg); // %lu: avg is unsigned, as in the op lines above
+            prev_stats.subop_stat_count[i] = c_cli.stats.subop_stat_count[i];
+            prev_stats.subop_stat_sum[i] = c_cli.stats.subop_stat_sum[i];
+        }
+    }
+    for (int i = 0; i < 2; i++)
+    {
+        if (recovery_stat_count[0][i] != recovery_stat_count[1][i]) // row 0 holds current values, row 1 the previous snapshot
+        {
+            uint64_t bw = (recovery_stat_bytes[0][i] - recovery_stat_bytes[1][i]) / print_stats_interval;
+            printf(
+                "[OSD %lu] %s recovery: %.1f op/s, B/W: %.2f %s\n", osd_num, recovery_stat_names[i],
+                (recovery_stat_count[0][i] - recovery_stat_count[1][i]) * 1.0 / print_stats_interval,
+                (bw > 1024*1024*1024 ? bw/1024.0/1024/1024 : (bw > 1024*1024 ? bw/1024.0/1024 : bw/1024.0)),
+                (bw > 1024*1024*1024 ? "GB/s" : (bw > 1024*1024 ? "MB/s" : "KB/s"))
+            );
+            recovery_stat_count[1][i] = recovery_stat_count[0][i];
+            recovery_stat_bytes[1][i] = recovery_stat_bytes[0][i];
+        }
+    }
+    if (incomplete_objects > 0)
+    {
+        printf("[OSD %lu] %lu object(s) incomplete\n", osd_num, incomplete_objects);
+    }
+    if (degraded_objects > 0)
+    {
+        printf("[OSD %lu] %lu object(s) degraded\n", osd_num, degraded_objects);
+    }
+    if (misplaced_objects > 0)
+    {
+        printf("[OSD %lu] %lu object(s) misplaced\n", osd_num, misplaced_objects);
+    }
+}
--- a/osd.h
+++ b/osd.h
@@ -15,144 +15,29 @@

 #include "blockstore.h"
 #include "ringloop.h"
-#include "timerfd_interval.h"
-#include "osd_ops.h"
+#include "timerfd_manager.h"
 #include "osd_peering_pg.h"
+#include "cluster_client.h"
+#include "etcd_state_client.h"

-#include "sparsepp/sparsepp/spp.h"
+#define OSD_LOADING_PGS 0x01
+#define OSD_PEERING_PGS 0x04
+#define OSD_FLUSHING_PGS 0x08
+#define OSD_RECOVERING 0x10

-#define OSD_OP_IN 0
-#define OSD_OP_OUT 1
+#define IMMEDIATE_NONE 0
+#define IMMEDIATE_SMALL 1
+#define IMMEDIATE_ALL 2

-#define CL_READ_OP 1
-#define CL_READ_DATA 2
-#define CL_READ_REPLY_DATA 3
-#define CL_WRITE_READY 1
-#define CL_WRITE_REPLY 2
-#define MAX_EPOLL_EVENTS 64
-#define OSD_OP_INLINE_BUF_COUNT 16
-
-#define PEER_CONNECTING 1
-#define PEER_CONNECTED 2
-#define OSD_PEERING_PEERS 1
-#define OSD_PEERING_PGS 2
+#define MAX_AUTOSYNC_INTERVAL 3600 // NOTE(review): presumably seconds, like DEFAULT_AUTOSYNC_INTERVAL - confirm
+#define DEFAULT_AUTOSYNC_INTERVAL 5 // seconds; default for osd_t::autosync_interval
+#define MAX_RECOVERY_QUEUE 2048
+#define DEFAULT_RECOVERY_QUEUE 4 // default for osd_t::recovery_queue_depth
+#define DEFAULT_PG_STRIPE_SIZE (4*1024*1024) // 4 MB by default; parenthesised so the macro is safe inside larger expressions

 //#define OSD_STUB

-struct osd_op_buf_list_t
-{
-    int count = 0, alloc = 0, sent = 0;
-    iovec *buf = NULL;
-    iovec inline_buf[OSD_OP_INLINE_BUF_COUNT];
-
-    ~osd_op_buf_list_t()
-    {
-        if (buf && buf != inline_buf)
-        {
-            free(buf);
-        }
-    }
-
-    inline iovec* get_iovec()
-    {
-        return (buf ? buf : inline_buf) + sent;
-    }
-
-    inline int get_size()
-    {
-        return count - sent;
-    }
-
-    inline void push_back(void *nbuf, size_t len)
-    {
-        if (count >= alloc)
-        {
-            if (!alloc)
-            {
-                alloc = OSD_OP_INLINE_BUF_COUNT;
-                buf = inline_buf;
-            }
-            else if (buf == inline_buf)
-            {
-                int old = alloc;
-                alloc = ((alloc/16)*16 + 1);
-                buf = (iovec*)malloc(sizeof(iovec) * alloc);
-                memcpy(buf, inline_buf, sizeof(iovec)*old);
-            }
-            else
-            {
-                alloc = ((alloc/16)*16 + 1);
-                buf = (iovec*)realloc(buf, sizeof(iovec) * alloc);
-            }
-        }
-        buf[count++] = { .iov_base = nbuf, .iov_len = len };
-    }
-};
-
-struct osd_primary_op_data_t;
-
-struct osd_op_t
-{
-    timespec tv_begin;
-    timespec tv_send;
-    int op_type = OSD_OP_IN;
-    int peer_fd;
-    osd_any_op_t req;
-    osd_any_reply_t reply;
-    blockstore_op_t *bs_op = NULL;
-    void *buf = NULL;
-    void *rmw_buf = NULL;
-    osd_primary_op_data_t* op_data = NULL;
-    std::function<void(osd_op_t*)> callback;
-
-    osd_op_buf_list_t send_list;
-
-    ~osd_op_t();
-};
-
-struct osd_peer_def_t
-{
-    osd_num_t osd_num = 0;
-    std::string addr;
-    int port = 0;
-    time_t last_connect_attempt = 0;
-};
-
-struct osd_client_t
-{
-    sockaddr_in peer_addr;
-    int peer_port;
-    int peer_fd;
-    int peer_state;
-    std::function<void(osd_num_t, int)> connect_callback;
-    osd_num_t osd_num = 0;
-
-    // Read state
-    int read_ready = 0;
-    osd_op_t *read_op = NULL;
-    int read_reply_id = 0;
-    iovec read_iov;
-    msghdr read_msg;
-    void *read_buf = NULL;
-    int read_remaining = 0;
-    int read_state = 0;
-
-    // Outbound operations sent to this client (which is probably an OSD peer)
-    std::map<int, osd_op_t*> sent_ops;
-
-    // Outbound messages (replies or requests)
-    std::deque<osd_op_t*> outbox;
-
-    // PGs dirtied by this client's primary-writes
-    std::set<pg_num_t> dirty_pgs;
-
-    // Write state
-    osd_op_t *write_op = NULL;
-    msghdr write_msg;
-    int write_state = 0;
-};
-
-struct osd_rmw_stripe_t;
+extern const char* osd_op_names[];

 struct osd_object_id_t
 {
@@ -160,26 +45,58 @@ struct osd_object_id_t
    object_id oid;
 };

+struct osd_recovery_op_t // One recovery operation; osd_t keeps these in recovery_ops, keyed by object_id
+{
+    int st = 0; // NOTE(review): presumably the progress/step marker of the operation - confirm against submit_recovery_op()
+    bool degraded = false; // NOTE(review): presumably tells degraded objects from misplaced ones - confirm
+    pg_num_t pg_num = 0; // PG number, 0 by default
+    object_id oid = { 0 }; // object this operation refers to, zero-initialised by default
+    osd_op_t *osd_op = NULL; // associated OSD operation, NULL by default
+};
+
 class osd_t
 {
    // config

+    blockstore_config_t config;
+    int etcd_report_interval = 30;
+
+    bool readonly = false;
    osd_num_t osd_num = 1; // OSD numbers start with 1
    bool run_primary = false;
-    std::vector<osd_peer_def_t> peers;
-    blockstore_config_t config;
    std::string bind_address;
    int bind_port, listen_backlog;
+    // FIXME: Implement client queue depth limit
    int client_queue_depth = 128;
    bool allow_test_ops = true;
+    int print_stats_interval = 3;
+    int immediate_commit = IMMEDIATE_NONE;
+    int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // sync every 5 seconds
+    int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
+    int log_level = 0;

-    // peer OSDs
+    // cluster state

-    std::map<uint64_t, int> osd_peer_fds;
-    std::vector<pg_t> pgs;
+    etcd_state_client_t st_cli;
+    cluster_client_t c_cli;
+    int etcd_failed_attempts = 0;
+    std::string etcd_lease_id;
+    json11::Json self_state;
+    bool loading_peer_config = false;
+    std::set<pg_num_t> pg_state_dirty;
+    bool pg_config_applied = false;
+    bool etcd_reporting_pg_state = false;
+    bool etcd_reporting_stats = false;
+
+    // peers and PGs
+
+    std::map<pg_num_t, pg_t> pgs;
+    std::set<pg_num_t> dirty_pgs;
+    uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0;
    int peering_state = 0;
    unsigned pg_count = 0;
-    uint64_t next_subop_id = 1;
+    std::map<object_id, osd_recovery_op_t> recovery_ops;
+    osd_op_t *autosync_op = NULL;

    // Unstable writes
    std::map<osd_object_id_t, uint64_t> unstable_writes;
@@ -191,53 +108,73 @@ class osd_t
    int inflight_ops = 0;
    blockstore_t *bs;
    uint32_t bs_block_size, bs_disk_alignment;
-    uint64_t parity_block_size = 4*1024*1024; // 4 MB by default
+    uint64_t pg_stripe_size = DEFAULT_PG_STRIPE_SIZE;
    ring_loop_t *ringloop;
-    timerfd_interval *tick_tfd;
+    timerfd_manager_t *tfd = NULL;

    int wait_state = 0;
    int epoll_fd = 0;
+    int listening_port = 0;
    int listen_fd = 0;
    ring_consumer_t consumer;
+    std::map<int, std::function<void(int, int)>> epoll_handlers;

-    std::unordered_map<int,osd_client_t> clients;
-    std::vector<int> read_ready_clients;
-    std::vector<int> write_ready_clients;
-    uint64_t op_stat_sum[OSD_OP_MAX+1] = { 0 };
-    uint64_t op_stat_count[OSD_OP_MAX+1] = { 0 };
-    uint64_t subop_stat_sum[OSD_OP_MAX+1] = { 0 };
-    uint64_t subop_stat_count[OSD_OP_MAX+1] = { 0 };
-    uint64_t send_stat_sum = 0;
-    uint64_t send_stat_count = 0;
+    // op statistics
+    osd_op_stats_t prev_stats;
+    const char* recovery_stat_names[2] = { "degraded", "misplaced" };
+    uint64_t recovery_stat_count[2][2] = { 0 };
+    uint64_t recovery_stat_bytes[2][2] = { 0 };

-    // methods
+    // cluster connection
+    void parse_config(blockstore_config_t & config);
+    void init_cluster();
+    void on_change_osd_state_hook(uint64_t osd_num);
+    void on_change_etcd_state_hook(json11::Json::object & changes);
+    void on_load_config_hook(json11::Json::object & changes);
+    json11::Json on_load_pgs_checks_hook();
+    void on_load_pgs_hook(bool success);
+    void bind_socket();
+    void acquire_lease();
+    json11::Json get_osd_state();
+    void create_osd_state();
+    void renew_lease();
+    void print_stats();
+    void reset_stats();
+    json11::Json get_statistics();
+    void report_statistics();
+    void report_pg_state(pg_t & pg);
+    void report_pg_states();
+    void apply_pg_count();
+    void apply_pg_config();

    // event loop, socket read/write
    void loop();
+    void set_fd_handler(int fd, bool out, std::function<void(int, int)> handler);
    void handle_epoll_events();
-    void read_requests();
-    void handle_read(ring_data_t *data, int peer_fd);
-    void handle_op_hdr(osd_client_t *cl);
-    void handle_reply_hdr(osd_client_t *cl);
-    bool try_send(osd_client_t & cl);
-    void send_replies();
-    void handle_send(ring_data_t *data, int peer_fd);
-    void outbox_push(osd_client_t & cl, osd_op_t *op);

    // peer handling (primary OSD logic)
-    void connect_peer(osd_num_t osd_num, const char *peer_host, int peer_port, std::function<void(osd_num_t, int)> callback);
-    void handle_connect_result(int peer_fd);
-    void cancel_osd_ops(osd_client_t & cl);
-    void cancel_op(osd_op_t *op);
-    void stop_client(int peer_fd);
-    osd_peer_def_t parse_peer(std::string peer);
-    void init_primary();
+    void parse_test_peer(std::string peer);
    void handle_peers();
-    void repeer_pgs(osd_num_t osd_num, bool is_connected);
-    void start_pg_peering(int i);
+    void repeer_pgs(osd_num_t osd_num);
+    void start_pg_peering(pg_num_t pg_num);
+    void submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *ps);
+    void submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps);
+    void discard_list_subop(osd_op_t *list_op);
+    bool stop_pg(pg_num_t pg_num);
+    void finish_stop_pg(pg_t & pg);
+
+    // flushing, recovery and backfill
+    void submit_pg_flush_ops(pg_num_t pg_num);
+    void handle_flush_op(bool rollback, pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t peer_osd, int retval);
+    void submit_flush_op(pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data);
+    bool pick_next_recovery(osd_recovery_op_t &op);
+    void submit_recovery_op(osd_recovery_op_t *op);
+    bool continue_recovery();
+    pg_osd_set_state_t* change_osd_set(pg_osd_set_state_t *st, pg_t *pg);

    // op execution
    void exec_op(osd_op_t *cur_op);
+    void finish_op(osd_op_t *cur_op, int retval);

    // secondary ops
    void exec_sync_stab_all(osd_op_t *cur_op);
@@ -246,18 +183,33 @@ class osd_t
    void secondary_op_callback(osd_op_t *cur_op);

    // primary ops
+    void autosync();
    bool prepare_primary_rw(osd_op_t *cur_op);
    void continue_primary_read(osd_op_t *cur_op);
    void continue_primary_write(osd_op_t *cur_op);
    void continue_primary_sync(osd_op_t *cur_op);
-    void finish_primary_op(osd_op_t *cur_op, int retval);
-    void handle_primary_subop(osd_op_t *cur_op, int ok, uint64_t version);
+    void continue_primary_del(osd_op_t *cur_op);
+    bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
+    void remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg);
+    bool finalize_primary_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
+    void handle_primary_subop(uint64_t opcode, osd_op_t *cur_op, int retval, int expected, uint64_t version);
+    void handle_primary_bs_subop(osd_op_t *subop);
+    void add_bs_subop_stats(osd_op_t *subop);
+    void pg_cancel_write_queue(pg_t & pg, object_id oid, int retval);
    void submit_primary_subops(int submit_type, int read_pg_size, const uint64_t* osd_set, osd_op_t *cur_op);
+    void submit_primary_del_subops(osd_op_t *cur_op, uint64_t *cur_set, pg_osd_set_t & loc_set);
    void submit_primary_sync_subops(osd_op_t *cur_op);
    void submit_primary_stab_subops(osd_op_t *cur_op);
+
+    inline pg_num_t map_to_pg(object_id oid) // Maps an object to a PG number in the range 1..pg_count
+    {
+        return (oid.inode + oid.stripe / pg_stripe_size) % pg_count + 1; // NOTE(review): takes the modulo by pg_count, which is initialised to 0 - confirm callers only run after a PG count is set
+    }
+
 public:
    osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop);
    ~osd_t();
+    void force_stop(int exitcode);
    bool shutdown();
 };

--- a/osd_cluster.cpp
+++ b/osd_cluster.cpp
@@ -0,0 +1,740 @@
+#include "osd.h"
+#include "base64.h"
+#include "etcd_state_client.h"
+
+// Startup sequence:
+//   Start etcd watcher -> Load global OSD configuration -> Bind socket -> Acquire lease -> Report&lock OSD state
+//   -> Load PG config -> Report&lock PG states -> Load peers -> Connect to peers -> Peer PGs
+// Event handling
+//   Wait for PG changes -> Start/Stop PGs when requested
+//   Peer connection is lost -> Reload connection data -> Try to reconnect
+void osd_t::init_cluster() // Sets up cluster state: a fixed test configuration when no etcd address is known, the etcd-driven startup otherwise
+{
+    if (!st_cli.etcd_addresses.size()) // no etcd addresses configured
+    {
+        if (run_primary)
+        {
+            // Test version of clustering code with 1 PG and 2 peers
+            // Example: peers = 2:127.0.0.1:11204,3:127.0.0.1:11205
+            std::string peerstr = config["peers"];
+            while (peerstr.size()) // consume the comma-separated list one entry at a time
+            {
+                int pos = peerstr.find(','); // result is narrowed to int; a negative value is treated as "no comma left"
+                parse_test_peer(pos < 0 ? peerstr : peerstr.substr(0, pos));
+                peerstr = pos < 0 ? std::string("") : peerstr.substr(pos+1);
+            }
+            if (st_cli.peer_states.size() < 2)
+            {
+                throw std::runtime_error("run_primary requires at least 2 peers");
+            }
+            pgs[1] = (pg_t){ // the single hard-coded test PG
+                .state = PG_PEERING,
+                .pg_cursize = 0,
+                .pg_num = 1,
+                .target_set = { 1, 2, 3 }, // fixed OSD numbers, independent of the parsed peer list
+                .cur_set = { 0, 0, 0 },
+            };
+            report_pg_state(pgs[1]);
+            pg_count = 1;
+        }
+        bind_socket();
+    }
+    else
+    {
+        st_cli.tfd = tfd; // hand the timer manager, log level and callbacks over to the etcd state client
+        st_cli.log_level = log_level;
+        st_cli.on_change_osd_state_hook = [this](uint64_t peer_osd) { on_change_osd_state_hook(peer_osd); };
+        st_cli.on_change_hook = [this](json11::Json::object & changes) { on_change_etcd_state_hook(changes); };
+        st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
+        st_cli.load_pgs_checks_hook = [this]() { return on_load_pgs_checks_hook(); };
+        st_cli.on_load_pgs_hook = [this](bool success) { on_load_pgs_hook(success); };
+        peering_state = OSD_LOADING_PGS; // flag is cleared again in on_load_pgs_hook()
+        st_cli.load_global_config();
+    }
+    if (run_primary && autosync_interval > 0)
+    {
+        this->tfd->set_timer(autosync_interval*1000, true, [this](int timer_id) // NOTE(review): presumably a repeating timer in milliseconds - confirm against timerfd_manager_t::set_timer
+        {
+            autosync();
+        });
+    }
+}
+
+void osd_t::parse_test_peer(std::string peer) // Parses one test peer entry, records it in st_cli.peer_states and starts connecting to it
+{
+    // Expected format: OSD_NUM:IP:PORT. Throws std::runtime_error on a malformed entry, a zero/unparsable OSD number or port, or a duplicate OSD number
+    int pos1 = peer.find(':');
+    int pos2 = peer.find(':', pos1+1);
+    if (pos1 < 0 || pos2 < 0)
+        throw std::runtime_error("OSD peer string must be in the form OSD_NUM:IP:PORT"); // thrown by value (not `throw new`) so catch (std::exception&) handlers see it and nothing leaks
+    std::string addr = peer.substr(pos1+1, pos2-pos1-1);
+    std::string osd_num_str = peer.substr(0, pos1);
+    std::string port_str = peer.substr(pos2+1);
+    osd_num_t peer_osd = strtoull(osd_num_str.c_str(), NULL, 10);
+    if (!peer_osd) // strtoull() yields 0 for unparsable input, and 0 is rejected as an OSD number
+        throw std::runtime_error("Could not parse OSD peer osd_num");
+    else if (st_cli.peer_states.find(peer_osd) != st_cli.peer_states.end())
+        throw std::runtime_error("Same osd number "+std::to_string(peer_osd)+" specified twice in peers");
+    int port = strtoull(port_str.c_str(), NULL, 10);
+    if (!port)
+        throw std::runtime_error("Could not parse OSD peer port");
+    st_cli.peer_states[peer_osd] = json11::Json::object { // same shape as the state object built by get_osd_state()
+        { "state", "up" },
+        { "addresses", json11::Json::array { addr } },
+        { "port", port },
+    };
+    c_cli.connect_peer(peer_osd, json11::Json::array { addr }, port);
+}
+
+json11::Json osd_t::get_osd_state() // Builds the JSON object describing this OSD: state, addresses, host, port and enabled roles
+{
+    std::vector<char> hostname;
+    hostname.resize(1024);
+    while (gethostname(hostname.data(), hostname.size()) < 0 && errno == ENAMETOOLONG) // grow the buffer by 1024 bytes and retry while the name does not fit
+        hostname.resize(hostname.size()+1024);
+    hostname.resize(strnlen(hostname.data(), hostname.size())); // trim the buffer to the length of the name
+    json11::Json::object st;
+    st["state"] = "up";
+    if (bind_address != "0.0.0.0")
+        st["addresses"] = json11::Json::array { bind_address };
+    else
+        st["addresses"] = getifaddr_list(); // bound to the wildcard address: report whatever getifaddr_list() returns instead
+    st["host"] = std::string(hostname.data(), hostname.size());
+    st["port"] = listening_port;
+    st["primary_enabled"] = run_primary;
+    st["blockstore_enabled"] = bs ? true : false; // bs may be NULL
+    return st;
+}
+
+json11::Json osd_t::get_statistics() // Builds the JSON statistics report: timestamp, blockstore size/free space, per-op, per-subop and recovery counters
+{
+    json11::Json::object st;
+    timespec ts;
+    clock_gettime(CLOCK_REALTIME, &ts);
+    char time_str[50] = { 0 };
+    sprintf(time_str, "%ld.%03ld", ts.tv_sec, ts.tv_nsec/1000000); // wall-clock time as seconds.milliseconds
+    st["time"] = time_str;
+    st["blockstore_ready"] = bs ? bs->is_started() : false; // bs may be NULL (it is checked right below), so it must not be dereferenced unconditionally
+    if (bs)
+    {
+        st["size"] = bs->get_block_count() * bs->get_block_size();
+        st["free"] = bs->get_free_block_count() * bs->get_block_size();
+    }
+    st["host"] = self_state["host"]; // self_state is filled by create_osd_state()
+    json11::Json::object op_stats, subop_stats;
+    for (int i = 0; i <= OSD_OP_MAX; i++) // cumulative per-opcode counters taken from the cluster client
+    {
+        op_stats[osd_op_names[i]] = json11::Json::object {
+            { "count", c_cli.stats.op_stat_count[i] },
+            { "usec", c_cli.stats.op_stat_sum[i] },
+            { "bytes", c_cli.stats.op_stat_bytes[i] },
+        };
+    }
+    for (int i = 0; i <= OSD_OP_MAX; i++) // same for sub-operations, which have no byte counter
+    {
+        subop_stats[osd_op_names[i]] = json11::Json::object {
+            { "count", c_cli.stats.subop_stat_count[i] },
+            { "usec", c_cli.stats.subop_stat_sum[i] },
+        };
+    }
+    st["op_stats"] = op_stats;
+    st["subop_stats"] = subop_stats;
+    st["recovery_stats"] = json11::Json::object { // row [0] of the recovery arrays holds the running totals reported here
+        { recovery_stat_names[0], json11::Json::object {
+            { "count", recovery_stat_count[0][0] },
+            { "bytes", recovery_stat_bytes[0][0] },
+        } },
+        { recovery_stat_names[1], json11::Json::object {
+            { "count", recovery_stat_count[0][1] },
+            { "bytes", recovery_stat_bytes[0][1] },
+        } },
+    };
+    return st;
+}
+
+void osd_t::report_statistics() // Writes OSD statistics and per-PG statistics to etcd in a single transaction
+{
+    if (etcd_reporting_stats) // a previous report is still in flight
+    {
+        return;
+    }
+    etcd_reporting_stats = true;
+    json11::Json::array txn = { json11::Json::object { // first request: this OSD's own statistics
+        { "request_put", json11::Json::object {
+            { "key", base64_encode(st_cli.etcd_prefix+"/osd/stats/"+std::to_string(osd_num)) },
+            { "value", base64_encode(get_statistics().dump()) },
+        } }
+    } };
+    for (auto & p: pgs) // one more put per PG that is not offline or starting
+    {
+        auto & pg = p.second;
+        if (pg.state & (PG_OFFLINE | PG_STARTING))
+        {
+            // Don't report statistics for offline PGs
+            continue;
+        }
+        json11::Json::object pg_stats;
+        pg_stats["object_count"] = pg.total_count;
+        pg_stats["clean_count"] = pg.clean_count;
+        pg_stats["misplaced_count"] = pg.misplaced_objects.size();
+        pg_stats["degraded_count"] = pg.degraded_objects.size();
+        pg_stats["incomplete_count"] = pg.incomplete_objects.size();
+        pg_stats["write_osd_set"] = pg.cur_set;
+        txn.push_back(json11::Json::object {
+            { "request_put", json11::Json::object {
+                { "key", base64_encode(st_cli.etcd_prefix+"/pg/stats/"+std::to_string(pg.pg_num)) },
+                { "value", base64_encode(json11::Json(pg_stats).dump()) },
+            } }
+        });
+    }
+    st_cli.etcd_txn(json11::Json::object { { "success", txn } }, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json res) // no "compare" section: the puts are unconditional
+    {
+        etcd_reporting_stats = false;
+        if (err != "") // the call itself failed
+        {
+            printf("[OSD %lu] Error reporting state to etcd: %s\n", this->osd_num, err.c_str());
+            // Retry indefinitely
+            tfd->set_timer(ETCD_SLOW_TIMEOUT, false, [this](int timer_id)
+            {
+                report_statistics();
+            });
+        }
+        else if (res["error"].string_value() != "") // etcd answered with an error: stop the OSD
+        {
+            printf("[OSD %lu] Error reporting state to etcd: %s\n", this->osd_num, res["error"].string_value().c_str());
+            force_stop(1);
+        }
+    });
+}
+
+void osd_t::on_change_osd_state_hook(uint64_t peer_osd) // Installed as st_cli.on_change_osd_state_hook by init_cluster()
+{
+    if (c_cli.wanted_peers.find(peer_osd) != c_cli.wanted_peers.end()) // only peers listed in c_cli.wanted_peers are (re)connected
+    {
+        c_cli.connect_peer(peer_osd, st_cli.peer_states[peer_osd]["addresses"], st_cli.peer_states[peer_osd]["port"].int64_value());
+    }
+}
+
+void osd_t::on_change_etcd_state_hook(json11::Json::object & changes) // Installed as st_cli.on_change_hook; `changes` itself is not inspected here
+{
+    // FIXME apply config changes in runtime (maybe, some)
+    apply_pg_count(); // re-validate the PG count, then re-apply the PG configuration held by st_cli
+    apply_pg_config();
+}
+
+void osd_t::on_load_config_hook(json11::Json::object & global_config) // Merges the global configuration into the local one, then continues startup
+{
+    blockstore_config_t osd_config = this->config;
+    for (auto & cfg_var: global_config)
+    {
+        if (this->config.find(cfg_var.first) == this->config.end()) // keys already present in the local config are not overridden
+        {
+            // FIXME Convert int to str
+            osd_config[cfg_var.first] = cfg_var.second.string_value();
+        }
+    }
+    parse_config(osd_config);
+    bind_socket(); // next startup steps: bind the socket, start the etcd watcher, acquire the lease
+    st_cli.start_etcd_watcher();
+    acquire_lease();
+}
+
+// Acquire an etcd lease, retrying until one is granted; periodic renewal is armed exactly once, after a lease ID is known
+void osd_t::acquire_lease()
+{
+    // Maximum lease TTL is (report interval) + retries * (timeout + repeat interval)
+    st_cli.etcd_call("/lease/grant", json11::Json::object {
+        { "TTL", etcd_report_interval+(MAX_ETCD_ATTEMPTS*(2*ETCD_QUICK_TIMEOUT)+999)/1000 }
+    }, ETCD_QUICK_TIMEOUT, [this](std::string err, json11::Json data)
+    {
+        if (err != "" || data["ID"].string_value() == "")
+        {
+            printf("Error acquiring a lease from etcd: %s\n", err != "" ? err.c_str() : "no lease ID in response"); // err is empty when only the ID is missing
+            tfd->set_timer(ETCD_QUICK_TIMEOUT, false, [this](int timer_id) // retry; this path no longer registers another renewal timer
+            {
+                acquire_lease();
+            });
+            return;
+        }
+        etcd_lease_id = data["ID"].string_value();
+        printf("[OSD %lu] reporting to etcd at %s every %d seconds\n", this->osd_num, config["etcd_address"].c_str(), etcd_report_interval);
+        tfd->set_timer(etcd_report_interval*1000, true, [this](int timer_id) // armed on success only, so retries cannot stack timers and renew_lease() never runs without a lease ID
+        {
+            renew_lease();
+        });
+        create_osd_state();
+    });
+}
+
+// Report "up" state once, then keep it alive using the lease
+// Do it first to allow "monitors" check it when moving PGs
+void osd_t::create_osd_state()
+{
+    std::string state_key = base64_encode(st_cli.etcd_prefix+"/osd/state/"+std::to_string(osd_num));
+    self_state = get_osd_state(); // kept in self_state; get_statistics() reads the "host" field from it
+    st_cli.etcd_txn(json11::Json::object {
+        // Check that the state key does not exist
+        { "compare", json11::Json::array {
+            json11::Json::object {
+                { "target", "CREATE" },
+                { "create_revision", 0 },
+                { "key", state_key },
+            }
+        } },
+        { "success", json11::Json::array { // key absent: create it, attached to our lease
+            json11::Json::object {
+                { "request_put", json11::Json::object {
+                    { "key", state_key },
+                    { "value", base64_encode(self_state.dump()) },
+                    { "lease", etcd_lease_id },
+                } }
+            },
+        } },
+        { "failure", json11::Json::array { // key present: read it back so the existing state can be printed
+            json11::Json::object {
+                { "request_range", json11::Json::object {
+                    { "key", state_key },
+                } }
+            },
+        } },
+    }, ETCD_QUICK_TIMEOUT, [this](std::string err, json11::Json data)
+    {
+        if (err != "")
+        {
+            etcd_failed_attempts++; // counter is shared with renew_lease(), which resets it on success
+            printf("Error creating OSD state key: %s\n", err.c_str());
+            if (etcd_failed_attempts > MAX_ETCD_ATTEMPTS)
+            {
+                // Die
+                throw std::runtime_error("Cluster connection failed");
+            }
+            // Retry
+            tfd->set_timer(ETCD_QUICK_TIMEOUT, false, [this](int timer_id)
+            {
+                create_osd_state();
+            });
+            return;
+        }
+        if (!data["succeeded"].bool_value()) // the "compare" check failed, i.e. the state key already exists
+        {
+            // OSD is already up
+            auto kv = st_cli.parse_etcd_kv(data["responses"][0]["response_range"]["kvs"][0]);
+            printf("Key %s already exists in etcd, OSD %lu is still up\n", kv.key.c_str(), this->osd_num);
+            int64_t port = kv.value["port"].int64_value();
+            for (auto & addr: kv.value["addresses"].array_items())
+            {
+                printf("  listening at: %s:%ld\n", addr.string_value().c_str(), port);
+            }
+            force_stop(0);
+            return;
+        }
+        if (run_primary) // PG configuration is loaded only when this OSD runs as a primary
+        {
+            st_cli.load_pgs();
+        }
+    });
+}
+
+// Renew the etcd lease; on success also report statistics
+void osd_t::renew_lease()
+{
+    st_cli.etcd_call("/lease/keepalive", json11::Json::object {
+        { "ID", etcd_lease_id }
+    }, ETCD_QUICK_TIMEOUT, [this](std::string err, json11::Json data)
+    {
+        if (err == "" && data["result"]["TTL"].string_value() == "") // call succeeded but the reply carries no TTL
+        {
+            // Die
+            throw std::runtime_error("etcd lease has expired");
+        }
+        if (err != "")
+        {
+            etcd_failed_attempts++;
+            printf("Error renewing etcd lease: %s\n", err.c_str());
+            if (etcd_failed_attempts > MAX_ETCD_ATTEMPTS)
+            {
+                // Die
+                throw std::runtime_error("Cluster connection failed");
+            }
+            // Retry
+            tfd->set_timer(ETCD_QUICK_TIMEOUT, false, [this](int timer_id)
+            {
+                renew_lease();
+            });
+        }
+        else
+        {
+            etcd_failed_attempts = 0; // a successful renewal clears the failure counter
+            report_statistics();
+        }
+    });
+}
+
+void osd_t::force_stop(int exitcode) // Terminates the process with `exitcode`, revoking the etcd lease first when one is held
+{
+    if (etcd_lease_id != "")
+    {
+        st_cli.etcd_call("/kv/lease/revoke", json11::Json::object {
+            { "ID", etcd_lease_id }
+        }, ETCD_QUICK_TIMEOUT, [this, exitcode](std::string err, json11::Json data) // exit() runs in this callback, so callers return normally first
+        {
+            if (err != "") // a failed revoke is only logged; the process exits regardless
+            {
+                printf("Error revoking etcd lease: %s\n", err.c_str());
+            }
+            printf("[OSD %lu] Force stopping\n", this->osd_num);
+            exit(exitcode);
+        });
+    }
+    else
+    {
+        printf("[OSD %lu] Force stopping\n", this->osd_num); // no lease to revoke: exit immediately
+        exit(exitcode);
+    }
+}
+
+json11::Json osd_t::on_load_pgs_checks_hook() // Installed as st_cli.load_pgs_checks_hook; returns a one-element array of checks
+{
+    assert(this->pgs.size() == 0); // expects that no PGs have been set up yet
+    json11::Json::array checks = {
+        json11::Json::object { // check that this OSD's state key is attached to our lease
+            { "target", "LEASE" },
+            { "lease", etcd_lease_id },
+            { "key", base64_encode(st_cli.etcd_prefix+"/osd/state/"+std::to_string(osd_num)) },
+        }
+    };
+    return checks;
+}
+
+void osd_t::on_load_pgs_hook(bool success) // Installed as st_cli.on_load_pgs_hook; `success` is the outcome of the PG load
+{
+    if (!success)
+    {
+        printf("Error loading PGs from etcd: lease expired\n");
+        force_stop(1);
+    }
+    else
+    {
+        peering_state &= ~OSD_LOADING_PGS; // clears the flag set in init_cluster()
+        apply_pg_count();
+        apply_pg_config();
+    }
+}
+
+void osd_t::apply_pg_count() // Validates the PG numbering in st_cli.pg_config and stores the new PG count in this->pg_count
+{
+    pg_num_t pg_count = st_cli.pg_config.size(); // local variable shadows the member; the member is always written as this->pg_count below
+    if (pg_count > 0 && (st_cli.pg_config.begin()->first != 1 || std::prev(st_cli.pg_config.end())->first != pg_count)) // lowest key must be 1 and highest key must equal the entry count
+    {
+        printf("Invalid PG configuration: PG numbers don't cover the whole 1..%d range\n", pg_count);
+        force_stop(1);
+        return;
+    }
+    if (this->pg_count != 0 && this->pg_count != pg_count)
+    {
+        // Check that all PGs are offline. It is not allowed to change PG count when any PGs are online
+        // The external tool must wait for all PGs to come down before changing PG count
+        // If it doesn't wait, a restarted OSD may apply the new count immediately which will lead to bugs
+        // So an OSD just dies if it detects PG count change while there are active PGs
+        int still_active = 0;
+        for (auto & kv: pgs)
+        {
+            if (kv.second.state & PG_ACTIVE)
+            {
+                still_active++;
+            }
+        }
+        if (still_active > 0)
+        {
+            printf("[OSD %lu] PG count change detected, but %d PG(s) are still active. This is not allowed. Exiting\n", this->osd_num, still_active);
+            force_stop(1);
+            return;
+        }
+    }
+    this->pg_count = pg_count;
+}
+
+void osd_t::apply_pg_config() // Walks st_cli.pg_config and starts, restarts or stops local PGs to match it
+{
+    bool all_applied = true; // becomes false whenever some PG has to be revisited by a later call
+    for (auto & kv: st_cli.pg_config)
+    {
+        pg_num_t pg_num = kv.first;
+        auto & pg_cfg = kv.second;
+        bool take = pg_cfg.exists && pg_cfg.primary == this->osd_num && // PG exists and this OSD is its configured primary...
+            !pg_cfg.pause && (!pg_cfg.cur_primary || pg_cfg.cur_primary == this->osd_num); // ...it is not paused, and no other OSD is the current primary
+        bool currently_taken = this->pgs.find(pg_num) != this->pgs.end() && // a local entry exists...
+            this->pgs[pg_num].state != PG_OFFLINE; // ...and it is not offline
+        if (currently_taken && !take)
+        {
+            // Stop this PG
+            stop_pg(pg_num);
+        }
+        else if (take)
+        {
+            // Take this PG
+            std::set<osd_num_t> all_peers; // non-zero OSD numbers from target_set, all_peers and target_history, de-duplicated
+            for (osd_num_t pg_osd: pg_cfg.target_set)
+            {
+                if (pg_osd != 0)
+                {
+                    all_peers.insert(pg_osd);
+                }
+            }
+            for (osd_num_t pg_osd: pg_cfg.all_peers)
+            {
+                if (pg_osd != 0)
+                {
+                    all_peers.insert(pg_osd);
+                }
+            }
+            for (auto & hist_item: pg_cfg.target_history)
+            {
+                for (auto pg_osd: hist_item)
+                {
+                    if (pg_osd != 0)
+                    {
+                        all_peers.insert(pg_osd);
+                    }
+                }
+            }
+            if (currently_taken)
+            {
+                if (this->pgs[pg_num].state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING))
+                {
+                    if (this->pgs[pg_num].target_set == pg_cfg.target_set)
+                    {
+                        // No change in osd_set; history changes are ignored
+                        continue;
+                    }
+                    else
+                    {
+                        // Stop PG, reapply change after stopping
+                        stop_pg(pg_num);
+                        all_applied = false;
+                        continue;
+                    }
+                }
+                else if (this->pgs[pg_num].state & PG_STOPPING)
+                {
+                    // Reapply change after stopping
+                    all_applied = false;
+                    continue;
+                }
+                else if (this->pgs[pg_num].state & PG_STARTING)
+                {
+                    if (pg_cfg.cur_primary == this->osd_num)
+                    {
+                        // PG locked, continue
+                    }
+                    else
+                    {
+                        // Reapply change after locking the PG
+                        all_applied = false;
+                        continue;
+                    }
+                }
+                else
+                {
+                    throw std::runtime_error("Unexpected PG "+std::to_string(pg_num)+" state: "+std::to_string(this->pgs[pg_num].state));
+                }
+            }
+            this->pgs[pg_num] = (pg_t){ // (re)create the local entry; this overwrites any previous one for pg_num
+                .state = pg_cfg.cur_primary == this->osd_num ? PG_PEERING : PG_STARTING, // PG_STARTING until etcd shows this OSD as the current primary
+                .pg_cursize = 0,
+                .pg_num = pg_num,
+                .target_history = pg_cfg.target_history,
+                .all_peers = std::vector<osd_num_t>(all_peers.begin(), all_peers.end()),
+                .target_set = pg_cfg.target_set,
+            };
+            this->pg_state_dirty.insert(pg_num); // queue this PG for report_pg_states()
+            this->pgs[pg_num].print_state();
+            if (pg_cfg.cur_primary == this->osd_num)
+            {
+                // Add peers
+                for (auto pg_osd: all_peers)
+                {
+                    if (pg_osd != this->osd_num && c_cli.osd_peer_fds.find(pg_osd) == c_cli.osd_peer_fds.end()) // skip ourselves and peers already present in osd_peer_fds
+                    {
+                        c_cli.connect_peer(pg_osd, st_cli.peer_states[pg_osd]["addresses"], st_cli.peer_states[pg_osd]["port"].int64_value());
+                    }
+                }
+                start_pg_peering(pg_num);
+            }
+            else
+            {
+                // Reapply change after locking the PG
+                all_applied = false;
+            }
+        }
+    }
+    report_pg_states();
+    this->pg_config_applied = all_applied;
+}
+
+void osd_t::report_pg_states() // Push the state of all dirty PGs to etcd in one transaction; only one such transaction runs at a time
+{
+    if (etcd_reporting_pg_state || !this->pg_state_dirty.size() || !st_cli.etcd_addresses.size())
+    {
+        return; // A report is already in flight, nothing is dirty, or no etcd address is known
+    }
+    etcd_reporting_pg_state = true;
+    std::vector<std::pair<pg_num_t,bool>> reporting_pgs; // (PG number, history_changed at submit time), used to restore flags on failure
+    json11::Json::array checks;
+    json11::Json::array success;
+    json11::Json::array failure;
+    for (auto it = pg_state_dirty.begin(); it != pg_state_dirty.end(); it++)
+    {
+        auto pg_it = this->pgs.find(*it);
+        if (pg_it == this->pgs.end())
+        {
+            continue; // Dirty PG is not present in this->pgs anymore, skip it
+        }
+        auto & pg = pg_it->second;
+        reporting_pgs.push_back({ pg.pg_num, pg.history_changed });
+        std::string state_key_base64 = base64_encode(st_cli.etcd_prefix+"/pg/state/"+std::to_string(pg.pg_num));
+        if (pg.state == PG_STARTING)
+        {
+            // Check that the PG key does not exist
+            // Failed check indicates an unsuccessful PG lock attempt in this case
+            checks.push_back(json11::Json::object {
+                { "target", "VERSION" },
+                { "version", 0 },
+                { "key", state_key_base64 },
+            });
+        }
+        else
+        {
+            // Check that the key is ours
+            // Failed check indicates success for OFFLINE pgs (PG lock is already deleted)
+            // and an unexpected race condition for started pgs (PG lock is held by someone else)
+            checks.push_back(json11::Json::object {
+                { "target", "LEASE" },
+                { "lease", etcd_lease_id },
+                { "key", state_key_base64 },
+            });
+        }
+        if (pg.state == PG_OFFLINE)
+        {
+            success.push_back(json11::Json::object { // Offline PG: delete its state key
+                { "request_delete_range", json11::Json::object {
+                    { "key", state_key_base64 },
+                } }
+            });
+        }
+        else
+        {
+            json11::Json::array pg_state_keywords; // Names of all state bits currently set in pg.state
+            for (int i = 0; i < pg_state_bit_count; i++)
+            {
+                if (pg.state & pg_state_bits[i])
+                {
+                    pg_state_keywords.push_back(pg_state_names[i]);
+                }
+            }
+            success.push_back(json11::Json::object {
+                { "request_put", json11::Json::object {
+                    { "key", state_key_base64 }, // Same key as in the compare and range requests, no need to encode it again
+                    { "value", base64_encode(json11::Json(json11::Json::object {
+                        { "primary", this->osd_num },
+                        { "state", pg_state_keywords },
+                        { "peers", pg.cur_peers },
+                    }).dump()) },
+                    { "lease", etcd_lease_id },
+                } }
+            });
+            if (pg.history_changed)
+            {
+                pg.history_changed = false; // Put back by the callback if the transaction fails
+                if (pg.state == PG_ACTIVE)
+                {
+                    success.push_back(json11::Json::object { // State is exactly PG_ACTIVE: drop the history key
+                        { "request_delete_range", json11::Json::object {
+                            { "key", base64_encode(st_cli.etcd_prefix+"/pg/history/"+std::to_string(pg.pg_num)) },
+                        } }
+                    });
+                }
+                else if (pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD))
+                {
+                    success.push_back(json11::Json::object { // Store all_peers as the PG history
+                        { "request_put", json11::Json::object {
+                            { "key", base64_encode(st_cli.etcd_prefix+"/pg/history/"+std::to_string(pg.pg_num)) },
+                            { "value", base64_encode(json11::Json(json11::Json::object {
+                                { "all_peers", pg.all_peers },
+                            }).dump()) },
+                        } }
+                    });
+                }
+            }
+        }
+        failure.push_back(json11::Json::object { // On a failed compare, read the state key back so the callback can inspect it
+            { "request_range", json11::Json::object {
+                { "key", state_key_base64 },
+            } }
+        });
+    }
+    pg_state_dirty.clear(); // Everything dirty is part of this transaction now; flags are restored below on failure
+    st_cli.etcd_txn(json11::Json::object {
+        { "compare", checks }, { "success", success }, { "failure", failure }
+    }, ETCD_QUICK_TIMEOUT, [this, reporting_pgs](std::string err, json11::Json data)
+    {
+        etcd_reporting_pg_state = false;
+        if (!data["succeeded"].bool_value()) // err itself is not inspected; anything but succeeded == true takes this path
+        {
+            // One of PG state updates failed, put dirty flags back
+            for (auto pp: reporting_pgs)
+            {
+                this->pg_state_dirty.insert(pp.first);
+                if (pp.second)
+                {
+                    auto pg_it = this->pgs.find(pp.first);
+                    if (pg_it != this->pgs.end())
+                    {
+                        pg_it->second.history_changed = true;
+                    }
+                }
+            }
+            for (auto & res: data["responses"].array_items())
+            {
+                if (res["kvs"].array_items().size())
+                {
+                    auto kv = st_cli.parse_etcd_kv(res["kvs"][0]);
+                    pg_num_t pg_num = stoull_full(kv.key.substr(st_cli.etcd_prefix.length()+10)); // 10 == length of "/pg/state/"
+                    auto pg_it = pgs.find(pg_num);
+                    if (pg_it != pgs.end() && pg_it->second.state != PG_OFFLINE && pg_it->second.state != PG_STARTING)
+                    {
+                        // Live PG state update failed
+                        printf("Failed to report state of PG %u which is live. Race condition detected, exiting\n", pg_num);
+                        force_stop(1);
+                        return;
+                    }
+                }
+            }
+            // Retry after a short pause (hope we'll get some updates and update PG states accordingly)
+            tfd->set_timer(500, false, [this](int) { report_pg_states(); });
+        }
+        else
+        {
+            // Success. We'll get our changes back via the watcher and react to them
+            for (auto pp: reporting_pgs)
+            {
+                auto pg_it = this->pgs.find(pp.first);
+                if (pg_it != this->pgs.end())
+                {
+                    if (pg_it->second.state == PG_OFFLINE)
+                    {
+                        // Remove offline PGs after reporting their state
+                        this->pgs.erase(pg_it);
+                    }
+                }
+            }
+            // Push other PG state updates, if any
+            report_pg_states();
+            if (!this->pg_state_dirty.size())
+            {
+                // Update statistics
+                report_statistics();
+            }
+        }
+    });
+}
--- a/osd_flush.cpp
+++ b/osd_flush.cpp
@@ -0,0 +1,296 @@
+#include "osd.h"
+
+#define FLUSH_BATCH 512
+
+void osd_t::submit_pg_flush_ops(pg_num_t pg_num) // Start one flush batch for the PG: group pending rollback/stabilize actions per OSD and submit them
+{
+    pg_t & pg = pgs[pg_num];
+    pg_flush_batch_t *fb = new pg_flush_batch_t(); // Deleted by handle_flush_op() when the whole batch has completed
+    pg.flush_batch = fb;
+    auto it = pg.flush_actions.begin(), prev_it = pg.flush_actions.begin();
+    bool first = true;
+    while (it != pg.flush_actions.end())
+    {
+        if (!first && (it->first.oid.inode != prev_it->first.oid.inode ||
+            (it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK)) &&
+            (fb->rollback_lists[it->first.osd_num].size() >= FLUSH_BATCH ||
+            fb->stable_lists[it->first.osd_num].size() >= FLUSH_BATCH))
+        {
+            // Stop only at the object boundary, and only once one of this OSD's lists has reached FLUSH_BATCH entries
+            break;
+        }
+        it->second.submitted = true; // handle_flush_op() erases the actions marked this way
+        if (it->second.rollback)
+        {
+            fb->flush_objects++;
+            fb->rollback_lists[it->first.osd_num].push_back((obj_ver_id){
+                .oid = it->first.oid,
+                .version = it->second.rollback_to,
+            });
+        }
+        if (it->second.make_stable)
+        {
+            fb->flush_objects++;
+            fb->stable_lists[it->first.osd_num].push_back((obj_ver_id){
+                .oid = it->first.oid,
+                .version = it->second.stable_to,
+            });
+        }
+        prev_it = it;
+        first = false;
+        it++;
+    }
+    for (auto & l: fb->rollback_lists) // One rollback op per OSD that has anything to roll back
+    {
+        if (l.second.size() > 0)
+        {
+            fb->flush_ops++;
+            submit_flush_op(pg.pg_num, fb, true, l.first, l.second.size(), l.second.data());
+        }
+    }
+    for (auto & l: fb->stable_lists) // One stabilize op per OSD that has anything to stabilize
+    {
+        if (l.second.size() > 0)
+        {
+            fb->flush_ops++;
+            submit_flush_op(pg.pg_num, fb, false, l.first, l.second.size(), l.second.data());
+        }
+    }
+}
+
+void osd_t::handle_flush_op(bool rollback, pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t peer_osd, int retval) // Completion handler for one rollback/stabilize op of a flush batch
+{
+    if (pgs.find(pg_num) == pgs.end() || pgs[pg_num].flush_batch != fb)
+    {
+        // Throw the result away: the PG is gone or this batch is no longer its current one
+        return;
+    }
+    if (retval != 0)
+    {
+        if (peer_osd == this->osd_num)
+        {
+            throw std::runtime_error( // A failed local operation is treated as fatal
+                std::string(rollback
+                    ? "Error while doing local rollback operation: "
+                    : "Error while doing local stabilize operation: "
+                ) + strerror(-retval)
+            );
+        }
+        else
+        {
+            printf("Error while doing flush on OSD %lu: %s\n", peer_osd, strerror(-retval)); // Name the failing peer, not ourselves
+            assert(c_cli.osd_peer_fds.find(peer_osd) != c_cli.osd_peer_fds.end());
+            c_cli.stop_client(c_cli.osd_peer_fds[peer_osd]);
+            return; // The batch is not counted as done for this op
+        }
+    }
+    fb->flush_done++;
+    if (fb->flush_done == fb->flush_ops)
+    {
+        // This flush batch is done
+        std::vector<osd_op_t*> continue_ops; // Queued writes to resume after the bookkeeping below
+        auto & pg = pgs[pg_num];
+        auto it = pg.flush_actions.begin(), prev_it = it;
+        auto erase_start = it; // Start of the current run of submitted actions
+        while (1)
+        {
+            if (it == pg.flush_actions.end() ||
+                it->first.oid.inode != prev_it->first.oid.inode ||
+                (it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK))
+            { // Object boundary (or end): finish up the previous object. NOTE(review): also runs for objects not submitted in this batch - confirm intended
+                pg.ver_override.erase((object_id){
+                    .inode = prev_it->first.oid.inode,
+                    .stripe = (prev_it->first.oid.stripe & ~STRIPE_MASK),
+                });
+                auto wr_it = pg.write_queue.find((object_id){
+                    .inode = prev_it->first.oid.inode,
+                    .stripe = (prev_it->first.oid.stripe & ~STRIPE_MASK),
+                });
+                if (wr_it != pg.write_queue.end())
+                {
+                    continue_ops.push_back(wr_it->second);
+                    pg.write_queue.erase(wr_it);
+                }
+            }
+            if ((it == pg.flush_actions.end() || !it->second.submitted) &&
+                erase_start != it)
+            {
+                pg.flush_actions.erase(erase_start, it); // Drop the run of submitted actions that ends just before it
+            }
+            if (it == pg.flush_actions.end())
+            {
+                break;
+            }
+            prev_it = it;
+            if (!it->second.submitted)
+            {
+                it++;
+                erase_start = it; // Unsubmitted action stays; the next run can only start after it
+            }
+            else
+            {
+                it++;
+            }
+        }
+        delete fb;
+        pg.flush_batch = NULL;
+        if (!pg.flush_actions.size())
+        {
+            pg.state = pg.state & ~PG_HAS_UNCLEAN; // Nothing left to flush
+            report_pg_state(pg);
+        }
+        for (osd_op_t *op: continue_ops)
+        {
+            continue_primary_write(op);
+        }
+        if (pg.inflight == 0 && (pg.state & PG_STOPPING))
+        {
+            finish_stop_pg(pg);
+        }
+    }
+}
+
+void osd_t::submit_flush_op(pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data) // Send one rollback (rollback=true) or stabilize op with count object versions to peer_osd, which may be this OSD
+{
+    osd_op_t *op = new osd_op_t();
+    // Copy buffer so it gets freed along with the operation
+    op->buf = malloc(sizeof(obj_ver_id) * count);
+    memcpy(op->buf, data, sizeof(obj_ver_id) * count);
+    if (peer_osd == this->osd_num)
+    {
+        // local: run the operation on our own blockstore
+        clock_gettime(CLOCK_REALTIME, &op->tv_begin);
+        op->bs_op = new blockstore_op_t({
+            .opcode = (uint64_t)(rollback ? BS_OP_ROLLBACK : BS_OP_STABLE),
+            .callback = [this, op, pg_num, fb](blockstore_op_t *bs_op)
+            {
+                add_bs_subop_stats(op);
+                handle_flush_op(bs_op->opcode == BS_OP_ROLLBACK, pg_num, fb, this->osd_num, bs_op->retval);
+                delete op->bs_op; // The callback owns both the blockstore op and its wrapper
+                op->bs_op = NULL;
+                delete op;
+            },
+            .len = (uint32_t)count,
+            .buf = op->buf,
+        });
+        bs->enqueue_op(op->bs_op);
+    }
+    else
+    {
+        // Peer: send the request header followed by the object version list
+        int peer_fd = c_cli.osd_peer_fds[peer_osd]; // NOTE(review): assumes peer_osd is currently connected - confirm callers guarantee it
+        op->op_type = OSD_OP_OUT;
+        op->send_list.push_back(op->req.buf, OSD_PACKET_SIZE); // op->req itself is filled in just below; presumably only the pointer is stored here
+        op->send_list.push_back(op->buf, count * sizeof(obj_ver_id));
+        op->peer_fd = peer_fd;
+        op->req = {
+            .sec_stab = {
+                .header = {
+                    .magic = SECONDARY_OSD_OP_MAGIC,
+                    .id = c_cli.next_subop_id++,
+                    .opcode = (uint64_t)(rollback ? OSD_OP_SECONDARY_ROLLBACK : OSD_OP_SECONDARY_STABILIZE),
+                },
+                .len = count * sizeof(obj_ver_id), // Payload length in bytes
+            },
+        };
+        op->callback = [this, pg_num, fb, peer_osd](osd_op_t *op)
+        {
+            handle_flush_op(op->req.hdr.opcode == OSD_OP_SECONDARY_ROLLBACK, pg_num, fb, peer_osd, op->reply.hdr.retval);
+            delete op;
+        };
+        c_cli.outbox_push(op);
+    }
+}
+
+bool osd_t::pick_next_recovery(osd_recovery_op_t &op) // Choose the next object to recover: fills op and returns true, or returns false when none is left
+{
+    // First pass: degraded objects of active PGs
+    const auto degraded_mask = PG_ACTIVE | PG_HAS_DEGRADED;
+    for (auto & pg_pair: pgs)
+    {
+        if ((pg_pair.second.state & degraded_mask) != degraded_mask)
+            continue;
+        for (auto & obj_pair: pg_pair.second.degraded_objects)
+        {
+            if (recovery_ops.find(obj_pair.first) != recovery_ops.end())
+                continue; // Already being recovered
+            op.degraded = true;
+            op.pg_num = pg_pair.first;
+            op.oid = obj_pair.first;
+            return true;
+        }
+    }
+    // Second pass: misplaced objects of active PGs
+    const auto misplaced_mask = PG_ACTIVE | PG_HAS_MISPLACED;
+    for (auto & pg_pair: pgs)
+    {
+        if ((pg_pair.second.state & misplaced_mask) != misplaced_mask)
+            continue;
+        for (auto & obj_pair: pg_pair.second.misplaced_objects)
+        {
+            if (recovery_ops.find(obj_pair.first) != recovery_ops.end())
+                continue; // Already being recovered
+            op.degraded = false;
+            op.pg_num = pg_pair.first;
+            op.oid = obj_pair.first;
+            return true;
+        }
+    }
+    return false;
+}
+
+void osd_t::submit_recovery_op(osd_recovery_op_t *op) // Recover one object by executing a zero-length write request for it
+{
+    op->osd_op = new osd_op_t();
+    op->osd_op->op_type = OSD_OP_OUT;
+    op->osd_op->req = {
+        .rw = {
+            .header = {
+                .magic = SECONDARY_OSD_OP_MAGIC,
+                .id = 1,
+                .opcode = OSD_OP_WRITE,
+            },
+            .inode = op->oid.inode,
+            .offset = op->oid.stripe,
+            .len = 0, // No payload is sent with this write
+        },
+    };
+    op->osd_op->callback = [this, op](osd_op_t *osd_op)
+    {
+        // Don't sync the write, it will be synced by our regular sync coroutine
+        if (osd_op->reply.hdr.retval < 0)
+        {
+            // Error recovering object
+            if (osd_op->reply.hdr.retval == -EPIPE)
+            {
+                // PG is stopped or one of the OSDs is gone, error is harmless
+            }
+            else
+            {
+                throw std::runtime_error("Failed to recover an object");
+            }
+        }
+        op->osd_op = NULL; // Must come before the erase: op is an element of recovery_ops and must not be accessed afterwards
+        recovery_ops.erase(op->oid);
+        delete osd_op;
+        continue_recovery();
+    };
+    exec_op(op->osd_op);
+}
+
+// Fill the recovery queue with write requests for objects chosen by pick_next_recovery(); they get recovered during writing
+bool osd_t::continue_recovery()
+{
+    for (;;)
+    {
+        if (recovery_ops.size() >= recovery_queue_depth)
+            return true; // Queue is full
+        osd_recovery_op_t next;
+        if (!pick_next_recovery(next))
+            return false; // Nothing more to pick
+        // Store the op first, then submit a pointer to the stored copy
+        osd_recovery_op_t & stored = recovery_ops[next.oid];
+        stored = next;
+        submit_recovery_op(&stored);
+    }
+}
--- a/osd_main.cpp
+++ b/osd_main.cpp
@@ -2,8 +2,17 @@

 #include <signal.h>

-void handle_sigint(int sig)
+static osd_t *osd = NULL;
+static bool force_stopping = false;
+
+static void handle_sigint(int sig)
 {
+    if (osd && !force_stopping)
+    {
+        force_stopping = true;
+        osd->force_stop(0);
+        return;
+    }
    exit(0);
 }

@@ -25,9 +34,11 @@ int main(int narg, char *args[])
        }
    }
    signal(SIGINT, handle_sigint);
+    signal(SIGTERM, handle_sigint);
    ring_loop_t *ringloop = new ring_loop_t(512);
+    // FIXME: Create Blockstore from on-disk superblock config and check it against the OSD cluster config
    blockstore_t *bs = new blockstore_t(config, ringloop);
-    osd_t *osd = new osd_t(config, bs, ringloop);
+    osd = new osd_t(config, bs, ringloop);
    while (1)
    {
        ringloop->loop();
--- a/osd_ops.h
+++ b/osd_ops.h
@@ -22,9 +22,12 @@
 #define OSD_OP_READ                 10
 #define OSD_OP_WRITE                11
 #define OSD_OP_SYNC                 12
-#define OSD_OP_MAX                  12
+#define OSD_OP_DELETE               13
+#define OSD_OP_MAX                  13
 // Alignment & limit for read/write operations
-#define OSD_RW_ALIGN                512
+#ifndef MEM_ALIGNMENT
+#define MEM_ALIGNMENT               512
+#endif
 #define OSD_RW_MAX                  64*1024*1024

 // common request and reply headers
@@ -57,6 +60,7 @@ struct __attribute__((__packed__)) osd_op_secondary_rw_t
    // object
    object_id oid;
    // read/write version (automatic or specific)
+    // FIXME deny values close to UINT64_MAX
    uint64_t version;
    // offset
    uint32_t offset;
@@ -130,7 +134,7 @@ struct __attribute__((__packed__)) osd_op_secondary_list_t
    osd_op_header_t header;
    // placement group total number and total count
    pg_num_t list_pg, pg_count;
-    uint64_t parity_block_size;
+    uint64_t pg_stripe_size;
 };

 struct __attribute__((__packed__)) osd_reply_secondary_list_t
@@ -142,6 +146,7 @@ struct __attribute__((__packed__)) osd_reply_secondary_list_t
 };

 // read or write to the primary OSD (must be within individual stripe)
+// FIXME: allow to return used block bitmap (required for snapshots)
 struct __attribute__((__packed__)) osd_op_rw_t
 {
    osd_op_header_t header;
@@ -169,6 +174,7 @@ struct __attribute__((__packed__)) osd_reply_sync_t
    osd_reply_header_t header;
 };

+// FIXME it would be interesting to try to unify blockstore_op and osd_op formats
 union osd_any_op_t
 {
    osd_op_header_t hdr;
--- a/osd_peering.cpp
+++ b/osd_peering.cpp
@@ -3,286 +3,213 @@

 #include <algorithm>

+#include "base64.h"
 #include "osd.h"

-void osd_t::init_primary()
-{
-    // Initial test version of clustering code requires exactly 2 peers
-    // FIXME Hardcode
-    std::string peerstr = config["peers"];
-    while (peerstr.size())
-    {
-        int pos = peerstr.find(',');
-        peers.push_back(parse_peer(pos < 0 ? peerstr : peerstr.substr(0, pos)));
-        peerstr = pos < 0 ? std::string("") : peerstr.substr(pos+1);
-        for (int i = 0; i < peers.size()-1; i++)
-            if (peers[i].osd_num == peers[peers.size()-1].osd_num)
-                throw std::runtime_error("same osd number "+std::to_string(peers[i].osd_num)+" specified twice in peers");
-    }
-    if (peers.size() < 2)
-        throw std::runtime_error("run_primary requires at least 2 peers");
-    pgs.push_back((pg_t){
-        .state = PG_OFFLINE,
-        .pg_cursize = 0,
-        .pg_num = 1,
-        .target_set = { 1, 2, 3 },
-        .cur_set = { 1, 0, 0 },
-    });
-    pg_count = 1;
-    peering_state = OSD_PEERING_PEERS;
-}
-
-osd_peer_def_t osd_t::parse_peer(std::string peer)
-{
-    // OSD_NUM:IP:PORT
-    int pos1 = peer.find(':');
-    int pos2 = peer.find(':', pos1+1);
-    if (pos1 < 0 || pos2 < 0)
-        throw new std::runtime_error("OSD peer string must be in the form OSD_NUM:IP:PORT");
-    osd_peer_def_t r;
-    r.addr = peer.substr(pos1+1, pos2-pos1-1);
-    std::string osd_num_str = peer.substr(0, pos1);
-    std::string port_str = peer.substr(pos2+1);
-    r.osd_num = strtoull(osd_num_str.c_str(), NULL, 10);
-    if (!r.osd_num)
-        throw new std::runtime_error("Could not parse OSD peer osd_num");
-    r.port = strtoull(port_str.c_str(), NULL, 10);
-    if (!r.port)
-        throw new std::runtime_error("Could not parse OSD peer port");
-    return r;
-}
-
-void osd_t::connect_peer(osd_num_t osd_num, const char *peer_host, int peer_port, std::function<void(osd_num_t, int)> callback)
-{
-    struct sockaddr_in addr;
-    int r;
-    if ((r = inet_pton(AF_INET, peer_host, &addr.sin_addr)) != 1)
-    {
-        callback(osd_num, -EINVAL);
-        return;
-    }
-    addr.sin_family = AF_INET;
-    addr.sin_port = htons(peer_port ? peer_port : 11203);
-    int peer_fd = socket(AF_INET, SOCK_STREAM, 0);
-    if (peer_fd < 0)
-    {
-        callback(osd_num, -errno);
-        return;
-    }
-    fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
-    r = connect(peer_fd, (sockaddr*)&addr, sizeof(addr));
-    if (r < 0 && errno != EINPROGRESS)
-    {
-        close(peer_fd);
-        callback(osd_num, -errno);
-        return;
-    }
-    clients[peer_fd] = (osd_client_t){
-        .peer_addr = addr,
-        .peer_port = peer_port,
-        .peer_fd = peer_fd,
-        .peer_state = PEER_CONNECTING,
-        .connect_callback = callback,
-        .osd_num = osd_num,
-    };
-    osd_peer_fds[osd_num] = peer_fd;
-    // Add FD to epoll (EPOLLOUT for tracking connect() result)
-    epoll_event ev;
-    ev.data.fd = peer_fd;
-    ev.events = EPOLLOUT | EPOLLIN | EPOLLRDHUP | EPOLLET;
-    if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, peer_fd, &ev) < 0)
-    {
-        throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
-    }
-}
-
-void osd_t::handle_connect_result(int peer_fd)
-{
-    auto & cl = clients[peer_fd];
-    osd_num_t osd_num = cl.osd_num;
-    auto callback = cl.connect_callback;
-    int result = 0;
-    socklen_t result_len = sizeof(result);
-    if (getsockopt(peer_fd, SOL_SOCKET, SO_ERROR, &result, &result_len) < 0)
-    {
-        result = errno;
-    }
-    if (result != 0)
-    {
-        stop_client(peer_fd);
-        callback(osd_num, -result);
-        return;
-    }
-    int one = 1;
-    setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
-    // Disable EPOLLOUT on this fd
-    cl.connect_callback = NULL;
-    cl.peer_state = PEER_CONNECTED;
-    epoll_event ev;
-    ev.data.fd = peer_fd;
-    ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET;
-    if (epoll_ctl(epoll_fd, EPOLL_CTL_MOD, peer_fd, &ev) < 0)
-    {
-        throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
-    }
-    callback(osd_num, peer_fd);
-}
-
 // Peering loop
 void osd_t::handle_peers()
 {
-    if (peering_state & OSD_PEERING_PEERS)
-    {
-        for (int i = 0; i < peers.size(); i++)
-        {
-            if (osd_peer_fds.find(peers[i].osd_num) == osd_peer_fds.end() &&
-                time(NULL) - peers[i].last_connect_attempt > 5) // FIXME hardcode 5
-            {
-                peers[i].last_connect_attempt = time(NULL);
-                connect_peer(peers[i].osd_num, peers[i].addr.c_str(), peers[i].port, [this](osd_num_t osd_num, int peer_fd)
-                {
-                    // FIXME: Check peer config after connecting
-                    if (peer_fd < 0)
-                    {
-                        printf("Failed to connect to peer OSD %lu: %s\n", osd_num, strerror(-peer_fd));
-                        return;
-                    }
-                    printf("Connected with peer OSD %lu (fd %d)\n", clients[peer_fd].osd_num, peer_fd);
-                    int i;
-                    for (i = 0; i < peers.size(); i++)
-                    {
-                        if (osd_peer_fds.find(peers[i].osd_num) == osd_peer_fds.end())
-                            break;
-                    }
-                    if (i >= peers.size())
-                    {
-                        // Connected to all peers
-                        peering_state = peering_state & ~OSD_PEERING_PEERS;
-                    }
-                    repeer_pgs(osd_num, true);
-                });
-            }
-        }
-    }
    if (peering_state & OSD_PEERING_PGS)
    {
-        bool still_doing_pgs = false;
-        for (int i = 0; i < pgs.size(); i++)
+        bool still = false;
+        for (auto & p: pgs)
        {
-            if (pgs[i].state == PG_PEERING)
+            if (p.second.state == PG_PEERING)
            {
-                if (!pgs[i].peering_state->list_ops.size())
+                if (!p.second.peering_state->list_ops.size())
                {
-                    pgs[i].calc_object_states();
+                    p.second.calc_object_states(log_level);
+                    report_pg_state(p.second);
+                    incomplete_objects += p.second.incomplete_objects.size();
+                    misplaced_objects += p.second.misplaced_objects.size();
+                    // FIXME: degraded objects may currently include misplaced, too! Report them separately?
+                    degraded_objects += p.second.degraded_objects.size();
+                    if ((p.second.state & (PG_ACTIVE | PG_HAS_UNCLEAN)) == (PG_ACTIVE | PG_HAS_UNCLEAN))
+                        peering_state = peering_state | OSD_FLUSHING_PGS;
+                    else
+                        peering_state = peering_state | OSD_RECOVERING;
                }
                else
                {
-                    still_doing_pgs = true;
+                    still = true;
                }
            }
        }
-        if (!still_doing_pgs)
+        if (!still)
        {
            // Done all PGs
            peering_state = peering_state & ~OSD_PEERING_PGS;
        }
    }
-}
-
-void osd_t::repeer_pgs(osd_num_t osd_num, bool is_connected)
-{
-    // Re-peer affected PGs
-    // FIXME: We shouldn't rely just on target_set. Other OSDs may also contain PG data.
-    osd_num_t real_osd = (is_connected ? osd_num : 0);
-    for (int i = 0; i < pgs.size(); i++)
+    if ((peering_state & OSD_FLUSHING_PGS) && !readonly)
    {
-        bool repeer = false;
-        for (int r = 0; r < pgs[i].target_set.size(); r++)
+        bool still = false;
+        for (auto & p: pgs)
        {
-            if (pgs[i].target_set[r] == osd_num &&
-                pgs[i].cur_set[r] != real_osd)
+            if ((p.second.state & (PG_ACTIVE | PG_HAS_UNCLEAN)) == (PG_ACTIVE | PG_HAS_UNCLEAN))
            {
-                pgs[i].cur_set[r] = real_osd;
-                repeer = true;
-                break;
+                if (!p.second.flush_batch)
+                {
+                    submit_pg_flush_ops(p.first);
+                }
+                still = true;
            }
        }
-        if (repeer)
+        if (!still)
        {
-            // Repeer this pg
-            printf("Repeer PG %d because of OSD %lu\n", i, osd_num);
-            start_pg_peering(i);
-            peering_state |= OSD_PEERING_PGS;
+            peering_state = peering_state & ~OSD_FLUSHING_PGS | OSD_RECOVERING;
+        }
+    }
+    if ((peering_state & OSD_RECOVERING) && !readonly)
+    {
+        if (!continue_recovery())
+        {
+            peering_state = peering_state & ~OSD_RECOVERING;
+        }
+    }
+}
+
+void osd_t::repeer_pgs(osd_num_t peer_osd) // Restart peering of every PG that lists peer_osd among its all_peers
+{
+    // Re-peer affected PGs
+    for (auto & p: pgs)
+    {
+        bool repeer = false;
+        if (p.second.state & (PG_PEERING | PG_ACTIVE | PG_INCOMPLETE)) // PGs with none of these state bits set are left alone
+        {
+            for (osd_num_t pg_osd: p.second.all_peers)
+            {
+                if (pg_osd == peer_osd)
+                {
+                    repeer = true;
+                    break;
+                }
+            }
+            if (repeer)
+            {
+                // Repeer this PG: peer_osd is one of its peers
+                printf("[PG %u] Repeer because of OSD %lu\n", p.second.pg_num, peer_osd);
+                start_pg_peering(p.second.pg_num);
+            }
         }
     }
 }

 // Repeer on each connect/disconnect peer event
-void osd_t::start_pg_peering(int pg_idx)
+void osd_t::start_pg_peering(pg_num_t pg_num)
 {
-    auto & pg = pgs[pg_idx];
+    auto & pg = pgs[pg_num];
    pg.state = PG_PEERING;
+    this->peering_state |= OSD_PEERING_PGS;
+    report_pg_state(pg);
+    // Reset PG state
+    pg.cur_peers.clear();
    pg.state_dict.clear();
-    pg.obj_states.clear();
+    incomplete_objects -= pg.incomplete_objects.size();
+    misplaced_objects -= pg.misplaced_objects.size();
+    degraded_objects -= pg.degraded_objects.size();
+    pg.incomplete_objects.clear();
+    pg.misplaced_objects.clear();
+    pg.degraded_objects.clear();
+    pg.flush_actions.clear();
    pg.ver_override.clear();
-    pg.pg_cursize = 0;
-    for (int role = 0; role < pg.cur_set.size(); role++)
+    if (pg.flush_batch)
    {
+        delete pg.flush_batch;
+    }
+    pg.flush_batch = NULL;
+    for (auto p: pg.write_queue)
+    {
+        finish_op(p.second, -EPIPE);
+    }
+    pg.write_queue.clear();
+    for (auto it = unstable_writes.begin(); it != unstable_writes.end(); )
+    {
+        // Forget this PG's unstable writes
+        pg_num_t n = (it->first.oid.inode + it->first.oid.stripe / pg_stripe_size) % pg_count + 1;
+        if (n == pg.pg_num)
+            unstable_writes.erase(it++);
+        else
+            it++;
+    }
+    pg.inflight = 0;
+    dirty_pgs.erase(pg.pg_num);
+    // Calculate current write OSD set
+    pg.pg_cursize = 0;
+    pg.cur_set.resize(pg.target_set.size());
+    pg.cur_loc_set.clear();
+    for (int role = 0; role < pg.target_set.size(); role++)
+    {
+        pg.cur_set[role] = pg.target_set[role] == this->osd_num ||
+            c_cli.osd_peer_fds.find(pg.target_set[role]) != c_cli.osd_peer_fds.end() ? pg.target_set[role] : 0;
        if (pg.cur_set[role] != 0)
        {
            pg.pg_cursize++;
+            pg.cur_loc_set.push_back({
+                .role = (uint64_t)role,
+                .osd_num = pg.cur_set[role],
+                .outdated = false,
+            });
+        }
+    }
+    if (pg.target_history.size())
+    {
+        // Refuse to start PG if no peers are available from any of the historical OSD sets
+        // (PG history is kept up to the latest active+clean state)
+        for (auto & history_set: pg.target_history)
+        {
+            bool found = false;
+            for (auto history_osd: history_set)
+            {
+                if (history_osd != 0 && c_cli.osd_peer_fds.find(history_osd) != c_cli.osd_peer_fds.end())
+                {
+                    found = true;
+                    break;
+                }
+            }
+            if (!found)
+            {
+                pg.state = PG_INCOMPLETE;
+                report_pg_state(pg);
+            }
        }
    }
    if (pg.pg_cursize < pg.pg_minsize)
    {
        pg.state = PG_INCOMPLETE;
+        report_pg_state(pg);
    }
+    std::set<osd_num_t> cur_peers;
+    for (auto pg_osd: pg.all_peers)
+    {
+        if (pg_osd == this->osd_num || c_cli.osd_peer_fds.find(pg_osd) != c_cli.osd_peer_fds.end())
+        {
+            cur_peers.insert(pg_osd);
+        }
+        else if (c_cli.wanted_peers.find(pg_osd) == c_cli.wanted_peers.end())
+        {
+            c_cli.connect_peer(pg_osd, st_cli.peer_states[pg_osd]["addresses"], st_cli.peer_states[pg_osd]["port"].int64_value());
+        }
+    }
+    pg.cur_peers.insert(pg.cur_peers.begin(), cur_peers.begin(), cur_peers.end());
    if (pg.peering_state)
    {
-        // Adjust the peering operation that's still in progress
-        for (auto it = pg.peering_state->list_ops.begin(); it != pg.peering_state->list_ops.end(); it++)
+        // Adjust the peering operation that's still in progress - discard unneeded results
+        for (auto it = pg.peering_state->list_ops.begin(); it != pg.peering_state->list_ops.end();)
        {
-            int role;
-            for (role = 0; role < pg.cur_set.size(); role++)
-            {
-                if (pg.cur_set[role] == it->first)
-                    break;
-            }
-            if (pg.state == PG_INCOMPLETE || role >= pg.cur_set.size())
+            if (pg.state == PG_INCOMPLETE || cur_peers.find(it->first) == cur_peers.end())
            {
                // Discard the result after completion, which, chances are, will be unsuccessful
-                auto list_op = it->second;
-                if (list_op->peer_fd == 0)
-                {
-                    // Self
-                    list_op->bs_op->callback = [list_op](blockstore_op_t *bs_op)
-                    {
-                        if (list_op->bs_op->buf)
-                            free(list_op->bs_op->buf);
-                        delete list_op;
-                    };
-                }
-                else
-                {
-                    // Peer
-                    list_op->callback = [](osd_op_t *list_op)
-                    {
-                        delete list_op;
-                    };
-                }
+                discard_list_subop(it->second);
                pg.peering_state->list_ops.erase(it);
                it = pg.peering_state->list_ops.begin();
            }
+            else
+                it++;
        }
-        for (auto it = pg.peering_state->list_results.begin(); it != pg.peering_state->list_results.end(); it++)
+        for (auto it = pg.peering_state->list_results.begin(); it != pg.peering_state->list_results.end();)
        {
-            int role;
-            for (role = 0; role < pg.cur_set.size(); role++)
-            {
-                if (pg.cur_set[role] == it->first)
-                    break;
-            }
-            if (pg.state == PG_INCOMPLETE || role >= pg.cur_set.size())
+            if (pg.state == PG_INCOMPLETE || cur_peers.find(it->first) == cur_peers.end())
            {
                if (it->second.buf)
                {
@@ -291,6 +218,8 @@ void osd_t::start_pg_peering(int pg_idx)
                pg.peering_state->list_results.erase(it);
                it = pg.peering_state->list_results.begin();
            }
+            else
+                it++;
        }
    }
    if (pg.state == PG_INCOMPLETE)
@@ -300,107 +229,300 @@ void osd_t::start_pg_peering(int pg_idx)
            delete pg.peering_state;
            pg.peering_state = NULL;
        }
-        printf("PG %d is incomplete\n", pg.pg_num);
        return;
    }
    if (!pg.peering_state)
    {
        pg.peering_state = new pg_peering_state_t();
+        pg.peering_state->pg_num = pg.pg_num;
    }
-    auto ps = pg.peering_state;
-    for (int role = 0; role < pg.cur_set.size(); role++)
+    for (osd_num_t peer_osd: cur_peers)
    {
-        osd_num_t role_osd = pg.cur_set[role];
-        if (!role_osd)
+        if (pg.peering_state->list_ops.find(peer_osd) != pg.peering_state->list_ops.end() ||
+            pg.peering_state->list_results.find(peer_osd) != pg.peering_state->list_results.end())
        {
            continue;
        }
-        if (ps->list_ops.find(role_osd) != ps->list_ops.end() ||
-            ps->list_results.find(role_osd) != ps->list_results.end())
-        {
-            continue;
-        }
-        if (role_osd == this->osd_num)
-        {
-            // Self
-            osd_op_t *op = new osd_op_t();
-            op->op_type = 0;
-            op->peer_fd = 0;
-            op->bs_op = new blockstore_op_t();
-            op->bs_op->opcode = BS_OP_LIST;
-            op->bs_op->oid.stripe = parity_block_size;
-            op->bs_op->len = pg_count,
-            op->bs_op->offset = pg.pg_num-1,
-            op->bs_op->callback = [ps, op, role_osd](blockstore_op_t *bs_op)
-            {
-                if (op->bs_op->retval < 0)
-                {
-                    throw std::runtime_error("local OP_LIST failed");
-                }
-                printf(
-                    "Got object list from OSD %lu (local): %d object versions (%lu of them stable)\n",
-                    role_osd, bs_op->retval, bs_op->version
-                );
-                ps->list_results[role_osd] = {
-                    .buf = (obj_ver_id*)op->bs_op->buf,
-                    .total_count = (uint64_t)op->bs_op->retval,
-                    .stable_count = op->bs_op->version,
-                };
-                ps->list_done++;
-                ps->list_ops.erase(role_osd);
-                delete op;
-            };
-            bs->enqueue_op(op->bs_op);
-            ps->list_ops[role_osd] = op;
-        }
-        else
-        {
-            // Peer
-            auto & cl = clients[osd_peer_fds[role_osd]];
-            osd_op_t *op = new osd_op_t();
-            op->op_type = OSD_OP_OUT;
-            op->send_list.push_back(op->req.buf, OSD_PACKET_SIZE);
-            op->peer_fd = cl.peer_fd;
-            op->req = {
-                .sec_list = {
-                    .header = {
-                        .magic = SECONDARY_OSD_OP_MAGIC,
-                        .id = this->next_subop_id++,
-                        .opcode = OSD_OP_SECONDARY_LIST,
-                    },
-                    .list_pg = pg.pg_num,
-                    .pg_count = pg_count,
-                    .parity_block_size = parity_block_size,
-                },
-            };
-            op->callback = [this, ps, role_osd](osd_op_t *op)
-            {
-                if (op->reply.hdr.retval < 0)
-                {
-                    printf("Failed to get object list from OSD %lu (retval=%ld), disconnecting peer\n", role_osd, op->reply.hdr.retval);
-                    ps->list_ops.erase(role_osd);
-                    stop_client(op->peer_fd);
-                    delete op;
-                    return;
-                }
-                printf(
-                    "Got object list from OSD %lu: %ld object versions (%lu of them stable)\n",
-                    role_osd, op->reply.hdr.retval, op->reply.sec_list.stable_count
-                );
-                ps->list_results[role_osd] = {
-                    .buf = (obj_ver_id*)op->buf,
-                    .total_count = (uint64_t)op->reply.hdr.retval,
-                    .stable_count = op->reply.sec_list.stable_count,
-                };
-                // set op->buf to NULL so it doesn't get freed
-                op->buf = NULL;
-                ps->list_done++;
-                ps->list_ops.erase(role_osd);
-                delete op;
-            };
-            outbox_push(cl, op);
-            ps->list_ops[role_osd] = op;
-        }
+        submit_sync_and_list_subop(peer_osd, pg.peering_state);
    }
    ringloop->wakeup();
 }
+
+void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
+{
+    // Sync before listing, if not readonly
+    if (readonly)
+    {
+        submit_list_subop(role_osd, ps);
+    }
+    else if (role_osd == this->osd_num)
+    {
+        // Self
+        osd_op_t *op = new osd_op_t();
+        op->op_type = 0;
+        op->peer_fd = 0;
+        clock_gettime(CLOCK_REALTIME, &op->tv_begin);
+        op->bs_op = new blockstore_op_t();
+        op->bs_op->opcode = BS_OP_SYNC;
+        op->bs_op->callback = [this, ps, op, role_osd](blockstore_op_t *bs_op)
+        {
+            if (bs_op->retval < 0)
+            {
+                printf("Local OP_SYNC failed: %d (%s)\n", bs_op->retval, strerror(-bs_op->retval));
+                force_stop(1);
+                return;
+            }
+            add_bs_subop_stats(op);
+            delete op->bs_op;
+            op->bs_op = NULL;
+            delete op;
+            ps->list_ops.erase(role_osd);
+            submit_list_subop(role_osd, ps);
+        };
+        bs->enqueue_op(op->bs_op);
+        ps->list_ops[role_osd] = op;
+    }
+    else
+    {
+        // Peer
+        auto & cl = c_cli.clients.at(c_cli.osd_peer_fds[role_osd]);
+        osd_op_t *op = new osd_op_t();
+        op->op_type = OSD_OP_OUT;
+        op->send_list.push_back(op->req.buf, OSD_PACKET_SIZE);
+        op->peer_fd = cl.peer_fd;
+        op->req = {
+            .sec_sync = {
+                .header = {
+                    .magic = SECONDARY_OSD_OP_MAGIC,
+                    .id = c_cli.next_subop_id++,
+                    .opcode = OSD_OP_SECONDARY_SYNC,
+                },
+            },
+        };
+        op->callback = [this, ps, role_osd](osd_op_t *op)
+        {
+            if (op->reply.hdr.retval < 0)
+            {
+                // FIXME: Mark peer as failed and don't reconnect immediately after dropping the connection
+                printf("Failed to sync OSD %lu: %ld (%s), disconnecting peer\n", role_osd, op->reply.hdr.retval, strerror(-op->reply.hdr.retval));
+                ps->list_ops.erase(role_osd);
+                c_cli.stop_client(op->peer_fd);
+                delete op;
+                return;
+            }
+            delete op;
+            ps->list_ops.erase(role_osd);
+            submit_list_subop(role_osd, ps);
+        };
+        c_cli.outbox_push(op);
+        ps->list_ops[role_osd] = op;
+    }
+}
+
+void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
+{
+    if (role_osd == this->osd_num)
+    {
+        // Self
+        osd_op_t *op = new osd_op_t();
+        op->op_type = 0;
+        op->peer_fd = 0;
+        clock_gettime(CLOCK_REALTIME, &op->tv_begin);
+        op->bs_op = new blockstore_op_t();
+        op->bs_op->opcode = BS_OP_LIST;
+        op->bs_op->oid.stripe = pg_stripe_size;
+        op->bs_op->len = pg_count;
+        op->bs_op->offset = ps->pg_num-1;
+        op->bs_op->callback = [this, ps, op, role_osd](blockstore_op_t *bs_op)
+        {
+            if (op->bs_op->retval < 0)
+            {
+                throw std::runtime_error("local OP_LIST failed");
+            }
+            add_bs_subop_stats(op);
+            printf(
+                "[PG %u] Got object list from OSD %lu (local): %d object versions (%lu of them stable)\n",
+                ps->pg_num, role_osd, bs_op->retval, bs_op->version
+            );
+            ps->list_results[role_osd] = {
+                .buf = (obj_ver_id*)op->bs_op->buf,
+                .total_count = (uint64_t)op->bs_op->retval,
+                .stable_count = op->bs_op->version,
+            };
+            ps->list_ops.erase(role_osd);
+            delete op->bs_op;
+            op->bs_op = NULL;
+            delete op;
+        };
+        bs->enqueue_op(op->bs_op);
+        ps->list_ops[role_osd] = op;
+    }
+    else
+    {
+        // Peer
+        osd_op_t *op = new osd_op_t();
+        op->op_type = OSD_OP_OUT;
+        op->send_list.push_back(op->req.buf, OSD_PACKET_SIZE);
+        op->peer_fd = c_cli.osd_peer_fds[role_osd];
+        op->req = {
+            .sec_list = {
+                .header = {
+                    .magic = SECONDARY_OSD_OP_MAGIC,
+                    .id = c_cli.next_subop_id++,
+                    .opcode = OSD_OP_SECONDARY_LIST,
+                },
+                .list_pg = ps->pg_num,
+                .pg_count = pg_count,
+                .pg_stripe_size = pg_stripe_size,
+            },
+        };
+        op->callback = [this, ps, role_osd](osd_op_t *op)
+        {
+            if (op->reply.hdr.retval < 0)
+            {
+                printf("Failed to get object list from OSD %lu (retval=%ld), disconnecting peer\n", role_osd, op->reply.hdr.retval);
+                ps->list_ops.erase(role_osd);
+                c_cli.stop_client(op->peer_fd);
+                delete op;
+                return;
+            }
+            printf(
+                "[PG %u] Got object list from OSD %lu: %ld object versions (%lu of them stable)\n",
+                ps->pg_num, role_osd, op->reply.hdr.retval, op->reply.sec_list.stable_count
+            );
+            ps->list_results[role_osd] = {
+                .buf = (obj_ver_id*)op->buf,
+                .total_count = (uint64_t)op->reply.hdr.retval,
+                .stable_count = op->reply.sec_list.stable_count,
+            };
+            // set op->buf to NULL so it doesn't get freed
+            op->buf = NULL;
+            ps->list_ops.erase(role_osd);
+            delete op;
+        };
+        c_cli.outbox_push(op);
+        ps->list_ops[role_osd] = op;
+    }
+}
+
+void osd_t::discard_list_subop(osd_op_t *list_op)
+{
+    if (list_op->peer_fd == 0)
+    {
+        // Self
+        list_op->bs_op->callback = [list_op](blockstore_op_t *bs_op)
+        {
+            if (list_op->bs_op->buf)
+                free(list_op->bs_op->buf);
+            delete list_op->bs_op;
+            list_op->bs_op = NULL;
+            delete list_op;
+        };
+    }
+    else
+    {
+        // Peer
+        list_op->callback = [](osd_op_t *list_op)
+        {
+            delete list_op;
+        };
+    }
+}
+
+bool osd_t::stop_pg(pg_num_t pg_num) // returns true only when an active PG was moved to stopping/offline
+{
+    auto pg_it = pgs.find(pg_num);
+    if (pg_it == pgs.end())
+    {
+        return false; // no PG with this number
+    }
+    auto & pg = pg_it->second;
+    if (pg.peering_state)
+    {
+        // Stop peering: detach every pending list subop and free every listing already received
+        for (auto it = pg.peering_state->list_ops.begin(); it != pg.peering_state->list_ops.end(); it++)
+        {
+            discard_list_subop(it->second); // swaps the callback for one that only frees the subop
+        }
+        for (auto it = pg.peering_state->list_results.begin(); it != pg.peering_state->list_results.end(); it++)
+        {
+            if (it->second.buf)
+            {
+                free(it->second.buf);
+            }
+        }
+        delete pg.peering_state;
+        pg.peering_state = NULL;
+    }
+    if (!(pg.state & PG_ACTIVE))
+    {
+        return false; // PG is not active, so there is nothing to stop
+    }
+    pg.state = (pg.state & ~PG_ACTIVE) | PG_STOPPING;
+    if (pg.inflight == 0 && !pg.flush_batch)
+    {
+        finish_stop_pg(pg); // nothing in flight: mark the PG offline right away
+    }
+    else
+    {
+        report_pg_state(pg); // inflight ops or a flush batch remain: only report the stopping state
+    }
+    return true;
+}
+
+void osd_t::finish_stop_pg(pg_t & pg)
+{
+    pg.state = PG_OFFLINE;
+    report_pg_state(pg);
+}
+
+void osd_t::report_pg_state(pg_t & pg)
+{
+    pg.print_state();
+    this->pg_state_dirty.insert(pg.pg_num);
+    if (pg.state == PG_ACTIVE && (pg.target_history.size() > 0 || pg.all_peers.size() > pg.target_set.size()))
+    {
+        // Clear history of active+clean PGs
+        pg.history_changed = true;
+        pg.target_history.clear();
+        pg.all_peers = pg.target_set;
+        pg.cur_peers = pg.target_set;
+    }
+    else if (pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD))
+    {
+        // Clear history of active+left_on_dead PGs, but leave dead OSDs in all_peers
+        pg.history_changed = true;
+        pg.target_history.clear();
+        std::set<osd_num_t> dead_peers;
+        for (auto pg_osd: pg.all_peers)
+        {
+            dead_peers.insert(pg_osd);
+        }
+        for (auto pg_osd: pg.cur_peers)
+        {
+            dead_peers.erase(pg_osd);
+        }
+        for (auto pg_osd: pg.target_set)
+        {
+            if (pg_osd)
+            {
+                dead_peers.insert(pg_osd);
+            }
+        }
+        pg.all_peers.clear();
+        pg.all_peers.insert(pg.all_peers.begin(), dead_peers.begin(), dead_peers.end());
+        pg.cur_peers.clear();
+        for (auto pg_osd: pg.target_set)
+        {
+            if (pg_osd)
+            {
+                pg.cur_peers.push_back(pg_osd);
+            }
+        }
+    }
+    if (pg.state == PG_OFFLINE && !this->pg_config_applied)
+    {
+        apply_pg_config();
+    }
+    report_pg_states();
+}
--- a/osd_peering_pg.cpp
+++ b/osd_peering_pg.cpp
@@ -1,159 +1,361 @@
 #include "osd_peering_pg.h"

-void pg_t::remember_object(pg_obj_state_check_t &st, std::vector<obj_ver_role> &all)
+struct obj_ver_role
 {
-    auto & pg = *this;
-    // Remember the decision
-    uint64_t state = 0;
-    if (st.n_roles == pg.pg_cursize)
+    object_id oid;
+    uint64_t version;
+    uint64_t osd_num;
+    bool is_stable;
+};
+
+inline bool operator < (const obj_ver_role & a, const obj_ver_role & b)
+{
+    // ORDER BY inode ASC, stripe & ~STRIPE_MASK ASC, version DESC, role ASC, osd_num ASC
+    return a.oid.inode < b.oid.inode || a.oid.inode == b.oid.inode && (
+        (a.oid.stripe & ~STRIPE_MASK) < (b.oid.stripe & ~STRIPE_MASK) ||
+        (a.oid.stripe & ~STRIPE_MASK) == (b.oid.stripe & ~STRIPE_MASK) && (
+            a.version > b.version ||
+            a.version == b.version && (
+                a.oid.stripe < b.oid.stripe ||
+                a.oid.stripe == b.oid.stripe && a.osd_num < b.osd_num
+            )
+        )
+    );
+}
+
+struct obj_piece_ver_t
+{
+    uint64_t max_ver = 0;
+    uint64_t stable_ver = 0;
+    uint64_t max_target = 0;
+};
+
+struct pg_obj_state_check_t
+{
+    pg_t *pg;
+    std::vector<obj_ver_role> list;
+    int list_pos;
+    int obj_start = 0, obj_end = 0, ver_start = 0, ver_end = 0;
+    object_id oid = { 0 };
+    uint64_t max_ver = 0;
+    uint64_t last_ver = 0;
+    uint64_t target_ver = 0;
+    uint64_t n_copies = 0, has_roles = 0, n_roles = 0, n_stable = 0, n_mismatched = 0;
+    uint64_t n_unstable = 0, n_buggy = 0;
+    pg_osd_set_t osd_set;
+    int log_level;
+
+    void walk();
+    void start_object();
+    void handle_version();
+    void finish_object();
+};
+
+void pg_obj_state_check_t::walk()
+{
+    pg->clean_count = 0;
+    pg->total_count = 0;
+    pg->state = 0;
+    for (list_pos = 0; list_pos < list.size(); list_pos++)
    {
-        if (st.n_matched == pg.pg_cursize)
-            state = OBJ_CLEAN;
+        if (oid.inode != list[list_pos].oid.inode ||
+            oid.stripe != (list[list_pos].oid.stripe & ~STRIPE_MASK))
+        {
+            if (oid.inode != 0)
+            {
+                finish_object();
+            }
+            start_object();
+        }
+        handle_version();
+    }
+    if (oid.inode != 0)
+    {
+        finish_object();
+    }
+    if (pg->pg_cursize < pg->pg_size)
+    {
+        pg->state |= PG_DEGRADED;
+    }
+    pg->state |= PG_ACTIVE;
+    if (pg->state == PG_ACTIVE && pg->cur_peers.size() < pg->all_peers.size())
+    {
+        pg->state |= PG_LEFT_ON_DEAD;
+    }
+}
+
+void pg_obj_state_check_t::start_object()
+{
+    obj_start = list_pos;
+    oid = { .inode = list[list_pos].oid.inode, .stripe = list[list_pos].oid.stripe & ~STRIPE_MASK };
+    last_ver = max_ver = list[list_pos].version;
+    target_ver = 0;
+    ver_start = list_pos;
+    has_roles = n_copies = n_roles = n_stable = n_mismatched = 0;
+    n_unstable = n_buggy = 0;
+}
+
+void pg_obj_state_check_t::handle_version()
+{
+    if (!target_ver && last_ver != list[list_pos].version && (n_stable > 0 || n_roles >= pg->pg_minsize))
+    {
+        // Version is either stable or recoverable
+        target_ver = last_ver;
+        ver_end = list_pos;
+    }
+    if (!target_ver)
+    {
+        if (last_ver != list[list_pos].version)
+        {
+            ver_start = list_pos;
+            has_roles = n_copies = n_roles = n_stable = n_mismatched = 0;
+            last_ver = list[list_pos].version;
+        }
+        int replica = (list[list_pos].oid.stripe & STRIPE_MASK);
+        n_copies++;
+        if (replica >= pg->pg_size)
+        {
+            n_buggy++;
+        }
        else
        {
-            state = OBJ_MISPLACED;
-            pg.state = pg.state | PG_HAS_MISPLACED;
+            if (list[list_pos].is_stable)
+            {
+                n_stable++;
+            }
+            if (pg->cur_set[replica] != list[list_pos].osd_num)
+            {
+                n_mismatched++;
+            }
+            if (!(has_roles & (1 << replica)))
+            {
+                has_roles = has_roles | (1 << replica);
+                n_roles++;
+            }
        }
    }
-    else if (st.n_roles < pg.pg_minsize)
+    if (!list[list_pos].is_stable)
    {
-        printf("Object is unfound: inode=%lu stripe=%lu version=%lu/%lu\n", st.oid.inode, st.oid.stripe, st.target_ver, st.max_ver);
-        state = OBJ_INCOMPLETE;
-        pg.state = pg.state | PG_HAS_UNFOUND;
+        n_unstable++;
    }
-    else
+}
+
+void pg_obj_state_check_t::finish_object()
+{
+    if (!target_ver && (n_stable > 0 || n_roles >= pg->pg_minsize))
    {
-        printf("Object is degraded: inode=%lu stripe=%lu version=%lu/%lu\n", st.oid.inode, st.oid.stripe, st.target_ver, st.max_ver);
-        state = OBJ_DEGRADED;
-        pg.state = pg.state | PG_HAS_DEGRADED;
+        // Version is either stable or recoverable
+        target_ver = last_ver;
+        ver_end = list_pos;
    }
-    if (st.n_copies > pg.pg_size)
+    obj_end = list_pos;
+    // Remember the decision
+    uint64_t state = 0;
+    if (n_buggy > 0)
    {
-        state |= OBJ_OVERCOPIED;
-        pg.state = pg.state | PG_HAS_UNCLEAN;
-    }
-    if (st.n_stable < st.n_copies)
-    {
-        state |= OBJ_NEEDS_STABLE;
-        pg.state = pg.state | PG_HAS_UNCLEAN;
-    }
-    if (st.target_ver < st.max_ver || st.has_old_unstable)
-    {
-        state |= OBJ_NEEDS_ROLLBACK;
-        pg.state = pg.state | PG_HAS_UNCLEAN;
-        pg.ver_override[st.oid] = st.target_ver;
-    }
-    if (st.is_buggy)
-    {
-        state |= OBJ_BUGGY;
+        state = OBJ_BUGGY;
        // FIXME: bring pg offline
        throw std::runtime_error("buggy object state");
    }
-    if (state != OBJ_CLEAN)
+    if (n_unstable > 0)
    {
-        st.osd_set.clear();
-        for (int i = st.ver_start; i < st.ver_end; i++)
+        pg->state |= PG_HAS_UNCLEAN;
+        std::unordered_map<obj_piece_id_t, obj_piece_ver_t> pieces;
+        for (int i = obj_start; i < obj_end; i++)
        {
-            st.osd_set.push_back((pg_obj_loc_t){
-                .role = (all[i].oid.stripe & STRIPE_MASK),
-                .osd_num = all[i].osd_num,
-                .stable = all[i].is_stable,
+            auto & pcs = pieces[(obj_piece_id_t){ .oid = list[i].oid, .osd_num = list[i].osd_num }];
+            if (!pcs.max_ver)
+            {
+                pcs.max_ver = list[i].version;
+            }
+            if (list[i].is_stable && !pcs.stable_ver)
+            {
+                pcs.stable_ver = list[i].version;
+            }
+            if (list[i].version <= target_ver && !pcs.max_target)
+            {
+                pcs.max_target = list[i].version;
+            }
+        }
+        for (auto pp: pieces)
+        {
+            auto & pcs = pp.second;
+            if (pcs.stable_ver < pcs.max_ver)
+            {
+                auto & act = pg->flush_actions[pp.first];
+                // osd_set doesn't include rollback/stable states, so don't include them in the state code either
+                if (pcs.max_ver > target_ver)
+                {
+                    act.rollback = true;
+                    act.rollback_to = pcs.max_target;
+                }
+                if (pcs.stable_ver < (pcs.max_ver > target_ver ? pcs.max_target : pcs.max_ver))
+                {
+                    act.make_stable = true;
+                    act.stable_to = pcs.max_ver > target_ver ? pcs.max_target : pcs.max_ver;
+                }
+            }
+        }
+    }
+    if (!target_ver)
+    {
+        return;
+    }
+    if (n_roles < pg->pg_minsize)
+    {
+        if (log_level > 1)
+        {
+            printf("Object is incomplete: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
+            for (int i = ver_start; i < ver_end; i++)
+            {
+                printf("Present on: osd %lu, role %ld%s\n", list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
+            }
+        }
+        if (log_level > 2)
+        {
+            for (int i = obj_start; i < obj_end; i++)
+            {
+                printf("v%lu present on: osd %lu, role %ld%s\n", list[i].version, list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
+            }
+        }
+        state = OBJ_INCOMPLETE;
+        pg->state = pg->state | PG_HAS_INCOMPLETE;
+    }
+    else if (n_roles < pg->pg_cursize)
+    {
+        if (log_level > 1)
+        {
+            printf("Object is degraded: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
+            for (int i = ver_start; i < ver_end; i++)
+            {
+                printf("Present on: osd %lu, role %ld%s\n", list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
+            }
+        }
+        if (log_level > 2)
+        {
+            for (int i = obj_start; i < obj_end; i++)
+            {
+                printf("v%lu present on: osd %lu, role %ld%s\n", list[i].version, list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
+            }
+        }
+        state = OBJ_DEGRADED;
+        pg->state = pg->state | PG_HAS_DEGRADED;
+    }
+    if (n_mismatched > 0)
+    {
+        state |= OBJ_MISPLACED;
+        pg->state = pg->state | PG_HAS_MISPLACED;
+    }
+    pg->total_count++;
+    if (state != 0 || ver_end < obj_end)
+    {
+        osd_set.clear();
+        for (int i = ver_start; i < ver_end; i++)
+        {
+            osd_set.push_back((pg_obj_loc_t){
+                .role = (list[i].oid.stripe & STRIPE_MASK),
+                .osd_num = list[i].osd_num,
+                .outdated = false,
            });
        }
-        std::sort(st.osd_set.begin(), st.osd_set.end());
-        auto it = pg.state_dict.find(st.osd_set);
-        if (it == pg.state_dict.end())
+    }
+    if (ver_end < obj_end)
+    {
+        // Check for outdated versions not present in the current target OSD set
+        for (int i = ver_end; i < obj_end; i++)
+        {
+            int j;
+            for (j = 0; j < osd_set.size(); j++)
+            {
+                if (osd_set[j].osd_num == list[i].osd_num)
+                {
+                    break;
+                }
+            }
+            if (j >= osd_set.size() && pg->cur_set[list[i].oid.stripe & STRIPE_MASK] != list[i].osd_num)
+            {
+                osd_set.push_back((pg_obj_loc_t){
+                    .role = (list[i].oid.stripe & STRIPE_MASK),
+                    .osd_num = list[i].osd_num,
+                    .outdated = true,
+                });
+                state |= OBJ_MISPLACED;
+                pg->state = pg->state | PG_HAS_MISPLACED;
+            }
+        }
+    }
+    if (target_ver < max_ver)
+    {
+        pg->ver_override[oid] = target_ver;
+    }
+    if (state == 0)
+    {
+        pg->clean_count++;
+    }
+    else
+    {
+        auto it = pg->state_dict.find(osd_set);
+        if (it == pg->state_dict.end())
        {
            std::vector<uint64_t> read_target;
-            read_target.resize(pg.pg_size);
-            for (int i = 0; i < pg.pg_size; i++)
+            read_target.resize(pg->pg_size);
+            for (int i = 0; i < pg->pg_size; i++)
            {
                read_target[i] = 0;
            }
-            for (auto & o: st.osd_set)
+            for (auto & o: osd_set)
            {
-                read_target[o.role] = o.osd_num;
+                if (!o.outdated)
+                {
+                    read_target[o.role] = o.osd_num;
+                }
            }
-            pg.state_dict[st.osd_set] = {
+            pg->state_dict[osd_set] = {
                .read_target = read_target,
-                .osd_set = st.osd_set,
+                .osd_set = osd_set,
                .state = state,
                .object_count = 1,
            };
-            it = pg.state_dict.find(st.osd_set);
+            it = pg->state_dict.find(osd_set);
        }
        else
        {
            it->second.object_count++;
        }
-        pg.obj_states[st.oid] = &it->second;
-        if (st.target_ver < st.max_ver)
+        if (state & OBJ_INCOMPLETE)
        {
-            pg.ver_override[st.oid] = st.target_ver;
+            pg->incomplete_objects[oid] = &it->second;
        }
-        if (state & (OBJ_NEEDS_ROLLBACK | OBJ_NEEDS_STABLE))
+        else if (state & OBJ_DEGRADED)
        {
-            spp::sparse_hash_map<obj_piece_id_t, obj_piece_ver_t> pieces;
-            for (int i = st.obj_start; i < st.obj_end; i++)
-            {
-                auto & pcs = pieces[(obj_piece_id_t){ .oid = all[i].oid, .osd_num = all[i].osd_num }];
-                if (!pcs.max_ver)
-                {
-                    pcs.max_ver = all[i].version;
-                }
-                if (all[i].is_stable && !pcs.stable_ver)
-                {
-                    pcs.stable_ver = all[i].version;
-                }
-            }
-            for (auto pp: pieces)
-            {
-                auto & pcs = pp.second;
-                if (pcs.stable_ver < pcs.max_ver)
-                {
-                    auto & act = obj_stab_actions[pp.first];
-                    if (pcs.max_ver > st.target_ver)
-                    {
-                        act.rollback = true;
-                        act.rollback_to = st.target_ver;
-                    }
-                    else if (pcs.max_ver < st.target_ver && pcs.stable_ver < pcs.max_ver)
-                    {
-                        act.rollback = true;
-                        act.rollback_to = pcs.stable_ver;
-                    }
-                    if (pcs.max_ver >= st.target_ver && pcs.stable_ver < st.target_ver)
-                    {
-                        act.make_stable = true;
-                        act.stable_to = st.target_ver;
-                    }
-                }
-            }
+            pg->degraded_objects[oid] = &it->second;
+        }
+        else
+        {
+            pg->misplaced_objects[oid] = &it->second;
        }
    }
-    else
-        pg.clean_count++;
-    pg.total_count++;
 }

 // FIXME: Write at least some tests for this function
-void pg_t::calc_object_states()
+void pg_t::calc_object_states(int log_level)
 {
-    auto & pg = *this;
    // Copy all object lists into one array
-    std::vector<obj_ver_role> all;
-    auto ps = pg.peering_state;
+    pg_obj_state_check_t st;
+    st.log_level = log_level;
+    st.pg = this;
+    auto ps = peering_state;
    for (auto it: ps->list_results)
    {
        auto nstab = it.second.stable_count;
        auto n = it.second.total_count;
        auto osd_num = it.first;
-        uint64_t start = all.size();
-        all.resize(start + n);
+        uint64_t start = st.list.size();
+        st.list.resize(start + n);
        obj_ver_id *ov = it.second.buf;
        for (uint64_t i = 0; i < n; i++, ov++)
        {
-            all[start+i] = {
+            st.list[start+i] = {
                .oid = ov->oid,
                .version = ov->version,
                .osd_num = osd_num,
@@ -165,101 +367,26 @@ void pg_t::calc_object_states()
    }
    ps->list_results.clear();
    // Sort
-    std::sort(all.begin(), all.end());
+    std::sort(st.list.begin(), st.list.end());
    // Walk over it and check object states
-    pg.clean_count = 0;
-    pg.total_count = 0;
-    pg.state = 0;
-    int replica = 0;
-    pg_obj_state_check_t st;
-    for (int i = 0; i < all.size(); i++)
-    {
-        if (st.oid.inode != all[i].oid.inode ||
-            st.oid.stripe != (all[i].oid.stripe & ~STRIPE_MASK))
-        {
-            if (st.oid.inode != 0)
-            {
-                // Remember object state
-                st.obj_end = st.ver_end = i;
-                remember_object(st, all);
-            }
-            st.obj_start = st.ver_start = i;
-            st.oid = { .inode = all[i].oid.inode, .stripe = all[i].oid.stripe & ~STRIPE_MASK };
-            st.max_ver = st.target_ver = all[i].version;
-            st.has_roles = st.n_copies = st.n_roles = st.n_stable = st.n_matched = 0;
-            st.is_buggy = st.has_old_unstable = false;
-        }
-        else if (st.target_ver != all[i].version)
-        {
-            if (st.n_stable > 0 || st.n_roles >= pg.pg_minsize)
-            {
-                // Last processed version is either recoverable or stable, choose it as target and skip previous versions
-                st.ver_end = i;
-                i++;
-                while (i < all.size() && st.oid.inode == all[i].oid.inode &&
-                    st.oid.stripe == (all[i].oid.stripe & ~STRIPE_MASK))
-                {
-                    if (!all[i].is_stable)
-                    {
-                        st.has_old_unstable = true;
-                    }
-                    i++;
-                }
-                st.obj_end = i;
-                i--;
-                continue;
-            }
-            else
-            {
-                // Last processed version is unstable and unrecoverable
-                // We'll know that because target_ver < max_ver
-                st.ver_start = i;
-                st.target_ver = all[i].version;
-                st.has_roles = st.n_copies = st.n_roles = st.n_stable = st.n_matched = 0;
-            }
-        }
-        replica = (all[i].oid.stripe & STRIPE_MASK);
-        st.n_copies++;
-        if (replica >= pg.pg_size)
-        {
-            // FIXME In the future, check it against the PG epoch number to handle replication factor/scheme changes
-            st.is_buggy = true;
-        }
-        else
-        {
-            if (all[i].is_stable)
-            {
-                st.n_stable++;
-            }
-            if (pg.cur_set[replica] == all[i].osd_num)
-            {
-                st.n_matched++;
-            }
-            if (!(st.has_roles & (1 << replica)))
-            {
-                st.has_roles = st.has_roles | (1 << replica);
-                st.n_roles++;
-            }
-        }
-    }
-    if (st.oid.inode != 0)
-    {
-        // Remember object state
-        st.obj_end = st.ver_end = all.size();
-        remember_object(st, all);
-    }
-    if (pg.pg_cursize < pg.pg_size)
-    {
-        pg.state = pg.state | PG_DEGRADED;
-    }
-    printf(
-        "PG %u is active%s%s%s%s%s (%lu objects)\n", pg.pg_num,
-        (pg.state & PG_DEGRADED) ? " + degraded" : "",
-        (pg.state & PG_HAS_UNFOUND) ? " + has_unfound" : "",
-        (pg.state & PG_HAS_DEGRADED) ? " + has_degraded" : "",
-        (pg.state & PG_HAS_MISPLACED) ? " + has_misplaced" : "",
-        (pg.state & PG_HAS_UNCLEAN) ? " + has_unclean" : "",
-        pg.total_count
-    );
-    pg.state = pg.state | PG_ACTIVE;
+    st.walk();
+}
+
+void pg_t::print_state()
+{
+    printf(
+        "[PG %u] is %s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pg_num,
+        (state & PG_STARTING) ? "starting" : "",
+        (state & PG_OFFLINE) ? "offline" : "",
+        (state & PG_PEERING) ? "peering" : "",
+        (state & PG_INCOMPLETE) ? "incomplete" : "",
+        (state & PG_ACTIVE) ? "active" : "",
+        (state & PG_STOPPING) ? "stopping" : "",
+        (state & PG_DEGRADED) ? " + degraded" : "",
+        (state & PG_HAS_INCOMPLETE) ? " + has_incomplete" : "",
+        (state & PG_HAS_DEGRADED) ? " + has_degraded" : "",
+        (state & PG_HAS_MISPLACED) ? " + has_misplaced" : "",
+        (state & PG_HAS_UNCLEAN) ? " + has_unclean" : "",
+        total_count
+    );
 }
--- a/osd_peering_pg.h
+++ b/osd_peering_pg.h
@@ -1,43 +1,19 @@
 #include <map>
+#include <unordered_map>
 #include <vector>
 #include <algorithm>

+#include "cpp-btree/btree_map.h"
+
 #include "object_id.h"
 #include "osd_ops.h"
-
-#include "sparsepp/sparsepp/spp.h"
-
-// Placement group states
-// Exactly one of these:
-#define PG_OFFLINE (1<<0)
-#define PG_PEERING (1<<1)
-#define PG_INCOMPLETE (1<<2)
-#define PG_ACTIVE (1<<3)
-// Plus any of these:
-#define PG_DEGRADED (1<<4)
-#define PG_HAS_UNFOUND (1<<5)
-#define PG_HAS_DEGRADED (1<<6)
-#define PG_HAS_MISPLACED (1<<7)
-#define PG_HAS_UNCLEAN (1<<8)
-
-// FIXME: Safe default that doesn't depend on parity_block_size of pg_parity_size
-#define STRIPE_MASK ((uint64_t)4096 - 1)
-
-// OSD object states
-#define OBJ_CLEAN 0x01
-#define OBJ_MISPLACED 0x02
-#define OBJ_DEGRADED 0x03
-#define OBJ_INCOMPLETE 0x04
-#define OBJ_NEEDS_STABLE 0x10000
-#define OBJ_NEEDS_ROLLBACK 0x20000
-#define OBJ_OVERCOPIED 0x40000
-#define OBJ_BUGGY 0x80000
+#include "pg_states.h"

 struct pg_obj_loc_t
 {
    uint64_t role;
    osd_num_t osd_num;
-    bool stable;
+    bool outdated;
 };

 typedef std::vector<pg_obj_loc_t> pg_osd_set_t;
@@ -64,28 +40,9 @@ struct osd_op_t;
 struct pg_peering_state_t
 {
    // osd_num -> list result
-    spp::sparse_hash_map<osd_num_t, osd_op_t*> list_ops;
-    spp::sparse_hash_map<osd_num_t, pg_list_result_t> list_results;
-    int list_done = 0;
-};
-
-struct pg_obj_state_check_t
-{
-    int obj_start = 0, obj_end = 0, ver_start = 0, ver_end = 0;
-    object_id oid = { 0 };
-    uint64_t max_ver = 0;
-    uint64_t target_ver = 0;
-    uint64_t n_copies = 0, has_roles = 0, n_roles = 0, n_stable = 0, n_matched = 0;
-    bool is_buggy = false, has_old_unstable = false;
-    pg_osd_set_t osd_set;
-};
-
-struct obj_ver_role
-{
-    object_id oid;
-    uint64_t version;
-    uint64_t osd_num;
-    bool is_stable;
+    std::unordered_map<osd_num_t, osd_op_t*> list_ops;
+    std::unordered_map<osd_num_t, pg_list_result_t> list_results;
+    pg_num_t pg_num = 0;
 };

 struct obj_piece_id_t
@@ -94,60 +51,63 @@ struct obj_piece_id_t
    uint64_t osd_num;
 };

-struct obj_piece_ver_t
-{
-    uint64_t max_ver = 0;
-    uint64_t stable_ver = 0;
-};
-
-struct obj_stab_action_t
+struct flush_action_t
 {
    bool rollback = false, make_stable = false;
    uint64_t stable_to = 0, rollback_to = 0;
+    bool submitted = false;
+};
+
+struct pg_flush_batch_t
+{
+    std::map<osd_num_t, std::vector<obj_ver_id>> rollback_lists;
+    std::map<osd_num_t, std::vector<obj_ver_id>> stable_lists;
+    int flush_ops = 0, flush_done = 0;
+    int flush_objects = 0;
 };

 struct pg_t
 {
-    int state;
+    int state = 0;
    uint64_t pg_cursize = 3, pg_size = 3, pg_minsize = 2;
    pg_num_t pg_num;
    uint64_t clean_count = 0, total_count = 0;
+    // target history and all potential peers
+    std::vector<std::vector<osd_num_t>> target_history;
+    std::vector<osd_num_t> all_peers;
+    bool history_changed = false;
+    // peer list from the last peering event
+    std::vector<osd_num_t> cur_peers;
    // target_set is the "correct" peer OSD set for this PG
    std::vector<osd_num_t> target_set;
    // cur_set is the current set of connected peer OSDs for this PG
    // cur_set = (role => osd_num or UINT64_MAX if missing). role numbers begin with zero
    std::vector<osd_num_t> cur_set;
+    // same thing in state_dict-like format
+    pg_osd_set_t cur_loc_set;
    // moved object map. by default, each object is considered to reside on the cur_set.
    // this map stores all objects that differ.
    // it may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario
    // which is up to ~192 MB per 1 TB in the worst case scenario
    std::map<pg_osd_set_t, pg_osd_set_state_t> state_dict;
-    spp::sparse_hash_map<object_id, pg_osd_set_state_t*> obj_states;
-    std::map<obj_piece_id_t, obj_stab_action_t> obj_stab_actions;
-    spp::sparse_hash_map<object_id, uint64_t> ver_override;
+    btree::btree_map<object_id, pg_osd_set_state_t*> incomplete_objects, misplaced_objects, degraded_objects;
+    std::map<obj_piece_id_t, flush_action_t> flush_actions;
+    btree::btree_map<object_id, uint64_t> ver_override;
    pg_peering_state_t *peering_state = NULL;
+    pg_flush_batch_t *flush_batch = NULL;

+    int inflight = 0; // including write_queue
    std::multimap<object_id, osd_op_t*> write_queue;

-    void calc_object_states();
-    void remember_object(pg_obj_state_check_t &st, std::vector<obj_ver_role> &all);
+    void calc_object_states(int log_level);
+    void print_state();
 };

 inline bool operator < (const pg_obj_loc_t &a, const pg_obj_loc_t &b)
 {
-    return a.role < b.role || a.role == b.role && a.osd_num < b.osd_num ||
-        a.role == b.role && a.osd_num == b.osd_num && a.stable < b.stable;
-}
-
-inline bool operator < (const obj_ver_role & a, const obj_ver_role & b)
-{
-    // ORDER BY inode ASC, stripe & ~STRIPE_MASK ASC, version DESC, osd_num ASC
-    return a.oid.inode < b.oid.inode || a.oid.inode == b.oid.inode && (
-        (a.oid.stripe & ~STRIPE_MASK) < (b.oid.stripe & ~STRIPE_MASK) ||
-        (a.oid.stripe & ~STRIPE_MASK) == (b.oid.stripe & ~STRIPE_MASK) && (
-            a.version > b.version || a.version == b.version && a.osd_num < b.osd_num
-        )
-    );
+    return a.outdated < b.outdated ||
+        a.outdated == b.outdated && a.role < b.role ||
+        a.outdated == b.outdated && a.role == b.role && a.osd_num < b.osd_num;
 }

 inline bool operator == (const obj_piece_id_t & a, const obj_piece_id_t & b)
@@ -172,7 +132,6 @@ namespace std
                // Copy-pasted from spp::hash_combine()
                seed ^= (e.role + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
                seed ^= (e.osd_num + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
-                seed ^= ((e.stable ? 1 : 0) + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
            }
            return seed;
        }
--- a/osd_peering_pg_test.cpp
+++ b/osd_peering_pg_test.cpp
@@ -0,0 +1,74 @@
+#define _LARGEFILE64_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "osd_peering_pg.h"
+#define STRIPE_SHIFT 12
+
+// Number of object versions listed by each of the simulated OSDs
+#define TEST_OBJECT_COUNT (1024*1024*8)
+// OSD 1 reports this many fewer stable entries and lists its last entries with version 2
+#define TEST_UNSTABLE_COUNT 10
+
+/**
+ * TODO tests for object & pg state calculation.
+ *
+ * 1) pg=1,2,3. objects:
+ *    v1=1s,2s,3s -> clean
+ *    v1=1s,2s,3 v2=1s,2s,_ -> degraded + needs_rollback
+ *    v1=1s,2s,_ -> degraded
+ *    v1=1s,2s,3s v2=1,6,_ -> degraded + needs_stabilize
+ *    v1=2s,1s,3s -> misplaced
+ *    v1=4,5,6 -> misplaced + needs_stabilize
+ *    v1=1s,2s,6s -> misplaced
+ * 2) ...
+ */
+// Feeds synthetic listings from OSDs 1..3 into a peering PG, runs calc_object_states()
+// and prints the resulting state variants. Returns 0 on success, 1 if a listing can't be allocated.
+int main(int argc, char *argv[])
+{
+    pg_t pg = {
+        .state = PG_PEERING,
+        .pg_num = 1,
+        .target_set = { 1, 2, 3 },
+        .cur_set = { 1, 2, 3 },
+        .peering_state = new pg_peering_state_t(),
+    };
+    for (uint64_t osd_num = 1; osd_num <= 3; osd_num++)
+    {
+        pg_list_result_t r = {
+            .buf = (obj_ver_id*)malloc(sizeof(obj_ver_id) * TEST_OBJECT_COUNT),
+            .total_count = TEST_OBJECT_COUNT,
+            .stable_count = (uint64_t)(TEST_OBJECT_COUNT - (osd_num == 1 ? TEST_UNSTABLE_COUNT : 0)),
+        };
+        if (!r.buf)
+        {
+            // The listing is large, so the allocation may fail; don't write through a NULL pointer
+            fprintf(stderr, "Failed to allocate the object listing for OSD %lu\n", osd_num);
+            return 1;
+        }
+        for (uint64_t i = 0; i < r.total_count; i++)
+        {
+            r.buf[i] = {
+                .oid = {
+                    .inode = 1,
+                    // Low bits of the stripe carry osd_num-1 (presumably the chunk role - verify)
+                    .stripe = (i << STRIPE_SHIFT) | (osd_num-1),
+                },
+                .version = (uint64_t)(osd_num == 1 && i >= r.total_count - TEST_UNSTABLE_COUNT ? 2 : 1),
+            };
+        }
+        // NOTE(review): r.buf is not freed here; who owns it after this copy is not visible in this file
+        pg.peering_state->list_results[osd_num] = r;
+    }
+    pg.calc_object_states(0);
+    // size() returns size_t, which has to be printed with %zu
+    printf("deviation variants=%zu clean=%lu\n", pg.state_dict.size(), pg.clean_count);
+    // Iterate by reference to avoid copying every map entry
+    for (auto & it: pg.state_dict)
+    {
+        printf("dev: state=%lx\n", it.second.state);
+    }
+    return 0;
+}
--- a/osd_primary.cpp
+++ b/osd_primary.cpp
@@ -1,58 +1,13 @@
-#include "osd.h"
-#include "osd_rmw.h"
-
-#define SUBMIT_READ 0
-#define SUBMIT_RMW_READ 1
-#define SUBMIT_WRITE 2
+#include "osd_primary.h"

 // read: read directly or read paired stripe(s), reconstruct, return
-// write: read paired stripe(s), modify, write
+// write: read paired stripe(s), reconstruct, modify, calculate parity, write
 //
 // nuance: take care to read the same version from paired stripes!
 // to do so, we remember "last readable" version until a write request completes
 // and we postpone other write requests to the same stripe until completion of previous ones
 //
-// sync: sync peers, get unstable versions from somewhere, stabilize them
-
-struct unstable_osd_num_t
-{
-    osd_num_t osd_num;
-    int start, len;
-};
-
-struct osd_primary_op_data_t
-{
-    int st = 0;
-    pg_num_t pg_num;
-    object_id oid;
-    uint64_t target_ver;
-    uint64_t fact_ver = 0;
-    int n_subops = 0, done = 0, errors = 0;
-    int degraded = 0, pg_size, pg_minsize;
-    osd_rmw_stripe_t *stripes;
-    osd_op_t *subops = NULL;
-    // for sync. oops, requires freeing
-    std::vector<unstable_osd_num_t> *unstable_write_osds = NULL;
-    obj_ver_id *unstable_writes = NULL;
-};
-
-void osd_t::finish_primary_op(osd_op_t *cur_op, int retval)
-{
-    // FIXME add separate magic number
-    auto cl_it = clients.find(cur_op->peer_fd);
-    if (cl_it != clients.end())
-    {
-        cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
-        cur_op->reply.hdr.id = cur_op->req.hdr.id;
-        cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
-        cur_op->reply.hdr.retval = retval;
-        outbox_push(cl_it->second, cur_op);
-    }
-    else
-    {
-        delete cur_op;
-    }
-}
+// sync: sync peers, get unstable versions, stabilize them

 bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
 {
@@ -60,39 +15,69 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
    // Our EC scheme stores data in fixed chunks equal to (K*block size)
    // But we must not use K in the process of calculating the PG number
    // So we calculate the PG number using a separate setting which should be per-inode (FIXME)
-    // FIXME Real pg_num should equal the below expression + 1
-    pg_num_t pg_num = (cur_op->req.rw.inode + cur_op->req.rw.offset / parity_block_size) % pg_count;
-    // FIXME: Postpone operations in inactive PGs
-    if (pg_num > pgs.size() || !(pgs[pg_num].state & PG_ACTIVE))
+    pg_num_t pg_num = (cur_op->req.rw.inode + cur_op->req.rw.offset / pg_stripe_size) % pg_count + 1;
+    auto pg_it = pgs.find(pg_num);
+    if (pg_it == pgs.end() || !(pg_it->second.state & PG_ACTIVE))
    {
-        finish_primary_op(cur_op, -EINVAL);
+        // This OSD is not primary for this PG or the PG is inactive
+        finish_op(cur_op, -EPIPE);
        return false;
    }
-    uint64_t pg_parity_size = bs_block_size * pgs[pg_num].pg_minsize;
+    uint64_t pg_block_size = bs_block_size * pg_it->second.pg_minsize;
    object_id oid = {
        .inode = cur_op->req.rw.inode,
        // oid.stripe = starting offset of the parity stripe, so it can be mapped back to the PG
-        .stripe = (cur_op->req.rw.offset / parity_block_size) * parity_block_size +
-            ((cur_op->req.rw.offset % parity_block_size) / pg_parity_size) * pg_parity_size
+        .stripe = (cur_op->req.rw.offset / pg_stripe_size) * pg_stripe_size +
+            ((cur_op->req.rw.offset % pg_stripe_size) / pg_block_size) * pg_block_size
    };
-    if ((cur_op->req.rw.offset + cur_op->req.rw.len) > (oid.stripe + pg_parity_size) ||
+    if ((cur_op->req.rw.offset + cur_op->req.rw.len) > (oid.stripe + pg_block_size) ||
        (cur_op->req.rw.offset % bs_disk_alignment) != 0 ||
        (cur_op->req.rw.len % bs_disk_alignment) != 0)
    {
-        finish_primary_op(cur_op, -EINVAL);
+        finish_op(cur_op, -EINVAL);
        return false;
    }
    osd_primary_op_data_t *op_data = (osd_primary_op_data_t*)calloc(
-        sizeof(osd_primary_op_data_t) + sizeof(osd_rmw_stripe_t) * pgs[pg_num].pg_size, 1
+        sizeof(osd_primary_op_data_t) + sizeof(osd_rmw_stripe_t) * pg_it->second.pg_size, 1
    );
    op_data->pg_num = pg_num;
    op_data->oid = oid;
    op_data->stripes = ((osd_rmw_stripe_t*)(op_data+1));
    cur_op->op_data = op_data;
-    split_stripes(pgs[pg_num].pg_minsize, bs_block_size, (uint32_t)(cur_op->req.rw.offset - oid.stripe), cur_op->req.rw.len, op_data->stripes);
+    split_stripes(pg_it->second.pg_minsize, bs_block_size, (uint32_t)(cur_op->req.rw.offset - oid.stripe), cur_op->req.rw.len, op_data->stripes);
+    pg_it->second.inflight++;
    return true;
 }

+// Selects the OSD set to use for object oid. If the object has an entry in one of the PG's
+// incomplete/degraded/misplaced maps, returns that entry's read_target and stores the entry
+// in *object_state; otherwise returns def and stores NULL.
+static uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state)
+{
+    // The maps are only searched when the PG state has at least one of the HAS_* flags below
+    if (pg.state & (PG_HAS_INCOMPLETE | PG_HAS_DEGRADED | PG_HAS_MISPLACED))
+    {
+        // Search order: incomplete, degraded, misplaced; the first hit wins
+        decltype(pg.incomplete_objects) *obj_maps[] = {
+            &pg.incomplete_objects,
+            &pg.degraded_objects,
+            &pg.misplaced_objects,
+        };
+        for (auto *obj_map: obj_maps)
+        {
+            auto found = obj_map->find(oid);
+            if (found != obj_map->end())
+            {
+                *object_state = found->second;
+                return found->second->read_target.data();
+            }
+        }
+    }
+    // Not listed in any map: the object uses the caller-supplied default set
+    *object_state = NULL;
+    return def;
+}
+
 void osd_t::continue_primary_read(osd_op_t *cur_op)
 {
    if (!cur_op->op_data && !prepare_primary_rw(cur_op))
@@ -123,14 +108,10 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
        else
        {
            // PG may be degraded or have misplaced objects
-            auto st_it = pg.obj_states.find(op_data->oid);
-            uint64_t* cur_set = (st_it != pg.obj_states.end()
-                ? st_it->second->read_target.data()
-                : pg.cur_set.data());
+            uint64_t* cur_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
            if (extend_missing_stripes(op_data->stripes, cur_set, pg.pg_minsize, pg.pg_size) < 0)
            {
-                free(op_data);
-                finish_primary_op(cur_op, -EIO);
+                finish_op(cur_op, -EIO);
                return;
            }
            // Submit reads
@@ -147,9 +128,7 @@ resume_1:
 resume_2:
    if (op_data->errors > 0)
    {
-        free(op_data);
-        cur_op->op_data = NULL;
-        finish_primary_op(cur_op, -EIO);
+        finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
        return;
    }
    if (op_data->degraded)
@@ -173,143 +152,34 @@ resume_2:
            }
        }
    }
-    free(op_data);
-    cur_op->op_data = NULL;
-    finish_primary_op(cur_op, cur_op->req.rw.len);
+    finish_op(cur_op, cur_op->req.rw.len);
 }

-void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t* osd_set, osd_op_t *cur_op)
-{
-    bool w = submit_type == SUBMIT_WRITE;
-    osd_primary_op_data_t *op_data = cur_op->op_data;
-    osd_rmw_stripe_t *stripes = op_data->stripes;
-    // Allocate subops
-    int n_subops = 0, zero_read = -1;
-    for (int role = 0; role < pg_size; role++)
-    {
-        if (osd_set[role] == this->osd_num || osd_set[role] != 0 && zero_read == -1)
-        {
-            zero_read = role;
-        }
-        if (osd_set[role] != 0 && (w || stripes[role].read_end != 0))
-        {
-            n_subops++;
-        }
-    }
-    if (!n_subops && submit_type == SUBMIT_RMW_READ)
-    {
-        n_subops = 1;
-    }
-    else
-    {
-        zero_read = -1;
-    }
-    osd_op_t *subops = new osd_op_t[n_subops];
-    op_data->done = op_data->errors = 0;
-    op_data->n_subops = n_subops;
-    op_data->subops = subops;
-    int subop = 0;
-    for (int role = 0; role < pg_size; role++)
-    {
-        // We always submit zero-length writes to all replicas, even if the stripe is not modified
-        if (!(w || stripes[role].read_end != 0 || zero_read == role))
-        {
-            continue;
-        }
-        osd_num_t role_osd_num = osd_set[role];
-        if (role_osd_num != 0)
-        {
-            if (role_osd_num == this->osd_num)
-            {
-                subops[subop].bs_op = new blockstore_op_t({
-                    .opcode = (uint64_t)(w ? BS_OP_WRITE : BS_OP_READ),
-                    .callback = [cur_op, this](blockstore_op_t *subop)
-                    {
-                        handle_primary_subop(cur_op, subop->retval == subop->len, subop->version);
-                    },
-                    .oid = {
-                        .inode = op_data->oid.inode,
-                        .stripe = op_data->oid.stripe | role,
-                    },
-                    .version = w ? 0 : (submit_type == SUBMIT_RMW_READ ? UINT64_MAX : op_data->target_ver),
-                    .offset = w ? stripes[role].write_start : stripes[role].read_start,
-                    .len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
-                    .buf = w ? stripes[role].write_buf : stripes[role].read_buf,
-                });
-                bs->enqueue_op(subops[subop].bs_op);
-            }
-            else
-            {
-                subops[subop].op_type = OSD_OP_OUT;
-                subops[subop].send_list.push_back(subops[subop].req.buf, OSD_PACKET_SIZE);
-                subops[subop].peer_fd = this->osd_peer_fds.at(role_osd_num);
-                subops[subop].req.sec_rw = {
-                    .header = {
-                        .magic = SECONDARY_OSD_OP_MAGIC,
-                        .id = this->next_subop_id++,
-                        .opcode = (uint64_t)(w ? OSD_OP_SECONDARY_WRITE : OSD_OP_SECONDARY_READ),
-                    },
-                    .oid = {
-                        .inode = op_data->oid.inode,
-                        .stripe = op_data->oid.stripe | role,
-                    },
-                    .version = w ? 0 : (submit_type == SUBMIT_RMW_READ ? UINT64_MAX : op_data->target_ver),
-                    .offset = w ? stripes[role].write_start : stripes[role].read_start,
-                    .len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
-                };
-                subops[subop].buf = w ? stripes[role].write_buf : stripes[role].read_buf;
-                if (w && stripes[role].write_end > 0)
-                {
-                    subops[subop].send_list.push_back(stripes[role].write_buf, stripes[role].write_end - stripes[role].write_start);
-                }
-                subops[subop].callback = [cur_op, this](osd_op_t *subop)
-                {
-                    // so it doesn't get freed
-                    subop->buf = NULL;
-                    handle_primary_subop(cur_op, subop->reply.hdr.retval == subop->req.sec_rw.len, subop->reply.sec_rw.version);
-                };
-                outbox_push(clients[subops[subop].peer_fd], &subops[subop]);
-            }
-            subop++;
-        }
-    }
-}
-
-void osd_t::handle_primary_subop(osd_op_t *cur_op, int ok, uint64_t version)
+bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
 {
    osd_primary_op_data_t *op_data = cur_op->op_data;
-    op_data->fact_ver = version;
-    if (!ok)
+    // Check if actions are pending for this object
+    auto act_it = pg.flush_actions.lower_bound((obj_piece_id_t){
+        .oid = op_data->oid,
+        .osd_num = 0,
+    });
+    if (act_it != pg.flush_actions.end() &&
+        act_it->first.oid.inode == op_data->oid.inode &&
+        (act_it->first.oid.stripe & ~STRIPE_MASK) == op_data->oid.stripe)
    {
-        // FIXME: Handle errors
-        op_data->errors++;
+        pg.write_queue.emplace(op_data->oid, cur_op);
+        return false;
    }
-    else
+    // Check if there are other write requests to the same object
+    auto vo_it = pg.write_queue.find(op_data->oid);
+    if (vo_it != pg.write_queue.end())
    {
-        op_data->done++;
-    }
-    if ((op_data->errors + op_data->done) >= op_data->n_subops)
-    {
-        delete[] op_data->subops;
-        op_data->subops = NULL;
-        op_data->st++;
-        if (cur_op->req.hdr.opcode == OSD_OP_READ)
-        {
-            continue_primary_read(cur_op);
-        }
-        else if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
-        {
-            continue_primary_write(cur_op);
-        }
-        else if (cur_op->req.hdr.opcode == OSD_OP_SYNC)
-        {
-            continue_primary_sync(cur_op);
-        }
-        else
-        {
-            throw std::runtime_error("BUG: unknown opcode");
-        }
+        op_data->st = 1;
+        pg.write_queue.emplace(op_data->oid, cur_op);
+        return false;
    }
+    pg.write_queue.emplace(op_data->oid, cur_op);
+    return true;
 }

 void osd_t::continue_primary_write(osd_op_t *cur_op)
@@ -319,117 +189,212 @@ void osd_t::continue_primary_write(osd_op_t *cur_op)
        return;
    }
    osd_primary_op_data_t *op_data = cur_op->op_data;
-    // FIXME: Handle operation cancel
    auto & pg = pgs[op_data->pg_num];
    if (op_data->st == 1)      goto resume_1;
    else if (op_data->st == 2) goto resume_2;
    else if (op_data->st == 3) goto resume_3;
    else if (op_data->st == 4) goto resume_4;
    else if (op_data->st == 5) goto resume_5;
+    else if (op_data->st == 6) goto resume_6;
+    else if (op_data->st == 7) goto resume_7;
+    else if (op_data->st == 8) goto resume_8;
    assert(op_data->st == 0);
-    // Check if actions are pending for this object
+    printf("primary_write\n");
+    if (!check_write_queue(cur_op, pg))
    {
-        auto act_it = pg.obj_stab_actions.lower_bound((obj_piece_id_t){
-            .oid = op_data->oid,
-            .osd_num = 0,
-        });
-        if (act_it != pg.obj_stab_actions.end() &&
-            act_it->first.oid.inode == op_data->oid.inode &&
-            (act_it->first.oid.stripe & ~STRIPE_MASK) == op_data->oid.stripe)
-        {
-            // FIXME postpone the request until actions are done
-            free(op_data);
-            finish_primary_op(cur_op, -EIO);
-            return;
-        }
-    }
-    // Check if there are other write requests to the same object
-    {
-        auto vo_it = pg.write_queue.find(op_data->oid);
-        if (vo_it != pg.write_queue.end())
-        {
-            op_data->st = 1;
-            pg.write_queue.emplace(op_data->oid, cur_op);
-            return;
-        }
-        pg.write_queue.emplace(op_data->oid, cur_op);
+        return;
    }
 resume_1:
-    // Determine blocks to read
-    cur_op->rmw_buf = calc_rmw_reads(cur_op->buf, op_data->stripes, pg.cur_set.data(), pg.pg_size, pg.pg_minsize, pg.pg_cursize);
+    // Determine blocks to read and write
+    // Missing chunks are allowed to be overwritten even in incomplete objects
+    // FIXME: Allow small writes to go to the old (degraded/misplaced) OSD set to reduce the performance impact
+    op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
+    cur_op->rmw_buf = calc_rmw(cur_op->buf, op_data->stripes, op_data->prev_set,
+        pg.pg_size, pg.pg_minsize, pg.pg_cursize, pg.cur_set.data(), bs_block_size);
    // Read required blocks
-    submit_primary_subops(SUBMIT_RMW_READ, pg.pg_size, pg.cur_set.data(), cur_op);
+    submit_primary_subops(SUBMIT_RMW_READ, pg.pg_size, op_data->prev_set, cur_op);
 resume_2:
    op_data->st = 2;
    return;
 resume_3:
+    if (op_data->errors > 0)
+    {
+        pg_cancel_write_queue(pg, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
+        return;
+    }
    // Save version override for parallel reads
    pg.ver_override[op_data->oid] = op_data->fact_ver;
-    // Calculate parity
-    calc_rmw_parity(op_data->stripes, pg.pg_size);
+    // Recover missing stripes, calculate parity
+    calc_rmw_parity(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size);
    // Send writes
    submit_primary_subops(SUBMIT_WRITE, pg.pg_size, pg.cur_set.data(), cur_op);
 resume_4:
    op_data->st = 4;
    return;
 resume_5:
-    // Remember version as unstable
-    osd_num_t *osd_set = pg.cur_set.data();
-    for (int role = 0; role < pg.pg_size; role++)
+    if (op_data->errors > 0)
    {
-        if (osd_set[role] != 0)
-        {
-            this->unstable_writes[(osd_object_id_t){
-                .osd_num = osd_set[role],
-                .oid = {
-                    .inode = op_data->oid.inode,
-                    .stripe = op_data->oid.stripe | role,
-                },
-            }] = op_data->fact_ver;
-        }
+        pg_cancel_write_queue(pg, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
+        return;
+    }
+    if (op_data->fact_ver == 1)
+    {
+        // Object is created
+        pg.clean_count++;
+        pg.total_count++;
+    }
+    if (op_data->object_state)
+    {
+        {
+            int recovery_type = op_data->object_state->state & (OBJ_DEGRADED|OBJ_INCOMPLETE) ? 0 : 1;
+            recovery_stat_count[0][recovery_type]++;
+            if (!recovery_stat_count[0][recovery_type])
+            {
+                recovery_stat_count[0][recovery_type]++;
+                recovery_stat_bytes[0][recovery_type] = 0;
+            }
+            for (int role = 0; role < pg.pg_size; role++)
+            {
+                recovery_stat_bytes[0][recovery_type] += op_data->stripes[role].write_end - op_data->stripes[role].write_start;
+            }
+        }
+        if (op_data->object_state->state & OBJ_MISPLACED)
+        {
+            // Remove extra chunks
+            submit_primary_del_subops(cur_op, pg.cur_set.data(), op_data->object_state->osd_set);
+            if (op_data->n_subops > 0)
+            {
+                op_data->st = 8;
+                return;
+resume_8:
+                if (op_data->errors > 0)
+                {
+                    pg_cancel_write_queue(pg, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
+                    return;
+                }
+            }
+        }
+        // Clear object state
+        remove_object_from_state(op_data->oid, op_data->object_state, pg);
+        pg.clean_count++;
    }
-    // Remember PG as dirty to drop the connection when PG goes offline
-    // (this is required because of the "lazy sync")
-    this->clients[cur_op->peer_fd].dirty_pgs.insert(op_data->pg_num);
    // Remove version override
    pg.ver_override.erase(op_data->oid);
-    finish_primary_op(cur_op, cur_op->req.rw.len);
-    // Continue other write operations to the same object
+    // FIXME: Check for immediate_commit == IMMEDIATE_SMALL
+resume_6:
+resume_7:
+    if (!finalize_primary_write(cur_op, pg, pg.cur_loc_set, 6))
    {
-        auto next_it = pg.write_queue.find(op_data->oid);
-        auto this_it = next_it;
-        next_it++;
-        pg.write_queue.erase(this_it);
-        if (next_it != pg.write_queue.end() &&
-            next_it->first == op_data->oid)
-        {
-            osd_op_t *next_op = next_it->second;
-            continue_primary_write(next_op);
-        }
+        return;
+    }
+    object_id oid = op_data->oid;
+    finish_op(cur_op, cur_op->req.rw.len);
+    // Continue other write operations to the same object
+    auto next_it = pg.write_queue.find(oid);
+    auto this_it = next_it;
+    next_it++;
+    pg.write_queue.erase(this_it);
+    if (next_it != pg.write_queue.end() &&
+        next_it->first == oid)
+    {
+        osd_op_t *next_op = next_it->second;
+        continue_primary_write(next_op);
    }
 }

+// Final step of a primary write. With immediate_commit == IMMEDIATE_ALL it submits stabilize subops for
+// every chunk in loc_set and returns false; the op is expected to re-enter with st == base_state+1
+// (presumably advanced by subop completion), where false is returned again only on subop errors.
+// Otherwise it records the version as unstable, marks the PG dirty and returns true (op may be finished).
+bool osd_t::finalize_primary_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state)
+{
+    osd_primary_op_data_t *op_data = cur_op->op_data;
+    if (op_data->st == base_state)        goto resume_6;
+    else if (op_data->st == base_state+1) goto resume_7;
+    if (immediate_commit == IMMEDIATE_ALL)
+    {
+        // Build a one-entry list holding the just-written version for every chunk location
+        op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
+        op_data->unstable_writes = new obj_ver_id[loc_set.size()];
+        {
+            int last_start = 0;
+            for (auto & chunk: loc_set)
+            {
+                op_data->unstable_writes[last_start] = (obj_ver_id){
+                    .oid = {
+                        .inode = op_data->oid.inode,
+                        .stripe = op_data->oid.stripe | chunk.role,
+                    },
+                    .version = op_data->fact_ver,
+                };
+                op_data->unstable_write_osds->push_back((unstable_osd_num_t){
+                    .osd_num = chunk.osd_num,
+                    .start = last_start,
+                    .len = 1,
+                });
+                last_start++;
+            }
+        }
+        submit_primary_stab_subops(cur_op);
+resume_6:
+        // Store the caller's resume state (not a hard-coded 6) so that it matches the st checks above
+        op_data->st = base_state;
+        return false;
+resume_7:
+        // FIXME: Free those in the destructor?
+        delete op_data->unstable_write_osds;
+        delete[] op_data->unstable_writes;
+        op_data->unstable_writes = NULL;
+        op_data->unstable_write_osds = NULL;
+        if (op_data->errors > 0)
+        {
+            pg_cancel_write_queue(pg, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
+            return false;
+        }
+    }
+    else
+    {
+        // Remember version as unstable
+        for (auto & chunk: loc_set)
+        {
+            this->unstable_writes[(osd_object_id_t){
+                .osd_num = chunk.osd_num,
+                .oid = {
+                    .inode = op_data->oid.inode,
+                    .stripe = op_data->oid.stripe | chunk.role,
+                },
+            }] = op_data->fact_ver;
+        }
+        // Remember PG as dirty to drop the connection when PG goes offline
+        // (this is required because of the "lazy sync")
+        c_cli.clients[cur_op->peer_fd].dirty_pgs.insert(op_data->pg_num);
+        dirty_pgs.insert(op_data->pg_num);
+    }
+    return true;
+}
+
 // Save and clear unstable_writes -> SYNC all -> STABLE all
-// FIXME: Run regular automatic syncs based on the number of unstable writes and/or system time
 void osd_t::continue_primary_sync(osd_op_t *cur_op)
 {
    if (!cur_op->op_data)
    {
        cur_op->op_data = (osd_primary_op_data_t*)calloc(sizeof(osd_primary_op_data_t), 1);
    }
-    if (cur_op->op_data->st == 1)      goto resume_1;
-    else if (cur_op->op_data->st == 2) goto resume_2;
-    else if (cur_op->op_data->st == 3) goto resume_3;
-    else if (cur_op->op_data->st == 4) goto resume_4;
-    else if (cur_op->op_data->st == 5) goto resume_5;
-    else if (cur_op->op_data->st == 6) goto resume_6;
-    assert(cur_op->op_data->st == 0);
+    osd_primary_op_data_t *op_data = cur_op->op_data;
+    if (op_data->st == 1)      goto resume_1;
+    else if (op_data->st == 2) goto resume_2;
+    else if (op_data->st == 3) goto resume_3;
+    else if (op_data->st == 4) goto resume_4;
+    else if (op_data->st == 5) goto resume_5;
+    else if (op_data->st == 6) goto resume_6;
+    assert(op_data->st == 0);
+    printf("primary_sync\n");
    if (syncs_in_progress.size() > 0)
    {
        // Wait for previous syncs, if any
        // FIXME: We may try to execute the current one in parallel, like in Blockstore, but I'm not sure if it matters at all
        syncs_in_progress.push_back(cur_op);
-        cur_op->op_data->st = 1;
+        op_data->st = 1;
 resume_1:
        return;
    }
@@ -438,27 +403,28 @@ resume_1:
        syncs_in_progress.push_back(cur_op);
    }
 resume_2:
-    // FIXME: Handle operation cancel
    if (unstable_writes.size() == 0)
    {
        // Nothing to sync
        goto finish;
    }
    // Save and clear unstable_writes
-    // FIXME: This is possible to do it on a per-client basis
-    // It would be cool not to copy them here at all, but someone has to deduplicate them by object IDs anyway
-    cur_op->op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
-    cur_op->op_data->unstable_writes = new obj_ver_id[unstable_writes.size()];
+    // In theory it is possible to do in on a per-client basis, but this seems to be an unnecessary complication
+    // It would be cool not to copy these here at all, but someone has to deduplicate them by object IDs anyway
    {
+        op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
+        op_data->unstable_writes = new obj_ver_id[this->unstable_writes.size()];
+        op_data->dirty_pgs = new pg_num_t[dirty_pgs.size()];
+        op_data->dirty_pg_count = dirty_pgs.size();
        osd_num_t last_osd = 0;
        int last_start = 0, last_end = 0;
-        for (auto it = unstable_writes.begin(); it != unstable_writes.end(); it++)
+        for (auto it = this->unstable_writes.begin(); it != this->unstable_writes.end(); it++)
        {
            if (last_osd != it->first.osd_num)
            {
                if (last_osd != 0)
                {
-                    cur_op->op_data->unstable_write_osds->push_back((unstable_osd_num_t){
+                    op_data->unstable_write_osds->push_back((unstable_osd_num_t){
                        .osd_num = last_osd,
                        .start = last_start,
                        .len = last_end - last_start,
@@ -467,7 +433,7 @@ resume_2:
                last_osd = it->first.osd_num;
                last_start = last_end;
            }
-            cur_op->op_data->unstable_writes[last_end] = (obj_ver_id){
+            op_data->unstable_writes[last_end] = (obj_ver_id){
                .oid = it->first.oid,
                .version = it->second,
            };
@@ -475,129 +441,231 @@ resume_2:
        }
        if (last_osd != 0)
        {
-            cur_op->op_data->unstable_write_osds->push_back((unstable_osd_num_t){
+            op_data->unstable_write_osds->push_back((unstable_osd_num_t){
                .osd_num = last_osd,
                .start = last_start,
                .len = last_end - last_start,
            });
        }
+        int dpg = 0;
+        for (auto dirty_pg_num: dirty_pgs)
+        {
+            pgs[dirty_pg_num].inflight++;
+            op_data->dirty_pgs[dpg++] = dirty_pg_num;
+        }
+        dirty_pgs.clear();
+        this->unstable_writes.clear();
    }
-    unstable_writes.clear();
-    // SYNC
-    submit_primary_sync_subops(cur_op);
+    if (immediate_commit != IMMEDIATE_ALL)
+    {
+        // SYNC
+        submit_primary_sync_subops(cur_op);
 resume_3:
-    cur_op->op_data->st = 3;
-    return;
+        op_data->st = 3;
+        return;
 resume_4:
+        if (op_data->errors > 0)
+        {
+            goto resume_6;
+        }
+    }
    // Stabilize version sets
    submit_primary_stab_subops(cur_op);
 resume_5:
-    cur_op->op_data->st = 5;
+    op_data->st = 5;
    return;
 resume_6:
-    // FIXME: Free them correctly (via a destructor or so)
-    delete cur_op->op_data->unstable_write_osds;
-    delete[] cur_op->op_data->unstable_writes;
-    cur_op->op_data->unstable_writes = NULL;
-    cur_op->op_data->unstable_write_osds = NULL;
+    // Cleanup stage: reached when the STABLE subops have completed, or directly
+    // from the SYNC stage when it reported errors.
+    if (op_data->errors > 0)
+    {
+        // Return objects back into the unstable write set
+        for (auto unstable_osd: *(op_data->unstable_write_osds))
+        {
+            for (int i = 0; i < unstable_osd.len; i++)
+            {
+                // Each OSD owns the slice [start, start+len) of the flat unstable_writes
+                // array, so the index has to be offset by unstable_osd.start
+                auto & w = op_data->unstable_writes[unstable_osd.start + i];
+                pg_num_t wpg = map_to_pg(w.oid);
+                // Only writes belonging to PGs that are still active are returned
+                if (pgs[wpg].state & PG_ACTIVE)
+                {
+                    uint64_t & dest = this->unstable_writes[(osd_object_id_t){
+                        .osd_num = unstable_osd.osd_num,
+                        .oid = w.oid,
+                    }];
+                    // Keep the highest known unstable version of the object
+                    dest = dest < w.version ? w.version : dest;
+                    dirty_pgs.insert(wpg);
+                }
+            }
+        }
+    }
+    // Release the in-flight references taken on the dirty PGs when they were saved
+    for (int i = 0; i < op_data->dirty_pg_count; i++)
+    {
+        auto & pg = pgs.at(op_data->dirty_pgs[i]);
+        pg.inflight--;
+        if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
+        {
+            finish_stop_pg(pg);
+        }
+    }
+    // FIXME: Free those in the destructor?
+    // dirty_pgs and unstable_writes are allocated with new[], so both require delete[]
+    delete[] op_data->dirty_pgs;
+    delete op_data->unstable_write_osds;
+    delete[] op_data->unstable_writes;
+    op_data->dirty_pgs = NULL;
+    op_data->unstable_writes = NULL;
+    op_data->unstable_write_osds = NULL;
+    if (op_data->errors > 0)
+    {
+        finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
+    }
+    else
+    {
 finish:
+        if (cur_op->peer_fd)
+        {
+            auto it = c_cli.clients.find(cur_op->peer_fd);
+            if (it != c_cli.clients.end())
+                it->second.dirty_pgs.clear();
+        }
+        finish_op(cur_op, 0);
+    }
    assert(syncs_in_progress.front() == cur_op);
    syncs_in_progress.pop_front();
-    finish_primary_op(cur_op, 0);
    if (syncs_in_progress.size() > 0)
    {
        cur_op = syncs_in_progress.front();
-        cur_op->op_data->st++;
+        op_data = cur_op->op_data;
+        op_data->st++;
        goto resume_2;
    }
 }

-void osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
+// Decrement pg_osd_set_state_t's object_count and change PG state accordingly
+void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t & pg)
 {
-    osd_primary_op_data_t *op_data = cur_op->op_data;
-    int n_osds = op_data->unstable_write_osds->size();
-    osd_op_t *subops = new osd_op_t[n_osds];
-    op_data->done = op_data->errors = 0;
-    op_data->n_subops = n_osds;
-    op_data->subops = subops;
-    for (int i = 0; i < n_osds; i++)
+    if (object_state->state & OBJ_INCOMPLETE)
    {
-        osd_num_t sync_osd = (*(op_data->unstable_write_osds))[i].osd_num;
-        if (sync_osd == this->osd_num)
+        // Successful write means that object is not incomplete anymore
+        this->incomplete_objects--;
+        pg.incomplete_objects.erase(oid);
+        if (!pg.incomplete_objects.size())
        {
-            subops[i].bs_op = new blockstore_op_t({
-                .opcode = BS_OP_SYNC,
-                .callback = [cur_op, this](blockstore_op_t *subop)
-                {
-                    handle_primary_subop(cur_op, subop->retval == 0, 0);
-                },
-            });
-            bs->enqueue_op(subops[i].bs_op);
+            pg.state = pg.state & ~PG_HAS_INCOMPLETE;
+            report_pg_state(pg);
        }
-        else
+    }
+    else if (object_state->state & OBJ_DEGRADED)
+    {
+        this->degraded_objects--;
+        pg.degraded_objects.erase(oid);
+        if (!pg.degraded_objects.size())
        {
-            subops[i].op_type = OSD_OP_OUT;
-            subops[i].send_list.push_back(subops[i].req.buf, OSD_PACKET_SIZE);
-            subops[i].peer_fd = osd_peer_fds.at(sync_osd);
-            subops[i].req.sec_sync = {
-                .header = {
-                    .magic = SECONDARY_OSD_OP_MAGIC,
-                    .id = this->next_subop_id++,
-                    .opcode = OSD_OP_SECONDARY_SYNC,
-                },
-            };
-            subops[i].callback = [cur_op, this](osd_op_t *subop)
-            {
-                handle_primary_subop(cur_op, subop->reply.hdr.retval == 0, 0);
-            };
-            outbox_push(clients[subops[i].peer_fd], &subops[i]);
+            pg.state = pg.state & ~PG_HAS_DEGRADED;
+            report_pg_state(pg);
        }
    }
+    else if (object_state->state & OBJ_MISPLACED)
+    {
+        this->misplaced_objects--;
+        pg.misplaced_objects.erase(oid);
+        if (!pg.misplaced_objects.size())
+        {
+            pg.state = pg.state & ~PG_HAS_MISPLACED;
+            report_pg_state(pg);
+        }
+    }
+    else
+    {
+        throw std::runtime_error("BUG: Invalid object state: "+std::to_string(object_state->state));
+    }
+    object_state->object_count--;
+    if (!object_state->object_count)
+    {
+        pg.state_dict.erase(object_state->osd_set);
+    }
 }

-void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
+void osd_t::continue_primary_del(osd_op_t *cur_op)
 {
-    osd_primary_op_data_t *op_data = cur_op->op_data;
-    int n_osds = op_data->unstable_write_osds->size();
-    osd_op_t *subops = new osd_op_t[n_osds];
-    op_data->done = op_data->errors = 0;
-    op_data->n_subops = n_osds;
-    op_data->subops = subops;
-    for (int i = 0; i < n_osds; i++)
+    if (!cur_op->op_data && !prepare_primary_rw(cur_op))
    {
-        auto & stab_osd = (*(op_data->unstable_write_osds))[i];
-        if (stab_osd.osd_num == this->osd_num)
-        {
-            subops[i].bs_op = new blockstore_op_t({
-                .opcode = BS_OP_STABLE,
-                .callback = [cur_op, this](blockstore_op_t *subop)
-                {
-                    handle_primary_subop(cur_op, subop->retval == 0, 0);
-                },
-                .len = (uint32_t)stab_osd.len,
-                .buf = (void*)(op_data->unstable_writes + stab_osd.start),
-            });
-            bs->enqueue_op(subops[i].bs_op);
-        }
-        else
-        {
-            subops[i].op_type = OSD_OP_OUT;
-            subops[i].send_list.push_back(subops[i].req.buf, OSD_PACKET_SIZE);
-            subops[i].peer_fd = osd_peer_fds.at(stab_osd.osd_num);
-            subops[i].req.sec_stab = {
-                .header = {
-                    .magic = SECONDARY_OSD_OP_MAGIC,
-                    .id = this->next_subop_id++,
-                    .opcode = OSD_OP_SECONDARY_STABILIZE,
-                },
-                .len = (uint64_t)(stab_osd.len * sizeof(obj_ver_id)),
-            };
-            subops[i].send_list.push_back(op_data->unstable_writes + stab_osd.start, stab_osd.len * sizeof(obj_ver_id));
-            subops[i].callback = [cur_op, this](osd_op_t *subop)
-            {
-                handle_primary_subop(cur_op, subop->reply.hdr.retval == 0, 0);
-            };
-            outbox_push(clients[subops[i].peer_fd], &subops[i]);
-        }
+        return;
+    }
+    osd_primary_op_data_t *op_data = cur_op->op_data;
+    auto & pg = pgs[op_data->pg_num];
+    if (op_data->st == 1)      goto resume_1;
+    else if (op_data->st == 2) goto resume_2;
+    else if (op_data->st == 3) goto resume_3;
+    else if (op_data->st == 4) goto resume_4;
+    else if (op_data->st == 5) goto resume_5;
+    else if (op_data->st == 6) goto resume_6;
+    else if (op_data->st == 7) goto resume_7;
+    assert(op_data->st == 0);
+    // Delete is forbidden even in active PGs if they're also degraded or have previous dead OSDs
+    if (pg.state & (PG_DEGRADED | PG_LEFT_ON_DEAD))
+    {
+        finish_op(cur_op, -EBUSY);
+        return;
+    }
+    if (!check_write_queue(cur_op, pg))
+    {
+        return;
+    }
+resume_1:
+    // Determine which OSDs contain this object and delete it
+    op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
+    // Submit 1 read to determine the actual version number
+    submit_primary_subops(SUBMIT_RMW_READ, pg.pg_size, op_data->prev_set, cur_op);
+resume_2:
+    op_data->st = 2;
+    return;
+resume_3:
+    if (op_data->errors > 0)
+    {
+        pg_cancel_write_queue(pg, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
+        return;
+    }
+    // Save version override for parallel reads
+    pg.ver_override[op_data->oid] = op_data->fact_ver;
+    // Submit deletes
+    op_data->fact_ver++;
+    submit_primary_del_subops(cur_op, NULL, op_data->object_state ? op_data->object_state->osd_set : pg.cur_loc_set);
+resume_4:
+    op_data->st = 4;
+    return;
+resume_5:
+    if (op_data->errors > 0)
+    {
+        pg_cancel_write_queue(pg, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
+        return;
+    }
+    // Remove version override
+    pg.ver_override.erase(op_data->oid);
+resume_6:
+resume_7:
+    if (!finalize_primary_write(cur_op, pg, op_data->object_state ? op_data->object_state->osd_set : pg.cur_loc_set, 6))
+    {
+        return;
+    }
+    // Adjust PG stats after "instant stabilize", because we need object_state above
+    if (!op_data->object_state)
+    {
+        pg.clean_count--;
+    }
+    else
+    {
+        remove_object_from_state(op_data->oid, op_data->object_state, pg);
+    }
+    pg.total_count--;
+    object_id oid = op_data->oid;
+    finish_op(cur_op, cur_op->req.rw.len);
+    // Continue other write operations to the same object
+    auto next_it = pg.write_queue.find(oid);
+    auto this_it = next_it;
+    next_it++;
+    pg.write_queue.erase(this_it);
+    if (next_it != pg.write_queue.end() &&
+        next_it->first == oid)
+    {
+        osd_op_t *next_op = next_it->second;
+        continue_primary_write(next_op);
    }
 }
--- a/osd_primary.h
+++ b/osd_primary.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include "osd.h"
+#include "osd_rmw.h"
+
+#define SUBMIT_READ 0
+#define SUBMIT_RMW_READ 1
+#define SUBMIT_WRITE 2
+
+// Describes which part of osd_primary_op_data_t::unstable_writes belongs to one OSD
+struct unstable_osd_num_t
+{
+    // OSD that holds the writes
+    osd_num_t osd_num;
+    // Index of the first entry for this OSD in the unstable_writes array
+    int start;
+    // Number of consecutive entries for this OSD
+    int len;
+};
+
+// Per-operation state of a primary OSD operation (read / write / sync / delete)
+struct osd_primary_op_data_t
+{
+    // Resume point of the continue_primary_*() state machine
+    int st = 0;
+    pg_num_t pg_num;
+    object_id oid;
+    uint64_t target_ver;
+    // Object version reported back by the read/write subops
+    uint64_t fact_ver = 0;
+    // Subop accounting: total submitted, succeeded, failed, failed with -EPIPE
+    int n_subops = 0;
+    int done = 0;
+    int errors = 0;
+    int epipe = 0;
+    int degraded = 0;
+    int pg_size;
+    int pg_minsize;
+    osd_rmw_stripe_t *stripes;
+    // Array of currently running subops, freed when the last of them completes
+    osd_op_t *subops = NULL;
+    uint64_t *prev_set = NULL;
+    pg_osd_set_state_t *object_state = NULL;
+
+    // Used only by sync operations; these are heap-allocated and must be freed explicitly
+    std::vector<unstable_osd_num_t> *unstable_write_osds = NULL;
+    pg_num_t *dirty_pgs = NULL;
+    int dirty_pg_count = 0;
+    obj_ver_id *unstable_writes = NULL;
+};
--- a/osd_primary_subops.cpp
+++ b/osd_primary_subops.cpp
@@ -0,0 +1,489 @@
+#include "osd_primary.h"
+
+// Start an internal OSD_OP_SYNC operation, unless one is already running
+// or immediate_commit is set to IMMEDIATE_ALL.
+void osd_t::autosync()
+{
+    // FIXME Autosync based on the number of unstable writes to prevent
+    // "journal_sector_buffer_count is too low for this batch" errors
+    if (immediate_commit == IMMEDIATE_ALL || autosync_op)
+    {
+        return;
+    }
+    autosync_op = new osd_op_t();
+    autosync_op->op_type = OSD_OP_IN;
+    autosync_op->req = {
+        .sync = {
+            .header = {
+                .magic = SECONDARY_OSD_OP_MAGIC,
+                .id = 1,
+                .opcode = OSD_OP_SYNC,
+            },
+        },
+    };
+    // The operation has no peer, so its result is delivered through this callback
+    autosync_op->callback = [this](osd_op_t *done_op)
+    {
+        auto sync_res = done_op->reply.hdr.retval;
+        if (sync_res < 0)
+        {
+            printf("Warning: automatic sync resulted in an error: %ld (%s)\n", -sync_res, strerror(-sync_res));
+        }
+        // Allow the next autosync() call to start a new operation
+        delete autosync_op;
+        autosync_op = NULL;
+    };
+    exec_op(autosync_op);
+}
+
+// Complete an operation with the given return value:
+// drop its in-flight accounting, then either invoke its callback (no peer)
+// or queue the reply to the client (or delete the op if the client is gone).
+void osd_t::finish_op(osd_op_t *cur_op, int retval)
+{
+    inflight_ops--;
+    auto op_data = cur_op->op_data;
+    if (op_data && op_data->pg_num > 0)
+    {
+        auto & op_pg = pgs[op_data->pg_num];
+        op_pg.inflight--;
+        assert(op_pg.inflight >= 0);
+        // A stopping PG is finalized as soon as its last operation completes
+        if ((op_pg.state & PG_STOPPING) && op_pg.inflight == 0 && !op_pg.flush_batch)
+        {
+            finish_stop_pg(op_pg);
+        }
+    }
+    if (cur_op->peer_fd)
+    {
+        // FIXME add separate magic number
+        auto cl_it = c_cli.clients.find(cur_op->peer_fd);
+        if (cl_it == c_cli.clients.end())
+        {
+            // Nobody to reply to
+            delete cur_op;
+            return;
+        }
+        auto & reply_hdr = cur_op->reply.hdr;
+        reply_hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
+        reply_hdr.id = cur_op->req.hdr.id;
+        reply_hdr.opcode = cur_op->req.hdr.opcode;
+        reply_hdr.retval = retval;
+        c_cli.outbox_push(cur_op);
+        return;
+    }
+    // Copy lambda to be unaffected by `delete op`
+    std::function<void(osd_op_t*)>(cur_op->callback)(cur_op);
+}
+
+// Submit read or write subops of a primary operation to the OSDs of osd_set.
+// - submit_type: SUBMIT_READ, SUBMIT_RMW_READ or SUBMIT_WRITE
+// - pg_size: number of roles (entries) in osd_set
+// - osd_set: OSD number for each role; 0 means there is no OSD for that role
+// - cur_op: parent operation; its op_data receives the subop array and counters
+// Subops for the local OSD go to the blockstore, the rest are sent through c_cli.
+// Completion is reported to handle_primary_subop().
+void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t* osd_set, osd_op_t *cur_op)
+{
+    bool w = submit_type == SUBMIT_WRITE;
+    osd_primary_op_data_t *op_data = cur_op->op_data;
+    osd_rmw_stripe_t *stripes = op_data->stripes;
+    // Allocate subops
+    // zero_read is the role used for a zero-length read when nothing else has to be read:
+    // the local OSD if it is in the set, otherwise the first non-empty role
+    int n_subops = 0, zero_read = -1;
+    for (int role = 0; role < pg_size; role++)
+    {
+        if (osd_set[role] == this->osd_num || osd_set[role] != 0 && zero_read == -1)
+        {
+            zero_read = role;
+        }
+        if (osd_set[role] != 0 && (w || stripes[role].read_end != 0))
+        {
+            n_subops++;
+        }
+    }
+    // Only SUBMIT_RMW_READ with no stripes to read issues the extra zero-length read
+    if (!n_subops && submit_type == SUBMIT_RMW_READ)
+    {
+        n_subops = 1;
+    }
+    else
+    {
+        zero_read = -1;
+    }
+    // Writes use the next version after fact_ver, RMW reads pass UINT64_MAX
+    // (presumably "latest version" - verify against the blockstore), plain reads use target_ver
+    uint64_t op_version = w ? op_data->fact_ver+1 : (submit_type == SUBMIT_RMW_READ ? UINT64_MAX : op_data->target_ver);
+    osd_op_t *subops = new osd_op_t[n_subops];
+    // fact_ver is reset here and filled again from subop results in handle_primary_subop()
+    op_data->fact_ver = 0;
+    op_data->done = op_data->errors = 0;
+    op_data->n_subops = n_subops;
+    op_data->subops = subops;
+    int i = 0;
+    for (int role = 0; role < pg_size; role++)
+    {
+        // We always submit zero-length writes to all replicas, even if the stripe is not modified
+        if (!(w || stripes[role].read_end != 0 || zero_read == role))
+        {
+            continue;
+        }
+        osd_num_t role_osd_num = osd_set[role];
+        if (role_osd_num != 0)
+        {
+            if (role_osd_num == this->osd_num)
+            {
+                // Local subop: tv_begin is used for statistics in add_bs_subop_stats(),
+                // op_type carries the parent op pointer read back in handle_primary_bs_subop()
+                clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
+                subops[i].op_type = (uint64_t)cur_op;
+                subops[i].bs_op = new blockstore_op_t({
+                    .opcode = (uint64_t)(w ? BS_OP_WRITE : BS_OP_READ),
+                    .callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
+                    {
+                        handle_primary_bs_subop(subop);
+                    },
+                    .oid = {
+                        .inode = op_data->oid.inode,
+                        .stripe = op_data->oid.stripe | role,
+                    },
+                    .version = op_version,
+                    .offset = w ? stripes[role].write_start : stripes[role].read_start,
+                    .len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
+                    .buf = w ? stripes[role].write_buf : stripes[role].read_buf,
+                });
+                bs->enqueue_op(subops[i].bs_op);
+            }
+            else
+            {
+                // Remote subop: send the request header and, for writes, the payload
+                subops[i].op_type = OSD_OP_OUT;
+                subops[i].send_list.push_back(subops[i].req.buf, OSD_PACKET_SIZE);
+                subops[i].peer_fd = c_cli.osd_peer_fds.at(role_osd_num);
+                subops[i].req.sec_rw = {
+                    .header = {
+                        .magic = SECONDARY_OSD_OP_MAGIC,
+                        .id = c_cli.next_subop_id++,
+                        .opcode = (uint64_t)(w ? OSD_OP_SECONDARY_WRITE : OSD_OP_SECONDARY_READ),
+                    },
+                    .oid = {
+                        .inode = op_data->oid.inode,
+                        .stripe = op_data->oid.stripe | role,
+                    },
+                    .version = op_version,
+                    .offset = w ? stripes[role].write_start : stripes[role].read_start,
+                    .len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
+                };
+                subops[i].buf = w ? stripes[role].write_buf : stripes[role].read_buf;
+                if (w && stripes[role].write_end > 0)
+                {
+                    subops[i].send_list.push_back(stripes[role].write_buf, stripes[role].write_end - stripes[role].write_start);
+                }
+                subops[i].callback = [cur_op, this](osd_op_t *subop)
+                {
+                    // Decide about the failure first: handle_primary_subop() frees the
+                    // subop array (and thus *subop) when the last subop completes
+                    int fail_fd = subop->req.hdr.opcode == OSD_OP_SECONDARY_WRITE &&
+                        subop->reply.hdr.retval != subop->req.sec_rw.len ? subop->peer_fd : -1;
+                    // so it doesn't get freed
+                    subop->buf = NULL;
+                    handle_primary_subop(
+                        subop->req.hdr.opcode, cur_op, subop->reply.hdr.retval,
+                        subop->req.sec_rw.len, subop->reply.sec_rw.version
+                    );
+                    if (fail_fd >= 0)
+                    {
+                        // write operation failed, drop the connection
+                        c_cli.stop_client(fail_fd);
+                    }
+                };
+                c_cli.outbox_push(&subops[i]);
+            }
+            i++;
+        }
+    }
+}
+
+// Maps a blockstore opcode (used as the array index) to the matching secondary OSD opcode.
+// NOTE(review): assumes BS_OP_* values are consecutive starting from 1 in the order
+// listed below - confirm against the blockstore opcode definitions.
+static uint64_t bs_op_to_osd_op[] = {
+    0,
+    OSD_OP_SECONDARY_READ,      // BS_OP_READ
+    OSD_OP_SECONDARY_WRITE,     // BS_OP_WRITE
+    OSD_OP_SECONDARY_SYNC,      // BS_OP_SYNC
+    OSD_OP_SECONDARY_STABILIZE, // BS_OP_STABLE
+    OSD_OP_SECONDARY_DELETE,    // BS_OP_DELETE
+    OSD_OP_SECONDARY_LIST,      // BS_OP_LIST
+    OSD_OP_SECONDARY_ROLLBACK,  // BS_OP_ROLLBACK
+    OSD_OP_TEST_SYNC_STAB_ALL,  // BS_OP_SYNC_STAB_ALL
+};
+
+// Completion handler for a subop executed by the local blockstore.
+// Throws if a modifying blockstore operation fails; otherwise accounts statistics,
+// frees the blockstore op and forwards the result to handle_primary_subop().
+void osd_t::handle_primary_bs_subop(osd_op_t *subop)
+{
+    // For local subops op_type holds the pointer to the parent operation
+    osd_op_t *parent_op = (osd_op_t*)subop->op_type;
+    blockstore_op_t *bs_op = subop->bs_op;
+    // Reads and writes return their length on success, everything else returns 0
+    int expected = 0;
+    if (bs_op->opcode == BS_OP_READ || bs_op->opcode == BS_OP_WRITE)
+    {
+        expected = bs_op->len;
+    }
+    if (bs_op->opcode != BS_OP_READ && bs_op->retval != expected)
+    {
+        // die
+        throw std::runtime_error(
+            "local blockstore modification failed (opcode = "+std::to_string(bs_op->opcode)+
+            " retval = "+std::to_string(bs_op->retval)+")"
+        );
+    }
+    add_bs_subop_stats(subop);
+    // Save everything needed from bs_op before it is freed
+    uint64_t osd_opcode = bs_op_to_osd_op[bs_op->opcode];
+    int bs_retval = bs_op->retval;
+    uint64_t bs_version = bs_op->version;
+    delete bs_op;
+    subop->bs_op = NULL;
+    handle_primary_subop(osd_opcode, parent_op, bs_retval, expected, bs_version);
+}
+
+// Account a finished local blockstore subop in the per-opcode statistics
+// (operation count, total latency in microseconds, bytes for reads and writes).
+void osd_t::add_bs_subop_stats(osd_op_t *subop)
+{
+    // Include local blockstore ops in statistics
+    uint64_t opcode = bs_op_to_osd_op[subop->bs_op->opcode];
+    timespec tv_end;
+    clock_gettime(CLOCK_REALTIME, &tv_end);
+    auto & op_stats = c_cli.stats;
+    op_stats.op_stat_count[opcode]++;
+    if (!op_stats.op_stat_count[opcode])
+    {
+        // The counter wrapped around to zero: restart the statistics for this opcode
+        op_stats.op_stat_count[opcode] = 1;
+        op_stats.op_stat_sum[opcode] = 0;
+        op_stats.op_stat_bytes[opcode] = 0;
+    }
+    auto elapsed_us = (
+        (tv_end.tv_sec - subop->tv_begin.tv_sec)*1000000 +
+        (tv_end.tv_nsec - subop->tv_begin.tv_nsec)/1000
+    );
+    op_stats.op_stat_sum[opcode] += elapsed_us;
+    if (opcode != OSD_OP_SECONDARY_READ && opcode != OSD_OP_SECONDARY_WRITE)
+    {
+        return;
+    }
+    op_stats.op_stat_bytes[opcode] += subop->bs_op->len;
+}
+
+// Account the result of one subop of a primary operation.
+// - opcode: secondary opcode of the finished subop
+// - cur_op: parent operation
+// - retval / expected: actual and expected return values; a mismatch counts as an error
+// - version: object version reported by the subop (used for reads and writes)
+// When all subops have finished, frees the subop array, advances the state
+// of the parent operation and resumes it.
+void osd_t::handle_primary_subop(uint64_t opcode, osd_op_t *cur_op, int retval, int expected, uint64_t version)
+{
+    osd_primary_op_data_t *op_data = cur_op->op_data;
+    if (retval == expected)
+    {
+        op_data->done++;
+        bool has_version = (opcode == OSD_OP_SECONDARY_READ || opcode == OSD_OP_SECONDARY_WRITE);
+        if (has_version)
+        {
+            // All read/write subops of one operation must report the same version
+            if (op_data->fact_ver != 0 && op_data->fact_ver != version)
+            {
+                throw std::runtime_error(
+                    "different fact_versions returned from "+std::string(osd_op_names[opcode])+
+                    " subops: "+std::to_string(version)+" vs "+std::to_string(op_data->fact_ver)
+                );
+            }
+            op_data->fact_ver = version;
+        }
+    }
+    else
+    {
+        printf("%s subop failed: retval = %d (expected %d)\n", osd_op_names[opcode], retval, expected);
+        if (retval == -EPIPE)
+        {
+            op_data->epipe++;
+        }
+        op_data->errors++;
+    }
+    if ((op_data->errors + op_data->done) < op_data->n_subops)
+    {
+        // Some subops are still running
+        return;
+    }
+    delete[] op_data->subops;
+    op_data->subops = NULL;
+    op_data->st++;
+    // Resume the parent operation according to its own opcode
+    auto parent_opcode = cur_op->req.hdr.opcode;
+    if (parent_opcode == OSD_OP_READ)
+    {
+        continue_primary_read(cur_op);
+    }
+    else if (parent_opcode == OSD_OP_WRITE)
+    {
+        continue_primary_write(cur_op);
+    }
+    else if (parent_opcode == OSD_OP_SYNC)
+    {
+        continue_primary_sync(cur_op);
+    }
+    else if (parent_opcode == OSD_OP_DELETE)
+    {
+        continue_primary_del(cur_op);
+    }
+    else
+    {
+        throw std::runtime_error("BUG: unknown opcode");
+    }
+}
+
+// Submit delete subops for chunks of an object.
+// - cur_op: parent operation; its op_data receives the subop array and counters
+// - cur_set: OSD number for each role, or NULL
+// - loc_set: chunks (role + OSD number) where the object is currently stored
+// A chunk is deleted when cur_set is NULL or when its OSD differs from cur_set[role].
+// If there is nothing to delete, n_subops is set to 0 and no subop is submitted,
+// so no completion callback will fire.
+void osd_t::submit_primary_del_subops(osd_op_t *cur_op, uint64_t *cur_set, pg_osd_set_t & loc_set)
+{
+    osd_primary_op_data_t *op_data = cur_op->op_data;
+    // First pass: count the chunks to delete
+    int extra_chunks = 0;
+    for (auto & chunk: loc_set)
+    {
+        if (!cur_set || chunk.osd_num != cur_set[chunk.role])
+        {
+            extra_chunks++;
+        }
+    }
+    op_data->n_subops = extra_chunks;
+    op_data->done = op_data->errors = 0;
+    if (!extra_chunks)
+    {
+        return;
+    }
+    osd_op_t *subops = new osd_op_t[extra_chunks];
+    op_data->subops = subops;
+    // Second pass: submit one delete per chunk selected above
+    int i = 0;
+    for (auto & chunk: loc_set)
+    {
+        if (!cur_set || chunk.osd_num != cur_set[chunk.role])
+        {
+            if (chunk.osd_num == this->osd_num)
+            {
+                // Local chunk: delete it through the blockstore;
+                // op_type carries the parent op pointer for handle_primary_bs_subop()
+                clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
+                subops[i].op_type = (uint64_t)cur_op;
+                subops[i].bs_op = new blockstore_op_t({
+                    .opcode = BS_OP_DELETE,
+                    .callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
+                    {
+                        handle_primary_bs_subop(subop);
+                    },
+                    .oid = {
+                        .inode = op_data->oid.inode,
+                        .stripe = op_data->oid.stripe | chunk.role,
+                    },
+                    // Same version as write
+                    .version = op_data->fact_ver,
+                });
+                bs->enqueue_op(subops[i].bs_op);
+            }
+            else
+            {
+                // Remote chunk: send OSD_OP_SECONDARY_DELETE to the peer OSD
+                subops[i].op_type = OSD_OP_OUT;
+                subops[i].send_list.push_back(subops[i].req.buf, OSD_PACKET_SIZE);
+                subops[i].peer_fd = c_cli.osd_peer_fds.at(chunk.osd_num);
+                subops[i].req.sec_del = {
+                    .header = {
+                        .magic = SECONDARY_OSD_OP_MAGIC,
+                        .id = c_cli.next_subop_id++,
+                        .opcode = OSD_OP_SECONDARY_DELETE,
+                    },
+                    .oid = {
+                        .inode = op_data->oid.inode,
+                        .stripe = op_data->oid.stripe | chunk.role,
+                    },
+                    // Same version as write
+                    .version = op_data->fact_ver,
+                };
+                subops[i].callback = [cur_op, this](osd_op_t *subop)
+                {
+                    // Remember the failed peer first: handle_primary_subop() frees the
+                    // subop array (and thus *subop) when the last subop completes
+                    int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
+                    handle_primary_subop(OSD_OP_SECONDARY_DELETE, cur_op, subop->reply.hdr.retval, 0, 0);
+                    if (fail_fd >= 0)
+                    {
+                        // delete operation failed, drop the connection
+                        c_cli.stop_client(fail_fd);
+                    }
+                };
+                c_cli.outbox_push(&subops[i]);
+            }
+            i++;
+        }
+    }
+}
+
+// Submit one SYNC subop to every OSD listed in op_data->unstable_write_osds:
+// BS_OP_SYNC to the blockstore for the local OSD, OSD_OP_SECONDARY_SYNC for peers.
+// Completion is reported to handle_primary_subop() with an expected return value of 0.
+void osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
+{
+    osd_primary_op_data_t *op_data = cur_op->op_data;
+    int n_osds = op_data->unstable_write_osds->size();
+    osd_op_t *subops = new osd_op_t[n_osds];
+    op_data->done = op_data->errors = 0;
+    op_data->n_subops = n_osds;
+    op_data->subops = subops;
+    for (int i = 0; i < n_osds; i++)
+    {
+        osd_num_t sync_osd = (*(op_data->unstable_write_osds))[i].osd_num;
+        if (sync_osd == this->osd_num)
+        {
+            // Local sync; op_type carries the parent op pointer for handle_primary_bs_subop()
+            clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
+            subops[i].op_type = (uint64_t)cur_op;
+            subops[i].bs_op = new blockstore_op_t({
+                .opcode = BS_OP_SYNC,
+                .callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
+                {
+                    handle_primary_bs_subop(subop);
+                },
+            });
+            bs->enqueue_op(subops[i].bs_op);
+        }
+        else
+        {
+            // Remote sync: only the request header is sent
+            subops[i].op_type = OSD_OP_OUT;
+            subops[i].send_list.push_back(subops[i].req.buf, OSD_PACKET_SIZE);
+            subops[i].peer_fd = c_cli.osd_peer_fds.at(sync_osd);
+            subops[i].req.sec_sync = {
+                .header = {
+                    .magic = SECONDARY_OSD_OP_MAGIC,
+                    .id = c_cli.next_subop_id++,
+                    .opcode = OSD_OP_SECONDARY_SYNC,
+                },
+            };
+            subops[i].callback = [cur_op, this](osd_op_t *subop)
+            {
+                // Remember the failed peer first: handle_primary_subop() frees the
+                // subop array (and thus *subop) when the last subop completes
+                int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
+                handle_primary_subop(OSD_OP_SECONDARY_SYNC, cur_op, subop->reply.hdr.retval, 0, 0);
+                if (fail_fd >= 0)
+                {
+                    // sync operation failed, drop the connection
+                    c_cli.stop_client(fail_fd);
+                }
+            };
+            c_cli.outbox_push(&subops[i]);
+        }
+    }
+}
+
+// Submit one STABILIZE subop to every OSD listed in op_data->unstable_write_osds.
+// Each OSD receives its own slice [start, start+len) of op_data->unstable_writes:
+// BS_OP_STABLE to the blockstore for the local OSD, OSD_OP_SECONDARY_STABILIZE for peers.
+// Completion is reported to handle_primary_subop() with an expected return value of 0.
+void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
+{
+    osd_primary_op_data_t *op_data = cur_op->op_data;
+    int n_osds = op_data->unstable_write_osds->size();
+    osd_op_t *subops = new osd_op_t[n_osds];
+    op_data->done = op_data->errors = 0;
+    op_data->n_subops = n_osds;
+    op_data->subops = subops;
+    for (int i = 0; i < n_osds; i++)
+    {
+        auto & stab_osd = (*(op_data->unstable_write_osds))[i];
+        if (stab_osd.osd_num == this->osd_num)
+        {
+            // Local stabilize; op_type carries the parent op pointer for handle_primary_bs_subop()
+            clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
+            subops[i].op_type = (uint64_t)cur_op;
+            subops[i].bs_op = new blockstore_op_t({
+                .opcode = BS_OP_STABLE,
+                .callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
+                {
+                    handle_primary_bs_subop(subop);
+                },
+                // Here len is the number of obj_ver_id entries, not a byte count
+                .len = (uint32_t)stab_osd.len,
+                .buf = (void*)(op_data->unstable_writes + stab_osd.start),
+            });
+            bs->enqueue_op(subops[i].bs_op);
+        }
+        else
+        {
+            // Remote stabilize: request header followed by the obj_ver_id slice
+            subops[i].op_type = OSD_OP_OUT;
+            subops[i].send_list.push_back(subops[i].req.buf, OSD_PACKET_SIZE);
+            subops[i].peer_fd = c_cli.osd_peer_fds.at(stab_osd.osd_num);
+            subops[i].req.sec_stab = {
+                .header = {
+                    .magic = SECONDARY_OSD_OP_MAGIC,
+                    .id = c_cli.next_subop_id++,
+                    .opcode = OSD_OP_SECONDARY_STABILIZE,
+                },
+                // Here len is the payload size in bytes
+                .len = (uint64_t)(stab_osd.len * sizeof(obj_ver_id)),
+            };
+            subops[i].send_list.push_back(op_data->unstable_writes + stab_osd.start, stab_osd.len * sizeof(obj_ver_id));
+            subops[i].callback = [cur_op, this](osd_op_t *subop)
+            {
+                // Remember the failed peer first: handle_primary_subop() frees the
+                // subop array (and thus *subop) when the last subop completes
+                int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
+                handle_primary_subop(OSD_OP_SECONDARY_STABILIZE, cur_op, subop->reply.hdr.retval, 0, 0);
+                if (fail_fd >= 0)
+                {
+                    // stabilize operation failed, drop the connection
+                    c_cli.stop_client(fail_fd);
+                }
+            };
+            c_cli.outbox_push(&subops[i]);
+        }
+    }
+}
+
+// Finish every operation queued for object `oid` in the PG write queue
+// with the given return value and remove them from the queue.
+void osd_t::pg_cancel_write_queue(pg_t & pg, object_id oid, int retval)
+{
+    auto first = pg.write_queue.find(oid);
+    auto last = first;
+    // Walk over the run of entries with the same object ID
+    for (; last != pg.write_queue.end() && last->first == oid; last++)
+    {
+        finish_op(last->second, retval);
+    }
+    if (first != last)
+    {
+        pg.write_queue.erase(first, last);
+    }
+}
--- a/osd_receive.cpp
+++ b/osd_receive.cpp
@@ -1,11 +1,16 @@
-#include "osd.h"
+#include "cluster_client.h"

-void osd_t::read_requests()
+void cluster_client_t::read_requests()
 {
    for (int i = 0; i < read_ready_clients.size(); i++)
    {
        int peer_fd = read_ready_clients[i];
        auto & cl = clients[peer_fd];
+        {
+            timespec now;
+            clock_gettime(CLOCK_REALTIME, &now);
+            printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
+        }
        io_uring_sqe* sqe = ringloop->get_sqe();
        if (!sqe)
        {
@@ -13,22 +18,16 @@ void osd_t::read_requests()
            return;
        }
        ring_data_t* data = ((ring_data_t*)sqe->user_data);
-        if (!cl.read_buf)
+        if (!cl.read_op || cl.read_remaining < receive_buffer_size)
        {
-            // no reads in progress
-            // so this is either a new command or a reply to a previously sent command
-            if (!cl.read_op)
-            {
-                cl.read_op = new osd_op_t;
-                cl.read_op->peer_fd = peer_fd;
-            }
-            cl.read_op->op_type = OSD_OP_IN;
-            cl.read_buf = &cl.read_op->req.buf;
-            cl.read_remaining = OSD_PACKET_SIZE;
-            cl.read_state = CL_READ_OP;
+            cl.read_iov.iov_base = cl.in_buf;
+            cl.read_iov.iov_len = receive_buffer_size;
+        }
+        else
+        {
+            cl.read_iov.iov_base = cl.read_buf;
+            cl.read_iov.iov_len = cl.read_remaining;
        }
-        cl.read_iov.iov_base = cl.read_buf;
-        cl.read_iov.iov_len = cl.read_remaining;
        cl.read_msg.msg_iov = &cl.read_iov;
        cl.read_msg.msg_iovlen = 1;
        data->callback = [this, peer_fd](ring_data_t *data) { handle_read(data, peer_fd); };
@@ -37,107 +36,164 @@ void osd_t::read_requests()
    read_ready_clients.clear();
 }

-void osd_t::handle_read(ring_data_t *data, int peer_fd)
+// Completion callback for a read submitted by read_requests() for client <peer_fd>.
+// data->res is the read result: a byte count, or a negated errno value on failure.
+// Does nothing if the client is no longer present in <clients>.
+void cluster_client_t::handle_read(ring_data_t *data, int peer_fd)
 {
    auto cl_it = clients.find(peer_fd);
    if (cl_it != clients.end())
    {
        auto & cl = cl_it->second;
-        if (data->res == -EAGAIN)
-        {
-            cl.read_ready--;
-            if (cl.read_ready > 0)
-                read_ready_clients.push_back(peer_fd);
-            return;
-        }
-        else if (data->res < 0)
+        if (data->res < 0 && data->res != -EAGAIN)
        {
            // this is a client socket, so don't panic. just disconnect it
            printf("Client %d socket read error: %d (%s). Disconnecting client\n", peer_fd, -data->res, strerror(-data->res));
            stop_client(peer_fd);
            return;
        }
-        read_ready_clients.push_back(peer_fd);
+        // EAGAIN, or a read into cl.in_buf that returned fewer than receive_buffer_size bytes:
+        // consume one readiness event and requeue the client only if more events are pending.
+        // Otherwise requeue the client unconditionally to continue reading.
+        if (data->res == -EAGAIN || cl.read_iov.iov_base == cl.in_buf && data->res < receive_buffer_size)
+        {
+            cl.read_ready--;
+            if (cl.read_ready > 0)
+                read_ready_clients.push_back(peer_fd);
+        }
+        else
+        {
+            read_ready_clients.push_back(peer_fd);
+        }
+        if (data->res == -EAGAIN)
+        {
+            return;
+        }
        if (data->res > 0)
        {
-            cl.read_remaining -= data->res;
-            cl.read_buf += data->res;
-            if (cl.read_remaining <= 0)
+            if (cl.read_iov.iov_base == cl.in_buf)
            {
-                cl.read_buf = NULL;
-                if (cl.read_state == CL_READ_OP)
+                // Compose operation(s) from the buffer
+                int remain = data->res;
+                void *curbuf = cl.in_buf;
+                while (remain > 0)
                {
-                    if (cl.read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC)
+                    // No operation is being read: start a new one and expect a packet header first
+                    if (!cl.read_op)
                    {
-                        handle_reply_hdr(&cl);
+                        cl.read_op = new osd_op_t;
+                        cl.read_op->peer_fd = peer_fd;
+                        cl.read_op->op_type = OSD_OP_IN;
+                        cl.read_buf = cl.read_op->req.buf;
+                        cl.read_remaining = OSD_PACKET_SIZE;
+                        cl.read_state = CL_READ_HDR;
+                    }
+                    // The buffer ends before the current chunk is complete: copy what is available
+                    if (cl.read_remaining > remain)
+                    {
+                        memcpy(cl.read_buf, curbuf, remain);
+                        cl.read_remaining -= remain;
+                        cl.read_buf += remain;
+                        remain = 0;
+                        // NOTE(review): read_remaining was greater than remain before the subtraction,
+                        // so this check looks unreachable - confirm
+                        if (cl.read_remaining <= 0)
+                            handle_finished_read(cl);
                    }
                    else
                    {
-                        handle_op_hdr(&cl);
+                        // The buffer holds the rest of the current chunk: complete it
+                        // and let the loop continue with the remaining bytes
+                        memcpy(cl.read_buf, curbuf, cl.read_remaining);
+                        curbuf += cl.read_remaining;
+                        remain -= cl.read_remaining;
+                        cl.read_remaining = 0;
+                        cl.read_buf = NULL;
+                        handle_finished_read(cl);
                    }
                }
-                else if (cl.read_state == CL_READ_DATA)
+            }
+            else
+            {
+                // Long data
+                // (read_requests() pointed the iovec at cl.read_buf, so only the position is advanced)
+                cl.read_remaining -= data->res;
+                cl.read_buf += data->res;
+                if (cl.read_remaining <= 0)
                {
-                    // Operation is ready
-                    exec_op(cl.read_op);
-                    cl.read_op = NULL;
-                    cl.read_state = 0;
-                }
-                else if (cl.read_state == CL_READ_REPLY_DATA)
-                {
-                    // Reply is ready
-                    auto req_it = cl.sent_ops.find(cl.read_reply_id);
-                    osd_op_t *request = req_it->second;
-                    cl.sent_ops.erase(req_it);
-                    cl.read_reply_id = 0;
-                    cl.read_state = 0;
-                    // Measure subop latency
-                    timespec tv_end;
-                    clock_gettime(CLOCK_REALTIME, &tv_end);
-                    subop_stat_count[request->req.hdr.opcode]++;
-                    subop_stat_sum[request->req.hdr.opcode] += (
-                        (tv_end.tv_sec - request->tv_begin.tv_sec)*1000000 +
-                        (tv_end.tv_nsec - request->tv_begin.tv_nsec)/1000
-                    );
-                    request->callback(request);
+                    handle_finished_read(cl);
                }
            }
        }
    }
 }

-void osd_t::handle_op_hdr(osd_client_t *cl)
+void cluster_client_t::handle_finished_read(osd_client_t & cl)
+{
+    // Called when the chunk currently expected from the client has been received completely.
+    // What happens next depends on what was being read (cl.read_state).
+    if (cl.read_state == CL_READ_HDR)
+    {
+        // A packet header: either a reply to one of our requests or a new incoming operation
+        const bool is_reply = cl.read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC;
+        if (is_reply)
+            handle_reply_hdr(&cl);
+        else
+            handle_op_hdr(&cl);
+        return;
+    }
+    if (cl.read_state == CL_READ_DATA)
+    {
+        // Payload of an incoming operation: hand the operation over for execution
+        exec_op(cl.read_op);
+        cl.read_op = NULL;
+        cl.read_state = 0;
+        return;
+    }
+    if (cl.read_state == CL_READ_REPLY_DATA)
+    {
+        // Payload of a reply: detach the original request and reset the read state
+        auto sent_it = cl.sent_ops.find(cl.read_reply_id);
+        osd_op_t *sent_op = sent_it->second;
+        cl.sent_ops.erase(sent_it);
+        cl.read_reply_id = 0;
+        delete cl.read_op;
+        cl.read_op = NULL;
+        cl.read_state = 0;
+        // Account subop latency (microseconds) per opcode
+        timespec finished_at;
+        clock_gettime(CLOCK_REALTIME, &finished_at);
+        auto & stat_count = stats.subop_stat_count[sent_op->req.hdr.opcode];
+        auto & stat_sum = stats.subop_stat_sum[sent_op->req.hdr.opcode];
+        stat_count++;
+        if (!stat_count)
+        {
+            // The counter wrapped around to zero: restart the statistics from this sample
+            stat_count++;
+            stat_sum = 0;
+        }
+        stat_sum += (
+            (finished_at.tv_sec - sent_op->tv_begin.tv_sec)*1000000 +
+            (finished_at.tv_nsec - sent_op->tv_begin.tv_nsec)/1000
+        );
+        sent_op->callback(sent_op);
+        return;
+    }
+    // Unknown read state
+    assert(0);
+}
+
+void cluster_client_t::handle_op_hdr(osd_client_t *cl)
 {
    osd_op_t *cur_op = cl->read_op;
    if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_READ)
    {
        if (cur_op->req.sec_rw.len > 0)
-            cur_op->buf = memalign(512, cur_op->req.sec_rw.len);
+            cur_op->buf = memalign(MEM_ALIGNMENT, cur_op->req.sec_rw.len);
        cl->read_remaining = 0;
    }
    else if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
    {
        if (cur_op->req.sec_rw.len > 0)
-            cur_op->buf = memalign(512, cur_op->req.sec_rw.len);
+            cur_op->buf = memalign(MEM_ALIGNMENT, cur_op->req.sec_rw.len);
        cl->read_remaining = cur_op->req.sec_rw.len;
    }
    else if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_STABILIZE ||
        cur_op->req.hdr.opcode == OSD_OP_SECONDARY_ROLLBACK)
    {
        if (cur_op->req.sec_stab.len > 0)
-            cur_op->buf = memalign(512, cur_op->req.sec_stab.len);
+            cur_op->buf = memalign(MEM_ALIGNMENT, cur_op->req.sec_stab.len);
        cl->read_remaining = cur_op->req.sec_stab.len;
    }
    else if (cur_op->req.hdr.opcode == OSD_OP_READ)
    {
        if (cur_op->req.rw.len > 0)
-            cur_op->buf = memalign(512, cur_op->req.rw.len);
+            cur_op->buf = memalign(MEM_ALIGNMENT, cur_op->req.rw.len);
        cl->read_remaining = 0;
    }
    else if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
    {
        if (cur_op->req.rw.len > 0)
-            cur_op->buf = memalign(512, cur_op->req.rw.len);
+            cur_op->buf = memalign(MEM_ALIGNMENT, cur_op->req.rw.len);
        cl->read_remaining = cur_op->req.rw.len;
    }
    if (cl->read_remaining > 0)
@@ -155,7 +211,7 @@ void osd_t::handle_op_hdr(osd_client_t *cl)
    }
 }

-void osd_t::handle_reply_hdr(osd_client_t *cl)
+void cluster_client_t::handle_reply_hdr(osd_client_t *cl)
 {
    osd_op_t *cur_op = cl->read_op;
    auto req_it = cl->sent_ops.find(cur_op->req.hdr.id);
@@ -181,24 +237,41 @@ void osd_t::handle_reply_hdr(osd_client_t *cl)
    else if (op->reply.hdr.opcode == OSD_OP_SECONDARY_LIST &&
        op->reply.hdr.retval > 0)
    {
-        op->buf = memalign(512, sizeof(obj_ver_id) * op->reply.hdr.retval);
+        op->buf = memalign(MEM_ALIGNMENT, sizeof(obj_ver_id) * op->reply.hdr.retval);
        cl->read_state = CL_READ_REPLY_DATA;
        cl->read_reply_id = op->req.hdr.id;
        cl->read_buf = op->buf;
        cl->read_remaining = sizeof(obj_ver_id) * op->reply.hdr.retval;
    }
+    else if (op->reply.hdr.opcode == OSD_OP_SHOW_CONFIG &&
+        op->reply.hdr.retval > 0)
+    {
+        op->buf = malloc(op->reply.hdr.retval);
+        cl->read_state = CL_READ_REPLY_DATA;
+        cl->read_reply_id = op->req.hdr.id;
+        cl->read_buf = op->buf;
+        cl->read_remaining = op->reply.hdr.retval;
+    }
    else
    {
+        delete cl->read_op;
        cl->read_state = 0;
+        cl->read_op = NULL;
        cl->sent_ops.erase(req_it);
        // Measure subop latency
        timespec tv_end;
        clock_gettime(CLOCK_REALTIME, &tv_end);
-        subop_stat_count[op->req.hdr.opcode]++;
-        subop_stat_sum[op->req.hdr.opcode] += (
+        stats.subop_stat_count[op->req.hdr.opcode]++;
+        if (!stats.subop_stat_count[op->req.hdr.opcode])
+        {
+            stats.subop_stat_count[op->req.hdr.opcode]++;
+            stats.subop_stat_sum[op->req.hdr.opcode] = 0;
+        }
+        stats.subop_stat_sum[op->req.hdr.opcode] += (
            (tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
            (tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000
        );
-        op->callback(op);
+        // Copy lambda to be unaffected by `delete op`
+        std::function<void(osd_op_t*)>(op->callback)(op);
    }
 }
--- a/osd_rmw.cpp
+++ b/osd_rmw.cpp
@@ -1,4 +1,5 @@
 #include <malloc.h>
+#include <string.h>
 #include <assert.h>
 #include "xor.h"
 #include "osd_rmw.h"
@@ -55,6 +56,11 @@ static inline void cover_read(uint32_t start, uint32_t end, osd_rmw_stripe_t & s

 void split_stripes(uint64_t pg_minsize, uint32_t bs_block_size, uint32_t start, uint32_t end, osd_rmw_stripe_t *stripes)
 {
+    if (end == 0)
+    {
+        // Zero length request - offset doesn't matter
+        return;
+    }
    end = start+end;
    for (int role = 0; role < pg_minsize; role++)
    {
@@ -79,18 +85,21 @@ void reconstruct_stripe(osd_rmw_stripe_t *stripes, int pg_size, int role)
            }
            else if (prev >= 0)
            {
+                assert(stripes[role].read_start >= stripes[prev].read_start &&
+                    stripes[role].read_start >= stripes[other].read_start);
                memxor(
-                    stripes[prev].read_buf + (stripes[prev].read_start - stripes[role].read_start),
-                    stripes[other].read_buf + (stripes[other].read_start - stripes[other].read_start),
+                    stripes[prev].read_buf + (stripes[role].read_start - stripes[prev].read_start),
+                    stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start),
                    stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start
                );
                prev = -1;
            }
            else
            {
+                assert(stripes[role].read_start >= stripes[other].read_start);
                memxor(
                    stripes[role].read_buf,
-                    stripes[other].read_buf + (stripes[other].read_start - stripes[role].read_start),
+                    stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start),
                    stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start
                );
            }
@@ -156,10 +165,11 @@ void* alloc_read_buffer(osd_rmw_stripe_t *stripes, int read_pg_size, uint64_t ad
    return buf;
 }

-void* calc_rmw_reads(void *write_buf, osd_rmw_stripe_t *stripes, uint64_t *osd_set, uint64_t pg_size, uint64_t pg_minsize, uint64_t pg_cursize)
+void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_set,
+    uint64_t pg_size, uint64_t pg_minsize, uint64_t pg_cursize, uint64_t *write_osd_set, uint64_t chunk_size)
 {
    // Generic parity modification (read-modify-write) algorithm
-    // Reconstruct -> Read -> Calc parity -> Write
+    // Read -> Reconstruct missing chunks -> Calc parity chunks -> Write
    // Now we always read continuous ranges. This means that an update of the beginning
    // of one data stripe and the end of another will lead to a read of full paired stripes.
    // FIXME: (Maybe) read small individual ranges in that case instead.
@@ -174,64 +184,90 @@ void* calc_rmw_reads(void *write_buf, osd_rmw_stripe_t *stripes, uint64_t *osd_s
            stripes[role].write_end = stripes[role].req_end;
        }
    }
-    for (int role = 0; role < pg_minsize; role++)
-    {
-        cover_read(start, end, stripes[role]);
-    }
-    int has_parity = 0;
+    int write_parity = 0;
    for (int role = pg_minsize; role < pg_size; role++)
    {
-        if (osd_set[role] != 0)
+        if (write_osd_set[role] != 0)
        {
-            has_parity++;
+            write_parity = 1;
            stripes[role].write_start = start;
            stripes[role].write_end = end;
        }
-        else
-            stripes[role].missing = true;
+    }
+    if (write_parity)
+    {
+        for (int role = 0; role < pg_minsize; role++)
+        {
+            cover_read(start, end, stripes[role]);
+        }
+    }
+    if (write_osd_set != read_osd_set)
+    {
+        pg_cursize = 0;
+        // Object is degraded/misplaced and will be moved to <write_osd_set>
+        for (int role = 0; role < pg_size; role++)
+        {
+            if (write_osd_set[role] != read_osd_set[role])
+            {
+                // FIXME: For EC more than 2+1: handle case when write_osd_set == 0 and read_osd_set != 0
+                // We need to get data for any moved / recovered chunk
+                // And we need a continuous write buffer so we'll only optimize
+                // for the case when the whole chunk is overwritten in the request
+                if (stripes[role].req_start != 0 ||
+                    stripes[role].req_end != chunk_size)
+                {
+                    stripes[role].read_start = 0;
+                    stripes[role].read_end = chunk_size;
+                    // Warning: We don't modify write_start/write_end here, we do it in calc_rmw_parity()
+                }
+            }
+            if (read_osd_set[role] != 0)
+            {
+                pg_cursize++;
+            }
+        }
    }
    if (pg_cursize < pg_size)
    {
-        if (has_parity == 0)
+        // Some stripe(s) are missing, so we need to read parity
+        for (int role = 0; role < pg_size; role++)
        {
-            // Parity is missing, we don't need to read anything
-            for (int role = 0; role < pg_minsize; role++)
+            if (read_osd_set[role] == 0)
            {
-                stripes[role].read_end = 0;
-            }
-        }
-        else
-        {
-            // Other stripe(s) are missing
-            for (int role = 0; role < pg_minsize; role++)
-            {
-                if (osd_set[role] == 0 && stripes[role].read_end != 0)
+                stripes[role].missing = true;
+                if (stripes[role].read_end != 0)
                {
-                    stripes[role].missing = true;
-                    for (int r2 = 0; r2 < pg_size; r2++)
+                    int found = 0;
+                    for (int r2 = 0; r2 < pg_size && found < pg_minsize; r2++)
                    {
-                        // Read the non-covered range of <role> from all other stripes to reconstruct it
-                        if (r2 != role && osd_set[r2] != 0)
+                        // Read the non-covered range of <role> from at least <minsize> other stripes to reconstruct it
+                        if (read_osd_set[r2] != 0)
                        {
                            extend_read(stripes[role].read_start, stripes[role].read_end, stripes[r2]);
+                            found++;
                        }
                    }
+                    if (found < pg_minsize)
+                    {
+                        // FIXME Object is incomplete - refuse partial overwrite
+                        assert(0);
+                    }
                }
            }
        }
    }
    // Allocate read buffers
-    void *rmw_buf = alloc_read_buffer(stripes, pg_size, has_parity * (end - start));
-    // Position parity & write buffers
+    void *rmw_buf = alloc_read_buffer(stripes, pg_size, (write_parity ? pg_size-pg_minsize : 0) * (end - start));
+    // Position write buffers
    uint64_t buf_pos = 0, in_pos = 0;
    for (int role = 0; role < pg_size; role++)
    {
        if (stripes[role].req_end != 0)
        {
-            stripes[role].write_buf = write_buf + in_pos;
+            stripes[role].write_buf = request_buf + in_pos;
            in_pos += stripes[role].req_end - stripes[role].req_start;
        }
-        else if (role >= pg_minsize && osd_set[role] != 0)
+        else if (role >= pg_minsize && write_osd_set[role] != 0 && end != 0)
        {
            stripes[role].write_buf = rmw_buf + buf_pos;
            buf_pos += end - start;
@@ -321,13 +357,9 @@ static void xor_multiple_buffers(buf_len_t *xor1, int n1, buf_len_t *xor2, int n
    }
 }

-void calc_rmw_parity(osd_rmw_stripe_t *stripes, int pg_size)
+void calc_rmw_parity(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size)
 {
-    if (stripes[pg_size-1].missing)
-    {
-        // Parity OSD is unavailable
-        return;
-    }
+    int pg_minsize = pg_size-1;
    for (int role = 0; role < pg_size; role++)
    {
        if (stripes[role].read_end != 0 && stripes[role].missing)
@@ -337,31 +369,82 @@ void calc_rmw_parity(osd_rmw_stripe_t *stripes, int pg_size)
            break;
        }
    }
-    // Calculate new parity (EC k+1)
-    int parity = pg_size-1, prev = -2;
-    auto wr_end = stripes[parity].write_end;
-    auto wr_start = stripes[parity].write_start;
-    for (int other = 0; other < pg_size-1; other++)
+    uint32_t start = 0, end = 0;
+    if (!stripes[pg_minsize].missing || write_osd_set != read_osd_set)
    {
-        if (prev == -2)
+        for (int role = 0; role < pg_minsize; role++)
        {
-            prev = other;
-        }
-        else
-        {
-            int n1 = 0, n2 = 0;
-            buf_len_t xor1[3], xor2[3];
-            if (prev == -1)
+            if (stripes[role].req_end != 0)
            {
-                xor1[n1++] = { .buf = stripes[parity].write_buf, .len = wr_end-wr_start };
+                start = !end || stripes[role].req_start < start ? stripes[role].req_start : start;
+                end = std::max(stripes[role].req_end, end);
+            }
+        }
+    }
+    if (write_osd_set != read_osd_set)
+    {
+        for (int role = 0; role < pg_minsize; role++)
+        {
+            if (write_osd_set[role] != read_osd_set[role] &&
+                (stripes[role].req_start != 0 || stripes[role].req_end != chunk_size))
+            {
+                // FIXME again, handle case when write_osd_set[role] is 0
+                // Copy modified chunk into the read buffer to write it back
+                memcpy(
+                    stripes[role].read_buf + stripes[role].req_start,
+                    stripes[role].write_buf,
+                    stripes[role].req_end - stripes[role].req_start
+                );
+                stripes[role].write_buf = stripes[role].read_buf;
+                stripes[role].write_start = 0;
+                stripes[role].write_end = chunk_size;
+            }
+        }
+    }
+    if (!stripes[pg_minsize].missing && end != 0)
+    {
+        // Calculate new parity (EC k+1)
+        int parity = pg_minsize, prev = -2;
+        for (int other = 0; other < pg_minsize; other++)
+        {
+            if (prev == -2)
+            {
+                prev = other;
            }
            else
            {
-                get_old_new_buffers(stripes[prev], wr_start, wr_end, xor1, n1);
-                prev = -1;
+                int n1 = 0, n2 = 0;
+                buf_len_t xor1[3], xor2[3];
+                if (prev == -1)
+                {
+                    xor1[n1++] = { .buf = stripes[parity].write_buf, .len = end-start };
+                }
+                else
+                {
+                    get_old_new_buffers(stripes[prev], start, end, xor1, n1);
+                    prev = -1;
+                }
+                get_old_new_buffers(stripes[other], start, end, xor2, n2);
+                xor_multiple_buffers(xor1, n1, xor2, n2, stripes[parity].write_buf, end-start);
+            }
+        }
+    }
+    if (write_osd_set != read_osd_set)
+    {
+        for (int role = pg_minsize; role < pg_size; role++)
+        {
+            if (write_osd_set[role] != read_osd_set[role] && (start != 0 || end != chunk_size))
+            {
+                // Copy new parity into the read buffer to write it back
+                memcpy(
+                    stripes[role].read_buf + start,
+                    stripes[role].write_buf,
+                    end - start
+                );
+                stripes[role].write_buf = stripes[role].read_buf;
+                stripes[role].write_start = 0;
+                stripes[role].write_end = chunk_size;
            }
-            get_old_new_buffers(stripes[other], wr_start, wr_end, xor2, n2);
-            xor_multiple_buffers(xor1, n1, xor2, n2, stripes[parity].write_buf, wr_end-wr_start);
        }
    }
 }
--- a/osd_rmw.h
+++ b/osd_rmw.h
@@ -31,6 +31,7 @@ int extend_missing_stripes(osd_rmw_stripe_t *stripes, osd_num_t *osd_set, int mi

 void* alloc_read_buffer(osd_rmw_stripe_t *stripes, int read_pg_size, uint64_t add_size);

-void* calc_rmw_reads(void *write_buf, osd_rmw_stripe_t *stripes, uint64_t *osd_set, uint64_t pg_size, uint64_t pg_minsize, uint64_t pg_cursize);
+void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_set,
+    uint64_t pg_size, uint64_t pg_minsize, uint64_t pg_cursize, uint64_t *write_osd_set, uint64_t chunk_size);

-void calc_rmw_parity(osd_rmw_stripe_t *stripes, int pg_size);
+void calc_rmw_parity(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size);
--- a/osd_rmw_test.cpp
+++ b/osd_rmw_test.cpp
@@ -2,16 +2,147 @@
 #include "osd_rmw.cpp"
 #include "test_pattern.h"

+void dump_stripes(osd_rmw_stripe_t *stripes, int pg_size);
+void test1();
+void test4();
+void test5();
+void test6();
+void test7();
+void test8();
+void test9();
+
+/***
+
+Cases:
+
+1. split(offset=128K-4K, len=8K)
+   = [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 0 ] ]
+
+2. read(offset=128K-4K, len=8K, osd_set=[1,0,3])
+   = { read: [ [ 0, 128K ], [ 0, 4K ], [ 0, 4K ] ] }
+
+3. cover_read(0, 128K, { req: [ 128K-4K, 128K ] })
+   = { read: [ 0, 128K-4K ] }
+
+4. write(offset=128K-4K, len=8K, osd_set=[1,0,3])
+   = {
+     read: [ [ 0, 128K ], [ 4K, 128K ], [ 4K, 128K ] ],
+     write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2, read0, read1, read2 ],
+   }
+   + check write2 buffer
+
+5. write(offset=0, len=128K+64K, osd_set=[1,0,3])
+   = {
+     req: [ [ 0, 128K ], [ 0, 64K ], [ 0, 0 ] ],
+     read: [ [ 64K, 128K ], [ 64K, 128K ], [ 64K, 128K ] ],
+     write: [ [ 0, 128K ], [ 0, 64K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2, read0, read1, read2 ],
+   }
+
+6. write(offset=0, len=128K+64K, osd_set=[1,2,3])
+   = {
+     req: [ [ 0, 128K ], [ 0, 64K ], [ 0, 0 ] ],
+     read: [ [ 0, 0 ], [ 64K, 128K ], [ 0, 0 ] ],
+     write: [ [ 0, 128K ], [ 0, 64K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2, read1 ],
+   }
+
+7. calc_rmw(offset=128K-4K, len=8K, osd_set=[1,0,3], write_set=[1,2,3])
+   = {
+     read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
+     write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2, read0, read1, read2 ],
+   }
+   then, after calc_rmw_parity(): {
+     write: [ [ 128K-4K, 128K ], [ 0, 128K ], [ 0, 128K ] ],
+     write1==read1,
+   }
+   + check write1 buffer
+   + check write2 buffer
+
+8. calc_rmw(offset=0, len=128K+4K, osd_set=[0,2,3], write_set=[1,2,3])
+   = {
+     read: [ [ 0, 0 ], [ 4K, 128K ], [ 0, 0 ] ],
+     write: [ [ 0, 128K ], [ 0, 4K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2, read1 ],
+   }
+   + check write2 buffer
+
+9. object recovery case:
+   calc_rmw(offset=0, len=0, read_osd_set=[0,2,3], write_osd_set=[1,2,3])
+   = {
+     read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
+     write: [ [ 0, 0 ], [ 0, 0 ], [ 0, 0 ] ],
+     input buffer: NULL,
+     rmw buffer: [ read0, read1, read2 ],
+   }
+   then, after calc_rmw_parity(): {
+     write: [ [ 0, 128K ], [ 0, 0 ], [ 0, 0 ] ],
+     write0==read0,
+   }
+   + check write0 buffer
+
+***/
+
 int main(int narg, char *args[])
+{
+    // Run the test cases one after another, in the order of their numbers
+    void (*const test_cases[])() = { test1, test4, test5, test6, test7, test8, test9 };
+    for (auto run_test : test_cases)
+    {
+        run_test();
+    }
+    // Reached only when every test case has returned
+    printf("all ok\n");
+    return 0;
+}
+
+void dump_stripes(osd_rmw_stripe_t *stripes, int pg_size)
+{
+    // Debug helper: prints three rows (request, read and write ranges).
+    // Each row lists start-end of every stripe, with offsets divided by 1024.
+    int role = 0;
+    printf("request");
+    while (role < pg_size)
+    {
+        const osd_rmw_stripe_t & st = stripes[role++];
+        printf(" {%uK-%uK}", st.req_start/1024, st.req_end/1024);
+    }
+    printf("\n");
+    printf("read");
+    role = 0;
+    while (role < pg_size)
+    {
+        const osd_rmw_stripe_t & st = stripes[role++];
+        printf(" {%uK-%uK}", st.read_start/1024, st.read_end/1024);
+    }
+    printf("\n");
+    printf("write");
+    role = 0;
+    while (role < pg_size)
+    {
+        const osd_rmw_stripe_t & st = stripes[role++];
+        printf(" {%uK-%uK}", st.write_start/1024, st.write_end/1024);
+    }
+    printf("\n");
+}
+
+void test1()
 {
    osd_num_t osd_set[3] = { 1, 0, 3 };
    osd_rmw_stripe_t stripes[3] = { 0 };
-    // Test 1
+    // Test 1.1
    split_stripes(2, 128*1024, 128*1024-4096, 8192, stripes);
    assert(stripes[0].req_start == 128*1024-4096 && stripes[0].req_end == 128*1024);
    assert(stripes[1].req_start == 0 && stripes[1].req_end == 4096);
    assert(stripes[2].req_end == 0);
-    // Test 2
+    // Test 1.2
    for (int i = 0; i < 3; i++)
    {
        stripes[i].read_start = stripes[i].req_start;
@@ -20,18 +151,26 @@ int main(int narg, char *args[])
    assert(extend_missing_stripes(stripes, osd_set, 2, 3) == 0);
    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
    assert(stripes[2].read_start == 0 && stripes[2].read_end == 4096);
-    // Test 3
+    // Test 1.3
    stripes[0] = { .req_start = 128*1024-4096, .req_end = 128*1024 };
    cover_read(0, 128*1024, stripes[0]);
    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024-4096);
+}
+
+void test4()
+{
+    osd_num_t osd_set[3] = { 1, 0, 3 };
+    osd_rmw_stripe_t stripes[3] = { 0 };
    // Test 4.1
-    memset(stripes, 0, sizeof(stripes));
    split_stripes(2, 128*1024, 128*1024-4096, 8192, stripes);
    void* write_buf = malloc(8192);
-    void* rmw_buf = calc_rmw_reads(write_buf, stripes, osd_set, 3, 2, 2);
+    void* rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 2, osd_set, 128*1024);
    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
    assert(stripes[1].read_start == 4096 && stripes[1].read_end == 128*1024);
    assert(stripes[2].read_start == 4096 && stripes[2].read_end == 128*1024);
+    assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
    assert(stripes[0].read_buf == rmw_buf+128*1024);
    assert(stripes[1].read_buf == rmw_buf+128*1024*2);
    assert(stripes[2].read_buf == rmw_buf+128*1024*3-4096);
@@ -43,24 +182,32 @@ int main(int narg, char *args[])
    set_pattern(stripes[0].read_buf, 128*1024, PATTERN1); // old data
    set_pattern(stripes[1].read_buf, 128*1024-4096, UINT64_MAX); // didn't read it, it's missing
    set_pattern(stripes[2].read_buf, 128*1024-4096, 0); // old parity = 0
-    calc_rmw_parity(stripes, 3);
+    calc_rmw_parity(stripes, 3, osd_set, osd_set, 128*1024);
    check_pattern(stripes[2].write_buf, 4096, PATTERN0^PATTERN1); // new parity
    check_pattern(stripes[2].write_buf+4096, 128*1024-4096*2, 0); // new parity
    check_pattern(stripes[2].write_buf+128*1024-4096, 4096, PATTERN0^PATTERN1); // new parity
    free(rmw_buf);
    free(write_buf);
+}
+
+void test5()
+{
+    osd_num_t osd_set[3] = { 1, 0, 3 };
+    osd_rmw_stripe_t stripes[3] = { 0 };
    // Test 5.1
-    memset(stripes, 0, sizeof(stripes));
    split_stripes(2, 128*1024, 0, 64*1024*3, stripes);
    assert(stripes[0].req_start == 0 && stripes[0].req_end == 128*1024);
    assert(stripes[1].req_start == 0 && stripes[1].req_end == 64*1024);
    assert(stripes[2].req_end == 0);
    // Test 5.2
-    write_buf = malloc(64*1024*3);
-    rmw_buf = calc_rmw_reads(write_buf, stripes, osd_set, 3, 2, 2);
+    void *write_buf = malloc(64*1024*3);
+    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 2, osd_set, 128*1024);
    assert(stripes[0].read_start == 64*1024 && stripes[0].read_end == 128*1024);
    assert(stripes[1].read_start == 64*1024 && stripes[1].read_end == 128*1024);
    assert(stripes[2].read_start == 64*1024 && stripes[2].read_end == 128*1024);
+    assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 64*1024);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
    assert(stripes[0].read_buf == rmw_buf+128*1024);
    assert(stripes[1].read_buf == rmw_buf+64*3*1024);
    assert(stripes[2].read_buf == rmw_buf+64*4*1024);
@@ -69,15 +216,22 @@ int main(int narg, char *args[])
    assert(stripes[2].write_buf == rmw_buf);
    free(rmw_buf);
    free(write_buf);
+}
+
+void test6()
+{
+    osd_num_t osd_set[3] = { 1, 2, 3 };
+    osd_rmw_stripe_t stripes[3] = { 0 };
    // Test 6.1
-    memset(stripes, 0, sizeof(stripes));
    split_stripes(2, 128*1024, 0, 64*1024*3, stripes);
-    osd_set[1] = 2;
-    write_buf = malloc(64*1024*3);
-    rmw_buf = calc_rmw_reads(write_buf, stripes, osd_set, 3, 2, 3);
+    void *write_buf = malloc(64*1024*3);
+    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, osd_set, 128*1024);
    assert(stripes[0].read_end == 0);
    assert(stripes[1].read_start == 64*1024 && stripes[1].read_end == 128*1024);
    assert(stripes[2].read_end == 0);
+    assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 64*1024);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
    assert(stripes[0].read_buf == 0);
    assert(stripes[1].read_buf == rmw_buf+128*1024);
    assert(stripes[2].read_buf == 0);
@@ -86,8 +240,121 @@ int main(int narg, char *args[])
    assert(stripes[2].write_buf == rmw_buf);
    free(rmw_buf);
    free(write_buf);
-    osd_set[1] = 0;
-    // End
-    printf("all ok\n");
-    return 0;
+}
+
+void test7()
+{
+    osd_num_t osd_set[3] = { 1, 0, 3 };
+    osd_num_t write_osd_set[3] = { 1, 2, 3 };
+    osd_rmw_stripe_t stripes[3] = { 0 };
+    // Test 7.1
+    split_stripes(2, 128*1024, 128*1024-4096, 8192, stripes);
+    void *write_buf = malloc(8192);
+    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 2, write_osd_set, 128*1024);
+    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
+    assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
+    assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
+    assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
+    assert(stripes[0].read_buf == rmw_buf+128*1024);
+    assert(stripes[1].read_buf == rmw_buf+128*1024*2);
+    assert(stripes[2].read_buf == rmw_buf+128*1024*3);
+    assert(stripes[0].write_buf == write_buf);
+    assert(stripes[1].write_buf == write_buf+4096);
+    assert(stripes[2].write_buf == rmw_buf);
+    // Test 7.2
+    set_pattern(write_buf, 8192, PATTERN0);
+    set_pattern(stripes[0].read_buf, 128*1024, PATTERN1); // old data
+    set_pattern(stripes[1].read_buf, 128*1024, UINT64_MAX); // didn't read it, it's missing
+    set_pattern(stripes[2].read_buf, 128*1024, 0); // old parity = 0
+    calc_rmw_parity(stripes, 3, osd_set, write_osd_set, 128*1024);
+    assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 128*1024);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
+    assert(stripes[1].write_buf == stripes[1].read_buf);
+    check_pattern(stripes[1].write_buf, 4096, PATTERN0);
+    check_pattern(stripes[1].write_buf+4096, 128*1024-4096, PATTERN1);
+    check_pattern(stripes[2].write_buf, 4096, PATTERN0^PATTERN1); // new parity
+    check_pattern(stripes[2].write_buf+4096, 128*1024-4096*2, 0); // new parity
+    check_pattern(stripes[2].write_buf+128*1024-4096, 4096, PATTERN0^PATTERN1); // new parity
+    free(rmw_buf);
+    free(write_buf);
+}
+
+void test8()
+{
+    osd_num_t osd_set[3] = { 0, 2, 3 };
+    osd_num_t write_osd_set[3] = { 1, 2, 3 };
+    osd_rmw_stripe_t stripes[3] = { 0 };
+    // Test 8.1
+    split_stripes(2, 128*1024, 0, 128*1024+4096, stripes);
+    void *write_buf = malloc(128*1024+4096);
+    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 2, write_osd_set, 128*1024);
+    assert(stripes[0].read_start == 0 && stripes[0].read_end == 0);
+    assert(stripes[1].read_start == 4096 && stripes[1].read_end == 128*1024);
+    assert(stripes[2].read_start == 0 && stripes[2].read_end == 0);
+    assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
+    assert(stripes[0].read_buf == NULL);
+    assert(stripes[1].read_buf == rmw_buf+128*1024);
+    assert(stripes[2].read_buf == NULL);
+    assert(stripes[0].write_buf == write_buf);
+    assert(stripes[1].write_buf == write_buf+128*1024);
+    assert(stripes[2].write_buf == rmw_buf);
+    // Test 8.2
+    set_pattern(write_buf, 128*1024+4096, PATTERN0);
+    set_pattern(stripes[1].read_buf, 128*1024-4096, PATTERN1);
+    calc_rmw_parity(stripes, 3, osd_set, write_osd_set, 128*1024);
+    assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024); // recheck again
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);     // recheck again
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024); // recheck again
+    assert(stripes[0].write_buf == write_buf);                               // recheck again
+    assert(stripes[1].write_buf == write_buf+128*1024);                      // recheck again
+    assert(stripes[2].write_buf == rmw_buf);                                 // recheck again
+    check_pattern(stripes[2].write_buf, 4096, 0); // new parity
+    check_pattern(stripes[2].write_buf+4096, 128*1024-4096, PATTERN0^PATTERN1); // new parity
+    free(rmw_buf);
+    free(write_buf);
+}
+
+void test9()
+{
+    osd_num_t osd_set[3] = { 0, 2, 3 }; // slot 0 is 0 here but 1 in write_osd_set
+    osd_num_t write_osd_set[3] = { 1, 2, 3 };
+    osd_rmw_stripe_t stripes[3] = { 0 };
+    // Test 9.0: zero-length request at offset 64K, every req range must stay empty
+    split_stripes(2, 128*1024, 64*1024, 0, stripes);
+    assert(stripes[0].req_start == 0 && stripes[0].req_end == 0);
+    assert(stripes[1].req_start == 0 && stripes[1].req_end == 0);
+    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
+    // Test 9.1: NULL write buffer; all three chunks are expected to be read in full and none written
+    void *write_buf = NULL;
+    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024);
+    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
+    assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
+    assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
+    assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 0);
+    assert(stripes[0].read_buf == rmw_buf);
+    assert(stripes[1].read_buf == rmw_buf+128*1024);
+    assert(stripes[2].read_buf == rmw_buf+128*1024*2);
+    assert(stripes[0].write_buf == NULL);
+    assert(stripes[1].write_buf == NULL);
+    assert(stripes[2].write_buf == NULL);
+    // Test 9.2: after calc_rmw_parity only stripe 0 gets a write range, backed by rmw_buf
+    set_pattern(stripes[1].read_buf, 128*1024, 0);
+    set_pattern(stripes[2].read_buf, 128*1024, PATTERN1);
+    calc_rmw_parity(stripes, 3, osd_set, write_osd_set, 128*1024);
+    assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 0);
+    assert(stripes[0].write_buf == rmw_buf);
+    assert(stripes[1].write_buf == NULL);
+    assert(stripes[2].write_buf == NULL);
+    check_pattern(stripes[0].read_buf, 128*1024, PATTERN1);
+    check_pattern(stripes[0].write_buf, 128*1024, PATTERN1);
+    free(rmw_buf);
 }
--- a/osd_secondary.cpp
+++ b/osd_secondary.cpp
@@ -4,45 +4,34 @@

 void osd_t::secondary_op_callback(osd_op_t *op)
 {
-    inflight_ops--;
-    auto cl_it = clients.find(op->peer_fd);
-    if (cl_it != clients.end())
+    if (op->req.hdr.opcode == OSD_OP_SECONDARY_READ ||
+        op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
    {
-        op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
-        op->reply.hdr.id = op->req.hdr.id;
-        op->reply.hdr.opcode = op->req.hdr.opcode;
-        op->reply.hdr.retval = op->bs_op->retval;
-        if (op->req.hdr.opcode == OSD_OP_SECONDARY_READ ||
-            op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
-        {
-            op->reply.sec_rw.version = op->bs_op->version;
-        }
-        else if (op->req.hdr.opcode == OSD_OP_SECONDARY_DELETE)
-        {
-            op->reply.sec_del.version = op->bs_op->version;
-        }
-        if (op->req.hdr.opcode == OSD_OP_SECONDARY_READ &&
-            op->reply.hdr.retval > 0)
-        {
-            op->send_list.push_back(op->buf, op->reply.hdr.retval);
-        }
-        else if (op->req.hdr.opcode == OSD_OP_SECONDARY_LIST)
-        {
-            // allocated by blockstore
-            op->buf = op->bs_op->buf;
-            if (op->reply.hdr.retval > 0)
-            {
-                op->send_list.push_back(op->buf, op->reply.hdr.retval * sizeof(obj_ver_id));
-            }
-            op->reply.sec_list.stable_count = op->bs_op->version;
-        }
-        auto & cl = cl_it->second;
-        outbox_push(cl, op);
+        op->reply.sec_rw.version = op->bs_op->version;
    }
-    else
+    else if (op->req.hdr.opcode == OSD_OP_SECONDARY_DELETE)
    {
-        delete op;
+        op->reply.sec_del.version = op->bs_op->version;
    }
+    if (op->req.hdr.opcode == OSD_OP_SECONDARY_READ &&
+        op->bs_op->retval > 0)
+    {
+        op->send_list.push_back(op->buf, op->bs_op->retval);
+    }
+    else if (op->req.hdr.opcode == OSD_OP_SECONDARY_LIST)
+    {
+        // allocated by blockstore
+        op->buf = op->bs_op->buf;
+        if (op->bs_op->retval > 0)
+        {
+            op->send_list.push_back(op->buf, op->bs_op->retval * sizeof(obj_ver_id));
+        }
+        op->reply.sec_list.stable_count = op->bs_op->version;
+    }
+    int retval = op->bs_op->retval;
+    delete op->bs_op;
+    op->bs_op = NULL;
+    finish_op(op, retval);
 }

 void osd_t::exec_secondary(osd_op_t *cur_op)
@@ -95,7 +84,7 @@ void osd_t::exec_secondary(osd_op_t *cur_op)
            secondary_op_callback(cur_op);
            return;
        }
-        cur_op->bs_op->oid.stripe = cur_op->req.sec_list.parity_block_size;
+        cur_op->bs_op->oid.stripe = cur_op->req.sec_list.pg_stripe_size;
        cur_op->bs_op->len = cur_op->req.sec_list.pg_count;
        cur_op->bs_op->offset = cur_op->req.sec_list.list_pg - 1;
 #ifdef OSD_STUB
@@ -114,15 +103,10 @@ void osd_t::exec_show_config(osd_op_t *cur_op)
 {
    // FIXME: Send the real config, not its source
    std::string cfg_str = json11::Json(config).dump();
-    cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
-    cur_op->reply.hdr.id = cur_op->req.hdr.id;
-    cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
-    cur_op->reply.hdr.retval = cfg_str.size()+1;
    cur_op->buf = malloc(cfg_str.size()+1);
    memcpy(cur_op->buf, cfg_str.c_str(), cfg_str.size()+1);
-    auto & cl = clients[cur_op->peer_fd];
-    cur_op->send_list.push_back(cur_op->buf, cur_op->reply.hdr.retval);
-    outbox_push(cl, cur_op);
+    cur_op->send_list.push_back(cur_op->buf, cfg_str.size()+1);
+    finish_op(cur_op, cfg_str.size()+1);
 }

 void osd_t::exec_sync_stab_all(osd_op_t *cur_op)
--- a/osd_send.cpp
+++ b/osd_send.cpp
@@ -1,8 +1,9 @@
-#include "osd.h"
+#include "cluster_client.h"

-void osd_t::outbox_push(osd_client_t & cl, osd_op_t *cur_op)
+void cluster_client_t::outbox_push(osd_op_t *cur_op)
 {
    assert(cur_op->peer_fd);
+    auto & cl = clients.at(cur_op->peer_fd);
    if (cur_op->op_type == OSD_OP_OUT)
    {
        clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
@@ -17,11 +18,18 @@ void osd_t::outbox_push(osd_client_t & cl, osd_op_t *cur_op)
        }
        ringloop->wakeup();
    }
+    else
+        ringloop->submit();
 }

-bool osd_t::try_send(osd_client_t & cl)
+bool cluster_client_t::try_send(osd_client_t & cl)
 {
    int peer_fd = cl.peer_fd;
+        {
+            timespec now;
+            clock_gettime(CLOCK_REALTIME, &now);
+            printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
+        }
    io_uring_sqe* sqe = ringloop->get_sqe();
    if (!sqe)
    {
@@ -34,16 +42,32 @@ bool osd_t::try_send(osd_client_t & cl)
        cl.write_op = cl.outbox.front();
        cl.outbox.pop_front();
        cl.write_state = CL_WRITE_REPLY;
-        clock_gettime(CLOCK_REALTIME, &cl.write_op->tv_send);
        if (cl.write_op->op_type == OSD_OP_IN)
        {
            // Measure execution latency
-            timespec tv_end = cl.write_op->tv_send;
-            op_stat_count[cl.write_op->req.hdr.opcode]++;
-            op_stat_sum[cl.write_op->req.hdr.opcode] += (
+            timespec tv_end;
+            clock_gettime(CLOCK_REALTIME, &tv_end);
+            stats.op_stat_count[cl.write_op->req.hdr.opcode]++;
+            if (!stats.op_stat_count[cl.write_op->req.hdr.opcode])
+            {
+                stats.op_stat_count[cl.write_op->req.hdr.opcode]++;
+                stats.op_stat_sum[cl.write_op->req.hdr.opcode] = 0;
+                stats.op_stat_bytes[cl.write_op->req.hdr.opcode] = 0;
+            }
+            stats.op_stat_sum[cl.write_op->req.hdr.opcode] += (
                (tv_end.tv_sec - cl.write_op->tv_begin.tv_sec)*1000000 +
                (tv_end.tv_nsec - cl.write_op->tv_begin.tv_nsec)/1000
            );
+            if (cl.write_op->req.hdr.opcode == OSD_OP_READ ||
+                cl.write_op->req.hdr.opcode == OSD_OP_WRITE)
+            {
+                stats.op_stat_bytes[cl.write_op->req.hdr.opcode] += cl.write_op->req.rw.len;
+            }
+            else if (cl.write_op->req.hdr.opcode == OSD_OP_SECONDARY_READ ||
+                cl.write_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
+            {
+                stats.op_stat_bytes[cl.write_op->req.hdr.opcode] += cl.write_op->req.sec_rw.len;
+            }
        }
    }
    cl.write_msg.msg_iov = cl.write_op->send_list.get_iovec();
@@ -53,7 +77,7 @@ bool osd_t::try_send(osd_client_t & cl)
    return true;
 }

-void osd_t::send_replies()
+void cluster_client_t::send_replies()
 {
    for (int i = 0; i < write_ready_clients.size(); i++)
    {
@@ -67,7 +91,7 @@ void osd_t::send_replies()
    write_ready_clients.clear();
 }

-void osd_t::handle_send(ring_data_t *data, int peer_fd)
+void cluster_client_t::handle_send(ring_data_t *data, int peer_fd)
 {
    auto cl_it = clients.find(peer_fd);
    if (cl_it != clients.end())
@@ -101,16 +125,6 @@ void osd_t::handle_send(ring_data_t *data, int peer_fd)
            if (cur_op->send_list.sent >= cur_op->send_list.count)
            {
                // Done
-                if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_STABILIZE)
-                {
-                    timespec tv_end;
-                    clock_gettime(CLOCK_REALTIME, &tv_end);
-                    send_stat_count++;
-                    send_stat_sum += (
-                        (tv_end.tv_sec - cl.write_op->tv_send.tv_sec)*1000000 +
-                        (tv_end.tv_nsec - cl.write_op->tv_send.tv_nsec)/1000
-                    );
-                }
                if (cur_op->op_type == OSD_OP_IN)
                {
                    delete cur_op;
--- a/osd_test.cpp
+++ b/osd_test.cpp
@@ -29,6 +29,8 @@ void test_primary_sync(int connect_fd);

 void test_sync_stab_all(int connect_fd);

+void test_list_stab(int connect_fd);
+
 int main0(int narg, char *args[])
 {
    int connect_fd;
@@ -94,6 +96,15 @@ int main2(int narg, char *args[])
    return 0;
 }

+int main3(int narg, char *args[])
+{
+    int connect_fd;
+    connect_fd = connect_osd("127.0.0.1", 11203);
+    test_list_stab(connect_fd);
+    close(connect_fd);
+    return 0;
+}
+
 int main(int narg, char *args[])
 {
    int connect_fd;
@@ -148,7 +159,7 @@ bool check_reply(int r, osd_any_op_t & op, osd_any_reply_t & reply, int expected
        printf("bad reply: magic, id or opcode does not match request\n");
        return false;
    }
-    if (reply.hdr.retval != expected)
+    if (expected >= 0 && reply.hdr.retval != expected)
    {
        printf("operation failed, retval=%ld\n", reply.hdr.retval);
        return false;
@@ -170,7 +181,7 @@ uint64_t test_write(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t ve
    op.sec_rw.version = version;
    op.sec_rw.offset = 0;
    op.sec_rw.len = 128*1024;
-    void *data = memalign(512, op.sec_rw.len);
+    void *data = memalign(MEM_ALIGNMENT, op.sec_rw.len);
    for (int i = 0; i < (op.sec_rw.len)/sizeof(uint64_t); i++)
        ((uint64_t*)data)[i] = pattern;
    write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
@@ -205,7 +216,7 @@ void* test_primary_read(int connect_fd, uint64_t inode, uint64_t offset, uint64_
    op.rw.inode = inode;
    op.rw.offset = offset;
    op.rw.len = len;
-    void *data = memalign(512, len);
+    void *data = memalign(MEM_ALIGNMENT, len);
    write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
    int r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
    if (!check_reply(r, op, reply, len))
@@ -233,7 +244,7 @@ void test_primary_write(int connect_fd, uint64_t inode, uint64_t offset, uint64_
    op.rw.inode = inode;
    op.rw.offset = offset;
    op.rw.len = len;
-    void *data = memalign(512, len);
+    void *data = memalign(MEM_ALIGNMENT, len);
    set_pattern(data, len, pattern);
    write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
    write_blocking(connect_fd, data, len);
@@ -265,3 +276,40 @@ void test_sync_stab_all(int connect_fd)
    int r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
    assert(check_reply(r, op, reply, 0));
 }
+
+// Lists objects via OSD_OP_SECONDARY_LIST, then stabilizes entries [stable_count, total_count) in batches
+void test_list_stab(int connect_fd)
+{
+    osd_any_op_t op;
+    osd_any_reply_t reply;
+    op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
+    op.hdr.id = 1;
+    op.hdr.opcode = OSD_OP_SECONDARY_LIST;
+    op.sec_list.pg_count = 0;
+    assert(write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE) == OSD_PACKET_SIZE);
+    int r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
+    assert(check_reply(r, op, reply, -1)); // expected < 0 skips the retval comparison
+    int total_count = reply.hdr.retval;
+    int stable_count = reply.sec_list.stable_count;
+    // Reject error retvals and inconsistent counts before they are used as sizes and offsets
+    assert(total_count >= 0 && stable_count >= 0 && stable_count <= total_count);
+    obj_ver_id *data = (obj_ver_id*)malloc(total_count * sizeof(obj_ver_id));
+    assert(data);
+    assert(read_blocking(connect_fd, data, total_count * sizeof(obj_ver_id)) == (total_count * sizeof(obj_ver_id)));
+    int last_start = stable_count;
+    for (int i = stable_count; i <= total_count; i++)
+    {
+        // Stabilize in portions of 32 entries
+        if (i - last_start >= 32 || i == total_count)
+        {
+            op.hdr.opcode = OSD_OP_SECONDARY_STABILIZE;
+            op.sec_stab.len = sizeof(obj_ver_id) * (i - last_start);
+            assert(write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE) == OSD_PACKET_SIZE);
+            assert(write_blocking(connect_fd, data + last_start, op.sec_stab.len) == op.sec_stab.len);
+            r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
+            assert(check_reply(r, op, reply, 0));
+            last_start = i;
+        }
+    }
+    free(data);
+}
--- a/pg_states.cpp
+++ b/pg_states.cpp
@@ -0,0 +1,33 @@
+#include "pg_states.h"
+
+const int pg_state_bit_count = 12; // must equal the number of entries in both tables below
+
+const int pg_state_bits[12] = { // state flags, in the same order as pg_state_names
+    PG_STARTING,
+    PG_PEERING,
+    PG_INCOMPLETE,
+    PG_ACTIVE,
+    PG_STOPPING,
+    PG_OFFLINE,
+    PG_DEGRADED,
+    PG_HAS_INCOMPLETE,
+    PG_HAS_DEGRADED,
+    PG_HAS_MISPLACED,
+    PG_HAS_UNCLEAN,
+    PG_LEFT_ON_DEAD,
+};
+
+const char *pg_state_names[12] = { // pg_state_names[i] is the text name of pg_state_bits[i]
+    "starting",
+    "peering",
+    "incomplete",
+    "active",
+    "stopping",
+    "offline",
+    "degraded",
+    "has_incomplete",
+    "has_degraded",
+    "has_misplaced",
+    "has_unclean",
+    "left_on_dead",
+};
--- a/pg_states.h
+++ b/pg_states.h
@@ -0,0 +1,33 @@
+#pragma once
+
+// Placement group states
+// STARTING -> [acquire lock] -> PEERING -> INCOMPLETE|ACTIVE -> STOPPING -> OFFLINE -> [release lock]
+// Exactly one of these:
+#define PG_STARTING (1<<0)
+#define PG_PEERING (1<<1)
+#define PG_INCOMPLETE (1<<2)
+#define PG_ACTIVE (1<<3)
+#define PG_STOPPING (1<<4)
+#define PG_OFFLINE (1<<5)
+// Plus any of these:
+#define PG_DEGRADED (1<<6)
+#define PG_HAS_INCOMPLETE (1<<7)
+#define PG_HAS_DEGRADED (1<<8)
+#define PG_HAS_MISPLACED (1<<9)
+#define PG_HAS_UNCLEAN (1<<10)
+#define PG_LEFT_ON_DEAD (1<<11)
+
+// FIXME: Safe default that doesn't depend on pg_stripe_size or pg_block_size
+#define STRIPE_MASK ((uint64_t)4096 - 1)
+
+// OSD object states
+#define OBJ_DEGRADED 0x02
+#define OBJ_INCOMPLETE 0x04
+#define OBJ_MISPLACED 0x08
+#define OBJ_NEEDS_STABLE 0x10000
+#define OBJ_NEEDS_ROLLBACK 0x20000
+#define OBJ_BUGGY 0x80000
+
+extern const int pg_state_bits[];
+extern const char *pg_state_names[];
+extern const int pg_state_bit_count;
--- a/ringloop.cpp
+++ b/ringloop.cpp
@@ -18,6 +18,7 @@ ring_loop_t::ring_loop_t(int qd)
    {
        free_ring_data[i] = i;
    }
+    wait_sqe_id = 1;
 }

 ring_loop_t::~ring_loop_t()
@@ -27,11 +28,10 @@ ring_loop_t::~ring_loop_t()
    io_uring_queue_exit(&ring);
 }

-int ring_loop_t::register_consumer(ring_consumer_t & consumer)
+void ring_loop_t::register_consumer(ring_consumer_t *consumer)
 {
-    consumer.number = consumers.size();
+    unregister_consumer(consumer);
    consumers.push_back(consumer);
-    return consumer.number;
 }

 void ring_loop_t::wakeup()
@@ -39,12 +39,15 @@ void ring_loop_t::wakeup()
    loop_again = true;
 }

-void ring_loop_t::unregister_consumer(ring_consumer_t & consumer)
+void ring_loop_t::unregister_consumer(ring_consumer_t *consumer)
 {
-    if (consumer.number >= 0 && consumer.number < consumers.size())
+    for (int i = 0; i < consumers.size(); i++)
    {
-        consumers[consumer.number].loop = NULL;
-        consumer.number = -1;
+        if (consumers[i] == consumer)
+        {
+            consumers.erase(consumers.begin()+i, consumers.begin()+i+1);
+            break;
+        }
    }
 }

@@ -62,12 +65,17 @@ void ring_loop_t::loop()
        free_ring_data[free_ring_data_ptr++] = d - ring_datas;
        io_uring_cqe_seen(&ring, cqe);
    }
+    while (get_sqe_queue.size() > 0)
+    {
+        (get_sqe_queue[0].second)();
+        get_sqe_queue.erase(get_sqe_queue.begin());
+    }
    do
    {
        loop_again = false;
        for (int i = 0; i < consumers.size(); i++)
        {
-            consumers[i].loop();
+            consumers[i]->loop();
        }
    } while (loop_again);
 }
--- a/ringloop.h
+++ b/ringloop.h
@@ -113,23 +113,24 @@ struct ring_data_t

 struct ring_consumer_t
 {
-    int number;
    std::function<void(void)> loop;
 };

 class ring_loop_t
 {
-    std::vector<ring_consumer_t> consumers;
+    std::vector<std::pair<int,std::function<void()>>> get_sqe_queue;
+    std::vector<ring_consumer_t*> consumers;
    struct ring_data_t *ring_datas;
    int *free_ring_data;
+    int wait_sqe_id;
    unsigned free_ring_data_ptr;
    bool loop_again;
    struct io_uring ring;
 public:
    ring_loop_t(int qd);
    ~ring_loop_t();
-    int register_consumer(ring_consumer_t & consumer);
-    void unregister_consumer(ring_consumer_t & consumer);
+    void register_consumer(ring_consumer_t *consumer);
+    void unregister_consumer(ring_consumer_t *consumer);

    inline struct io_uring_sqe* get_sqe()
    {
@@ -140,19 +141,35 @@ public:
            io_uring_sqe_set_data(sqe, ring_datas + free_ring_data[--free_ring_data_ptr]);
        return sqe;
    }
+    inline int wait_sqe(std::function<void()> cb) // queues cb in get_sqe_queue, tagged with a fresh ID
+    {
+        get_sqe_queue.push_back({ wait_sqe_id, cb });
+        return wait_sqe_id++; // post-increment: returns the ID just stored with cb (usable with cancel_wait_sqe)
+    }
+    // Removes the callback queued under wait_id (as returned by wait_sqe()) if it is still pending
+    inline void cancel_wait_sqe(int wait_id)
+    {
+        for (int i = 0; i < get_sqe_queue.size(); i++)
+            if (get_sqe_queue[i].first == wait_id)
+            {
+                get_sqe_queue.erase(get_sqe_queue.begin()+i);
+                break; // IDs are unique; erase() shifted the tail down, so continuing at i+1 would skip an entry
+            }
+    }
    inline int submit()
    {
        return io_uring_submit(&ring);
    }
    inline int wait()
    {
-        return io_uring_submit_and_wait(&ring, 1);
+        struct io_uring_cqe *cqe;
+        return io_uring_wait_cqe(&ring, &cqe);
    }
    inline unsigned space_left()
    {
        return free_ring_data_ptr;
    }
-    inline bool get_loop_again()
+    inline bool has_work()
    {
        return loop_again;
    }
--- a/test.cpp
+++ b/test.cpp
@@ -13,6 +13,7 @@
 #include <assert.h>
 #include <stdio.h>
 #include <liburing.h>
+#include <math.h>

 #include <sys/socket.h>
 #include <sys/epoll.h>
@@ -61,24 +62,6 @@ static void test_write(struct io_uring *ring, int fd)
    free(buf);
 }

-class obj_ver_hash
-{
-public:
-    size_t operator()(const obj_ver_id &s) const
-    {
-        size_t seed = 0;
-        spp::hash_combine(seed, s.oid.inode);
-        spp::hash_combine(seed, s.oid.stripe);
-        spp::hash_combine(seed, s.version);
-        return seed;
-    }
-};
-
-inline bool operator == (const obj_ver_id & a, const obj_ver_id & b)
-{
-    return a.oid == b.oid && a.version == b.version;
-}
-
 int main00(int argc, char *argv[])
 {
    // queue with random removal: vector is best :D
@@ -170,9 +153,9 @@ int main0(int argc, char *argv[])
    // btree_map 5M entries monotone -> 0.458s, random -> 5.429s
    // absl::btree_map 5M entries random -> 5.09s
    // sparse_hash_map 5M entries -> 2.193s, random -> 2.586s
-    //btree::btree_map<obj_ver_id, dirty_entry> dirty_db;
+    btree::btree_map<obj_ver_id, dirty_entry> dirty_db;
    //std::map<obj_ver_id, dirty_entry> dirty_db;
-    spp::sparse_hash_map<obj_ver_id, dirty_entry, obj_ver_hash> dirty_db;
+    //spp::sparse_hash_map<obj_ver_id, dirty_entry, obj_ver_hash> dirty_db;
    for (int i = 0; i < 5000000; i++)
    {
        dirty_db[(obj_ver_id){
@@ -182,7 +165,7 @@ int main0(int argc, char *argv[])
            },
            .version = 1,
        }] = (dirty_entry){
-            .state = ST_D_META_SYNCED,
+            .state = ST_D_SYNCED,
            .flags = 0,
            .location = (uint64_t)i << 17,
            .offset = 0,
@@ -337,87 +320,253 @@ int main04(int argc, char *argv[])
    return 0;
 }

-int main05(int argc, char *argv[])
+uint64_t jumphash(uint64_t key, int count)
 {
-    // FIXME extract this into a test
-    pg_t pg = {
-        .state = PG_PEERING,
-        .pg_num = 1,
-        .target_set = { 1, 2, 3 },
-        .cur_set = { 1, 2, 3 },
-        .peering_state = new pg_peering_state_t(),
-    };
-    for (uint64_t osd_num = 1; osd_num <= 3; osd_num++)
+    uint64_t b = 0;
+    uint64_t seed = key;
+    for (int j = 1; j < count; j++)
    {
-        pg_list_result_t r = {
-            .buf = (obj_ver_id*)malloc(sizeof(obj_ver_id) * 1024*1024*8),
-            .total_count = 1024*1024*8,
-            .stable_count = (uint64_t)(1024*1024*8 - (osd_num == 1 ? 10 : 0)),
-        };
-        for (uint64_t i = 0; i < r.total_count; i++)
+        seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
+        if (seed < (UINT64_MAX / (j+1)))
        {
-            r.buf[i] = {
-                .oid = {
-                    .inode = 1,
-                    .stripe = (i << STRIPE_SHIFT) | (osd_num-1),
-                },
-                .version = (uint64_t)(osd_num == 1 && i >= r.total_count - 10 ? 2 : 1),
-            };
+            b = j;
        }
-        pg.peering_state->list_results[osd_num] = r;
    }
-    pg.calc_object_states();
-    printf("deviation variants=%ld clean=%lu\n", pg.state_dict.size(), pg.clean_count);
-    for (auto it: pg.state_dict)
+    return b;
+}
+
+void jumphash_prepare(int count, uint64_t *out_weights, uint64_t *in_weights)
+{
+    if (count <= 0)
    {
-        printf("dev: state=%lx\n", it.second.state);
+        return;
+    }
+    uint64_t total_weight = in_weights[0];
+    out_weights[0] = UINT64_MAX;
+    for (int j = 1; j < count; j++)
+    {
+        total_weight += in_weights[j];
+        out_weights[j] = UINT64_MAX / total_weight * in_weights[j];
+    }
+}
+
+uint64_t jumphash_weights(uint64_t key, int count, uint64_t *prepared_weights)
+{
+    uint64_t b = 0;
+    uint64_t seed = key;
+    for (int j = 1; j < count; j++)
+    {
+        seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
+        if (seed < prepared_weights[j])
+        {
+            b = j;
+        }
+    }
+    return b;
+}
+
+void jumphash3(uint64_t key, int count, uint64_t *weights, uint64_t *r)
+{
+    r[0] = 0;
+    r[1] = 1;
+    r[2] = 2;
+    uint64_t total_weight = weights[0]+weights[1]+weights[2];
+    uint64_t seed = key;
+    for (int j = 3; j < count; j++)
+    {
+        seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
+        total_weight += weights[j];
+        if (seed < UINT64_MAX*1.0*weights[j]/total_weight)
+            r[0] = j;
+        else
+        {
+            seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
+            if (seed < UINT64_MAX*1.0*weights[j]/total_weight)
+                r[1] = j;
+            else
+            {
+                seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
+                if (seed < UINT64_MAX*1.0*weights[j]/total_weight)
+                    r[2] = j;
+            }
+        }
+    }
+}
+
+uint64_t crush(uint64_t key, int count, uint64_t *weights)
+{
+    uint64_t b = 0;
+    uint64_t seed = 0;
+    uint64_t max = 0;
+    for (int j = 0; j < count; j++)
+    {
+        seed = (key + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
+        seed ^= (j + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
+        seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
+        seed = -log(((double)seed) / (1ul << 32) / (1ul << 32)) * weights[j];
+        if (seed > max)
+        {
+            max = seed;
+            b = j;
+        }
+    }
+    return b;
+}
+
+void crush3(uint64_t key, int count, uint64_t *weights, uint64_t *r, uint64_t total_weight)
+{
+    uint64_t seed = 0;
+    uint64_t max = 0;
+    for (int k1 = 0; k1 < count; k1++)
+    {
+        for (int k2 = k1+1; k2 < count; k2++)
+        {
+            if (k2 == k1)
+            {
+                continue;
+            }
+            for (int k3 = k2+1; k3 < count; k3++)
+            {
+                if (k3 == k1 || k3 == k2)
+                {
+                    continue;
+                }
+                seed = (key + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
+                seed ^= (k1 + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
+                seed ^= (k2 + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
+                seed ^= (k3 + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
+                seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
+                //seed = ((double)seed) / (1ul << 32) / (1ul << 32) * (weights[k1] + weights[k2] + weights[k3]);
+                seed = ((double)seed) / (1ul << 32) / (1ul << 32) * (1 -
+                    (1 - 1.0*weights[k1]/total_weight)*
+                    (1 - 1.0*weights[k2]/total_weight)*
+                    (1 - 1.0*weights[k3]/total_weight)
+                ) * UINT64_MAX;
+                if (seed > max)
+                {
+                    r[0] = k1;
+                    r[1] = k2;
+                    r[2] = k3;
+                    max = seed;
+                }
+            }
+        }
    }
-    return 0;
 }

 int main(int argc, char *argv[])
 {
-    timeval fill_start, fill_end, filter_end;
-    spp::sparse_hash_map<object_id, clean_entry> clean_db;
-    //std::map<object_id, clean_entry> clean_db;
-    //btree::btree_map<object_id, clean_entry> clean_db;
-    gettimeofday(&fill_start, NULL);
-    printf("filling\n");
-    uint64_t total = 1024*1024*8*4;
-    clean_db.resize(total);
-    for (uint64_t i = 0; i < total; i++)
+    int host_count = 6; // number of entries in host_weights below
+    uint64_t host_weights[] = {
+        34609*3,
+        34931*3,
+        35850+36387+35859,
+        36387,
+        36387*2,
+        36387,
+    };
+    /*int osd_count[] = { 3, 3, 3, 1, 2 };
+    uint64_t osd_weights[][3] = {
+        { 34609, 34609, 34609 },
+        { 34931, 34931, 34931 },
+        { 35850, 36387, 35859 },
+        { 36387 },
+        { 36387, 36387 },
+    };*/
+    uint64_t total_weight = 0;
+    for (int i = 0; i < host_count; i++)
     {
-        clean_db[(object_id){
-            .inode = 1,
-            //.stripe = (i << STRIPE_SHIFT),
-            .stripe = (((367*i) % total) << STRIPE_SHIFT),
-        }] = (clean_entry){
-            .version = 1,
-            .location = i << DEFAULT_ORDER,
-        };
+        total_weight += host_weights[i];
     }
-    gettimeofday(&fill_end, NULL);
-    // no resize():
-    // spp = 17.87s (seq), 41.81s (rand), 3.29s (seq+resize), 8.3s (rand+resize), ~1.3G RAM in all cases
-    // std::unordered_map = 6.14 sec, ~2.3G RAM
-    // std::map = 13 sec (seq), 5.54 sec (rand), ~2.5G RAM
-    // cpp-btree = 2.47 sec (seq) ~1.2G RAM, 20.6 sec (pseudo-random 367*i % total) ~1.5G RAM
-    printf("filled %.2f sec\n", (fill_end.tv_sec - fill_start.tv_sec) + (fill_end.tv_usec - fill_start.tv_usec) / 1000000.0);
-    for (int pg = 0; pg < 100; pg++)
+    uint64_t host_weights_prepared[host_count]; // only consumed by the commented-out jumphash_weights() variants below
+    jumphash_prepare(host_count, host_weights_prepared, host_weights);
+    uint64_t total_pgs[host_count] = { 0 };
+    int pg_count = 256;
+    double uniformity[pg_count] = { 0 };
+    for (uint64_t pg = 1; pg <= pg_count; pg++) // PG numbers are 1-based
     {
-        obj_ver_id* buf1 = (obj_ver_id*)malloc(sizeof(obj_ver_id) * ((total+99)/100));
-        int j = 0;
-        for (auto it: clean_db)
-            if ((it.first % 100) == pg)
-                buf1[j++] = { .oid = it.first, .version = it.second.version };
-        free(buf1);
-        printf("filtered %d\n", j);
+        uint64_t r[3]; // the three host indices chosen for this PG
+
+/*
+        // Select first host
+        //r[0] = jumphash_weights(pg, host_count, host_weights_prepared);
+        r[0] = crush(pg, host_count, host_weights);
+        // Select second host
+        uint64_t seed = pg;
+        r[1] = r[0];
+        while (r[1] == r[0])
+        {
+            seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
+            //r[1] = jumphash_weights(seed, host_count, host_weights_prepared);
+            r[1] = crush(seed, host_count, host_weights);
+        }
+        // Select third host
+        seed = pg;
+        r[2] = r[0];
+        while (r[2] == r[0] || r[2] == r[1])
+        {
+            seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
+            //r[2] = jumphash_weights(seed, host_count, host_weights_prepared);
+            r[2] = crush(seed, host_count, host_weights);
+        }
+*/
+
+/*
+        // Select second host
+        uint64_t host_weights1[host_count];
+        for (int i = 0; i < r[0]; i++)
+            host_weights1[i] = host_weights[i];
+        for (int i = r[0]+1; i < host_count; i++)
+            host_weights1[i-1] = host_weights[i];
+        r[1] = crush(pg, host_count-1, host_weights1);
+        // Select third host
+        for (int i = r[1]+1; i < host_count-1; i++)
+            host_weights1[i-1] = host_weights[i];
+        r[2] = crush(pg, host_count-2, host_weights1);
+        // Transform numbers
+        r[2] = r[2] >= r[1] ? 1+r[2] : r[2];
+        r[2] = r[2] >= r[0] ? 1+r[2] : r[2];
+        r[1] = r[1] >= r[0] ? 1+r[1] : r[1];
+*/
+
+        crush3(pg, host_count, host_weights, r, total_weight);
+        uint64_t shift = (2862933555777941757ull*pg + 3037000493ull) % host_count; // NOTE(review): ranges over 0..host_count-1 but only 1 and 2 rotate r - confirm whether % 3 was intended
+        if (shift == 1)
+        {
+            uint64_t tmp;
+            tmp = r[0];
+            r[0] = r[1];
+            r[1] = r[2];
+            r[2] = tmp;
+        }
+        else if (shift == 2)
+        {
+            uint64_t tmp;
+            tmp = r[0];
+            r[0] = r[2];
+            r[2] = r[1];
+            r[1] = tmp;
+        }
+
+        total_pgs[r[0]]++; // count one placement per chosen host
+        total_pgs[r[1]]++;
+        total_pgs[r[2]]++;
+
+        double u = 0;
+        for (int i = 0; i < host_count; i++)
+        {
+            double d = abs(1 - total_pgs[i]/3.0/pg * total_weight/host_weights[i]); // NOTE(review): abs() on a double - needs a floating-point overload in scope, otherwise the value is truncated; confirm the includes
+            u += d;
+        }
+        uniformity[pg-1] = u/host_count; // stored per PG but not read again inside main()
+
+        printf("pg %lu: hosts %lu, %lu, %lu ; avg deviation = %.2f\n", pg, r[0], r[1], r[2], u/host_count);
     }
-    gettimeofday(&filter_end, NULL);
-    // spp = 42.15 sec / 60 sec (rand)
-    // std::unordered_map = 43.7 sec
-    // std::map = 156.13 sec
-    // cpp-btree = 21.87 sec (seq), 44.33 sec (rand)
-    printf("100 times filter %.2f sec\n", (filter_end.tv_sec - fill_end.tv_sec) + (filter_end.tv_usec - fill_end.tv_usec) / 1000000.0);
+    printf("total PGs: ");
+    for (int i = 0; i < host_count; i++)
+    {
+        printf(i > 0 ? ", %lu (%.2f)" : "%lu (%.2f)", total_pgs[i], total_pgs[i]/3.0/pg_count * total_weight/host_weights[i]); // second number: actual PG share divided by the weight-proportional share
+    }
+    printf("\n");
     return 0;
 }
--- a/test_blockstore.cpp
+++ b/test_blockstore.cpp
@@ -115,7 +115,7 @@ int main(int narg, char *args[])
        }
    };

-    ringloop->register_consumer(main_cons);
+    ringloop->register_consumer(&main_cons);
    while (1)
    {
        ringloop->loop();
--- a/timerfd_interval.cpp
+++ b/timerfd_interval.cpp
@@ -20,14 +20,14 @@ timerfd_interval::timerfd_interval(ring_loop_t *ringloop, int seconds, std::func
        throw std::runtime_error(std::string("timerfd_settime: ") + strerror(errno));
    }
    consumer.loop = [this]() { loop(); };
-    ringloop->register_consumer(consumer);
+    ringloop->register_consumer(&consumer);
    this->ringloop = ringloop;
    this->callback = cb;
 }

 timerfd_interval::~timerfd_interval()
 {
-    ringloop->unregister_consumer(consumer);
+    ringloop->unregister_consumer(&consumer); // consumer is passed by pointer, matching register_consumer(&consumer) in the constructor
     close(timerfd);
 }

--- a/timerfd_interval.h
+++ b/timerfd_interval.h
@@ -6,7 +6,6 @@ class timerfd_interval
 {
    int wait_state;
    int timerfd;
-    int status;
    ring_loop_t *ringloop;
    ring_consumer_t consumer;
    std::function<void(void)> callback;
--- a/timerfd_manager.cpp
+++ b/timerfd_manager.cpp
@@ -0,0 +1,159 @@
+#include <sys/timerfd.h>
+#include <sys/poll.h>
+#include <sys/epoll.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include "timerfd_manager.h"
+
+timerfd_manager_t::timerfd_manager_t(std::function<void(int, bool, std::function<void(int, int)>)> set_fd_handler) // creates a non-blocking CLOCK_MONOTONIC timerfd and hooks handle_readable() to it through set_fd_handler
+{
+    this->set_fd_handler = set_fd_handler; // kept as a member; the destructor calls it again
+    wait_state = 0;
+    timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
+    if (timerfd < 0)
+    {
+        throw std::runtime_error(std::string("timerfd_create: ") + strerror(errno)); // failure is reported as std::runtime_error carrying the errno text
+    }
+    set_fd_handler(timerfd, false, [this](int fd, int events) // meaning of the bool argument is defined by the supplied handler - not visible here
+    {
+        handle_readable(); // fd and events are ignored
+    });
+}
+
+timerfd_manager_t::~timerfd_manager_t() // detaches the timerfd from the fd handler and closes it
+{
+    set_fd_handler(timerfd, false, NULL); // empty callback - presumably unregisters the fd; confirm against the set_fd_handler implementation
+    close(timerfd);
+}
+
+void timerfd_manager_t::inc_timer(timerfd_timer_t & t) // advances t.next by t.millis milliseconds, keeping tv_nsec below one second
+{
+    t.next.tv_sec += t.millis/1000; // whole seconds
+    t.next.tv_nsec += (t.millis%1000)*1000000; // remaining milliseconds as nanoseconds
+    if (t.next.tv_nsec >= 1000000000) // carry on exactly 1e9 too: a tv_nsec of a full second is not a valid timespec for timerfd_settime
+    {
+        t.next.tv_sec++;
+        t.next.tv_nsec -= 1000000000;
+    }
+}
+
+int timerfd_manager_t::set_timer(uint64_t millis, bool repeat, std::function<void(int)> callback) // registers a timer firing `millis` ms from now; returns its id for clear_timer()
+{
+    int timer_id = id++; // ids come from a plain incrementing counter
+    timespec start;
+    clock_gettime(CLOCK_MONOTONIC, &start);
+    timers.push_back({
+        .id = timer_id,
+        .millis = millis,
+        .start = start,
+        .next = start, // moved forward by millis in inc_timer() below
+        .repeat = repeat,
+        .callback = callback,
+    });
+    inc_timer(timers[timers.size()-1]); // the entry just appended
+    set_nearest(); // re-arm the timerfd in case the new timer is now the earliest
+    return timer_id;
+}
+
+void timerfd_manager_t::clear_timer(int timer_id) // removes the timer with this id, if present; unknown ids are ignored
+{
+    for (int i = 0; i < timers.size(); i++)
+    {
+        if (timers[i].id == timer_id)
+        {
+            timers.erase(timers.begin()+i, timers.begin()+i+1);
+            if (nearest == i) // the armed timer itself was removed
+            {
+                nearest = -1;
+                wait_state = wait_state & ~1;
+            }
+            else if (nearest > i) // later entries moved down by one slot
+            {
+                nearest--;
+            }
+            set_nearest(); // recompute the earliest timer and re-arm or disarm the timerfd
+            break;
+        }
+    }
+}
+
+void timerfd_manager_t::set_nearest() // picks the timer with the earliest `next`, fires it right away if already due, otherwise arms the timerfd for it
+{
+again:
+    if (!timers.size())
+    {
+        nearest = -1;
+        itimerspec exp = { 0 }; // all-zero itimerspec
+        if (timerfd_settime(timerfd, 0, &exp, NULL))
+        {
+            throw std::runtime_error(std::string("timerfd_settime: ") + strerror(errno));
+        }
+        wait_state = wait_state & ~1; // bit 0 cleared: nothing armed
+    }
+    else
+    {
+        nearest = 0;
+        for (int i = 1; i < timers.size(); i++) // linear scan for the smallest (tv_sec, tv_nsec)
+        {
+            if (timers[i].next.tv_sec < timers[nearest].next.tv_sec ||
+                timers[i].next.tv_sec == timers[nearest].next.tv_sec &&
+                timers[i].next.tv_nsec < timers[nearest].next.tv_nsec)
+            {
+                nearest = i;
+            }
+        }
+        timespec now;
+        clock_gettime(CLOCK_MONOTONIC, &now);
+        itimerspec exp = {
+            .it_interval = { 0 },
+            .it_value = timers[nearest].next,
+        };
+        exp.it_value.tv_sec -= now.tv_sec; // turn the absolute deadline into a delay from now
+        exp.it_value.tv_nsec -= now.tv_nsec;
+        if (exp.it_value.tv_nsec < 0) // borrow one second
+        {
+            exp.it_value.tv_sec--;
+            exp.it_value.tv_nsec += 1000000000;
+        }
+        if (exp.it_value.tv_sec < 0 || !exp.it_value.tv_sec && !exp.it_value.tv_nsec)
+        {
+            // It already happened
+            trigger_nearest();
+            goto again; // NOTE(review): a repeating timer with millis == 0 would never leave this loop - confirm callers never pass 0
+        }
+        if (timerfd_settime(timerfd, 0, &exp, NULL))
+        {
+            throw std::runtime_error(std::string("timerfd_settime: ") + strerror(errno));
+        }
+        wait_state = wait_state | 1; // bit 0 set: timerfd is armed
+    }
+}
+
+void timerfd_manager_t::handle_readable() // fd-handler callback: reads the timerfd, fires the armed timer, then re-arms for the next one
+{
+    uint64_t n;
+    size_t res = read(timerfd, &n, 8); // a failed read (-1) becomes a huge size_t value here, so it never equals 8
+    if (res == 8 && nearest >= 0) // fire only after a full 8-byte read and when a timer is selected
+    {
+        trigger_nearest();
+    }
+    wait_state = 0;
+    set_nearest();
+}
+
+void timerfd_manager_t::trigger_nearest() // fires timers[nearest]: reschedules or removes it first, then runs its callback
+{
+    int nearest_id = timers[nearest].id;
+    auto cb = timers[nearest].callback; // copied so the call below does not depend on the vector entry
+    if (timers[nearest].repeat)
+    {
+        inc_timer(timers[nearest]); // schedule the next occurrence
+    }
+    else
+    {
+        timers.erase(timers.begin()+nearest, timers.begin()+nearest+1); // one-shot timers are dropped before the callback runs
+    }
+    cb(nearest_id);
+    nearest = -1; // both visible callers run set_nearest() again afterwards
+}
--- a/timerfd_manager.h
+++ b/timerfd_manager.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <time.h>
+#include <vector>
+#include <functional>
+
+struct timerfd_timer_t // one registered timer, as stored in timerfd_manager_t::timers
+{
+    int id; // value returned by set_timer() and matched by clear_timer()
+    uint64_t millis; // interval in milliseconds
+    timespec start, next; // registration time and next deadline, both taken from CLOCK_MONOTONIC
+    bool repeat; // true: rescheduled after firing; false: removed after firing
+    std::function<void(int)> callback; // invoked with the timer id
+};
+
+class timerfd_manager_t // runs any number of one-shot or repeating timers on a single timerfd
+{
+    int wait_state = 0; // bit 0 is set while the timerfd is armed
+    int timerfd;
+    int nearest = -1; // index into timers of the armed timer, -1 when none
+    int id = 1; // next timer id to hand out
+    std::vector<timerfd_timer_t> timers;
+
+    void inc_timer(timerfd_timer_t & t);
+    void set_nearest();
+    void trigger_nearest();
+    void handle_readable();
+public:
+    std::function<void(int, bool, std::function<void(int, int)>)> set_fd_handler; // caller-supplied hook used to attach/detach the timerfd callback
+
+    timerfd_manager_t(std::function<void(int, bool, std::function<void(int, int)>)> set_fd_handler);
+    ~timerfd_manager_t();
+    int set_timer(uint64_t millis, bool repeat, std::function<void(int)> callback); // returns the id of the new timer
+    void clear_timer(int timer_id);
+};
Author	SHA1	Message	Date
Vitaliy Filippov	c414a90abc	TRACE	2020-05-28 12:41:08 +03:00
Vitaliy Filippov	36fe7d394b	EPOLLLT	2020-05-28 12:41:08 +03:00
Vitaliy Filippov	540137dd23	Submit	2020-05-28 12:41:08 +03:00
Vitaliy Filippov	b466e215f0	Fix queued OP_SYNC execution	2020-05-27 13:55:25 +03:00
Vitaliy Filippov	36f995367f	Fix bind_address reporting	2020-05-27 10:58:40 +03:00
Vitaliy Filippov	0aca6e9ca8	Extract peer connect and read-write loop into a separate file (to be shared with the client library)	2020-05-26 22:11:30 +03:00
Vitaliy Filippov	fa98be6bc0	Allow to specify multiple etcd addresses	2020-05-25 16:30:05 +03:00
Vitaliy Filippov	256a7f2667	Free op->bs_op manually	2020-05-25 15:31:22 +03:00
Vitaliy Filippov	79bf57b6e2	Allow to override pg_stripe_size	2020-05-25 15:31:22 +03:00
Vitaliy Filippov	53f6aba3e6	Die when journal_sector_buffer_count is too small	2020-05-24 17:26:47 +03:00
Vitaliy Filippov	36595eb669	Print "Ran out of journal sector buffers" warning	2020-05-24 16:48:50 +03:00
Vitaliy Filippov	e09d0e0678	Several bug fixes - Do not block flock() requests - Fix stop_client(0) attempts leading to std::bad_function_call - Fix degraded writes crashing due to an unset stripes[i].missing (at least with a missing parity device) - Fix recovery B/W reporting	2020-05-24 01:51:35 +03:00
Vitaliy Filippov	d1602b50b3	Fix BS_OP_ROLLBACK removing an incorrect version Instead of only removing versions with oid == X and version > Y it was also removing the previous version in list (with the previous oid or with version == Y)	2020-05-24 01:51:28 +03:00
Vitaliy Filippov	7df384031a	Re-peer PGs after stopping the peer Fixes the bug where two peers killed at once have lead to PG state PG_DEGRADED\|PG_HAS_INCOMPLETE instead of PG_INCOMPLETE	2020-05-23 18:45:12 +03:00
Vitaliy Filippov	e614a98543	Add a sad FIXME :-)	2020-05-23 15:43:37 +03:00
Vitaliy Filippov	01dd3ef89e	Fix timerfd_manager triggering of multiple times at the same time	2020-05-23 15:43:37 +03:00
Vitaliy Filippov	cdccc23aff	Print [OSD $osd_num] in stats, print B/W only for ops that log bytes	2020-05-23 15:43:37 +03:00
Vitaliy Filippov	700428829a	Fix autosync_interval default not setting when autosync_interval is skipped in config	2020-05-23 15:43:37 +03:00
Vitaliy Filippov	6488d0044a	Ignore EPOLL_CTL_DEL ENOENT, fix detection of the rollback version	2020-05-23 15:43:37 +03:00
Vitaliy Filippov	393fe75900	Fix creepy (osd_op_t*)(long) casts	2020-05-23 15:43:37 +03:00
Vitaliy Filippov	f036eecf1c	Fix osd_rmw object recovery case (len==0)	2020-05-23 15:43:37 +03:00
Vitaliy Filippov	e56909fb45	Remove tv_send (unused) and timerfd_interval from blockstore	2020-05-22 15:57:08 +03:00
Vitaliy Filippov	fac75b0b57	Handle reweights in mon	2020-05-22 12:52:27 +03:00
Vitaliy Filippov	9f842ec9a5	Remove connect callback because it is always the same	2020-05-22 12:45:12 +03:00
Vitaliy Filippov	f6a01a4819	Extract "state-watching" etcd client into a separate file	2020-05-22 12:38:40 +03:00
Vitaliy Filippov	6202260018	Extract HTTP client functions from osd_t	2020-05-21 11:39:01 +03:00
Vitaliy Filippov	a61ede9951	Remove io_uring usage from osd_http and timerfd_manager For better future interoperability with external event loops such as QEMU's one	2020-05-21 01:25:38 +03:00
Vitaliy Filippov	f57731f8ca	Calculate total stats in the monitor	2020-05-15 01:37:17 +03:00
Vitaliy Filippov	19f25c7cd5	Handle integer overflow of the op_stat_count	2020-05-15 01:37:17 +03:00
Vitaliy Filippov	2c3e84cc41	Implement stop_all_pgs()	2020-05-15 01:37:17 +03:00
Vitaliy Filippov	7bda66b866	Do not crash when optimising PGs in an undersized cluster	2020-05-15 01:29:15 +03:00
Vitaliy Filippov	b467d0559f	Begin node.js storage monitor service	2020-05-15 01:29:15 +03:00
Vitaliy Filippov	c2c2eefea4	Duplicate host in osd/state and osd/stats, take PGs from /config/pgs.items	2020-05-15 01:29:15 +03:00
Vitaliy Filippov	5084ff7c6c	Measure & report recovery op count and bandwidth	2020-05-15 01:29:15 +03:00
Vitaliy Filippov	47b6f64106	Support level names	2020-05-11 15:57:21 +03:00
Vitaliy Filippov	f71d0c117b	Measure & report op bandwidth, include local blockstore ops in stats	2020-05-11 02:58:13 +03:00
Vitaliy Filippov	2b854948f9	Remove dead code	2020-05-09 16:15:02 +03:00
Vitaliy Filippov	e7f897ed65	Report hostname to etcd	2020-05-09 02:33:43 +03:00
Vitaliy Filippov	c26b6e1fc3	Support CRUSH-like multi-level placement trees	2020-05-09 00:55:24 +03:00
Vitaliy Filippov	aaa054e644	Fix optimize_change generating infeasible problems Mainly happened when removing PG combinations (removing OSDs) Also randomize OSD combinations when there's a lot of them Also remove Perl version	2020-05-08 16:42:40 +03:00
Vitaliy Filippov	706a44d4d4	Fix optimize_initial in both perl and js versions	2020-05-06 23:12:03 +03:00
Vitaliy Filippov	842f88f94f	Rewrite LPOptimizer.pm to nodejs	2020-05-06 02:08:15 +03:00
Vitaliy Filippov	e8149e5848	Implement OSD_OP_DELETE	2020-05-05 00:39:51 +03:00
Vitaliy Filippov	6355b968f4	Track osd_set history and all_peers separately	2020-05-04 15:28:07 +03:00
Vitaliy Filippov	00cf24fbd7	Split osd_primary.cpp	2020-05-03 11:04:20 +03:00
Vitaliy Filippov	1bc08174f9	Sync before listing objects so flushes do not fail thereafter	2020-05-01 12:56:49 +03:00
Vitaliy Filippov	cd87333091	Fix PG state comparison leading to unclean PGs not flushing (a & b == b) -> ((a & b) == b) !	2020-05-01 12:56:46 +03:00
Vitaliy Filippov	bd0fe6e4cc	Fix PGs not stopping during sync, fix state reporting autovivification of erased PGs	2020-05-01 01:33:14 +03:00
Vitaliy Filippov	ce78454215	Reply with -EROFS to write commands in readonly mode	2020-05-01 00:54:34 +03:00
Vitaliy Filippov	762bd42096	Fix use-after-free caused by "delete this" in handle_read	2020-04-30 02:15:53 +03:00
Vitaliy Filippov	7b57eeeeb3	Implement PG state locking and PG moving in response to etcd events	2020-04-29 22:23:38 +03:00
Vitaliy Filippov	ec4a52af48	Fix websocket (and timer!) bugs	2020-04-26 01:59:56 +03:00
Vitaliy Filippov	268b497c0b	Implement simple websocket client	2020-04-25 23:11:50 +03:00
Vitaliy Filippov	35481925b1	Implement very simple HTTP streaming to handle etcd watches	2020-04-25 01:35:52 +03:00
Vitaliy Filippov	895a80dfc4	Fix etcd 3.2 compatibility (no compare.target == LEASE, /kv/lease/revoke), fix small bugs	2020-04-25 01:35:52 +03:00
Vitaliy Filippov	caa01c6aaf	Acquire etcd leases, prevent starting two OSDs with the same number	2020-04-25 01:35:52 +03:00
Vitaliy Filippov	d398ddfd3b	Use snake_case for etcd requests	2020-04-25 01:35:52 +03:00
Vitaliy Filippov	0f2b8dbf6f	Use a single timerfd_manager for all timers	2020-04-25 01:35:49 +03:00
Vitaliy Filippov	4f42e9659e	Use etcd instead of Consul	2020-04-24 01:03:55 +03:00
Vitaliy Filippov	7cf71a8031	Fix timerfd_manager: remove timer, then call callback	2020-04-21 12:45:18 +03:00
Vitaliy Filippov	9d22559bcf	Start peering immediately when loading PGs	2020-04-21 02:27:13 +03:00
Vitaliy Filippov	8c03e3ebab	Lock Blockstore devices exclusively by default	2020-04-21 01:59:11 +03:00
Vitaliy Filippov	2a640ba2e8	Remove range port selection (leads to races)	2020-04-21 00:10:59 +03:00
Vitaliy Filippov	6a21ea207e	Check peer config (at least, number) after connecting	2020-04-21 00:08:54 +03:00
Vitaliy Filippov	642802b595	Auto-select port numbers	2020-04-20 17:45:27 +03:00
Vitaliy Filippov	ff38b464a5	Add consul & connect timeouts, report state before loading PGs, move init_primary to osd_cluster	2020-04-20 15:43:07 +03:00
Vitaliy Filippov	663153713b	Reconnect to peers after connecting drops	2020-04-19 01:01:26 +03:00
Vitaliy Filippov	dc57c5c362	Report PG states again, clear PG history on reaching active+clean	2020-04-19 00:48:23 +03:00
Vitaliy Filippov	f95299b769	Take PG history into account when starting PGs	2020-04-19 00:20:18 +03:00
Vitaliy Filippov	9126ffb0f9	Fix PG loading - now it works, at least once	2020-04-17 02:33:44 +03:00
Vitaliy Filippov	2a8e40835e	Fix reporting to Consul, report even if we are purely secondary	2020-04-17 01:59:06 +03:00
Vitaliy Filippov	309486d746	Implement loading PGs from Consul (in theory)	2020-04-16 23:22:32 +03:00
Vitaliy Filippov	582f485578	Extract http & getifaddr_list into a separate file	2020-04-15 15:47:06 +03:00
Vitaliy Filippov	089b4eb208	Retry consul connection attempts and then die	2020-04-15 15:33:18 +03:00
Vitaliy Filippov	d78ce509c6	Add simple timer manager	2020-04-15 13:41:44 +03:00
Vitaliy Filippov	f3a7ccff50	Use 4K blockstore block by default, use MEM_ALIGNMENT in osd code	2020-04-14 19:19:56 +03:00
Vitaliy Filippov	37b27c3025	Implement basic OSD status reporting to Consul	2020-04-14 14:52:06 +03:00
Vitaliy Filippov	edf6d6f897	Fix http_request	2020-04-12 02:08:00 +03:00
Vitaliy Filippov	d11e8dcb5e	Do not flush or recover in readonly mode	2020-04-11 12:06:18 +03:00
Vitaliy Filippov	dd02bc1c44	Add base64 implementation	2020-04-11 12:06:18 +03:00
Vitaliy Filippov	298b013eae	Add simple http request function	2020-04-11 12:05:58 +03:00
Vitaliy Filippov	0880a77c1a	2 FIXME for the future	2020-04-06 00:55:47 +03:00
Vitaliy Filippov	aa849ea07b	Add a test for missing chunk overwrite	2020-04-05 16:14:03 +03:00
Vitaliy Filippov	d59be0e8b4	Delete misplaced chunks after moving the object, reset object state in primary_write	2020-04-05 15:51:22 +03:00
Vitaliy Filippov	cf7de0f181	(Almost) Implement misplaced recovery, integrating it into calc_rmw()	2020-04-05 15:50:53 +03:00
Vitaliy Filippov	6212195440	Implement parallel recovery	2020-04-04 19:23:12 +03:00
Vitaliy Filippov	dfb6e15eaa	Implement graceful stopping of PGs	2020-04-03 13:03:42 +03:00
Vitaliy Filippov	afe2e76c87	Implement regular automatic syncs, split osd_t constructor into some methods	2020-04-02 22:16:46 +03:00
Vitaliy Filippov	0f43f6d3f6	Fix crashes, print some stats Notably: - fix the `delete op` inside lambda callback crash (it frees the lambda itself which results in use-after-free with g++) - fix stop_client() reenterability - fix a bug in the blockstore layer which resulted in always returning version=0 for zero-length reads - change error codes for blockstore_stabilize	2020-03-31 17:55:31 +03:00
Vitaliy Filippov	92c800bb64	Forget unstable writes when re-peering, rename parity_block_size -> pg_stripe_size, pg_parity_size -> pg_block_size	2020-03-31 02:09:25 +03:00
Vitaliy Filippov	8a8b619875	Handle secondary OSD connection errors [in theory]	2020-03-30 19:51:34 +03:00
Vitaliy Filippov	43fe1d88e7	Fix memory leaks with subops, fix recovery crashes	2020-03-28 19:09:20 +03:00
Vitaliy Filippov	1b30120918	Fix stripe reconstruction in recovery, only write modified object parts	2020-03-28 13:58:42 +03:00
Vitaliy Filippov	c0a22d825d	Fix degraded object recovery (it seems to work now)	2020-03-25 02:17:41 +03:00
Vitaliy Filippov	7acfc95f75	CONFIG_HAVE_GETTID	2020-03-25 01:20:20 +03:00
Vitaliy Filippov	250f22c0b6	Implement basic degraded object recovery (integrated into primary_write)	2020-03-25 01:17:50 +03:00
Vitaliy Filippov	dbd8418798	Reply using a single finish_op() method, allow to call OSD ops from inside the OSD	2020-03-24 00:18:52 +03:00
Vitaliy Filippov	036f4c5bf3	Fix unstable flushing, include extra OSDs with old object versions in osd_set	2020-03-23 20:28:47 +03:00
Vitaliy Filippov	fd8e1a8418	Slightly reorganize object state check code	2020-03-23 00:42:17 +03:00
Vitaliy Filippov	a08e0bfacd	Treat misplaced and degraded as separate state parts	2020-03-23 00:40:31 +03:00
Vitaliy Filippov	ddc3e927d3	Solve it in integers	2020-03-20 13:58:54 +03:00
Vitaliy Filippov	2aa605f2bb	Do not check	2020-03-20 13:38:35 +03:00
Vitaliy Filippov	18915b264a	Extract to .pm + fix all_combinations	2020-03-19 21:35:47 +03:00
Vitaliy Filippov	60f795e7eb	Add lp_solve based data distribution optimizer	2020-03-19 17:23:24 +03:00
Vitaliy Filippov	3a4279adbf	Hash-based PG distribution experiments	2020-03-17 18:52:39 +03:00
Vitaliy Filippov	1ec9794376	Extract flushing into a separate file	2020-03-15 18:39:31 +03:00
Vitaliy Filippov	d8164e9d84	Print PG states on every change	2020-03-14 22:19:45 +03:00
Vitaliy Filippov	21d0b06959	Implement flushing (stabilize/rollback) of unstable entries on start of the PG	2020-03-14 02:49:34 +03:00
Vitaliy Filippov	46f9bd2a69	Make blockstore list operation return consistent snapshots	2020-03-14 02:10:25 +03:00
Vitaliy Filippov	6982fe1255	Do not block reads by previous unfinished writes	2020-03-13 21:28:49 +03:00
Vitaliy Filippov	eba053febe	Do not start small writes before finishing the last big write to the same object	2020-03-12 02:15:01 +03:00
Vitaliy Filippov	899946ff96	Add osd_test function to unblock an OSD blocked by the lack of journal space	2020-03-10 17:19:24 +03:00
Vitaliy Filippov	3dd1b22d55	Fix segfault with concurrent OP_SYNCs	2020-03-10 17:00:23 +03:00
Vitaliy Filippov	31f9445030	Use immediate_commit to benefit the primary OSD	2020-03-10 02:20:16 +03:00
Vitaliy Filippov	3f522c66e6	Implement immediate commit mode	2020-03-10 01:59:15 +03:00
Vitaliy Filippov	c3737ae3ff	Add journal fsync to stabilize/rollback	2020-03-09 00:35:58 +03:00
Vitaliy Filippov	c863543bfe	Fix possible journal corruption caused by concurrent flushing and writing of the same journal sector	2020-03-08 01:21:19 +03:00
Vitaliy Filippov	1696446545	Rename min/max _used to _flushed	2020-03-07 16:41:58 +03:00
Vitaliy Filippov	41dddddbf2	Fix some logging	2020-03-07 16:41:53 +03:00
Vitaliy Filippov	2d4e24c9ce	Add journal dumper debugging tool	2020-03-06 02:29:43 +03:00
Vitaliy Filippov	844cacd357	Allow incorrectly forbidden BS_OP_LIST in readonly mode	2020-03-06 02:29:39 +03:00
Vitaliy Filippov	e19d9fde5f	Fix peering_pg, begin tests	2020-03-06 02:02:49 +03:00
Vitaliy Filippov	9cb07d844b	Make [un]register_consumer operate on pointers, rename get_loop_again() to has_work()	2020-03-04 21:00:20 +03:00
Vitaliy Filippov	1e21555343	Add FIXME with Oops	2020-03-04 20:34:45 +03:00
Vitaliy Filippov	94cdbcd085	Stop reading when less than <buffer> data is available	2020-03-04 18:03:16 +03:00
Vitaliy Filippov	8315407558	Incoming data pre-buffering	2020-03-04 17:34:45 +03:00
Vitaliy Filippov	b27ad550cf	Use btree_map instead of sparsepp	2020-03-04 17:12:27 +03:00
Vitaliy Filippov	8e63995306	Allow to specify data area size	2020-03-04 02:32:49 +03:00