Release 0.5.13

Another follow-up to 0.5.11
Correct reentrancy fix (now verified with a test)
2021-04-09 12:10:16 +03:00 · 2021-04-09 12:10:16 +03:00 · 2021-04-08 15:47:18 +03:00 · 2021-04-08 15:47:18 +03:00 · 2021-04-08 14:59:20 +03:00 · 2021-04-08 01:18:46 +03:00
59 changed files with 2489 additions and 1570 deletions
--- a/debian/changelog
+++ b/debian/changelog
@ -1,4 +1,4 @@
-vitastor (0.5.10-1) unstable; urgency=medium
+vitastor (0.5.13-1) unstable; urgency=medium

  * Bugfixes

--- a/debian/vitastor.Dockerfile
+++ b/debian/vitastor.Dockerfile
@ -40,10 +40,10 @@ RUN set -e -x; \
    mkdir -p /root/packages/vitastor-$REL; \
    rm -rf /root/packages/vitastor-$REL/*; \
    cd /root/packages/vitastor-$REL; \
-    cp -r /root/vitastor vitastor-0.5.10; \
-    ln -s /root/packages/qemu-$REL/qemu-*/ vitastor-0.5.10/qemu; \
-    ln -s /root/fio-build/fio-*/ vitastor-0.5.10/fio; \
-    cd vitastor-0.5.10; \
+    cp -r /root/vitastor vitastor-0.5.13; \
+    ln -s /root/packages/qemu-$REL/qemu-*/ vitastor-0.5.13/qemu; \
+    ln -s /root/fio-build/fio-*/ vitastor-0.5.13/fio; \
+    cd vitastor-0.5.13; \
    FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
    QEMU=$(head -n1 qemu/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
    sh copy-qemu-includes.sh; \
@ -59,8 +59,8 @@ RUN set -e -x; \
    echo "dep:fio=$FIO" > debian/substvars; \
    echo "dep:qemu=$QEMU" >> debian/substvars; \
    cd /root/packages/vitastor-$REL; \
-    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.5.10.orig.tar.xz vitastor-0.5.10; \
-    cd vitastor-0.5.10; \
+    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.5.13.orig.tar.xz vitastor-0.5.13; \
+    cd vitastor-0.5.13; \
    V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
    DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
    DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
--- a/mon/lp-optimizer.js
+++ b/mon/lp-optimizer.js
@ -104,6 +104,17 @@ async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize =
    return res;
 }

+function shuffle(array) // Randomly permutes `array` in place (Fisher-Yates shuffle); has no return value
+{
+    for (let i = array.length - 1, j, x; i > 0; i--) // walk from the last index down to 1
+    {
+        j = Math.floor(Math.random() * (i + 1)); // random integer index in [0, i]
+        x = array[i]; // swap array[i] and array[j] through the temporary x
+        array[i] = array[j];
+        array[j] = x;
+    }
+}
+
 function make_int_pgs(weights, pg_count)
 {
    const total_weight = Object.values(weights).reduce((a, c) => Number(a) + Number(c), 0);
@ -120,6 +131,7 @@ function make_int_pgs(weights, pg_count)
        weight_left -= weights[pg_name];
        pg_left -= n;
    }
+    shuffle(int_pgs);
    return int_pgs;
 }

--- a/mon/make-osd.sh
+++ b/mon/make-osd.sh
@ -53,7 +53,6 @@ ExecStart=/usr/bin/vitastor-osd \\
    --osd_num $OSD_NUM \\
    --disable_data_fsync 1 \\
    --immediate_commit all \\
-    --flusher_count 256 \\
    --disk_alignment 4096 --journal_block_size 4096 --meta_block_size 4096 \\
    --journal_no_same_sector_overwrites true \\
    --journal_sector_buffer_count 1024 \\
--- a/mon/make-units.sh
+++ b/mon/make-units.sh
@ -32,7 +32,8 @@ ExecStart=/usr/local/bin/etcd -name etcd$ETCD_NUM --data-dir /var/lib/etcd$ETCD_
    --advertise-client-urls http://$IP:2379 --listen-client-urls http://$IP:2379 \\
    --initial-advertise-peer-urls http://$IP:2380 --listen-peer-urls http://$IP:2380 \\
    --initial-cluster-token vitastor-etcd-1 --initial-cluster $ETCD_HOSTS \\
-    --initial-cluster-state new --max-txn-ops=100000 --auto-compaction-retention=10 --auto-compaction-mode=revision
+    --initial-cluster-state new --max-txn-ops=100000 --max-request-bytes=104857600 \\
+    --auto-compaction-retention=10 --auto-compaction-mode=revision
 WorkingDirectory=/var/lib/etcd$ETCD_NUM.etcd
 ExecStartPre=+chown -R etcd /var/lib/etcd$ETCD_NUM.etcd
 User=etcd
--- a/mon/mon.js
+++ b/mon/mon.js
@ -92,7 +92,8 @@ const etcd_tree = {
            disable_device_lock,
            // blockstore - configurable
            max_write_iodepth,
-            flusher_count,
+            min_flusher_count: 1,
+            max_flusher_count: 256,
            inmemory_metadata,
            inmemory_journal,
            journal_sector_buffer_count,
@ -182,7 +183,7 @@ const etcd_tree = {
            /* <pool_id>: {
                <pg_id>: {
                    primary: osd_num_t,
-                    state: ("starting"|"peering"|"incomplete"|"active"|"stopping"|"offline"|
+                    state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
                        "degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
                        "has_invalid"|"left_on_dead")[],
                }
@ -541,7 +542,7 @@ class Mon
        for (const osd_num of this.all_osds().sort((a, b) => a - b))
        {
            const stat = this.state.osd.stats[osd_num];
-            if (stat.size && (this.state.osd.state[osd_num] || Number(stat.time) >= down_time))
+            if (stat && stat.size && (this.state.osd.state[osd_num] || Number(stat.time) >= down_time))
            {
                // Numeric IDs are reserved for OSDs
                const osd_cfg = this.state.config.osd[osd_num];
@ -692,6 +693,11 @@ class Mon
                pg_history[i].osd_sets = pg_history[i].osd_sets || [];
                pg_history[i].osd_sets.push(prev_pgs[i]);
            }
+            if (pg_history[i] && pg_history[i].osd_sets)
+            {
+                pg_history[i].osd_sets = Object.values(pg_history[i].osd_sets
+                    .reduce((a, c) => { a[c.join(' ')] = c; return a; }, {}));
+            }
        });
        for (let i = 0; i < new_pgs.length || i < prev_pgs.length; i++)
        {
@ -842,7 +848,7 @@ class Mon
    {
        // Take configuration and state, check it against the stored configuration hash
        // Recalculate PGs and save them to etcd if the configuration is changed
-        // FIXME: Also do not change anything if the distribution is good enough and no PGs are degraded
+        // FIXME: Do not change anything if the distribution is good and random enough and no PGs are degraded
        const { up_osds, levels, osd_tree } = this.get_osd_tree();
        const tree_cfg = {
            osd_tree,
@ -901,7 +907,14 @@ class Mon
                    prev_pgs[pg-1] = this.state.history.last_clean_pgs.items[pool_id][pg].osd_set;
                }
                prev_pgs = JSON.parse(JSON.stringify(prev_pgs.length ? prev_pgs : real_prev_pgs));
-                const old_pg_count = prev_pgs.length;
+                const old_pg_count = real_prev_pgs.length;
+                const optimize_cfg = {
+                    osd_tree: pool_tree,
+                    pg_count: pool_cfg.pg_count,
+                    pg_size: pool_cfg.pg_size,
+                    pg_minsize: pool_cfg.pg_minsize,
+                    max_combinations: pool_cfg.max_osd_combinations,
+                };
                let optimize_result;
                if (old_pg_count > 0)
                {
@ -928,24 +941,23 @@ class Mon
                            pg.pop();
                        }
                    }
-                    optimize_result = await LPOptimizer.optimize_change({
-                        prev_pgs,
-                        osd_tree: pool_tree,
-                        pg_size: pool_cfg.pg_size,
-                        pg_minsize: pool_cfg.pg_minsize,
-                        max_combinations: pool_cfg.max_osd_combinations,
-                    });
+                    if (!this.state.config.pgs.hash)
+                    {
+                        // Re-shuffle PGs
+                        optimize_result = await LPOptimizer.optimize_initial(optimize_cfg);
                    }
                    else
                    {
-                    optimize_result = await LPOptimizer.optimize_initial({
-                        osd_tree: pool_tree,
-                        pg_count: pool_cfg.pg_count,
-                        pg_size: pool_cfg.pg_size,
-                        pg_minsize: pool_cfg.pg_minsize,
-                        max_combinations: pool_cfg.max_osd_combinations,
+                        optimize_result = await LPOptimizer.optimize_change({
+                            prev_pgs,
+                            ...optimize_cfg,
                        });
                    }
+                }
+                else
+                {
+                    optimize_result = await LPOptimizer.optimize_initial(optimize_cfg);
+                }
                if (old_pg_count != optimize_result.int_pgs.length)
                {
                    console.log(
@ -1072,7 +1084,7 @@ class Mon
        const op_stats = {}, subop_stats = {}, recovery_stats = {};
        for (const osd in this.state.osd.stats)
        {
-            const st = this.state.osd.stats[osd];
+            const st = this.state.osd.stats[osd]||{};
            for (const op in st.op_stats||{})
            {
                op_stats[op] = op_stats[op] || { count: 0n, usec: 0n, bytes: 0n };
--- a/rpm/build-tarball.sh
+++ b/rpm/build-tarball.sh
@ -48,4 +48,4 @@ FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Ve
 QEMU=`rpm -qi qemu qemu-kvm | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
 perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
 perl -i -pe 's/(Requires:\s*qemu(?:-kvm)?)([^\n]+)?/$1 = '$QEMU'/' $VITASTOR/rpm/vitastor-el$EL.spec
-tar --transform 's#^#vitastor-0.5.10/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.5.10$(rpm --eval '%dist').tar.gz *
+tar --transform 's#^#vitastor-0.5.13/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.5.13$(rpm --eval '%dist').tar.gz *
--- a/rpm/vitastor-el7.Dockerfile
+++ b/rpm/vitastor-el7.Dockerfile
@ -37,7 +37,7 @@ ADD . /root/vitastor
 RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
-    cp /root/vitastor-0.5.10.el7.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-0.5.13.el7.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
--- a/rpm/vitastor-el7.spec
+++ b/rpm/vitastor-el7.spec
@ -1,11 +1,11 @@
 Name:           vitastor
-Version:        0.5.10
+Version:        0.5.13
 Release:        1%{?dist}
 Summary:        Vitastor, a fast software-defined clustered block storage

 License:        Vitastor Network Public License 1.1
 URL:            https://vitastor.io/
-Source0:        vitastor-0.5.10.el7.tar.gz
+Source0:        vitastor-0.5.13.el7.tar.gz

 BuildRequires:  liburing-devel >= 0.6
 BuildRequires:  gperftools-devel
--- a/rpm/vitastor-el8.Dockerfile
+++ b/rpm/vitastor-el8.Dockerfile
@ -35,7 +35,7 @@ ADD . /root/vitastor
 RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
-    cp /root/vitastor-0.5.10.el8.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-0.5.13.el8.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
--- a/rpm/vitastor-el8.spec
+++ b/rpm/vitastor-el8.spec
@ -1,11 +1,11 @@
 Name:           vitastor
-Version:        0.5.10
+Version:        0.5.13
 Release:        1%{?dist}
 Summary:        Vitastor, a fast software-defined clustered block storage

 License:        Vitastor Network Public License 1.1
 URL:            https://vitastor.io/
-Source0:        vitastor-0.5.10.el8.tar.gz
+Source0:        vitastor-0.5.13.el8.tar.gz

 BuildRequires:  liburing-devel >= 0.6
 BuildRequires:  gperftools-devel
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -14,7 +14,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
 endif()

 add_definitions(-DVERSION="0.6-dev")
-add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith)
+add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -I ${CMAKE_SOURCE_DIR}/src)
 if (${WITH_ASAN})
 	add_definitions(-fsanitize=address -fno-omit-frame-pointer)
 	add_link_options(-fsanitize=address -fno-omit-frame-pointer)
@ -66,7 +66,8 @@ target_link_libraries(fio_vitastor_blk
 # vitastor-osd
 add_executable(vitastor-osd
 	osd_main.cpp osd.cpp osd_secondary.cpp msgr_receive.cpp msgr_send.cpp osd_peering.cpp osd_flush.cpp osd_peering_pg.cpp
-	osd_primary.cpp osd_primary_subops.cpp etcd_state_client.cpp messenger.cpp osd_cluster.cpp http_client.cpp osd_ops.cpp pg_states.cpp
+	osd_primary.cpp osd_primary_sync.cpp osd_primary_write.cpp osd_primary_subops.cpp
+	etcd_state_client.cpp messenger.cpp msgr_stop.cpp msgr_op.cpp osd_cluster.cpp http_client.cpp osd_ops.cpp pg_states.cpp
 	osd_rmw.cpp base64.cpp timerfd_manager.cpp epoll_manager.cpp ../json11/json11.cpp
 )
 target_link_libraries(vitastor-osd
@ -86,7 +87,7 @@ target_link_libraries(fio_vitastor_sec
 # libvitastor_client.so
 add_library(vitastor_client SHARED
 	cluster_client.cpp epoll_manager.cpp etcd_state_client.cpp
-	messenger.cpp msgr_send.cpp msgr_receive.cpp ringloop.cpp ../json11/json11.cpp
+	messenger.cpp msgr_stop.cpp msgr_op.cpp msgr_send.cpp msgr_receive.cpp ringloop.cpp ../json11/json11.cpp
 	http_client.cpp osd_ops.cpp pg_states.cpp timerfd_manager.cpp base64.cpp
 )
 target_link_libraries(vitastor_client
@ -161,7 +162,8 @@ target_link_libraries(osd_rmw_test Jerasure tcmalloc_minimal)

 # stub_uring_osd
 add_executable(stub_uring_osd
-	stub_uring_osd.cpp epoll_manager.cpp messenger.cpp msgr_send.cpp msgr_receive.cpp ringloop.cpp timerfd_manager.cpp ../json11/json11.cpp
+	stub_uring_osd.cpp epoll_manager.cpp messenger.cpp msgr_stop.cpp msgr_op.cpp
+	msgr_send.cpp msgr_receive.cpp ringloop.cpp timerfd_manager.cpp ../json11/json11.cpp
 )
 target_link_libraries(stub_uring_osd
 	${LIBURING_LIBRARIES}
@ -175,6 +177,15 @@ target_link_libraries(osd_peering_pg_test tcmalloc_minimal)
 # test_allocator
 add_executable(test_allocator test_allocator.cpp allocator.cpp)

+# test_cluster_client
+add_executable(test_cluster_client
+	test_cluster_client.cpp
+	pg_states.cpp osd_ops.cpp cluster_client.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
+	etcd_state_client.cpp timerfd_manager.cpp ../json11/json11.cpp
+)
+target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)
+target_include_directories(test_cluster_client PUBLIC ${CMAKE_SOURCE_DIR}/src/mock)
+
 ## test_blockstore, test_shit
 #add_executable(test_blockstore test_blockstore.cpp timerfd_interval.cpp)
 #target_link_libraries(test_blockstore blockstore)
--- a/src/allocator.cpp
+++ b/src/allocator.cpp
@ -37,6 +37,21 @@ allocator::~allocator()
    delete[] mask;
 }

+bool allocator::get(uint64_t addr) // Returns the bit stored for `addr` in `mask`; returns false when addr >= size
+{
+    if (addr >= size)
+    {
+        return false;
+    }
+    uint64_t p2 = 1, offset = 0; // offset accumulates 1 + 64 + 64^2 + ... in the loop below
+    while (p2 * 64 < size) // NOTE(review): presumably skips the upper levels of a multi-level bitmap - confirm against the constructor and set()
+    {
+        offset += p2;
+        p2 = p2 * 64;
+    }
+    return ((mask[offset + addr/64] >> (addr % 64)) & 1); // bit (addr % 64) of the 64-bit word at index offset + addr/64
+}
+
 void allocator::set(uint64_t addr, bool value)
 {
    if (addr >= size)
--- a/src/allocator.h
+++ b/src/allocator.h
@ -16,6 +16,7 @@ class allocator
 public:
    allocator(uint64_t blocks);
    ~allocator();
+    bool get(uint64_t addr);
    void set(uint64_t addr, bool value);
    uint64_t find_free();
    uint64_t get_free_count();
--- a/src/blockstore.cpp
+++ b/src/blockstore.cpp
@ -58,7 +58,7 @@ uint64_t blockstore_t::get_free_block_count()
    return impl->get_free_block_count();
 }

-uint32_t blockstore_t::get_disk_alignment()
+uint32_t blockstore_t::get_bitmap_granularity()
 {
-    return impl->get_disk_alignment();
+    return impl->get_bitmap_granularity();
 }
--- a/src/blockstore.h
+++ b/src/blockstore.h
@ -183,5 +183,5 @@ public:
    uint64_t get_block_count();
    uint64_t get_free_block_count();

-    uint32_t get_disk_alignment();
+    uint32_t get_bitmap_granularity();
 };
--- a/src/blockstore_flush.cpp
+++ b/src/blockstore_flush.cpp
@ -3,12 +3,13 @@

 #include "blockstore_impl.h"

-journal_flusher_t::journal_flusher_t(int flusher_count, blockstore_impl_t *bs)
+journal_flusher_t::journal_flusher_t(blockstore_impl_t *bs)
 {
    this->bs = bs;
-    this->flusher_count = flusher_count;
-    this->cur_flusher_count = 1;
-    this->target_flusher_count = 1;
+    this->max_flusher_count = bs->max_flusher_count;
+    this->min_flusher_count = bs->min_flusher_count;
+    this->cur_flusher_count = bs->min_flusher_count;
+    this->target_flusher_count = bs->min_flusher_count;
    dequeuing = false;
    trimming = false;
    active_flushers = 0;
@ -19,8 +20,8 @@ journal_flusher_t::journal_flusher_t(int flusher_count, blockstore_impl_t *bs)
    journal_trim_counter = 0;
    trim_wanted = 0;
    journal_superblock = bs->journal.inmemory ? bs->journal.buffer : memalign_or_die(MEM_ALIGNMENT, bs->journal_block_size);
-    co = new journal_flusher_co[flusher_count];
-    for (int i = 0; i < flusher_count; i++)
+    co = new journal_flusher_co[max_flusher_count];
+    for (int i = 0; i < max_flusher_count; i++)
    {
        co[i].bs = bs;
        co[i].flusher = this;
@ -71,10 +72,10 @@ bool journal_flusher_t::is_active()
 void journal_flusher_t::loop()
 {
    target_flusher_count = bs->write_iodepth*2;
-    if (target_flusher_count <= 0)
-        target_flusher_count = 1;
-    else if (target_flusher_count > flusher_count)
-        target_flusher_count = flusher_count;
+    if (target_flusher_count < min_flusher_count)
+        target_flusher_count = min_flusher_count;
+    else if (target_flusher_count > max_flusher_count)
+        target_flusher_count = max_flusher_count;
    if (target_flusher_count > cur_flusher_count)
        cur_flusher_count = target_flusher_count;
    else if (target_flusher_count < cur_flusher_count)
@ -237,7 +238,8 @@ bool journal_flusher_co::loop()
    else if (wait_state == 21)
        goto resume_21;
 resume_0:
-    if (!flusher->flush_queue.size() || !flusher->dequeuing)
+    if (flusher->flush_queue.size() < flusher->min_flusher_count && !flusher->trim_wanted ||
+        !flusher->flush_queue.size() || !flusher->dequeuing)
    {
 stop_flusher:
        if (flusher->trim_wanted > 0 && flusher->journal_trim_counter > 0)
@ -482,6 +484,14 @@ resume_1:
        }
        if (has_delete)
        {
+            clean_disk_entry *new_entry = (clean_disk_entry*)(meta_new.buf + meta_new.pos*bs->clean_entry_size);
+            if (new_entry->oid.inode != 0 && new_entry->oid != cur.oid)
+            {
+                printf("Fatal error (metadata corruption or bug): tried to delete metadata entry %lu (%lx:%lx) while deleting %lx:%lx\n",
+                    clean_loc >> bs->block_order, new_entry->oid.inode, new_entry->oid.stripe, cur.oid.inode, cur.oid.stripe);
+                exit(1);
+            }
+            // zero out new metadata entry
            memset(meta_new.buf + meta_new.pos*bs->clean_entry_size, 0, bs->clean_entry_size);
        }
        else
@ -646,7 +656,7 @@ bool journal_flusher_co::scan_dirty(int wait_base)
        {
            char err[1024];
            snprintf(
-                err, 1024, "BUG: Unexpected dirty_entry %lx:%lx v%lu unstable state during flush: %d",
+                err, 1024, "BUG: Unexpected dirty_entry %lx:%lx v%lu unstable state during flush: 0x%x",
                dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, dirty_it->second.state
            );
            throw std::runtime_error(err);
@ -775,7 +785,10 @@ void journal_flusher_co::update_clean_db()
    if (old_clean_loc != UINT64_MAX && old_clean_loc != clean_loc)
    {
 #ifdef BLOCKSTORE_DEBUG
-        printf("Free block %lu (new location is %lu)\n", old_clean_loc >> bs->block_order, clean_loc >> bs->block_order);
+        printf("Free block %lu from %lx:%lx v%lu (new location is %lu)\n",
+            old_clean_loc >> bs->block_order,
+            cur.oid.inode, cur.oid.stripe, cur.version,
+            clean_loc >> bs->block_order);
 #endif
        bs->data_alloc->set(old_clean_loc >> bs->block_order, false);
    }
@ -783,6 +796,11 @@ void journal_flusher_co::update_clean_db()
    {
        auto clean_it = bs->clean_db.find(cur.oid);
        bs->clean_db.erase(clean_it);
+#ifdef BLOCKSTORE_DEBUG
+        printf("Free block %lu from %lx:%lx v%lu (delete)\n",
+            clean_loc >> bs->block_order,
+            cur.oid.inode, cur.oid.stripe, cur.version);
+#endif
        bs->data_alloc->set(clean_loc >> bs->block_order, false);
        clean_loc = UINT64_MAX;
    }
@ -804,7 +822,7 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
        goto resume_1;
    else if (wait_state == wait_base+2)
        goto resume_2;
-    if (!(fsync_meta ? bs->disable_meta_fsync : bs->disable_journal_fsync))
+    if (!(fsync_meta ? bs->disable_meta_fsync : bs->disable_data_fsync))
    {
        cur_sync = flusher->syncs.end();
        while (cur_sync != flusher->syncs.begin())
--- a/src/blockstore_flush.h
+++ b/src/blockstore_flush.h
@ -80,7 +80,7 @@ class journal_flusher_t
 {
    int trim_wanted = 0;
    bool dequeuing;
-    int flusher_count, cur_flusher_count, target_flusher_count;
+    int min_flusher_count, max_flusher_count, cur_flusher_count, target_flusher_count;
    int flusher_start_threshold;
    journal_flusher_co *co;
    blockstore_impl_t *bs;
@ -99,7 +99,7 @@ class journal_flusher_t
    std::deque<object_id> flush_queue;
    std::map<object_id, uint64_t> flush_versions;
 public:
-    journal_flusher_t(int flusher_count, blockstore_impl_t *bs);
+    journal_flusher_t(blockstore_impl_t *bs);
    ~journal_flusher_t();
    void loop();
    bool is_active();
--- a/src/blockstore_impl.cpp
+++ b/src/blockstore_impl.cpp
@ -10,9 +10,9 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
    ring_consumer.loop = [this]() { loop(); };
    ringloop->register_consumer(&ring_consumer);
    initialized = 0;
-    zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, block_size);
    data_fd = meta_fd = journal.fd = -1;
    parse_config(config);
+    zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, block_size);
    try
    {
        open_data();
@ -31,7 +31,7 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
            close(journal.fd);
        throw;
    }
-    flusher = new journal_flusher_t(flusher_count, this);
+    flusher = new journal_flusher_t(this);
 }

 blockstore_impl_t::~blockstore_impl_t()
--- a/src/blockstore_impl.h
+++ b/src/blockstore_impl.h
@ -197,8 +197,8 @@ class blockstore_impl_t
    // Suitable only for server SSDs with capacitors, requires disabled data and journal fsyncs
    int immediate_commit = IMMEDIATE_NONE;
    bool inmemory_meta = false;
-    // Maximum flusher count
-    unsigned flusher_count;
+    // Maximum and minimum flusher count
+    unsigned max_flusher_count, min_flusher_count;
    // Maximum queue depth
    unsigned max_write_iodepth = 128;
    /******* END OF OPTIONS *******/
@ -210,6 +210,7 @@ class blockstore_impl_t
    blockstore_dirty_db_t dirty_db;
    std::vector<blockstore_op_t*> submit_queue;
    std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
+    int unsynced_big_write_count = 0;
    allocator *data_alloc = NULL;
    uint8_t *zero_object;

@ -283,7 +284,7 @@ class blockstore_impl_t
    // Stabilize
    int dequeue_stable(blockstore_op_t *op);
    int continue_stable(blockstore_op_t *op);
-    void mark_stable(const obj_ver_id & ov);
+    void mark_stable(const obj_ver_id & ov, bool forget_dirty = false);
    void handle_stable_event(ring_data_t *data, blockstore_op_t *op);
    void stabilize_object(object_id oid, uint64_t max_ver);

@ -326,5 +327,5 @@ public:
    inline uint32_t get_block_size() { return block_size; }
    inline uint64_t get_block_count() { return block_count; }
    inline uint64_t get_free_block_count() { return data_alloc->get_free_count(); }
-    inline uint32_t get_disk_alignment() { return disk_alignment; }
+    inline uint32_t get_bitmap_granularity() { return disk_alignment; }
 };
--- a/src/blockstore_init.cpp
+++ b/src/blockstore_init.cpp
@ -111,7 +111,10 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
                {
                    // free the previous block
 #ifdef BLOCKSTORE_DEBUG
-                    printf("Free block %lu (new location is %lu)\n", clean_it->second.location >> block_order, done_cnt+i);
+                    printf("Free block %lu from %lx:%lx v%lu (new location is %lu)\n",
+                        clean_it->second.location >> block_order,
+                        clean_it->first.inode, clean_it->first.stripe, clean_it->second.version,
+                        done_cnt+i);
 #endif
                    bs->data_alloc->set(clean_it->second.location >> block_order, false);
                }
@ -399,6 +402,18 @@ resume_1:
            }
        }
    }
+    for (auto ov: double_allocs)
+    {
+        auto dirty_it = bs->dirty_db.find(ov);
+        if (dirty_it != bs->dirty_db.end() &&
+            IS_BIG_WRITE(dirty_it->second.state) &&
+            dirty_it->second.location == UINT64_MAX)
+        {
+            printf("Fatal error (bug): %lx:%lx v%lu big_write journal_entry was allocated over another object\n",
+                dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
+            exit(1);
+        }
+    }
    bs->flusher->mark_trim_possible();
    bs->journal.dirty_start = bs->journal.next_free;
    printf(
@ -549,7 +564,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                    unstab = unstab < ov.version ? ov.version : unstab;
                    if (je->type == JE_SMALL_WRITE_INSTANT)
                    {
-                        bs->mark_stable(ov);
+                        bs->mark_stable(ov, true);
                    }
                }
            }
@ -579,32 +594,10 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                        // its data and metadata are already flushed.
                        // We don't know if newer versions are flushed, but
                        // the previous delete definitely is.
-                        // So we flush previous dirty entries, but retain the clean one.
+                        // So we forget previous dirty entries, but retain the clean one.
                        // This feature is required for writes happening shortly
                        // after deletes.
-                        auto dirty_end = dirty_it;
-                        dirty_end++;
-                        while (1)
-                        {
-                            if (dirty_it == bs->dirty_db.begin())
-                            {
-                                break;
-                            }
-                            dirty_it--;
-                            if (dirty_it->first.oid != je->big_write.oid)
-                            {
-                                dirty_it++;
-                                break;
-                            }
-                        }
-                        auto clean_it = bs->clean_db.find(je->big_write.oid);
-                        bs->erase_dirty(
-                            dirty_it, dirty_end,
-                            clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX
-                        );
-                        // Remove it from the flusher's queue, too
-                        // Otherwise it may end up referring to a small unstable write after reading the rest of the journal
-                        bs->flusher->remove_flush(je->big_write.oid);
+                        erase_dirty_object(dirty_it);
                    }
                }
                auto clean_it = bs->clean_db.find(je->big_write.oid);
@ -616,18 +609,33 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                        .oid = je->big_write.oid,
                        .version = je->big_write.version,
                    };
-                    bs->dirty_db.emplace(ov, (dirty_entry){
+                    auto dirty_it = bs->dirty_db.emplace(ov, (dirty_entry){
                        .state = (BS_ST_BIG_WRITE | BS_ST_SYNCED),
                        .flags = 0,
                        .location = je->big_write.location,
                        .offset = je->big_write.offset,
                        .len = je->big_write.len,
                        .journal_sector = proc_pos,
-                    });
+                    }).first;
+                    if (bs->data_alloc->get(je->big_write.location >> bs->block_order))
+                    {
+                        // This is probably a big_write that's already flushed and freed, but it may
+                        // also indicate a bug. So we remember such entries and recheck them afterwards.
+                        // If it's not a bug they won't be present after reading the whole journal.
+                        dirty_it->second.location = UINT64_MAX;
+                        double_allocs.push_back(ov);
+                    }
+                    else
+                    {
 #ifdef BLOCKSTORE_DEBUG
-                    printf("Allocate block %lu\n", je->big_write.location >> bs->block_order);
+                        printf(
+                            "Allocate block (journal) %lu: %lx:%lx v%lu\n",
+                            je->big_write.location >> bs->block_order,
+                            ov.oid.inode, ov.oid.stripe, ov.version
+                        );
 #endif
                        bs->data_alloc->set(je->big_write.location >> bs->block_order, true);
+                    }
                    bs->journal.used_sectors[proc_pos]++;
 #ifdef BLOCKSTORE_DEBUG
                    printf(
@ -639,7 +647,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                    unstab = unstab < ov.version ? ov.version : unstab;
                    if (je->type == JE_BIG_WRITE_INSTANT)
                    {
-                        bs->mark_stable(ov);
+                        bs->mark_stable(ov, true);
                    }
                }
            }
@ -653,7 +661,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                    .oid = je->stable.oid,
                    .version = je->stable.version,
                };
-                bs->mark_stable(ov);
+                bs->mark_stable(ov, true);
            }
            else if (je->type == JE_ROLLBACK)
            {
@ -672,9 +680,26 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
 #ifdef BLOCKSTORE_DEBUG
                printf("je_delete oid=%lx:%lx ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version);
 #endif
+                bool dirty_exists = false;
+                auto dirty_it = bs->dirty_db.upper_bound((obj_ver_id){
+                    .oid = je->del.oid,
+                    .version = UINT64_MAX,
+                });
+                if (dirty_it != bs->dirty_db.begin())
+                {
+                    dirty_it--;
+                    dirty_exists = dirty_it->first.oid == je->del.oid;
+                }
                auto clean_it = bs->clean_db.find(je->del.oid);
-                if (clean_it == bs->clean_db.end() ||
-                    clean_it->second.version < je->del.version)
+                bool clean_exists = (clean_it != bs->clean_db.end() &&
+                    clean_it->second.version < je->del.version);
+                if (!clean_exists && dirty_exists)
+                {
+                    // Clean entry doesn't exist. This means that the delete is already flushed.
+                    // So we must not flush this object anymore.
+                    erase_dirty_object(dirty_it);
+                }
+                else if (clean_exists || dirty_exists)
                {
                    // oid, version
                    obj_ver_id ov = {
@ -692,8 +717,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                    bs->journal.used_sectors[proc_pos]++;
                    // Deletions are treated as immediately stable, because
                    // "2-phase commit" (write->stabilize) isn't sufficient for them anyway
-                    bs->mark_stable(ov);
+                    bs->mark_stable(ov, true);
                }
+                // Ignore delete if neither preceding dirty entries nor the clean one are present
            }
            started = true;
            pos += je->size;
@ -704,3 +730,30 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
    bs->journal.next_free = next_free;
    return 1;
 }
+
+// Erase the dirty_db entries of the object <dirty_it> points to: the range passed to
+// erase_dirty() starts at the first entry with the same oid and ends right after <dirty_it>.
+// The object is also removed from the flusher's queue.
+void blockstore_init_journal::erase_dirty_object(blockstore_dirty_db_t::iterator dirty_it)
+{
+    // Take a copy of the object ID because it's still needed after the entries are erased
+    auto oid = dirty_it->first.oid;
+    auto range_end = dirty_it;
+    range_end++;
+    // Walk back while the preceding entry still belongs to the same object
+    auto range_start = dirty_it;
+    while (range_start != bs->dirty_db.begin())
+    {
+        auto prev_it = range_start;
+        prev_it--;
+        if (prev_it->first.oid != oid)
+        {
+            break;
+        }
+        range_start = prev_it;
+    }
+    // UINT64_MAX is passed when the object has no clean_db entry
+    uint64_t clean_loc = UINT64_MAX;
+    auto clean_it = bs->clean_db.find(oid);
+    if (clean_it != bs->clean_db.end())
+    {
+        clean_loc = clean_it->second.location;
+    }
+    bs->erase_dirty(range_start, range_end, clean_loc);
+    // Remove it from the flusher's queue, too
+    // Otherwise it may end up referring to a small unstable write after reading the rest of the journal
+    bs->flusher->remove_flush(oid);
+}
--- a/src/blockstore_init.h
+++ b/src/blockstore_init.h
@ -36,6 +36,7 @@ class blockstore_init_journal
    bool started = false;
    uint64_t next_free;
    std::vector<bs_init_journal_done> done;
+    std::vector<obj_ver_id> double_allocs;
    uint64_t journal_pos = 0;
    uint64_t continue_pos = 0;
    void *init_write_buf = NULL;
@ -48,6 +49,7 @@ class blockstore_init_journal
    std::function<void(ring_data_t*)> simple_callback;
    int handle_journal_part(void *buf, uint64_t done_pos, uint64_t len);
    void handle_event(ring_data_t *data);
+    void erase_dirty_object(blockstore_dirty_db_t::iterator dirty_it);
 public:
    blockstore_init_journal(blockstore_impl_t* bs);
    int loop();
--- a/src/blockstore_open.cpp
+++ b/src/blockstore_open.cpp
@ -69,7 +69,10 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
    journal_block_size = strtoull(config["journal_block_size"].c_str(), NULL, 10);
    meta_block_size = strtoull(config["meta_block_size"].c_str(), NULL, 10);
    bitmap_granularity = strtoull(config["bitmap_granularity"].c_str(), NULL, 10);
-    flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
+    max_flusher_count = strtoull(config["max_flusher_count"].c_str(), NULL, 10);
+    if (!max_flusher_count)
+        max_flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
+    min_flusher_count = strtoull(config["min_flusher_count"].c_str(), NULL, 10);
    max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
    // Validate
    if (!block_size)
@ -80,9 +83,13 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
    {
        throw std::runtime_error("Bad block size");
    }
-    if (!flusher_count)
+    if (!max_flusher_count)
    {
-        flusher_count = 32;
+        max_flusher_count = 256;
+    }
+    if (!min_flusher_count)
+    {
+        min_flusher_count = 1;
    }
    if (!max_write_iodepth)
    {
--- a/src/blockstore_rollback.cpp
+++ b/src/blockstore_rollback.cpp
@ -163,10 +163,7 @@ void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
        auto rm_start = it;
        auto rm_end = it;
        it--;
-        while (it->first.oid == ov.oid &&
-            it->first.version > ov.version &&
-            !IS_IN_FLIGHT(it->second.state) &&
-            !IS_STABLE(it->second.state))
+        while (1)
        {
            if (it->first.oid != ov.oid)
                break;
@ -176,7 +173,7 @@ void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
                    max_unstable = it->first.version;
                break;
            }
-            else if (IS_STABLE(it->second.state))
+            else if (IS_IN_FLIGHT(it->second.state) || IS_STABLE(it->second.state))
                break;
            // Remove entry
            rm_start = it;
@ -187,7 +184,6 @@ void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
        if (rm_start != rm_end)
        {
            erase_dirty(rm_start, rm_end, UINT64_MAX);
-        }
            auto unstab_it = unstable_writes.find(ov.oid);
            if (unstab_it != unstable_writes.end())
            {
@ -197,6 +193,7 @@ void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
                    unstab_it->second = max_unstable;
            }
        }
+    }
 }

 void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t *op)
@ -251,10 +248,12 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
    }
    while (1)
    {
-        if (IS_BIG_WRITE(dirty_it->second.state) && dirty_it->second.location != clean_loc)
+        if (IS_BIG_WRITE(dirty_it->second.state) && dirty_it->second.location != clean_loc &&
+            dirty_it->second.location != UINT64_MAX)
        {
 #ifdef BLOCKSTORE_DEBUG
-            printf("Free block %lu\n", dirty_it->second.location >> block_order);
+            printf("Free block %lu from %lx:%lx v%lu\n", dirty_it->second.location >> block_order,
+                dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
 #endif
            data_alloc->set(dirty_it->second.location >> block_order, false);
        }
--- a/src/blockstore_stable.cpp
+++ b/src/blockstore_stable.cpp
@ -168,6 +168,9 @@ resume_5:
    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
    {
        // Mark all dirty_db entries up to op->version as stable
+#ifdef BLOCKSTORE_DEBUG
+        printf("Stabilize %lx:%lx v%lu\n", v->oid.inode, v->oid.stripe, v->version);
+#endif
        mark_stable(*v);
    }
    // Acknowledge op
@ -176,22 +179,39 @@ resume_5:
    return 2;
 }

-void blockstore_impl_t::mark_stable(const obj_ver_id & v)
+void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty)
 {
    auto dirty_it = dirty_db.find(v);
    if (dirty_it != dirty_db.end())
    {
        while (1)
        {
+            bool was_stable = IS_STABLE(dirty_it->second.state);
            if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_SYNCED)
            {
                dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_STABLE;
            }
-            else if (IS_STABLE(dirty_it->second.state))
+            if (forget_dirty && (IS_BIG_WRITE(dirty_it->second.state) ||
+                IS_DELETE(dirty_it->second.state)))
            {
+                // Big write overrides all previous dirty entries
+                auto erase_end = dirty_it;
+                while (dirty_it != dirty_db.begin())
+                {
+                    dirty_it--;
+                    if (dirty_it->first.oid != v.oid)
+                    {
+                        dirty_it++;
                        break;
                    }
-            if (dirty_it == dirty_db.begin())
+                }
+                auto clean_it = clean_db.find(v.oid);
+                uint64_t clean_loc = clean_it != clean_db.end()
+                    ? clean_it->second.location : UINT64_MAX;
+                erase_dirty(dirty_it, erase_end, clean_loc);
+                break;
+            }
+            if (was_stable || dirty_it == dirty_db.begin())
            {
                break;
            }
--- a/src/blockstore_sync.cpp
+++ b/src/blockstore_sync.cpp
@ -24,6 +24,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
    if (PRIV(op)->op_state == 0)
    {
        stop_sync_submitted = false;
+        unsynced_big_write_count -= unsynced_big_writes.size();
        PRIV(op)->sync_big_writes.swap(unsynced_big_writes);
        PRIV(op)->sync_small_writes.swap(unsynced_small_writes);
        PRIV(op)->sync_small_checked = 0;
--- a/src/blockstore_write.cpp
+++ b/src/blockstore_write.cpp
@ -201,7 +201,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
    if ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE)
    {
        blockstore_journal_check_t space_check(this);
-        if (!space_check.check_available(op, unsynced_big_writes.size() + 1, sizeof(journal_entry_big_write), JOURNAL_STABILIZE_RESERVATION))
+        if (!space_check.check_available(op, unsynced_big_write_count + 1, sizeof(journal_entry_big_write), JOURNAL_STABILIZE_RESERVATION))
        {
            return 0;
        }
@ -224,7 +224,10 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        dirty_it->second.location = loc << block_order;
        dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SUBMITTED;
 #ifdef BLOCKSTORE_DEBUG
-        printf("Allocate block %lu\n", loc);
+        printf(
+            "Allocate block %lu for %lx:%lx v%lu\n",
+            loc, op->oid.inode, op->oid.stripe, op->version
+        );
 #endif
        data_alloc->set(loc, true);
        uint64_t stripe_offset = (op->offset % bitmap_granularity);
@ -250,11 +253,8 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
        if (immediate_commit != IMMEDIATE_ALL)
        {
-            // Remember big write as unsynced
-            unsynced_big_writes.push_back((obj_ver_id){
-                .oid = op->oid,
-                .version = op->version,
-            });
+            // Increase the counter, but don't save into unsynced_writes yet (can't sync until the write is finished)
+            unsynced_big_write_count++;
            PRIV(op)->op_state = 3;
        }
        else
@ -267,7 +267,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        // Small (journaled) write
        // First check if the journal has sufficient space
        blockstore_journal_check_t space_check(this);
-        if (unsynced_big_writes.size() && !space_check.check_available(op, unsynced_big_writes.size(), sizeof(journal_entry_big_write), 0)
+        if (unsynced_big_write_count && !space_check.check_available(op, unsynced_big_write_count, sizeof(journal_entry_big_write), 0)
            || !space_check.check_available(op, 1, sizeof(journal_entry_small_write), op->len + JOURNAL_STABILIZE_RESERVATION))
        {
            return 0;
@ -359,14 +359,6 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        {
            journal.next_free = journal_block_size;
        }
-        if (immediate_commit == IMMEDIATE_NONE)
-        {
-            // Remember small write as unsynced
-            unsynced_small_writes.push_back((obj_ver_id){
-                .oid = op->oid,
-                .version = op->version,
-            });
-        }
        if (!PRIV(op)->pending_ops)
        {
            PRIV(op)->op_state = 4;
@ -431,7 +423,7 @@ resume_2:
 resume_4:
    // Switch object state
 #ifdef BLOCKSTORE_DEBUG
-    printf("Ack write %lx:%lx v%lu = state %x\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
+    printf("Ack write %lx:%lx v%lu = state 0x%x\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
 #endif
    bool imm = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE
        ? (immediate_commit == IMMEDIATE_ALL)
@ -445,11 +437,31 @@ resume_4:
        | (imm ? BS_ST_SYNCED : BS_ST_WRITTEN);
    if (imm && ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT)))
    {
-        // Deletions are treated as immediately stable
+        // Deletions and 'instant' operations are treated as immediately stable
        mark_stable(dirty_it->first);
    }
-    if (immediate_commit == IMMEDIATE_ALL)
+    if (!imm)
    {
+        if ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE)
+        {
+            // Remember big write as unsynced
+            unsynced_big_writes.push_back((obj_ver_id){
+                .oid = op->oid,
+                .version = op->version,
+            });
+        }
+        else
+        {
+            // Remember small write as unsynced
+            unsynced_small_writes.push_back((obj_ver_id){
+                .oid = op->oid,
+                .version = op->version,
+            });
+        }
+    }
+    if (imm && (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE)
+    {
+        // Unblock small writes
        dirty_it++;
        while (dirty_it != dirty_db.end() && dirty_it->first.oid == op->oid)
        {
@ -583,14 +595,6 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
        PRIV(op)->pending_ops++;
    }
-    else
-    {
-        // Remember delete as unsynced
-        unsynced_small_writes.push_back((obj_ver_id){
-            .oid = op->oid,
-            .version = op->version,
-        });
-    }
    if (!PRIV(op)->pending_ops)
    {
        PRIV(op)->op_state = 4;
--- a/src/cluster_client.cpp
+++ b/src/cluster_client.cpp
@ -2,8 +2,17 @@
 // License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)

 #include <stdexcept>
+#include <assert.h>
 #include "cluster_client.h"

+// Bits tested against the `flags` field of an operation part (e.g. `parts[i].flags & PART_SENT`)
+#define PART_SENT 1
+#define PART_DONE 2
+#define PART_ERROR 4
+// Values of a dirty buffer's `state` field. They are assigned and compared as whole
+// values (e.g. `state != CACHE_REPEATING`), not combined as bits
+#define CACHE_DIRTY 1
+#define CACHE_FLUSHING 2
+#define CACHE_REPEATING 3
+// Bit in cluster_op_t::flags set on the internal writes created by flush_buffer()
+#define OP_FLUSH_BUFFER 2
+
 cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
 {
    this->ringloop = ringloop;
@ -20,39 +29,17 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
            // peer_osd just connected
            continue_ops();
        }
-        else if (unsynced_writes.size())
+        else if (dirty_buffers.size())
        {
            // peer_osd just dropped connection
-            for (auto op: syncing_writes)
+            // determine WHICH dirty_buffers are now obsolete and repeat them
+            for (auto & wr: dirty_buffers)
            {
-                for (auto & part: op->parts)
+                if (affects_osd(wr.first.inode, wr.first.stripe, wr.second.len, peer_osd) &&
+                    wr.second.state != CACHE_REPEATING)
                {
-                    if (part.osd_num == peer_osd && part.done)
-                    {
-                        // repeat this operation
-                        part.osd_num = 0;
-                        part.done = false;
-                        assert(!part.sent);
-                        op->done_count--;
-                    }
-                }
-            }
-            for (auto op: unsynced_writes)
-            {
-                for (auto & part: op->parts)
-                {
-                    if (part.osd_num == peer_osd && part.done)
-                    {
-                        // repeat this operation
-                        part.osd_num = 0;
-                        part.done = false;
-                        assert(!part.sent);
-                        op->done_count--;
-                    }
-                }
-                if (op->done_count < op->parts.size())
-                {
-                    cur_ops.insert(op);
+                    // FIXME: Flush in larger parts
+                    flush_buffer(wr.first, &wr.second);
                }
            }
            continue_ops();
@ -90,37 +77,87 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd

 cluster_client_t::~cluster_client_t()
 {
+    // Free the data of every buffer still held in dirty_buffers, then drop the entries.
+    // The loop variable is a copy of the map entry; only the pointed-to memory is released here
+    for (auto bp: dirty_buffers)
+    {
+        free(bp.second.buf);
+    }
+    dirty_buffers.clear();
    if (ringloop)
    {
        ringloop->unregister_consumer(&consumer);
    }
 }

-void cluster_client_t::stop()
-{
-    while (msgr.clients.size() > 0)
-    {
-        msgr.stop_client(msgr.clients.begin()->first);
-    }
-}
-
 void cluster_client_t::continue_ops(bool up_retry)
 {
-    for (auto op_it = cur_ops.begin(); op_it != cur_ops.end(); )
+    // Walk op_queue from the start and try to advance every queued operation.
+    // Operations with up_wait set are only retried when <up_retry> is true.
+    // continuing_ops is 0 when idle, 1 while the loop below runs and 2 when a restart
+    // of the loop was requested (by a nested call or by flush_buffer())
+    if (!pgs_loaded)
    {
-        if ((*op_it)->up_wait)
+        // We're offline
+        return;
+    }
+    if (continuing_ops)
    {
-            if (up_retry)
+        // Attempt to reenter the function
+        continuing_ops = 2;
+        return;
+    }
+restart:
+    continuing_ops = 1;
+    op_queue_pos = 0;
+    bool has_flushes = false, has_writes = false;
+    while (op_queue_pos < op_queue.size())
    {
-                (*op_it)->up_wait = false;
-                continue_rw(*op_it++);
+        auto op = op_queue[op_queue_pos];
+        // Read everything needed from <op> up front: callbacks invoked from
+        // continue_rw() may delete the operation (the internal SYNC and flush ops do so)
+        bool rm = false, is_flush = op->flags & OP_FLUSH_BUFFER;
+        auto opcode = op->opcode;
+        if (!op->up_wait || up_retry)
+        {
+            op->up_wait = false;
+            if (opcode == OSD_OP_READ || opcode == OSD_OP_WRITE)
+            {
+                if (is_flush || !has_flushes)
+                {
+                    // Regular writes can't proceed before buffer flushes
+                    rm = continue_rw(op);
+                }
+            }
+            else if (opcode == OSD_OP_SYNC)
+            {
+                if (!has_writes)
+                {
+                    // SYNC can't proceed before previous writes
+                    rm = continue_sync(op);
+                }
+            }
+        }
+        // rm == true means the operation is finished and must leave the queue;
+        // an unfinished operation blocks the ones queued after it as tracked below
+        if (opcode == OSD_OP_WRITE)
+        {
+            has_writes = has_writes || !rm;
+            if (is_flush)
+            {
+                // NOTE(review): has_flushes is assigned from has_writes here and in the SYNC
+                // branch below instead of being OR-ed with its own previous value - confirm intended
+                has_flushes = has_writes || !rm;
+            }
+        }
+        else if (opcode == OSD_OP_SYNC)
+        {
+            // Postpone writes until previous SYNC completes
+            // ...so dirty_writes can't contain anything newer than SYNC
+            has_flushes = has_writes || !rm;
+        }
+        if (rm)
+        {
+            op_queue.erase(op_queue.begin()+op_queue_pos, op_queue.begin()+op_queue_pos+1);
        }
        else
-                op_it++;
+        {
+            op_queue_pos++;
        }
-        else
-            continue_rw(*op_it++);
+        if (continuing_ops == 2)
+        {
+            // A restart was requested while this operation was being processed
+            goto restart;
        }
+    }
+    continuing_ops = 0;
 }

 static uint32_t is_power_of_two(uint64_t value)
@ -141,16 +178,11 @@ static uint32_t is_power_of_two(uint64_t value)
 void cluster_client_t::on_load_config_hook(json11::Json::object & config)
 {
    bs_block_size = config["block_size"].uint64_value();
-    bs_disk_alignment = config["disk_alignment"].uint64_value();
    bs_bitmap_granularity = config["bitmap_granularity"].uint64_value();
    if (!bs_block_size)
    {
        bs_block_size = DEFAULT_BLOCK_SIZE;
    }
-    if (!bs_disk_alignment)
-    {
-        bs_disk_alignment = DEFAULT_DISK_ALIGNMENT;
-    }
    if (!bs_bitmap_granularity)
    {
        bs_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
@ -165,13 +197,26 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & config)
        // Cluster-wide immediate_commit mode
        immediate_commit = true;
    }
+    if (config.find("client_max_dirty_bytes") != config.end())
+    {
+        client_max_dirty_bytes = config["client_max_dirty_bytes"].uint64_value();
+    }
    else if (config.find("client_dirty_limit") != config.end())
    {
-        client_dirty_limit = config["client_dirty_limit"].uint64_value();
+        // Old name
+        client_max_dirty_bytes = config["client_dirty_limit"].uint64_value();
    }
-    if (!client_dirty_limit)
+    if (config.find("client_max_dirty_ops") != config.end())
    {
-        client_dirty_limit = DEFAULT_CLIENT_DIRTY_LIMIT;
+        client_max_dirty_ops = config["client_max_dirty_ops"].uint64_value();
+    }
+    if (!client_max_dirty_bytes)
+    {
+        client_max_dirty_bytes = DEFAULT_CLIENT_MAX_DIRTY_BYTES;
+    }
+    if (!client_max_dirty_ops)
+    {
+        client_max_dirty_ops = DEFAULT_CLIENT_MAX_DIRTY_OPS;
    }
    up_wait_retry_interval = config["up_wait_retry_interval"].uint64_value();
    if (!up_wait_retry_interval)
@ -215,23 +260,10 @@ void cluster_client_t::on_change_hook(json11::Json::object & changes)
        {
            // At this point, all pool operations should have been suspended
            // And now they have to be resliced!
-            for (auto op: cur_ops)
+            for (auto op: op_queue)
            {
-                if (INODE_POOL(op->inode) == pool_item.first)
-                {
-                    op->needs_reslice = true;
-                }
-            }
-            for (auto op: unsynced_writes)
-            {
-                if (INODE_POOL(op->inode) == pool_item.first)
-                {
-                    op->needs_reslice = true;
-                }
-            }
-            for (auto op: syncing_writes)
-            {
-                if (INODE_POOL(op->inode) == pool_item.first)
+                if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ) &&
+                    INODE_POOL(op->inode) == pool_item.first)
                {
                    op->needs_reslice = true;
                }
@ -250,6 +282,11 @@ void cluster_client_t::on_change_osd_state_hook(uint64_t peer_osd)
    }
 }

+// Returns pgs_loaded. While it is false, continue_ops() treats the client as offline
+// and leaves queued operations untouched.
+bool cluster_client_t::is_ready()
+{
+    return pgs_loaded;
+}
+
 void cluster_client_t::on_ready(std::function<void(void)> fn)
 {
    if (pgs_loaded)
@ -265,21 +302,15 @@ void cluster_client_t::on_ready(std::function<void(void)> fn)
 /**
 * How writes are synced when immediate_commit is false
 *
- * 1) accept up to <client_dirty_limit> write operations for execution,
- *    queue all subsequent writes into <next_writes>
- * 2) accept exactly one SYNC, queue all subsequent SYNCs into <next_writes>, too
- * 3) "continue" all accepted writes
- *
 * "Continue" WRITE:
- * 1) if the operation is not a copy yet - copy it (required for replay)
- * 2) if the operation is not sliced yet - slice it
- * 3) if the operation doesn't require reslice - try to connect & send all remaining parts
- * 4) if any of them fail due to disconnected peers or PGs not up, repeat after reconnecting or small timeout
- * 5) if any of them fail due to other errors, fail the operation and forget it from the current "unsynced batch"
- * 6) if PG count changes before all parts are done, wait for all in-progress parts to finish,
+ * 1) if the operation is not sliced yet - slice it
+ * 2) if the operation doesn't require reslice - try to connect & send all remaining parts
+ * 3) if any of them fail due to disconnected peers or PGs not up, repeat after reconnecting or small timeout
+ * 4) if any of them fail due to other errors, fail the operation and forget it from the current "unsynced batch"
+ * 5) if PG count changes before all parts are done, wait for all in-progress parts to finish,
 *    throw all results away, reslice and resubmit op
- * 7) when all parts are done, try to "continue" the current SYNC
- * 8) if the operation succeeds, but then some OSDs drop their connections, repeat
+ * 6) when all parts are done, try to "continue" the current SYNC
+ * 7) if the operation succeeds, but then some OSDs drop their connections, repeat
 *    parts from the current "unsynced batch" previously sent to those OSDs in any order
 *
 * "Continue" current SYNC:
@ -289,181 +320,277 @@ void cluster_client_t::on_ready(std::function<void(void)> fn)
 * 4) if any of them fail due to disconnected peers, repeat SYNC after repeating all writes
 * 5) if any of them fail due to other errors, fail the SYNC operation
 */
-
 void cluster_client_t::execute(cluster_op_t *op)
 {
-    if (!pgs_loaded)
-    {
-        // We're offline
-        offline_ops.push_back(op);
-        return;
-    }
-    op->retval = 0;
-    if (op->opcode != OSD_OP_SYNC && op->opcode != OSD_OP_READ && op->opcode != OSD_OP_WRITE ||
-        (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_WRITE) && (!op->inode || !op->len ||
-        op->offset % bs_disk_alignment || op->len % bs_disk_alignment))
+    // Only the opcode is validated here and a bad one fails immediately with -EINVAL.
+    // Length and alignment of reads and writes are checked later in continue_rw()
+    if (op->opcode != OSD_OP_SYNC && op->opcode != OSD_OP_READ && op->opcode != OSD_OP_WRITE)
    {
        op->retval = -EINVAL;
        std::function<void(cluster_op_t*)>(op->callback)(op);
        return;
    }
-    if (op->opcode == OSD_OP_SYNC)
-    {
-        execute_sync(op);
-        return;
-    }
+    op->retval = 0;
    if (op->opcode == OSD_OP_WRITE && !immediate_commit)
    {
-        if (next_writes.size() > 0)
-        {
-            assert(cur_sync);
-            next_writes.push_back(op);
-            return;
-        }
-        if (queued_bytes >= client_dirty_limit)
+        // dirty_bytes / dirty_ops count the writes queued since the last SYNC was queued
+        if (dirty_bytes >= client_max_dirty_bytes || dirty_ops >= client_max_dirty_ops)
        {
            // Push an extra SYNC operation to flush previous writes
-            next_writes.push_back(op);
            cluster_op_t *sync_op = new cluster_op_t;
-            sync_op->is_internal = true;
            sync_op->opcode = OSD_OP_SYNC;
-            sync_op->callback = [](cluster_op_t* sync_op) {};
-            execute_sync(sync_op);
-            return;
+            // Nobody else holds this operation, so it frees itself in its own callback
+            sync_op->callback = [](cluster_op_t* sync_op)
+            {
+                delete sync_op;
+            };
+            op_queue.push_back(sync_op);
+            dirty_bytes = 0;
+            dirty_ops = 0;
        }
-        queued_bytes += op->len;
+        dirty_bytes += op->len;
+        dirty_ops++;
    }
-    cluster_ops.insert(op);
-    continue_rw(op);
+    else if (op->opcode == OSD_OP_SYNC)
+    {
+        // An explicitly requested SYNC restarts the dirty accounting as well
+        dirty_bytes = 0;
+        dirty_ops = 0;
+    }
+    // The operation is only queued here; continue_ops() decides when it may proceed
+    op_queue.push_back(op);
+    continue_ops();
 }

-void cluster_client_t::continue_rw(cluster_op_t *op)
+// Copy the payload of write <op> into <dirty_buffers>, which is keyed by (inode, offset).
+// Parts of the written range covered by existing buffers of the same inode are overwritten
+// in place, the gaps between them get newly allocated buffers, and every touched buffer
+// is marked CACHE_DIRTY. <op> itself is not modified.
+void cluster_client_t::copy_write(cluster_op_t *op, std::map<object_id, cluster_buffer_t> & dirty_buffers)
 {
+    // Save operation for replay when one of PGs goes out of sync
+    // (primary OSD drops our connection in this case)
+    auto dirty_it = dirty_buffers.lower_bound((object_id){
+        .inode = op->inode,
+        .stripe = op->offset,
+    });
+    // Step back over preceding buffers of the same inode which still overlap the start of the write
+    while (dirty_it != dirty_buffers.begin())
+    {
+        dirty_it--;
+        if (dirty_it->first.inode != op->inode ||
+            (dirty_it->first.stripe + dirty_it->second.len) <= op->offset)
+        {
+            dirty_it++;
+            break;
+        }
+    }
+    uint64_t pos = op->offset, len = op->len, iov_idx = 0, iov_pos = 0;
+    // Also stop when the iovec is exhausted: <len> can't decrease after that, so an
+    // operation whose len exceeds its iovec would otherwise loop (and allocate) forever
+    while (len > 0 && iov_idx < op->iov.count)
+    {
+        uint64_t new_len = 0;
+        if (dirty_it == dirty_buffers.end() || dirty_it->first.inode != op->inode)
+        {
+            // No more buffers of this inode: the rest of the write goes into a new buffer.
+            // An entry of another inode must be treated like the end of the map, otherwise
+            // its offset would be used as a boundary and its buffer as the copy target
+            new_len = len;
+        }
+        else if (dirty_it->first.stripe > pos)
+        {
+            // Gap before the next existing buffer of the same inode
+            new_len = dirty_it->first.stripe - pos;
+            if (new_len > len)
+            {
+                new_len = len;
+            }
+        }
+        if (new_len > 0)
+        {
+            dirty_it = dirty_buffers.emplace_hint(dirty_it, (object_id){
+                .inode = op->inode,
+                .stripe = pos,
+            }, (cluster_buffer_t){
+                .buf = malloc_or_die(new_len),
+                .len = new_len,
+            });
+        }
+        // FIXME: Split big buffers into smaller ones on overwrites. But this will require refcounting
+        dirty_it->second.state = CACHE_DIRTY;
+        // Number of bytes of the write which fall into the current buffer
+        uint64_t cur_len = (dirty_it->first.stripe + dirty_it->second.len - pos);
+        if (cur_len > len)
+        {
+            cur_len = len;
+        }
+        while (cur_len > 0 && iov_idx < op->iov.count)
+        {
+            // 64-bit on purpose: iov_len is a size_t and must not be truncated
+            uint64_t iov_len = (op->iov.buf[iov_idx].iov_len - iov_pos);
+            if (iov_len <= cur_len)
+            {
+                // The rest of this iovec element fits into the current buffer
+                memcpy(dirty_it->second.buf + pos - dirty_it->first.stripe,
+                    op->iov.buf[iov_idx].iov_base + iov_pos, iov_len);
+                pos += iov_len;
+                len -= iov_len;
+                cur_len -= iov_len;
+                iov_pos = 0;
+                iov_idx++;
+            }
+            else
+            {
+                // The buffer ends inside this iovec element; remember where to resume
+                memcpy(dirty_it->second.buf + pos - dirty_it->first.stripe,
+                    op->iov.buf[iov_idx].iov_base + iov_pos, cur_len);
+                pos += cur_len;
+                len -= cur_len;
+                iov_pos += cur_len;
+                cur_len = 0;
+            }
+        }
+        dirty_it++;
+    }
+}
+
+// Queue an internal WRITE (flagged OP_FLUSH_BUFFER) which sends dirty buffer <wr>,
+// stored under key <oid>, again. The buffer stays in CACHE_REPEATING until the write's callback runs.
+void cluster_client_t::flush_buffer(const object_id & oid, cluster_buffer_t *wr)
+{
+    wr->state = CACHE_REPEATING;
+    cluster_op_t *flush_op = new cluster_op_t;
+    flush_op->opcode = OSD_OP_WRITE;
+    flush_op->flags = OP_FLUSH_BUFFER;
+    flush_op->inode = oid.inode;
+    flush_op->offset = oid.stripe;
+    flush_op->len = wr->len;
+    // The operation points at the buffer's own memory, nothing is copied
+    flush_op->iov.push_back(wr->buf, wr->len);
+    flush_op->callback = [wr](cluster_op_t* done_op)
+    {
+        // Return the buffer to CACHE_DIRTY only if its state wasn't changed in the meantime
+        if (wr->state == CACHE_REPEATING)
+            wr->state = CACHE_DIRTY;
+        // The operation was allocated above and is owned by this callback
+        delete done_op;
+    };
+    // Flushes are put at the head of the queue
+    op_queue.insert(op_queue.begin(), flush_op);
+    if (continuing_ops != 0)
+    {
+        // continue_ops() is running: compensate for the index shift caused by
+        // the insertion and ask it to restart from the beginning of the queue
+        op_queue_pos++;
+        continuing_ops = 2;
+    }
+}
+
+int cluster_client_t::continue_rw(cluster_op_t *op)
+{
+    if (op->state == 0)
+        goto resume_0;
+    else if (op->state == 1)
+        goto resume_1;
+    else if (op->state == 2)
+        goto resume_2;
+    else if (op->state == 3)
+        goto resume_3;
+resume_0:
+    if (!op->len || op->offset % bs_bitmap_granularity || op->len % bs_bitmap_granularity)
+    {
+        op->retval = -EINVAL;
+        std::function<void(cluster_op_t*)>(op->callback)(op);
+        return 1;
+    }
+    {
        pool_id_t pool_id = INODE_POOL(op->inode);
        if (!pool_id)
        {
            op->retval = -EINVAL;
            std::function<void(cluster_op_t*)>(op->callback)(op);
-        return;
+            return 1;
        }
        if (st_cli.pool_config.find(pool_id) == st_cli.pool_config.end() ||
            st_cli.pool_config[pool_id].real_pg_count == 0)
        {
            // Postpone operations to unknown pools
-        return;
+            return 0;
        }
-    if (op->opcode == OSD_OP_WRITE && !immediate_commit && !op->is_internal)
-    {
-        // Save operation for replay when PG goes out of sync
-        // (primary OSD drops our connection in this case)
-        cluster_op_t *op_copy = new cluster_op_t();
-        op_copy->is_internal = true;
-        op_copy->orig_op = op;
-        op_copy->opcode = op->opcode;
-        op_copy->inode = op->inode;
-        op_copy->offset = op->offset;
-        op_copy->len = op->len;
-        op_copy->buf = malloc_or_die(op->len);
-        op_copy->iov.push_back(op_copy->buf, op->len);
-        op_copy->callback = [](cluster_op_t* op_copy)
-        {
-            if (op_copy->orig_op)
-            {
-                // Acknowledge write and forget the original pointer
-                op_copy->orig_op->retval = op_copy->retval;
-                std::function<void(cluster_op_t*)>(op_copy->orig_op->callback)(op_copy->orig_op);
-                op_copy->orig_op = NULL;
    }
-        };
-        void *cur_buf = op_copy->buf;
-        for (int i = 0; i < op->iov.count; i++)
+    if (op->opcode == OSD_OP_WRITE)
    {
-            memcpy(cur_buf, op->iov.buf[i].iov_base, op->iov.buf[i].iov_len);
-            cur_buf += op->iov.buf[i].iov_len;
-        }
-        unsynced_writes.push_back(op_copy);
-        cur_ops.erase(op);
-        cur_ops.insert(op_copy);
-        op = op_copy;
-    }
-    if (!op->parts.size())
+        if (!immediate_commit && !(op->flags & OP_FLUSH_BUFFER))
        {
+            copy_write(op, dirty_buffers);
+        }
+    }
+resume_1:
    // Slice the operation into parts
    slice_rw(op);
-    }
-    if (!op->needs_reslice)
-    {
+    op->needs_reslice = false;
+resume_2:
    // Send unsent parts, if they're not subject to change
-        for (auto & op_part: op->parts)
+    op->state = 3;
+    if (op->needs_reslice)
    {
-            if (!op_part.sent && !op_part.done)
+        for (int i = 0; i < op->parts.size(); i++)
        {
-                try_send(op, &op_part);
+            if (!(op->parts[i].flags & PART_SENT) && op->retval)
+            {
+                op->retval = -EPIPE;
+            }
+        }
+        goto resume_3;
+    }
+    for (int i = 0; i < op->parts.size(); i++)
+    {
+        if (!(op->parts[i].flags & PART_SENT))
+        {
+            if (!try_send(op, i))
+            {
+                // We'll need to retry again
+                op->up_wait = true;
+                if (!retry_timeout_id)
+                {
+                    retry_timeout_id = tfd->set_timer(up_wait_retry_interval, false, [this](int)
+                    {
+                        retry_timeout_id = 0;
+                        continue_ops(true);
+                    });
+                }
+                op->state = 2;
            }
        }
    }
-    if (!op->sent_count)
+    if (op->state == 2)
    {
+        return 0;
+    }
+resume_3:
+    if (op->inflight_count > 0)
+    {
+        op->state = 3;
+        return 0;
+    }
    if (op->done_count >= op->parts.size())
    {
        // Finished successfully
        // Even if the PG count has changed in meanwhile we treat it as success
        // because if some operations were invalid for the new PG count we'd get errors
-            cur_ops.erase(op);
        op->retval = op->len;
        std::function<void(cluster_op_t*)>(op->callback)(op);
-            continue_sync();
-            return;
+        return 1;
    }
    else if (op->retval != 0 && op->retval != -EPIPE)
    {
        // Fatal error (not -EPIPE)
-            cur_ops.erase(op);
-            if (!immediate_commit && op->opcode == OSD_OP_WRITE)
-            {
-                for (int i = 0; i < unsynced_writes.size(); i++)
-                {
-                    if (unsynced_writes[i] == op)
-                    {
-                        unsynced_writes.erase(unsynced_writes.begin()+i, unsynced_writes.begin()+i+1);
-                        break;
-                    }
-                }
-            }
-            bool del = op->is_internal;
        std::function<void(cluster_op_t*)>(op->callback)(op);
-            if (del)
-            {
-                if (op->buf)
-                    free(op->buf);
-                delete op;
-            }
-            continue_sync();
-            return;
+        return 1;
    }
    else
    {
-            // -EPIPE or no error - clear the error
+        // -EPIPE - clear the error and retry
        op->retval = 0;
        if (op->needs_reslice)
        {
            op->parts.clear();
            op->done_count = 0;
-                op->needs_reslice = false;
-                continue_rw(op);
-            }
+            goto resume_1;
+        }
+        else
+        {
+            for (int i = 0; i < op->parts.size(); i++)
+            {
+                op->parts[i].flags = 0;
+            }
+            goto resume_2;
        }
    }
+    return 0;
 }

 void cluster_client_t::slice_rw(cluster_op_t *op)
 {
    // Slice the request into individual object stripe requests
    // Primary OSDs still operate individual stripes, but their size is multiplied by PG minsize in case of EC
-    auto & pool_cfg = st_cli.pool_config[INODE_POOL(op->inode)];
-    uint64_t pg_block_size = bs_block_size * (
-        pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks
-    );
+    auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->inode));
+    uint32_t pg_data_size = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
+    uint64_t pg_block_size = bs_block_size * pg_data_size;
    uint64_t first_stripe = (op->offset / pg_block_size) * pg_block_size;
    uint64_t last_stripe = ((op->offset + op->len + pg_block_size - 1) / pg_block_size - 1) * pg_block_size;
    op->retval = 0;
@ -482,8 +609,7 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
            .offset = begin,
            .len = (uint32_t)(end - begin),
            .pg_num = pg_num,
-            .sent = false,
-            .done = false,
+            .flags = 0,
        };
        int left = end-begin;
        while (left > 0 && iov_idx < op->iov.count)
@ -507,8 +633,28 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
    }
 }

-bool cluster_client_t::try_send(cluster_op_t *op, cluster_op_part_t *part)
+bool cluster_client_t::affects_osd(uint64_t inode, uint64_t offset, uint64_t len, osd_num_t osd) // true if any stripe overlapping [offset, offset+len) maps to a PG whose cur_primary is <osd>
 {
+    auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(inode)); // std::map::at() throws std::out_of_range for a pool that is not in pool_config
+    uint32_t pg_data_size = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks); // 1 for replicated pools, otherwise pg_size minus parity_chunks
+    uint64_t pg_block_size = bs_block_size * pg_data_size;
+    uint64_t first_stripe = (offset / pg_block_size) * pg_block_size; // start offset rounded down to a pg_block_size boundary
+    uint64_t last_stripe = ((offset + len + pg_block_size - 1) / pg_block_size - 1) * pg_block_size; // start of the last stripe touched; NOTE(review): wraps around when offset == 0 and len == 0 - confirm callers never pass that
+    for (uint64_t stripe = first_stripe; stripe <= last_stripe; stripe += pg_block_size)
+    {
+        pg_num_t pg_num = (stripe/pool_cfg.pg_stripe_size) % pool_cfg.real_pg_count + 1; // like map_to_pg()
+        auto pg_it = pool_cfg.pg_config.find(pg_num); // PGs without an entry in pg_config are simply skipped
+        if (pg_it != pool_cfg.pg_config.end() && pg_it->second.cur_primary == osd)
+        {
+            return true;
+        }
+    }
+    return false;
+}
+
+bool cluster_client_t::try_send(cluster_op_t *op, int i)
+{
+    auto part = &op->parts[i];
    auto & pool_cfg = st_cli.pool_config[INODE_POOL(op->inode)];
    auto pg_it = pool_cfg.pg_config.find(part->pg_num);
    if (pg_it != pool_cfg.pg_config.end() &&
@ -520,8 +666,8 @@ bool cluster_client_t::try_send(cluster_op_t *op, cluster_op_part_t *part)
        {
            int peer_fd = peer_it->second;
            part->osd_num = primary_osd;
-            part->sent = true;
-            op->sent_count++;
+            part->flags |= PART_SENT;
+            op->inflight_count++;
            part->op = (osd_op_t){
                .op_type = OSD_OP_OUT,
                .peer_fd = peer_fd,
@ -552,137 +698,99 @@ bool cluster_client_t::try_send(cluster_op_t *op, cluster_op_part_t *part)
    return false;
 }

-void cluster_client_t::execute_sync(cluster_op_t *op)
+int cluster_client_t::continue_sync(cluster_op_t *op) // returns 1 once op->callback has been invoked, 0 when the sync has to be continued by a later call
 {
-    if (immediate_commit)
+    if (op->state == 1) // state 1 is stored when this function returns while SYNC parts are still in flight
+        goto resume_1;
+    if (immediate_commit || !dirty_osds.size())
    {
-        // Syncs are not required in the immediate_commit mode
+        // Sync is not required in the immediate_commit mode or if there are no dirty_osds
        op->retval = 0;
        std::function<void(cluster_op_t*)>(op->callback)(op);
-    }
-    else if (cur_sync != NULL)
-    {
-        next_writes.push_back(op);
-    }
-    else
-    {
-        cur_sync = op;
-        continue_sync();
-    }
-}
-
-void cluster_client_t::continue_sync()
-{
-    if (!cur_sync || cur_sync->parts.size() > 0)
-    {
-        // Already submitted
-        return;
-    }
-    cur_sync->retval = 0;
-    std::set<osd_num_t> sync_osds;
-    for (auto prev_op: unsynced_writes)
-    {
-        if (prev_op->done_count < prev_op->parts.size())
-        {
-            // Writes not finished yet
-            return;
-        }
-        for (auto & part: prev_op->parts)
-        {
-            if (part.osd_num)
-            {
-                sync_osds.insert(part.osd_num);
-            }
-        }
-    }
-    if (!sync_osds.size())
-    {
-        // No dirty writes
-        finish_sync();
-        return;
+        return 1;
    }
    // Check that all OSD connections are still alive
-    for (auto sync_osd: sync_osds)
+    for (auto sync_osd: dirty_osds)
    {
        auto peer_it = msgr.osd_peer_fds.find(sync_osd);
        if (peer_it == msgr.osd_peer_fds.end())
        {
-            // SYNC is pointless to send to a non connected OSD
-            return;
+            return 0;
        }
    }
-    syncing_writes.swap(unsynced_writes);
    // Post sync to affected OSDs
-    cur_sync->parts.resize(sync_osds.size());
-    int i = 0;
-    for (auto sync_osd: sync_osds)
+    for (auto & prev_op: dirty_buffers)
    {
-        cur_sync->parts[i] = {
-            .parent = cur_sync,
+        if (prev_op.second.state == CACHE_DIRTY)
+        {
+            prev_op.second.state = CACHE_FLUSHING;
+        }
+    }
+    op->parts.resize(dirty_osds.size());
+    op->retval = 0;
+    {
+        int i = 0;
+        for (auto sync_osd: dirty_osds)
+        {
+            op->parts[i] = {
+                .parent = op,
                .osd_num = sync_osd,
-            .sent = false,
-            .done = false,
+                .flags = 0,
            };
-        send_sync(cur_sync, &cur_sync->parts[i]);
+            send_sync(op, &op->parts[i]);
            i++;
        }
-}
-
-void cluster_client_t::finish_sync()
-{
-    int retval = cur_sync->retval;
-    if (retval != 0)
+    }
+    dirty_osds.clear();
+resume_1:
+    if (op->inflight_count > 0)
    {
-        for (auto op: syncing_writes)
+        op->state = 1;
+        return 0;
+    }
+    if (op->retval != 0)
    {
-            if (op->done_count < op->parts.size())
+        for (auto uw_it = dirty_buffers.begin(); uw_it != dirty_buffers.end(); uw_it++)
        {
-                cur_ops.insert(op);
+            if (uw_it->second.state == CACHE_FLUSHING)
+            {
+                uw_it->second.state = CACHE_DIRTY;
            }
        }
-        unsynced_writes.insert(unsynced_writes.begin(), syncing_writes.begin(), syncing_writes.end());
-        syncing_writes.clear();
-    }
-    if (retval == -EPIPE)
+        if (op->retval == -EPIPE)
        {
            // Retry later
-        cur_sync->parts.clear();
-        cur_sync->retval = 0;
-        cur_sync->sent_count = 0;
-        cur_sync->done_count = 0;
-        return;
+            op->parts.clear();
+            op->retval = 0;
+            op->inflight_count = 0;
+            op->done_count = 0;
+            op->state = 0;
+            return 0;
        }
-    std::function<void(cluster_op_t*)>(cur_sync->callback)(cur_sync);
-    if (!retval)
+    }
+    else
    {
-        for (auto op: syncing_writes)
+        for (auto uw_it = dirty_buffers.begin(); uw_it != dirty_buffers.end(); )
        {
-            assert(op->sent_count == 0);
-            if (op->is_internal)
+            if (uw_it->second.state == CACHE_FLUSHING)
            {
-                if (op->buf)
-                    free(op->buf);
-                delete op;
+                free(uw_it->second.buf);
+                dirty_buffers.erase(uw_it++);
+            }
+            else
+                uw_it++;
        }
    }
-        syncing_writes.clear();
-    }
-    cur_sync = NULL;
-    queued_bytes = 0;
-    std::vector<cluster_op_t*> next_wr_copy;
-    next_wr_copy.swap(next_writes);
-    for (auto next_op: next_wr_copy)
-    {
-        execute(next_op);
-    }
+    std::function<void(cluster_op_t*)>(op->callback)(op);
+    return 1;
 }

 void cluster_client_t::send_sync(cluster_op_t *op, cluster_op_part_t *part)
 {
    auto peer_it = msgr.osd_peer_fds.find(part->osd_num);
    assert(peer_it != msgr.osd_peer_fds.end());
-    part->sent = true;
-    op->sent_count++;
+    part->flags |= PART_SENT;
+    op->inflight_count++;
    part->op = (osd_op_t){
        .op_type = OSD_OP_OUT,
        .peer_fd = peer_it->second,
@ -704,19 +812,18 @@ void cluster_client_t::send_sync(cluster_op_t *op, cluster_op_part_t *part)
 void cluster_client_t::handle_op_part(cluster_op_part_t *part)
 {
    cluster_op_t *op = part->parent;
-    part->sent = false;
-    op->sent_count--;
+    op->inflight_count--;
    int expected = part->op.req.hdr.opcode == OSD_OP_SYNC ? 0 : part->op.req.rw.len;
    if (part->op.reply.hdr.retval != expected)
    {
        // Operation failed, retry
        printf(
-            "Operation failed on OSD %lu: retval=%ld (expected %d), dropping connection\n",
-            part->osd_num, part->op.reply.hdr.retval, expected
+            "%s operation failed on OSD %lu: retval=%ld (expected %d), dropping connection\n",
+            osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
        );
-        msgr.stop_client(part->op.peer_fd);
        if (part->op.reply.hdr.retval == -EPIPE)
        {
+            // Mark op->up_wait = true before stopping the client
            op->up_wait = true;
            if (!retry_timeout_id)
            {
@ -732,23 +839,18 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
            // Don't overwrite other errors with -EPIPE
            op->retval = part->op.reply.hdr.retval;
        }
+        msgr.stop_client(part->op.peer_fd);
+        part->flags |= PART_ERROR;
    }
    else
    {
        // OK
-        part->done = true;
+        if (part->op.req.hdr.opcode == OSD_OP_WRITE) dirty_osds.insert(part->osd_num); // only a completed write leaves unsynced data on the OSD; read and SYNC replies must not re-mark it as dirty
+        part->flags |= PART_DONE;
        op->done_count++;
    }
-    if (op->sent_count == 0)
+    if (op->inflight_count == 0)
    {
-        if (op->opcode == OSD_OP_SYNC)
-        {
-            assert(op == cur_sync);
-            finish_sync();
-        }
-        else if (!op->up_wait)
-        {
-            continue_rw(op);
-        }
+        continue_ops();
    }
 }
--- a/src/cluster_client.h
+++ b/src/cluster_client.h
@ -10,7 +10,8 @@
 #define MAX_BLOCK_SIZE 128*1024*1024
 #define DEFAULT_DISK_ALIGNMENT 4096
 #define DEFAULT_BITMAP_GRANULARITY 4096
-#define DEFAULT_CLIENT_DIRTY_LIMIT 32*1024*1024
+#define DEFAULT_CLIENT_MAX_DIRTY_BYTES (32*1024*1024) // parenthesized so the expansion stays a single value inside larger expressions (x / LIMIT, x % LIMIT, casts)
+#define DEFAULT_CLIENT_MAX_DIRTY_OPS 1024

 struct cluster_op_t;

@ -22,8 +23,7 @@ struct cluster_op_part_t
    pg_num_t pg_num;
    osd_num_t osd_num;
    osd_op_buf_list_t iov;
-    bool sent;
-    bool done;
+    unsigned flags;
    osd_op_t op;
 };

@ -37,47 +37,53 @@ struct cluster_op_t
    osd_op_buf_list_t iov;
    std::function<void(cluster_op_t*)> callback;
 protected:
+    int flags = 0;
+    int state = 0;
    void *buf = NULL;
    cluster_op_t *orig_op = NULL;
-    bool is_internal = false;
    bool needs_reslice = false;
    bool up_wait = false;
-    int sent_count = 0, done_count = 0;
+    int inflight_count = 0, done_count = 0;
    std::vector<cluster_op_part_t> parts;
    friend class cluster_client_t;
 };

+struct cluster_buffer_t // value type of cluster_client_t::dirty_buffers (keyed by object_id)
+{
+    void *buf; // released with free() when the entry is dropped after a successful sync
+    uint64_t len; // presumably the size of buf in bytes - confirm against copy_write()
+    int state; // CACHE_DIRTY or CACHE_FLUSHING; continue_sync() moves entries between the two
+};
+
+// FIXME: Split into public and private interfaces
 class cluster_client_t
 {
    timerfd_manager_t *tfd;
    ring_loop_t *ringloop;

    uint64_t bs_block_size = 0;
-    uint64_t bs_disk_alignment = 0;
    uint64_t bs_bitmap_granularity = 0;
    std::map<pool_id_t, uint64_t> pg_counts;
    bool immediate_commit = false;
    // FIXME: Implement inmemory_commit mode. Note that it requires to return overlapping reads from memory.
-    uint64_t client_dirty_limit = 0;
+    uint64_t client_max_dirty_bytes = 0;
+    uint64_t client_max_dirty_ops = 0;
    int log_level;
    int up_wait_retry_interval = 500; // ms

-    uint64_t op_id = 1;
-    ring_consumer_t consumer;
-    // operations currently in progress
-    std::set<cluster_op_t*> cur_ops;
    int retry_timeout_id = 0;
-    // unsynced operations are copied in memory to allow replay when cluster isn't in the immediate_commit mode
-    // unsynced_writes are replayed in any order (because only the SYNC operation guarantees ordering)
-    std::vector<cluster_op_t*> unsynced_writes;
-    std::vector<cluster_op_t*> syncing_writes;
-    cluster_op_t* cur_sync = NULL;
-    std::vector<cluster_op_t*> next_writes;
+    uint64_t op_id = 1;
    std::vector<cluster_op_t*> offline_ops;
-    uint64_t queued_bytes = 0;
+    std::vector<cluster_op_t*> op_queue;
+    std::map<object_id, cluster_buffer_t> dirty_buffers;
+    std::set<osd_num_t> dirty_osds;
+    uint64_t dirty_bytes = 0, dirty_ops = 0;

    bool pgs_loaded = false;
+    ring_consumer_t consumer;
    std::vector<std::function<void(void)>> on_ready_hooks;
+    int continuing_ops = 0;
+    int op_queue_pos = 0;

 public:
    etcd_state_client_t st_cli;
@ -87,21 +93,22 @@ public:
    cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
    ~cluster_client_t();
    void execute(cluster_op_t *op);
+    bool is_ready();
    void on_ready(std::function<void(void)> fn);
-    void stop();

-protected:
+    static void copy_write(cluster_op_t *op, std::map<object_id, cluster_buffer_t> & dirty_buffers);
    void continue_ops(bool up_retry = false);
+protected:
+    bool affects_osd(uint64_t inode, uint64_t offset, uint64_t len, osd_num_t osd);
+    void flush_buffer(const object_id & oid, cluster_buffer_t *wr);
    void on_load_config_hook(json11::Json::object & config);
    void on_load_pgs_hook(bool success);
    void on_change_hook(json11::Json::object & changes);
    void on_change_osd_state_hook(uint64_t peer_osd);
-    void continue_rw(cluster_op_t *op);
+    int continue_rw(cluster_op_t *op);
    void slice_rw(cluster_op_t *op);
-    bool try_send(cluster_op_t *op, cluster_op_part_t *part);
-    void execute_sync(cluster_op_t *op);
-    void continue_sync();
-    void finish_sync();
+    bool try_send(cluster_op_t *op, int i);
+    int continue_sync(cluster_op_t *op);
    void send_sync(cluster_op_t *op, cluster_op_part_t *part);
    void handle_op_part(cluster_op_part_t *part);
 };
--- a/src/etcd_state_client.cpp
+++ b/src/etcd_state_client.cpp
@ -4,19 +4,24 @@
 #include "osd_ops.h"
 #include "pg_states.h"
 #include "etcd_state_client.h"
+#ifndef __MOCK__
 #include "http_client.h"
 #include "base64.h"
+#endif

 etcd_state_client_t::~etcd_state_client_t()
 {
    etcd_watches_initialised = -1;
+#ifndef __MOCK__
    if (etcd_watch_ws)
    {
        etcd_watch_ws->close();
        etcd_watch_ws = NULL;
    }
+#endif
 }

+#ifndef __MOCK__
 json_kv_t etcd_state_client_t::parse_etcd_kv(const json11::Json & kv_json)
 {
    json_kv_t kv;
@ -323,6 +328,26 @@ void etcd_state_client_t::load_pgs()
        start_etcd_watcher();
    });
 }
+#else
+void etcd_state_client_t::parse_config(json11::Json & config) // __MOCK__ build: does nothing, the config argument is ignored
+{
+}
+
+void etcd_state_client_t::load_global_config() // __MOCK__ build: calls on_load_config_hook with an empty config object
+{
+    json11::Json::object global_config;
+    on_load_config_hook(global_config);
+}
+
+void etcd_state_client_t::load_pgs() // __MOCK__ build: does nothing
+{
+}
+#endif
+
+void etcd_state_client_t::parse_state(const json_kv_t & kv) // convenience overload: forwards kv.key and kv.value to the (key, value) overload
+{
+    parse_state(kv.key, kv.value);
+}

 void etcd_state_client_t::parse_state(const std::string & key, const json11::Json & value)
 {
@ -336,8 +361,10 @@ void etcd_state_client_t::parse_state(const std::string & key, const json11::Jso
        {
            pool_config_t pc;
            // ID
-            pool_id_t pool_id = stoull_full(pool_item.first);
-            if (!pool_id || pool_id >= POOL_ID_MAX)
+            pool_id_t pool_id = 0; // stays 0 (and is rejected below) when the key does not start with a number; previously read uninitialized
+            char null_byte = 0; // receives the first character following the number, if there is one
+            sscanf(pool_item.first.c_str(), "%u%c", &pool_id, &null_byte);
+            if (!pool_id || pool_id >= POOL_ID_MAX || null_byte != 0)
            {
                printf("Pool ID %s is invalid (must be a number less than 0x%x), skipping pool\n", pool_item.first.c_str(), POOL_ID_MAX);
                continue;
@ -449,16 +476,19 @@ void etcd_state_client_t::parse_state(const std::string & key, const json11::Jso
        }
        for (auto & pool_item: value["items"].object_items())
        {
-            pool_id_t pool_id = stoull_full(pool_item.first);
-            if (!pool_id || pool_id >= POOL_ID_MAX)
+            pool_id_t pool_id = 0; // stays 0 (and is rejected below) when the key does not start with a number; previously read uninitialized
+            char null_byte = 0; // receives the first character following the number, if there is one
+            sscanf(pool_item.first.c_str(), "%u%c", &pool_id, &null_byte);
+            if (!pool_id || pool_id >= POOL_ID_MAX || null_byte != 0)
            {
                printf("Pool ID %s is invalid in PG configuration (must be a number less than 0x%x), skipping pool\n", pool_item.first.c_str(), POOL_ID_MAX);
                continue;
            }
            for (auto & pg_item: pool_item.second.object_items())
            {
-                pg_num_t pg_num = stoull_full(pg_item.first);
-                if (!pg_num)
+                pg_num_t pg_num = 0;
+                // Exactly one conversion means "a number with nothing after it". null_byte is only scratch space here, so a stale value left by an earlier malformed key can no longer reject valid keys
+                if (sscanf(pg_item.first.c_str(), "%u%c", &pg_num, &null_byte) != 1 || !pg_num)
                {
                    printf("Bad key in pool %u PG configuration: %s (must be a number), skipped\n", pool_id, pg_item.first.c_str());
                    continue;
--- a/src/etcd_state_client.h
+++ b/src/etcd_state_client.h
@ -3,8 +3,8 @@

 #pragma once

+#include "json11/json11.hpp"
 #include "osd_id.h"
-#include "http_client.h"
 #include "timerfd_manager.h"

 #define ETCD_CONFIG_WATCH_ID 1
@ -52,9 +52,13 @@ struct pool_config_t
    std::map<pg_num_t, pg_config_t> pg_config;
 };

+struct websocket_t;
+
 struct etcd_state_client_t
 {
 protected:
+    websocket_t *etcd_watch_ws = NULL;
+    uint64_t bs_block_size = DEFAULT_BLOCK_SIZE;
    void add_etcd_url(std::string);
 public:
    std::vector<std::string> etcd_addresses;
@ -64,8 +68,6 @@ public:

    int etcd_watches_initialised = 0;
    uint64_t etcd_watch_revision = 0;
-    websocket_t *etcd_watch_ws = NULL;
-    uint64_t bs_block_size = 0;
    std::map<pool_id_t, pool_config_t> pool_config;
    std::map<osd_num_t, json11::Json> peer_states;

@ -82,6 +84,7 @@ public:
    void start_etcd_watcher();
    void load_global_config();
    void load_pgs();
+    void parse_state(const json_kv_t & kv);
    void parse_state(const std::string & key, const json11::Json & value);
    void parse_config(json11::Json & config);
    ~etcd_state_client_t();
--- a/src/messenger.cpp
+++ b/src/messenger.cpp
@ -10,30 +10,16 @@

 #include "messenger.h"

-osd_op_t::~osd_op_t()
-{
-    assert(!bs_op);
-    assert(!op_data);
-    if (rmw_buf)
-    {
-        free(rmw_buf);
-    }
-    if (buf)
-    {
-        // Note: reusing osd_op_t WILL currently lead to memory leaks
-        // So we don't reuse it, but free it every time
-        free(buf);
-    }
-}
-
 void osd_messenger_t::init()
 {
    keepalive_timer_id = tfd->set_timer(1000, true, [this](int)
    {
-        for (auto cl_it = clients.begin(); cl_it != clients.end();)
+        std::vector<int> to_stop;
+        std::vector<osd_op_t*> to_ping;
+        for (auto cl_it = clients.begin(); cl_it != clients.end(); cl_it++)
        {
-            auto cl = (cl_it++)->second;
-            if (!cl->osd_num)
+            auto cl = cl_it->second;
+            if (!cl->osd_num || cl->peer_state != PEER_CONNECTED)
            {
                // Do not run keepalive on regular clients
                continue;
@ -44,7 +30,8 @@ void osd_messenger_t::init()
                if (!cl->ping_time_remaining)
                {
                    // Ping timed out, stop the client
-                    stop_client(cl->peer_fd, true);
+                    printf("Ping timed out for OSD %lu (client %d), disconnecting peer\n", cl->osd_num, cl->peer_fd);
+                    to_stop.push_back(cl->peer_fd);
                }
            }
            else if (cl->idle_time_remaining > 0)
@ -70,10 +57,11 @@ void osd_messenger_t::init()
                        delete op;
                        if (fail_fd >= 0)
                        {
+                            printf("Ping failed for OSD %lu (client %d), disconnecting peer\n", cl->osd_num, cl->peer_fd);
                            stop_client(fail_fd, true);
                        }
                    };
-                    outbox_push(op);
+                    to_ping.push_back(op);
                    cl->ping_time_remaining = osd_ping_timeout;
                    cl->idle_time_remaining = osd_idle_timeout;
                }
@ -83,6 +71,15 @@ void osd_messenger_t::init()
                cl->idle_time_remaining = osd_idle_timeout;
            }
        }
+        // Don't stop clients while a 'clients' iterator is still active
+        for (int peer_fd: to_stop)
+        {
+            stop_client(peer_fd, true);
+        }
+        for (auto op: to_ping)
+        {
+            outbox_push(op);
+        }
    });
 }

@ -141,17 +138,14 @@ void osd_messenger_t::connect_peer(uint64_t peer_osd, json11::Json peer_state)
        wanted_peers[peer_osd].port = (int)peer_state["port"].int64_value();
    }
    wanted_peers[peer_osd].address_changed = true;
-    if (!wanted_peers[peer_osd].connecting &&
-        (time(NULL) - wanted_peers[peer_osd].last_connect_attempt) >= peer_connect_interval)
-    {
    try_connect_peer(peer_osd);
-    }
 }

 void osd_messenger_t::try_connect_peer(uint64_t peer_osd)
 {
    auto wp_it = wanted_peers.find(peer_osd);
-    if (wp_it == wanted_peers.end())
+    if (wp_it == wanted_peers.end() || wp_it->second.connecting ||
+        (time(NULL) - wp_it->second.last_connect_attempt) < peer_connect_interval)
    {
        return;
    }
@ -197,10 +191,22 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
        on_connect_peer(peer_osd, -errno);
        return;
    }
-    int timeout_id = -1;
+    osd_client_t *new_cl = clients[peer_fd] = new osd_client_t(); // registered under peer_fd first, then filled in field by field
+    new_cl->peer_fd = peer_fd;
+    new_cl->osd_num = peer_osd;
+    new_cl->peer_addr = addr;
+    new_cl->peer_port = peer_port;
+    new_cl->peer_state = PEER_CONNECTING;
+    new_cl->connect_timeout_id = -1;
+    new_cl->in_buf = malloc_or_die(receive_buffer_size);
+    tfd->set_fd_handler(peer_fd, true, [this](int peer_fd, int epoll_events)
+    {
+        // Either OUT (connected) or HUP
+        handle_connect_epoll(peer_fd);
+    });
    if (peer_connect_timeout > 0)
    {
-        timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, peer_fd](int timer_id)
+        clients[peer_fd]->connect_timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, peer_fd](int timer_id)
        {
            osd_num_t peer_osd = clients.at(peer_fd)->osd_num;
            stop_client(peer_fd, true);
@ -208,20 +214,6 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
            return;
        });
    }
-    clients[peer_fd] = new osd_client_t((osd_client_t){
-        .peer_addr = addr,
-        .peer_port = peer_port,
-        .peer_fd = peer_fd,
-        .peer_state = PEER_CONNECTING,
-        .connect_timeout_id = timeout_id,
-        .osd_num = peer_osd,
-        .in_buf = malloc_or_die(receive_buffer_size),
-    });
-    tfd->set_fd_handler(peer_fd, true, [this](int peer_fd, int epoll_events)
-    {
-        // Either OUT (connected) or HUP
-        handle_connect_epoll(peer_fd);
-    });
 }

 void osd_messenger_t::handle_connect_epoll(int peer_fd)
@ -373,123 +365,6 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
    outbox_push(op);
 }

-void osd_messenger_t::cancel_osd_ops(osd_client_t *cl)
-{
-    for (auto p: cl->sent_ops)
-    {
-        cancel_op(p.second);
-    }
-    cl->sent_ops.clear();
-    cl->outbox.clear();
-}
-
-void osd_messenger_t::cancel_op(osd_op_t *op)
-{
-    if (op->op_type == OSD_OP_OUT)
-    {
-        op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
-        op->reply.hdr.id = op->req.hdr.id;
-        op->reply.hdr.opcode = op->req.hdr.opcode;
-        op->reply.hdr.retval = -EPIPE;
-        // Copy lambda to be unaffected by `delete op`
-        std::function<void(osd_op_t*)>(op->callback)(op);
-    }
-    else
-    {
-        // This function is only called in stop_client(), so it's fine to destroy the operation
-        delete op;
-    }
-}
-
-void osd_messenger_t::stop_client(int peer_fd, bool force)
-{
-    assert(peer_fd != 0);
-    auto it = clients.find(peer_fd);
-    if (it == clients.end())
-    {
-        return;
-    }
-    uint64_t repeer_osd = 0;
-    osd_client_t *cl = it->second;
-    if (cl->peer_state == PEER_CONNECTED)
-    {
-        if (cl->osd_num)
-        {
-            // Reload configuration from etcd when the connection is dropped
-            if (log_level > 0)
-                printf("[OSD %lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl->osd_num);
-            repeer_osd = cl->osd_num;
-        }
-        else
-        {
-            if (log_level > 0)
-                printf("[OSD %lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
-        }
-    }
-    else if (!force)
-    {
-        return;
-    }
-    cl->peer_state = PEER_STOPPED;
-    clients.erase(it);
-    tfd->set_fd_handler(peer_fd, false, NULL);
-    if (cl->connect_timeout_id >= 0)
-    {
-        tfd->clear_timer(cl->connect_timeout_id);
-        cl->connect_timeout_id = -1;
-    }
-    if (cl->osd_num)
-    {
-        osd_peer_fds.erase(cl->osd_num);
-    }
-    if (cl->read_op)
-    {
-        if (cl->read_op->callback)
-        {
-            cancel_op(cl->read_op);
-        }
-        else
-        {
-            delete cl->read_op;
-        }
-        cl->read_op = NULL;
-    }
-    for (auto rit = read_ready_clients.begin(); rit != read_ready_clients.end(); rit++)
-    {
-        if (*rit == peer_fd)
-        {
-            read_ready_clients.erase(rit);
-            break;
-        }
-    }
-    for (auto wit = write_ready_clients.begin(); wit != write_ready_clients.end(); wit++)
-    {
-        if (*wit == peer_fd)
-        {
-            write_ready_clients.erase(wit);
-            break;
-        }
-    }
-    free(cl->in_buf);
-    cl->in_buf = NULL;
-    close(peer_fd);
-    if (repeer_osd)
-    {
-        // First repeer PGs as canceling OSD ops may push new operations
-        // and we need correct PG states when we do that
-        repeer_pgs(repeer_osd);
-    }
-    if (cl->osd_num)
-    {
-        // Cancel outbound operations
-        cancel_osd_ops(cl);
-    }
-    if (cl->refs <= 0)
-    {
-        delete cl;
-    }
-}
-
 void osd_messenger_t::accept_connections(int listen_fd)
 {
    // Accept new connections
@ -505,13 +380,12 @@ void osd_messenger_t::accept_connections(int listen_fd)
        fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
        int one = 1;
        setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
-        clients[peer_fd] = new osd_client_t((osd_client_t){
-            .peer_addr = addr,
-            .peer_port = ntohs(addr.sin_port),
-            .peer_fd = peer_fd,
-            .peer_state = PEER_CONNECTED,
-            .in_buf = malloc_or_die(receive_buffer_size),
-        });
+        osd_client_t *new_cl = clients[peer_fd] = new osd_client_t(); // registered under peer_fd first, then filled in field by field
+        new_cl->peer_fd = peer_fd;
+        new_cl->peer_addr = addr;
+        new_cl->peer_port = ntohs(addr.sin_port);
+        new_cl->peer_state = PEER_CONNECTED;
+        new_cl->in_buf = malloc_or_die(receive_buffer_size);
        // Add FD to epoll
        tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
        {
--- a/src/messenger.h
+++ b/src/messenger.h
@ -14,19 +14,15 @@

 #include "malloc_or_die.h"
 #include "json11/json11.hpp"
-#include "osd_ops.h"
+#include "msgr_op.h"
 #include "timerfd_manager.h"
-#include "ringloop.h"
-
-#define OSD_OP_IN 0
-#define OSD_OP_OUT 1
+#include <ringloop.h>

 #define CL_READ_HDR 1
 #define CL_READ_DATA 2
 #define CL_READ_REPLY_DATA 3
 #define CL_WRITE_READY 1
 #define CL_WRITE_REPLY 2
-#define OSD_OP_INLINE_BUF_COUNT 16

 #define PEER_CONNECTING 1
 #define PEER_CONNECTED 2
@ -36,160 +32,6 @@
 #define DEFAULT_PEER_CONNECT_TIMEOUT 5
 #define DEFAULT_OSD_PING_TIMEOUT 5

-// Kind of a vector with small-list-optimisation
-struct osd_op_buf_list_t
-{
-    int count = 0, alloc = OSD_OP_INLINE_BUF_COUNT, done = 0;
-    iovec *buf = NULL;
-    iovec inline_buf[OSD_OP_INLINE_BUF_COUNT];
-
-    inline osd_op_buf_list_t()
-    {
-        buf = inline_buf;
-    }
-
-    inline osd_op_buf_list_t(const osd_op_buf_list_t & other)
-    {
-        buf = inline_buf;
-        append(other);
-    }
-
-    inline osd_op_buf_list_t & operator = (const osd_op_buf_list_t & other)
-    {
-        reset();
-        append(other);
-        return *this;
-    }
-
-    inline ~osd_op_buf_list_t()
-    {
-        if (buf && buf != inline_buf)
-        {
-            free(buf);
-        }
-    }
-
-    inline void reset()
-    {
-        count = 0;
-        done = 0;
-    }
-
-    inline iovec* get_iovec()
-    {
-        return buf + done;
-    }
-
-    inline int get_size()
-    {
-        return count - done;
-    }
-
-    inline void append(const osd_op_buf_list_t & other)
-    {
-        if (count+other.count > alloc)
-        {
-            if (buf == inline_buf)
-            {
-                int old = alloc;
-                alloc = (((count+other.count+15)/16)*16);
-                buf = (iovec*)malloc(sizeof(iovec) * alloc);
-                if (!buf)
-                {
-                    printf("Failed to allocate %lu bytes\n", sizeof(iovec) * alloc);
-                    exit(1);
-                }
-                memcpy(buf, inline_buf, sizeof(iovec) * old);
-            }
-            else
-            {
-                alloc = (((count+other.count+15)/16)*16);
-                buf = (iovec*)realloc(buf, sizeof(iovec) * alloc);
-                if (!buf)
-                {
-                    printf("Failed to allocate %lu bytes\n", sizeof(iovec) * alloc);
-                    exit(1);
-                }
-            }
-        }
-        for (int i = 0; i < other.count; i++)
-        {
-            buf[count++] = other.buf[i];
-        }
-    }
-
-    inline void push_back(void *nbuf, size_t len)
-    {
-        if (count >= alloc)
-        {
-            if (buf == inline_buf)
-            {
-                int old = alloc;
-                alloc = ((alloc/16)*16 + 1);
-                buf = (iovec*)malloc(sizeof(iovec) * alloc);
-                if (!buf)
-                {
-                    printf("Failed to allocate %lu bytes\n", sizeof(iovec) * alloc);
-                    exit(1);
-                }
-                memcpy(buf, inline_buf, sizeof(iovec)*old);
-            }
-            else
-            {
-                alloc = alloc < 16 ? 16 : (alloc+16);
-                buf = (iovec*)realloc(buf, sizeof(iovec) * alloc);
-                if (!buf)
-                {
-                    printf("Failed to allocate %lu bytes\n", sizeof(iovec) * alloc);
-                    exit(1);
-                }
-            }
-        }
-        buf[count++] = { .iov_base = nbuf, .iov_len = len };
-    }
-
-    inline void eat(int result)
-    {
-        while (result > 0 && done < count)
-        {
-            iovec & iov = buf[done];
-            if (iov.iov_len <= result)
-            {
-                result -= iov.iov_len;
-                done++;
-            }
-            else
-            {
-                iov.iov_len -= result;
-                iov.iov_base += result;
-                break;
-            }
-        }
-    }
-};
-
-struct blockstore_op_t;
-
-struct osd_primary_op_data_t;
-
-struct osd_op_t
-{
-    timespec tv_begin;
-    uint64_t op_type = OSD_OP_IN;
-    int peer_fd;
-    osd_any_op_t req;
-    osd_any_reply_t reply;
-    blockstore_op_t *bs_op = NULL;
-    void *buf = NULL;
-    void *rmw_buf = NULL;
-    osd_primary_op_data_t* op_data = NULL;
-    std::function<void(osd_op_t*)> callback;
-
-    osd_op_buf_list_t iov;
-
-    ~osd_op_t();
-};
-
 struct osd_client_t
 {
    int refs = 0;
@ -228,6 +70,12 @@ struct osd_client_t
    int write_state = 0;
    std::vector<iovec> send_list, next_send_list;
    std::vector<osd_op_t*> outbox, next_outbox;
+
+    ~osd_client_t()
+    {
+        free(in_buf);
+        in_buf = NULL;
+    }
 };

 struct osd_wanted_peer_t
@ -252,12 +100,9 @@ struct osd_op_stats_t

 struct osd_messenger_t
 {
-    timerfd_manager_t *tfd;
-    ring_loop_t *ringloop;
+protected:
    int keepalive_timer_id = -1;

-    // osd_num_t is only for logging and asserts
-    osd_num_t osd_num;
    // FIXME: make receive_buffer_size configurable
    int receive_buffer_size = 64*1024;
    int peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
@ -267,19 +112,22 @@ struct osd_messenger_t
    int log_level = 0;
    bool use_sync_send_recv = false;

-    std::map<osd_num_t, osd_wanted_peer_t> wanted_peers;
-    std::map<uint64_t, int> osd_peer_fds;
-    uint64_t next_subop_id = 1;
-
-    std::map<int, osd_client_t*> clients;
    std::vector<int> read_ready_clients;
    std::vector<int> write_ready_clients;
    std::vector<std::function<void()>> set_immediate;

+public:
+    timerfd_manager_t *tfd;
+    ring_loop_t *ringloop;
+    // osd_num_t is only for logging and asserts
+    osd_num_t osd_num;
+    uint64_t next_subop_id = 1;
+    std::map<int, osd_client_t*> clients;
+    std::map<osd_num_t, osd_wanted_peer_t> wanted_peers;
+    std::map<uint64_t, int> osd_peer_fds;
    // op statistics
    osd_op_stats_t stats;

-public:
    void init();
    void parse_config(const json11::Json & config);
    void connect_peer(uint64_t osd_num, json11::Json peer_state);
@ -287,7 +135,6 @@ public:
    void outbox_push(osd_op_t *cur_op);
    std::function<void(osd_op_t*)> exec_op;
    std::function<void(osd_num_t)> repeer_pgs;
-    void handle_peer_epoll(int peer_fd, int epoll_events);
    void read_requests();
    void send_replies();
    void accept_connections(int listen_fd);
@ -296,6 +143,7 @@ public:
 protected:
    void try_connect_peer(uint64_t osd_num);
    void try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port);
+    void handle_peer_epoll(int peer_fd, int epoll_events);
    void handle_connect_epoll(int peer_fd);
    void on_connect_peer(osd_num_t peer_osd, int peer_fd);
    void check_peer_config(osd_client_t *cl);
--- a/src/mock/build.sh
+++ b/src/mock/build.sh
@ -0,0 +1 @@
+g++ -D__MOCK__ -fsanitize=address -g -Wno-pointer-arith pg_states.cpp osd_ops.cpp test_cluster_client.cpp cluster_client.cpp msgr_op.cpp msgr_stop.cpp mock/messenger.cpp etcd_state_client.cpp timerfd_manager.cpp ../json11/json11.cpp -I mock -I . -I .. && ./a.out # run the test binary only if the build succeeded, so a stale a.out is never executed
--- a/src/mock/messenger.cpp
+++ b/src/mock/messenger.cpp
@ -0,0 +1,44 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+
+#include <unistd.h>
+#include <stdexcept>
+#include <assert.h>
+
+#include "messenger.h"
+
+void osd_messenger_t::init() // Mock implementation: intentionally a no-op
+{
+}
+
+osd_messenger_t::~osd_messenger_t() // Mock destructor: force-stops every client that is still registered
+{
+    while (clients.size() > 0)
+    {
+        stop_client(clients.begin()->first, true); // force=true also stops clients in PEER_CONNECTING; the loop relies on stop_client() erasing the entry from clients
+    }
+}
+
+void osd_messenger_t::outbox_push(osd_op_t *cur_op) // Mock: nothing is actually sent
+{
+    clients[cur_op->peer_fd]->sent_ops[cur_op->req.hdr.id] = cur_op; // only records the op in the client's sent_ops, keyed by request id; NOTE(review): operator[] inserts a NULL client for an unknown peer_fd - assumes callers always pass a registered fd
+}
+
+void osd_messenger_t::parse_config(const json11::Json & config) // Mock implementation: the config is ignored
+{
+}
+
+void osd_messenger_t::connect_peer(uint64_t peer_osd, json11::Json peer_state) // Mock: no connection is attempted and peer_state is unused
+{
+    wanted_peers[peer_osd] = (osd_wanted_peer_t){ // just register the OSD as a wanted peer
+        .port = 1, // presumably a dummy port value - TODO confirm nothing depends on it
+    };
+}
+
+void osd_messenger_t::read_requests() // Mock implementation: intentionally a no-op
+{
+}
+
+void osd_messenger_t::send_replies() // Mock implementation: intentionally a no-op
+{
+}
--- a/src/mock/ringloop.h
+++ b/src/mock/ringloop.h
@ -0,0 +1,25 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+
+#pragma once
+
+#include <functional>
+
+struct ring_consumer_t // Mock version of the ring loop consumer type
+{
+    std::function<void(void)> loop; // callback slot; the mock ring_loop_t in this header never invokes it
+};
+
+class ring_loop_t // Mock ring loop: every method is an empty stub
+{
+public:
+    void register_consumer(ring_consumer_t *consumer) // no-op: the consumer is not stored anywhere
+    {
+    }
+    void unregister_consumer(ring_consumer_t *consumer) // no-op counterpart of register_consumer()
+    {
+    }
+    void submit() // no-op: the mock has nothing to submit
+    {
+    }
+};
--- a/src/msgr_op.cpp
+++ b/src/msgr_op.cpp
@ -0,0 +1,22 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+
+#include <assert.h>
+
+#include "msgr_op.h"
+
+osd_op_t::~osd_op_t() // Frees the buffers referenced by the operation (rmw_buf and buf)
+{
+    assert(!bs_op); // bs_op must already be NULL by the time the op is destroyed
+    assert(!op_data); // likewise op_data must already be NULL
+    if (rmw_buf)
+    {
+        free(rmw_buf);
+    }
+    if (buf)
+    {
+        // Note: reusing osd_op_t WILL currently lead to memory leaks
+        // So we don't reuse it, but free it every time
+        free(buf);
+    }
+}
--- a/src/msgr_op.h
+++ b/src/msgr_op.h
@ -0,0 +1,171 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+
+#pragma once
+
+#include <sys/uio.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "osd_ops.h"
+
+#define OSD_OP_IN 0
+#define OSD_OP_OUT 1
+
+#define OSD_OP_INLINE_BUF_COUNT 16
+
+// Kind of a vector with small-list-optimisation: the first OSD_OP_INLINE_BUF_COUNT iovecs live inside the object (inline_buf), a heap array is only allocated when more are needed
+struct osd_op_buf_list_t
+{
+    int count = 0, alloc = OSD_OP_INLINE_BUF_COUNT, done = 0; // count: used entries, alloc: capacity of buf, done: leading entries already consumed by eat()
+    iovec *buf = NULL; // points either to inline_buf or to a malloc()ed array
+    iovec inline_buf[OSD_OP_INLINE_BUF_COUNT];
+
+    inline osd_op_buf_list_t()
+    {
+        buf = inline_buf; // start with the inline storage
+    }
+
+    inline osd_op_buf_list_t(const osd_op_buf_list_t & other)
+    {
+        buf = inline_buf;
+        append(other); // copies all other.count entries (consumed ones included); done stays 0 in the copy
+    }
+
+    inline osd_op_buf_list_t & operator = (const osd_op_buf_list_t & other)
+    {
+        reset(); // keeps the current allocation; NOTE(review): on self-assignment this zeroes count first, so the append below copies nothing - confirm self-assignment never happens
+        append(other);
+        return *this;
+    }
+
+    inline ~osd_op_buf_list_t()
+    {
+        if (buf && buf != inline_buf) // only a heap-allocated array needs freeing
+        {
+            free(buf);
+        }
+    }
+
+    inline void reset() // forget all entries; the allocated capacity is kept
+    {
+        count = 0;
+        done = 0;
+    }
+
+    inline iovec* get_iovec() // first entry not yet consumed by eat()
+    {
+        return buf + done;
+    }
+
+    inline int get_size() // number of entries not yet consumed by eat()
+    {
+        return count - done;
+    }
+
+    inline void append(const osd_op_buf_list_t & other) // appends every entry of other (indices 0..other.count-1), growing the storage if needed
+    {
+        if (count+other.count > alloc)
+        {
+            if (buf == inline_buf)
+            {
+                int old = alloc;
+                alloc = (((count+other.count+15)/16)*16); // round the required capacity up to a multiple of 16
+                buf = (iovec*)malloc(sizeof(iovec) * alloc);
+                if (!buf)
+                {
+                    printf("Failed to allocate %lu bytes\n", sizeof(iovec) * alloc); // NOTE(review): the argument is a size_t; %lu assumes size_t == unsigned long
+                    exit(1);
+                }
+                memcpy(buf, inline_buf, sizeof(iovec) * old); // move the inline contents into the new heap array
+            }
+            else
+            {
+                alloc = (((count+other.count+15)/16)*16);
+                buf = (iovec*)realloc(buf, sizeof(iovec) * alloc);
+                if (!buf)
+                {
+                    printf("Failed to allocate %lu bytes\n", sizeof(iovec) * alloc);
+                    exit(1);
+                }
+            }
+        }
+        for (int i = 0; i < other.count; i++)
+        {
+            buf[count++] = other.buf[i];
+        }
+    }
+
+    inline void push_back(void *nbuf, size_t len) // appends a single (pointer, length) entry, growing the storage if it is full
+    {
+        if (count >= alloc)
+        {
+            if (buf == inline_buf)
+            {
+                int old = alloc;
+                alloc = ((alloc/16)*16 + 1); // NOTE(review): the first spill to the heap grows capacity by one entry only (16 -> 17), later growth is +16 - confirm this is intended
+                buf = (iovec*)malloc(sizeof(iovec) * alloc);
+                if (!buf)
+                {
+                    printf("Failed to allocate %lu bytes\n", sizeof(iovec) * alloc);
+                    exit(1);
+                }
+                memcpy(buf, inline_buf, sizeof(iovec)*old);
+            }
+            else
+            {
+                alloc = alloc < 16 ? 16 : (alloc+16);
+                buf = (iovec*)realloc(buf, sizeof(iovec) * alloc);
+                if (!buf)
+                {
+                    printf("Failed to allocate %lu bytes\n", sizeof(iovec) * alloc);
+                    exit(1);
+                }
+            }
+        }
+        buf[count++] = { .iov_base = nbuf, .iov_len = len };
+    }
+
+    inline void eat(int result) // marks `result` bytes as consumed, starting from the first unconsumed entry
+    {
+        while (result > 0 && done < count)
+        {
+            iovec & iov = buf[done];
+            if (iov.iov_len <= result)
+            {
+                result -= iov.iov_len; // entry fully consumed: skip it
+                done++;
+            }
+            else
+            {
+                iov.iov_len -= result; // entry partially consumed: trim it in place
+                iov.iov_base += result; // arithmetic on void* is a GNU extension
+                break;
+            }
+        }
+    }
+};
+
+struct blockstore_op_t;
+
+struct osd_primary_op_data_t;
+
+struct osd_op_t // A single messenger operation: request, reply and the buffers attached to them
+{
+    timespec tv_begin; // presumably the time the operation started - TODO confirm against the users of this field
+    uint64_t op_type = OSD_OP_IN; // OSD_OP_IN or OSD_OP_OUT
+    int peer_fd; // file descriptor identifying the client this op belongs to (key of osd_messenger_t::clients)
+    osd_any_op_t req;
+    osd_any_reply_t reply;
+    blockstore_op_t *bs_op = NULL; // must be NULL again when the op is destroyed (asserted in ~osd_op_t)
+    void *buf = NULL; // free()d by ~osd_op_t when non-NULL
+    void *rmw_buf = NULL; // free()d by ~osd_op_t when non-NULL
+    osd_primary_op_data_t* op_data = NULL; // must be NULL again when the op is destroyed (asserted in ~osd_op_t)
+    std::function<void(osd_op_t*)> callback; // invoked with the op itself, e.g. by cancel_op() for OSD_OP_OUT ops
+
+    osd_op_buf_list_t iov; // scatter/gather buffer list of the op
+
+    ~osd_op_t();
+};
--- a/src/msgr_send.cpp
+++ b/src/msgr_send.cpp
@ -180,7 +180,7 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
    cl->refs--;
    if (cl->peer_state == PEER_STOPPED)
    {
-        if (!cl->refs)
+        if (cl->refs <= 0)
        {
            delete cl;
        }
--- a/src/msgr_stop.cpp
+++ b/src/msgr_stop.cpp
@ -0,0 +1,137 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+
+#include <unistd.h>
+#include <assert.h>
+
+#include "messenger.h"
+
+void osd_messenger_t::cancel_osd_ops(osd_client_t *cl) // Cancels every operation recorded in cl->sent_ops and empties cl->outbox
+{
+    std::vector<osd_op_t*> cancel_ops; // snapshot of the ops to cancel, taken before the client's containers are cleared
+    cancel_ops.resize(cl->sent_ops.size());
+    int i = 0;
+    for (auto p: cl->sent_ops)
+    {
+        cancel_ops[i++] = p.second;
+    }
+    cl->sent_ops.clear(); // clear first, so cancel_op() below never iterates the client's own containers
+    cl->outbox.clear();
+    for (auto op: cancel_ops)
+    {
+        cancel_op(op);
+    }
+}
+
+void osd_messenger_t::cancel_op(osd_op_t *op) // Aborts one operation: outbound ops get an -EPIPE reply via their callback, others are deleted
+{
+    if (op->op_type == OSD_OP_OUT)
+    {
+        op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC; // fabricate a reply header matching the request...
+        op->reply.hdr.id = op->req.hdr.id;
+        op->reply.hdr.opcode = op->req.hdr.opcode;
+        op->reply.hdr.retval = -EPIPE; // ...that reports the failure
+        // Copy lambda to be unaffected by `delete op`
+        std::function<void(osd_op_t*)>(op->callback)(op); // the op is not deleted here; presumably the callback disposes of it
+    }
+    else
+    {
+        // This function is only called in stop_client(), so it's fine to destroy the operation
+        delete op;
+    }
+}
+
+void osd_messenger_t::stop_client(int peer_fd, bool force) // Tears down the client registered under peer_fd; force also stops a client that is still connecting
+{
+    assert(peer_fd != 0);
+    auto it = clients.find(peer_fd);
+    if (it == clients.end())
+    {
+        return; // unknown fd: nothing to stop
+    }
+    osd_client_t *cl = it->second;
+    if (cl->peer_state == PEER_CONNECTING && !force || cl->peer_state == PEER_STOPPED) // && binds tighter than ||: skip connecting peers unless forced, always skip already stopped ones
+    {
+        return;
+    }
+    if (log_level > 0)
+    {
+        if (cl->osd_num)
+        {
+            printf("[OSD %lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl->osd_num);
+        }
+        else
+        {
+            printf("[OSD %lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
+        }
+    }
+    // First set state to STOPPED so another stop_client() call doesn't try to free it again
+    cl->refs++; // hold a reference while this function runs; it is dropped right before the final refs check
+    cl->peer_state = PEER_STOPPED;
+    if (cl->osd_num)
+    {
+        // ...and forget OSD peer
+        osd_peer_fds.erase(cl->osd_num);
+    }
+#ifndef __MOCK__
+    // Then remove FD from the eventloop so we don't accidentally read something
+    tfd->set_fd_handler(peer_fd, false, NULL);
+    if (cl->connect_timeout_id >= 0)
+    {
+        tfd->clear_timer(cl->connect_timeout_id);
+        cl->connect_timeout_id = -1;
+    }
+    for (auto rit = read_ready_clients.begin(); rit != read_ready_clients.end(); rit++) // drop the first pending read-ready entry of this fd
+    {
+        if (*rit == peer_fd)
+        {
+            read_ready_clients.erase(rit);
+            break;
+        }
+    }
+    for (auto wit = write_ready_clients.begin(); wit != write_ready_clients.end(); wit++) // same for the write-ready list
+    {
+        if (*wit == peer_fd)
+        {
+            write_ready_clients.erase(wit);
+            break;
+        }
+    }
+#endif
+    if (cl->osd_num)
+    {
+        // Then repeer PGs because cancel_op() callbacks can try to perform
+        // some actions and we need correct PG states to not do something silly
+        repeer_pgs(cl->osd_num);
+    }
+    // Then cancel all operations
+    if (cl->read_op)
+    {
+        if (!cl->read_op->callback) // only an op without a callback is deleted here
+        {
+            delete cl->read_op;
+        }
+        cl->read_op = NULL;
+    }
+    if (cl->osd_num)
+    {
+        // Cancel outbound operations
+        cancel_osd_ops(cl);
+    }
+#ifndef __MOCK__
+    // And close the FD only when everything is done
+    // ...because peer_fd number can get reused after close()
+    close(peer_fd);
+#endif
+    // Find the item again because it can be invalidated at this point
+    it = clients.find(peer_fd);
+    if (it != clients.end())
+    {
+        clients.erase(it);
+    }
+    cl->refs--;
+    if (cl->refs <= 0) // nobody else holds a reference: free the client now
+    {
+        delete cl;
+    }
+}
--- a/src/osd.cpp
+++ b/src/osd.cpp
@ -8,16 +8,20 @@
 #include <arpa/inet.h>

 #include "osd.h"
+#include "http_client.h"

-osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop)
+osd_t::osd_t(blockstore_config_t & config, ring_loop_t *ringloop)
 {
+    config["entry_attr_size"] = "0";
+
    this->config = config;
-    this->bs = bs;
    this->ringloop = ringloop;

+    // FIXME: Create Blockstore from on-disk superblock config and check it against the OSD cluster config
+    this->bs = new blockstore_t(config, ringloop);
+
    this->bs_block_size = bs->get_block_size();
-    // FIXME: use bitmap granularity instead
-    this->bs_disk_alignment = bs->get_disk_alignment();
+    this->bs_bitmap_granularity = bs->get_bitmap_granularity();

    parse_config(config);

@ -49,6 +53,7 @@ osd_t::~osd_t()
 {
    ringloop->unregister_consumer(&consumer);
    delete epmgr;
+    delete bs;
    close(listen_fd);
 }

@ -171,7 +176,7 @@ bool osd_t::shutdown()
    {
        return false;
    }
-    return bs->is_safe_to_stop();
+    return !bs || bs->is_safe_to_stop();
 }

 void osd_t::loop()
@ -191,6 +196,8 @@ void osd_t::exec_op(osd_op_t *cur_op)
        delete cur_op;
        return;
    }
+    // Clear the reply buffer
+    memset(cur_op->reply.buf, 0, OSD_PACKET_SIZE);
    inflight_ops++;
    if (cur_op->req.hdr.magic != SECONDARY_OSD_OP_MAGIC ||
        cur_op->req.hdr.opcode < OSD_OP_MIN || cur_op->req.hdr.opcode > OSD_OP_MAX ||
@ -198,14 +205,14 @@ void osd_t::exec_op(osd_op_t *cur_op)
            cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
            cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE) &&
            (cur_op->req.sec_rw.len > OSD_RW_MAX ||
-            cur_op->req.sec_rw.len % bs_disk_alignment ||
-            cur_op->req.sec_rw.offset % bs_disk_alignment)) ||
+            cur_op->req.sec_rw.len % bs_bitmap_granularity ||
+            cur_op->req.sec_rw.offset % bs_bitmap_granularity)) ||
        ((cur_op->req.hdr.opcode == OSD_OP_READ ||
            cur_op->req.hdr.opcode == OSD_OP_WRITE ||
            cur_op->req.hdr.opcode == OSD_OP_DELETE) &&
            (cur_op->req.rw.len > OSD_RW_MAX ||
-            cur_op->req.rw.len % bs_disk_alignment ||
-            cur_op->req.rw.offset % bs_disk_alignment)))
+            cur_op->req.rw.len % bs_bitmap_granularity ||
+            cur_op->req.rw.offset % bs_bitmap_granularity)))
    {
        // Bad command
        finish_op(cur_op, -EINVAL);
--- a/src/osd.h
+++ b/src/osd.h
@ -115,7 +115,7 @@ class osd_t
    bool stopping = false;
    int inflight_ops = 0;
    blockstore_t *bs;
-    uint32_t bs_block_size, bs_disk_alignment;
+    uint32_t bs_block_size, bs_bitmap_granularity;
    ring_loop_t *ringloop;
    timerfd_manager_t *tfd = NULL;
    epoll_manager_t *epmgr = NULL;
@ -198,6 +198,7 @@ class osd_t
    void continue_primary_del(osd_op_t *cur_op);
    bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
    void remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg);
+    void free_object_state(pg_t & pg, pg_osd_set_state_t **object_state);
    bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
    void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);
    void handle_primary_bs_subop(osd_op_t *subop);
@ -206,9 +207,11 @@ class osd_t
    void submit_primary_subops(int submit_type, uint64_t op_version, int pg_size, const uint64_t* osd_set, osd_op_t *cur_op);
    void submit_primary_del_subops(osd_op_t *cur_op, uint64_t *cur_set, uint64_t set_size, pg_osd_set_t & loc_set);
    void submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_delete, int chunks_to_delete_count);
-    void submit_primary_sync_subops(osd_op_t *cur_op);
+    int submit_primary_sync_subops(osd_op_t *cur_op);
    void submit_primary_stab_subops(osd_op_t *cur_op);

+    uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state);
+
    inline pg_num_t map_to_pg(object_id oid, uint64_t pg_stripe_size)
    {
        uint64_t pg_count = pg_counts[INODE_POOL(oid.inode)];
@ -218,7 +221,7 @@ class osd_t
    }

 public:
-    osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop);
+    osd_t(blockstore_config_t & config, ring_loop_t *ringloop);
    ~osd_t();
    void force_stop(int exitcode);
    bool shutdown();
--- a/src/osd_cluster.cpp
+++ b/src/osd_cluster.cpp
@ -4,6 +4,7 @@
 #include "osd.h"
 #include "base64.h"
 #include "etcd_state_client.h"
+#include "http_client.h"
 #include "osd_rmw.h"

 // Startup sequence:
@ -557,7 +558,7 @@ void osd_t::apply_pg_config()
                }
                if (currently_taken)
                {
-                    if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING))
+                    if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING | PG_REPEERING))
                    {
                        if (pg_it->second.target_set == pg_cfg.target_set)
                        {
--- a/src/osd_flush.cpp
+++ b/src/osd_flush.cpp
@ -149,10 +149,14 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
        {
            continue_primary_write(op);
        }
-        if (pg.inflight == 0 && (pg.state & PG_STOPPING))
+        if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
        {
            finish_stop_pg(pg);
        }
+        else if ((pg.state & PG_REPEERING) && pg.inflight == 0 && !pg.flush_batch)
+        {
+            start_pg_peering(pg);
+        }
    }
 }

@ -231,7 +235,8 @@ bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
    {
        for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
        {
-            if ((pg_it->second.state & (PG_ACTIVE | PG_HAS_MISPLACED)) == (PG_ACTIVE | PG_HAS_MISPLACED))
+            // Don't try to "recover" misplaced objects if "recovery" would make them degraded
+            if ((pg_it->second.state & (PG_ACTIVE | PG_DEGRADED | PG_HAS_MISPLACED)) == (PG_ACTIVE | PG_HAS_MISPLACED))
            {
                for (auto obj_it = pg_it->second.misplaced_objects.begin(); obj_it != pg_it->second.misplaced_objects.end(); obj_it++)
                {
--- a/src/osd_main.cpp
+++ b/src/osd_main.cpp
@ -41,16 +41,13 @@ int main(int narg, char *args[])
    signal(SIGINT, handle_sigint);
    signal(SIGTERM, handle_sigint);
    ring_loop_t *ringloop = new ring_loop_t(512);
-    // FIXME: Create Blockstore from on-disk superblock config and check it against the OSD cluster config
-    blockstore_t *bs = new blockstore_t(config, ringloop);
-    osd = new osd_t(config, bs, ringloop);
+    osd = new osd_t(config, ringloop);
    while (1)
    {
        ringloop->loop();
        ringloop->wait();
    }
    delete osd;
-    delete bs;
    delete ringloop;
    return 0;
 }
--- a/src/osd_peering.cpp
+++ b/src/osd_peering.cpp
@ -77,10 +77,11 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
    // Re-peer affected PGs
    for (auto & p: pgs)
    {
+        auto & pg = p.second;
        bool repeer = false;
-        if (p.second.state & (PG_PEERING | PG_ACTIVE | PG_INCOMPLETE))
+        if (pg.state & (PG_PEERING | PG_ACTIVE | PG_INCOMPLETE))
        {
-            for (osd_num_t pg_osd: p.second.all_peers)
+            for (osd_num_t pg_osd: pg.all_peers)
            {
                if (pg_osd == peer_osd)
                {
@ -91,8 +92,17 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
            if (repeer)
            {
                // Repeer this pg
-                printf("[PG %u/%u] Repeer because of OSD %lu\n", p.second.pool_id, p.second.pg_num, peer_osd);
-                start_pg_peering(p.second);
+                printf("[PG %u/%u] Repeer because of OSD %lu\n", pg.pool_id, pg.pg_num, peer_osd);
+                if (!(pg.state & (PG_ACTIVE | PG_REPEERING)) || pg.inflight == 0 && !pg.flush_batch)
+                {
+                    start_pg_peering(pg);
+                }
+                else
+                {
+                    // Stop accepting new operations, wait for current ones to finish or fail
+                    pg.state = pg.state & ~PG_ACTIVE | PG_REPEERING;
+                    report_pg_state(pg);
+                }
            }
        }
    }
@ -334,9 +344,10 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p
            {
                // FIXME: Mark peer as failed and don't reconnect immediately after dropping the connection
                printf("Failed to sync OSD %lu: %ld (%s), disconnecting peer\n", role_osd, op->reply.hdr.retval, strerror(-op->reply.hdr.retval));
+                int fail_fd = op->peer_fd;
                ps->list_ops.erase(role_osd);
-                c_cli.stop_client(op->peer_fd);
                delete op;
+                c_cli.stop_client(fail_fd);
                return;
            }
            delete op;
@ -413,9 +424,10 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
            if (op->reply.hdr.retval < 0)
            {
                printf("Failed to get object list from OSD %lu (retval=%ld), disconnecting peer\n", role_osd, op->reply.hdr.retval);
+                int fail_fd = op->peer_fd;
                ps->list_ops.erase(role_osd);
-                c_cli.stop_client(op->peer_fd);
                delete op;
+                c_cli.stop_client(fail_fd);
                return;
            }
            printf(
@ -484,15 +496,13 @@ bool osd_t::stop_pg(pg_t & pg)
    {
        return false;
    }
-    if (!(pg.state & PG_ACTIVE))
+    if (!(pg.state & (PG_ACTIVE | PG_REPEERING)))
    {
        finish_stop_pg(pg);
        return true;
    }
-    pg.state = pg.state & ~PG_ACTIVE | PG_STOPPING;
-    if (pg.inflight == 0 && !pg.flush_batch &&
-        // We must either forget all PG's unstable writes or wait for it to become clean
-        dirty_pgs.find({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }) == dirty_pgs.end())
+    pg.state = pg.state & ~PG_ACTIVE & ~PG_REPEERING | PG_STOPPING;
+    if (pg.inflight == 0 && !pg.flush_batch)
    {
        finish_stop_pg(pg);
    }
--- a/src/osd_peering_pg.cpp
+++ b/src/osd_peering_pg.cpp
@ -430,12 +430,13 @@ void pg_t::calc_object_states(int log_level)
 void pg_t::print_state()
 {
    printf(
-        "[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
+        "[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
        (state & PG_STARTING) ? "starting" : "",
        (state & PG_OFFLINE) ? "offline" : "",
        (state & PG_PEERING) ? "peering" : "",
        (state & PG_INCOMPLETE) ? "incomplete" : "",
        (state & PG_ACTIVE) ? "active" : "",
+        (state & PG_REPEERING) ? "repeering" : "",
        (state & PG_STOPPING) ? "stopping" : "",
        (state & PG_DEGRADED) ? " + degraded" : "",
        (state & PG_HAS_INCOMPLETE) ? " + has_incomplete" : "",
--- a/src/osd_primary.cpp
+++ b/src/osd_primary.cpp
@ -18,7 +18,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
    // Our EC scheme stores data in fixed chunks equal to (K*block size)
    // K = (pg_size-parity_chunks) in case of EC/XOR, or 1 for replicated pools
    pool_id_t pool_id = INODE_POOL(cur_op->req.rw.inode);
-    // FIXME: We have to access pool config here, so make sure that it doesn't change while its PGs are active...
+    // Note: We read pool config here, so we must NOT change it when PGs are active
    auto pool_cfg_it = st_cli.pool_config.find(pool_id);
    if (pool_cfg_it == st_cli.pool_config.end())
    {
@ -44,8 +44,8 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
        return false;
    }
    if ((cur_op->req.rw.offset + cur_op->req.rw.len) > (oid.stripe + pg_block_size) ||
-        (cur_op->req.rw.offset % bs_disk_alignment) != 0 ||
-        (cur_op->req.rw.len % bs_disk_alignment) != 0)
+        (cur_op->req.rw.offset % bs_bitmap_granularity) != 0 ||
+        (cur_op->req.rw.len % bs_bitmap_granularity) != 0)
    {
        finish_op(cur_op, -EINVAL);
        return false;
@ -64,7 +64,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
    return true;
 }

-static uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state)
+uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state)
 {
    if (!(pg.state & (PG_HAS_INCOMPLETE | PG_HAS_DEGRADED | PG_HAS_MISPLACED)))
    {
@ -177,609 +177,6 @@ resume_2:
    finish_op(cur_op, cur_op->req.rw.len);
 }

-bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
-{
-    osd_primary_op_data_t *op_data = cur_op->op_data;
-    // Check if actions are pending for this object
-    auto act_it = pg.flush_actions.lower_bound((obj_piece_id_t){
-        .oid = op_data->oid,
-        .osd_num = 0,
-    });
-    if (act_it != pg.flush_actions.end() &&
-        act_it->first.oid.inode == op_data->oid.inode &&
-        (act_it->first.oid.stripe & ~STRIPE_MASK) == op_data->oid.stripe)
-    {
-        pg.write_queue.emplace(op_data->oid, cur_op);
-        return false;
-    }
-    // Check if there are other write requests to the same object
-    auto vo_it = pg.write_queue.find(op_data->oid);
-    if (vo_it != pg.write_queue.end())
-    {
-        op_data->st = 1;
-        pg.write_queue.emplace(op_data->oid, cur_op);
-        return false;
-    }
-    pg.write_queue.emplace(op_data->oid, cur_op);
-    return true;
-}
-
-void osd_t::continue_primary_write(osd_op_t *cur_op)
-{
-    if (!cur_op->op_data && !prepare_primary_rw(cur_op))
-    {
-        return;
-    }
-    osd_primary_op_data_t *op_data = cur_op->op_data;
-    auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
-    if (op_data->st == 1)      goto resume_1;
-    else if (op_data->st == 2) goto resume_2;
-    else if (op_data->st == 3) goto resume_3;
-    else if (op_data->st == 4) goto resume_4;
-    else if (op_data->st == 5) goto resume_5;
-    else if (op_data->st == 6) goto resume_6;
-    else if (op_data->st == 7) goto resume_7;
-    else if (op_data->st == 8) goto resume_8;
-    else if (op_data->st == 9) goto resume_9;
-    else if (op_data->st == 10) goto resume_10;
-    assert(op_data->st == 0);
-    if (!check_write_queue(cur_op, pg))
-    {
-        return;
-    }
-resume_1:
-    // Determine blocks to read and write
-    // Missing chunks are allowed to be overwritten even in incomplete objects
-    // FIXME: Allow to do small writes to the old (degraded/misplaced) OSD set for lower performance impact
-    op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
-    if (op_data->scheme == POOL_SCHEME_REPLICATED)
-    {
-        // Simplified algorithm
-        op_data->stripes[0].write_start = op_data->stripes[0].req_start;
-        op_data->stripes[0].write_end = op_data->stripes[0].req_end;
-        op_data->stripes[0].write_buf = cur_op->buf;
-        if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
-            op_data->stripes[0].write_end != bs_block_size))
-        {
-            // Object is degraded/misplaced and will be moved to <write_osd_set>
-            op_data->stripes[0].read_start = 0;
-            op_data->stripes[0].read_end = bs_block_size;
-            cur_op->rmw_buf = op_data->stripes[0].read_buf = memalign_or_die(MEM_ALIGNMENT, bs_block_size);
-        }
-    }
-    else
-    {
-        cur_op->rmw_buf = calc_rmw(cur_op->buf, op_data->stripes, op_data->prev_set,
-            pg.pg_size, op_data->pg_data_size, pg.pg_cursize, pg.cur_set.data(), bs_block_size);
-        if (!cur_op->rmw_buf)
-        {
-            // Refuse partial overwrite of an incomplete object
-            cur_op->reply.hdr.retval = -EINVAL;
-            goto continue_others;
-        }
-    }
-    // Read required blocks
-    submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, pg.pg_size, op_data->prev_set, cur_op);
-resume_2:
-    op_data->st = 2;
-    return;
-resume_3:
-    if (op_data->errors > 0)
-    {
-        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
-        return;
-    }
-    // Save version override for parallel reads
-    pg.ver_override[op_data->oid] = op_data->fact_ver;
-    if (op_data->scheme == POOL_SCHEME_REPLICATED)
-    {
-        // Only (possibly) copy new data from the request into the recovery buffer
-        if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
-            op_data->stripes[0].write_end != bs_block_size))
-        {
-            memcpy(
-                op_data->stripes[0].read_buf + op_data->stripes[0].req_start,
-                op_data->stripes[0].write_buf,
-                op_data->stripes[0].req_end - op_data->stripes[0].req_start
-            );
-            op_data->stripes[0].write_buf = op_data->stripes[0].read_buf;
-            op_data->stripes[0].write_start = 0;
-            op_data->stripes[0].write_end = bs_block_size;
-        }
-    }
-    else
-    {
-        // Recover missing stripes, calculate parity
-        if (pg.scheme == POOL_SCHEME_XOR)
-        {
-            calc_rmw_parity_xor(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size);
-        }
-        else if (pg.scheme == POOL_SCHEME_JERASURE)
-        {
-            calc_rmw_parity_jerasure(op_data->stripes, pg.pg_size, op_data->pg_data_size, op_data->prev_set, pg.cur_set.data(), bs_block_size);
-        }
-    }
-    // Send writes
-    if ((op_data->fact_ver >> (64-PG_EPOCH_BITS)) < pg.epoch)
-    {
-        op_data->target_ver = ((uint64_t)pg.epoch << (64-PG_EPOCH_BITS)) | 1;
-    }
-    else
-    {
-        if ((op_data->fact_ver & (1ul<<(64-PG_EPOCH_BITS) - 1)) == (1ul<<(64-PG_EPOCH_BITS) - 1))
-        {
-            assert(pg.epoch != ((1ul << PG_EPOCH_BITS)-1));
-            pg.epoch++;
-        }
-        op_data->target_ver = op_data->fact_ver + 1;
-    }
-    if (pg.epoch > pg.reported_epoch)
-    {
-        // Report newer epoch before writing
-        // FIXME: We may report only one PG state here...
-        this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
-        pg.history_changed = true;
-        report_pg_states();
-resume_10:
-        if (pg.epoch > pg.reported_epoch)
-        {
-            op_data->st = 10;
-            return;
-        }
-    }
-    submit_primary_subops(SUBMIT_WRITE, op_data->target_ver, pg.pg_size, pg.cur_set.data(), cur_op);
-resume_4:
-    op_data->st = 4;
-    return;
-resume_5:
-    if (op_data->errors > 0)
-    {
-        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
-        return;
-    }
-resume_6:
-resume_7:
-    if (!remember_unstable_write(cur_op, pg, pg.cur_loc_set, 6))
-    {
-        // FIXME: Check for immediate_commit == IMMEDIATE_SMALL
-        return;
-    }
-    if (op_data->fact_ver == 1)
-    {
-        // Object is created
-        pg.clean_count++;
-        pg.total_count++;
-    }
-    if (op_data->object_state)
-    {
-        {
-            int recovery_type = op_data->object_state->state & (OBJ_DEGRADED|OBJ_INCOMPLETE) ? 0 : 1;
-            recovery_stat_count[0][recovery_type]++;
-            if (!recovery_stat_count[0][recovery_type])
-            {
-                recovery_stat_count[0][recovery_type]++;
-                recovery_stat_bytes[0][recovery_type] = 0;
-            }
-            for (int role = 0; role < (op_data->scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size); role++)
-            {
-                recovery_stat_bytes[0][recovery_type] += op_data->stripes[role].write_end - op_data->stripes[role].write_start;
-            }
-        }
-        // Any kind of a non-clean object can have extra chunks, because we don't record objects
-        // as degraded & misplaced or incomplete & misplaced at the same time. So try to remove extra chunks
-        if (immediate_commit != IMMEDIATE_ALL)
-        {
-            // We can't remove extra chunks yet if fsyncs are explicit, because
-            // new copies may not be committed to stable storage yet
-            // We can only remove extra chunks after a successful SYNC for this PG
-            for (auto & chunk: op_data->object_state->osd_set)
-            {
-                // Check is the same as in submit_primary_del_subops()
-                if (op_data->scheme == POOL_SCHEME_REPLICATED
-                    ? !contains_osd(pg.cur_set.data(), pg.pg_size, chunk.osd_num)
-                    : (chunk.osd_num != pg.cur_set[chunk.role]))
-                {
-                    pg.copies_to_delete_after_sync.push_back((obj_ver_osd_t){
-                        .osd_num = chunk.osd_num,
-                        .oid = {
-                            .inode = op_data->oid.inode,
-                            .stripe = op_data->oid.stripe | (op_data->scheme == POOL_SCHEME_REPLICATED ? 0 : chunk.role),
-                        },
-                        .version = op_data->fact_ver,
-                    });
-                    copies_to_delete_after_sync_count++;
-                }
-            }
-        }
-        else
-        {
-            submit_primary_del_subops(cur_op, pg.cur_set.data(), pg.pg_size, op_data->object_state->osd_set);
-            if (op_data->n_subops > 0)
-            {
-resume_8:
-                op_data->st = 8;
-                return;
-resume_9:
-                if (op_data->errors > 0)
-                {
-                    pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
-                    return;
-                }
-            }
-        }
-        // Clear object state
-        remove_object_from_state(op_data->oid, op_data->object_state, pg);
-        pg.clean_count++;
-    }
-    cur_op->reply.hdr.retval = cur_op->req.rw.len;
-continue_others:
-    // Remove version override
-    pg.ver_override.erase(op_data->oid);
-    object_id oid = op_data->oid;
-    // Remove the operation from queue before calling finish_op so it doesn't see the completed operation in queue
-    auto next_it = pg.write_queue.find(oid);
-    if (next_it != pg.write_queue.end() && next_it->second == cur_op)
-    {
-        pg.write_queue.erase(next_it++);
-    }
-    // finish_op would invalidate next_it if it cleared pg.write_queue, but it doesn't do that :)
-    finish_op(cur_op, cur_op->reply.hdr.retval);
-    // Continue other write operations to the same object
-    if (next_it != pg.write_queue.end() && next_it->first == oid)
-    {
-        osd_op_t *next_op = next_it->second;
-        continue_primary_write(next_op);
-    }
-}
-
-bool osd_t::remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state)
-{
-    osd_primary_op_data_t *op_data = cur_op->op_data;
-    if (op_data->st == base_state)
-    {
-        goto resume_6;
-    }
-    else if (op_data->st == base_state+1)
-    {
-        goto resume_7;
-    }
-    // FIXME: Check for immediate_commit == IMMEDIATE_SMALL
-    if (immediate_commit == IMMEDIATE_ALL)
-    {
-        if (op_data->scheme != POOL_SCHEME_REPLICATED)
-        {
-            // Send STABILIZE ops immediately
-            op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
-            op_data->unstable_writes = new obj_ver_id[loc_set.size()];
-            {
-                int last_start = 0;
-                for (auto & chunk: loc_set)
-                {
-                    op_data->unstable_writes[last_start] = (obj_ver_id){
-                        .oid = {
-                            .inode = op_data->oid.inode,
-                            .stripe = op_data->oid.stripe | chunk.role,
-                        },
-                        .version = op_data->fact_ver,
-                    };
-                    op_data->unstable_write_osds->push_back((unstable_osd_num_t){
-                        .osd_num = chunk.osd_num,
-                        .start = last_start,
-                        .len = 1,
-                    });
-                    last_start++;
-                }
-            }
-            submit_primary_stab_subops(cur_op);
-resume_6:
-            op_data->st = 6;
-            return false;
-resume_7:
-            // FIXME: Free those in the destructor?
-            delete op_data->unstable_write_osds;
-            delete[] op_data->unstable_writes;
-            op_data->unstable_writes = NULL;
-            op_data->unstable_write_osds = NULL;
-            if (op_data->errors > 0)
-            {
-                pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
-                return false;
-            }
-        }
-    }
-    else
-    {
-        if (op_data->scheme != POOL_SCHEME_REPLICATED)
-        {
-            // Remember version as unstable for EC/XOR
-            for (auto & chunk: loc_set)
-            {
-                this->dirty_osds.insert(chunk.osd_num);
-                this->unstable_writes[(osd_object_id_t){
-                    .osd_num = chunk.osd_num,
-                    .oid = {
-                        .inode = op_data->oid.inode,
-                        .stripe = op_data->oid.stripe | chunk.role,
-                    },
-                }] = op_data->fact_ver;
-            }
-        }
-        else
-        {
-            // Only remember to sync OSDs for replicated pools
-            for (auto & chunk: loc_set)
-            {
-                this->dirty_osds.insert(chunk.osd_num);
-            }
-        }
-        // Remember PG as dirty to drop the connection when PG goes offline
-        // (this is required because of the "lazy sync")
-        auto cl_it = c_cli.clients.find(cur_op->peer_fd);
-        if (cl_it != c_cli.clients.end())
-        {
-            cl_it->second->dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
-        }
-        dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
-    }
-    return true;
-}
-
-// Save and clear unstable_writes -> SYNC all -> STABLE all
-void osd_t::continue_primary_sync(osd_op_t *cur_op)
-{
-    if (!cur_op->op_data)
-    {
-        cur_op->op_data = (osd_primary_op_data_t*)calloc_or_die(1, sizeof(osd_primary_op_data_t));
-    }
-    osd_primary_op_data_t *op_data = cur_op->op_data;
-    if (op_data->st == 1)      goto resume_1;
-    else if (op_data->st == 2) goto resume_2;
-    else if (op_data->st == 3) goto resume_3;
-    else if (op_data->st == 4) goto resume_4;
-    else if (op_data->st == 5) goto resume_5;
-    else if (op_data->st == 6) goto resume_6;
-    else if (op_data->st == 7) goto resume_7;
-    else if (op_data->st == 8) goto resume_8;
-    assert(op_data->st == 0);
-    if (syncs_in_progress.size() > 0)
-    {
-        // Wait for previous syncs, if any
-        // FIXME: We may try to execute the current one in parallel, like in Blockstore, but I'm not sure if it matters at all
-        syncs_in_progress.push_back(cur_op);
-        op_data->st = 1;
-resume_1:
-        return;
-    }
-    else
-    {
-        syncs_in_progress.push_back(cur_op);
-    }
-resume_2:
-    if (dirty_osds.size() == 0)
-    {
-        // Nothing to sync
-        goto finish;
-    }
-    // Save and clear unstable_writes
-    // In theory it is possible to do in on a per-client basis, but this seems to be an unnecessary complication
-    // It would be cool not to copy these here at all, but someone has to deduplicate them by object IDs anyway
-    if (unstable_writes.size() > 0)
-    {
-        op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
-        op_data->unstable_writes = new obj_ver_id[this->unstable_writes.size()];
-        osd_num_t last_osd = 0;
-        int last_start = 0, last_end = 0;
-        for (auto it = this->unstable_writes.begin(); it != this->unstable_writes.end(); it++)
-        {
-            if (last_osd != it->first.osd_num)
-            {
-                if (last_osd != 0)
-                {
-                    op_data->unstable_write_osds->push_back((unstable_osd_num_t){
-                        .osd_num = last_osd,
-                        .start = last_start,
-                        .len = last_end - last_start,
-                    });
-                }
-                last_osd = it->first.osd_num;
-                last_start = last_end;
-            }
-            op_data->unstable_writes[last_end] = (obj_ver_id){
-                .oid = it->first.oid,
-                .version = it->second,
-            };
-            last_end++;
-        }
-        if (last_osd != 0)
-        {
-            op_data->unstable_write_osds->push_back((unstable_osd_num_t){
-                .osd_num = last_osd,
-                .start = last_start,
-                .len = last_end - last_start,
-            });
-        }
-        this->unstable_writes.clear();
-    }
-    {
-        void *dirty_buf = malloc_or_die(
-            sizeof(pool_pg_num_t)*dirty_pgs.size() +
-            sizeof(osd_num_t)*dirty_osds.size() +
-            sizeof(obj_ver_osd_t)*this->copies_to_delete_after_sync_count
-        );
-        op_data->dirty_pgs = (pool_pg_num_t*)dirty_buf;
-        op_data->dirty_osds = (osd_num_t*)(dirty_buf + sizeof(pool_pg_num_t)*dirty_pgs.size());
-        op_data->dirty_pg_count = dirty_pgs.size();
-        op_data->dirty_osd_count = dirty_osds.size();
-        if (this->copies_to_delete_after_sync_count)
-        {
-            op_data->copies_to_delete_count = 0;
-            op_data->copies_to_delete = (obj_ver_osd_t*)(op_data->dirty_osds + op_data->dirty_osd_count);
-            for (auto dirty_pg_num: dirty_pgs)
-            {
-                auto & pg = pgs.at(dirty_pg_num);
-                assert(pg.copies_to_delete_after_sync.size() <= this->copies_to_delete_after_sync_count);
-                memcpy(
-                    op_data->copies_to_delete + op_data->copies_to_delete_count,
-                    pg.copies_to_delete_after_sync.data(),
-                    sizeof(obj_ver_osd_t)*pg.copies_to_delete_after_sync.size()
-                );
-                op_data->copies_to_delete_count += pg.copies_to_delete_after_sync.size();
-                this->copies_to_delete_after_sync_count -= pg.copies_to_delete_after_sync.size();
-                pg.copies_to_delete_after_sync.clear();
-            }
-            assert(this->copies_to_delete_after_sync_count == 0);
-        }
-        int dpg = 0;
-        for (auto dirty_pg_num: dirty_pgs)
-        {
-            pgs.at(dirty_pg_num).inflight++;
-            op_data->dirty_pgs[dpg++] = dirty_pg_num;
-        }
-        dirty_pgs.clear();
-        dpg = 0;
-        for (auto osd_num: dirty_osds)
-        {
-            op_data->dirty_osds[dpg++] = osd_num;
-        }
-        dirty_osds.clear();
-    }
-    if (immediate_commit != IMMEDIATE_ALL)
-    {
-        // SYNC
-        submit_primary_sync_subops(cur_op);
-resume_3:
-        op_data->st = 3;
-        return;
-resume_4:
-        if (op_data->errors > 0)
-        {
-            goto resume_6;
-        }
-    }
-    if (op_data->unstable_writes)
-    {
-        // Stabilize version sets, if any
-        submit_primary_stab_subops(cur_op);
-resume_5:
-        op_data->st = 5;
-        return;
-    }
-resume_6:
-    if (op_data->errors > 0)
-    {
-        // Return PGs and OSDs back into their dirty sets
-        for (int i = 0; i < op_data->dirty_pg_count; i++)
-        {
-            dirty_pgs.insert(op_data->dirty_pgs[i]);
-        }
-        for (int i = 0; i < op_data->dirty_osd_count; i++)
-        {
-            dirty_osds.insert(op_data->dirty_osds[i]);
-        }
-        if (op_data->unstable_writes)
-        {
-            // Return objects back into the unstable write set
-            for (auto unstable_osd: *(op_data->unstable_write_osds))
-            {
-                for (int i = 0; i < unstable_osd.len; i++)
-                {
-                    // Except those from peered PGs
-                    auto & w = op_data->unstable_writes[i];
-                    pool_pg_num_t wpg = {
-                        .pool_id = INODE_POOL(w.oid.inode),
-                        .pg_num = map_to_pg(w.oid, st_cli.pool_config.at(INODE_POOL(w.oid.inode)).pg_stripe_size),
-                    };
-                    if (pgs.at(wpg).state & PG_ACTIVE)
-                    {
-                        uint64_t & dest = this->unstable_writes[(osd_object_id_t){
-                            .osd_num = unstable_osd.osd_num,
-                            .oid = w.oid,
-                        }];
-                        dest = dest < w.version ? w.version : dest;
-                        dirty_pgs.insert(wpg);
-                    }
-                }
-            }
-        }
-        if (op_data->copies_to_delete)
-        {
-            // Return 'copies to delete' back into respective PGs
-            for (int i = 0; i < op_data->copies_to_delete_count; i++)
-            {
-                auto & w = op_data->copies_to_delete[i];
-                auto & pg = pgs.at((pool_pg_num_t){
-                    .pool_id = INODE_POOL(w.oid.inode),
-                    .pg_num = map_to_pg(w.oid, st_cli.pool_config.at(INODE_POOL(w.oid.inode)).pg_stripe_size),
-                });
-                if (pg.state & PG_ACTIVE)
-                {
-                    pg.copies_to_delete_after_sync.push_back(w);
-                    copies_to_delete_after_sync_count++;
-                }
-            }
-        }
-    }
-    else if (op_data->copies_to_delete)
-    {
-        // Actually delete copies which we wanted to delete
-        submit_primary_del_batch(cur_op, op_data->copies_to_delete, op_data->copies_to_delete_count);
-resume_7:
-        op_data->st = 7;
-        return;
-resume_8:
-        if (op_data->errors > 0)
-        {
-            goto resume_6;
-        }
-    }
-    for (int i = 0; i < op_data->dirty_pg_count; i++)
-    {
-        auto & pg = pgs.at(op_data->dirty_pgs[i]);
-        pg.inflight--;
-        if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch &&
-            // We must either forget all PG's unstable writes or wait for it to become clean
-            dirty_pgs.find({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }) == dirty_pgs.end())
-        {
-            finish_stop_pg(pg);
-        }
-    }
-    // FIXME: Free those in the destructor?
-    free(op_data->dirty_pgs);
-    op_data->dirty_pgs = NULL;
-    op_data->dirty_osds = NULL;
-    if (op_data->unstable_writes)
-    {
-        delete op_data->unstable_write_osds;
-        delete[] op_data->unstable_writes;
-        op_data->unstable_writes = NULL;
-        op_data->unstable_write_osds = NULL;
-    }
-    if (op_data->errors > 0)
-    {
-        finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
-    }
-    else
-    {
-finish:
-        if (cur_op->peer_fd)
-        {
-            auto it = c_cli.clients.find(cur_op->peer_fd);
-            if (it != c_cli.clients.end())
-                it->second->dirty_pgs.clear();
-        }
-        finish_op(cur_op, 0);
-    }
-    assert(syncs_in_progress.front() == cur_op);
-    syncs_in_progress.pop_front();
-    if (syncs_in_progress.size() > 0)
-    {
-        cur_op = syncs_in_progress.front();
-        op_data = cur_op->op_data;
-        op_data->st++;
-        goto resume_2;
-    }
-}
-
 // Decrement pg_osd_set_state_t's object_count and change PG state accordingly
 void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t & pg)
 {
@ -818,10 +215,14 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object
    {
        throw std::runtime_error("BUG: Invalid object state: "+std::to_string(object_state->state));
    }
-    object_state->object_count--;
-    if (!object_state->object_count)
+}
+
+void osd_t::free_object_state(pg_t & pg, pg_osd_set_state_t **object_state)
+{
+    if (*object_state && !(--(*object_state)->object_count))
    {
-        pg.state_dict.erase(object_state->osd_set);
+        pg.state_dict.erase((*object_state)->osd_set);
+        *object_state = NULL;
    }
 }

@ -887,22 +288,21 @@ resume_5:
    else
    {
        remove_object_from_state(op_data->oid, op_data->object_state, pg);
+        free_object_state(pg, &op_data->object_state);
    }
    pg.total_count--;
-    object_id oid = op_data->oid;
+    osd_op_t *next_op = NULL;
+    auto next_it = pg.write_queue.find(op_data->oid);
+    if (next_it != pg.write_queue.end() && next_it->second == cur_op)
+    {
+        pg.write_queue.erase(next_it++);
+        if (next_it != pg.write_queue.end() && next_it->first == op_data->oid)
+            next_op = next_it->second;
+    }
    finish_op(cur_op, cur_op->req.rw.len);
-    // Continue other write operations to the same object
-    auto next_it = pg.write_queue.find(oid);
-    auto this_it = next_it;
-    if (this_it != pg.write_queue.end() && this_it->second == cur_op)
+    if (next_op)
    {
-        next_it++;
-        pg.write_queue.erase(this_it);
-        if (next_it != pg.write_queue.end() &&
-            next_it->first == oid)
-        {
-            osd_op_t *next_op = next_it->second;
+        // Continue next write to the same object
        continue_primary_write(next_op);
    }
-    }
 }
--- a/src/osd_primary_subops.cpp
+++ b/src/osd_primary_subops.cpp
@ -43,12 +43,14 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
            auto & pg = pgs.at({ .pool_id = INODE_POOL(cur_op->op_data->oid.inode), .pg_num = cur_op->op_data->pg_num });
            pg.inflight--;
            assert(pg.inflight >= 0);
-            if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch &&
-                // We must either forget all PG's unstable writes or wait for it to become clean
-                dirty_pgs.find({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }) == dirty_pgs.end())
+            if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
            {
                finish_stop_pg(pg);
            }
+            else if ((pg.state & PG_REPEERING) && pg.inflight == 0 && !pg.flush_batch)
+            {
+                start_pg_peering(pg);
+            }
        }
        assert(!cur_op->op_data->subops);
        assert(!cur_op->op_data->unstable_write_osds);
@ -194,14 +196,7 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, int pg_s
                }
                subops[i].callback = [cur_op, this](osd_op_t *subop)
                {
-                    int fail_fd = subop->req.hdr.opcode == OSD_OP_SEC_WRITE &&
-                        subop->reply.hdr.retval != subop->req.sec_rw.len ? subop->peer_fd : -1;
                    handle_primary_subop(subop, cur_op);
-                    if (fail_fd >= 0)
-                    {
-                        // write operation failed, drop the connection
-                        c_cli.stop_client(fail_fd);
-                    }
                };
                c_cli.outbox_push(&subops[i]);
            }
@ -247,6 +242,7 @@ void osd_t::handle_primary_bs_subop(osd_op_t *subop)
    }
    delete bs_op;
    subop->bs_op = NULL;
+    subop->peer_fd = -1;
    handle_primary_subop(subop, cur_op);
 }

@ -288,6 +284,11 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
            op_data->epipe++;
        }
        op_data->errors++;
+        if (subop->peer_fd >= 0)
+        {
+            // Drop connection on any error
+            c_cli.stop_client(subop->peer_fd);
+        }
    }
    else
    {
@ -427,7 +428,7 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
        {
            subops[i].op_type = OSD_OP_OUT;
            subops[i].peer_fd = c_cli.osd_peer_fds.at(chunk.osd_num);
-            subops[i].req.sec_del = {
+            subops[i].req = (osd_any_op_t){ .sec_del = {
                .header = {
                    .magic = SECONDARY_OSD_OP_MAGIC,
                    .id = c_cli.next_subop_id++,
@ -435,23 +436,17 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
                },
                .oid = chunk.oid,
                .version = chunk.version,
-            };
+            } };
            subops[i].callback = [cur_op, this](osd_op_t *subop)
            {
-                int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
                handle_primary_subop(subop, cur_op);
-                if (fail_fd >= 0)
-                {
-                    // delete operation failed, drop the connection
-                    c_cli.stop_client(fail_fd);
-                }
            };
            c_cli.outbox_push(&subops[i]);
        }
    }
 }

-void osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
+int osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
 {
    osd_primary_op_data_t *op_data = cur_op->op_data;
    int n_osds = op_data->dirty_osd_count;
@ -459,6 +454,7 @@ void osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
    op_data->done = op_data->errors = 0;
    op_data->n_subops = n_osds;
    op_data->subops = subops;
+    std::map<uint64_t, int>::iterator peer_it;
    for (int i = 0; i < n_osds; i++)
    {
        osd_num_t sync_osd = op_data->dirty_osds[i];
@ -475,30 +471,35 @@ void osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
            });
            bs->enqueue_op(subops[i].bs_op);
        }
-        else
+        else if ((peer_it = c_cli.osd_peer_fds.find(sync_osd)) != c_cli.osd_peer_fds.end())
        {
            subops[i].op_type = OSD_OP_OUT;
-            subops[i].peer_fd = c_cli.osd_peer_fds.at(sync_osd);
-            subops[i].req.sec_sync = {
+            subops[i].peer_fd = peer_it->second;
+            subops[i].req = (osd_any_op_t){ .sec_sync = {
                .header = {
                    .magic = SECONDARY_OSD_OP_MAGIC,
                    .id = c_cli.next_subop_id++,
                    .opcode = OSD_OP_SEC_SYNC,
                },
-            };
+            } };
            subops[i].callback = [cur_op, this](osd_op_t *subop)
            {
-                int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
                handle_primary_subop(subop, cur_op);
-                if (fail_fd >= 0)
-                {
-                    // sync operation failed, drop the connection
-                    c_cli.stop_client(fail_fd);
-                }
            };
            c_cli.outbox_push(&subops[i]);
        }
+        else
+        {
+            op_data->done++;
        }
+    }
+    if (op_data->done >= op_data->n_subops)
+    {
+        delete[] op_data->subops;
+        op_data->subops = NULL;
+        return 0;
+    }
+    return 1;
 }

 void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
@ -531,24 +532,18 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
        {
            subops[i].op_type = OSD_OP_OUT;
            subops[i].peer_fd = c_cli.osd_peer_fds.at(stab_osd.osd_num);
-            subops[i].req.sec_stab = {
+            subops[i].req = (osd_any_op_t){ .sec_stab = {
                .header = {
                    .magic = SECONDARY_OSD_OP_MAGIC,
                    .id = c_cli.next_subop_id++,
                    .opcode = OSD_OP_SEC_STABILIZE,
                },
                .len = (uint64_t)(stab_osd.len * sizeof(obj_ver_id)),
-            };
+            } };
            subops[i].iov.push_back(op_data->unstable_writes + stab_osd.start, stab_osd.len * sizeof(obj_ver_id));
            subops[i].callback = [cur_op, this](osd_op_t *subop)
            {
-                int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
                handle_primary_subop(subop, cur_op);
-                if (fail_fd >= 0)
-                {
-                    // sync operation failed, drop the connection
-                    c_cli.stop_client(fail_fd);
-                }
            };
            c_cli.outbox_push(&subops[i]);
        }
@ -566,7 +561,7 @@ void osd_t::pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid,
        return;
    }
    std::vector<osd_op_t*> cancel_ops;
-    while (it != pg.write_queue.end())
+    while (it != pg.write_queue.end() && it->first == oid)
    {
        cancel_ops.push_back(it->second);
        it++;
--- a/src/osd_primary_sync.cpp
+++ b/src/osd_primary_sync.cpp
@ -0,0 +1,265 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+
+#include "osd_primary.h"
+
+// Primary SYNC handler: save and clear unstable_writes -> SYNC all -> STABILIZE all
+// -> delete the copies whose removal was postponed until after a sync.
+//
+// The function is written as a resumable state machine: it is entered several times for
+// the same operation and op_data->st selects the label to continue from. The state is
+// presumably advanced by the subop completion handler (not visible here) before re-entry.
+//
+// Syncs are serialized through syncs_in_progress: only the front operation runs, later ones
+// park in state 1 and are resumed from the bottom of this function when the front one finishes.
+//
+// On failure the dirty PGs/OSDs captured by this operation are returned to the OSD-wide sets,
+// unstable writes and postponed deletions are returned for PGs that are still active,
+// and the operation finishes with -EPIPE or -EIO.
+void osd_t::continue_primary_sync(osd_op_t *cur_op)
+{
+    if (!cur_op->op_data)
+    {
+        cur_op->op_data = (osd_primary_op_data_t*)calloc_or_die(1, sizeof(osd_primary_op_data_t));
+    }
+    osd_primary_op_data_t *op_data = cur_op->op_data;
+    if (op_data->st == 1)      goto resume_1;
+    else if (op_data->st == 2) goto resume_2;
+    else if (op_data->st == 3) goto resume_3;
+    else if (op_data->st == 4) goto resume_4;
+    else if (op_data->st == 5) goto resume_5;
+    else if (op_data->st == 6) goto resume_6;
+    else if (op_data->st == 7) goto resume_7;
+    else if (op_data->st == 8) goto resume_8;
+    assert(op_data->st == 0);
+    if (syncs_in_progress.size() > 0)
+    {
+        // Wait for previous syncs, if any
+        // FIXME: We may try to execute the current one in parallel, like in Blockstore, but I'm not sure if it matters at all
+        syncs_in_progress.push_back(cur_op);
+        op_data->st = 1;
+resume_1:
+        return;
+    }
+    else
+    {
+        syncs_in_progress.push_back(cur_op);
+    }
+resume_2:
+    if (dirty_osds.size() == 0)
+    {
+        // Nothing to sync
+        goto finish;
+    }
+    // Save and clear unstable_writes
+    // In theory it is possible to do it on a per-client basis, but this seems to be an unnecessary complication
+    // It would be cool not to copy these here at all, but someone has to deduplicate them by object IDs anyway
+    if (unstable_writes.size() > 0)
+    {
+        op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
+        op_data->unstable_writes = new obj_ver_id[this->unstable_writes.size()];
+        // Flatten the map into one array and remember the [start, start+len) range of each OSD
+        osd_num_t last_osd = 0;
+        int last_start = 0, last_end = 0;
+        for (auto it = this->unstable_writes.begin(); it != this->unstable_writes.end(); it++)
+        {
+            if (last_osd != it->first.osd_num)
+            {
+                if (last_osd != 0)
+                {
+                    op_data->unstable_write_osds->push_back((unstable_osd_num_t){
+                        .osd_num = last_osd,
+                        .start = last_start,
+                        .len = last_end - last_start,
+                    });
+                }
+                last_osd = it->first.osd_num;
+                last_start = last_end;
+            }
+            op_data->unstable_writes[last_end] = (obj_ver_id){
+                .oid = it->first.oid,
+                .version = it->second,
+            };
+            last_end++;
+        }
+        if (last_osd != 0)
+        {
+            op_data->unstable_write_osds->push_back((unstable_osd_num_t){
+                .osd_num = last_osd,
+                .start = last_start,
+                .len = last_end - last_start,
+            });
+        }
+        this->unstable_writes.clear();
+    }
+    {
+        // A single allocation holds dirty PGs, then dirty OSDs, then copies to delete;
+        // it is released through op_data->dirty_pgs at the end
+        void *dirty_buf = malloc_or_die(
+            sizeof(pool_pg_num_t)*dirty_pgs.size() +
+            sizeof(osd_num_t)*dirty_osds.size() +
+            sizeof(obj_ver_osd_t)*this->copies_to_delete_after_sync_count
+        );
+        op_data->dirty_pgs = (pool_pg_num_t*)dirty_buf;
+        op_data->dirty_osds = (osd_num_t*)(dirty_buf + sizeof(pool_pg_num_t)*dirty_pgs.size());
+        op_data->dirty_pg_count = dirty_pgs.size();
+        op_data->dirty_osd_count = dirty_osds.size();
+        if (this->copies_to_delete_after_sync_count)
+        {
+            op_data->copies_to_delete_count = 0;
+            op_data->copies_to_delete = (obj_ver_osd_t*)(op_data->dirty_osds + op_data->dirty_osd_count);
+            for (auto dirty_pg_num: dirty_pgs)
+            {
+                auto & pg = pgs.at(dirty_pg_num);
+                assert(pg.copies_to_delete_after_sync.size() <= this->copies_to_delete_after_sync_count);
+                memcpy(
+                    op_data->copies_to_delete + op_data->copies_to_delete_count,
+                    pg.copies_to_delete_after_sync.data(),
+                    sizeof(obj_ver_osd_t)*pg.copies_to_delete_after_sync.size()
+                );
+                op_data->copies_to_delete_count += pg.copies_to_delete_after_sync.size();
+                this->copies_to_delete_after_sync_count -= pg.copies_to_delete_after_sync.size();
+                pg.copies_to_delete_after_sync.clear();
+            }
+            assert(this->copies_to_delete_after_sync_count == 0);
+        }
+        int dpg = 0;
+        for (auto dirty_pg_num: dirty_pgs)
+        {
+            // Counted back down at the end of the sync
+            pgs.at(dirty_pg_num).inflight++;
+            op_data->dirty_pgs[dpg++] = dirty_pg_num;
+        }
+        dirty_pgs.clear();
+        dpg = 0;
+        for (auto osd_num: dirty_osds)
+        {
+            op_data->dirty_osds[dpg++] = osd_num;
+        }
+        dirty_osds.clear();
+    }
+    if (immediate_commit != IMMEDIATE_ALL)
+    {
+        // SYNC
+        if (!submit_primary_sync_subops(cur_op))
+        {
+            // No subops to wait for
+            goto resume_4;
+        }
+resume_3:
+        op_data->st = 3;
+        return;
+resume_4:
+        if (op_data->errors > 0)
+        {
+            goto resume_6;
+        }
+    }
+    if (op_data->unstable_writes)
+    {
+        // Stabilize version sets, if any
+        submit_primary_stab_subops(cur_op);
+resume_5:
+        op_data->st = 5;
+        return;
+    }
+resume_6:
+    if (op_data->errors > 0)
+    {
+        // Return PGs and OSDs back into their dirty sets
+        for (int i = 0; i < op_data->dirty_pg_count; i++)
+        {
+            dirty_pgs.insert(op_data->dirty_pgs[i]);
+        }
+        for (int i = 0; i < op_data->dirty_osd_count; i++)
+        {
+            dirty_osds.insert(op_data->dirty_osds[i]);
+        }
+        if (op_data->unstable_writes)
+        {
+            // Return objects back into the unstable write set
+            for (auto unstable_osd: *(op_data->unstable_write_osds))
+            {
+                for (int i = 0; i < unstable_osd.len; i++)
+                {
+                    // Except those from peered PGs
+                    // Entries of this OSD occupy [start, start+len) of the flat array,
+                    // so the index must be offset by unstable_osd.start
+                    auto & w = op_data->unstable_writes[unstable_osd.start + i];
+                    pool_pg_num_t wpg = {
+                        .pool_id = INODE_POOL(w.oid.inode),
+                        .pg_num = map_to_pg(w.oid, st_cli.pool_config.at(INODE_POOL(w.oid.inode)).pg_stripe_size),
+                    };
+                    if (pgs.at(wpg).state & PG_ACTIVE)
+                    {
+                        uint64_t & dest = this->unstable_writes[(osd_object_id_t){
+                            .osd_num = unstable_osd.osd_num,
+                            .oid = w.oid,
+                        }];
+                        // Keep the newest version if the object was written again in the meantime
+                        dest = dest < w.version ? w.version : dest;
+                        dirty_pgs.insert(wpg);
+                    }
+                }
+            }
+        }
+        if (op_data->copies_to_delete)
+        {
+            // Return 'copies to delete' back into respective PGs
+            for (int i = 0; i < op_data->copies_to_delete_count; i++)
+            {
+                auto & w = op_data->copies_to_delete[i];
+                auto & pg = pgs.at((pool_pg_num_t){
+                    .pool_id = INODE_POOL(w.oid.inode),
+                    .pg_num = map_to_pg(w.oid, st_cli.pool_config.at(INODE_POOL(w.oid.inode)).pg_stripe_size),
+                });
+                if (pg.state & PG_ACTIVE)
+                {
+                    pg.copies_to_delete_after_sync.push_back(w);
+                    copies_to_delete_after_sync_count++;
+                }
+            }
+        }
+    }
+    else if (op_data->copies_to_delete)
+    {
+        // Actually delete copies which we wanted to delete
+        submit_primary_del_batch(cur_op, op_data->copies_to_delete, op_data->copies_to_delete_count);
+resume_7:
+        op_data->st = 7;
+        return;
+resume_8:
+        if (op_data->errors > 0)
+        {
+            goto resume_6;
+        }
+    }
+    for (int i = 0; i < op_data->dirty_pg_count; i++)
+    {
+        auto & pg = pgs.at(op_data->dirty_pgs[i]);
+        pg.inflight--;
+        if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
+        {
+            finish_stop_pg(pg);
+        }
+        else if ((pg.state & PG_REPEERING) && pg.inflight == 0 && !pg.flush_batch)
+        {
+            start_pg_peering(pg);
+        }
+    }
+    // FIXME: Free those in the destructor?
+    // dirty_osds (and copies_to_delete) point into the same allocation as dirty_pgs
+    free(op_data->dirty_pgs);
+    op_data->dirty_pgs = NULL;
+    op_data->dirty_osds = NULL;
+    if (op_data->unstable_writes)
+    {
+        delete op_data->unstable_write_osds;
+        delete[] op_data->unstable_writes;
+        op_data->unstable_writes = NULL;
+        op_data->unstable_write_osds = NULL;
+    }
+    if (op_data->errors > 0)
+    {
+        finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
+    }
+    else
+    {
+finish:
+        if (cur_op->peer_fd)
+        {
+            auto it = c_cli.clients.find(cur_op->peer_fd);
+            if (it != c_cli.clients.end())
+                it->second->dirty_pgs.clear();
+        }
+        finish_op(cur_op, 0);
+    }
+    assert(syncs_in_progress.front() == cur_op);
+    syncs_in_progress.pop_front();
+    if (syncs_in_progress.size() > 0)
+    {
+        // Resume the next queued sync (parked in state 1) in place
+        cur_op = syncs_in_progress.front();
+        op_data = cur_op->op_data;
+        op_data->st++;
+        goto resume_2;
+    }
+}
--- a/src/osd_primary_write.cpp
+++ b/src/osd_primary_write.cpp
@ -0,0 +1,378 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+
+#include "osd_primary.h"
+#include "allocator.h"
+
+// Queue <cur_op> in the per-PG write queue and tell the caller whether it may start now.
+// Returns false when a flush action is pending for the object or when another write to
+// the same object is already queued (in the latter case the operation is moved to state 1).
+// The operation is added to pg.write_queue in every case.
+bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
+{
+    osd_primary_op_data_t *op_data = cur_op->op_data;
+    bool can_start = true;
+    // First flush action at or after this object
+    auto flush_it = pg.flush_actions.lower_bound((obj_piece_id_t){
+        .oid = op_data->oid,
+        .osd_num = 0,
+    });
+    bool flush_pending = flush_it != pg.flush_actions.end() &&
+        flush_it->first.oid.inode == op_data->oid.inode &&
+        (flush_it->first.oid.stripe & ~STRIPE_MASK) == op_data->oid.stripe;
+    if (flush_pending)
+    {
+        // Actions are pending for this object
+        can_start = false;
+    }
+    else if (pg.write_queue.find(op_data->oid) != pg.write_queue.end())
+    {
+        // There are other write requests to the same object
+        op_data->st = 1;
+        can_start = false;
+    }
+    pg.write_queue.emplace(op_data->oid, cur_op);
+    return can_start;
+}
+
+// Primary WRITE handler, written as a resumable state machine: it is entered several
+// times for the same operation and op_data->st selects the label to continue from.
+// The state is presumably advanced by subop completion handlers (not visible here).
+//
+// Stages:
+//   0     - prepare the operation and put it into the per-object write queue
+//   1     - calculate what has to be read and written, submit reads
+//   2/3   - wait for reads, calculate parity (EC/XOR) or merge data (replicated)
+//   10    - wait until the new PG epoch is reported
+//   4/5   - submit writes and wait for them
+//   6/7   - handled inside remember_unstable_write() (immediate stabilize)
+//   8/9   - wait for deletion of extra object copies
+// When done, the operation is removed from the write queue, finished, and the next
+// queued write to the same object (if any) is continued.
+void osd_t::continue_primary_write(osd_op_t *cur_op)
+{
+    if (!cur_op->op_data && !prepare_primary_rw(cur_op))
+    {
+        return;
+    }
+    osd_primary_op_data_t *op_data = cur_op->op_data;
+    auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
+    if (op_data->st == 1)      goto resume_1;
+    else if (op_data->st == 2) goto resume_2;
+    else if (op_data->st == 3) goto resume_3;
+    else if (op_data->st == 4) goto resume_4;
+    else if (op_data->st == 5) goto resume_5;
+    else if (op_data->st == 6) goto resume_6;
+    else if (op_data->st == 7) goto resume_7;
+    else if (op_data->st == 8) goto resume_8;
+    else if (op_data->st == 9) goto resume_9;
+    else if (op_data->st == 10) goto resume_10;
+    assert(op_data->st == 0);
+    if (!check_write_queue(cur_op, pg))
+    {
+        return;
+    }
+resume_1:
+    // Determine blocks to read and write
+    // Missing chunks are allowed to be overwritten even in incomplete objects
+    // FIXME: Allow to do small writes to the old (degraded/misplaced) OSD set for lower performance impact
+    op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
+    if (op_data->scheme == POOL_SCHEME_REPLICATED)
+    {
+        // Simplified algorithm
+        op_data->stripes[0].write_start = op_data->stripes[0].req_start;
+        op_data->stripes[0].write_end = op_data->stripes[0].req_end;
+        op_data->stripes[0].write_buf = cur_op->buf;
+        if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
+            op_data->stripes[0].write_end != bs_block_size))
+        {
+            // Object is degraded/misplaced and will be moved to <write_osd_set>
+            op_data->stripes[0].read_start = 0;
+            op_data->stripes[0].read_end = bs_block_size;
+            cur_op->rmw_buf = op_data->stripes[0].read_buf = memalign_or_die(MEM_ALIGNMENT, bs_block_size);
+        }
+    }
+    else
+    {
+        cur_op->rmw_buf = calc_rmw(cur_op->buf, op_data->stripes, op_data->prev_set,
+            pg.pg_size, op_data->pg_data_size, pg.pg_cursize, pg.cur_set.data(), bs_block_size);
+        if (!cur_op->rmw_buf)
+        {
+            // Refuse partial overwrite of an incomplete object
+            cur_op->reply.hdr.retval = -EINVAL;
+            goto continue_others;
+        }
+    }
+    // Read required blocks
+    submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, pg.pg_size, op_data->prev_set, cur_op);
+resume_2:
+    op_data->st = 2;
+    return;
+resume_3:
+    if (op_data->errors > 0)
+    {
+        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
+        return;
+    }
+    if (op_data->scheme == POOL_SCHEME_REPLICATED)
+    {
+        // Only (possibly) copy new data from the request into the recovery buffer
+        if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
+            op_data->stripes[0].write_end != bs_block_size))
+        {
+            memcpy(
+                op_data->stripes[0].read_buf + op_data->stripes[0].req_start,
+                op_data->stripes[0].write_buf,
+                op_data->stripes[0].req_end - op_data->stripes[0].req_start
+            );
+            op_data->stripes[0].write_buf = op_data->stripes[0].read_buf;
+            op_data->stripes[0].write_start = 0;
+            op_data->stripes[0].write_end = bs_block_size;
+        }
+    }
+    else
+    {
+        // For EC/XOR pools, save version override to make it impossible
+        // for parallel reads to read different versions of data and parity
+        pg.ver_override[op_data->oid] = op_data->fact_ver;
+        // Recover missing stripes, calculate parity
+        if (pg.scheme == POOL_SCHEME_XOR)
+        {
+            calc_rmw_parity_xor(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size);
+        }
+        else if (pg.scheme == POOL_SCHEME_JERASURE)
+        {
+            calc_rmw_parity_jerasure(op_data->stripes, pg.pg_size, op_data->pg_data_size, op_data->prev_set, pg.cur_set.data(), bs_block_size);
+        }
+    }
+    // Send writes
+    // The upper PG_EPOCH_BITS bits of a version hold the PG epoch, the rest is a counter
+    if ((op_data->fact_ver >> (64-PG_EPOCH_BITS)) < pg.epoch)
+    {
+        op_data->target_ver = ((uint64_t)pg.epoch << (64-PG_EPOCH_BITS)) | 1;
+    }
+    else
+    {
+        // Counter part is all ones, the next version spills into the epoch bits.
+        // The shift is parenthesized explicitly because binary '-' binds tighter than '<<'
+        if ((op_data->fact_ver & ((1ul<<(64-PG_EPOCH_BITS)) - 1)) == ((1ul<<(64-PG_EPOCH_BITS)) - 1))
+        {
+            assert(pg.epoch != ((1ul << PG_EPOCH_BITS)-1));
+            pg.epoch++;
+        }
+        op_data->target_ver = op_data->fact_ver + 1;
+    }
+    if (pg.epoch > pg.reported_epoch)
+    {
+        // Report newer epoch before writing
+        // FIXME: We may report only one PG state here...
+        this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
+        pg.history_changed = true;
+        report_pg_states();
+resume_10:
+        if (pg.epoch > pg.reported_epoch)
+        {
+            op_data->st = 10;
+            return;
+        }
+    }
+    submit_primary_subops(SUBMIT_WRITE, op_data->target_ver, pg.pg_size, pg.cur_set.data(), cur_op);
+resume_4:
+    op_data->st = 4;
+    return;
+resume_5:
+    if (op_data->scheme != POOL_SCHEME_REPLICATED)
+    {
+        // Remove version override just after the write, but before stabilizing
+        pg.ver_override.erase(op_data->oid);
+    }
+    if (op_data->errors > 0)
+    {
+        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
+        return;
+    }
+    if (op_data->object_state)
+    {
+        // We must forget the unclean state of the object before deleting it
+        // so the next reads don't accidentally read a deleted version
+        // And it should be done at the same time as the removal of the version override
+        remove_object_from_state(op_data->oid, op_data->object_state, pg);
+        pg.clean_count++;
+    }
+resume_6:
+resume_7:
+    // States 6 and 7 are resumed inside remember_unstable_write()
+    if (!remember_unstable_write(cur_op, pg, pg.cur_loc_set, 6))
+    {
+        return;
+    }
+    if (op_data->fact_ver == 1)
+    {
+        // Object is created
+        pg.clean_count++;
+        pg.total_count++;
+    }
+    if (op_data->object_state)
+    {
+        {
+            int recovery_type = op_data->object_state->state & (OBJ_DEGRADED|OBJ_INCOMPLETE) ? 0 : 1;
+            recovery_stat_count[0][recovery_type]++;
+            if (!recovery_stat_count[0][recovery_type])
+            {
+                // The counter wrapped around to zero: restart both statistics
+                recovery_stat_count[0][recovery_type]++;
+                recovery_stat_bytes[0][recovery_type] = 0;
+            }
+            for (int role = 0; role < (op_data->scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size); role++)
+            {
+                recovery_stat_bytes[0][recovery_type] += op_data->stripes[role].write_end - op_data->stripes[role].write_start;
+            }
+        }
+        // Any kind of a non-clean object can have extra chunks, because we don't record objects
+        // as degraded & misplaced or incomplete & misplaced at the same time. So try to remove extra chunks
+        if (immediate_commit != IMMEDIATE_ALL)
+        {
+            // We can't remove extra chunks yet if fsyncs are explicit, because
+            // new copies may not be committed to stable storage yet
+            // We can only remove extra chunks after a successful SYNC for this PG
+            for (auto & chunk: op_data->object_state->osd_set)
+            {
+                // Check is the same as in submit_primary_del_subops()
+                if (op_data->scheme == POOL_SCHEME_REPLICATED
+                    ? !contains_osd(pg.cur_set.data(), pg.pg_size, chunk.osd_num)
+                    : (chunk.osd_num != pg.cur_set[chunk.role]))
+                {
+                    pg.copies_to_delete_after_sync.push_back((obj_ver_osd_t){
+                        .osd_num = chunk.osd_num,
+                        .oid = {
+                            .inode = op_data->oid.inode,
+                            .stripe = op_data->oid.stripe | (op_data->scheme == POOL_SCHEME_REPLICATED ? 0 : chunk.role),
+                        },
+                        .version = op_data->fact_ver,
+                    });
+                    copies_to_delete_after_sync_count++;
+                }
+            }
+            free_object_state(pg, &op_data->object_state);
+        }
+        else
+        {
+            submit_primary_del_subops(cur_op, pg.cur_set.data(), pg.pg_size, op_data->object_state->osd_set);
+            free_object_state(pg, &op_data->object_state);
+            if (op_data->n_subops > 0)
+            {
+resume_8:
+                op_data->st = 8;
+                return;
+resume_9:
+                if (op_data->errors > 0)
+                {
+                    pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
+                    return;
+                }
+            }
+        }
+    }
+    cur_op->reply.hdr.retval = cur_op->req.rw.len;
+continue_others:
+    osd_op_t *next_op = NULL;
+    auto next_it = pg.write_queue.find(op_data->oid);
+    // Remove the operation from queue before calling finish_op so it doesn't see the completed operation in queue
+    if (next_it != pg.write_queue.end() && next_it->second == cur_op)
+    {
+        pg.write_queue.erase(next_it++);
+        if (next_it != pg.write_queue.end() && next_it->first == op_data->oid)
+            next_op = next_it->second;
+    }
+    // finish_op would invalidate next_it if it cleared pg.write_queue, but it doesn't do that :)
+    // Pass the stored result: this point is also reached with retval = -EINVAL
+    // (refused partial overwrite), which must not be replaced by the request length
+    finish_op(cur_op, cur_op->reply.hdr.retval);
+    if (next_op)
+    {
+        // Continue next write to the same object
+        continue_primary_write(next_op);
+    }
+}
+
+// Post-write bookkeeping that depends on the immediate_commit mode.
+//
+// With IMMEDIATE_ALL (or IMMEDIATE_SMALL when no stripe is written in full) EC/XOR pools
+// get STABILIZE subops right away; replicated pools need nothing here. Otherwise the written
+// versions (EC/XOR) and the affected OSDs and PG are only remembered for a later SYNC.
+//
+// The function is resumable: the caller passes the state number it reserved for the wait
+// as <base_state> and must route both base_state and base_state+1 back into this call.
+//
+// Returns true when the caller may continue. Returns false when the operation has to wait
+// for stabilize subops, or when they failed and the write queue of the object was cancelled.
+bool osd_t::remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state)
+{
+    osd_primary_op_data_t *op_data = cur_op->op_data;
+    if (op_data->st == base_state)
+    {
+        goto resume_6;
+    }
+    else if (op_data->st == base_state+1)
+    {
+        goto resume_7;
+    }
+    if (immediate_commit == IMMEDIATE_ALL)
+    {
+immediate:
+        if (op_data->scheme != POOL_SCHEME_REPLICATED)
+        {
+            // Send STABILIZE ops immediately
+            op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
+            op_data->unstable_writes = new obj_ver_id[loc_set.size()];
+            {
+                // One entry per chunk, so each OSD gets a range of length 1
+                int last_start = 0;
+                for (auto & chunk: loc_set)
+                {
+                    op_data->unstable_writes[last_start] = (obj_ver_id){
+                        .oid = {
+                            .inode = op_data->oid.inode,
+                            .stripe = op_data->oid.stripe | chunk.role,
+                        },
+                        .version = op_data->fact_ver,
+                    };
+                    op_data->unstable_write_osds->push_back((unstable_osd_num_t){
+                        .osd_num = chunk.osd_num,
+                        .start = last_start,
+                        .len = 1,
+                    });
+                    last_start++;
+                }
+            }
+            submit_primary_stab_subops(cur_op);
+resume_6:
+            // Store the caller-provided state instead of a literal so that the dispatch
+            // at the top of the function matches for any base_state
+            op_data->st = base_state;
+            return false;
+resume_7:
+            // FIXME: Free those in the destructor?
+            delete op_data->unstable_write_osds;
+            delete[] op_data->unstable_writes;
+            op_data->unstable_writes = NULL;
+            op_data->unstable_write_osds = NULL;
+            if (op_data->errors > 0)
+            {
+                pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
+                return false;
+            }
+        }
+    }
+    else if (immediate_commit == IMMEDIATE_SMALL)
+    {
+        int stripe_count = (op_data->scheme == POOL_SCHEME_REPLICATED ? 1 : op_data->pg_size);
+        for (int role = 0; role < stripe_count; role++)
+        {
+            if (op_data->stripes[role].write_start == 0 &&
+                op_data->stripes[role].write_end == bs_block_size)
+            {
+                // Big write. Treat write as unsynced
+                goto lazy;
+            }
+        }
+        goto immediate;
+    }
+    else
+    {
+lazy:
+        if (op_data->scheme != POOL_SCHEME_REPLICATED)
+        {
+            // Remember version as unstable for EC/XOR
+            for (auto & chunk: loc_set)
+            {
+                this->dirty_osds.insert(chunk.osd_num);
+                this->unstable_writes[(osd_object_id_t){
+                    .osd_num = chunk.osd_num,
+                    .oid = {
+                        .inode = op_data->oid.inode,
+                        .stripe = op_data->oid.stripe | chunk.role,
+                    },
+                }] = op_data->fact_ver;
+            }
+        }
+        else
+        {
+            // Only remember to sync OSDs for replicated pools
+            for (auto & chunk: loc_set)
+            {
+                this->dirty_osds.insert(chunk.osd_num);
+            }
+        }
+        // Remember PG as dirty to drop the connection when PG goes offline
+        // (this is required because of the "lazy sync")
+        auto cl_it = c_cli.clients.find(cur_op->peer_fd);
+        if (cl_it != c_cli.clients.end())
+        {
+            cl_it->second->dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
+        }
+        dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
+    }
+    return true;
+}
--- a/src/pg_states.cpp
+++ b/src/pg_states.cpp
@ -3,13 +3,14 @@

 #include "pg_states.h"

-const int pg_state_bit_count = 14;
+const int pg_state_bit_count = 15;

-const int pg_state_bits[14] = {
+const int pg_state_bits[15] = {
    PG_STARTING,
    PG_PEERING,
    PG_INCOMPLETE,
    PG_ACTIVE,
+    PG_REPEERING,
    PG_STOPPING,
    PG_OFFLINE,
    PG_DEGRADED,
@ -21,11 +22,12 @@ const int pg_state_bits[14] = {
    PG_LEFT_ON_DEAD,
 };

-const char *pg_state_names[14] = {
+const char *pg_state_names[15] = {
    "starting",
    "peering",
    "incomplete",
    "active",
+    "repeering",
    "stopping",
    "offline",
    "degraded",
--- a/src/pg_states.h
+++ b/src/pg_states.h
@ -10,16 +10,17 @@
 #define PG_PEERING (1<<1)
 #define PG_INCOMPLETE (1<<2)
 #define PG_ACTIVE (1<<3)
-#define PG_STOPPING (1<<4)
-#define PG_OFFLINE (1<<5)
+#define PG_REPEERING (1<<4)
+#define PG_STOPPING (1<<5)
+#define PG_OFFLINE (1<<6)
 // Plus any of these:
-#define PG_DEGRADED (1<<6)
-#define PG_HAS_INCOMPLETE (1<<7)
-#define PG_HAS_DEGRADED (1<<8)
-#define PG_HAS_MISPLACED (1<<9)
-#define PG_HAS_UNCLEAN (1<<10)
-#define PG_HAS_INVALID (1<<11)
-#define PG_LEFT_ON_DEAD (1<<12)
+#define PG_DEGRADED (1<<7)
+#define PG_HAS_INCOMPLETE (1<<8)
+#define PG_HAS_DEGRADED (1<<9)
+#define PG_HAS_MISPLACED (1<<10)
+#define PG_HAS_UNCLEAN (1<<11)
+#define PG_HAS_INVALID (1<<12)
+#define PG_LEFT_ON_DEAD (1<<13)

 // Lower bits that represent object role (EC 0/1/2... or always 0 with replication)
 // 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size
--- a/src/qemu_proxy.cpp
+++ b/src/qemu_proxy.cpp
@ -47,7 +47,6 @@ public:

    ~QemuProxy()
    {
-        cli->stop();
        delete cli;
        delete tfd;
    }
--- a/src/test_allocator.cpp
+++ b/src/test_allocator.cpp
@ -20,7 +20,15 @@ void alloc_all(int size)
        {
            printf("incorrect block allocated: expected %d, got %lu\n", i, x);
        }
+        if (a->get(x))
+        {
+            printf("not free before set at %d\n", i);
+        }
        a->set(x, true);
+        if (!a->get(x))
+        {
+            printf("free after set at %d\n", i);
+        }
    }
    uint64_t x = a->find_free();
    if (x != UINT64_MAX)
--- a/src/test_cluster_client.cpp
+++ b/src/test_cluster_client.cpp
@ -0,0 +1,407 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include "cluster_client.h"
+
+// Feeds a minimal cluster description into the client's etcd state:
+// replicated pool 1 ("hddpool") with a single PG placed on OSDs 1 and 2,
+// OSD 1 being the primary, and that PG reported as "active".
+void configure_single_pg_pool(cluster_client_t *cli)
+{
+    cli->st_cli.on_load_pgs_hook(true);
+
+    // /config/pools: one replicated pool with a single PG
+    json11::Json::object pool_cfg {
+        { "name", "hddpool" },
+        { "scheme", "replicated" },
+        { "pg_size", 2 },
+        { "pg_minsize", 1 },
+        { "pg_count", 1 },
+        { "failure_domain", "osd" },
+    };
+    json11::Json::object pools {
+        { "1", pool_cfg },
+    };
+    cli->st_cli.parse_state((json_kv_t){
+        .key = "/config/pools",
+        .value = pools,
+    });
+
+    // /config/pgs: PG 1 of pool 1 lives on OSDs 1 and 2
+    json11::Json::object pg_cfg {
+        { "osd_set", json11::Json::array { 1, 2 } },
+        { "primary", 1 },
+    };
+    json11::Json::object pool_pgs {
+        { "1", pg_cfg },
+    };
+    json11::Json::object pg_items {
+        { "1", pool_pgs },
+    };
+    json11::Json::object pgs {
+        { "items", pg_items },
+    };
+    cli->st_cli.parse_state((json_kv_t){
+        .key = "/config/pgs",
+        .value = pgs,
+    });
+
+    // /pg/state/1/1: the PG is active with OSD 1 as its primary
+    json11::Json::object pg_state {
+        { "peers", json11::Json::array { 1, 2 } },
+        { "primary", 1 },
+        { "state", json11::Json::array { "active" } },
+    };
+    cli->st_cli.parse_state((json_kv_t){
+        .key = "/pg/state/1/1",
+        .value = pg_state,
+    });
+
+    // Fire the change hook with an empty change set
+    json11::Json::object changes;
+    cli->st_cli.on_change_hook(changes);
+}
+
+// Posts a write of `len` bytes of value `c` at `offset` of inode 0x1000000000001.
+// Returns a heap-allocated status flag that starts at -1; the completion callback
+// asserts if it still sees -1 (see can_complete()), otherwise stores 1 when
+// retval equals the requested length and 0 when it doesn't.
+// The optional `cb` is invoked at the very end of the callback, after the
+// operation has been deleted.
+int *test_write(cluster_client_t *cli, uint64_t offset, uint64_t len, uint8_t c, std::function<void()> cb = NULL)
+{
+    printf("Post write %lx+%lx\n", offset, len);
+    int *r = new int(-1);
+    cluster_op_t *op = new cluster_op_t();
+    op->opcode = OSD_OP_WRITE;
+    op->inode = 0x1000000000001;
+    op->offset = offset;
+    op->len = len;
+    // Single payload buffer filled with the pattern byte
+    op->iov.push_back(malloc_or_die(len), len);
+    memset(op->iov.buf[0].iov_base, c, len);
+    op->callback = [r, cb](cluster_op_t *done)
+    {
+        const bool premature = (*r == -1);
+        if (premature)
+            printf("Error: Not allowed to complete yet\n");
+        assert(!premature);
+        if (done->retval == done->len)
+            *r = 1;
+        else
+            *r = 0;
+        free(done->iov.buf[0].iov_base);
+        printf("Done write %lx+%lx r=%d\n", done->offset, done->len, done->retval);
+        delete done;
+        if (cb)
+            cb();
+    };
+    cli->execute(op);
+    return r;
+}
+
+// Posts a sync operation. Returns a heap-allocated status flag that starts
+// at -1; the completion callback asserts if it still sees -1 (see
+// can_complete()), otherwise stores 1 for retval == 0 and 0 for anything else.
+int *test_sync(cluster_client_t *cli)
+{
+    printf("Post sync\n");
+    int *r = new int(-1);
+    cluster_op_t *op = new cluster_op_t();
+    op->opcode = OSD_OP_SYNC;
+    op->callback = [r](cluster_op_t *done)
+    {
+        const bool premature = (*r == -1);
+        if (premature)
+            printf("Error: Not allowed to complete yet\n");
+        assert(!premature);
+        if (done->retval == 0)
+            *r = 1;
+        else
+            *r = 0;
+        printf("Done sync r=%d\n", done->retval);
+        delete done;
+    };
+    cli->execute(op);
+    return r;
+}
+
+// Marks the operation tracked by `r` as allowed to finish.
+// Until this is called the flag stays at -1 and the completion callbacks
+// of test_write()/test_sync() assert, which is how the test detects
+// operations that complete earlier than expected.
+void can_complete(int *r)
+{
+    const int completion_allowed = -2;
+    *r = completion_allowed;
+}
+
+// Asserts that the operation tracked by `r` has completed successfully
+// (flag == 1) and releases the flag.
+void check_completed(int *r)
+{
+    const int status = *r;
+    assert(status == 1);
+    delete r;
+}
+
+// Fakes an established connection to OSD `osd_num`: registers a new
+// osd_client_t in PEER_CONNECTED state under a fresh fd, removes the OSD
+// from wanted_peers and asks the messenger to repeer PGs for it.
+void pretend_connected(cluster_client_t *cli, osd_num_t osd_num)
+{
+    printf("OSD %lu connected\n", osd_num);
+    // Next fd after the highest one in use, or 10 when there are no clients yet
+    int peer_fd = 10;
+    if (!cli->msgr.clients.empty())
+    {
+        peer_fd = cli->msgr.clients.rbegin()->first + 1;
+    }
+    auto osd_cl = new osd_client_t();
+    osd_cl->osd_num = osd_num;
+    osd_cl->peer_state = PEER_CONNECTED;
+    cli->msgr.osd_peer_fds[osd_num] = peer_fd;
+    cli->msgr.clients[peer_fd] = osd_cl;
+    cli->msgr.wanted_peers.erase(osd_num);
+    cli->msgr.repeer_pgs(osd_num);
+}
+
+// Fakes a dropped connection by stopping the client registered for OSD `osd_num`.
+// The OSD must currently have a peer fd (the lookup uses at()).
+void pretend_disconnected(cluster_client_t *cli, osd_num_t osd_num)
+{
+    printf("OSD %lu disconnected\n", osd_num);
+    int peer_fd = cli->msgr.osd_peer_fds.at(osd_num);
+    cli->msgr.stop_client(peer_fd);
+}
+
+// Asserts that OSD `osd_num` no longer has an entry in osd_peer_fds.
+void check_disconnected(cluster_client_t *cli, osd_num_t osd_num)
+{
+    auto & peer_fds = cli->msgr.osd_peer_fds;
+    const bool still_connected = peer_fds.find(osd_num) != peer_fds.end();
+    if (!still_connected)
+    {
+        return;
+    }
+    printf("OSD %lu not disconnected as it ought to be\n", osd_num);
+    assert(0);
+}
+
+// Asserts that exactly `ops` operations are currently present in the
+// sent_ops queue of the connection registered for OSD `osd_num`.
+void check_op_count(cluster_client_t *cli, osd_num_t osd_num, int ops)
+{
+    // Use at() for both lookups: with operator[] a missing fd would silently
+    // insert a null client pointer that is dereferenced on the next line
+    auto osd_cl = cli->msgr.clients.at(cli->msgr.osd_peer_fds.at(osd_num));
+    int real_ops = osd_cl->sent_ops.size();
+    if (real_ops != ops)
+    {
+        printf("error: %d ops expected, but %d queued\n", ops, real_ops);
+        assert(0);
+    }
+}
+
+// Returns the first operation in OSD `osd_num`'s sent_ops queue that has the
+// given opcode and, unless the opcode is OSD_OP_SYNC, also targets inode
+// 0x1000000000001 with exactly the given `offset` and `len`.
+// Returns NULL when no queued operation matches.
+osd_op_t *find_op(cluster_client_t *cli, osd_num_t osd_num, uint64_t opcode, uint64_t offset, uint64_t len)
+{
+    // at() instead of operator[]: a missing fd must not insert a null client
+    auto osd_cl = cli->msgr.clients.at(cli->msgr.osd_peer_fds.at(osd_num));
+    for (auto & op_p: osd_cl->sent_ops)
+    {
+        auto op = op_p.second;
+        if (op->req.hdr.opcode != opcode)
+        {
+            continue;
+        }
+        // Syncs carry no inode/offset/len, so the opcode alone identifies them
+        if (opcode == OSD_OP_SYNC)
+        {
+            return op;
+        }
+        if (op->req.rw.inode == 0x1000000000001 && op->req.rw.offset == offset && op->req.rw.len == len)
+        {
+            return op;
+        }
+    }
+    return NULL;
+}
+
+// Fakes the arrival of an OSD reply for `op`: removes it from its client's
+// sent_ops queue, fills in the reply header and invokes the op's callback.
+// A negative `retval` is passed through as is; otherwise the reply reports
+// 0 for a sync and the requested length for any other opcode.
+// `op` must not be NULL (asserted).
+void pretend_op_completed(cluster_client_t *cli, osd_op_t *op, int64_t retval)
+{
+    assert(op);
+    printf("Pretend completed %s %lx+%x\n", op->req.hdr.opcode == OSD_OP_SYNC
+        ? "sync" : (op->req.hdr.opcode == OSD_OP_WRITE ? "write" : "read"), op->req.rw.offset, op->req.rw.len);
+    uint64_t op_id = op->req.hdr.id;
+    int peer_fd = op->peer_fd;
+    // at() instead of operator[]: an unknown fd must not insert a null client
+    // pointer that is then dereferenced
+    cli->msgr.clients.at(peer_fd)->sent_ops.erase(op_id);
+    op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
+    op->reply.hdr.id = op->req.hdr.id;
+    op->reply.hdr.opcode = op->req.hdr.opcode;
+    op->reply.hdr.retval = retval < 0 ? retval : (op->req.hdr.opcode == OSD_OP_SYNC ? 0 : op->req.rw.len);
+    // Call a copy of the callback so that it stays valid even if the
+    // callback itself runs `delete op`
+    std::function<void(osd_op_t*)> cb_copy = op->callback;
+    cb_copy(op);
+}
+
+void test1() // Write replay test: drives a cluster_client_t through faked OSD connects, disconnects and op completions
+{
+    json11::Json config;
+    timerfd_manager_t *tfd = new timerfd_manager_t([](int fd, bool wr, std::function<void(int, int)> callback){}); // the lambda body is empty, so it does nothing
+    cluster_client_t *cli = new cluster_client_t(NULL, tfd, config);
+
+    int *r1 = test_write(cli, 0, 4096, 0x55); // posted before the pool/PG state is fed in and before OSD 1 is connected
+    configure_single_pg_pool(cli);
+    pretend_connected(cli, 1);
+    cli->continue_ops(true);
+    can_complete(r1); // r1 was -1 until here, so an earlier completion would have tripped the callback's assert
+    check_op_count(cli, 1, 1);
+    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 4096), 0);
+    check_completed(r1);
+    pretend_disconnected(cli, 1);
+    int *r2 = test_sync(cli);
+    pretend_connected(cli, 1);
+    check_op_count(cli, 1, 0);
+    cli->continue_ops(true);
+    check_op_count(cli, 1, 1); // the only queued op must be a write 0+4096 (find_op below would return NULL otherwise)
+    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 4096), 0);
+    check_op_count(cli, 1, 1); // now the single queued op must be the sync
+    can_complete(r2);
+    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_SYNC, 0, 0), 0);
+    check_completed(r2);
+    // Check that the client doesn't repeat the operations again after the sync has completed
+    pretend_disconnected(cli, 1);
+    pretend_connected(cli, 1);
+    check_op_count(cli, 1, 0);
+
+    // Case:
+    // Write(1) -> Complete Write(1) -> Overwrite(2) -> Complete Write(2)
+    // -> Overwrite(3) -> Drop OSD connection -> Reestablish OSD connection
+    // -> Complete All Posted Writes -> Sync -> Complete Sync
+    // The resulting state of the block must be (3) over (2) over (1).
+    // I.e. the part overwritten by (3) must remain as in (3) and so on.
+
+    // More interesting case:
+    // Same, but both Write(2) and Write(3) must consist of two parts:
+    // one for OSD 2, which drops its connection, and the other for OSD 1, which doesn't.
+    // The idea is that if the whole Write(2) is repeated when OSD 2 drops its connection
+    // then it may also overwrite a part on OSD 1 which shouldn't be overwritten.
+
+    // Another interesting case:
+    // A new operation added during replay (would also break with the previous implementation)
+
+    r1 = test_write(cli, 0, 0x10000, 0x56);
+    can_complete(r1);
+    check_op_count(cli, 1, 1);
+    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x10000), 0);
+    check_completed(r1);
+
+    r1 = test_write(cli, 0xE000, 0x4000, 0x57);
+    can_complete(r1);
+    check_op_count(cli, 1, 1);
+    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0xE000, 0x4000), 0);
+    check_completed(r1);
+
+    r1 = test_write(cli, 0x10000, 0x4000, 0x58); // left incomplete (r1 stays -1) across the disconnect below
+
+    pretend_disconnected(cli, 1);
+    pretend_connected(cli, 1);
+    cli->continue_ops(true);
+
+    // Check replay: every queued op must be a write carrying the expected bytes, together covering 0..0x14000
+    {
+        uint64_t replay_start = UINT64_MAX;
+        uint64_t replay_end = 0;
+        std::vector<osd_op_t*> replay_ops;
+        auto osd_cl = cli->msgr.clients.at(cli->msgr.osd_peer_fds.at(1));
+        for (auto & op_p: osd_cl->sent_ops)
+        {
+            auto op = op_p.second;
+            assert(op->req.hdr.opcode == OSD_OP_WRITE);
+            uint64_t offset = op->req.rw.offset;
+            if (op->req.rw.offset < replay_start)
+                replay_start = op->req.rw.offset;
+            if (op->req.rw.offset+op->req.rw.len > replay_end)
+                replay_end = op->req.rw.offset+op->req.rw.len;
+            for (int buf_idx = 0; buf_idx < op->iov.count; buf_idx++)
+            {
+                for (int i = 0; i < op->iov.buf[buf_idx].iov_len; i++, offset++)
+                {
+                    uint8_t c = offset < 0xE000 ? 0x56 : (offset < 0x10000 ? 0x57 : 0x58); // expected byte: 0x56 below 0xE000, 0x57 up to 0x10000, 0x58 above
+                    if (((uint8_t*)op->iov.buf[buf_idx].iov_base)[i] != c)
+                    {
+                        printf("Write replay: mismatch at %lu\n", offset-op->req.rw.offset);
+                        goto fail;
+                    }
+                }
+            }
+        fail:
+            assert(offset == op->req.rw.offset+op->req.rw.len); // fails after a mismatch (offset stopped early) or if the buffers don't add up to rw.len
+            replay_ops.push_back(op);
+        }
+        if (replay_start != 0 || replay_end != 0x14000)
+        {
+            printf("Write replay: range mismatch: %lx-%lx\n", replay_start, replay_end);
+            assert(0);
+        }
+        for (auto op: replay_ops) // completed from a separate list because pretend_op_completed() erases from sent_ops
+        {
+            pretend_op_completed(cli, op, 0);
+        }
+    }
+    // Check that the following write finally proceeds
+    check_op_count(cli, 1, 1);
+    can_complete(r1);
+    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0x10000, 0x4000), 0);
+    check_completed(r1);
+    check_op_count(cli, 1, 0);
+
+    // Check sync
+    r2 = test_sync(cli);
+    can_complete(r2);
+    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_SYNC, 0, 0), 0);
+    check_completed(r2);
+
+    // Check disconnect during write
+    r1 = test_write(cli, 0, 4096, 0x59);
+    check_op_count(cli, 1, 1);
+    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), -EPIPE); // an error reply is expected to drop the connection, see the next check
+    check_disconnected(cli, 1);
+    pretend_connected(cli, 1);
+    check_op_count(cli, 1, 0);
+    cli->continue_ops(true);
+    check_op_count(cli, 1, 1);
+    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), 0); // r1 is still -1: this completion must not finish the client write
+    check_op_count(cli, 1, 1);
+    can_complete(r1);
+    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), 0);
+    check_completed(r1);
+
+    // Check disconnect inside operation callback (reentrancy)
+    // Probably doesn't happen too often, but possible in theory
+    r1 = test_write(cli, 0, 0x1000, 0x60, [cli]() // this extra callback runs at the end of the write's completion callback
+    {
+        pretend_disconnected(cli, 1);
+    });
+    r2 = test_write(cli, 0x1000, 0x1000, 0x61);
+    check_op_count(cli, 1, 2);
+    can_complete(r1);
+    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), 0);
+    check_completed(r1);
+    check_disconnected(cli, 1);
+    pretend_connected(cli, 1);
+    cli->continue_ops(true);
+    check_op_count(cli, 1, 2);
+    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), 0);
+    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0x1000, 0x1000), 0); // r2 is still -1: this completion must not finish the second client write
+    check_op_count(cli, 1, 1);
+    can_complete(r2);
+    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0x1000, 0x1000), 0);
+    check_completed(r2);
+
+    // Free client
+    delete cli;
+    delete tfd;
+    printf("[ok] write replay test\n");
+}
+
+// Exercises cluster_client_t::copy_write(): three overlapping writes are
+// copied into an unsynced_writes map and the resulting entries are checked
+// for their position, length and content.
+void test2()
+{
+    std::map<object_id, cluster_buffer_t> unsynced_writes;
+    cluster_op_t *op = new cluster_op_t();
+    op->opcode = OSD_OP_WRITE;
+    op->inode = 1;
+    op->offset = 0;
+    op->len = 4096;
+    op->iov.push_back(malloc_or_die(4096*1024), 4096);
+    // Fills the current payload with one byte value and copies the write
+    auto fill_and_copy = [&](uint8_t fill)
+    {
+        memset(op->iov.buf[0].iov_base, fill, op->iov.buf[0].iov_len);
+        cluster_client_t::copy_write(op, unsynced_writes);
+    };
+    // 0-4k = 0x55
+    fill_and_copy(0x55);
+    // 8k-12k = 0x66
+    op->offset = 8192;
+    fill_and_copy(0x66);
+    // 4k-1M+4k = 0x77
+    op->len = op->iov.buf[0].iov_len = 1048576;
+    op->offset = 4096;
+    fill_and_copy(0x77);
+    // check it: four entries are expected, in map order
+    assert(unsynced_writes.size() == 4);
+    struct
+    {
+        uint64_t stripe;
+        uint64_t len;
+        uint8_t fill;
+    } expected[] = {
+        { 0, 4096, 0x55 },
+        { 4096, 4096, 0x77 },
+        { 8192, 4096, 0x77 },
+        { 12*1024, 1016*1024, 0x77 },
+    };
+    auto uit = unsynced_writes.begin();
+    for (auto & exp: expected)
+    {
+        assert(uit->first.inode == 1);
+        assert(uit->first.stripe == exp.stripe);
+        assert(uit->second.len == exp.len);
+        // Count the leading bytes that carry the expected value
+        int i = 0;
+        while (i < uit->second.len && ((uint8_t*)uit->second.buf)[i] == exp.fill)
+        {
+            i++;
+        }
+        assert(i == uit->second.len);
+        uit++;
+    }
+    // free memory
+    free(op->iov.buf[0].iov_base);
+    delete op;
+    for (auto & kv: unsynced_writes)
+    {
+        free(kv.second.buf);
+    }
+    printf("[ok] copy_write test\n");
+}
+
+// Test entry point: runs the write replay test, then the copy_write test.
+// Failures abort through assert(); command line arguments are not used.
+int main(int narg, char *args[])
+{
+    // Write replay / reconnection scenarios
+    test1();
+    // cluster_client_t::copy_write() buffer merging
+    test2();
+    return 0;
+}
--- a/src/timerfd_manager.cpp
+++ b/src/timerfd_manager.cpp
@ -121,7 +121,7 @@ again:
            exp.it_value.tv_sec--;
            exp.it_value.tv_nsec += 1000000000;
        }
-        if (exp.it_value.tv_sec < 0 || !exp.it_value.tv_sec && !exp.it_value.tv_nsec)
+        if (exp.it_value.tv_sec < 0 || exp.it_value.tv_sec == 0 && exp.it_value.tv_nsec <= 0)
        {
            // It already happened
            trigger_nearest();
@ -159,6 +159,6 @@ void timerfd_manager_t::trigger_nearest()
    {
        timers.erase(timers.begin()+nearest, timers.begin()+nearest+1);
    }
-    cb(nearest_id);
    nearest = -1;
+    cb(nearest_id);
 }
--- a/tests/test_change_pg_count.sh
+++ b/tests/test_change_pg_count.sh
@ -2,6 +2,14 @@

 . `dirname $0`/common.sh

+if [ "$EC" != "" ]; then
+    POOLCFG='"scheme":"xor","pg_size":3,"pg_minsize":2,"parity_chunks":1'
+    NOBJ=512
+else
+    POOLCFG='"scheme":"replicated","pg_size":2,"pg_minsize":2'
+    NOBJ=1024
+fi
+
 dd if=/dev/zero of=./testdata/test_osd1.bin bs=1024 count=1 seek=$((1024*1024-1))
 dd if=/dev/zero of=./testdata/test_osd2.bin bs=1024 count=1 seek=$((1024*1024-1))
 dd if=/dev/zero of=./testdata/test_osd3.bin bs=1024 count=1 seek=$((1024*1024-1))
@ -28,7 +36,7 @@ cd ..
 node mon/mon-main.js --etcd_url http://$ETCD_URL --etcd_prefix "/vitastor" --verbose 1 &>./testdata/mon.log &
 MON_PID=$!

-$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":2,"pg_count":16,"failure_domain":"osd"}}'
+$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool",'$POOLCFG',"pg_count":16,"failure_domain":"osd"}}'

 sleep 2

@ -52,7 +60,7 @@ try_change()
        echo --- Change PG count to $n --- >>testdata/osd$i.log
    done

-    $ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":2,"pg_count":'$n',"failure_domain":"osd"}}'
+    $ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool",'$POOLCFG',"pg_count":'$n',"failure_domain":"osd"}}'

    for i in {1..10}; do
        ($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(.[0].items["1"] | map((.osd_set | select(. > 0)) | length == 2) | length) == '$n) && \
@ -82,8 +90,8 @@ try_change()

    # Check that no objects are lost !
    nobj=`$ETCDCTL get --prefix '/vitastor/pg/stats' --print-value-only | jq -s '[ .[].object_count ] | reduce .[] as $num (0; .+$num)'`
-    if [ "$nobj" -ne 1024 ]; then
-        format_error "Data lost after changing PG count to $n: 1024 objects expected, but got $nobj"
+    if [ "$nobj" -ne $NOBJ ]; then
+        format_error "Data lost after changing PG count to $n: $NOBJ objects expected, but got $nobj"
    fi
 }

--- a/tests/test_write.sh
+++ b/tests/test_write.sh
@ -35,6 +35,18 @@ fi
 #    fio -thread -name=test -ioengine=build/src/libfio_vitastor_sec.so -bs=4k -fsync=128 `$ETCDCTL get /vitastor/osd/state/1 --print-value-only | jq -r '"-host="+.addresses[0]+" -port="+(.port|tostring)'` -rw=write -size=32M

 LD_PRELOAD=libasan.so.5 \
-    fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -fsync=1 -rw=write -etcd=$ETCD_URL -pool=1 -inode=1 -size=1G -cluster_log_level=10
+    fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -fsync=1 -rw=write -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -cluster_log_level=10
+
+LD_PRELOAD=libasan.so.5 \
+    fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4k -direct=1 -iodepth=1 -fsync=32 -buffer_pattern=0xdeadface \
+        -rw=randwrite -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -number_ios=1024
+
+qemu-img convert -S 4096 -p \
+    -f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=1:size=$((128*1024*1024))" \
+    -O raw ./testdata/read.bin
+
+qemu-img convert -S 4096 -p \
+    -f raw ./testdata/read.bin \
+    -O raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=1:size=$((128*1024*1024))"

 format_green OK
Author	SHA1	Message	Date
Vitaliy Filippov	ec90fe6ec1	Release 0.5.13 Another followup to 0.5.11	2021-04-09 12:10:16 +03:00
Vitaliy Filippov	18c72f4835	Correct reenterability fix (now verified with a test) It's rather funny but 0.5.12 has to be re-published again	2021-04-09 12:10:16 +03:00
Vitaliy Filippov	59fbcef734	Release 0.5.12 Fix qemu driver broken in 0.5.11 :)	2021-04-08 15:47:18 +03:00
Vitaliy Filippov	40b7c21fb1	Followup to `307c1731c1` - fix mark_stable	2021-04-08 15:47:18 +03:00
Vitaliy Filippov	efb3678606	Fix qemu-img broken in 0.5.11 Caused by the lack of reenterability of the main cluster_client function	2021-04-08 14:59:20 +03:00
Vitaliy Filippov	462650134e	Release 0.5.11 Another bunch of fixes, including important ones. Now OSDs are stable in SSD+HDD configurations and everything is mostly ready for the merge of master branch. Features: - Add min_flusher_count configuration (good for HDDs) - Shuffle PGs for better data device utilisation - Make OSDs benefit from the immediate_commit=small setting if it's applicable Bug fixes: - Rework client code to fix write ordering during operation replay - Rework error handling code so OSDs don't crash in reaction to a crash of their peer OSDs - Fix several block layer problems related to the journal, some of which were leading to double allocations of the same block during journal replay - Fix monitors crashing during the removal of OSD keys from etcd - Fix data fsyncs being incorrectly disabled when only disable_journal_fsync was set - Always zero out unused part of request/reply headers - Fix some theoretically possible read/write ordering issues - Don't try to "recover" misplaced objects if it would make them degraded - Fix heartbeats sometimes preventing OSD to establish connections	2021-04-08 01:18:46 +03:00
Vitaliy Filippov	8d87e32175	Fix msgr_op.h includes	2021-04-08 01:18:46 +03:00
Vitaliy Filippov	b0b2e7df3c	Fix use-after-free in keepalive_timer and rework stop_client() The bug reproduced if fio was temporarily stopped with SIGSTOP during write test and then resumed after 10 seconds. In this case "pings" were failed for all clients and fio process crashed with 'use-after-free' in keepalive_timer. It happened because it called stop_client while having a live iterator to the map.	2021-04-07 11:06:31 +03:00
Vitaliy Filippov	97efb9e299	Do not crash on PG re-peering events when operations are in progress	2021-04-07 11:06:31 +03:00
Vitaliy Filippov	f6d705383a	Fix client connection recovery bugs, add dirty_ops limit	2021-04-07 11:06:31 +03:00
Vitaliy Filippov	68567c0e1f	Fix messenger possibly trying to connect to the same OSD twice	2021-04-07 01:30:38 +03:00
Vitaliy Filippov	04b00003e9	Log ping failures	2021-04-07 01:30:38 +03:00
Vitaliy Filippov	307c1731c1	Forget all dirty_entries before stable big_write or delete during initialisation This fixes a 'double_alloc' assertion in the following case: - big_write object #1 v1 to block #100 - big_write object #1 v2 to block #101 - big_write object #2 v1 to block #100	2021-04-07 01:30:38 +03:00
Vitaliy Filippov	75a6a556b5	Shuffle PGs for better data device utilisation	2021-04-07 01:30:38 +03:00
Vitaliy Filippov	a48e2bbf18	Fix write replay ordering when immediate_commit != all Previous implementation didn't respect write ordering and could lead to corrupted data when restarting writes after an OSD outage Also rework cluster_client queueing logic and add tests for it to verify the correct behaviour	2021-04-03 14:51:52 +03:00
Vitaliy Filippov	688821665a	Remove stoull_full() from etcd_state_client.cpp	2021-04-03 14:36:04 +03:00
Vitaliy Filippov	3e162d95a0	Remove http_client.h include from etcd_state_client.h	2021-04-03 14:36:04 +03:00
Vitaliy Filippov	829381b335	Extract some definitions to msgr_op.{cpp,h}	2021-04-03 14:36:04 +03:00
Vitaliy Filippov	54f2353f24	Use bitmap granularity for alignment checks	2021-04-03 14:36:04 +03:00
Vitaliy Filippov	e47f6fba60	Remove cluster_client_t::stop()	2021-04-03 14:35:42 +03:00
Vitaliy Filippov	883bf84a16	Fix build	2021-04-03 01:47:15 +03:00
Vitaliy Filippov	52097c4856	Stop flushing when less than min_flusher_count operations are available (unless a trim is forced)	2021-04-03 00:53:28 +03:00
Vitaliy Filippov	e1355cbc74	Report failed operation name in cluster_client	2021-04-03 00:53:28 +03:00
Vitaliy Filippov	8f8b90be7a	Add min_flusher_count configuration	2021-04-03 00:53:28 +03:00
Vitaliy Filippov	ad9f619370	Skip double allocs when reading journal	2021-04-03 00:53:28 +03:00
Vitaliy Filippov	f4769ba7c7	Collapse create+delete journal entry pairs if they're already flushed Old journal replay mechanism could lead to a double allocation of the same block and a "Fatal error: tried to overwrite non-zero metadata entry"	2021-04-03 00:53:28 +03:00
Vitaliy Filippov	843b7052d2	Add an assertion when clearing deleted metadata entries, add debug details when freeing blocks	2021-04-03 00:53:28 +03:00
Vitaliy Filippov	df99e232ee	Deduplicate osd_sets in pg history + raise request size limit for etcd	2021-04-03 00:53:28 +03:00
Vitaliy Filippov	3a40fa4127	Fix monitor errors in case of OSD removal	2021-03-27 01:15:18 +03:00
Vitaliy Filippov	4095bcc558	Do not ignore object deletion journal entries when they are preceded by a big write	2021-03-25 11:00:10 +03:00
Vitaliy Filippov	564d64e271	Add some details for debug prints	2021-03-25 11:00:10 +03:00
Vitaliy Filippov	cf54741c95	Followup to `05db1308aa` Don't do anything with the object state after errors because it's freed by PG re-peer in this case	2021-03-25 11:00:10 +03:00
Vitaliy Filippov	18a5fafa2a	Fix rollback	2021-03-25 02:41:58 +03:00
Vitaliy Filippov	06f4978085	Fix fsync check in blockstore_flush (data fsyncs were disabled instead of journal fsyncs)	2021-03-25 02:41:58 +03:00
Vitaliy Filippov	7ebf1588c5	Check for immediate_commit==small in the OSD code	2021-03-25 02:41:58 +03:00
Vitaliy Filippov	b0ad1e1e6d	Remember writes as "unsynced" only after completing them Previously BS_OP_SYNC could take unfinished writes and add them into the journal before they were actually completed. This was leading to crashes with the message "BUG: Unexpected dirty_entry 2000000000001:9f2a0000 v3 unstable state during flush: 338"	2021-03-25 02:41:58 +03:00
Vitaliy Filippov	0949f08407	Extract osd_primary write and sync code into separate files	2021-03-24 14:20:56 +03:00
Vitaliy Filippov	04a1f18fa5	Assign .req as a whole to always zero out the remaining part Also clear .reply before processing the operation	2021-03-24 14:20:56 +03:00
Vitaliy Filippov	cf9a641d66	Skip disconnected OSDs during sync	2021-03-24 14:20:56 +03:00
Vitaliy Filippov	05db1308aa	Fix two potential read/write ordering problems (even though not yet seen in tests) - Write operations could be 'stabilized' and previous versions could be purged from OSDs before the removal of version_override and following reads could potentially hit different version in EC pools - Object was marked clean after completing the delete during recovery, so reads could in theory hit a deleted version and return nothing	2021-03-24 14:20:56 +03:00
Vitaliy Filippov	98b54ca948	Don't try to "recover" misplaced objects if it would make them degraded	2021-03-21 01:37:23 +03:00
Vitaliy Filippov	23225c5e62	Do not run ping on clients that are not yet connected	2021-03-21 01:37:23 +03:00
				`@ -0,0 +1 @@`
				`g++ -D__MOCK__ -fsanitize=address -g -Wno-pointer-arith pg_states.cpp osd_ops.cpp test_cluster_client.cpp cluster_client.cpp msgr_op.cpp msgr_stop.cpp mock/messenger.cpp etcd_state_client.cpp timerfd_manager.cpp ../json11/json11.cpp -I mock -I . -I ..; ./a.out`