26 changed files with 433 additions and 1099 deletions
--- a/docs/config/pool.en.md
+++ b/docs/config/pool.en.md
@ -205,8 +205,9 @@ This parameter usually doesn't require to be changed.
 - Default: 131072

 Block size for this pool. The value from /vitastor/config/global is used when
-unspecified. Only OSDs with matching block_size are used for each pool. If you
-want to further restrict OSDs for the pool, use [osd_tags](#osd_tags).
+unspecified. If your cluster has OSDs with different block sizes then pool must
+be restricted by [osd_tags](#osd_tags) to only include OSDs with matching block
+size.

 Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-cluster.en.md#block_size).

@ -215,9 +216,10 @@ Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-c
 - Type: integer
 - Default: 4096

-"Sector" size of virtual disks in this pool. The value from /vitastor/config/global
-is used when unspecified. Similarly to block_size, only OSDs with matching
-bitmap_granularity are used for each pool.
+"Sector" size of virtual disks in this pool. The value from
+/vitastor/config/global is used when unspecified. Similar to block_size, the
+pool must be restricted by [osd_tags](#osd_tags) to only include OSDs with
+matching bitmap_granularity.

 Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-cluster.en.md#bitmap_granularity).

@ -227,11 +229,10 @@ Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-c
 - Default: none

 Immediate commit setting for this pool. The value from /vitastor/config/global
-is used when unspecified. Similarly to block_size, only OSDs with compatible
-bitmap_granularity are used for each pool. "Compatible" means that a pool with
-non-immediate commit will use OSDs with immediate commit enabled, but not vice
-versa. I.e., pools with "none" use all OSDs, pools with "small" only use OSDs
-with "all" or "small", and pools with "all" only use OSDs with "all".
+is used when unspecified. Similar to block_size, the pool must be restricted by
+[osd_tags](#osd_tags) to only include OSDs with compatible immediate_commit.
+Compatible means that a pool with non-immediate commit will work with OSDs with
+immediate commit enabled, but not vice versa.

 Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-cluster.en.md#immediate_commit).

--- a/docs/config/pool.ru.md
+++ b/docs/config/pool.ru.md
@ -208,9 +208,8 @@ PG в Vitastor эферемерны, то есть вы можете менят

 Размер блока для данного пула. Если не задан, используется значение из
 /vitastor/config/global. Если в вашем кластере есть OSD с разными размерами
-блока, пул будет использовать только OSD с размером блока, равным размеру блока
-пула. Если вы хотите сильнее ограничить набор используемых для пула OSD -
-используйте [osd_tags](#osd_tags).
+блока, пул должен быть ограничен только OSD, блок которых равен блоку пула,
+с помощью [osd_tags](#osd_tags).

 О самом параметре читайте в разделе [Дисковые параметры уровня кластера](layout-cluster.ru.md#block_size).

@ -220,8 +219,9 @@ PG в Vitastor эферемерны, то есть вы можете менят
 - По умолчанию: 4096

 Размер "сектора" виртуальных дисков в данном пуле. Если не задан, используется
-значение из /vitastor/config/global. Аналогично block_size, каждый пул будет
-использовать только OSD с совпадающей с пулом настройкой bitmap_granularity.
+значение из /vitastor/config/global. Аналогично block_size, пул должен быть
+ограничен OSD со значением bitmap_granularity, равным значению пула, с помощью
+[osd_tags](#osd_tags).

 О самом параметре читайте в разделе [Дисковые параметры уровня кластера](layout-cluster.ru.md#bitmap_granularity).

@ -231,13 +231,11 @@ PG в Vitastor эферемерны, то есть вы можете менят
 - По умолчанию: none

 Настройка мгновенного коммита для данного пула. Если не задана, используется
-значение из /vitastor/config/global. Аналогично block_size, каждый пул будет
-использовать только OSD с *совместимыми* настройками immediate_commit.
-"Совместимыми" означает, что пул с отключенным мгновенным коммитом будет
-использовать OSD с включённым мгновенным коммитом, но не наоборот. То есть,
-пул со значением "none" будет использовать все OSD, пул со "small" будет
-использовать OSD с "all" или "small", а пул с "all" будет использовать только
-OSD с "all".
+значение из /vitastor/config/global. Аналогично block_size, пул должен быть
+ограничен OSD со значением bitmap_granularity, совместимым со значением пула, с
+помощью [osd_tags](#osd_tags). Совместимость означает, что пул с отключенным
+мгновенным коммитом может работать на OSD с включённым мгновенным коммитом, но
+не наоборот.

 О самом параметре читайте в разделе [Дисковые параметры уровня кластера](layout-cluster.ru.md#immediate_commit).

--- a/mon/mon.js
+++ b/mon/mon.js
@ -78,15 +78,9 @@ const etcd_tree = {
            disk_alignment: 4096,
            bitmap_granularity: 4096,
            immediate_commit: false, // 'all' or 'small'
-            // client - configurable online
-            client_max_dirty_bytes: 33554432,
-            client_max_dirty_ops: 1024,
-            client_enable_writeback: false,
-            client_max_buffered_bytes: 33554432,
-            client_max_buffered_ops: 1024,
-            client_max_writeback_iodepth: 256,
            // client and osd - configurable online
            log_level: 0,
+            client_dirty_limit: 33554432,
            peer_connect_interval: 5, // seconds. min: 1
            peer_connect_timeout: 5, // seconds. min: 1
            osd_idle_timeout: 5, // seconds. min: 1
@ -1162,33 +1156,6 @@ class Mon
        }
    }

-    filter_osds_by_block_layout(flat_tree, block_size, bitmap_granularity, immediate_commit)
-    {
-        for (const host in flat_tree)
-        {
-            let found = 0;
-            for (const osd in flat_tree[host])
-            {
-                const osd_stat = this.state.osd.stats[osd];
-                if (osd_stat && (osd_stat.bs_block_size && osd_stat.bs_block_size != block_size ||
-                    osd_stat.bitmap_granularity && osd_stat.bitmap_granularity != bitmap_granularity ||
-                    osd_stat.immediate_commit == 'small' && immediate_commit == 'all' ||
-                    osd_stat.immediate_commit == 'none' && immediate_commit != 'none'))
-                {
-                    delete flat_tree[host][osd];
-                }
-                else
-                {
-                    found++;
-                }
-            }
-            if (!found)
-            {
-                delete flat_tree[host];
-            }
-        }
-    }
-
    get_affinity_osds(pool_cfg, up_osds, osd_tree)
    {
        let aff_osds = up_osds;
@ -1249,12 +1216,6 @@ class Mon
                pool_tree = pool_tree ? pool_tree.children : [];
                pool_tree = LPOptimizer.flatten_tree(pool_tree, levels, pool_cfg.failure_domain, 'osd');
                this.filter_osds_by_tags(osd_tree, pool_tree, pool_cfg.osd_tags);
-                this.filter_osds_by_block_layout(
-                    pool_tree,
-                    pool_cfg.block_size || this.config.block_size || 131072,
-                    pool_cfg.bitmap_granularity || this.config.bitmap_granularity || 4096,
-                    pool_cfg.immediate_commit || this.config.immediate_commit || 'none'
-                );
                // These are for the purpose of building history.osd_sets
                const real_prev_pgs = [];
                let pg_history = [];
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -137,7 +137,6 @@ endif (${WITH_FIO})
 add_library(vitastor_client SHARED
 	cluster_client.cpp
 	cluster_client_list.cpp
-	cluster_client_wb.cpp
 	vitastor_c.cpp
 	cli_common.cpp
 	cli_alloc_osd.cpp
@ -301,7 +300,7 @@ target_link_libraries(test_crc32
 add_executable(test_cluster_client
 	EXCLUDE_FROM_ALL
 	test_cluster_client.cpp
-	pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp cluster_client_wb.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
+	pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
 	etcd_state_client.cpp timerfd_manager.cpp str_util.cpp ../json11/json11.cpp
 )
 target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)
--- a/src/blockstore_impl.cpp
+++ b/src/blockstore_impl.cpp
@ -384,10 +384,6 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
        ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
        return;
    }
-    if (op->opcode == BS_OP_SYNC)
-    {
-        unsynced_queued_ops = 0;
-    }
    init_op(op);
    submit_queue.push_back(op);
    ringloop->wakeup();
@ -397,7 +393,6 @@ void blockstore_impl_t::init_op(blockstore_op_t *op)
 {
    // Call constructor without allocating memory. We'll call destructor before returning op back
    new ((void*)op->private_data) blockstore_op_private_t;
-    PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
    PRIV(op)->wait_for = 0;
    PRIV(op)->op_state = 0;
    PRIV(op)->pending_ops = 0;
--- a/src/blockstore_impl.h
+++ b/src/blockstore_impl.h
@ -210,7 +210,7 @@ struct blockstore_op_private_t
    std::vector<copy_buffer_t> read_vec;

    // Sync, write
-    uint64_t min_flushed_journal_sector, max_flushed_journal_sector;
+    int min_flushed_journal_sector, max_flushed_journal_sector;

    // Write
    struct iovec iov_zerofill[3];
@ -220,6 +220,7 @@ struct blockstore_op_private_t

    // Sync
    std::vector<obj_ver_id> sync_big_writes, sync_small_writes;
+    int sync_small_checked, sync_big_checked;
 };

 typedef uint32_t pool_id_t;
@ -262,8 +263,6 @@ class blockstore_impl_t
    int throttle_target_parallelism = 1;
    // Minimum difference in microseconds between target and real execution times to throttle the response
    int throttle_threshold_us = 50;
-    // Maximum writes between automatically added fsync operations
-    uint64_t autosync_writes = 128;
    /******* END OF OPTIONS *******/

    struct ring_consumer_t ring_consumer;
@ -275,7 +274,6 @@ class blockstore_impl_t
    std::vector<blockstore_op_t*> submit_queue;
    std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
    int unsynced_big_write_count = 0;
-    int unsynced_queued_ops = 0;
    allocator *data_alloc = NULL;
    uint8_t *zero_object;

--- a/src/blockstore_journal.cpp
+++ b/src/blockstore_journal.cpp
@ -198,7 +198,6 @@ void blockstore_impl_t::prepare_journal_sector_write(int cur_sector, blockstore_
    priv->pending_ops++;
    if (!priv->min_flushed_journal_sector)
        priv->min_flushed_journal_sector = 1+cur_sector;
-    assert(priv->min_flushed_journal_sector <= journal.sector_count);
    priv->max_flushed_journal_sector = 1+cur_sector;
 }

--- a/src/blockstore_open.cpp
+++ b/src/blockstore_open.cpp
@ -19,10 +19,6 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
    throttle_target_mbs = strtoull(config["throttle_target_mbs"].c_str(), NULL, 10);
    throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10);
    throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10);
-    if (config.find("autosync_writes") != config.end())
-    {
-        autosync_writes = strtoull(config["autosync_writes"].c_str(), NULL, 10);
-    }
    if (!max_flusher_count)
    {
        max_flusher_count = 256;
--- a/src/blockstore_sync.cpp
+++ b/src/blockstore_sync.cpp
@ -27,6 +27,8 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
        unsynced_big_write_count -= unsynced_big_writes.size();
        PRIV(op)->sync_big_writes.swap(unsynced_big_writes);
        PRIV(op)->sync_small_writes.swap(unsynced_small_writes);
+        PRIV(op)->sync_small_checked = 0;
+        PRIV(op)->sync_big_checked = 0;
        unsynced_big_writes.clear();
        unsynced_small_writes.clear();
        if (PRIV(op)->sync_big_writes.size() > 0)
--- a/src/blockstore_write.cpp
+++ b/src/blockstore_write.cpp
@ -127,9 +127,8 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
            return false;
        }
    }
-    bool imm = (op->len < dsk.data_block_size ? (immediate_commit != IMMEDIATE_NONE) : (immediate_commit == IMMEDIATE_ALL));
-    if (wait_big && !is_del && !deleted && op->len < dsk.data_block_size && !imm ||
-        !imm && unsynced_queued_ops >= autosync_writes)
+    if (wait_big && !is_del && !deleted && op->len < dsk.data_block_size &&
+        immediate_commit != IMMEDIATE_ALL)
    {
        // Issue an additional sync so that the previous big write can reach the journal
        blockstore_op_t *sync_op = new blockstore_op_t;
@ -140,8 +139,6 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
        };
        enqueue_op(sync_op);
    }
-    else if (!imm)
-        unsynced_queued_ops++;
 #ifdef BLOCKSTORE_DEBUG
    if (is_del)
        printf("Delete %lx:%lx v%lu\n", op->oid.inode, op->oid.stripe, op->version);
@ -381,6 +378,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
            sqe, dsk.data_fd, PRIV(op)->iov_zerofill, vcnt, dsk.data_offset + (loc << dsk.block_order) + op->offset - stripe_offset
        );
        PRIV(op)->pending_ops = 1;
+        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
        if (immediate_commit != IMMEDIATE_ALL)
        {
            // Increase the counter, but don't save into unsynced_writes yet (can't sync until the write is finished)
@ -417,11 +415,17 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        write_iodepth++;
        // Got SQEs. Prepare previous journal sector write if required
        auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
-        if (immediate_commit == IMMEDIATE_NONE &&
-            !journal.entry_fits(sizeof(journal_entry_small_write) + dyn_size))
+        if (immediate_commit == IMMEDIATE_NONE)
+        {
+            if (!journal.entry_fits(sizeof(journal_entry_small_write) + dyn_size))
            {
                prepare_journal_sector_write(journal.cur_sector, op);
            }
+            else
+            {
+                PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
+            }
+        }
        // Then pre-fill journal entry
        journal_entry_small_write *je = (journal_entry_small_write*)prefill_single_journal_entry(
            journal, op->opcode == BS_OP_WRITE_STABLE ? JE_SMALL_WRITE_INSTANT : JE_SMALL_WRITE,
@ -746,12 +750,18 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
    }
    write_iodepth++;
    // Prepare journal sector write
-    if (immediate_commit == IMMEDIATE_NONE &&
-        (dsk.journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del) &&
+    if (immediate_commit == IMMEDIATE_NONE)
+    {
+        if ((dsk.journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del) &&
            journal.sector_info[journal.cur_sector].dirty)
        {
            prepare_journal_sector_write(journal.cur_sector, op);
        }
+        else
+        {
+            PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
+        }
+    }
    // Pre-fill journal entry
    journal_entry_del *je = (journal_entry_del*)prefill_single_journal_entry(
        journal, JE_DELETE, sizeof(struct journal_entry_del)
--- a/src/cli.cpp
+++ b/src/cli.cpp
@ -349,7 +349,6 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
                p->ringloop->wait();
        }
        // Destroy the client
-        p->cli->flush();
        delete p->cli;
        delete p->epmgr;
        delete p->ringloop;
--- a/src/cli_merge.cpp
+++ b/src/cli_merge.cpp
@ -53,7 +53,6 @@ struct snap_merger_t
    std::map<inode_t, std::vector<uint64_t>> layer_lists;
    std::map<inode_t, uint64_t> layer_block_size;
    std::map<inode_t, uint64_t> layer_list_pos;
-    std::vector<snap_rw_op_t*> continue_rwo, continue_rwo2;
    int in_flight = 0;
    uint64_t last_fsync_offset = 0;
    uint64_t last_written_offset = 0;
@ -305,12 +304,6 @@ struct snap_merger_t
        oit = merge_offsets.begin();
    resume_5:
        // Now read, overwrite and optionally delete offsets one by one
-        continue_rwo2.swap(continue_rwo);
-        for (auto rwo: continue_rwo2)
-        {
-            next_write(rwo);
-        }
-        continue_rwo2.clear();
        while (in_flight < parent->iodepth*parent->parallel_osds &&
            oit != merge_offsets.end() && !rwo_error.size())
        {
@ -471,8 +464,7 @@ struct snap_merger_t
                rwo->error_offset = op->offset;
                rwo->error_read = true;
            }
-            continue_rwo.push_back(rwo);
-            parent->ringloop->wakeup();
+            next_write(rwo);
        };
        parent->cli->execute(op);
    }
@ -552,9 +544,11 @@ struct snap_merger_t
            }
            // Increment CAS version
            rwo->op.version = subop->version;
+            if (use_cas)
+                next_write(rwo);
+            else
+                autofree_op(rwo);
            delete subop;
-            continue_rwo.push_back(rwo);
-            parent->ringloop->wakeup();
        };
        parent->cli->execute(subop);
    }
--- a/src/cluster_client.cpp
+++ b/src/cluster_client.cpp
@ -3,13 +3,21 @@

 #include <stdexcept>
 #include <assert.h>
-#include "cluster_client_impl.h"
-#include "http_client.h" // json_is_true
+#include "cluster_client.h"
+
+#define SCRAP_BUFFER_SIZE 4*1024*1024
+#define PART_SENT 1
+#define PART_DONE 2
+#define PART_ERROR 4
+#define PART_RETRY 8
+#define CACHE_DIRTY 1
+#define CACHE_FLUSHING 2
+#define CACHE_REPEATING 3
+#define OP_FLUSH_BUFFER 0x02
+#define OP_IMMEDIATE_COMMIT 0x04

 cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
 {
-    wb = new writeback_cache_t();
-
    cli_config = config.object_items();
    file_config = osd_messenger_t::read_config(config);
    config = osd_messenger_t::merge_configs(cli_config, file_config, etcd_global_config, {});
@ -29,14 +37,30 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
            continue_lists();
            continue_raw_ops(peer_osd);
        }
-        else
+        else if (dirty_buffers.size())
        {
            // peer_osd just dropped connection
            // determine WHICH dirty_buffers are now obsolete and repeat them
-            if (wb->repeat_ops_for(this, peer_osd) > 0)
+            for (auto wr_it = dirty_buffers.begin(), flush_it = wr_it, last_it = wr_it; ; )
            {
-                continue_ops();
+                bool end = wr_it == dirty_buffers.end();
+                bool flush_this = !end && wr_it->second.state != CACHE_REPEATING &&
+                    affects_osd(wr_it->first.inode, wr_it->first.stripe, wr_it->second.len, peer_osd);
+                if (flush_it != wr_it && (end || !flush_this ||
+                    wr_it->first.inode != flush_it->first.inode ||
+                    wr_it->first.stripe != last_it->first.stripe+last_it->second.len))
+                {
+                    flush_buffers(flush_it, wr_it);
+                    flush_it = wr_it;
                }
+                if (end)
+                    break;
+                last_it = wr_it;
+                wr_it++;
+                if (!flush_this)
+                    flush_it = wr_it;
+            }
+            continue_ops();
        }
    };
    msgr.exec_op = [this](osd_op_t *op)
@ -64,14 +88,16 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd

 cluster_client_t::~cluster_client_t()
 {
-    msgr.repeer_pgs = [this](osd_num_t){};
+    for (auto bp: dirty_buffers)
+    {
+        free(bp.second.buf);
+    }
+    dirty_buffers.clear();
    if (ringloop)
    {
        ringloop->unregister_consumer(&consumer);
    }
    free(scrap_buffer);
-    delete wb;
-    wb = NULL;
 }

 cluster_op_t::~cluster_op_t()
@ -120,19 +146,6 @@ void cluster_client_t::init_msgr()
    }
 }

-void cluster_client_t::unshift_op(cluster_op_t *op)
-{
-    op->next = op_queue_head;
-    if (op_queue_head)
-    {
-        op_queue_head->prev = op;
-        op_queue_head = op;
-    }
-    else
-        op_queue_tail = op_queue_head = op;
-    inc_wait(op->opcode, op->flags, op->next, 1);
-}
-
 void cluster_client_t::calc_wait(cluster_op_t *op)
 {
    op->prev_wait = 0;
@ -153,7 +166,7 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
    {
        for (auto prev = op->prev; prev; prev = prev->prev)
        {
-            if (prev->opcode == OSD_OP_SYNC || prev->opcode == OSD_OP_WRITE && (!(prev->flags & OP_IMMEDIATE_COMMIT) || enable_writeback))
+            if (prev->opcode == OSD_OP_SYNC || prev->opcode == OSD_OP_WRITE && !(prev->flags & OP_IMMEDIATE_COMMIT))
            {
                op->prev_wait++;
            }
@ -163,6 +176,20 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
    }
    else /* if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP) */
    {
+        for (auto prev = op_queue_head; prev && prev != op; prev = prev->next)
+        {
+            if (prev->opcode == OSD_OP_WRITE && (prev->flags & OP_FLUSH_BUFFER))
+            {
+                op->prev_wait++;
+            }
+            else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ ||
+                prev->opcode == OSD_OP_READ_BITMAP || prev->opcode == OSD_OP_READ_CHAIN_BITMAP)
+            {
+                // Flushes are always in the beginning (we're scanning from the beginning of the queue)
+                break;
+            }
+        }
+        if (!op->prev_wait)
            continue_rw(op);
    }
 }
@ -174,8 +201,10 @@ void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *n
        while (next)
        {
            auto n2 = next->next;
-            if (next->opcode == OSD_OP_SYNC && (!(flags & OP_IMMEDIATE_COMMIT) || enable_writeback) ||
-                next->opcode == OSD_OP_WRITE && (flags & OP_FLUSH_BUFFER) && !(next->flags & OP_FLUSH_BUFFER))
+            if (next->opcode == OSD_OP_SYNC && !(flags & OP_IMMEDIATE_COMMIT) ||
+                next->opcode == OSD_OP_WRITE && (flags & OP_FLUSH_BUFFER) && !(next->flags & OP_FLUSH_BUFFER) ||
+                (next->opcode == OSD_OP_READ || next->opcode == OSD_OP_READ_BITMAP ||
+                    next->opcode == OSD_OP_READ_CHAIN_BITMAP) && (flags & OP_FLUSH_BUFFER))
            {
                next->prev_wait += inc;
                assert(next->prev_wait >= 0);
@ -226,38 +255,14 @@ void cluster_client_t::erase_op(cluster_op_t *op)
        op_queue_tail = op->prev;
    op->next = op->prev = NULL;
    if (flags & OP_FLUSH_BUFFER)
-    {
-        // Completed flushes change writeback buffer states,
-        // so the callback should be run before inc_wait()
-        // which may continue following SYNCs, but these SYNCs
-        // should know about the changed buffer state
-        // This is ugly but this is the way we do it
        std::function<void(cluster_op_t*)>(op->callback)(op);
-    }
-    if (!(flags & OP_IMMEDIATE_COMMIT) || enable_writeback)
-    {
+    if (!(flags & OP_IMMEDIATE_COMMIT))
        inc_wait(opcode, flags, next, -1);
-    }
-    if (!(flags & OP_FLUSH_BUFFER))
-    {
    // Call callback at the end to avoid inconsistencies in prev_wait
    // if the callback adds more operations itself
+    if (!(flags & OP_FLUSH_BUFFER))
        std::function<void(cluster_op_t*)>(op->callback)(op);
 }
-    if (flags & OP_FLUSH_BUFFER)
-    {
-        int i = 0;
-        while (i < wb->writeback_overflow.size() && wb->writebacks_active < client_max_writeback_iodepth)
-        {
-            execute_internal(wb->writeback_overflow[i]);
-            i++;
-        }
-        if (i > 0)
-        {
-            wb->writeback_overflow.erase(wb->writeback_overflow.begin(), wb->writeback_overflow.begin()+i);
-        }
-    }
-}

 void cluster_client_t::continue_ops(bool up_retry)
 {
@ -300,7 +305,6 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_co
 {
    this->etcd_global_config = etcd_global_config;
    config = osd_messenger_t::merge_configs(cli_config, file_config, etcd_global_config, {});
-    // client_max_dirty_bytes/client_dirty_limit
    if (config.find("client_max_dirty_bytes") != config.end())
    {
        client_max_dirty_bytes = config["client_max_dirty_bytes"].uint64_value();
@ -316,34 +320,11 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_co
    {
        client_max_dirty_bytes = DEFAULT_CLIENT_MAX_DIRTY_BYTES;
    }
-    // client_max_dirty_ops
    client_max_dirty_ops = config["client_max_dirty_ops"].uint64_value();
    if (!client_max_dirty_ops)
    {
        client_max_dirty_ops = DEFAULT_CLIENT_MAX_DIRTY_OPS;
    }
-    // client_enable_writeback
-    enable_writeback = json_is_true(config["client_enable_writeback"]) &&
-        json_is_true(config["client_writeback_allowed"]);
-    // client_max_buffered_bytes
-    client_max_buffered_bytes = config["client_max_buffered_bytes"].uint64_value();
-    if (!client_max_buffered_bytes)
-    {
-        client_max_buffered_bytes = DEFAULT_CLIENT_MAX_BUFFERED_BYTES;
-    }
-    // client_max_buffered_ops
-    client_max_buffered_ops = config["client_max_buffered_ops"].uint64_value();
-    if (!client_max_buffered_ops)
-    {
-        client_max_buffered_ops = DEFAULT_CLIENT_MAX_BUFFERED_OPS;
-    }
-    // client_max_writeback_iodepth
-    client_max_writeback_iodepth = config["client_max_writeback_iodepth"].uint64_value();
-    if (!client_max_writeback_iodepth)
-    {
-        client_max_writeback_iodepth = DEFAULT_CLIENT_MAX_WRITEBACK_IODEPTH;
-    }
-    // up_wait_retry_interval
    up_wait_retry_interval = config["up_wait_retry_interval"].uint64_value();
    if (!up_wait_retry_interval)
    {
@ -403,8 +384,6 @@ void cluster_client_t::on_change_hook(std::map<std::string, etcd_kv_t> & changes

 bool cluster_client_t::get_immediate_commit(uint64_t inode)
 {
-    if (enable_writeback)
-        return false;
    pool_id_t pool_id = INODE_POOL(inode);
    if (!pool_id)
        return true;
@ -439,41 +418,6 @@ void cluster_client_t::on_ready(std::function<void(void)> fn)
    }
 }

-bool cluster_client_t::flush()
-{
-    if (!ringloop)
-    {
-        if (wb->writeback_queue.size())
-        {
-            wb->start_writebacks(this, 0);
-            cluster_op_t *sync = new cluster_op_t;
-            sync->opcode = OSD_OP_SYNC;
-            sync->callback = [this](cluster_op_t *sync)
-            {
-                delete sync;
-            };
-            execute(sync);
-        }
-        return op_queue_head == NULL;
-    }
-    bool sync_done = false;
-    cluster_op_t *sync = new cluster_op_t;
-    sync->opcode = OSD_OP_SYNC;
-    sync->callback = [this, &sync_done](cluster_op_t *sync)
-    {
-        delete sync;
-        sync_done = true;
-    };
-    execute(sync);
-    while (!sync_done)
-    {
-        ringloop->loop();
-        if (!sync_done)
-            ringloop->wait();
-    }
-    return true;
-}
-
 /**
 * How writes are synced when immediate_commit is false
 *
@ -494,9 +438,6 @@ bool cluster_client_t::flush()
 * 3) if yes, send all SYNCs. otherwise, leave current SYNC as is.
 * 4) if any of them fail due to disconnected peers, repeat SYNC after repeating all writes
 * 5) if any of them fail due to other errors, fail the SYNC operation
- *
- * If writeback caching is turned on and writeback limit is not exhausted:
- * data is just copied and the write is confirmed to the client.
 */
 void cluster_client_t::execute(cluster_op_t *op)
 {
@ -512,73 +453,36 @@ void cluster_client_t::execute(cluster_op_t *op)
        offline_ops.push_back(op);
        return;
    }
-    op->flags = op->flags & OSD_OP_IGNORE_READONLY; // the only allowed flag
-    execute_internal(op);
-}
-
-void cluster_client_t::execute_internal(cluster_op_t *op)
-{
    op->cur_inode = op->inode;
    op->retval = 0;
+    op->flags = op->flags & OSD_OP_IGNORE_READONLY; // the only allowed flag
    // check alignment, readonly flag and so on
    if (!check_rw(op))
    {
        return;
    }
-    if (op->opcode == OSD_OP_WRITE && enable_writeback && !(op->flags & OP_FLUSH_BUFFER) &&
-        !op->version /* FIXME no CAS writeback */)
-    {
-        if (wb->writebacks_active >= client_max_writeback_iodepth)
-        {
-            // Writeback queue is full, postpone the operation
-            wb->writeback_overflow.push_back(op);
-            return;
-        }
-        // Just copy and acknowledge the operation
-        wb->copy_write(op, CACHE_DIRTY);
-        while (wb->writeback_bytes + op->len > client_max_buffered_bytes || wb->writeback_queue_size > client_max_buffered_ops)
-        {
-            // Initiate some writeback (asynchronously)
-            wb->start_writebacks(this, 1);
-        }
-        op->retval = op->len;
-        std::function<void(cluster_op_t*)>(op->callback)(op);
-        return;
-    }
    if (op->opcode == OSD_OP_WRITE && !(op->flags & OP_IMMEDIATE_COMMIT))
    {
        if (!(op->flags & OP_FLUSH_BUFFER))
        {
-            wb->copy_write(op, CACHE_WRITTEN);
+            copy_write(op, dirty_buffers);
        }
        if (dirty_bytes >= client_max_dirty_bytes || dirty_ops >= client_max_dirty_ops)
        {
            // Push an extra SYNC operation to flush previous writes
            cluster_op_t *sync_op = new cluster_op_t;
            sync_op->opcode = OSD_OP_SYNC;
-            sync_op->flags = OP_FLUSH_BUFFER;
            sync_op->callback = [](cluster_op_t* sync_op)
            {
                delete sync_op;
            };
-            execute_internal(sync_op);
+            execute(sync_op);
        }
        dirty_bytes += op->len;
        dirty_ops++;
    }
    else if (op->opcode == OSD_OP_SYNC)
    {
-        // Flush the whole write-back queue first
-        if (!(op->flags & OP_FLUSH_BUFFER) && wb->writeback_overflow.size() > 0)
-        {
-            // Writeback queue is full, postpone the operation
-            wb->writeback_overflow.push_back(op);
-            return;
-        }
-        if (wb->writeback_queue.size())
-        {
-            wb->start_writebacks(this, 0);
-        }
        dirty_bytes = 0;
        dirty_ops = 0;
    }
@ -590,7 +494,7 @@ void cluster_client_t::execute_internal(cluster_op_t *op)
    }
    else
        op_queue_tail = op_queue_head = op;
-    if (!(op->flags & OP_IMMEDIATE_COMMIT) || enable_writeback)
+    if (!(op->flags & OP_IMMEDIATE_COMMIT))
        calc_wait(op);
    else
    {
@ -664,6 +568,136 @@ void cluster_client_t::execute_raw(osd_num_t osd_num, osd_op_t *op)
    }
 }

+static std::map<object_id, cluster_buffer_t>::iterator find_dirty(uint64_t inode, uint64_t offset, std::map<object_id, cluster_buffer_t> & dirty_buffers)
+{
+    auto dirty_it = dirty_buffers.lower_bound((object_id){
+        .inode = inode,
+        .stripe = offset,
+    });
+    while (dirty_it != dirty_buffers.begin())
+    {
+        dirty_it--;
+        if (dirty_it->first.inode != inode ||
+            (dirty_it->first.stripe + dirty_it->second.len) <= offset)
+        {
+            dirty_it++;
+            break;
+        }
+    }
+    return dirty_it;
+}
+
+void cluster_client_t::copy_write(cluster_op_t *op, std::map<object_id, cluster_buffer_t> & dirty_buffers)
+{
+    // Save operation for replay when one of PGs goes out of sync
+    // (primary OSD drops our connection in this case)
+    auto dirty_it = find_dirty(op->inode, op->offset, dirty_buffers);
+    uint64_t pos = op->offset, len = op->len, iov_idx = 0, iov_pos = 0;
+    while (len > 0)
+    {
+        uint64_t new_len = 0;
+        if (dirty_it == dirty_buffers.end() || dirty_it->first.inode != op->inode)
+        {
+            new_len = len;
+        }
+        else if (dirty_it->first.stripe > pos)
+        {
+            new_len = dirty_it->first.stripe - pos;
+            if (new_len > len)
+            {
+                new_len = len;
+            }
+        }
+        if (new_len > 0)
+        {
+            dirty_it = dirty_buffers.emplace_hint(dirty_it, (object_id){
+                .inode = op->inode,
+                .stripe = pos,
+            }, (cluster_buffer_t){
+                .buf = malloc_or_die(new_len),
+                .len = new_len,
+            });
+        }
+        // FIXME: Split big buffers into smaller ones on overwrites. But this will require refcounting
+        dirty_it->second.state = CACHE_DIRTY;
+        uint64_t cur_len = (dirty_it->first.stripe + dirty_it->second.len - pos);
+        if (cur_len > len)
+        {
+            cur_len = len;
+        }
+        while (cur_len > 0 && iov_idx < op->iov.count)
+        {
+            unsigned iov_len = (op->iov.buf[iov_idx].iov_len - iov_pos);
+            if (iov_len <= cur_len)
+            {
+                memcpy((uint8_t*)dirty_it->second.buf + pos - dirty_it->first.stripe,
+                    (uint8_t*)op->iov.buf[iov_idx].iov_base + iov_pos, iov_len);
+                pos += iov_len;
+                len -= iov_len;
+                cur_len -= iov_len;
+                iov_pos = 0;
+                iov_idx++;
+            }
+            else
+            {
+                memcpy((uint8_t*)dirty_it->second.buf + pos - dirty_it->first.stripe,
+                    (uint8_t*)op->iov.buf[iov_idx].iov_base + iov_pos, cur_len);
+                pos += cur_len;
+                len -= cur_len;
+                iov_pos += cur_len;
+                cur_len = 0;
+            }
+        }
+        dirty_it++;
+    }
+}
+
+void cluster_client_t::flush_buffers(std::map<object_id, cluster_buffer_t>::iterator from_it,
+    std::map<object_id, cluster_buffer_t>::iterator to_it)
+{
+    auto prev_it = std::prev(to_it);
+    cluster_op_t *op = new cluster_op_t;
+    op->flags = OSD_OP_IGNORE_READONLY|OP_FLUSH_BUFFER;
+    op->opcode = OSD_OP_WRITE;
+    op->cur_inode = op->inode = from_it->first.inode;
+    op->offset = from_it->first.stripe;
+    op->len = prev_it->first.stripe + prev_it->second.len - from_it->first.stripe;
+    uint32_t calc_len = 0;
+    uint64_t flush_id = ++last_flush_id;
+    for (auto it = from_it; it != to_it; it++)
+    {
+        it->second.state = CACHE_REPEATING;
+        it->second.flush_id = flush_id;
+        op->iov.push_back(it->second.buf, it->second.len);
+        calc_len += it->second.len;
+    }
+    assert(calc_len == op->len);
+    op->callback = [this, flush_id](cluster_op_t* op)
+    {
+        for (auto dirty_it = find_dirty(op->inode, op->offset, dirty_buffers);
+            dirty_it != dirty_buffers.end() && dirty_it->first.inode == op->inode &&
+            dirty_it->first.stripe < op->offset+op->len; dirty_it++)
+        {
+            if (dirty_it->second.flush_id == flush_id && dirty_it->second.state == CACHE_REPEATING)
+            {
+                dirty_it->second.flush_id = 0;
+                dirty_it->second.state = CACHE_DIRTY;
+            }
+        }
+        delete op;
+    };
+    op->next = op_queue_head;
+    if (op_queue_head)
+    {
+        op_queue_head->prev = op;
+        op_queue_head = op;
+    }
+    else
+        op_queue_tail = op_queue_head = op;
+    inc_wait(op->opcode, op->flags, op->next, 1);
+    continue_rw(op);
+}
+
 int cluster_client_t::continue_rw(cluster_op_t *op)
 {
    if (op->state == 0)
@ -767,8 +801,7 @@ resume_2:
        erase_op(op);
        return 1;
    }
-    else if (op->retval != 0 && !(op->flags & OP_FLUSH_BUFFER) &&
-        op->retval != -EPIPE && op->retval != -EIO && op->retval != -ENOSPC)
+    else if (op->retval != 0 && op->retval != -EPIPE && op->retval != -EIO && op->retval != -ENOSPC)
    {
        // Fatal error (neither -EPIPE, -EIO nor -ENOSPC)
        // FIXME: Add a parameter to allow to not wait for EIOs (incomplete or corrupted objects) to heal
@ -840,6 +873,50 @@ static void add_iov(int size, bool skip, cluster_op_t *op, int &iov_idx, size_t
    }
 }

+static void copy_to_op(cluster_op_t *op, uint64_t offset, uint8_t *buf, uint64_t len, uint32_t bitmap_granularity)
+{
+    if (op->opcode == OSD_OP_READ)
+    {
+        // Not OSD_OP_READ_BITMAP or OSD_OP_READ_CHAIN_BITMAP
+        int iov_idx = 0;
+        uint64_t cur_offset = op->offset;
+        while (iov_idx < op->iov.count && cur_offset+op->iov.buf[iov_idx].iov_len <= offset)
+        {
+            cur_offset += op->iov.buf[iov_idx].iov_len;
+            iov_idx++;
+        }
+        while (iov_idx < op->iov.count && cur_offset < offset+len)
+        {
+            auto & v = op->iov.buf[iov_idx];
+            auto begin = (cur_offset < offset ? offset : cur_offset);
+            auto end = (cur_offset+v.iov_len > offset+len ? offset+len : cur_offset+v.iov_len);
+            memcpy(
+                v.iov_base + begin - cur_offset,
+                buf + (cur_offset <= offset ? 0 : cur_offset-offset),
+                end - begin
+            );
+            cur_offset += v.iov_len;
+            iov_idx++;
+        }
+    }
+    // Set bitmap bits
+    int start_bit = (offset-op->offset)/bitmap_granularity;
+    int end_bit = (offset-op->offset+len)/bitmap_granularity;
+    for (int bit = start_bit; bit < end_bit;)
+    {
+        if (!(bit%8) && bit <= end_bit-8)
+        {
+            ((uint8_t*)op->bitmap_buf)[bit/8] = 0xFF;
+            bit += 8;
+        }
+        else
+        {
+            ((uint8_t*)op->bitmap_buf)[bit/8] |= (1 << (bit%8));
+            bit++;
+        }
+    }
+}
+
 void cluster_client_t::slice_rw(cluster_op_t *op)
 {
    // Slice the request into individual object stripe requests
@ -868,11 +945,52 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
    int iov_idx = 0;
    size_t iov_pos = 0;
    int i = 0;
+    bool dirty_copied = false;
+    if (dirty_buffers.size() && (op->opcode == OSD_OP_READ ||
+        op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP))
+    {
        // We also have to return reads from CACHE_REPEATING buffers - they are not
        // guaranteed to be present on target OSDs at the moment of repeating
        // And we're also free to return data from other cached buffers just
        // because it's faster
-    bool dirty_copied = wb->read_from_cache(op, pool_cfg.bitmap_granularity);
+        auto dirty_it = find_dirty(op->cur_inode, op->offset, dirty_buffers);
+        while (dirty_it != dirty_buffers.end() && dirty_it->first.inode == op->cur_inode &&
+            dirty_it->first.stripe < op->offset+op->len)
+        {
+            uint64_t begin = dirty_it->first.stripe, end = dirty_it->first.stripe + dirty_it->second.len;
+            if (begin < op->offset)
+                begin = op->offset;
+            if (end > op->offset+op->len)
+                end = op->offset+op->len;
+            bool skip_prev = true;
+            uint64_t cur = begin, prev = begin;
+            while (cur < end)
+            {
+                unsigned bmp_loc = (cur - op->offset)/pool_cfg.bitmap_granularity;
+                bool skip = (((*((uint8_t*)op->bitmap_buf + bmp_loc/8)) >> (bmp_loc%8)) & 0x1);
+                if (skip_prev != skip)
+                {
+                    if (cur > prev && !skip)
+                    {
+                        // Copy data
+                        dirty_copied = true;
+                        copy_to_op(op, prev, (uint8_t*)dirty_it->second.buf + prev - dirty_it->first.stripe, cur-prev, pool_cfg.bitmap_granularity);
+                    }
+                    skip_prev = skip;
+                    prev = cur;
+                }
+                cur += pool_cfg.bitmap_granularity;
+            }
+            assert(cur > prev);
+            if (!skip_prev)
+            {
+                // Copy data
+                dirty_copied = true;
+                copy_to_op(op, prev, (uint8_t*)dirty_it->second.buf + prev - dirty_it->first.stripe, cur-prev, pool_cfg.bitmap_granularity);
+            }
+            dirty_it++;
+        }
+    }
    for (uint64_t stripe = first_stripe; stripe <= last_stripe; stripe += pg_block_size)
    {
        pg_num_t pg_num = (stripe/pool_cfg.pg_stripe_size) % pool_cfg.real_pg_count + 1; // like map_to_pg()
@ -1044,7 +1162,13 @@ int cluster_client_t::continue_sync(cluster_op_t *op)
            do_it++;
    }
    // Post sync to affected OSDs
-    wb->fsync_start();
+    for (auto & prev_op: dirty_buffers)
+    {
+        if (prev_op.second.state == CACHE_DIRTY)
+        {
+            prev_op.second.state = CACHE_FLUSHING;
+        }
+    }
    op->parts.resize(dirty_osds.size());
    op->retval = 0;
    {
@ -1069,7 +1193,13 @@ resume_1:
    }
    if (op->retval != 0)
    {
-        wb->fsync_error();
+        for (auto uw_it = dirty_buffers.begin(); uw_it != dirty_buffers.end(); uw_it++)
+        {
+            if (uw_it->second.state == CACHE_FLUSHING)
+            {
+                uw_it->second.state = CACHE_DIRTY;
+            }
+        }
        if (op->retval == -EPIPE || op->retval == -EIO || op->retval == -ENOSPC)
        {
            // Retry later
@ -1083,7 +1213,16 @@ resume_1:
    }
    else
    {
-        wb->fsync_ok();
+        for (auto uw_it = dirty_buffers.begin(); uw_it != dirty_buffers.end(); )
+        {
+            if (uw_it->second.state == CACHE_FLUSHING)
+            {
+                free(uw_it->second.buf);
+                dirty_buffers.erase(uw_it++);
+            }
+            else
+                uw_it++;
+        }
    }
    erase_op(op);
    return 1;
--- a/src/cluster_client.h
+++ b/src/cluster_client.h
@ -8,9 +8,6 @@

 #define DEFAULT_CLIENT_MAX_DIRTY_BYTES 32*1024*1024
 #define DEFAULT_CLIENT_MAX_DIRTY_OPS 1024
-#define DEFAULT_CLIENT_MAX_BUFFERED_BYTES 32*1024*1024
-#define DEFAULT_CLIENT_MAX_BUFFERED_OPS 1024
-#define DEFAULT_CLIENT_MAX_WRITEBACK_IODEPTH 256
 #define INODE_LIST_DONE 1
 #define INODE_LIST_HAS_UNSTABLE 2
 #define OSD_OP_READ_BITMAP OSD_OP_SEC_READ_BMP
@ -67,12 +64,18 @@ protected:
    cluster_op_t *prev = NULL, *next = NULL;
    int prev_wait = 0;
    friend class cluster_client_t;
-    friend class writeback_cache_t;
+};
+
+struct cluster_buffer_t
+{
+    void *buf;
+    uint64_t len;
+    int state;
+    uint64_t flush_id;
 };

 struct inode_list_t;
 struct inode_list_osd_t;
-class writeback_cache_t;

 // FIXME: Split into public and private interfaces
 class cluster_client_t
@ -81,23 +84,17 @@ class cluster_client_t
    ring_loop_t *ringloop;

    std::map<pool_id_t, uint64_t> pg_counts;
-    // client_max_dirty_* is actually "max unsynced", for the case when immediate_commit is off
+    // FIXME: Implement inmemory_commit mode. Note that it requires to return overlapping reads from memory.
    uint64_t client_max_dirty_bytes = 0;
    uint64_t client_max_dirty_ops = 0;
-    // writeback improves (1) small consecutive writes and (2) Q1 writes without fsync
-    bool enable_writeback = false;
-    // client_max_buffered_* is the real "dirty limit" - maximum amount of writes buffered in memory
-    uint64_t client_max_buffered_bytes = 0;
-    uint64_t client_max_buffered_ops = 0;
-    uint64_t client_max_writeback_iodepth = 0;
-
    int log_level;
    int up_wait_retry_interval = 500; // ms

    int retry_timeout_id = 0;
    std::vector<cluster_op_t*> offline_ops;
    cluster_op_t *op_queue_head = NULL, *op_queue_tail = NULL;
-    writeback_cache_t *wb = NULL;
+    std::map<object_id, cluster_buffer_t> dirty_buffers;
+    uint64_t last_flush_id = 0;
    std::set<osd_num_t> dirty_osds;
    uint64_t dirty_bytes = 0, dirty_ops = 0;

@ -127,10 +124,10 @@ public:
    void execute_raw(osd_num_t osd_num, osd_op_t *op);
    bool is_ready();
    void on_ready(std::function<void(void)> fn);
-    bool flush();

    bool get_immediate_commit(uint64_t inode);

+    static void copy_write(cluster_op_t *op, std::map<object_id, cluster_buffer_t> & dirty_buffers);
    void continue_ops(bool up_retry = false);
    inode_list_t *list_inode_start(inode_t inode,
        std::function<void(inode_list_t* lst, std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)> callback);
@ -143,12 +140,12 @@ public:

 protected:
    bool affects_osd(uint64_t inode, uint64_t offset, uint64_t len, osd_num_t osd);
+    void flush_buffers(std::map<object_id, cluster_buffer_t>::iterator from_it,
+        std::map<object_id, cluster_buffer_t>::iterator to_it);
    void on_load_config_hook(json11::Json::object & config);
    void on_load_pgs_hook(bool success);
    void on_change_hook(std::map<std::string, etcd_kv_t> & changes);
    void on_change_osd_state_hook(uint64_t peer_osd);
-    void execute_internal(cluster_op_t *op);
-    void unshift_op(cluster_op_t *op);
    int continue_rw(cluster_op_t *op);
    bool check_rw(cluster_op_t *op);
    void slice_rw(cluster_op_t *op);
@ -164,6 +161,4 @@ protected:
    void continue_listing(inode_list_t *lst);
    void send_list(inode_list_osd_t *cur_list);
    void continue_raw_ops(osd_num_t peer_osd);
-
-    friend class writeback_cache_t;
 };
--- a/src/cluster_client_impl.h
+++ b/src/cluster_client_impl.h
@ -1,57 +0,0 @@
-// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
-
-#pragma once
-
-#include "cluster_client.h"
-
-#define SCRAP_BUFFER_SIZE 4*1024*1024
-#define PART_SENT 1
-#define PART_DONE 2
-#define PART_ERROR 4
-#define PART_RETRY 8
-#define CACHE_DIRTY 1
-#define CACHE_WRITTEN 2
-#define CACHE_FLUSHING 3
-#define CACHE_REPEATING 4
-#define OP_FLUSH_BUFFER 0x02
-#define OP_IMMEDIATE_COMMIT 0x04
-
-struct cluster_buffer_t
-{
-    uint8_t *buf;
-    uint64_t len;
-    int state;
-    uint64_t flush_id;
-    uint64_t *refcnt;
-};
-
-typedef std::map<object_id, cluster_buffer_t>::iterator dirty_buf_it_t;
-
-class writeback_cache_t
-{
-public:
-    uint64_t writeback_bytes = 0;
-    int writeback_queue_size = 0;
-    int writebacks_active = 0;
-    uint64_t last_flush_id = 0;
-
-    std::map<object_id, cluster_buffer_t> dirty_buffers;
-    std::vector<cluster_op_t*> writeback_overflow;
-    std::vector<object_id> writeback_queue;
-    std::multimap<uint64_t, uint64_t*> flushed_buffers; // flush_id => refcnt
-
-    ~writeback_cache_t();
-    dirty_buf_it_t find_dirty(uint64_t inode, uint64_t offset);
-    bool is_left_merged(dirty_buf_it_t dirty_it);
-    bool is_right_merged(dirty_buf_it_t dirty_it);
-    bool is_merged(const dirty_buf_it_t & dirty_it);
-    void copy_write(cluster_op_t *op, int state);
-    int repeat_ops_for(cluster_client_t *cli, osd_num_t peer_osd);
-    void start_writebacks(cluster_client_t *cli, int count);
-    bool read_from_cache(cluster_op_t *op, uint32_t bitmap_granularity);
-    void flush_buffers(cluster_client_t *cli, dirty_buf_it_t from_it, dirty_buf_it_t to_it);
-    void fsync_start();
-    void fsync_error();
-    void fsync_ok();
-};
--- a/src/cluster_client_wb.cpp
+++ b/src/cluster_client_wb.cpp
@ -1,498 +0,0 @@
-// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
-
-#include <cassert>
-
-#include "cluster_client_impl.h"
-
-writeback_cache_t::~writeback_cache_t()
-{
-    for (auto & bp: dirty_buffers)
-    {
-        if (!--(*bp.second.refcnt))
-        {
-            free(bp.second.refcnt); // refcnt is allocated with the buffer
-        }
-    }
-    dirty_buffers.clear();
-}
-
-dirty_buf_it_t writeback_cache_t::find_dirty(uint64_t inode, uint64_t offset)
-{
-    auto dirty_it = dirty_buffers.lower_bound((object_id){
-        .inode = inode,
-        .stripe = offset,
-    });
-    while (dirty_it != dirty_buffers.begin())
-    {
-        dirty_it--;
-        if (dirty_it->first.inode != inode ||
-            (dirty_it->first.stripe + dirty_it->second.len) <= offset)
-        {
-            dirty_it++;
-            break;
-        }
-    }
-    return dirty_it;
-}
-
-bool writeback_cache_t::is_left_merged(dirty_buf_it_t dirty_it)
-{
-    if (dirty_it != dirty_buffers.begin())
-    {
-        auto prev_it = dirty_it;
-        prev_it--;
-        if (prev_it->first.inode == dirty_it->first.inode &&
-            prev_it->first.stripe+prev_it->second.len == dirty_it->first.stripe &&
-            prev_it->second.state == CACHE_DIRTY)
-        {
-            return true;
-        }
-    }
-    return false;
-}
-
-bool writeback_cache_t::is_right_merged(dirty_buf_it_t dirty_it)
-{
-    auto next_it = dirty_it;
-    next_it++;
-    if (next_it != dirty_buffers.end() &&
-        next_it->first.inode == dirty_it->first.inode &&
-        next_it->first.stripe == dirty_it->first.stripe+dirty_it->second.len &&
-        next_it->second.state == CACHE_DIRTY)
-    {
-        return true;
-    }
-    return false;
-}
-
-bool writeback_cache_t::is_merged(const dirty_buf_it_t & dirty_it)
-{
-    return is_left_merged(dirty_it) || is_right_merged(dirty_it);
-}
-
-void writeback_cache_t::copy_write(cluster_op_t *op, int state)
-{
-    // Save operation for replay when one of PGs goes out of sync
-    // (primary OSD drops our connection in this case)
-    // ...or just save it for writeback if write buffering is enabled
-    if (op->len == 0)
-    {
-        return;
-    }
-    auto dirty_it = find_dirty(op->inode, op->offset);
-    auto new_end = op->offset + op->len;
-    while (dirty_it != dirty_buffers.end() &&
-        dirty_it->first.inode == op->inode &&
-        dirty_it->first.stripe < op->offset+op->len)
-    {
-        assert(dirty_it->first.stripe + dirty_it->second.len > op->offset);
-        // Remove overlapping part(s) of buffers
-        auto old_end = dirty_it->first.stripe + dirty_it->second.len;
-        if (dirty_it->first.stripe < op->offset)
-        {
-            if (old_end > new_end)
-            {
-                // Split into end and start
-                dirty_it->second.len = op->offset - dirty_it->first.stripe;
-                dirty_it = dirty_buffers.emplace_hint(dirty_it, (object_id){
-                    .inode = op->inode,
-                    .stripe = new_end,
-                }, (cluster_buffer_t){
-                    .buf = dirty_it->second.buf + new_end - dirty_it->first.stripe,
-                    .len = old_end - new_end,
-                    .state = dirty_it->second.state,
-                    .flush_id = dirty_it->second.flush_id,
-                    .refcnt = dirty_it->second.refcnt,
-                });
-                (*dirty_it->second.refcnt)++;
-                if (dirty_it->second.state == CACHE_DIRTY)
-                {
-                    writeback_bytes -= op->len;
-                    writeback_queue_size++;
-                }
-                break;
-            }
-            else
-            {
-                // Only leave the beginning
-                if (dirty_it->second.state == CACHE_DIRTY)
-                {
-                    writeback_bytes -= old_end - op->offset;
-                    if (is_left_merged(dirty_it) && !is_right_merged(dirty_it))
-                    {
-                        writeback_queue_size++;
-                    }
-                }
-                dirty_it->second.len = op->offset - dirty_it->first.stripe;
-                dirty_it++;
-            }
-        }
-        else if (old_end > new_end)
-        {
-            // Only leave the end
-            if (dirty_it->second.state == CACHE_DIRTY)
-            {
-                writeback_bytes -= new_end - dirty_it->first.stripe;
-                if (!is_left_merged(dirty_it) && is_right_merged(dirty_it))
-                {
-                    writeback_queue_size++;
-                }
-            }
-            auto new_dirty_it = dirty_buffers.emplace_hint(dirty_it, (object_id){
-                .inode = op->inode,
-                .stripe = new_end,
-            }, (cluster_buffer_t){
-                .buf = dirty_it->second.buf + new_end - dirty_it->first.stripe,
-                .len = old_end - new_end,
-                .state = dirty_it->second.state,
-                .flush_id = dirty_it->second.flush_id,
-                .refcnt = dirty_it->second.refcnt,
-            });
-            dirty_buffers.erase(dirty_it);
-            dirty_it = new_dirty_it;
-            break;
-        }
-        else
-        {
-            // Remove the whole buffer
-            if (dirty_it->second.state == CACHE_DIRTY && !is_merged(dirty_it))
-            {
-                writeback_bytes -= dirty_it->second.len;
-                assert(writeback_queue_size > 0);
-                writeback_queue_size--;
-            }
-            if (!--(*dirty_it->second.refcnt))
-            {
-                free(dirty_it->second.refcnt);
-            }
-            dirty_buffers.erase(dirty_it++);
-        }
-    }
-    // Overlapping buffers are removed, just insert the new one
-    uint64_t *refcnt = (uint64_t*)malloc_or_die(sizeof(uint64_t) + op->len);
-    uint8_t *buf = (uint8_t*)refcnt + sizeof(uint64_t);
-    *refcnt = 1;
-    dirty_it = dirty_buffers.emplace_hint(dirty_it, (object_id){
-        .inode = op->inode,
-        .stripe = op->offset,
-    }, (cluster_buffer_t){
-        .buf = buf,
-        .len = op->len,
-        .state = state,
-        .refcnt = refcnt,
-    });
-    if (state == CACHE_DIRTY)
-    {
-        writeback_bytes += op->len;
-        // Track consecutive write-back operations
-        if (!is_merged(dirty_it))
-        {
-            // <writeback_queue> is OK to contain more than actual number of consecutive
-            // requests as long as it doesn't miss anything. But <writeback_queue_size>
-            // is always calculated correctly.
-            writeback_queue_size++;
-            writeback_queue.push_back((object_id){
-                .inode = op->inode,
-                .stripe = op->offset,
-            });
-        }
-    }
-    uint64_t pos = 0, len = op->len, iov_idx = 0;
-    while (len > 0 && iov_idx < op->iov.count)
-    {
-        auto & iov = op->iov.buf[iov_idx];
-        memcpy(buf + pos, iov.iov_base, iov.iov_len);
-        pos += iov.iov_len;
-        iov_idx++;
-    }
-}
-
-int writeback_cache_t::repeat_ops_for(cluster_client_t *cli, osd_num_t peer_osd)
-{
-    int repeated = 0;
-    if (dirty_buffers.size())
-    {
-        // peer_osd just dropped connection
-        // determine WHICH dirty_buffers are now obsolete and repeat them
-        for (auto wr_it = dirty_buffers.begin(), flush_it = wr_it, last_it = wr_it; ; )
-        {
-            bool end = wr_it == dirty_buffers.end();
-            bool flush_this = !end && wr_it->second.state != CACHE_REPEATING &&
-                cli->affects_osd(wr_it->first.inode, wr_it->first.stripe, wr_it->second.len, peer_osd);
-            if (flush_it != wr_it && (end || !flush_this ||
-                wr_it->first.inode != flush_it->first.inode ||
-                wr_it->first.stripe != last_it->first.stripe+last_it->second.len))
-            {
-                repeated++;
-                flush_buffers(cli, flush_it, wr_it);
-                flush_it = wr_it;
-            }
-            if (end)
-                break;
-            last_it = wr_it;
-            wr_it++;
-            if (!flush_this)
-                flush_it = wr_it;
-        }
-    }
-    return repeated;
-}
-
-void writeback_cache_t::flush_buffers(cluster_client_t *cli, dirty_buf_it_t from_it, dirty_buf_it_t to_it)
-{
-    auto prev_it = to_it;
-    prev_it--;
-    bool is_writeback = from_it->second.state == CACHE_DIRTY;
-    cluster_op_t *op = new cluster_op_t;
-    op->flags = OSD_OP_IGNORE_READONLY|OP_FLUSH_BUFFER;
-    op->opcode = OSD_OP_WRITE;
-    op->cur_inode = op->inode = from_it->first.inode;
-    op->offset = from_it->first.stripe;
-    op->len = prev_it->first.stripe + prev_it->second.len - from_it->first.stripe;
-    uint32_t calc_len = 0;
-    uint64_t flush_id = ++last_flush_id;
-    for (auto it = from_it; it != to_it; it++)
-    {
-        it->second.state = CACHE_REPEATING;
-        it->second.flush_id = flush_id;
-        (*it->second.refcnt)++;
-        flushed_buffers.emplace(flush_id, it->second.refcnt);
-        op->iov.push_back(it->second.buf, it->second.len);
-        calc_len += it->second.len;
-    }
-    assert(calc_len == op->len);
-    writebacks_active++;
-    op->callback = [this, cli, flush_id](cluster_op_t* op)
-    {
-        // Buffer flushes should be always retried, regardless of the error,
-        // so they should never result in an error here
-        assert(op->retval == op->len);
-        for (auto fl_it = flushed_buffers.find(flush_id);
-            fl_it != flushed_buffers.end() && fl_it->first == flush_id; )
-        {
-            if (!--(*fl_it->second)) // refcnt
-            {
-                free(fl_it->second);
-            }
-            flushed_buffers.erase(fl_it++);
-        }
-        for (auto dirty_it = find_dirty(op->inode, op->offset);
-            dirty_it != dirty_buffers.end() && dirty_it->first.inode == op->inode &&
-            dirty_it->first.stripe < op->offset+op->len; dirty_it++)
-        {
-            if (dirty_it->second.flush_id == flush_id && dirty_it->second.state == CACHE_REPEATING)
-            {
-                dirty_it->second.flush_id = 0;
-                dirty_it->second.state = CACHE_WRITTEN;
-            }
-        }
-        delete op;
-        writebacks_active--;
-        // We can't call execute_internal because it affects an invalid copy of the list here
-        // (erase_op remembers `next` after writeback callback)
-    };
-    if (is_writeback)
-    {
-        cli->execute_internal(op);
-    }
-    else
-    {
-        // Insert repeated flushes into the beginning
-        cli->unshift_op(op);
-        cli->continue_rw(op);
-    }
-}
-
-void writeback_cache_t::start_writebacks(cluster_client_t *cli, int count)
-{
-    if (!writeback_queue.size())
-    {
-        return;
-    }
-    std::vector<object_id> queue_copy;
-    queue_copy.swap(writeback_queue);
-    int started = 0, i = 0;
-    for (i = 0; i < queue_copy.size() && (!count || started < count); i++)
-    {
-        object_id & req = queue_copy[i];
-        auto dirty_it = find_dirty(req.inode, req.stripe);
-        if (dirty_it == dirty_buffers.end() ||
-            dirty_it->first.inode != req.inode ||
-            dirty_it->second.state != CACHE_DIRTY)
-        {
-            continue;
-        }
-        auto from_it = dirty_it;
-        uint64_t off = dirty_it->first.stripe;
-        while (from_it != dirty_buffers.begin())
-        {
-            from_it--;
-            if (from_it->second.state != CACHE_DIRTY ||
-                from_it->first.inode != req.inode ||
-                from_it->first.stripe+from_it->second.len != off)
-            {
-                from_it++;
-                break;
-            }
-            off = from_it->first.stripe;
-        }
-        off = dirty_it->first.stripe + dirty_it->second.len;
-        auto to_it = dirty_it;
-        to_it++;
-        while (to_it != dirty_buffers.end())
-        {
-            if (to_it->second.state != CACHE_DIRTY ||
-                to_it->first.inode != req.inode ||
-                to_it->first.stripe != off)
-            {
-                break;
-            }
-            off = to_it->first.stripe + to_it->second.len;
-            to_it++;
-        }
-        started++;
-        assert(writeback_queue_size > 0);
-        writeback_queue_size--;
-        writeback_bytes -= off - from_it->first.stripe;
-        flush_buffers(cli, from_it, to_it);
-    }
-    queue_copy.erase(queue_copy.begin(), queue_copy.begin()+i);
-    if (writeback_queue.size())
-    {
-        queue_copy.insert(queue_copy.end(), writeback_queue.begin(), writeback_queue.end());
-    }
-    queue_copy.swap(writeback_queue);
-}
-
-static void copy_to_op(cluster_op_t *op, uint64_t offset, uint8_t *buf, uint64_t len, uint32_t bitmap_granularity)
-{
-    if (op->opcode == OSD_OP_READ)
-    {
-        // Not OSD_OP_READ_BITMAP or OSD_OP_READ_CHAIN_BITMAP
-        int iov_idx = 0;
-        uint64_t cur_offset = op->offset;
-        while (iov_idx < op->iov.count && cur_offset+op->iov.buf[iov_idx].iov_len <= offset)
-        {
-            cur_offset += op->iov.buf[iov_idx].iov_len;
-            iov_idx++;
-        }
-        while (iov_idx < op->iov.count && cur_offset < offset+len)
-        {
-            auto & v = op->iov.buf[iov_idx];
-            auto begin = (cur_offset < offset ? offset : cur_offset);
-            auto end = (cur_offset+v.iov_len > offset+len ? offset+len : cur_offset+v.iov_len);
-            memcpy(
-                v.iov_base + begin - cur_offset,
-                buf + (cur_offset <= offset ? 0 : cur_offset-offset),
-                end - begin
-            );
-            cur_offset += v.iov_len;
-            iov_idx++;
-        }
-    }
-    // Set bitmap bits
-    int start_bit = (offset-op->offset)/bitmap_granularity;
-    int end_bit = (offset-op->offset+len)/bitmap_granularity;
-    for (int bit = start_bit; bit < end_bit;)
-    {
-        if (!(bit%8) && bit <= end_bit-8)
-        {
-            ((uint8_t*)op->bitmap_buf)[bit/8] = 0xFF;
-            bit += 8;
-        }
-        else
-        {
-            ((uint8_t*)op->bitmap_buf)[bit/8] |= (1 << (bit%8));
-            bit++;
-        }
-    }
-}
-
-bool writeback_cache_t::read_from_cache(cluster_op_t *op, uint32_t bitmap_granularity)
-{
-    bool dirty_copied = false;
-    if (dirty_buffers.size() && (op->opcode == OSD_OP_READ ||
-        op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP))
-    {
-        // We also have to return reads from CACHE_REPEATING buffers - they are not
-        // guaranteed to be present on target OSDs at the moment of repeating
-        // And we're also free to return data from other cached buffers just
-        // because it's faster
-        auto dirty_it = find_dirty(op->cur_inode, op->offset);
-        while (dirty_it != dirty_buffers.end() && dirty_it->first.inode == op->cur_inode &&
-            dirty_it->first.stripe < op->offset+op->len)
-        {
-            uint64_t begin = dirty_it->first.stripe, end = dirty_it->first.stripe + dirty_it->second.len;
-            if (begin < op->offset)
-                begin = op->offset;
-            if (end > op->offset+op->len)
-                end = op->offset+op->len;
-            bool skip_prev = true;
-            uint64_t cur = begin, prev = begin;
-            while (cur < end)
-            {
-                unsigned bmp_loc = (cur - op->offset)/bitmap_granularity;
-                bool skip = (((*((uint8_t*)op->bitmap_buf + bmp_loc/8)) >> (bmp_loc%8)) & 0x1);
-                if (skip_prev != skip)
-                {
-                    if (cur > prev && !skip)
-                    {
-                        // Copy data
-                        dirty_copied = true;
-                        copy_to_op(op, prev, dirty_it->second.buf + prev - dirty_it->first.stripe, cur-prev, bitmap_granularity);
-                    }
-                    skip_prev = skip;
-                    prev = cur;
-                }
-                cur += bitmap_granularity;
-            }
-            assert(cur > prev);
-            if (!skip_prev)
-            {
-                // Copy data
-                dirty_copied = true;
-                copy_to_op(op, prev, dirty_it->second.buf + prev - dirty_it->first.stripe, cur-prev, bitmap_granularity);
-            }
-            dirty_it++;
-        }
-    }
-    return dirty_copied;
-}
-
-void writeback_cache_t::fsync_start()
-{
-    for (auto & prev_op: dirty_buffers)
-    {
-        if (prev_op.second.state == CACHE_WRITTEN)
-        {
-            prev_op.second.state = CACHE_FLUSHING;
-        }
-    }
-}
-
-void writeback_cache_t::fsync_error()
-{
-    for (auto & prev_op: dirty_buffers)
-    {
-        if (prev_op.second.state == CACHE_FLUSHING)
-        {
-            prev_op.second.state = CACHE_WRITTEN;
-        }
-    }
-}
-
-void writeback_cache_t::fsync_ok()
-{
-    for (auto uw_it = dirty_buffers.begin(); uw_it != dirty_buffers.end(); )
-    {
-        if (uw_it->second.state == CACHE_FLUSHING)
-        {
-            if (!--(*uw_it->second.refcnt))
-                free(uw_it->second.refcnt);
-            dirty_buffers.erase(uw_it++);
-        }
-        else
-            uw_it++;
-    }
-}
--- a/src/fio_cluster.cpp
+++ b/src/fio_cluster.cpp
@ -24,7 +24,6 @@
 #include <netinet/tcp.h>

 #include <vector>
-#include <string>

 #include "vitastor_c.h"
 #include "fio_headers.h"
@ -204,15 +203,6 @@ static void watch_callback(void *opaque, long watch)
    bsd->watch = (void*)watch;
 }

-static void opt_push(std::vector<char *> & options, const char *opt, const char *value)
-{
-    if (value)
-    {
-        options.push_back(strdup(opt));
-        options.push_back(strdup(value));
-    }
-}
-
 static int sec_setup(struct thread_data *td)
 {
    sec_options *o = (sec_options*)td->eo;
@ -264,27 +254,8 @@ static int sec_setup(struct thread_data *td)
    {
        o->inode = 0;
    }
-    std::vector<char *> options;
-    opt_push(options, "config_path", o->config_path);
-    opt_push(options, "etcd_address", o->etcd_host);
-    opt_push(options, "etcd_prefix", o->etcd_prefix);
-    if (o->use_rdma != -1)
-        opt_push(options, "use_rdma", std::to_string(o->use_rdma).c_str());
-    opt_push(options, "rdma_device", o->rdma_device);
-    if (o->rdma_port_num)
-        opt_push(options, "rdma_port_num", std::to_string(o->rdma_port_num).c_str());
-    if (o->rdma_gid_index)
-        opt_push(options, "rdma_gid_index", std::to_string(o->rdma_gid_index).c_str());
-    if (o->rdma_mtu)
-        opt_push(options, "rdma_mtu", std::to_string(o->rdma_mtu).c_str());
-    if (o->cluster_log)
-        opt_push(options, "log_level", std::to_string(o->cluster_log).c_str());
-    // allow writeback caching if -direct is not set
-    opt_push(options, "client_writeback_allowed", td->o.odirect ? "0" : "1");
-    bsd->cli = vitastor_c_create_uring_json((const char**)options.data(), options.size());
-    for (auto opt: options)
-        free(opt);
-    options.clear();
+    bsd->cli = vitastor_c_create_uring(o->config_path, o->etcd_host, o->etcd_prefix,
+        o->use_rdma, o->rdma_device, o->rdma_port_num, o->rdma_gid_index, o->rdma_mtu, o->cluster_log);
    if (o->image)
    {
        bsd->watch = NULL;
--- a/src/mock/messenger.cpp
+++ b/src/mock/messenger.cpp
@ -55,10 +55,3 @@ json11::Json::object osd_messenger_t::merge_configs(const json11::Json::object &
 {
    return cli_config;
 }
-
-bool json_is_true(const json11::Json & val)
-{
-    if (val.is_string())
-        return val == "true" || val == "yes" || val == "1";
-    return val.bool_value();
-}
--- a/src/mock/ringloop.h
+++ b/src/mock/ringloop.h
@ -22,10 +22,4 @@ public:
    void submit()
    {
    }
-    void wait()
-    {
-    }
-    void loop()
-    {
-    }
 };
--- a/src/nbd_proxy.cpp
+++ b/src/nbd_proxy.cpp
@ -341,7 +341,6 @@ public:
            ringloop->loop();
            ringloop->wait();
        }
-        cli->flush();
        delete cli;
        delete epmgr;
        delete ringloop;
--- a/src/nfs_proxy.cpp
+++ b/src/nfs_proxy.cpp
@ -34,10 +34,7 @@ nfs_proxy_t::~nfs_proxy_t()
    if (cmd)
        delete cmd;
    if (cli)
-    {
-        cli->flush();
        delete cli;
-    }
    if (epmgr)
        delete epmgr;
    if (ringloop)
@ -264,8 +261,16 @@ void nfs_proxy_t::run(json11::Json cfg)
        ringloop->loop();
        ringloop->wait();
    }
+    /*// Sync at the end
+    cluster_op_t *close_sync = new cluster_op_t;
+    close_sync->opcode = OSD_OP_SYNC;
+    close_sync->callback = [&stop](cluster_op_t *op)
+    {
+        stop = true;
+        delete op;
+    };
+    cli->execute(close_sync);*/
    // Destroy the client
-    cli->flush();
    delete cli;
    delete epmgr;
    delete ringloop;
--- a/src/osd_cluster.cpp
+++ b/src/osd_cluster.cpp
@ -186,12 +186,10 @@ json11::Json osd_t::get_statistics()
    if (bs)
    {
        st["blockstore_ready"] = bs->is_started();
+        st["data_block_size"] = (uint64_t)bs->get_block_size();
        st["size"] = bs->get_block_count() * bs->get_block_size();
        st["free"] = bs->get_free_block_count() * bs->get_block_size();
    }
-    st["data_block_size"] = (uint64_t)bs_block_size;
-    st["bitmap_granularity"] = (uint64_t)bs_bitmap_granularity;
-    st["immediate_commit"] = immediate_commit == IMMEDIATE_ALL ? "all" : (immediate_commit == IMMEDIATE_SMALL ? "small" : "none");
    st["host"] = self_state["host"];
    json11::Json::object op_stats, subop_stats;
    for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
--- a/src/qemu_driver.c
+++ b/src/qemu_driver.c
@ -388,43 +388,6 @@ static void vitastor_aio_set_fd_handler(void *vcli, int fd, int unused1, IOHandl
    );
 }

-typedef struct str_array
-{
-    const char **items;
-    int len, alloc;
-} str_array;
-
-static void strarray_push(str_array *a, const char *str)
-{
-    if (a->len >= a->alloc)
-    {
-        a->alloc = !a->alloc ? 4 : 2*a->alloc;
-        a->items = (const char**)realloc(a->items, a->alloc*sizeof(char*));
-        if (!a->items)
-        {
-            fprintf(stderr, "bad alloc\n");
-            abort();
-        }
-    }
-    a->items[a->len++] = str;
-}
-
-static void strarray_push_kv(str_array *a, const char *key, const char *value)
-{
-    if (key && value)
-    {
-        strarray_push(a, key);
-        strarray_push(a, value);
-    }
-}
-
-static void strarray_free(str_array *a)
-{
-    free(a->items);
-    a->items = NULL;
-    a->len = a->alloc = 0;
-}
-
 static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, Error **errp)
 {
    VitastorRPC task;
@ -443,36 +406,6 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
    client->rdma_gid_index = qdict_get_try_int(options, "rdma-gid-index", 0);
    client->rdma_mtu = qdict_get_try_int(options, "rdma-mtu", 0);
    client->ctx = bdrv_get_aio_context(bs);
-    str_array opt = {};
-    strarray_push_kv(&opt, "config_path", qdict_get_try_str(options, "config-path"));
-    strarray_push_kv(&opt, "etcd_address", qdict_get_try_str(options, "etcd-host"));
-    strarray_push_kv(&opt, "etcd_prefix", qdict_get_try_str(options, "etcd-prefix"));
-    strarray_push_kv(&opt, "use_rdma", qdict_get_try_str(options, "use-rdma"));
-    strarray_push_kv(&opt, "rdma_device", qdict_get_try_str(options, "rdma-device"));
-    strarray_push_kv(&opt, "rdma_port_num", qdict_get_try_str(options, "rdma-port-num"));
-    strarray_push_kv(&opt, "rdma_gid_index", qdict_get_try_str(options, "rdma-gid-index"));
-    strarray_push_kv(&opt, "rdma_mtu", qdict_get_try_str(options, "rdma-mtu"));
-    strarray_push_kv(&opt, "client_writeback_allowed", (flags & BDRV_O_NOCACHE) ? "0" : "1");
-    client->proxy = vitastor_c_create_uring_json(opt.items, opt.len);
-    strarray_free(&opt);
-    if (client->proxy)
-    {
-        client->uring_eventfd = vitastor_c_uring_register_eventfd(client->proxy);
-        if (client->uring_eventfd < 0)
-        {
-            fprintf(stderr, "vitastor: failed to create io_uring eventfd: %s\n", strerror(errno));
-            error_setg(errp, "failed to create io_uring eventfd");
-            vitastor_close(bs);
-            return -1;
-        }
-        universal_aio_set_fd_handler(client->ctx, client->uring_eventfd, vitastor_uring_handler, NULL, client);
-    }
-    else
-    {
-        // Writeback cache is unusable without io_uring because the client can't correctly flush on exit
-        fprintf(stderr, "vitastor: failed to create io_uring: %s - I/O will be slower%s\n",
-            strerror(errno), (flags & BDRV_O_NOCACHE ? "" : " and writeback cache will be disabled"));
-        client->uring_eventfd = -1;
 #if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2
    client->proxy = vitastor_c_create_qemu_uring(
        vitastor_aio_set_fd_handler, client, client->config_path, client->etcd_host, client->etcd_prefix,
@ -482,10 +415,12 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
    {
        fprintf(stderr, "vitastor: failed to create io_uring: %s - I/O will be slower\n", strerror(errno));
        client->uring_eventfd = -1;
+#endif
        client->proxy = vitastor_c_create_qemu(
            vitastor_aio_set_fd_handler, client, client->config_path, client->etcd_host, client->etcd_prefix,
            client->use_rdma, client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu, 0
        );
+#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2
    }
    else
    {
@ -499,13 +434,7 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
        }
        universal_aio_set_fd_handler(client->ctx, client->uring_eventfd, vitastor_uring_handler, NULL, client);
    }
-#else
-        client->proxy = vitastor_c_create_qemu(
-            vitastor_aio_set_fd_handler, client, client->config_path, client->etcd_host, client->etcd_prefix,
-            client->use_rdma, client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu, 0
-        );
 #endif
-    }
    image = client->image = g_strdup(qdict_get_try_str(options, "image"));
    client->readonly = (flags & BDRV_O_RDWR) ? 1 : 0;
    // Get image metadata (size and readonly flag) or just wait until the client is ready
--- a/src/test_cluster_client.cpp
+++ b/src/test_cluster_client.cpp
@ -4,7 +4,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <assert.h>
-#include "cluster_client_impl.h"
+#include "cluster_client.h"

 void configure_single_pg_pool(cluster_client_t *cli)
 {
@ -47,11 +47,11 @@ void configure_single_pg_pool(cluster_client_t *cli)
    cli->st_cli.on_change_hook(changes);
 }

-int *test_write(cluster_client_t *cli, uint64_t offset, uint64_t len, uint8_t c, std::function<void()> cb = NULL, bool instant = false)
+int *test_write(cluster_client_t *cli, uint64_t offset, uint64_t len, uint8_t c, std::function<void()> cb = NULL)
 {
    printf("Post write %lx+%lx\n", offset, len);
    int *r = new int;
-    *r = instant ? -2 : -1;
+    *r = -1;
    cluster_op_t *op = new cluster_op_t();
    op->opcode = OSD_OP_WRITE;
    op->inode = 0x1000000000001;
@ -72,13 +72,6 @@ int *test_write(cluster_client_t *cli, uint64_t offset, uint64_t len, uint8_t c,
            cb();
    };
    cli->execute(op);
-    if (instant)
-    {
-        long res = *r;
-        assert(*r >= 0);
-        delete r;
-        return (int*)res;
-    }
    return r;
 }

@ -167,13 +160,6 @@ osd_op_t *find_op(cluster_client_t *cli, osd_num_t osd_num, uint64_t opcode, uin
        }
        op_it++;
    }
-    op_it = cli->msgr.clients[peer_fd]->sent_ops.begin();
-    while (op_it != cli->msgr.clients[peer_fd]->sent_ops.end())
-    {
-        printf("Found opcode %lu offset %lx size %x\n", op_it->second->req.hdr.opcode, op_it->second->req.rw.offset, op_it->second->req.rw.len);
-        op_it++;
-    }
-    printf("Not found opcode %lu offset %lx size %lx\n", opcode, offset, len);
    return NULL;
 }

@ -355,7 +341,7 @@ void test1()

 void test2()
 {
-    writeback_cache_t *wb = new writeback_cache_t();
+    std::map<object_id, cluster_buffer_t> unsynced_writes;
    cluster_op_t *op = new cluster_op_t();
    op->opcode = OSD_OP_WRITE;
    op->inode = 1;
@ -364,19 +350,19 @@ void test2()
    op->iov.push_back(malloc_or_die(4096*1024), 4096);
    // 0-4k = 0x55
    memset(op->iov.buf[0].iov_base, 0x55, op->iov.buf[0].iov_len);
-    wb->copy_write(op, CACHE_WRITTEN);
+    cluster_client_t::copy_write(op, unsynced_writes);
    // 8k-12k = 0x66
    op->offset = 8192;
    memset(op->iov.buf[0].iov_base, 0x66, op->iov.buf[0].iov_len);
-    wb->copy_write(op, CACHE_WRITTEN);
+    cluster_client_t::copy_write(op, unsynced_writes);
    // 4k-1M+4k = 0x77
    op->len = op->iov.buf[0].iov_len = 1048576;
    op->offset = 4096;
    memset(op->iov.buf[0].iov_base, 0x77, op->iov.buf[0].iov_len);
-    wb->copy_write(op, CACHE_WRITTEN);
+    cluster_client_t::copy_write(op, unsynced_writes);
    // check it
-    assert(wb->dirty_buffers.size() == 2);
-    auto uit = wb->dirty_buffers.begin();
+    assert(unsynced_writes.size() == 4);
+    auto uit = unsynced_writes.begin();
    int i;
    assert(uit->first.inode == 1);
    assert(uit->first.stripe == 0);
@ -386,106 +372,35 @@ void test2()
    uit++;
    assert(uit->first.inode == 1);
    assert(uit->first.stripe == 4096);
-    assert(uit->second.len == 1048576);
+    assert(uit->second.len == 4096);
+    for (i = 0; i < uit->second.len && ((uint8_t*)uit->second.buf)[i] == 0x77; i++) {}
+    assert(i == uit->second.len);
+    uit++;
+    assert(uit->first.inode == 1);
+    assert(uit->first.stripe == 8192);
+    assert(uit->second.len == 4096);
+    for (i = 0; i < uit->second.len && ((uint8_t*)uit->second.buf)[i] == 0x77; i++) {}
+    assert(i == uit->second.len);
+    uit++;
+    assert(uit->first.inode == 1);
+    assert(uit->first.stripe == 12*1024);
+    assert(uit->second.len == 1016*1024);
    for (i = 0; i < uit->second.len && ((uint8_t*)uit->second.buf)[i] == 0x77; i++) {}
    assert(i == uit->second.len);
    uit++;
    // free memory
    free(op->iov.buf[0].iov_base);
    delete op;
-    delete wb;
-    printf("[ok] copy_write test\n");
-}
-
-void test_writeback()
+    for (auto p: unsynced_writes)
    {
-    json11::Json config = json11::Json::object {
-        { "client_enable_writeback", true },
-        { "client_writeback_allowed", true },
-        { "client_max_buffered_bytes", 1024*1024 },
-        { "client_max_buffered_ops", 2 },
-        { "client_max_writeback_iodepth", 2 },
-        { "client_max_dirty_bytes", 1024*1024 },
-        { "client_max_dirty_ops", 2 },
-    };
-    timerfd_manager_t *tfd = new timerfd_manager_t([](int fd, bool wr, std::function<void(int, int)> callback){});
-    cluster_client_t *cli = new cluster_client_t(NULL, tfd, config);
-
-    configure_single_pg_pool(cli);
-    pretend_connected(cli, 1);
-
-    // Check that 3 consecutive writes are merged by writeback
-    assert((long)test_write(cli, 0, 4096, 0x55, NULL, true) == 1);
-    check_op_count(cli, 1, 0);
-    assert((long)test_write(cli, 4096, 4096, 0x55, NULL, true) == 1);
-    check_op_count(cli, 1, 0);
-    assert((long)test_write(cli, 8192, 4096, 0x55, NULL, true) == 1);
-    check_op_count(cli, 1, 0);
-
-    assert((long)test_write(cli, 1024*1024, 4096, 0x66, NULL, true) == 1);
-    check_op_count(cli, 1, 0);
-
-    // 3rd and 4th writes should trigger 1 writeback each
-    assert((long)test_write(cli, 2*1024*1024, 4096, 0x66, NULL, true) == 1);
-    check_op_count(cli, 1, 1);
-    assert((long)test_write(cli, 3*1024*1024, 4096, 0x66, NULL, true) == 1);
-    check_op_count(cli, 1, 2);
-
-    // 5th write should be postponed until at least 1 writeback is completed
-    int *r1 = test_write(cli, 4*1024*1024, 4096, 0x67, NULL);
-    check_op_count(cli, 1, 2);
-    can_complete(r1);
-    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 3*4096), 0);
-    check_completed(r1);
-    // autosync because max_dirty_ops=2, flush waits for sync
-    check_op_count(cli, 1, 1);
-    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 1024*1024, 4096), 0);
-    check_op_count(cli, 1, 1);
-    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_SYNC, 0, 0), 0);
-    check_op_count(cli, 1, 1);
-    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 2*1024*1024, 4096), 0);
-    check_op_count(cli, 1, 0);
-
-    int *r2 = test_sync(cli);
-    check_op_count(cli, 1, 1);
-    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 3*1024*1024, 4096), 0);
-    check_op_count(cli, 1, 1);
-    // autosync because max_dirty_ops=2, flush waits for sync
-    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_SYNC, 0, 0), 0);
-    check_op_count(cli, 1, 1);
-    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 4*1024*1024, 4096), 0);
-    check_op_count(cli, 1, 1);
-    can_complete(r2);
-    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_SYNC, 0, 0), 0);
-    check_completed(r2);
-
-    // Check cutting of the beginning and end
-    assert((long)test_write(cli, 0, 32768, 0x55, NULL, true) == 1);
-    check_op_count(cli, 1, 0);
-    assert((long)test_write(cli, 32768, 32768, 0x56, NULL, true) == 1);
-    check_op_count(cli, 1, 0);
-    assert((long)test_write(cli, 16384, 32768, 0x57, NULL, true) == 1);
-    check_op_count(cli, 1, 0);
-    assert((long)test_write(cli, 16384+4096, 32768-4096, 0x58, NULL, true) == 1);
-    check_op_count(cli, 1, 0);
-    r2 = test_sync(cli);
-    check_op_count(cli, 1, 1);
-    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 65536), 0);
-    check_op_count(cli, 1, 1);
-    can_complete(r2);
-    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_SYNC, 0, 0), 0);
-    check_completed(r2);
-
-    // Free client
-    delete cli;
-    delete tfd;
-    printf("[ok] writeback test\n");
+        free(p.second.buf);
+    }
+    printf("[ok] copy_write test\n");
 }

 int main(int narg, char *args[])
 {
    test1();
    test2();
-    test_writeback();
    return 0;
 }
--- a/src/timerfd_manager.cpp
+++ b/src/timerfd_manager.cpp
@ -129,7 +129,6 @@ again:
        if (exp.it_value.tv_sec < 0 || exp.it_value.tv_sec == 0 && exp.it_value.tv_nsec <= 0)
        {
            // It already happened
-            // FIXME: Postpone to setImmediate/BH to avoid reenterability problems
            trigger_nearest();
            goto again;
        }
--- a/tests/run_3osds.sh
+++ b/tests/run_3osds.sh
@ -19,10 +19,10 @@ fi

 if [ "$IMMEDIATE_COMMIT" != "" ]; then
    NO_SAME="--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024 --disable_data_fsync 1 --immediate_commit all --log_level 10"
-    $ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":1,"immediate_commit":"all","client_enable_writeback":true}'
+    $ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":1,"immediate_commit":"all"}'
 else
    NO_SAME="--journal_sector_buffer_count 1024 --log_level 10"
-    $ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":1,"client_enable_writeback":true}'
+    $ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":1}'
 fi

 start_osd_on()