Compare commits


1 commit

Commit b4b6407716 (2023-02-26 00:26:39 +03:00)
WIP Implement RDMA v2 based on IBV_WR_RDMA_WRITE with remote buffer management
One BIG FIXME remaining - handling large operations :))
75 changed files with 1073 additions and 1204 deletions
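For context on the commit message above: IBV_WR_RDMA_WRITE is the libibverbs work-request opcode that writes a local buffer directly into a registered memory region on the peer, which is why the receiving side has to hand out and reclaim ranges of its buffer ("remote buffer management"). The sketch below only illustrates that primitive under assumed names (rdma_write_to_remote, qp, mr, remote_addr, rkey); it is not Vitastor's actual messenger code.

```cpp
// Illustrative sketch only: post an RDMA write that lands directly in the
// peer's registered memory. The (remote_addr, rkey) pair is assumed to have
// been granted by the peer from its own buffer manager.
#include <infiniband/verbs.h>
#include <cstdint>
#include <cstring>

int rdma_write_to_remote(ibv_qp *qp, void *buf, ibv_mr *mr, uint32_t len,
                         uint64_t remote_addr, uint32_t rkey, uint64_t wr_id)
{
    ibv_sge sge;
    memset(&sge, 0, sizeof(sge));
    sge.addr = (uintptr_t)buf;             // local buffer inside a registered MR
    sge.length = len;
    sge.lkey = mr->lkey;

    ibv_send_wr wr, *bad_wr = NULL;
    memset(&wr, 0, sizeof(wr));
    wr.wr_id = wr_id;                      // echoed back in the completion entry
    wr.sg_list = &sge;
    wr.num_sge = 1;
    wr.opcode = IBV_WR_RDMA_WRITE;         // write into remote memory, no matching recv needed
    wr.send_flags = IBV_SEND_SIGNALED;     // request a CQ entry on completion
    wr.wr.rdma.remote_addr = remote_addr;  // range granted by the peer
    wr.wr.rdma.rkey = rkey;                // rkey of the peer's registered region

    return ibv_post_send(qp, &wr, &bad_wr); // 0 on success, errno-style code on failure
}
```

Unlike plain sends, the target side never has to pre-post matching receives for such writes; it only has to track which ranges of its registered buffer are free, which appears to be what the new freelist allocator added later in this diff is for.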


@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 2.8.12)
cmake_minimum_required(VERSION 2.8)
project(vitastor)
set(VERSION "0.8.7")
set(VERSION "0.8.5")
add_subdirectory(src)


@@ -1,4 +1,4 @@
VERSION ?= v0.8.7
VERSION ?= v0.8.5
all: build push


@@ -49,7 +49,7 @@ spec:
capabilities:
add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v0.8.7
image: vitalif/vitastor-csi:v0.8.5
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"


@@ -116,7 +116,7 @@ spec:
privileged: true
capabilities:
add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v0.8.7
image: vitalif/vitastor-csi:v0.8.5
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"


@@ -5,7 +5,7 @@ package vitastor
const (
vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "0.8.7"
vitastorCSIDriverVersion = "0.8.5"
)
// Config struct fills the parameters of request or user input


@@ -6,11 +6,11 @@ package vitastor
import (
"context"
"encoding/json"
"fmt"
"strings"
"bytes"
"strconv"
"time"
"fmt"
"os"
"os/exec"
"io/ioutil"
@@ -21,6 +21,8 @@ import (
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
"go.etcd.io/etcd/clientv3"
"github.com/container-storage-interface/spec/lib/go/csi"
)
@@ -112,34 +114,6 @@ func GetConnectionParams(params map[string]string) (map[string]string, []string,
return ctxVars, etcdUrl, etcdPrefix
}
func invokeCLI(ctxVars map[string]string, args []string) ([]byte, error)
{
if (ctxVars["etcdUrl"] != "")
{
args = append(args, "--etcd_address", ctxVars["etcdUrl"])
}
if (ctxVars["etcdPrefix"] != "")
{
args = append(args, "--etcd_prefix", ctxVars["etcdPrefix"])
}
if (ctxVars["configPath"] != "")
{
args = append(args, "--config_path", ctxVars["configPath"])
}
c := exec.Command("/usr/bin/vitastor-cli", args...)
var stdout, stderr bytes.Buffer
c.Stdout = &stdout
c.Stderr = &stderr
err := c.Run()
stderrStr := string(stderr.Bytes())
if (err != nil)
{
klog.Errorf("vitastor-cli %s failed: %s, status %s\n", strings.Join(args, " "), stderrStr, err)
return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
}
return stdout.Bytes(), nil
}
// Create the volume
func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) (*csi.CreateVolumeResponse, error)
{
@@ -172,41 +146,128 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
volSize = ((capRange.GetRequiredBytes() + MB - 1) / MB) * MB
}
ctxVars, etcdUrl, _ := GetConnectionParams(req.Parameters)
// FIXME: The following should PROBABLY be implemented externally in a management tool
ctxVars, etcdUrl, etcdPrefix := GetConnectionParams(req.Parameters)
if (len(etcdUrl) == 0)
{
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
}
// Create image using vitastor-cli
_, err := invokeCLI(ctxVars, []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) })
// Connect to etcd
cli, err := clientv3.New(clientv3.Config{
DialTimeout: ETCD_TIMEOUT,
Endpoints: etcdUrl,
})
if (err != nil)
{
if (strings.Index(err.Error(), "already exists") > 0)
return nil, status.Error(codes.Internal, "failed to connect to etcd at "+strings.Join(etcdUrl, ",")+": "+err.Error())
}
defer cli.Close()
var imageId uint64 = 0
for
{
// Check if the image exists
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
resp, err := cli.Get(ctx, etcdPrefix+"/index/image/"+volName)
cancel()
if (err != nil)
{
stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", volName })
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
}
if (len(resp.Kvs) > 0)
{
kv := resp.Kvs[0]
var v InodeIndex
err := json.Unmarshal(kv.Value, &v)
if (err != nil)
{
return nil, err
return nil, status.Error(codes.Internal, "invalid /index/image/"+volName+" key in etcd: "+err.Error())
}
var inodeCfg []InodeConfig
err = json.Unmarshal(stat, &inodeCfg)
poolId = v.PoolId
imageId = v.Id
inodeCfgKey := fmt.Sprintf("/config/inode/%d/%d", poolId, imageId)
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
resp, err := cli.Get(ctx, etcdPrefix+inodeCfgKey)
cancel()
if (err != nil)
{
return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
}
if (len(inodeCfg) == 0)
if (len(resp.Kvs) == 0)
{
return nil, status.Error(codes.Internal, "vitastor-cli create said that image already exists, but ls can't find it")
return nil, status.Error(codes.Internal, "missing "+inodeCfgKey+" key in etcd")
}
if (inodeCfg[0].Size < uint64(volSize))
var inodeCfg InodeConfig
err = json.Unmarshal(resp.Kvs[0].Value, &inodeCfg)
if (err != nil)
{
return nil, status.Error(codes.Internal, "invalid "+inodeCfgKey+" key in etcd: "+err.Error())
}
if (inodeCfg.Size < uint64(volSize))
{
return nil, status.Error(codes.Internal, "image "+volName+" is already created, but size is less than expected")
}
}
else
{
return nil, err
// Find a free ID
// Create image metadata in a transaction verifying that the image doesn't exist yet AND ID is still free
maxIdKey := fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId)
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
resp, err := cli.Get(ctx, maxIdKey)
cancel()
if (err != nil)
{
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
}
var modRev int64
var nextId uint64
if (len(resp.Kvs) > 0)
{
var err error
nextId, err = strconv.ParseUint(string(resp.Kvs[0].Value), 10, 64)
if (err != nil)
{
return nil, status.Error(codes.Internal, maxIdKey+" contains invalid ID")
}
modRev = resp.Kvs[0].ModRevision
nextId++
}
else
{
nextId = 1
}
inodeIdxJson, _ := json.Marshal(InodeIndex{
Id: nextId,
PoolId: poolId,
})
inodeCfgJson, _ := json.Marshal(InodeConfig{
Name: volName,
Size: uint64(volSize),
})
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
txnResp, err := cli.Txn(ctx).If(
clientv3.Compare(clientv3.ModRevision(fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId)), "=", modRev),
clientv3.Compare(clientv3.CreateRevision(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName)), "=", 0),
clientv3.Compare(clientv3.CreateRevision(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, poolId, nextId)), "=", 0),
).Then(
clientv3.OpPut(fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId), fmt.Sprintf("%d", nextId)),
clientv3.OpPut(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName), string(inodeIdxJson)),
clientv3.OpPut(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, poolId, nextId), string(inodeCfgJson)),
).Commit()
cancel()
if (err != nil)
{
return nil, status.Error(codes.Internal, "failed to commit transaction in etcd: "+err.Error())
}
if (txnResp.Succeeded)
{
imageId = nextId
break
}
// Start over if the transaction fails
}
}
@@ -238,12 +299,97 @@ func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVol
}
volName := ctxVars["name"]
ctxVars, _, _ = GetConnectionParams(ctxVars)
_, etcdUrl, etcdPrefix := GetConnectionParams(ctxVars)
if (len(etcdUrl) == 0)
{
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
}
_, err = invokeCLI(ctxVars, []string{ "rm", volName })
cli, err := clientv3.New(clientv3.Config{
DialTimeout: ETCD_TIMEOUT,
Endpoints: etcdUrl,
})
if (err != nil)
{
return nil, err
return nil, status.Error(codes.Internal, "failed to connect to etcd at "+strings.Join(etcdUrl, ",")+": "+err.Error())
}
defer cli.Close()
// Find inode by name
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
resp, err := cli.Get(ctx, etcdPrefix+"/index/image/"+volName)
cancel()
if (err != nil)
{
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
}
if (len(resp.Kvs) == 0)
{
return nil, status.Error(codes.NotFound, "volume "+volName+" does not exist")
}
var idx InodeIndex
err = json.Unmarshal(resp.Kvs[0].Value, &idx)
if (err != nil)
{
return nil, status.Error(codes.Internal, "invalid /index/image/"+volName+" key in etcd: "+err.Error())
}
// Get inode config
inodeCfgKey := fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, idx.PoolId, idx.Id)
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
resp, err = cli.Get(ctx, inodeCfgKey)
cancel()
if (err != nil)
{
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
}
if (len(resp.Kvs) == 0)
{
return nil, status.Error(codes.NotFound, "volume "+volName+" does not exist")
}
var inodeCfg InodeConfig
err = json.Unmarshal(resp.Kvs[0].Value, &inodeCfg)
if (err != nil)
{
return nil, status.Error(codes.Internal, "invalid "+inodeCfgKey+" key in etcd: "+err.Error())
}
// Delete inode data by invoking vitastor-cli
args := []string{
"rm-data", "--etcd_address", strings.Join(etcdUrl, ","),
"--pool", fmt.Sprintf("%d", idx.PoolId),
"--inode", fmt.Sprintf("%d", idx.Id),
}
if (ctxVars["configPath"] != "")
{
args = append(args, "--config_path", ctxVars["configPath"])
}
c := exec.Command("/usr/bin/vitastor-cli", args...)
var stderr bytes.Buffer
c.Stdout = nil
c.Stderr = &stderr
err = c.Run()
stderrStr := string(stderr.Bytes())
if (err != nil)
{
klog.Errorf("vitastor-cli rm-data failed: %s, status %s\n", stderrStr, err)
return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
}
// Delete inode config in etcd
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
txnResp, err := cli.Txn(ctx).Then(
clientv3.OpDelete(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName)),
clientv3.OpDelete(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, idx.PoolId, idx.Id)),
).Commit()
cancel()
if (err != nil)
{
return nil, status.Error(codes.Internal, "failed to delete keys in etcd: "+err.Error())
}
if (!txnResp.Succeeded)
{
return nil, status.Error(codes.Internal, "failed to delete keys in etcd: transaction failed")
}
return &csi.DeleteVolumeResponse{}, nil

debian/changelog

@@ -1,10 +1,10 @@
vitastor (0.8.7-1) unstable; urgency=medium
vitastor (0.8.5-1) unstable; urgency=medium
* Bugfixes
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
vitastor (0.8.7-1) unstable; urgency=medium
vitastor (0.8.5-1) unstable; urgency=medium
* Implement NFS proxy
* Add documentation


@@ -34,8 +34,8 @@ RUN set -e -x; \
mkdir -p /root/packages/vitastor-$REL; \
rm -rf /root/packages/vitastor-$REL/*; \
cd /root/packages/vitastor-$REL; \
cp -r /root/vitastor vitastor-0.8.7; \
cd vitastor-0.8.7; \
cp -r /root/vitastor vitastor-0.8.5; \
cd vitastor-0.8.5; \
ln -s /root/fio-build/fio-*/ ./fio; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -48,8 +48,8 @@ RUN set -e -x; \
rm -rf a b; \
echo "dep:fio=$FIO" > debian/fio_version; \
cd /root/packages/vitastor-$REL; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.7.orig.tar.xz vitastor-0.8.7; \
cd vitastor-0.8.7; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.5.orig.tar.xz vitastor-0.8.5; \
cd vitastor-0.8.5; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \


@@ -19,7 +19,6 @@ between clients, OSDs and etcd.
- [rdma_max_sge](#rdma_max_sge)
- [rdma_max_msg](#rdma_max_msg)
- [rdma_max_recv](#rdma_max_recv)
- [rdma_max_send](#rdma_max_send)
- [peer_connect_interval](#peer_connect_interval)
- [peer_connect_timeout](#peer_connect_timeout)
- [osd_idle_timeout](#osd_idle_timeout)
@@ -75,12 +74,6 @@ to work. For example, Mellanox ConnectX-3 and older adapters don't have
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
root to list available RDMA devices and their features.
Remember that you also have to configure your network switches if you use
RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
the manual of your network vendor for details about setting up the switch
for RoCEv2 correctly. Usually it means setting up Lossless Ethernet with
PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
## rdma_port_num
- Type: integer
@@ -123,30 +116,20 @@ required to change this parameter.
## rdma_max_msg
- Type: integer
- Default: 132096
- Default: 1048576
Maximum size of a single RDMA send or receive operation in bytes.
## rdma_max_recv
- Type: integer
- Default: 16
Maximum number of RDMA receive buffers per connection (RDMA requires
preallocated buffers to receive data). Each buffer is `rdma_max_msg` bytes
in size. So this setting directly affects memory usage: a single Vitastor
RDMA client uses `rdma_max_recv * rdma_max_msg * OSD_COUNT` bytes of memory.
Default is roughly 2 MB * number of OSDs.
## rdma_max_send
- Type: integer
- Default: 8
Maximum number of outstanding RDMA send operations per connection. Should be
less than `rdma_max_recv` so the receiving side doesn't run out of buffers.
Doesn't affect memory usage - additional memory isn't allocated for send
operations.
Maximum number of parallel RDMA receive operations. Note that this number
of receive buffers `rdma_max_msg` in size are allocated for each client,
so this setting actually affects memory usage. This is because RDMA receive
operations are (sadly) still not zero-copy in Vitastor. It may be fixed in
later versions.
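As a quick sanity check of the "roughly 2 MB * number of OSDs" figure above, using the defaults listed in this file (rdma_max_recv = 16 receive buffers of rdma_max_msg = 132096 bytes each):

```latex
16 \times 132096\ \text{bytes} = 2113536\ \text{bytes} \approx 2\ \text{MiB per connected OSD}
```

so a client connected to N OSDs reserves roughly 2 MiB × N for RDMA receive buffers alone.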
## peer_connect_interval


@@ -19,7 +19,6 @@
- [rdma_max_sge](#rdma_max_sge)
- [rdma_max_msg](#rdma_max_msg)
- [rdma_max_recv](#rdma_max_recv)
- [rdma_max_send](#rdma_max_send)
- [peer_connect_interval](#peer_connect_interval)
- [peer_connect_timeout](#peer_connect_timeout)
- [osd_idle_timeout](#osd_idle_timeout)
@@ -79,13 +78,6 @@ Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Наприме
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
параметры и возможности.
Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
правильно настроить для него коммутаторы, иначе вы можете столкнуться с
нестабильной производительностью. Подробную информацию о настройке
коммутатора для RoCEv2 ищите в документации производителя. Обычно это
подразумевает настройку сети без потерь на основе PFC (Priority Flow
Control) и ECN (Explicit Congestion Notification).
## rdma_port_num
- Тип: целое число
@@ -129,32 +121,22 @@ OSD в любом случае согласовывают реальное зн
## rdma_max_msg
- Тип: целое число
- Значение по умолчанию: 132096
- Значение по умолчанию: 1048576
Максимальный размер одной RDMA-операции отправки или приёма.
## rdma_max_recv
- Тип: целое число
- Значение по умолчанию: 16
Максимальное число буферов для RDMA-приёма данных на одно соединение
(RDMA требует заранее выделенных буферов для приёма данных). Каждый буфер
имеет размер `rdma_max_msg` байт. Таким образом, настройка прямо влияет на
потребление памяти - один Vitastor-клиент с RDMA использует
`rdma_max_recv * rdma_max_msg * ЧИСЛО_OSD` байт памяти, по умолчанию -
примерно 2 МБ * число OSD.
## rdma_max_send
- Тип: целое число
- Значение по умолчанию: 8
Максимальное число RDMA-операций отправки, отправляемых в очередь одного
соединения. Желательно, чтобы оно было меньше `rdma_max_recv`, чтобы
у принимающей стороны в процессе работы не заканчивались буферы на приём.
Не влияет на потребление памяти - дополнительная память на операции отправки
не выделяется.
Максимальное число параллельных RDMA-операций получения данных. Следует
иметь в виду, что данное число буферов размером `rdma_max_msg` выделяется
для каждого подключённого клиентского соединения, так что данная настройка
влияет на потребление памяти. Это так потому, что RDMA-приём данных в
Vitastor, увы, всё равно не является zero-copy, т.е. всё равно 1 раз
копирует данные в памяти. Данная особенность, возможно, будет исправлена в
более новых версиях Vitastor.
## peer_connect_interval


@@ -53,12 +53,6 @@
to work. For example, Mellanox ConnectX-3 and older adapters don't have
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
root to list available RDMA devices and their features.
Remember that you also have to configure your network switches if you use
RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
the manual of your network vendor for details about setting up the switch
for RoCEv2 correctly. Usually it means setting up Lossless Ethernet with
PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
info_ru: |
Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
@@ -67,13 +61,6 @@
потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
параметры и возможности.
Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
правильно настроить для него коммутаторы, иначе вы можете столкнуться с
нестабильной производительностью. Подробную информацию о настройке
коммутатора для RoCEv2 ищите в документации производителя. Обычно это
подразумевает настройку сети без потерь на основе PFC (Priority Flow
Control) и ECN (Explicit Congestion Notification).
- name: rdma_port_num
type: int
default: 1
@@ -127,39 +114,26 @@
так что менять этот параметр обычно не нужно.
- name: rdma_max_msg
type: int
default: 132096
default: 1048576
info: Maximum size of a single RDMA send or receive operation in bytes.
info_ru: Максимальный размер одной RDMA-операции отправки или приёма.
- name: rdma_max_recv
type: int
default: 16
info: |
Maximum number of RDMA receive buffers per connection (RDMA requires
preallocated buffers to receive data). Each buffer is `rdma_max_msg` bytes
in size. So this setting directly affects memory usage: a single Vitastor
RDMA client uses `rdma_max_recv * rdma_max_msg * OSD_COUNT` bytes of memory.
Default is roughly 2 MB * number of OSDs.
info_ru: |
Максимальное число буферов для RDMA-приёма данных на одно соединение
(RDMA требует заранее выделенных буферов для приёма данных). Каждый буфер
имеет размер `rdma_max_msg` байт. Таким образом, настройка прямо влияет на
потребление памяти - один Vitastor-клиент с RDMA использует
`rdma_max_recv * rdma_max_msg * ЧИСЛО_OSD` байт памяти, по умолчанию -
примерно 2 МБ * число OSD.
- name: rdma_max_send
type: int
default: 8
info: |
Maximum number of outstanding RDMA send operations per connection. Should be
less than `rdma_max_recv` so the receiving side doesn't run out of buffers.
Doesn't affect memory usage - additional memory isn't allocated for send
operations.
Maximum number of parallel RDMA receive operations. Note that this number
of receive buffers `rdma_max_msg` in size are allocated for each client,
so this setting actually affects memory usage. This is because RDMA receive
operations are (sadly) still not zero-copy in Vitastor. It may be fixed in
later versions.
info_ru: |
Максимальное число RDMA-операций отправки, отправляемых в очередь одного
соединения. Желательно, чтобы оно было меньше `rdma_max_recv`, чтобы
у принимающей стороны в процессе работы не заканчивались буферы на приём.
Не влияет на потребление памяти - дополнительная память на операции отправки
не выделяется.
Максимальное число параллельных RDMA-операций получения данных. Следует
иметь в виду, что данное число буферов размером `rdma_max_msg` выделяется
для каждого подключённого клиентского соединения, так что данная настройка
влияет на потребление памяти. Это так потому, что RDMA-приём данных в
Vitastor, увы, всё равно не является zero-copy, т.е. всё равно 1 раз
копирует данные в памяти. Данная особенность, возможно, будет исправлена в
более новых версиях Vitastor.
- name: peer_connect_interval
type: sec
min: 1


@@ -35,24 +35,15 @@ Write amplification for 4 KB blocks is usually 3-5 in Vitastor:
If you manage to get an SSD which handles 512 byte blocks well (Optane?) you may
lower 1, 3 and 4 to 512 bytes (1/8 of data size) and get WA as low as 2.375.
Implemented NVDIMM support can basically eliminate WA at all - all extra writes will
go to DRAM memory. But this requires a test cluster with NVDIMM - please contact me
if you want to provide me with such cluster for tests.
Lazy fsync also reduces WA for parallel workloads because journal blocks are only
written when they fill up or fsync is requested.
## In Practice
In practice, using tests from [Understanding Performance](understanding.en.md), decent TCP network,
good server-grade SSD/NVMe drives and disabled CPU power saving, you should head for:
In practice, using tests from [Understanding Performance](understanding.en.md)
and good server-grade SSD/NVMe drives, you should head for:
- At least 5000 T1Q1 replicated read and write iops (maximum 0.2ms latency)
- At least 5000 T1Q1 EC read IOPS and at least 2200 EC write IOPS (maximum 0.45ms latency)
- At least ~80k parallel read iops or ~30k write iops per 1 core (1 OSD)
- Disk-speed or wire-speed linear reads and writes, whichever is the bottleneck in your case
Lower results may mean that you have bad drives, bad network or some kind of misconfiguration.
Current latency records:
- 9668 T1Q1 replicated write iops (0.103 ms latency) with TCP and NVMe
- 9143 T1Q1 replicated read iops (0.109 ms latency) with TCP and NVMe
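A quick check of the 2.375 write-amplification figure above, assuming the baseline WA of 5 corresponds to five 4 KB writes per 4 KB client write: lowering writes 1, 3 and 4 to 512 bytes (1/8 of the data size) while the remaining two stay at 4 KB gives

```latex
\mathrm{WA} = \frac{3 \times 512 + 2 \times 4096}{4096} = \frac{9728}{4096} = 2.375
```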


@@ -36,25 +36,6 @@ WA (мультипликатор записи) для 4 КБ блоков в Vit
Если вы найдёте SSD, хорошо работающий с 512-байтными блоками данных (Optane?),
то 1, 3 и 4 можно снизить до 512 байт (1/8 от размера данных) и получить WA всего 2.375.
Если реализовать поддержку NVDIMM, то WA можно, условно говоря, ликвидировать вообще - все
дополнительные операции записи смогут обслуживаться DRAM памятью. Но для этого необходим
тестовый кластер с NVDIMM - пишите, если готовы предоставить такой для тестов.
Кроме того, WA снижается при использовании отложенного/ленивого сброса при параллельной
нагрузке, т.к. блоки журнала записываются на диск только когда они заполняются или явным
образом запрашивается fsync.
## На практике
На практике, используя тесты fio со страницы [Понимание сути производительности систем хранения](understanding.ru.md),
нормальную TCP-сеть, хорошие серверные SSD/NVMe, при отключённом энергосбережении процессоров вы можете рассчитывать на:
- От 5000 IOPS в 1 поток (T1Q1) и на чтение, и на запись при использовании репликации (задержка до 0.2мс)
- От 5000 IOPS в 1 поток (T1Q1) на чтение и 2200 IOPS в 1 поток на запись при использовании EC (задержка до 0.45мс)
- От 80000 IOPS на чтение в параллельном режиме на 1 ядро, от 30000 IOPS на запись на 1 ядро (на 1 OSD)
- Скорость параллельного линейного чтения и записи, равная меньшему значению из скорости дисков или сети
Худшие результаты означают, что у вас либо медленные диски, либо медленная сеть, либо что-то неправильно настроено.
Зафиксированный на данный момент рекорд задержки:
- 9668 IOPS (0.103 мс задержка) в 1 поток (T1Q1) на запись с TCP и NVMe при использовании репликации
- 9143 IOPS (0.109 мс задержка) в 1 поток (T1Q1) на чтение с TCP и NVMe при использовании репликации


@@ -1,4 +1,4 @@
[Documentation](../../README.md#documentation) → Usage → Disk management tool
[Documentation](../../README.md#documentation) → Usage → Disk Tool
-----


@@ -1,4 +1,4 @@
[Документация](../../README-ru.md#документация) → Использование → Инструмент управления дисками
[Документация](../../README-ru.md#документация) → Использование → Управление дисками
-----


@@ -51,9 +51,8 @@ const etcd_tree = {
// THIS IS JUST A POOR MAN'S CONFIG DOCUMENTATION
// etcd connection
config_path: "/etc/vitastor/vitastor.conf",
etcd_prefix: "/vitastor",
// etcd connection - configurable online
etcd_address: "10.0.115.10:2379/v3",
etcd_prefix: "/vitastor",
// mon
etcd_mon_ttl: 30, // min: 10
etcd_mon_timeout: 1000, // ms. min: 0
@@ -71,15 +70,14 @@ const etcd_tree = {
rdma_gid_index: 0,
rdma_mtu: 4096,
rdma_max_sge: 128,
rdma_max_send: 8,
rdma_max_recv: 16,
rdma_max_msg: 132096,
rdma_max_send: 32,
rdma_max_recv: 8,
rdma_max_msg: 1048576,
log_level: 0,
block_size: 131072,
disk_alignment: 4096,
bitmap_granularity: 4096,
immediate_commit: false, // 'all' or 'small'
// client and osd - configurable online
log_level: 0,
client_dirty_limit: 33554432,
peer_connect_interval: 5, // seconds. min: 1
peer_connect_timeout: 5, // seconds. min: 1
@@ -97,19 +95,18 @@ const etcd_tree = {
osd_network: null, // "192.168.7.0/24" or an array of masks
bind_address: "0.0.0.0",
bind_port: 0,
readonly: false,
osd_memlock: false,
// osd - configurable online
autosync_interval: 5,
autosync_writes: 128,
client_queue_depth: 128, // unused
recovery_queue_depth: 4,
recovery_sync_batch: 16,
readonly: false,
no_recovery: false,
no_rebalance: false,
print_stats_interval: 3,
slow_log_interval: 10,
inode_vanish_time: 60,
osd_memlock: false,
// blockstore - fixed in superblock
block_size,
disk_alignment,
@@ -128,15 +125,14 @@ const etcd_tree = {
meta_offset,
disable_meta_fsync,
disable_device_lock,
// blockstore - configurable offline
// blockstore - configurable
max_write_iodepth,
min_flusher_count: 1,
max_flusher_count: 256,
inmemory_metadata,
inmemory_journal,
journal_sector_buffer_count,
journal_no_same_sector_overwrites,
// blockstore - configurable online
max_write_iodepth,
min_flusher_count: 1,
max_flusher_count: 256,
throttle_small_writes: false,
throttle_target_iops: 100,
throttle_target_mbs: 100,


@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils
VERSION = '0.8.7'
VERSION = '0.8.5'
LOG = logging.getLogger(__name__)


@@ -25,4 +25,4 @@ rm fio
mv fio-copy fio
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform 's#^#vitastor-0.8.7/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.7$(rpm --eval '%dist').tar.gz *
tar --transform 's#^#vitastor-0.8.5/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.5$(rpm --eval '%dist').tar.gz *


@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.8.7.el7.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-0.8.5.el7.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \


@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.8.7
Version: 0.8.5
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.8.7.el7.tar.gz
Source0: vitastor-0.8.5.el7.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel


@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.8.7.el8.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-0.8.5.el8.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \


@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.8.7
Version: 0.8.5
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.8.7.el8.tar.gz
Source0: vitastor-0.8.5.el8.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel


@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 2.8.12)
cmake_minimum_required(VERSION 2.8)
project(vitastor)
@@ -16,7 +16,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()
add_definitions(-DVERSION="0.8.7")
add_definitions(-DVERSION="0.8.5")
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
if (${WITH_ASAN})
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
@@ -98,7 +98,7 @@ endif (${WITH_FIO})
# libvitastor_common.a
set(MSGR_RDMA "")
if (IBVERBS_LIBRARIES)
set(MSGR_RDMA "msgr_rdma.cpp")
set(MSGR_RDMA msgr_rdma.cpp freelist.cpp allocator.cpp)
endif (IBVERBS_LIBRARIES)
add_library(vitastor_common STATIC
epoll_manager.cpp etcd_state_client.cpp messenger.cpp addr_util.cpp
@@ -278,6 +278,11 @@ add_executable(test_allocator EXCLUDE_FROM_ALL test_allocator.cpp allocator.cpp)
add_dependencies(build_tests test_allocator)
add_test(NAME test_allocator COMMAND test_allocator)
# test_freelist
add_executable(test_freelist EXCLUDE_FROM_ALL test_freelist.cpp)
add_dependencies(build_tests test_freelist)
add_test(NAME test_freelist COMMAND test_freelist)
# test_cas
add_executable(test_cas
test_cas.cpp


@@ -13,11 +13,6 @@ blockstore_t::~blockstore_t()
delete impl;
}
void blockstore_t::parse_config(blockstore_config_t & config)
{
impl->parse_config(config, false);
}
void blockstore_t::loop()
{
impl->loop();


@@ -107,7 +107,7 @@ Input:
- buf = pre-allocated obj_ver_id array <len> units long
Output:
- retval = 0 or negative error number (-ENOENT if no such version for stabilize)
- retval = 0 or negative error number (-EINVAL, -ENOENT if no such version or -EBUSY if not synced)
## BS_OP_SYNC_STAB_ALL
@@ -165,9 +165,6 @@ public:
blockstore_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
~blockstore_t();
// Update configuration
void parse_config(blockstore_config_t & config);
// Event loop
void loop();


@@ -11,7 +11,7 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
ring_consumer.loop = [this]() { loop(); };
ringloop->register_consumer(&ring_consumer);
initialized = 0;
parse_config(config, true);
parse_config(config);
zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.data_block_size);
try
{
@@ -171,7 +171,7 @@ void blockstore_impl_t::loop()
// Can't submit SYNC before previous writes
continue;
}
wr_st = continue_sync(op);
wr_st = continue_sync(op, false);
if (wr_st != 2)
{
has_writes = wr_st > 0 ? 1 : 2;
@@ -371,18 +371,13 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
return;
}
init_op(op);
submit_queue.push_back(op);
ringloop->wakeup();
}
void blockstore_impl_t::init_op(blockstore_op_t *op)
{
// Call constructor without allocating memory. We'll call destructor before returning op back
new ((void*)op->private_data) blockstore_op_private_t;
PRIV(op)->wait_for = 0;
PRIV(op)->op_state = 0;
PRIV(op)->pending_ops = 0;
submit_queue.push_back(op);
ringloop->wakeup();
}
static bool replace_stable(object_id oid, uint64_t version, int search_start, int search_end, obj_ver_id* list)


@@ -216,11 +216,6 @@ struct pool_shard_settings_t
uint32_t pg_stripe_size;
};
#define STAB_SPLIT_DONE 1
#define STAB_SPLIT_WAIT 2
#define STAB_SPLIT_SYNC 3
#define STAB_SPLIT_TODO 4
class blockstore_impl_t
{
blockstore_disk_t dsk;
@@ -282,6 +277,7 @@ class blockstore_impl_t
friend class journal_flusher_t;
friend class journal_flusher_co;
void parse_config(blockstore_config_t & config);
void calc_lengths();
void open_data();
void open_meta();
@@ -303,7 +299,6 @@ class blockstore_impl_t
blockstore_init_journal* journal_init_reader;
void check_wait(blockstore_op_t *op);
void init_op(blockstore_op_t *op);
// Read
int dequeue_read(blockstore_op_t *read_op);
@@ -323,7 +318,7 @@ class blockstore_impl_t
void handle_write_event(ring_data_t *data, blockstore_op_t *op);
// Sync
int continue_sync(blockstore_op_t *op);
int continue_sync(blockstore_op_t *op, bool queue_has_in_progress_sync);
void ack_sync(blockstore_op_t *op);
// Stabilize
@@ -331,8 +326,6 @@ class blockstore_impl_t
int continue_stable(blockstore_op_t *op);
void mark_stable(const obj_ver_id & ov, bool forget_dirty = false);
void stabilize_object(object_id oid, uint64_t max_ver);
blockstore_op_t* selective_sync(blockstore_op_t *op);
int split_stab_op(blockstore_op_t *op, std::function<int(obj_ver_id v)> decider);
// Rollback
int dequeue_rollback(blockstore_op_t *op);
@@ -348,8 +341,6 @@ public:
blockstore_impl_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
~blockstore_impl_t();
void parse_config(blockstore_config_t & config, bool init);
// Event loop
void loop();


@@ -4,54 +4,8 @@
#include <sys/file.h>
#include "blockstore_impl.h"
void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
void blockstore_impl_t::parse_config(blockstore_config_t & config)
{
// Online-configurable options:
max_flusher_count = strtoull(config["max_flusher_count"].c_str(), NULL, 10);
if (!max_flusher_count)
{
max_flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
}
min_flusher_count = strtoull(config["min_flusher_count"].c_str(), NULL, 10);
max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
throttle_small_writes = config["throttle_small_writes"] == "true" || config["throttle_small_writes"] == "1" || config["throttle_small_writes"] == "yes";
throttle_target_iops = strtoull(config["throttle_target_iops"].c_str(), NULL, 10);
throttle_target_mbs = strtoull(config["throttle_target_mbs"].c_str(), NULL, 10);
throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10);
throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10);
if (!max_flusher_count)
{
max_flusher_count = 256;
}
if (!min_flusher_count || journal.flush_journal)
{
min_flusher_count = 1;
}
if (!max_write_iodepth)
{
max_write_iodepth = 128;
}
if (!throttle_target_iops)
{
throttle_target_iops = 100;
}
if (!throttle_target_mbs)
{
throttle_target_mbs = 100;
}
if (!throttle_target_parallelism)
{
throttle_target_parallelism = 1;
}
if (!throttle_threshold_us)
{
throttle_threshold_us = 50;
}
if (!init)
{
return;
}
// Offline-configurable options:
// Common disk options
dsk.parse_config(config);
// Parse
@@ -90,7 +44,29 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
journal.no_same_sector_overwrites = config["journal_no_same_sector_overwrites"] == "true" ||
config["journal_no_same_sector_overwrites"] == "1" || config["journal_no_same_sector_overwrites"] == "yes";
journal.inmemory = config["inmemory_journal"] != "false";
max_flusher_count = strtoull(config["max_flusher_count"].c_str(), NULL, 10);
if (!max_flusher_count)
max_flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
min_flusher_count = strtoull(config["min_flusher_count"].c_str(), NULL, 10);
max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
throttle_small_writes = config["throttle_small_writes"] == "true" || config["throttle_small_writes"] == "1" || config["throttle_small_writes"] == "yes";
throttle_target_iops = strtoull(config["throttle_target_iops"].c_str(), NULL, 10);
throttle_target_mbs = strtoull(config["throttle_target_mbs"].c_str(), NULL, 10);
throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10);
throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10);
// Validate
if (!max_flusher_count)
{
max_flusher_count = 256;
}
if (!min_flusher_count || journal.flush_journal)
{
min_flusher_count = 1;
}
if (!max_write_iodepth)
{
max_write_iodepth = 128;
}
if (journal.sector_count < 2)
{
journal.sector_count = 32;
@@ -115,6 +91,22 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
{
throw std::runtime_error("immediate_commit=all requires disable_journal_fsync and disable_data_fsync");
}
if (!throttle_target_iops)
{
throttle_target_iops = 100;
}
if (!throttle_target_mbs)
{
throttle_target_mbs = 100;
}
if (!throttle_target_parallelism)
{
throttle_target_parallelism = 1;
}
if (!throttle_threshold_us)
{
throttle_threshold_us = 50;
}
// init some fields
journal.block_size = dsk.journal_block_size;
journal.next_free = dsk.journal_block_size;


@@ -9,39 +9,48 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
{
return continue_rollback(op);
}
int r = split_stab_op(op, [this](obj_ver_id ov)
obj_ver_id *v, *nv;
int i, todo = op->len;
for (i = 0, v = (obj_ver_id*)op->buf, nv = (obj_ver_id*)op->buf; i < op->len; i++, v++, nv++)
{
if (nv != v)
{
*nv = *v;
}
// Check that there are some versions greater than v->version (which may be zero),
// check that they're unstable, synced, and not currently written to
auto dirty_it = dirty_db.lower_bound((obj_ver_id){
.oid = ov.oid,
.oid = v->oid,
.version = UINT64_MAX,
});
if (dirty_it == dirty_db.begin())
{
skip_ov:
// Already rolled back, skip this object version
return STAB_SPLIT_DONE;
todo--;
nv--;
continue;
}
else
{
dirty_it--;
if (dirty_it->first.oid != ov.oid || dirty_it->first.version < ov.version)
if (dirty_it->first.oid != v->oid || dirty_it->first.version < v->version)
{
// Already rolled back, skip this object version
return STAB_SPLIT_DONE;
goto skip_ov;
}
while (dirty_it->first.oid == ov.oid && dirty_it->first.version > ov.version)
while (dirty_it->first.oid == v->oid && dirty_it->first.version > v->version)
{
if (IS_IN_FLIGHT(dirty_it->second.state))
{
// Object write is still in progress. Wait until the write request completes
return STAB_SPLIT_WAIT;
return 0;
}
else if (!IS_SYNCED(dirty_it->second.state) ||
IS_STABLE(dirty_it->second.state))
{
// Sync the object
return STAB_SPLIT_SYNC;
op->retval = -EBUSY;
FINISH_OP(op);
return 2;
}
if (dirty_it == dirty_db.begin())
{
@@ -49,16 +58,19 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
}
dirty_it--;
}
return STAB_SPLIT_TODO;
}
});
if (r != 1)
}
op->len = todo;
if (!todo)
{
return r;
// Already rolled back
op->retval = 0;
FINISH_OP(op);
return 2;
}
// Check journal space
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, op->len, sizeof(journal_entry_rollback), 0))
if (!space_check.check_available(op, todo, sizeof(journal_entry_rollback), 0))
{
return 0;
}
@@ -66,8 +78,7 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write);
// Prepare and submit journal entries
int s = 0;
auto v = (obj_ver_id*)op->buf;
for (int i = 0; i < op->len; i++, v++)
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{
if (!journal.entry_fits(sizeof(journal_entry_rollback)) &&
journal.sector_info[journal.cur_sector].dirty)


@@ -41,309 +41,60 @@
// 4) after a while it takes his synced object list and sends stabilize requests
// to peers and to its own blockstore, thus freeing the old version
struct ver_vector_t
{
obj_ver_id *items = NULL;
uint64_t alloc = 0, size = 0;
};
static void init_versions(ver_vector_t & vec, obj_ver_id *start, obj_ver_id *end, uint64_t len)
{
if (!vec.items)
{
vec.alloc = len;
vec.items = (obj_ver_id*)malloc_or_die(sizeof(obj_ver_id) * vec.alloc);
for (auto sv = start; sv < end; sv++)
{
vec.items[vec.size++] = *sv;
}
}
}
static void append_version(ver_vector_t & vec, obj_ver_id ov)
{
if (vec.size >= vec.alloc)
{
vec.alloc = !vec.alloc ? 4 : vec.alloc*2;
vec.items = (obj_ver_id*)realloc_or_die(vec.items, sizeof(obj_ver_id) * vec.alloc);
}
vec.items[vec.size++] = ov;
}
static bool check_unsynced(std::vector<obj_ver_id> & check, obj_ver_id ov, std::vector<obj_ver_id> & to, int *count)
{
bool found = false;
int j = 0, k = 0;
while (j < check.size())
{
if (check[j] == ov)
found = true;
if (check[j].oid == ov.oid && check[j].version <= ov.version)
{
to.push_back(check[j++]);
if (count)
(*count)--;
}
else
check[k++] = check[j++];
}
check.resize(k);
return found;
}
blockstore_op_t* blockstore_impl_t::selective_sync(blockstore_op_t *op)
{
unsynced_big_write_count -= unsynced_big_writes.size();
unsynced_big_writes.swap(PRIV(op)->sync_big_writes);
unsynced_big_write_count += unsynced_big_writes.size();
unsynced_small_writes.swap(PRIV(op)->sync_small_writes);
// Create a sync operation, insert into the end of the queue
// And move ourselves into the end too!
// Rather hacky but that's what we need...
blockstore_op_t *sync_op = new blockstore_op_t;
sync_op->opcode = BS_OP_SYNC;
sync_op->buf = NULL;
sync_op->callback = [this](blockstore_op_t *sync_op)
{
delete sync_op;
};
init_op(sync_op);
int sync_res = continue_sync(sync_op);
if (sync_res != 2)
{
// Put SYNC into the queue if it's not finished yet
submit_queue.push_back(sync_op);
}
// Restore unsynced_writes
unsynced_small_writes.swap(PRIV(op)->sync_small_writes);
unsynced_big_write_count -= unsynced_big_writes.size();
unsynced_big_writes.swap(PRIV(op)->sync_big_writes);
unsynced_big_write_count += unsynced_big_writes.size();
if (sync_res == 2)
{
// Sync is immediately completed
return NULL;
}
return sync_op;
}
// Returns: 2 = stop processing and dequeue, 0 = stop processing and do not dequeue, 1 = proceed with op itself
int blockstore_impl_t::split_stab_op(blockstore_op_t *op, std::function<int(obj_ver_id v)> decider)
{
bool add_sync = false;
ver_vector_t good_vers, bad_vers;
obj_ver_id* v;
int i, todo = 0;
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{
int action = decider(*v);
if (action < 0)
{
// Rollback changes
for (auto & ov: PRIV(op)->sync_big_writes)
{
unsynced_big_writes.push_back(ov);
unsynced_big_write_count++;
}
for (auto & ov: PRIV(op)->sync_small_writes)
{
unsynced_small_writes.push_back(ov);
}
free(good_vers.items);
good_vers.items = NULL;
free(bad_vers.items);
bad_vers.items = NULL;
// Error
op->retval = action;
FINISH_OP(op);
return 2;
}
else if (action == STAB_SPLIT_DONE)
{
// Already done
init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
}
else if (action == STAB_SPLIT_WAIT)
{
// Already in progress, we just have to wait until it finishes
init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
append_version(bad_vers, *v);
}
else if (action == STAB_SPLIT_SYNC)
{
// Needs a SYNC, we have to send a SYNC if not already in progress
//
// If the object is not present in unsynced_(big|small)_writes then
// it's currently being synced. If it's present then we can initiate
// its sync ourselves.
init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
append_version(bad_vers, *v);
if (!add_sync)
{
PRIV(op)->sync_big_writes.clear();
PRIV(op)->sync_small_writes.clear();
add_sync = true;
}
check_unsynced(unsynced_small_writes, *v, PRIV(op)->sync_small_writes, NULL);
check_unsynced(unsynced_big_writes, *v, PRIV(op)->sync_big_writes, &unsynced_big_write_count);
}
else /* if (action == STAB_SPLIT_TODO) */
{
if (good_vers.items)
{
// If we're selecting versions then append it
// Main idea is that 99% of the time all versions passed to BS_OP_STABLE are synced
// And we don't want to select/allocate anything in that optimistic case
append_version(good_vers, *v);
}
todo++;
}
}
// In a pessimistic scenario, an operation may be split into 3:
// - Stabilize synced entries
// - Sync unsynced entries
// - Continue for unsynced entries after sync
add_sync = add_sync && (PRIV(op)->sync_big_writes.size() || PRIV(op)->sync_small_writes.size());
if (!todo && !bad_vers.size)
{
// Already stable
op->retval = 0;
FINISH_OP(op);
return 2;
}
op->retval = 0;
if (!todo && !add_sync)
{
// Only wait for inflight writes or current in-progress syncs
return 0;
}
blockstore_op_t *sync_op = NULL, *split_stab_op = NULL;
if (add_sync)
{
// Initiate a selective sync for PRIV(op)->sync_(big|small)_writes
sync_op = selective_sync(op);
}
if (bad_vers.size)
{
// Split part of the request into a separate operation
split_stab_op = new blockstore_op_t;
split_stab_op->opcode = op->opcode;
split_stab_op->buf = bad_vers.items;
split_stab_op->len = bad_vers.size;
init_op(split_stab_op);
submit_queue.push_back(split_stab_op);
}
if (sync_op || split_stab_op || good_vers.items)
{
void *orig_buf = op->buf;
if (good_vers.items)
{
op->buf = good_vers.items;
op->len = good_vers.size;
}
// Make a wrapped callback
int *split_op_counter = (int*)malloc_or_die(sizeof(int));
*split_op_counter = (sync_op ? 1 : 0) + (split_stab_op ? 1 : 0) + (todo ? 1 : 0);
auto cb = [this, op, good_items = good_vers.items,
bad_items = bad_vers.items, split_op_counter,
orig_buf, real_cb = op->callback](blockstore_op_t *split_op)
{
if (split_op->retval != 0)
op->retval = split_op->retval;
(*split_op_counter)--;
assert((*split_op_counter) >= 0);
if (op != split_op)
delete split_op;
if (!*split_op_counter)
{
free(good_items);
free(bad_items);
free(split_op_counter);
op->buf = orig_buf;
real_cb(op);
}
};
if (sync_op)
{
sync_op->callback = cb;
}
if (split_stab_op)
{
split_stab_op->callback = cb;
}
op->callback = cb;
}
if (!todo)
{
// All work is postponed
op->callback = NULL;
return 2;
}
return 1;
}
int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
{
if (PRIV(op)->op_state)
{
return continue_stable(op);
}
int r = split_stab_op(op, [this](obj_ver_id ov)
obj_ver_id* v;
int i, todo = 0;
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{
auto dirty_it = dirty_db.find(ov);
auto dirty_it = dirty_db.find(*v);
if (dirty_it == dirty_db.end())
{
auto & clean_db = clean_db_shard(ov.oid);
auto clean_it = clean_db.find(ov.oid);
if (clean_it == clean_db.end() || clean_it->second.version < ov.version)
auto & clean_db = clean_db_shard(v->oid);
auto clean_it = clean_db.find(v->oid);
if (clean_it == clean_db.end() || clean_it->second.version < v->version)
{
// No such object version
printf("Error: %lx:%lx v%lu not found while stabilizing\n", ov.oid.inode, ov.oid.stripe, ov.version);
return -ENOENT;
op->retval = -ENOENT;
FINISH_OP(op);
return 2;
}
else
{
// Already stable
return STAB_SPLIT_DONE;
}
}
else if (IS_IN_FLIGHT(dirty_it->second.state))
{
// Object write is still in progress. Wait until the write request completes
return STAB_SPLIT_WAIT;
return 0;
}
else if (!IS_SYNCED(dirty_it->second.state))
{
// Object not synced yet - sync it
// In previous versions we returned EBUSY here and required
// the caller (OSD) to issue a global sync first. But a global sync
// waits for all writes in the queue including inflight writes. And
// inflight writes may themselves be blocked by unstable writes being
// still present in the journal and not flushed away from it.
// So we must sync specific objects here.
//
// Even more, we have to process "stabilize" request in parts. That is,
// we must stabilize all objects which are already synced. Otherwise
// they may block objects which are NOT synced yet.
return STAB_SPLIT_SYNC;
// Object not synced yet. Caller must sync it first
op->retval = -EBUSY;
FINISH_OP(op);
return 2;
}
else if (IS_STABLE(dirty_it->second.state))
else if (!IS_STABLE(dirty_it->second.state))
{
// Already stable
return STAB_SPLIT_DONE;
todo++;
}
else
{
return STAB_SPLIT_TODO;
}
});
if (r != 1)
}
if (!todo)
{
return r;
// Already stable
op->retval = 0;
FINISH_OP(op);
return 2;
}
// Check journal space
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, op->len, sizeof(journal_entry_stable), 0))
if (!space_check.check_available(op, todo, sizeof(journal_entry_stable), 0))
{
return 0;
}
@@ -351,9 +102,9 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write);
// Prepare and submit journal entries
int s = 0;
auto v = (obj_ver_id*)op->buf;
for (int i = 0; i < op->len; i++, v++)
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{
// FIXME: Only stabilize versions that aren't stable yet
if (!journal.entry_fits(sizeof(journal_entry_stable)) &&
journal.sector_info[journal.cur_sector].dirty)
{


@@ -12,7 +12,7 @@
#define SYNC_JOURNAL_SYNC_SENT 7
#define SYNC_DONE 8
int blockstore_impl_t::continue_sync(blockstore_op_t *op)
int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_progress_sync)
{
if (immediate_commit == IMMEDIATE_ALL)
{
@@ -145,7 +145,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
PRIV(op)->op_state = SYNC_DONE;
}
}
if (PRIV(op)->op_state == SYNC_DONE)
if (PRIV(op)->op_state == SYNC_DONE && !queue_has_in_progress_sync)
{
ack_sync(op);
return 2;


@@ -278,7 +278,7 @@ struct rm_osd_t
if (rsp["response_delete_range"]["deleted"].uint64_value() > 0)
{
// Wait for mon_change_timeout before updating PG history, or the monitor's change will likely interfere with ours
retry_wait = parent->cli->config["mon_change_timeout"].uint64_value();
retry_wait = parent->cli->merged_config["mon_change_timeout"].uint64_value();
if (!retry_wait)
retry_wait = 1000;
retry_wait += etcd_tx_retry_ms;


@@ -198,9 +198,9 @@ resume_2:
}
pgs_by_state_str += std::to_string(kv.second)+" "+kv.first;
}
bool readonly = json_is_true(parent->cli->config["readonly"]);
bool no_recovery = json_is_true(parent->cli->config["no_recovery"]);
bool no_rebalance = json_is_true(parent->cli->config["no_rebalance"]);
bool readonly = json_is_true(parent->cli->merged_config["readonly"]);
bool no_recovery = json_is_true(parent->cli->merged_config["no_recovery"]);
bool no_rebalance = json_is_true(parent->cli->merged_config["no_rebalance"]);
if (parent->json_output)
{
// JSON output


@@ -18,12 +18,11 @@
cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
{
cli_config = config.object_items();
file_config = osd_messenger_t::read_config(config);
config = osd_messenger_t::merge_configs(cli_config, file_config, etcd_global_config, {});
config = osd_messenger_t::read_config(config);
this->ringloop = ringloop;
this->tfd = tfd;
this->config = config;
msgr.osd_num = 0;
msgr.tfd = tfd;
@@ -59,7 +58,7 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
msgr.stop_client(op->peer_fd);
delete op;
};
msgr.parse_config(config);
msgr.parse_config(this->config);
st_cli.tfd = tfd;
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
@@ -277,10 +276,13 @@ restart:
continuing_ops = 0;
}
void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_config)
void cluster_client_t::on_load_config_hook(json11::Json::object & config)
{
this->etcd_global_config = etcd_global_config;
config = osd_messenger_t::merge_configs(cli_config, file_config, etcd_global_config, {});
this->merged_config = config;
for (auto & kv: this->config.object_items())
{
this->merged_config[kv.first] = kv.second;
}
if (config.find("client_max_dirty_bytes") != config.end())
{
client_max_dirty_bytes = config["client_max_dirty_bytes"].uint64_value();
@@ -290,13 +292,14 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_co
// Old name
client_max_dirty_bytes = config["client_dirty_limit"].uint64_value();
}
else
client_max_dirty_bytes = 0;
if (config.find("client_max_dirty_ops") != config.end())
{
client_max_dirty_ops = config["client_max_dirty_ops"].uint64_value();
}
if (!client_max_dirty_bytes)
{
client_max_dirty_bytes = DEFAULT_CLIENT_MAX_DIRTY_BYTES;
}
client_max_dirty_ops = config["client_max_dirty_ops"].uint64_value();
if (!client_max_dirty_ops)
{
client_max_dirty_ops = DEFAULT_CLIENT_MAX_DIRTY_OPS;
@@ -311,7 +314,7 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_co
up_wait_retry_interval = 50;
}
msgr.parse_config(config);
st_cli.parse_config(config);
msgr.parse_config(this->config);
st_cli.load_pgs();
}
@@ -1118,24 +1121,6 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
if (part->op.reply.hdr.retval != expected)
{
// Operation failed, retry
part->flags |= PART_ERROR;
if (!op->retval || op->retval == -EPIPE)
{
// Don't overwrite other errors with -EPIPE
op->retval = part->op.reply.hdr.retval;
}
int stop_fd = -1;
if (op->retval != -EINTR && op->retval != -EIO)
{
stop_fd = part->op.peer_fd;
fprintf(
stderr, "%s operation failed on OSD %lu: retval=%ld (expected %d), dropping connection\n",
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
);
}
// All next things like timer, continue_sync/rw and stop_client may affect the operation again
// So do all these things after modifying operation state, otherwise we may hit reenterability bugs
// FIXME postpone such things to set_immediate here to avoid bugs
if (part->op.reply.hdr.retval == -EPIPE)
{
// Mark op->up_wait = true before stopping the client
@@ -1149,17 +1134,20 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
});
}
}
if (op->inflight_count == 0)
if (!op->retval || op->retval == -EPIPE)
{
if (op->opcode == OSD_OP_SYNC)
continue_sync(op);
else
continue_rw(op);
// Don't overwrite other errors with -EPIPE
op->retval = part->op.reply.hdr.retval;
}
if (stop_fd >= 0)
if (op->retval != -EINTR && op->retval != -EIO)
{
msgr.stop_client(stop_fd);
fprintf(
stderr, "%s operation failed on OSD %lu: retval=%ld (expected %d), dropping connection\n",
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
);
msgr.stop_client(part->op.peer_fd);
}
part->flags |= PART_ERROR;
}
else
{
@@ -1173,13 +1161,13 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
copy_part_bitmap(op, part);
op->version = op->parts.size() == 1 ? part->op.reply.rw.version : 0;
}
if (op->inflight_count == 0)
{
if (op->opcode == OSD_OP_SYNC)
continue_sync(op);
else
continue_rw(op);
}
}
if (op->inflight_count == 0)
{
if (op->opcode == OSD_OP_SYNC)
continue_sync(op);
else
continue_rw(op);
}
}


@@ -112,8 +112,8 @@ public:
osd_messenger_t msgr;
void init_msgr();
json11::Json::object cli_config, file_config, etcd_global_config;
json11::Json::object config;
json11::Json config;
json11::Json::object merged_config;
cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
~cluster_client_t();


@@ -281,7 +281,7 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
if (je->big_write.size > sizeof(journal_entry_big_write))
{
printf(json ? ",\"bitmap\":\"" : " (bitmap: ");
for (int i = sizeof(journal_entry_big_write); i < je->big_write.size; i++)
for (int i = sizeof(journal_entry_big_write); i < je->small_write.size; i++)
{
printf("%02x", ((uint8_t*)je)[i]);
}


@@ -26,7 +26,7 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v1_t *)>
buf_size = dsk.meta_len;
void *data = memalign_or_die(MEM_ALIGNMENT, buf_size);
lseek64(dsk.meta_fd, dsk.meta_offset, 0);
read_blocking(dsk.meta_fd, data, dsk.meta_block_size);
read_blocking(dsk.meta_fd, data, buf_size);
// Check superblock
blockstore_meta_header_v1_t *hdr = (blockstore_meta_header_v1_t *)data;
if (hdr->zero == 0 &&
@@ -41,11 +41,8 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v1_t *)>
if (buf_size % dsk.meta_block_size)
{
buf_size = 8*dsk.meta_block_size;
void *new_data = memalign_or_die(MEM_ALIGNMENT, buf_size);
memcpy(new_data, data, dsk.meta_block_size);
free(data);
data = new_data;
hdr = (blockstore_meta_header_v1_t *)data;
data = memalign_or_die(MEM_ALIGNMENT, buf_size);
}
}
dsk.bitmap_granularity = hdr->bitmap_granularity;


@@ -18,8 +18,12 @@ etcd_state_client_t::~etcd_state_client_t()
}
watches.clear();
etcd_watches_initialised = -1;
if (ws_keepalive_timer >= 0)
{
tfd->clear_timer(ws_keepalive_timer);
ws_keepalive_timer = -1;
}
#ifndef __MOCK__
stop_ws_keepalive();
if (etcd_watch_ws)
{
http_close(etcd_watch_ws);
@@ -241,7 +245,6 @@ void etcd_state_client_t::parse_config(const json11::Json & config)
if (this->etcd_keepalive_timeout < 30)
this->etcd_keepalive_timeout = 30;
}
auto old_etcd_ws_keepalive_interval = this->etcd_ws_keepalive_interval;
this->etcd_ws_keepalive_interval = config["etcd_ws_keepalive_interval"].uint64_value();
if (this->etcd_ws_keepalive_interval <= 0)
{
@@ -262,13 +265,6 @@ void etcd_state_client_t::parse_config(const json11::Json & config)
{
this->etcd_quick_timeout = 1000;
}
if (this->etcd_ws_keepalive_interval != old_etcd_ws_keepalive_interval && ws_keepalive_timer >= 0)
{
#ifndef __MOCK__
stop_ws_keepalive();
start_ws_keepalive();
#endif
}
}
void etcd_state_client_t::pick_next_etcd()
@@ -482,20 +478,6 @@ void etcd_state_client_t::start_etcd_watcher()
{
on_start_watcher_hook(etcd_watch_ws);
}
start_ws_keepalive();
}
void etcd_state_client_t::stop_ws_keepalive()
{
if (ws_keepalive_timer >= 0)
{
tfd->clear_timer(ws_keepalive_timer);
ws_keepalive_timer = -1;
}
}
void etcd_state_client_t::start_ws_keepalive()
{
if (ws_keepalive_timer < 0)
{
ws_keepalive_timer = tfd->set_timer(etcd_ws_keepalive_interval*1000, true, [this](int)

View File

@@ -132,8 +132,6 @@ public:
void etcd_txn(json11::Json txn, int timeout, int retries, int interval, std::function<void(std::string, json11::Json)> callback);
void etcd_txn_slow(json11::Json txn, std::function<void(std::string, json11::Json)> callback);
void start_etcd_watcher();
void stop_ws_keepalive();
void start_ws_keepalive();
void load_global_config();
void load_pgs();
void parse_state(const etcd_kv_t & kv);

63
src/freelist.cpp Normal file
View File

@@ -0,0 +1,63 @@
// Copyright (c) Vitaliy Filippov, 2023+
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
#include <assert.h>
#include "freelist.h"
uint64_t freelist_allocator_t::alloc(uint64_t data_size)
{
for (int i = 0; i < freelist.size(); i++)
{
if (freelist[i].size >= data_size)
{
uint64_t r = freelist[i].start;
freelist[i].start += data_size;
freelist[i].size -= data_size;
return r;
}
}
return UINT64_MAX;
}
void freelist_allocator_t::free(uint64_t start, uint64_t size)
{
int min = 0, max = freelist.size();
if (max && freelist[freelist.size()-1].start < start)
{
min = max;
}
if (max && freelist[0].start >= start)
{
max = 0;
}
while (max-min > 1)
{
int mid = (min+max)/2;
if (freelist[mid].start >= start)
max = mid;
else
min = mid;
}
// max = the first item where freelist[max].start >= start
if (max > 0 && freelist[max-1].start+freelist[max-1].size >= start)
{
assert(freelist[max-1].start+freelist[max-1].size == start);
freelist[max-1].size += size;
}
else if (max < freelist.size() && freelist[max].start <= size+start)
{
assert(freelist[max].start == size+start);
freelist[max].start -= size;
freelist[max].size += size;
}
else
{
freelist.insert(freelist.begin()+max, (freelist_item_t){ .start = start, .size = size });
max = min; // to skip the if below
}
if (min != max && max < freelist.size() && freelist[max].start == freelist[min].start+freelist[min].size)
{
freelist[min].size += freelist[max].size;
freelist.erase(freelist.begin()+max, freelist.begin()+max+1);
}
}
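
A minimal usage sketch of the allocator above (assuming it is compiled together with the new freelist.h/freelist.cpp): alloc() carves space from the first free range that fits, and free() merges the returned range with adjacent free neighbours, so releasing pieces in any order eventually restores one contiguous region. The new src/test_freelist.cpp further below exercises the same behaviour in more detail.

// Sketch only: standalone usage of freelist_allocator_t, mirroring src/test_freelist.cpp below.
#include <assert.h>
#include "freelist.h"
int main()
{
    freelist_allocator_t alloc;
    alloc.free(0, 0x10000);            // seed the allocator with one 64 KiB region
    uint64_t a = alloc.alloc(0x1000);  // carve 4 KiB from its start
    uint64_t b = alloc.alloc(0x2000);  // and 8 KiB right after it
    assert(a == 0 && b == 0x1000);
    alloc.free(a, 0x1000);             // becomes a separate free range
    alloc.free(b, 0x2000);             // merges with both neighbours
    assert(alloc.alloc(0x10000) == 0); // the full 64 KiB region is contiguous again
    return 0;
}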

23
src/freelist.h Normal file
View File

@@ -0,0 +1,23 @@
// Copyright (c) Vitaliy Filippov, 2023+
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
#pragma once
#include <stdint.h>
#include <vector>
struct freelist_item_t
{
uint64_t start, size;
};
// Really trivial freelist allocator
// Should be fine for remote RDMA memory management because
// most of the time fragmentation shouldn't be an issue as all
// memory regions are short-lived
struct freelist_allocator_t
{
std::vector<freelist_item_t> freelist;
uint64_t alloc(uint64_t data_size);
void free(uint64_t start, uint64_t size);
};

View File

@@ -157,13 +157,16 @@ void osd_messenger_t::parse_config(const json11::Json & config)
this->rdma_max_sge = 128;
this->rdma_max_send = config["rdma_max_send"].uint64_value();
if (!this->rdma_max_send)
this->rdma_max_send = 8;
this->rdma_max_send = 128;
this->rdma_max_recv = config["rdma_max_recv"].uint64_value();
if (!this->rdma_max_recv)
this->rdma_max_recv = 16;
this->rdma_max_msg = config["rdma_max_msg"].uint64_value();
if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
this->rdma_max_msg = 129*1024;
this->rdma_max_recv = 128;
this->rdma_op_slots = config["rdma_op_slots"].uint64_value();
if (!this->rdma_op_slots || this->rdma_op_slots >= 1024*1024)
this->rdma_op_slots = 4096;
this->rdma_op_memory = config["rdma_op_memory"].uint64_value();
if (!this->rdma_op_memory || this->rdma_op_memory >= 1024*1024*1024)
this->rdma_op_memory = 16*1024*1024;
#endif
this->receive_buffer_size = (uint32_t)config["tcp_header_buffer_size"].uint64_value();
if (!this->receive_buffer_size || this->receive_buffer_size > 1024*1024*1024)
@@ -388,12 +391,16 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
#ifdef WITH_RDMA
if (rdma_context)
{
cl->rdma_conn = msgr_rdma_connection_t::create(rdma_context, rdma_max_send, rdma_max_recv, rdma_max_sge, rdma_max_msg);
cl->rdma_conn = msgr_rdma_connection_t::create(rdma_context, rdma_max_send, rdma_max_recv, rdma_max_sge, rdma_op_slots, rdma_op_memory);
if (cl->rdma_conn)
{
clients_by_qp[cl->rdma_conn->qp->qp_num] = cl->peer_fd;
json11::Json payload = json11::Json::object {
{ "connect_rdma", cl->rdma_conn->addr.to_string() },
{ "rdma_max_msg", cl->rdma_conn->max_msg },
{ "rdma_data_rkey", (uint64_t)cl->rdma_conn->in_data_mr->rkey },
{ "rdma_op_rkey", (uint64_t)cl->rdma_conn->in_op_mr->rkey },
{ "rdma_op_slots", cl->rdma_conn->op_slots },
{ "rdma_op_memory", cl->rdma_conn->op_memory },
};
std::string payload_str = payload.dump();
op->req.show_conf.json_len = payload_str.size();
@@ -453,12 +460,14 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
{
msgr_rdma_address_t addr;
if (!msgr_rdma_address_t::from_string(config["rdma_address"].string_value().c_str(), &addr) ||
config["rdma_op_memory"].uint64_value() == 0 ||
cl->rdma_conn->connect(&addr) != 0)
{
fprintf(
stderr, "Failed to connect to OSD %lu (address %s) using RDMA\n",
cl->osd_num, config["rdma_address"].string_value().c_str()
);
clients_by_qp.erase(cl->rdma_conn->qp->qp_num);
delete cl->rdma_conn;
cl->rdma_conn = NULL;
// FIXME: Keep TCP connection in this case
@@ -470,11 +479,12 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
}
else
{
uint64_t server_max_msg = config["rdma_max_msg"].uint64_value();
if (cl->rdma_conn->max_msg > server_max_msg)
{
cl->rdma_conn->max_msg = server_max_msg;
}
cl->rdma_conn->set_out_capacity(
config["rdma_data_rkey"].uint64_value(),
config["rdma_op_rkey"].uint64_value(),
config["rdma_op_slots"].uint64_value(),
config["rdma_op_memory"].uint64_value()
);
if (log_level > 0)
{
fprintf(stderr, "Connected to OSD %lu using RDMA\n", cl->osd_num);
@@ -534,9 +544,8 @@ bool osd_messenger_t::is_rdma_enabled()
}
#endif
json11::Json::object osd_messenger_t::read_config(const json11::Json & config)
json11::Json osd_messenger_t::read_config(const json11::Json & config)
{
json11::Json::object file_config;
const char *config_path = config["config_path"].string_value() != ""
? config["config_path"].string_value().c_str() : VITASTOR_CONFIG_PATH;
int fd = open(config_path, O_RDONLY);
@@ -544,14 +553,14 @@ json11::Json::object osd_messenger_t::read_config(const json11::Json & config)
{
if (errno != ENOENT)
fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno));
return file_config;
return config;
}
struct stat st;
if (fstat(fd, &st) != 0)
{
fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno));
close(fd);
return file_config;
return config;
}
std::string buf;
buf.resize(st.st_size);
@@ -563,125 +572,23 @@ json11::Json::object osd_messenger_t::read_config(const json11::Json & config)
{
fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno));
close(fd);
return file_config;
return config;
}
done += r;
}
close(fd);
std::string json_err;
file_config = json11::Json::parse(buf, json_err).object_items();
json11::Json::object file_config = json11::Json::parse(buf, json_err).object_items();
if (json_err != "")
{
fprintf(stderr, "Invalid JSON in %s: %s\n", config_path, json_err.c_str());
return config;
}
file_config.erase("config_path");
file_config.erase("osd_num");
for (auto kv: config.object_items())
{
file_config[kv.first] = kv.second;
}
return file_config;
}
static const char* cli_only_params[] = {
// The list has to be sorted
"bitmap_granularity",
"block_size",
"data_device",
"data_offset",
"data_size",
"disable_data_fsync",
"disable_device_lock",
"disable_journal_fsync",
"disable_meta_fsync",
"disk_alignment",
"flush_journal",
"immediate_commit",
"inmemory_journal",
"inmemory_metadata",
"journal_block_size",
"journal_device",
"journal_no_same_sector_overwrites",
"journal_offset",
"journal_sector_buffer_count",
"journal_size",
"meta_block_size",
"meta_buf_size",
"meta_device",
"meta_offset",
"osd_num",
"readonly",
};
static const char **cli_only_end = cli_only_params + (sizeof(cli_only_params)/sizeof(cli_only_params[0]));
static const char* local_only_params[] = {
// The list has to be sorted
"config_path",
"rdma_device",
"rdma_gid_index",
"rdma_max_msg",
"rdma_max_recv",
"rdma_max_send",
"rdma_max_sge",
"rdma_mtu",
"rdma_port_num",
"tcp_header_buffer_size",
"use_rdma",
"use_sync_send_recv",
};
static const char **local_only_end = local_only_params + (sizeof(local_only_params)/sizeof(local_only_params[0]));
// Basically could be replaced by std::lower_bound()...
static int find_str_array(const char **start, const char **end, const std::string & s)
{
int min = 0, max = end-start;
while (max-min >= 2)
{
int mid = (min+max)/2;
int r = strcmp(s.c_str(), start[mid]);
if (r < 0)
max = mid;
else if (r > 0)
min = mid;
else
return mid;
}
if (min < end-start && !strcmp(s.c_str(), start[min]))
return min;
return -1;
}
json11::Json::object osd_messenger_t::merge_configs(const json11::Json::object & cli_config,
const json11::Json::object & file_config,
const json11::Json::object & etcd_global_config,
const json11::Json::object & etcd_osd_config)
{
// Priority: most important -> less important:
// etcd_osd_config -> cli_config -> etcd_global_config -> file_config
json11::Json::object res = file_config;
for (auto & kv: file_config)
{
int cli_only = find_str_array(cli_only_params, cli_only_end, kv.first);
if (cli_only < 0)
{
res[kv.first] = kv.second;
}
}
for (auto & kv: etcd_global_config)
{
int local_only = find_str_array(local_only_params, local_only_end, kv.first);
if (local_only < 0)
{
res[kv.first] = kv.second;
}
}
for (auto & kv: cli_config)
{
res[kv.first] = kv.second;
}
for (auto & kv: etcd_osd_config)
{
int local_only = find_str_array(local_only_params, local_only_end, kv.first);
if (local_only < 0)
{
res[kv.first] = kv.second;
}
}
return res;
}

View File

@@ -37,6 +37,7 @@
#define MSGR_SENDP_HDR 1
#define MSGR_SENDP_FREE 2
#define MSGR_SENDP_LAST 4
struct msgr_sendp_t
{
@@ -131,9 +132,10 @@ protected:
bool use_rdma = true;
std::string rdma_device;
uint64_t rdma_port_num = 1, rdma_gid_index = 0, rdma_mtu = 0;
msgr_rdma_context_t *rdma_context = NULL;
uint64_t rdma_max_sge = 0, rdma_max_send = 0, rdma_max_recv = 0;
uint64_t rdma_max_msg = 0;
uint64_t rdma_op_slots = 0, rdma_op_memory = 0;
msgr_rdma_context_t *rdma_context = NULL;
std::map<uint32_t, int> clients_by_qp;
#endif
std::vector<int> read_ready_clients;
@@ -166,15 +168,12 @@ public:
void accept_connections(int listen_fd);
~osd_messenger_t();
static json11::Json::object read_config(const json11::Json & config);
static json11::Json::object merge_configs(const json11::Json::object & cli_config,
const json11::Json::object & file_config,
const json11::Json::object & etcd_global_config,
const json11::Json::object & etcd_osd_config);
static json11::Json read_config(const json11::Json & config);
#ifdef WITH_RDMA
bool is_rdma_enabled();
bool connect_rdma(int peer_fd, std::string rdma_address, uint64_t client_max_msg);
bool connect_rdma(int peer_fd, std::string rdma_address,
uint32_t out_data_rkey, uint32_t out_op_rkey, uint64_t out_op_slots, uint64_t out_op_memory);
#endif
protected:
@@ -195,12 +194,13 @@ protected:
bool handle_read_buffer(osd_client_t *cl, void *curbuf, int remain);
bool handle_finished_read(osd_client_t *cl);
void handle_op_hdr(osd_client_t *cl);
bool handle_reply_hdr(osd_client_t *cl);
bool handle_reply_hdr(void *reply_hdr, osd_client_t *cl);
void handle_reply_ready(osd_op_t *op);
#ifdef WITH_RDMA
bool try_send_rdma(osd_client_t *cl);
bool try_recv_rdma(osd_client_t *cl);
void handle_rdma_events();
bool rdma_handle_op(osd_client_t *cl, uint32_t op_slot);
#endif
};

View File

@@ -43,15 +43,7 @@ void osd_messenger_t::send_replies()
{
}
json11::Json::object osd_messenger_t::read_config(const json11::Json & config)
json11::Json osd_messenger_t::read_config(const json11::Json & config)
{
return json11::Json::object();
}
json11::Json::object osd_messenger_t::merge_configs(const json11::Json::object & cli_config,
const json11::Json::object & file_config,
const json11::Json::object & etcd_global_config,
const json11::Json::object & etcd_osd_config)
{
return cli_config;
return config;
}

View File

@@ -46,9 +46,20 @@ msgr_rdma_connection_t::~msgr_rdma_connection_t()
ctx->used_max_cqe -= max_send+max_recv;
if (qp)
ibv_destroy_qp(qp);
if (recv_buffers.size())
for (auto b: recv_buffers)
free(b);
if (in_data_mr)
ibv_dereg_mr(in_data_mr);
if (in_op_mr)
ibv_dereg_mr(in_op_mr);
if (in_data_buf)
free(in_data_buf);
if (in_ops)
free(in_ops);
if (out_op_alloc)
delete out_op_alloc;
if (out_slot_data)
free(out_slot_data);
if (out_slot_ops)
free(out_slot_ops);
}
msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, int log_level)
@@ -149,7 +160,7 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
if (!ctx->mr)
{
fprintf(stderr, "Couldn't register RDMA memory region\n");
fprintf(stderr, "Couldn't register global RDMA memory region: %s\n", strerror(errno));
goto cleanup;
}
@@ -180,7 +191,7 @@ cleanup:
}
msgr_rdma_connection_t *msgr_rdma_connection_t::create(msgr_rdma_context_t *ctx, uint32_t max_send,
uint32_t max_recv, uint32_t max_sge, uint32_t max_msg)
uint32_t max_recv, uint32_t max_sge, uint64_t op_slots, uint64_t op_memory)
{
msgr_rdma_connection_t *conn = new msgr_rdma_connection_t;
@@ -190,7 +201,6 @@ msgr_rdma_connection_t *msgr_rdma_connection_t::create(msgr_rdma_context_t *ctx,
conn->max_send = max_send;
conn->max_recv = max_recv;
conn->max_sge = max_sge;
conn->max_msg = max_msg;
ctx->used_max_cqe += max_send+max_recv;
if (ctx->used_max_cqe > ctx->max_cqe)
@@ -211,6 +221,30 @@ msgr_rdma_connection_t *msgr_rdma_connection_t::create(msgr_rdma_context_t *ctx,
ctx->max_cqe = new_max_cqe;
}
conn->op_memory = op_memory;
conn->in_data_buf = memalign_or_die(MEM_ALIGNMENT, op_memory);
conn->in_data_mr = ibv_reg_mr(ctx->pd, conn->in_data_buf, op_memory,
IBV_ACCESS_ZERO_BASED | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_ON_DEMAND);
if (!conn->in_data_mr)
{
fprintf(stderr, "Couldn't register %lu MB RDMA memory region for incoming data: %s\n",
(op_memory+1024*1024-1)/1024/1024, strerror(errno));
delete conn;
return NULL;
}
conn->op_slots = op_slots;
conn->in_op_cap = op_slots;
conn->in_ops = (msgr_rdma_cmd_t *)malloc_or_die(sizeof(msgr_rdma_cmd_t) * op_slots);
conn->in_op_mr = ibv_reg_mr(ctx->pd, conn->in_ops, sizeof(msgr_rdma_cmd_t) * op_slots,
IBV_ACCESS_ZERO_BASED | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_ON_DEMAND);
if (!conn->in_op_mr)
{
fprintf(stderr, "Couldn't register %lu KB RDMA memory region for incoming operation headers: %s\n",
(sizeof(msgr_rdma_cmd_t) * op_slots + 1023)/1024, strerror(errno));
delete conn;
return NULL;
}
ibv_qp_init_attr init_attr = {
.send_cq = ctx->cq,
.recv_cq = ctx->cq,
@@ -237,7 +271,7 @@ msgr_rdma_connection_t *msgr_rdma_connection_t::create(msgr_rdma_context_t *ctx,
ibv_qp_attr attr = {
.qp_state = IBV_QPS_INIT,
.qp_access_flags = 0,
.qp_access_flags = IBV_ACCESS_REMOTE_WRITE,
.pkey_index = 0,
.port_num = ctx->ib_port,
};
@@ -265,6 +299,19 @@ static ibv_mtu mtu_to_ibv_mtu(uint32_t mtu)
return IBV_MTU_4096;
}
void msgr_rdma_connection_t::set_out_capacity(uint32_t out_data_rkey, uint32_t out_op_rkey, uint64_t out_op_slots, uint64_t out_op_memory)
{
assert(!out_op_alloc);
this->out_data_rkey = out_data_rkey;
this->out_op_rkey = out_op_rkey;
this->out_op_slots = out_op_slots;
this->out_op_memory = out_op_memory;
out_op_alloc = new allocator(out_op_slots);
out_data_alloc.free(0, out_op_memory);
out_slot_data = (msgr_rdma_out_pos_t *)malloc_or_die(sizeof(msgr_rdma_out_pos_t) * out_op_slots);
out_slot_ops = (osd_op_t **)malloc_or_die(sizeof(osd_op_t *) * out_op_slots);
}
int msgr_rdma_connection_t::connect(msgr_rdma_address_t *dest)
{
auto conn = this;
@@ -311,17 +358,14 @@ int msgr_rdma_connection_t::connect(msgr_rdma_address_t *dest)
return 0;
}
bool osd_messenger_t::connect_rdma(int peer_fd, std::string rdma_address, uint64_t client_max_msg)
bool osd_messenger_t::connect_rdma(int peer_fd, std::string rdma_address,
uint32_t out_data_rkey, uint32_t out_op_rkey, uint64_t out_op_slots, uint64_t out_op_memory)
{
// Try to connect to the peer using RDMA
msgr_rdma_address_t addr;
if (msgr_rdma_address_t::from_string(rdma_address.c_str(), &addr))
{
if (client_max_msg > rdma_max_msg)
{
client_max_msg = rdma_max_msg;
}
auto rdma_conn = msgr_rdma_connection_t::create(rdma_context, rdma_max_send, rdma_max_recv, rdma_max_sge, client_max_msg);
auto rdma_conn = msgr_rdma_connection_t::create(rdma_context, rdma_max_send, rdma_max_recv, rdma_max_sge, rdma_op_slots, rdma_op_memory);
if (rdma_conn)
{
int r = rdma_conn->connect(&addr);
@@ -336,6 +380,8 @@ bool osd_messenger_t::connect_rdma(int peer_fd, std::string rdma_address, uint64
else
{
// Remember connection, but switch to RDMA only after sending the configuration response
clients_by_qp[rdma_conn->qp->qp_num] = peer_fd;
rdma_conn->set_out_capacity(out_data_rkey, out_op_rkey, out_op_slots, out_op_memory);
auto cl = clients.at(peer_fd);
cl->rdma_conn = rdma_conn;
cl->peer_state = PEER_RDMA_CONNECTING;
@@ -346,83 +392,172 @@ bool osd_messenger_t::connect_rdma(int peer_fd, std::string rdma_address, uint64
return false;
}
static void try_send_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
{
ibv_send_wr *bad_wr = NULL;
ibv_send_wr wr = {
.wr_id = (uint64_t)(cl->peer_fd*2+1),
.sg_list = sge,
.num_sge = op_sge,
.opcode = IBV_WR_SEND,
.send_flags = IBV_SEND_SIGNALED,
};
int err = ibv_post_send(cl->rdma_conn->qp, &wr, &bad_wr);
if (err || bad_wr)
{
fprintf(stderr, "RDMA send failed: %s\n", strerror(err));
exit(1);
}
cl->rdma_conn->cur_send++;
}
bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
{
auto rc = cl->rdma_conn;
if (!cl->send_list.size() && !rc->in_slots_freed.size() || rc->cur_send >= rc->max_send)
{
return true;
}
int i = 0;
while (i < rc->in_slots_freed.size())
{
auto op_slot = rc->in_slots_freed[i++];
assert(op_slot < 0x80000000);
ibv_send_wr *bad_wr = NULL;
ibv_send_wr wr = {
.wr_id = 0,
.opcode = IBV_WR_RDMA_WRITE_WITH_IMM,
.imm_data = 0x80000000 | op_slot,
};
int err = ibv_post_send(cl->rdma_conn->qp, &wr, &bad_wr);
if (err || bad_wr)
{
fprintf(stderr, "RDMA send failed: %s\n", strerror(err));
exit(1);
}
rc->cur_send++;
if (rc->cur_send >= rc->max_send)
{
break;
}
}
rc->in_slots_freed.erase(rc->in_slots_freed.begin(), rc->in_slots_freed.begin()+i);
if (!cl->send_list.size() || rc->cur_send >= rc->max_send)
{
return true;
}
uint64_t op_size = 0, op_sge = 0;
ibv_sge sge[rc->max_sge];
while (rc->send_pos < cl->send_list.size())
int op_start = 0;
while (op_start < cl->send_list.size())
{
iovec & iov = cl->send_list[rc->send_pos];
if (op_size >= rc->max_msg || op_sge >= rc->max_sge)
uint64_t op_data_size = 0;
int op_end = op_start;
while (!(cl->outbox[op_end].flags & MSGR_SENDP_LAST))
{
rc->send_sizes.push_back(op_size);
try_send_rdma_wr(cl, sge, op_sge);
op_sge = 0;
op_size = 0;
if (rc->cur_send >= rc->max_send)
op_data_size += cl->send_list[op_end].iov_len;
op_end++;
}
op_data_size += cl->send_list[op_end].iov_len;
op_end++;
op_data_size -= cl->send_list[op_start].iov_len;
// Operation boundaries in send_list: op_start..op_end, first iovec is the header
uint64_t op_slot = rc->out_op_alloc->find_free();
if (op_slot == UINT64_MAX)
{
// op queue is full
return true;
}
uint64_t data_pos = UINT64_MAX;
if (op_data_size > 0)
{
if (rc->cur_send > rc->max_send-1-(op_end-op_start-1+rc->max_sge)/rc->max_sge)
{
break;
// RDMA queue is full
return true;
}
// FIXME: Oops, and what if op data is larger than the whole buffer... :)
data_pos = rc->out_data_alloc.alloc(op_data_size);
if (data_pos == UINT64_MAX)
{
// data buffers are full
return true;
}
int cur_sge = 0;
uint64_t data_posted = 0, wr_size = 0;
for (int data_sent = op_start+1; data_sent < op_end; data_sent++)
{
sge[cur_sge++] = {
.addr = (uintptr_t)cl->send_list[data_sent].iov_base,
.length = (uint32_t)cl->send_list[data_sent].iov_len,
.lkey = rc->ctx->mr->lkey,
};
wr_size += cl->send_list[data_sent].iov_len;
if (data_sent == op_end-1 || cur_sge >= rc->max_sge)
{
ibv_send_wr *bad_wr = NULL;
ibv_send_wr wr = {
.wr_id = op_slot,
.next = NULL,
.sg_list = sge,
.num_sge = cur_sge,
.opcode = IBV_WR_RDMA_WRITE,
.send_flags = 0,
.wr = {
.rdma = {
// continue right after the bytes already posted for this operation
.remote_addr = data_pos + data_posted,
.rkey = rc->out_data_rkey,
},
},
};
int err = ibv_post_send(cl->rdma_conn->qp, &wr, &bad_wr);
if (err || bad_wr)
{
fprintf(stderr, "RDMA send failed: %s\n", strerror(err));
exit(1);
}
rc->cur_send++;
data_posted += wr_size;
wr_size = 0;
cur_sge = 0;
}
}
}
uint32_t len = (uint32_t)(op_size+iov.iov_len-rc->send_buf_pos < rc->max_msg
? iov.iov_len-rc->send_buf_pos : rc->max_msg-op_size);
sge[op_sge++] = {
.addr = (uintptr_t)((uint8_t*)iov.iov_base+rc->send_buf_pos),
.length = len,
if (rc->cur_send > rc->max_send-1)
{
// RDMA queue is full
return true;
}
rc->out_op_alloc->set(op_slot, true);
assert(cl->send_list[op_start].iov_len == OSD_PACKET_SIZE);
sge[0] = {
.addr = (uintptr_t)cl->send_list[op_start].iov_base,
.length = (uint32_t)cl->send_list[op_start].iov_len,
.lkey = rc->ctx->mr->lkey,
};
op_size += len;
rc->send_buf_pos += len;
if (rc->send_buf_pos >= iov.iov_len)
rc->out_slot_data[op_slot] = { .data_pos = data_pos, .data_size = op_data_size };
rc->out_slot_ops[op_slot] = (cl->outbox[op_end-1].flags & MSGR_SENDP_FREE)
? cl->outbox[op_end-1].op : NULL;
sge[1] = {
.addr = (uintptr_t)(rc->out_slot_data+op_slot),
.length = sizeof(rc->out_slot_data[op_slot]),
.lkey = rc->ctx->mr->lkey,
};
ibv_send_wr *bad_wr = NULL;
ibv_send_wr wr = {
.wr_id = op_slot,
.next = NULL,
.sg_list = sge,
.num_sge = 2,
.opcode = IBV_WR_RDMA_WRITE_WITH_IMM,
.send_flags = IBV_SEND_SIGNALED,
.imm_data = (uint32_t)op_slot,
.wr = {
.rdma = {
.remote_addr = op_slot*sizeof(msgr_rdma_cmd_t),
.rkey = rc->out_op_rkey,
},
},
};
int err = ibv_post_send(cl->rdma_conn->qp, &wr, &bad_wr);
if (err || bad_wr)
{
rc->send_pos++;
rc->send_buf_pos = 0;
fprintf(stderr, "RDMA send failed: %s\n", strerror(err));
exit(1);
}
rc->cur_send++;
op_start = op_end;
}
if (op_sge > 0)
if (op_start > 0)
{
rc->send_sizes.push_back(op_size);
try_send_rdma_wr(cl, sge, op_sge);
cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+op_start);
cl->outbox.erase(cl->outbox.begin(), cl->outbox.begin()+op_start);
}
return true;
}
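
try_send_rdma above relies on the new MSGR_SENDP_LAST flag (set in outbox_push further below) to find where each queued operation ends in send_list: the first iovec of an operation is its fixed-size header, everything up to and including the flagged entry belongs to the same operation, and only the non-header iovecs count as payload. A standalone sketch of just that boundary scan, with simplified stand-in types instead of the real messenger structures:

// Illustrative only: simplified stand-ins for the messenger's outbox entries; the 128-byte
// header size below is just an example value.
#include <stdint.h>
#include <stdio.h>
#include <sys/uio.h>
#include <utility>
#include <vector>

#define MSGR_SENDP_LAST 4

struct sendp_stub_t { int flags; };

// Returns (one-past-last iovec index, payload bytes) for every queued operation
static std::vector<std::pair<int, uint64_t>> scan_ops(const std::vector<iovec> & send_list,
    const std::vector<sendp_stub_t> & outbox)
{
    std::vector<std::pair<int, uint64_t>> ops;
    int op_start = 0;
    while (op_start < (int)send_list.size())
    {
        uint64_t op_data_size = 0;
        int op_end = op_start;
        while (!(outbox[op_end].flags & MSGR_SENDP_LAST))
            op_data_size += send_list[op_end++].iov_len;
        op_data_size += send_list[op_end++].iov_len;
        // the first iovec of every operation is its header, not payload
        op_data_size -= send_list[op_start].iov_len;
        ops.push_back({ op_end, op_data_size });
        op_start = op_end;
    }
    return ops;
}

int main()
{
    char hdr[128], data[4096];
    // one header-only operation followed by one operation with a 4 KiB payload
    std::vector<iovec> send_list = {
        { .iov_base = hdr, .iov_len = sizeof(hdr) },
        { .iov_base = hdr, .iov_len = sizeof(hdr) },
        { .iov_base = data, .iov_len = sizeof(data) },
    };
    std::vector<sendp_stub_t> outbox = { { MSGR_SENDP_LAST }, { 0 }, { MSGR_SENDP_LAST } };
    for (auto & op: scan_ops(send_list, outbox))
        printf("operation ends at iovec %d with %lu payload bytes\n", op.first, op.second);
    return 0;
}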
static void try_recv_rdma_wr(osd_client_t *cl, void *buf)
static void try_recv_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
{
ibv_sge sge = {
.addr = (uintptr_t)buf,
.length = (uint32_t)cl->rdma_conn->max_msg,
.lkey = cl->rdma_conn->ctx->mr->lkey,
};
ibv_recv_wr *bad_wr = NULL;
ibv_recv_wr wr = {
.wr_id = (uint64_t)(cl->peer_fd*2),
.sg_list = &sge,
.num_sge = 1,
.sg_list = sge,
.num_sge = op_sge,
};
int err = ibv_post_recv(cl->rdma_conn->qp, &wr, &bad_wr);
if (err || bad_wr)
@@ -433,18 +568,87 @@ static void try_recv_rdma_wr(osd_client_t *cl, void *buf)
cl->rdma_conn->cur_recv++;
}
static void copy_data_to_recv_list(uint8_t *data_buf, uint64_t data_size, osd_client_t *cl)
{
uint64_t pos = 0;
while (cl->recv_list.done < cl->recv_list.count)
{
uint64_t cur = cl->recv_list.buf[cl->recv_list.done].iov_len;
assert(cur <= data_size-pos);
memcpy(cl->recv_list.buf[cl->recv_list.done].iov_base, data_buf+pos, cur);
pos += cur;
cl->recv_list.done++;
}
cl->recv_list.reset();
}
bool osd_messenger_t::try_recv_rdma(osd_client_t *cl)
{
auto rc = cl->rdma_conn;
while (rc->cur_recv < rc->max_recv)
{
void *buf = malloc_or_die(rc->max_msg);
rc->recv_buffers.push_back(buf);
try_recv_rdma_wr(cl, buf);
try_recv_rdma_wr(cl, NULL, 0);
}
return true;
}
bool osd_messenger_t::rdma_handle_op(osd_client_t *cl, uint32_t op_slot)
{
auto rc = cl->rdma_conn;
if (op_slot >= rc->in_op_cap)
{
// Invalid incoming index
fprintf(stderr, "Client %d invalid incoming RDMA op slot: %u, dropping connection\n", cl->peer_fd, op_slot);
stop_client(cl->peer_fd);
return false;
}
osd_op_header_t *hdr = (osd_op_header_t *)rc->in_ops[op_slot].header;
uint8_t *data_buf = (uint8_t*)rc->in_data_buf + rc->in_ops[op_slot].pos.data_pos;
uint64_t data_size = rc->in_ops[op_slot].pos.data_size;
if (hdr->magic == SECONDARY_OSD_REPLY_MAGIC)
{
// Reply
if (cl->read_op)
{
delete cl->read_op;
cl->read_op = NULL;
}
if (!handle_reply_hdr(rc->in_ops[op_slot].header, cl))
return false;
if (cl->read_state == CL_READ_REPLY_DATA)
{
// copy reply data to cl->recv_list
copy_data_to_recv_list(data_buf, data_size, cl);
// and handle reply with data
handle_reply_ready(cl->read_op);
cl->read_op = NULL;
cl->read_state = 0;
cl->read_remaining = 0;
}
}
else
{
// Operation
cl->read_op = new osd_op_t;
cl->read_op->peer_fd = cl->peer_fd;
cl->read_op->op_type = OSD_OP_IN;
memcpy(&cl->read_op->req, hdr, OSD_PACKET_SIZE);
handle_op_hdr(cl);
if (cl->read_state == CL_READ_DATA)
{
copy_data_to_recv_list(data_buf, data_size, cl);
// And handle the incoming op with data
cl->received_ops.push_back(cl->read_op);
set_immediate.push_back([this, op = cl->read_op]() { exec_op(op); });
cl->read_op = NULL;
cl->read_state = 0;
}
}
// We don't need the incoming data buffer anymore, notify peer about it
// FIXME: Allow to pass memory to the internal layer without copying and notify after handling it
rc->in_slots_freed.push_back(op_slot);
return true;
}
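
The 32-bit immediate value is the only signalling channel in this scheme: handle_rdma_events below treats a value with the top bit clear as "a new command has been written into in_ops[slot]", and a value with the top bit set (as posted at the top of try_send_rdma) as "the peer has released outgoing slot number slot". A tiny sketch of that encoding, separate from the real code:

// Sketch of the 32-bit immediate-data convention used by this patch (top bit = "slot freed").
#include <assert.h>
#include <stdint.h>

static inline uint32_t imm_new_op(uint32_t op_slot)     { return op_slot; }
static inline uint32_t imm_slot_freed(uint32_t op_slot) { return 0x80000000u | op_slot; }
static inline bool     imm_is_free(uint32_t imm)        { return imm & 0x80000000u; }
static inline uint32_t imm_slot(uint32_t imm)           { return imm & 0x7FFFFFFFu; }

int main()
{
    uint32_t imm = imm_slot_freed(42);
    assert(imm_is_free(imm) && imm_slot(imm) == 42);
    assert(!imm_is_free(imm_new_op(42)) && imm_slot(imm_new_op(42)) == 42);
    return 0;
}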
#define RDMA_EVENTS_AT_ONCE 32
void osd_messenger_t::handle_rdma_events()
@@ -469,80 +673,60 @@ void osd_messenger_t::handle_rdma_events()
event_count = ibv_poll_cq(rdma_context->cq, RDMA_EVENTS_AT_ONCE, wc);
for (int i = 0; i < event_count; i++)
{
int client_id = wc[i].wr_id >> 1;
bool is_send = wc[i].wr_id & 1;
auto cl_it = clients.find(client_id);
auto cqp_it = clients_by_qp.find(wc[i].qp_num);
int peer_fd = cqp_it != clients_by_qp.end() ? cqp_it->second : -1;
auto cl_it = clients.find(peer_fd);
if (cl_it == clients.end())
{
continue;
}
osd_client_t *cl = cl_it->second;
auto rc = cl->rdma_conn;
if (wc[i].status != IBV_WC_SUCCESS)
{
fprintf(stderr, "RDMA work request failed for client %d", client_id);
fprintf(stderr, "RDMA work request failed for client %d", peer_fd);
if (cl->osd_num)
{
fprintf(stderr, " (OSD %lu)", cl->osd_num);
}
fprintf(stderr, " with status: %s, stopping client\n", ibv_wc_status_str(wc[i].status));
stop_client(client_id);
if (peer_fd >= 0)
stop_client(peer_fd);
continue;
}
if (!is_send)
auto rc = cl->rdma_conn;
if (wc[i].opcode == IBV_WC_RDMA_WRITE)
{
rc->cur_recv--;
if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf], wc[i].byte_len))
// Operation or reply is sent, we can free it
auto & op = rc->out_slot_ops[wc[i].wr_id];
if (op)
{
// handle_read_buffer may stop the client
continue;
delete op;
op = NULL;
}
try_recv_rdma_wr(cl, rc->recv_buffers[rc->next_recv_buf]);
rc->next_recv_buf = (rc->next_recv_buf+1) % rc->recv_buffers.size();
}
else
{
rc->cur_send--;
uint64_t sent_size = rc->send_sizes.at(0);
rc->send_sizes.erase(rc->send_sizes.begin(), rc->send_sizes.begin()+1);
int send_pos = 0, send_buf_pos = 0;
while (sent_size > 0)
try_send_rdma(cl);
}
else if (wc[i].opcode == IBV_WC_RECV)
{
if (!(wc[i].imm_data & 0x80000000))
{
if (sent_size >= cl->send_list.at(send_pos).iov_len)
// Operation or reply received. Handle it
if (!rdma_handle_op(cl, wc[i].imm_data))
{
sent_size -= cl->send_list[send_pos].iov_len;
send_pos++;
}
else
{
send_buf_pos = sent_size;
sent_size = 0;
// false means that the client is stopped due to invalid operation
continue;
}
rc->cur_recv--;
try_recv_rdma(cl);
}
assert(rc->send_pos >= send_pos);
if (rc->send_pos == send_pos)
else
{
rc->send_buf_pos -= send_buf_pos;
}
rc->send_pos -= send_pos;
for (int i = 0; i < send_pos; i++)
{
if (cl->outbox[i].flags & MSGR_SENDP_FREE)
{
// Reply fully sent
delete cl->outbox[i].op;
}
}
if (send_pos > 0)
{
cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+send_pos);
cl->outbox.erase(cl->outbox.begin(), cl->outbox.begin()+send_pos);
}
if (send_buf_pos > 0)
{
cl->send_list[0].iov_base = (uint8_t*)cl->send_list[0].iov_base + send_buf_pos;
cl->send_list[0].iov_len -= send_buf_pos;
// Outbox slot is marked as free (the remote side doesn't need it anymore)
uint32_t op_slot = wc[i].imm_data & 0x7FFFFFFF;
auto & pos = rc->out_slot_data[op_slot];
if (pos.data_size > 0)
rc->out_data_alloc.free(pos.data_pos, pos.data_size);
rc->out_op_alloc->set(op_slot, false);
}
// Try to continue sending
try_send_rdma(cl);
}
}

View File

@@ -5,6 +5,11 @@
#include <infiniband/verbs.h>
#include <string>
#include <vector>
#include "allocator.h"
#include "freelist.h"
#include "osd_ops.h"
struct osd_op_t;
struct msgr_rdma_address_t
{
@@ -39,6 +44,17 @@ struct msgr_rdma_context_t
~msgr_rdma_context_t();
};
struct msgr_rdma_out_pos_t
{
uint64_t data_pos, data_size;
};
struct msgr_rdma_cmd_t
{
uint8_t header[OSD_PACKET_SIZE];
msgr_rdma_out_pos_t pos;
};
struct msgr_rdma_connection_t
{
msgr_rdma_context_t *ctx = NULL;
@@ -46,14 +62,24 @@ struct msgr_rdma_connection_t
msgr_rdma_address_t addr;
int max_send = 0, max_recv = 0, max_sge = 0;
int cur_send = 0, cur_recv = 0;
uint64_t max_msg = 0;
uint64_t op_slots = 0, op_memory = 0;
int send_pos = 0, send_buf_pos = 0;
int next_recv_buf = 0;
std::vector<void*> recv_buffers;
std::vector<uint64_t> send_sizes;
ibv_mr *in_data_mr = NULL, *in_op_mr = NULL;
msgr_rdma_cmd_t *in_ops = NULL;
int in_op_cap = 0;
void *in_data_buf = NULL;
std::vector<uint32_t> in_slots_freed;
uint32_t out_data_rkey = 0, out_op_rkey = 0;
uint64_t out_op_slots = 0, out_op_memory = 0;
allocator *out_op_alloc = NULL;
freelist_allocator_t out_data_alloc;
msgr_rdma_out_pos_t *out_slot_data = NULL;
osd_op_t **out_slot_ops = NULL;
~msgr_rdma_connection_t();
static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge, uint32_t max_msg);
static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send,
uint32_t max_recv, uint32_t max_sge, uint64_t op_slots, uint64_t op_memory);
int connect(msgr_rdma_address_t *dest);
void set_out_capacity(uint32_t out_data_rkey, uint32_t out_op_rkey, uint64_t out_op_slots, uint64_t out_op_memory);
};
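
For scale: with the defaults from osd_messenger_t::parse_config above (rdma_op_slots = 4096, rdma_op_memory = 16 MiB), every connection registers one op_memory-sized region for incoming data plus op_slots * sizeof(msgr_rdma_cmd_t) bytes for incoming headers. A back-of-the-envelope sketch, assuming OSD_PACKET_SIZE is 128 bytes (that constant is not part of this diff):

// Rough per-connection size of the two receive-side regions registered in
// msgr_rdma_connection_t::create(); OSD_PACKET_SIZE = 128 is an assumption here.
#include <stdint.h>
#include <stdio.h>

int main()
{
    const uint64_t OSD_PACKET_SIZE = 128;                     // assumed header size
    const uint64_t op_slots  = 4096;                          // rdma_op_slots default
    const uint64_t op_memory = 16*1024*1024;                  // rdma_op_memory default
    uint64_t cmd_size = OSD_PACKET_SIZE + 2*sizeof(uint64_t); // header + msgr_rdma_out_pos_t
    printf("in_data_buf: %lu MiB, in_ops: %lu KiB per connection\n",
        op_memory/1024/1024, op_slots*cmd_size/1024);
    return 0;
}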

View File

@@ -172,7 +172,7 @@ bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
if (cl->read_state == CL_READ_HDR)
{
if (cl->read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC)
return handle_reply_hdr(cl);
return handle_reply_hdr(cl->read_op->req.buf, cl);
else if (cl->read_op->req.hdr.magic == SECONDARY_OSD_OP_MAGIC)
handle_op_hdr(cl);
else
@@ -286,7 +286,7 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
}
}
bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
bool osd_messenger_t::handle_reply_hdr(void *reply_hdr, osd_client_t *cl)
{
auto req_it = cl->sent_ops.find(((osd_any_reply_t*)reply_hdr)->hdr.id);
if (req_it == cl->sent_ops.end())
@@ -297,7 +297,7 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
return false;
}
osd_op_t *op = req_it->second;
memcpy(op->reply.buf, cl->read_op->req.buf, OSD_PACKET_SIZE);
memcpy(op->reply.buf, reply_hdr, OSD_PACKET_SIZE);
cl->sent_ops.erase(req_it);
if (op->reply.hdr.opcode == OSD_OP_SEC_READ || op->reply.hdr.opcode == OSD_OP_READ)
{
@@ -328,14 +328,16 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
{
goto reuse;
}
delete cl->read_op;
if (cl->read_op)
delete cl->read_op;
cl->read_op = op;
cl->read_state = CL_READ_REPLY_DATA;
}
else if (op->reply.hdr.opcode == OSD_OP_SEC_LIST && op->reply.hdr.retval > 0)
{
assert(!op->iov.count);
delete cl->read_op;
if (cl->read_op)
delete cl->read_op;
cl->read_op = op;
cl->read_state = CL_READ_REPLY_DATA;
cl->read_remaining = sizeof(obj_ver_id) * op->reply.hdr.retval;
@@ -345,7 +347,8 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
else if (op->reply.hdr.opcode == OSD_OP_SEC_READ_BMP && op->reply.hdr.retval > 0)
{
assert(!op->iov.count);
delete cl->read_op;
if (cl->read_op)
delete cl->read_op;
cl->read_op = op;
cl->read_state = CL_READ_REPLY_DATA;
cl->read_remaining = op->reply.hdr.retval;
@@ -355,7 +358,8 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
}
else if (op->reply.hdr.opcode == OSD_OP_SHOW_CONFIG && op->reply.hdr.retval > 0)
{
delete cl->read_op;
if (cl->read_op)
delete cl->read_op;
cl->read_op = op;
cl->read_state = CL_READ_REPLY_DATA;
cl->read_remaining = op->reply.hdr.retval;
@@ -368,7 +372,8 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
reuse:
// It's fine to reuse cl->read_op for the next reply
handle_reply_ready(op);
cl->recv_list.push_back(cl->read_op->req.buf, OSD_PACKET_SIZE);
if (cl->read_op)
cl->recv_list.push_back(cl->read_op->req.buf, OSD_PACKET_SIZE);
cl->read_remaining = OSD_PACKET_SIZE;
cl->read_state = CL_READ_HDR;
}

View File

@@ -96,6 +96,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
to_send_list.push_back((iovec){ .iov_base = cur_op->buf, .iov_len = (size_t)cur_op->req.sec_read_bmp.len });
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
}
to_outbox[to_outbox.size()-1].flags |= MSGR_SENDP_LAST;
if (cur_op->op_type == OSD_OP_IN)
{
to_outbox[to_outbox.size()-1].flags |= MSGR_SENDP_FREE;

View File

@@ -129,6 +129,7 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
#ifdef WITH_RDMA
if (cl->rdma_conn)
{
clients_by_qp.erase(cl->rdma_conn->qp->qp_num);
delete cl->rdma_conn;
}
#endif

View File

@@ -39,11 +39,6 @@ struct __attribute__((__packed__)) obj_ver_id
uint64_t version;
};
inline bool operator == (const obj_ver_id & a, const obj_ver_id & b)
{
return a.oid == b.oid && a.version == b.version;
}
inline bool operator < (const obj_ver_id & a, const obj_ver_id & b)
{
return a.oid < b.oid || a.oid == b.oid && a.version < b.version;

View File

@@ -35,18 +35,18 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
this->ringloop = ringloop;
this->cli_config = config.object_items();
this->file_config = msgr.read_config(this->cli_config);
parse_config(true);
this->config = msgr.read_config(config).object_items();
if (this->config.find("log_level") == this->config.end())
this->config["log_level"] = 1;
parse_config(this->config, true);
epmgr = new epoll_manager_t(ringloop);
// FIXME: Use timerfd_interval based directly on io_uring
this->tfd = epmgr->tfd;
if (!json_is_true(this->config["disable_blockstore"]))
auto bs_cfg = json_to_bs(this->config);
this->bs = new blockstore_t(bs_cfg, ringloop, tfd);
{
auto bs_cfg = json_to_bs(this->config);
this->bs = new blockstore_t(bs_cfg, ringloop, tfd);
// Autosync based on the number of unstable writes to prevent stalls due to insufficient journal space
uint64_t max_autosync = bs->get_journal_size() / bs->get_block_size() / 2;
if (autosync_writes > max_autosync)
@@ -67,11 +67,11 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
}
}
print_stats_timer_id = this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
{
print_stats();
});
slow_log_timer_id = this->tfd->set_timer(slow_log_interval*1000, true, [this](int timer_id)
this->tfd->set_timer(slow_log_interval*1000, true, [this](int timer_id)
{
print_slow();
});
@@ -91,42 +91,18 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
osd_t::~osd_t()
{
if (slow_log_timer_id >= 0)
{
tfd->clear_timer(slow_log_timer_id);
slow_log_timer_id = -1;
}
if (print_stats_timer_id >= 0)
{
tfd->clear_timer(print_stats_timer_id);
print_stats_timer_id = -1;
}
if (autosync_timer_id >= 0)
{
tfd->clear_timer(autosync_timer_id);
autosync_timer_id = -1;
}
ringloop->unregister_consumer(&consumer);
delete epmgr;
if (bs)
delete bs;
delete bs;
close(listen_fd);
free(zero_buffer);
}
void osd_t::parse_config(bool init)
void osd_t::parse_config(const json11::Json & config, bool allow_disk_params)
{
config = msgr.merge_configs(cli_config, file_config, etcd_global_config, etcd_osd_config);
if (config.find("log_level") == this->config.end())
config["log_level"] = 1;
if (bs)
{
auto bs_cfg = json_to_bs(config);
bs->parse_config(bs_cfg);
}
st_cli.parse_config(config);
msgr.parse_config(config);
if (init)
if (allow_disk_params)
{
// OSD number
osd_num = config["osd_num"].uint64_value();
@@ -148,27 +124,24 @@ void osd_t::parse_config(bool init)
immediate_commit = IMMEDIATE_SMALL;
else
immediate_commit = IMMEDIATE_NONE;
// Bind address
bind_address = config["bind_address"].string_value();
if (bind_address == "")
bind_address = "0.0.0.0";
bind_port = config["bind_port"].uint64_value();
if (bind_port <= 0 || bind_port > 65535)
bind_port = 0;
// OSD configuration
etcd_report_interval = config["etcd_report_interval"].uint64_value();
if (etcd_report_interval <= 0)
etcd_report_interval = 5;
readonly = json_is_true(config["readonly"]);
run_primary = !json_is_false(config["run_primary"]);
allow_test_ops = json_is_true(config["allow_test_ops"]);
}
// Bind address
bind_address = config["bind_address"].string_value();
if (bind_address == "")
bind_address = "0.0.0.0";
bind_port = config["bind_port"].uint64_value();
if (bind_port <= 0 || bind_port > 65535)
bind_port = 0;
// OSD configuration
log_level = config["log_level"].uint64_value();
auto old_no_rebalance = no_rebalance;
etcd_report_interval = config["etcd_report_interval"].uint64_value();
if (etcd_report_interval <= 0)
etcd_report_interval = 5;
readonly = json_is_true(config["readonly"]);
run_primary = !json_is_false(config["run_primary"]);
no_rebalance = json_is_true(config["no_rebalance"]);
auto old_no_recovery = no_recovery;
no_recovery = json_is_true(config["no_recovery"]);
auto old_autosync_interval = autosync_interval;
allow_test_ops = json_is_true(config["allow_test_ops"]);
if (!config["autosync_interval"].is_null())
{
// Allow to set it to 0
@@ -196,46 +169,15 @@ void osd_t::parse_config(bool init)
recovery_sync_batch = config["recovery_sync_batch"].uint64_value();
if (recovery_sync_batch < 1 || recovery_sync_batch > MAX_RECOVERY_QUEUE)
recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
auto old_print_stats_interval = print_stats_interval;
print_stats_interval = config["print_stats_interval"].uint64_value();
if (!print_stats_interval)
print_stats_interval = 3;
auto old_slow_log_interval = slow_log_interval;
slow_log_interval = config["slow_log_interval"].uint64_value();
if (!slow_log_interval)
slow_log_interval = 10;
inode_vanish_time = config["inode_vanish_time"].uint64_value();
if (!inode_vanish_time)
inode_vanish_time = 60;
if ((old_no_rebalance && !no_rebalance || old_no_recovery && !no_recovery) &&
!(peering_state & (OSD_RECOVERING | OSD_FLUSHING_PGS)))
{
peering_state = peering_state | OSD_RECOVERING;
}
if (old_autosync_interval != autosync_interval && autosync_timer_id >= 0)
{
this->tfd->clear_timer(autosync_timer_id);
autosync_timer_id = this->tfd->set_timer(autosync_interval*1000, true, [this](int timer_id)
{
autosync();
});
}
if (old_print_stats_interval != print_stats_interval && print_stats_timer_id >= 0)
{
tfd->clear_timer(print_stats_timer_id);
print_stats_timer_id = this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
{
print_stats();
});
}
if (old_slow_log_interval != slow_log_interval && slow_log_timer_id >= 0)
{
tfd->clear_timer(slow_log_timer_id);
slow_log_timer_id = this->tfd->set_timer(slow_log_interval*1000, true, [this](int timer_id)
{
print_slow();
});
}
}
void osd_t::bind_socket()
@@ -533,7 +475,7 @@ void osd_t::print_slow()
}
}
}
if (has_slow && bs)
if (has_slow)
{
bs->dump_diagnostics();
}

View File

@@ -90,7 +90,7 @@ class osd_t
{
// config
json11::Json::object cli_config, file_config, etcd_global_config, etcd_osd_config, config;
json11::Json::object config;
int etcd_report_interval = 5;
bool readonly = false;
@@ -126,7 +126,6 @@ class osd_t
bool pg_config_applied = false;
bool etcd_reporting_pg_state = false;
bool etcd_reporting_stats = false;
int autosync_timer_id = -1, print_stats_timer_id = -1, slow_log_timer_id = -1;
// peers and PGs
@@ -153,7 +152,7 @@ class osd_t
bool stopping = false;
int inflight_ops = 0;
blockstore_t *bs = NULL;
blockstore_t *bs;
void *zero_buffer = NULL;
uint64_t zero_buffer_size = 0;
uint32_t bs_block_size, bs_bitmap_granularity, clean_entry_bitmap_size;
@@ -174,7 +173,7 @@ class osd_t
uint64_t recovery_stat_bytes[2][2] = {};
// cluster connection
void parse_config(bool init);
void parse_config(const json11::Json & config, bool allow_disk_params);
void init_cluster();
void on_change_osd_state_hook(osd_num_t peer_osd);
void on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num);

View File

@@ -75,7 +75,7 @@ void osd_t::init_cluster()
}
if (run_primary && autosync_interval > 0)
{
autosync_timer_id = this->tfd->set_timer(autosync_interval*1000, true, [this](int timer_id)
this->tfd->set_timer(autosync_interval*1000, true, [this](int timer_id)
{
autosync();
});
@@ -182,10 +182,10 @@ json11::Json osd_t::get_statistics()
char time_str[50] = { 0 };
sprintf(time_str, "%ld.%03ld", ts.tv_sec, ts.tv_nsec/1000000);
st["time"] = time_str;
st["blockstore_ready"] = bs->is_started();
st["data_block_size"] = (uint64_t)bs->get_block_size();
if (bs)
{
st["blockstore_ready"] = bs->is_started();
st["data_block_size"] = (uint64_t)bs->get_block_size();
st["size"] = bs->get_block_count() * bs->get_block_size();
st["free"] = bs->get_free_block_count() * bs->get_block_size();
}
@@ -233,8 +233,7 @@ void osd_t::report_statistics()
json11::Json::object inode_space;
json11::Json::object last_stat;
pool_id_t last_pool = 0;
std::map<uint64_t, uint64_t> bs_empty_space;
auto & bs_inode_space = bs ? bs->get_inode_space_stats() : bs_empty_space;
auto & bs_inode_space = bs->get_inode_space_stats();
for (auto kv: bs_inode_space)
{
pool_id_t pool_id = INODE_POOL(kv.first);
@@ -375,11 +374,7 @@ void osd_t::on_change_osd_state_hook(osd_num_t peer_osd)
void osd_t::on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes)
{
if (changes.find(st_cli.etcd_prefix+"/config/global") != changes.end())
{
etcd_global_config = changes[st_cli.etcd_prefix+"/config/global"].value.object_items();
parse_config(false);
}
// FIXME apply config changes in runtime (maybe, some)
if (run_primary)
{
apply_pg_count();
@@ -389,8 +384,11 @@ void osd_t::on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes
void osd_t::on_load_config_hook(json11::Json::object & global_config)
{
etcd_global_config = global_config;
parse_config(true);
json11::Json::object osd_config = this->config;
for (auto & kv: global_config)
if (osd_config.find(kv.first) == osd_config.end())
osd_config[kv.first] = kv.second;
parse_config(osd_config, false);
bind_socket();
acquire_lease();
}

View File

@@ -64,11 +64,6 @@ void osd_t::submit_pg_flush_ops(pg_t & pg)
void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t peer_osd, int retval)
{
if (log_level > 2)
{
printf("[PG %u/%u] flush batch %lx completed on OSD %lu with result %d\n",
pool_id, pg_num, (uint64_t)fb, peer_osd, retval);
}
pool_pg_num_t pg_id = { .pool_id = pool_id, .pg_num = pg_num };
if (pgs.find(pg_id) == pgs.end() || pgs[pg_id].flush_batch != fb)
{
@@ -104,9 +99,10 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
std::vector<osd_op_t*> continue_ops;
auto & pg = pgs.at(pg_id);
auto it = pg.flush_actions.begin(), prev_it = it;
auto erase_start = it;
while (1)
{
if (it == pg.flush_actions.end() || !it->second.submitted ||
if (it == pg.flush_actions.end() ||
it->first.oid.inode != prev_it->first.oid.inode ||
(it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK))
{
@@ -120,23 +116,29 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
});
if (wr_it != pg.write_queue.end())
{
if (log_level > 2)
{
printf("[PG %u/%u] continuing write %lx to object %lx:%lx after flush\n",
pool_id, pg_num, (uint64_t)wr_it->second, wr_it->first.inode, wr_it->first.stripe);
}
continue_ops.push_back(wr_it->second);
pg.write_queue.erase(wr_it);
}
}
if (it == pg.flush_actions.end() || !it->second.submitted)
if ((it == pg.flush_actions.end() || !it->second.submitted) &&
erase_start != it)
{
pg.flush_actions.erase(erase_start, it);
}
if (it == pg.flush_actions.end())
{
if (it != pg.flush_actions.begin())
{
pg.flush_actions.erase(pg.flush_actions.begin(), it);
}
break;
}
prev_it = it++;
prev_it = it;
if (!it->second.submitted)
{
it++;
erase_start = it;
}
else
{
it++;
}
}
delete fb;
pg.flush_batch = NULL;
@@ -166,18 +168,6 @@ bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
// Copy buffer so it gets freed along with the operation
op->buf = malloc_or_die(sizeof(obj_ver_id) * count);
memcpy(op->buf, data, sizeof(obj_ver_id) * count);
if (log_level > 2)
{
printf(
"[PG %u/%u] flush batch %lx on OSD %lu: %s objects: ",
pool_id, pg_num, (uint64_t)fb, peer_osd, rollback ? "rollback" : "stabilize"
);
for (int i = 0; i < count; i++)
{
printf(i > 0 ? ", %lx:%lx v%lu" : "%lx:%lx v%lu", data[i].oid.inode, data[i].oid.stripe, data[i].version);
}
printf("\n");
}
if (peer_osd == this->osd_num)
{
// local
@@ -314,10 +304,9 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
{
// PG is stopped or one of the OSDs is gone, error is harmless
printf(
"[PG %u/%u] Recovery operation failed with object %lx:%lx\n",
INODE_POOL(op->oid.inode),
map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size),
op->oid.inode, op->oid.stripe
"Recovery operation failed with object %lx:%lx (PG %u/%u)\n",
op->oid.inode, op->oid.stripe, INODE_POOL(op->oid.inode),
map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size)
);
}
else

View File

@@ -76,7 +76,7 @@ void osd_t::handle_peers()
peering_state = peering_state & ~OSD_FLUSHING_PGS | OSD_RECOVERING;
}
}
if (!(peering_state & OSD_FLUSHING_PGS) && (peering_state & OSD_RECOVERING) && !readonly)
if ((peering_state & OSD_RECOVERING) && !readonly)
{
if (!continue_recovery())
{

View File

@@ -91,7 +91,7 @@ void pg_obj_state_check_t::walk()
pg->state |= PG_DEGRADED;
}
pg->state |= PG_ACTIVE;
if (pg->cur_peers.size() < pg->all_peers.size())
if (pg->state == PG_ACTIVE && pg->cur_peers.size() < pg->all_peers.size())
{
pg->state |= PG_LEFT_ON_DEAD;
}

View File

@@ -53,10 +53,7 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
inode_stats[cur_op->req.rw.inode].op_count[inode_st_op]++;
inode_stats[cur_op->req.rw.inode].op_sum[inode_st_op] += usec;
if (cur_op->req.hdr.opcode == OSD_OP_DELETE)
{
if (cur_op->op_data)
inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->op_data->pg_data_size * bs_block_size;
}
inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->op_data->pg_data_size * bs_block_size;
else
inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->req.rw.len;
}

View File

@@ -166,7 +166,7 @@ resume_6:
for (int i = 0; i < unstable_osd.len; i++)
{
// Except those from peered PGs
auto & w = op_data->unstable_writes[unstable_osd.start + i];
auto & w = op_data->unstable_writes[i];
pool_pg_num_t wpg = {
.pool_id = INODE_POOL(w.oid.inode),
.pg_num = map_to_pg(w.oid, st_cli.pool_config.at(INODE_POOL(w.oid.inode)).pg_stripe_size),

View File

@@ -12,7 +12,6 @@ bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
.oid = op_data->oid,
.osd_num = 0,
});
op_data->st = 1;
if (act_it != pg.flush_actions.end() &&
act_it->first.oid.inode == op_data->oid.inode &&
(act_it->first.oid.stripe & ~STRIPE_MASK) == op_data->oid.stripe)
@@ -24,6 +23,7 @@ bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
auto vo_it = pg.write_queue.find(op_data->oid);
if (vo_it != pg.write_queue.end())
{
op_data->st = 1;
pg.write_queue.emplace(op_data->oid, cur_op);
return false;
}

View File

@@ -142,11 +142,11 @@ inline bool operator < (const reed_sol_erased_t &a, const reed_sol_erased_t &b)
for (int i = 0; i < a.size && i < b.size; i++)
{
if (a.data[i] < b.data[i])
return true;
return -1;
else if (a.data[i] > b.data[i])
return false;
return 1;
}
return false;
return 0;
}
struct reed_sol_matrix_t
@@ -677,11 +677,11 @@ void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_
static void get_old_new_buffers(osd_rmw_stripe_t & stripe, uint32_t wr_start, uint32_t wr_end, buf_len_t *bufs, int & nbufs)
{
uint32_t ns = 0, ne = 0, os = 0, oe = 0;
if (stripe.write_end > wr_start &&
stripe.write_start < wr_end)
if (stripe.req_end > wr_start &&
stripe.req_start < wr_end)
{
ns = std::max(stripe.write_start, wr_start);
ne = std::min(stripe.write_end, wr_end);
ns = std::max(stripe.req_start, wr_start);
ne = std::min(stripe.req_end, wr_end);
}
if (stripe.read_end > wr_start &&
stripe.read_start < wr_end)
@@ -692,7 +692,7 @@ static void get_old_new_buffers(osd_rmw_stripe_t & stripe, uint32_t wr_start, ui
if (ne && (!oe || ns <= os))
{
// NEW or NEW->OLD
bufs[nbufs++] = { .buf = (uint8_t*)stripe.write_buf + ns - stripe.write_start, .len = ne-ns };
bufs[nbufs++] = { .buf = (uint8_t*)stripe.write_buf + ns - stripe.req_start, .len = ne-ns };
if (os < ne)
os = ne;
if (oe > os)
@@ -708,7 +708,7 @@ static void get_old_new_buffers(osd_rmw_stripe_t & stripe, uint32_t wr_start, ui
{
// OLD->NEW or OLD->NEW->OLD
bufs[nbufs++] = { .buf = (uint8_t*)stripe.read_buf + os - stripe.read_start, .len = ns-os };
bufs[nbufs++] = { .buf = (uint8_t*)stripe.write_buf + ns - stripe.write_start, .len = ne-ns };
bufs[nbufs++] = { .buf = (uint8_t*)stripe.write_buf + ns - stripe.req_start, .len = ne-ns };
if (oe > ne)
{
// OLD->NEW->OLD
@@ -759,18 +759,7 @@ static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int
uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t bitmap_granularity,
uint32_t &start, uint32_t &end)
{
bool required = false;
for (int role = pg_minsize; role < pg_size; role++)
{
if (write_osd_set[role] != 0)
{
// Whole parity chunk is needed when we move the object
if (write_osd_set[role] != read_osd_set[role])
end = chunk_size;
required = true;
}
}
if (required && end != chunk_size)
if (write_osd_set[pg_minsize] != 0 || write_osd_set != read_osd_set)
{
// start & end are required for calc_rmw_parity
for (int role = 0; role < pg_minsize; role++)
@@ -781,6 +770,14 @@ static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int
end = std::max(stripes[role].req_end, end);
}
}
for (int role = pg_minsize; role < pg_size; role++)
{
if (write_osd_set[role] != 0 && write_osd_set[role] != read_osd_set[role])
{
start = 0;
end = chunk_size;
}
}
}
// Set bitmap bits accordingly
if (bitmap_granularity > 0)

View File

@@ -17,7 +17,6 @@ void test4();
void test5();
void test6();
void test7();
void test_rmw_4k_degraded_into_lost_to_normal(bool ec);
void test8();
void test9();
void test10();
@@ -25,7 +24,7 @@ void test11();
void test12();
void test13();
void test14();
void test15(bool second);
void test15();
void test16();
int main(int narg, char *args[])
@@ -40,8 +39,6 @@ int main(int narg, char *args[])
test6();
// Test 7
test7();
test_rmw_4k_degraded_into_lost_to_normal(false);
test_rmw_4k_degraded_into_lost_to_normal(true);
// Test 8
test8();
// Test 9
@@ -57,8 +54,7 @@ int main(int narg, char *args[])
// Test 14
test14();
// Test 15
test15(false);
test15(true);
test15();
// Test 16
test16();
// End
@@ -319,69 +315,6 @@ void test7()
/***
7/2. calc_rmw(offset=48K, len=4K, osd_set=[0,2,3], write_set=[1,2,3])
= {
read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
write: [ [ 48K, 52K ], [ 0, 0 ], [ 48K, 52K ] ],
input buffer: [ write0 ],
rmw buffer: [ write2, read0, read1, read2 ],
}
then, after calc_rmw_parity_xor/ec(): {
write: [ [ 0, 128K ], [ 0, 0 ], [ 48K, 52K ] ],
write0==read0,
}
+ check write0, write2 buffers
***/
void test_rmw_4k_degraded_into_lost_to_normal(bool ec)
{
osd_num_t osd_set[3] = { 0, 2, 3 };
osd_num_t write_osd_set[3] = { 1, 2, 3 };
osd_rmw_stripe_t stripes[3] = {};
// Subtest 1
split_stripes(2, 128*1024, 48*1024, 4096, stripes);
void *write_buf = malloc(4096);
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024, 0);
assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
assert(stripes[0].write_start == 48*1024 && stripes[0].write_end == 52*1024);
assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
assert(stripes[2].write_start == 48*1024 && stripes[2].write_end == 52*1024);
assert(stripes[0].read_buf == (uint8_t*)rmw_buf+4*1024);
assert(stripes[1].read_buf == (uint8_t*)rmw_buf+4*1024+128*1024);
assert(stripes[2].read_buf == (uint8_t*)rmw_buf+4*1024+2*128*1024);
assert(stripes[0].write_buf == write_buf);
assert(stripes[1].write_buf == NULL);
assert(stripes[2].write_buf == rmw_buf);
// Subtest 2
set_pattern(write_buf, 4096, PATTERN2);
set_pattern(stripes[1].read_buf, 128*1024, PATTERN1);
set_pattern(stripes[2].read_buf, 128*1024, PATTERN0^PATTERN1);
if (!ec)
calc_rmw_parity_xor(stripes, 3, osd_set, write_osd_set, 128*1024, 0);
else
{
use_ec(3, 2, true);
calc_rmw_parity_ec(stripes, 3, 2, osd_set, write_osd_set, 128*1024, 0);
use_ec(3, 2, false);
}
assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
assert(stripes[2].write_start == 48*1024 && stripes[2].write_end == 52*1024);
assert(stripes[0].write_buf == stripes[0].read_buf);
assert(stripes[1].write_buf == NULL);
assert(stripes[2].write_buf == rmw_buf);
check_pattern(stripes[0].write_buf, 4096, PATTERN0);
check_pattern(stripes[0].write_buf+48*1024, 4096, PATTERN2);
check_pattern(stripes[2].write_buf, 4096, PATTERN2^PATTERN1); // new parity
free(rmw_buf);
free(write_buf);
}
/***
8. calc_rmw(offset=0, len=128K+4K, osd_set=[0,2,3], write_set=[1,2,3])
= {
read: [ [ 0, 0 ], [ 4K, 128K ], [ 0, 0 ] ],
@@ -893,11 +826,12 @@ void test14()
***/
void test15(bool second)
void test15()
{
const int bmp = 64*1024 / 4096 / 8;
use_ec(4, 2, true);
osd_num_t osd_set[4] = { 1, 2, (osd_num_t)(second ? 0 : 3), (osd_num_t)(second ? 4 : 0) };
osd_num_t osd_set[4] = { 1, 2, 3, 0 };
osd_num_t write_osd_set[4] = { 1, 2, 3, 0 };
osd_rmw_stripe_t stripes[4] = {};
unsigned bitmaps[4] = { 0 };
// Test 15.0
@@ -908,7 +842,7 @@ void test15(bool second)
assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
// Test 15.1
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 3, osd_set, 64*1024, bmp);
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 3, write_osd_set, 64*1024, bmp);
for (int i = 0; i < 4; i++)
stripes[i].bmp_buf = bitmaps+i;
assert(rmw_buf);
@@ -918,34 +852,32 @@ void test15(bool second)
assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
assert(stripes[2+second].write_start == 28*1024 && stripes[2+second].write_end == 32*1024);
assert(stripes[3-second].write_start == 0 && stripes[3-second].write_end == 0);
assert(stripes[2].write_start == 28*1024 && stripes[2].write_end == 32*1024);
assert(stripes[3].write_start == 0 && stripes[3].write_end == 0);
assert(stripes[0].read_buf == (uint8_t*)rmw_buf+4*1024);
assert(stripes[1].read_buf == NULL);
assert(stripes[2].read_buf == NULL);
assert(stripes[3].read_buf == NULL);
assert(stripes[0].write_buf == NULL);
assert(stripes[1].write_buf == (uint8_t*)write_buf);
assert(stripes[2+second].write_buf == rmw_buf);
assert(stripes[3-second].write_buf == NULL);
assert(stripes[2].write_buf == rmw_buf);
assert(stripes[3].write_buf == NULL);
// Test 15.2 - encode
set_pattern(write_buf, 4*1024, PATTERN1);
set_pattern(stripes[0].read_buf, 4*1024, PATTERN2);
memset(stripes[0].bmp_buf, 0, bmp);
memset(stripes[1].bmp_buf, 0, bmp);
memset(stripes[2+second].write_buf, 0, 4096);
calc_rmw_parity_ec(stripes, 4, 2, osd_set, osd_set, 64*1024, bmp);
assert(second || *(uint32_t*)stripes[2].bmp_buf == 0x80);
calc_rmw_parity_ec(stripes, 4, 2, osd_set, write_osd_set, 64*1024, bmp);
assert(*(uint32_t*)stripes[2].bmp_buf == 0x80);
assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
assert(stripes[2+second].write_start == 28*1024 && stripes[2+second].write_end == 32*1024);
assert(stripes[3-second].write_start == 0 && stripes[3-second].write_end == 0);
assert(stripes[2].write_start == 28*1024 && stripes[2].write_end == 32*1024);
assert(stripes[3].write_start == 0 && stripes[3].write_end == 0);
assert(stripes[0].write_buf == NULL);
assert(stripes[1].write_buf == (uint8_t*)write_buf);
assert(stripes[2+second].write_buf == rmw_buf);
assert(stripes[3-second].write_buf == NULL);
// first parity is always xor :), second isn't...
check_pattern(stripes[2+second].write_buf, 4*1024, second ? 0xb79a59a0ce8b9b81 : PATTERN1^PATTERN2);
assert(stripes[2].write_buf == rmw_buf);
assert(stripes[3].write_buf == NULL);
check_pattern(stripes[2].write_buf, 4*1024, PATTERN1^PATTERN2); // first parity is always xor :)
// Done
free(rmw_buf);
free(write_buf);

View File

@@ -166,15 +166,20 @@ void osd_t::exec_show_config(osd_op_t *cur_op)
{
// Indicate that RDMA is enabled
wire_config["rdma_enabled"] = true;
if (req_json["connect_rdma"].is_string())
if (req_json["connect_rdma"].is_string() && req_json["rdma_op_memory"].uint64_value() != 0)
{
// Peer is trying to connect using RDMA, try to satisfy him
bool ok = msgr.connect_rdma(cur_op->peer_fd, req_json["connect_rdma"].string_value(), req_json["rdma_max_msg"].uint64_value());
bool ok = msgr.connect_rdma(cur_op->peer_fd, req_json["connect_rdma"].string_value(),
req_json["rdma_data_rkey"].uint64_value(), req_json["rdma_op_rkey"].uint64_value(),
req_json["rdma_op_slots"].uint64_value(), req_json["rdma_op_memory"].uint64_value());
if (ok)
{
auto rc = msgr.clients.at(cur_op->peer_fd)->rdma_conn;
wire_config["rdma_address"] = rc->addr.to_string();
wire_config["rdma_max_msg"] = rc->max_msg;
wire_config["rdma_data_rkey"] = (uint64_t)rc->in_data_mr->rkey;
wire_config["rdma_op_rkey"] = (uint64_t)rc->in_op_mr->rkey;
wire_config["rdma_op_slots"] = rc->op_slots;
wire_config["rdma_op_memory"] = rc->op_memory;
}
}
}
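For reference, the handshake fields exchanged above are everything a peer needs to remember in order to push data into the OSD's registered buffers with IBV_WR_RDMA_WRITE. A minimal sketch of the client-side bookkeeping, assuming the json11 wrapper used elsewhere in the code base; the struct, helper name and include path are illustrative, not the actual messenger code:

#include <stdint.h>
#include <string>
#include "json11/json11.hpp"

struct rdma_peer_params_t
{
    std::string rdma_address; // address of the OSD's RDMA endpoint
    uint64_t data_rkey = 0;   // rkey of the remote data buffer (in_data_mr)
    uint64_t op_rkey = 0;     // rkey of the remote operation-slot buffer (in_op_mr)
    uint64_t op_slots = 0;    // number of operation slots the OSD reserved
    uint64_t op_memory = 0;   // size of the remote operation memory region
};

// wire_config is the parsed JSON reply to the show_config request
static rdma_peer_params_t parse_rdma_reply(const json11::Json & wire_config)
{
    rdma_peer_params_t p;
    p.rdma_address = wire_config["rdma_address"].string_value();
    p.data_rkey = wire_config["rdma_data_rkey"].uint64_value();
    p.op_rkey = wire_config["rdma_op_rkey"].uint64_value();
    p.op_slots = wire_config["rdma_op_slots"].uint64_value();
    p.op_memory = wire_config["rdma_op_memory"].uint64_value();
    return p;
}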

View File

@@ -150,7 +150,6 @@ int connect_osd(const char *osd_address, int osd_port)
if (connect(connect_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
{
perror("connect");
close(connect_fd);
return -1;
}
int one = 1;
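One detail worth noting in this hunk: whichever side of the comparison ends up in the tree, the error path of connect() should close the socket before returning, otherwise every failed connection attempt leaks a file descriptor. A standalone sketch of the pattern (not the project's connect_osd itself):

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <unistd.h>
#include <stdio.h>

static int connect_tcp(const sockaddr_in & addr)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    if (fd < 0)
    {
        perror("socket");
        return -1;
    }
    if (connect(fd, (const sockaddr*)&addr, sizeof(addr)) < 0)
    {
        perror("connect");
        close(fd); // without this, the descriptor is leaked on every failed attempt
        return -1;
    }
    int one = 1;
    setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
    return fd;
}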

View File

@@ -15,7 +15,7 @@ int read_blocking(int fd, void *read_buf, size_t remaining)
size_t done = 0;
while (done < remaining)
{
ssize_t r = read(fd, read_buf, remaining-done);
size_t r = read(fd, read_buf, remaining-done);
if (r <= 0)
{
if (!errno)
@@ -41,7 +41,7 @@ int write_blocking(int fd, void *write_buf, size_t remaining)
size_t done = 0;
while (done < remaining)
{
ssize_t r = write(fd, write_buf, remaining-done);
size_t r = write(fd, write_buf, remaining-done);
if (r < 0)
{
if (errno != EINTR && errno != EAGAIN && errno != EPIPE)
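The type change in these two helpers is more than cosmetic: read() and write() return ssize_t, and storing the result in an unsigned size_t means the `r < 0` / `r <= 0` error checks can never see a negative value. A minimal standalone sketch of the signed-result pattern (illustrative only; the project's read_blocking/write_blocking differ in details such as error reporting):

#include <errno.h>
#include <stddef.h>
#include <unistd.h>

// returns 1 on success, 0 on premature EOF, -1 on error
static int read_exact(int fd, void *buf, size_t remaining)
{
    size_t done = 0;
    while (done < remaining)
    {
        ssize_t r = read(fd, (char*)buf + done, remaining - done);
        if (r < 0)
        {
            if (errno == EINTR || errno == EAGAIN)
                continue; // retry after interruption
            return -1;    // real error
        }
        if (r == 0)
            return 0;     // EOF before all bytes arrived
        done += r;
    }
    return 1;
}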

View File

@@ -83,7 +83,6 @@ int connect_stub(const char *server_address, int server_port)
if (connect(connect_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
{
perror("connect");
close(connect_fd);
return -1;
}
int one = 1;

src/test_freelist.cpp (new file, 64 lines)
View File

@@ -0,0 +1,64 @@
// Copyright (c) Vitaliy Filippov, 2023+
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
#include <stdio.h>
#include <stdlib.h>
#include <stdexcept>
#include "freelist.cpp"
inline bool operator == (const freelist_item_t & a, const freelist_item_t & b)
{
return a.start == b.start && a.size == b.size;
}
void dump(std::vector<freelist_item_t> & freelist)
{
printf("free: ");
for (auto & item: freelist)
{
printf("%lx+%lx ", item.start, item.size);
}
printf("\n");
}
void dump(freelist_allocator_t &alloc)
{
dump(alloc.freelist);
}
uint64_t test_alloc(freelist_allocator_t &alloc, uint64_t size)
{
uint64_t r = alloc.alloc(size);
printf("alloc %lx: %lx\n", size, r);
return r;
}
void assert_eq(freelist_allocator_t &alloc, std::vector<freelist_item_t> v)
{
if (alloc.freelist != v)
{
printf("expected ");
dump(v);
printf("got ");
dump(alloc);
throw std::runtime_error("test failed");
}
dump(alloc);
}
int main(int narg, char *args[])
{
freelist_allocator_t alloc;
alloc.free(0, 0x1000000);
assert_eq(alloc, { { 0, 0x1000000 } });
assert(test_alloc(alloc, 0x1000) == 0);
assert_eq(alloc, { { 0x1000, 0xfff000 } });
assert(test_alloc(alloc, 0x4000) == 0x1000);
alloc.free(0x1000000, 0x4000);
assert_eq(alloc, { { 0x5000, 0xfff000 } });
alloc.free(0, 0x1000);
assert_eq(alloc, { { 0, 0x1000 }, { 0x5000, 0xfff000 } });
alloc.free(0x1000, 0x4000);
assert_eq(alloc, { { 0, 0x1004000 } });
return 0;
}
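The test above pins down the behaviour it expects from freelist.cpp: extents kept sorted by start, first-fit allocation that cuts from the beginning of an extent, and free() that coalesces with both neighbours. A minimal sketch that satisfies exactly these assertions (standalone illustration, not the real src/freelist.cpp; the UINT64_MAX "no space" value is an assumption, since the test never exercises allocation failure):

#include <stdint.h>
#include <vector>

struct freelist_item_t
{
    uint64_t start, size;
};

struct freelist_allocator_t
{
    std::vector<freelist_item_t> freelist; // sorted by start, non-overlapping

    uint64_t alloc(uint64_t size)
    {
        for (size_t i = 0; i < freelist.size(); i++)
        {
            if (freelist[i].size >= size)
            {
                uint64_t r = freelist[i].start; // first fit, cut from the beginning of the extent
                freelist[i].start += size;
                freelist[i].size -= size;
                if (!freelist[i].size)
                    freelist.erase(freelist.begin()+i);
                return r;
            }
        }
        return UINT64_MAX; // assumed failure marker
    }

    void free(uint64_t start, uint64_t size)
    {
        // insert in sorted position, then merge with the right and left neighbours
        size_t i = 0;
        while (i < freelist.size() && freelist[i].start < start)
            i++;
        freelist.insert(freelist.begin()+i, { start, size });
        if (i+1 < freelist.size() && freelist[i].start+freelist[i].size == freelist[i+1].start)
        {
            freelist[i].size += freelist[i+1].size;
            freelist.erase(freelist.begin()+i+1);
        }
        if (i > 0 && freelist[i-1].start+freelist[i-1].size == freelist[i].start)
        {
            freelist[i-1].size += freelist[i].size;
            freelist.erase(freelist.begin()+i);
        }
    }
};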

View File

@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
Name: Vitastor
Description: Vitastor client library
Version: 0.8.7
Version: 0.8.5
Libs: -L${libdir} -lvitastor_client
Cflags: -I${includedir}

View File

@@ -64,4 +64,4 @@ echo leak:librbd >> testdata/lsan-suppress.txt
echo leak:_M_mutate >> testdata/lsan-suppress.txt
echo leak:_M_assign >> testdata/lsan-suppress.txt
export LSAN_OPTIONS=report_objects=true:suppressions=`pwd`/testdata/lsan-suppress.txt
export ASAN_OPTIONS=verify_asan_link_order=false:abort_on_error=1
export ASAN_OPTIONS=verify_asan_link_order=false

View File

@@ -17,17 +17,17 @@ else
fi
if [ "$IMMEDIATE_COMMIT" != "" ]; then
NO_SAME="--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024 --disable_data_fsync 1 --immediate_commit all --log_level 10"
NO_SAME="--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024 --disable_data_fsync 1 --immediate_commit all --log_level 1"
$ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":1,"immediate_commit":"all"}'
else
NO_SAME="--journal_sector_buffer_count 1024 --log_level 10"
NO_SAME="--journal_sector_buffer_count 1024 --log_level 1"
$ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":1}'
fi
start_osd()
{
local i=$1
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $NO_SAME $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) >>./testdata/osd$i.log 2>&1 &
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $NO_SAME $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
eval OSD${i}_PID=$!
}

View File

@@ -43,6 +43,3 @@ SCHEME=ec ./test_snapshot.sh
SCHEME=xor ./test_write.sh
./test_write_no_same.sh
./test_heal.sh
SCHEME=ec PG_MINSIZE=2 ./test_heal.sh

View File

@@ -43,7 +43,7 @@ kill_osds &
LD_PRELOAD="build/src/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4k -direct=1 -iodepth=16 -fsync=256 -rw=randwrite \
-mirror_file=./testdata/mirror.bin -etcd=$ETCD_URL -image=testimg -loops=10 -runtime=120
-mirror_file=./testdata/mirror.bin -etcd=$ETCD_URL -image=testimg -loops=10 -runtime=120 2>/dev/null
qemu-img convert -S 4096 -p \
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:image=testimg" \

View File

@@ -7,7 +7,7 @@ OSD_COUNT=5
OSD_ARGS=
for i in $(seq 1 $OSD_COUNT); do
dd if=/dev/zero of=./testdata/test_osd$i.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) >>./testdata/osd$i.log 2>&1 &
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
eval OSD${i}_PID=$!
done

View File

@@ -53,7 +53,7 @@ for i in $(seq 1 $OSD_COUNT); do
--data_device ./testdata/test_osd$i.bin \
--meta_offset 0 \
--journal_offset $((1024*1024)) \
--data_offset $((128*1024*1024)) >>./testdata/osd$i.log 2>&1 &
--data_offset $((128*1024*1024)) &>./testdata/osd$i.log &
eval OSD${i}_PID=$!
done

View File

@@ -21,8 +21,7 @@ LD_PRELOAD="build/src/libfio_vitastor.so" \
# Kill OSD 2, start OSD 1
kill $OSD2_PID
build/src/vitastor-osd --osd_num 1 --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL \
$(build/src/vitastor-disk simple-offsets --format options --device ./testdata/test_osd2.bin 2>/dev/null) >>./testdata/osd2.log 2>&1 &
build/src/vitastor-osd --osd_num 1 --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options --device ./testdata/test_osd2.bin 2>/dev/null) >>./testdata/osd2.log 2>&1 &
sleep 2
# Check PG state - it should NOT become active

View File

@@ -10,7 +10,7 @@ etcdctl --endpoints=http://127.0.0.1:12379/v3 del --prefix /vitastor/osd/state
OSD_COUNT=3
OSD_ARGS=
for i in $(seq 1 $OSD_COUNT); do
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) >>./testdata/osd$i.log 2>&1 &
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
eval OSD${i}_PID=$!
done