Fix eviction when random_pos selects the end

Implement min/max list_count to make listings during performance test reasonable
Fix and improve parallel allocation
2023-12-01 01:43:03 +03:00 · 2023-12-01 01:17:04 +03:00 · 2023-12-01 01:17:04 +03:00 · 2023-12-01 01:17:04 +03:00 · 2023-12-01 01:17:04 +03:00 · 2023-12-01 01:17:04 +03:00
150 changed files with 7314 additions and 1244 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)

 project(vitastor)

-set(VERSION "0.9.3")
+set(VERSION "1.2.0")

 add_subdirectory(src)
--- a/README-ru.md
+++ b/README-ru.md
@@ -50,6 +50,7 @@ Vitastor поддерживает QEMU-драйвер, протоколы NBD и
  - Параметры
    - [Общие](docs/config/common.ru.md)
    - [Сетевые](docs/config/network.ru.md)
+    - [Клиентский код](docs/config/client.ru.md)
    - [Глобальные дисковые параметры](docs/config/layout-cluster.ru.md)
    - [Дисковые параметры OSD](docs/config/layout-osd.ru.md)
    - [Прочие параметры OSD](docs/config/osd.ru.md)
--- a/README.md
+++ b/README.md
@@ -50,6 +50,7 @@ Read more details below in the documentation.
  - Parameter Reference
    - [Common](docs/config/common.en.md)
    - [Network](docs/config/network.en.md)
+    - [Client](docs/config/client.en.md)
    - [Global Disk Layout](docs/config/layout-cluster.en.md)
    - [OSD Disk Layout](docs/config/layout-osd.en.md)
    - [OSD Runtime Parameters](docs/config/osd.en.md)
--- a/csi/Makefile
+++ b/csi/Makefile
@@ -1,4 +1,4 @@
-VERSION ?= v0.9.3
+VERSION ?= v1.2.0

 all: build push

--- a/csi/deploy/004-csi-nodeplugin.yaml
+++ b/csi/deploy/004-csi-nodeplugin.yaml
@@ -49,7 +49,7 @@ spec:
            capabilities:
              add: ["SYS_ADMIN"]
            allowPrivilegeEscalation: true
-          image: vitalif/vitastor-csi:v0.9.3
+          image: vitalif/vitastor-csi:v1.2.0
          args:
            - "--node=$(NODE_ID)"
            - "--endpoint=$(CSI_ENDPOINT)"
--- a/csi/deploy/005-csi-provisioner-rbac.yaml
+++ b/csi/deploy/005-csi-provisioner-rbac.yaml
@@ -35,10 +35,13 @@ rules:
    verbs: ["get", "list", "watch"]
  - apiGroups: ["snapshot.storage.k8s.io"]
    resources: ["volumesnapshots"]
-    verbs: ["get", "list"]
+    verbs: ["get", "list", "patch"]
+  - apiGroups: ["snapshot.storage.k8s.io"]
+    resources: ["volumesnapshots/status"]
+    verbs: ["get", "list", "patch"]
  - apiGroups: ["snapshot.storage.k8s.io"]
    resources: ["volumesnapshotcontents"]
-    verbs: ["create", "get", "list", "watch", "update", "delete"]
+    verbs: ["create", "get", "list", "watch", "update", "delete", "patch"]
  - apiGroups: ["snapshot.storage.k8s.io"]
    resources: ["volumesnapshotclasses"]
    verbs: ["get", "list", "watch"]
@@ -53,7 +56,7 @@ rules:
    verbs: ["get", "list", "watch"]
  - apiGroups: ["snapshot.storage.k8s.io"]
    resources: ["volumesnapshotcontents/status"]
-    verbs: ["update"]
+    verbs: ["update", "patch"]
  - apiGroups: [""]
    resources: ["configmaps"]
    verbs: ["get"]
--- a/csi/deploy/007-csi-provisioner.yaml
+++ b/csi/deploy/007-csi-provisioner.yaml
@@ -23,6 +23,11 @@ metadata:
  name: csi-vitastor-provisioner
 spec:
  replicas: 3
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxUnavailable: 1
+      maxSurge: 0
  selector:
    matchLabels:
      app: csi-vitastor-provisioner
@@ -46,7 +51,7 @@ spec:
      priorityClassName: system-cluster-critical
      containers:
        - name: csi-provisioner
-          image: k8s.gcr.io/sig-storage/csi-provisioner:v2.2.0
+          image: k8s.gcr.io/sig-storage/csi-provisioner:v3.0.0
          args:
            - "--csi-address=$(ADDRESS)"
            - "--v=5"
@@ -116,7 +121,7 @@ spec:
            privileged: true
            capabilities:
              add: ["SYS_ADMIN"]
-          image: vitalif/vitastor-csi:v0.9.3
+          image: vitalif/vitastor-csi:v1.2.0
          args:
            - "--node=$(NODE_ID)"
            - "--endpoint=$(CSI_ENDPOINT)"
--- a/csi/deploy/009-storage-class.yaml
+++ b/csi/deploy/009-storage-class.yaml
@@ -17,3 +17,4 @@ parameters:
  # multiple etcdUrls may be specified, delimited by comma
  #etcdUrl: "http://192.168.7.2:2379"
  #etcdPrefix: "/vitastor"
+allowVolumeExpansion: true
--- a/csi/deploy/example-snapshot-class.yaml
+++ b/csi/deploy/example-snapshot-class.yaml
@@ -0,0 +1,7 @@
+apiVersion: snapshot.storage.k8s.io/v1
+kind: VolumeSnapshotClass
+metadata:
+  name: vitastor-snapclass
+driver: csi.vitastor.io
+deletionPolicy: Delete
+parameters:
--- a/csi/deploy/example-snapshot-clone.yaml
+++ b/csi/deploy/example-snapshot-clone.yaml
@@ -0,0 +1,16 @@
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: test-vitastor-clone
+spec:
+  storageClassName: vitastor
+  dataSource:
+    name: snap1
+    kind: VolumeSnapshot
+    apiGroup: snapshot.storage.k8s.io
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 10Gi
--- a/csi/deploy/example-snapshot.yaml
+++ b/csi/deploy/example-snapshot.yaml
@@ -0,0 +1,8 @@
+apiVersion: snapshot.storage.k8s.io/v1
+kind: VolumeSnapshot
+metadata:
+  name: snap1
+spec:
+  volumeSnapshotClassName: vitastor-snapclass
+  source:
+    persistentVolumeClaimName: test-vitastor-pvc
--- a/csi/go.mod
+++ b/csi/go.mod
@@ -9,6 +9,7 @@ require (
 	golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb
 	golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
 	google.golang.org/grpc v1.33.1
+	google.golang.org/protobuf v1.24.0
 	k8s.io/klog v1.0.0
 	k8s.io/utils v0.0.0-20210305010621-2afb4311ab10
 )
--- a/csi/src/config.go
+++ b/csi/src/config.go
@@ -5,7 +5,7 @@ package vitastor

 const (
    vitastorCSIDriverName    = "csi.vitastor.io"
-    vitastorCSIDriverVersion = "0.9.3"
+    vitastorCSIDriverVersion = "1.2.0"
 )

 // Config struct fills the parameters of request or user input
--- a/csi/src/controllerserver.go
+++ b/csi/src/controllerserver.go
@@ -20,6 +20,7 @@ import (

    "google.golang.org/grpc/codes"
    "google.golang.org/grpc/status"
+    "google.golang.org/protobuf/types/known/timestamppb"

    "github.com/container-storage-interface/spec/lib/go/csi"
 )
@@ -45,6 +46,7 @@ type InodeConfig struct
    ParentPool uint64 `json:"parent_pool,omitempty"`
    ParentId uint64 `json:"parent_id,omitempty"`
    Readonly bool `json:"readonly,omitempty"`
+    CreateTs uint64 `json:"create_ts,omitempty"`
 }

 type ControllerServer struct
@@ -178,27 +180,43 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
        return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
    }

+    args := []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) }
+
+    // Support creation from snapshot: the snapshot ID is a JSON object with "name" and "snapshot" keys
+    var src *csi.VolumeContentSource
+    if (req.VolumeContentSource.GetSnapshot() != nil)
+    {
+        snapId := req.VolumeContentSource.GetSnapshot().GetSnapshotId()
+        if (snapId != "")
+        {
+            snapVars := make(map[string]string)
+            err := json.Unmarshal([]byte(snapId), &snapVars)
+            if (err != nil)
+            {
+                return nil, status.Error(codes.Internal, "snapshot ID not in JSON format")
+            }
+            args = append(args, "--parent", snapVars["name"]+"@"+snapVars["snapshot"]) // parent is "<image>@<snapshot>"
+            src = &csi.VolumeContentSource{
+                Type: &csi.VolumeContentSource_Snapshot{
+                    Snapshot: &csi.VolumeContentSource_SnapshotSource{
+                        SnapshotId: snapId,
+                    },
+                },
+            }
+        }
+    }
+
    // Create image using vitastor-cli
-    _, err := invokeCLI(ctxVars, []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) })
+    _, err := invokeCLI(ctxVars, args)
    if (err != nil)
    {
        if (strings.Index(err.Error(), "already exists") > 0)
        {
-            stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", volName })
+            inodeCfg, err := invokeList(ctxVars, volName, true)
            if (err != nil)
            {
                return nil, err
            }
-            var inodeCfg []InodeConfig
-            err = json.Unmarshal(stat, &inodeCfg)
-            if (err != nil)
-            {
-                return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
-            }
-            if (len(inodeCfg) == 0)
-            {
-                return nil, status.Error(codes.Internal, "vitastor-cli create said that image already exists, but ls can't find it")
-            }
            if (inodeCfg[0].Size < uint64(volSize))
            {
                return nil, status.Error(codes.Internal, "image "+volName+" is already created, but size is less than expected")
@@ -217,6 +235,7 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
            // Ugly, but VolumeContext isn't passed to DeleteVolume :-(
            VolumeId: string(volumeIdJson),
            CapacityBytes: volSize,
+            ContentSource: src,
        },
    }, nil
 }
@@ -230,15 +249,15 @@ func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVol
        return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
    }

-    ctxVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
+    volVars := make(map[string]string)
+    err := json.Unmarshal([]byte(req.VolumeId), &volVars)
    if (err != nil)
    {
        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
    }
-    volName := ctxVars["name"]
+    volName := volVars["name"]

-    ctxVars, _, _ = GetConnectionParams(ctxVars)
+    ctxVars, _, _ := GetConnectionParams(volVars)

    _, err = invokeCLI(ctxVars, []string{ "rm", volName })
    if (err != nil)
@@ -344,6 +363,8 @@ func (cs *ControllerServer) ControllerGetCapabilities(ctx context.Context, req *
        csi.ControllerServiceCapability_RPC_LIST_VOLUMES,
        csi.ControllerServiceCapability_RPC_EXPAND_VOLUME,
        csi.ControllerServiceCapability_RPC_CREATE_DELETE_SNAPSHOT,
+        csi.ControllerServiceCapability_RPC_LIST_SNAPSHOTS,
+        // TODO: csi.ControllerServiceCapability_RPC_CLONE_VOLUME,
    } {
        controllerServerCapabilities = append(controllerServerCapabilities, functionControllerServerCapabilities(capability))
    }
@@ -353,28 +374,214 @@ func (cs *ControllerServer) ControllerGetCapabilities(ctx context.Context, req *
    }, nil
 }

+// invokeList runs vitastor-cli "ls --json <pattern>" and decodes the result; an empty list is an error if expectExist is set
+func invokeList(ctxVars map[string]string, pattern string, expectExist bool) ([]InodeConfig, error)
+{
+    var images []InodeConfig
+    rawJson, cliErr := invokeCLI(ctxVars, []string{ "ls", "--json", pattern })
+    if (cliErr != nil)
+    {
+        return nil, cliErr
+    }
+    if jsonErr := json.Unmarshal(rawJson, &images); (jsonErr != nil)
+    {
+        return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+jsonErr.Error())
+    }
+    if (len(images) == 0 && expectExist)
+    {
+        return nil, status.Error(codes.Internal, "Can't find expected image "+pattern+" via vitastor-cli ls")
+    }
+    return images, nil
+}
+
 // CreateSnapshot create snapshot of an existing PV
 func (cs *ControllerServer) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequest) (*csi.CreateSnapshotResponse, error)
 {
-    return nil, status.Error(codes.Unimplemented, "")
+    klog.Infof("received controller create snapshot request %+v", protosanitizer.StripSecrets(req))
+    if (req == nil)
+    {
+        return nil, status.Errorf(codes.InvalidArgument, "request cannot be empty")
+    }
+    if (req.SourceVolumeId == "" || req.Name == "")
+    {
+        return nil, status.Error(codes.InvalidArgument, "source volume ID and snapshot name are required fields")
+    }
+
+    // snapshot name
+    snapName := req.Name
+
+    // req.SourceVolumeId is an ugly json string in our case :)
+    ctxVars := make(map[string]string)
+    err := json.Unmarshal([]byte(req.SourceVolumeId), &ctxVars)
+    if (err != nil)
+    {
+        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
+    }
+    volName := ctxVars["name"]
+
+    // Create snapshot using vitastor-cli; "already exists" is tolerated and the existing snapshot is returned
+    _, err = invokeCLI(ctxVars, []string{ "create", "--snapshot", snapName, volName })
+    if (err != nil && !strings.Contains(err.Error(), "already exists"))
+    {
+        return nil, err
+    }
+
+    // Look up the snapshot "<volume>@<snapshot>" to report its size and creation time
+    inodeCfg, err := invokeList(ctxVars, volName+"@"+snapName, true)
+    if (err != nil)
+    {
+        return nil, err
+    }
+
+    // Use ugly JSON snapshot ID again, DeleteSnapshot doesn't have context :-(
+    ctxVars["snapshot"] = snapName
+    snapIdJson, _ := json.Marshal(ctxVars)
+    return &csi.CreateSnapshotResponse{
+        Snapshot: &csi.Snapshot{
+            SizeBytes: int64(inodeCfg[0].Size),
+            SnapshotId: string(snapIdJson),
+            SourceVolumeId: req.SourceVolumeId,
+            CreationTime: &timestamppb.Timestamp{ Seconds: int64(inodeCfg[0].CreateTs) },
+            ReadyToUse: true,
+        },
+    }, nil
 }

 // DeleteSnapshot delete provided snapshot of a PV
 func (cs *ControllerServer) DeleteSnapshot(ctx context.Context, req *csi.DeleteSnapshotRequest) (*csi.DeleteSnapshotResponse, error)
 {
-    return nil, status.Error(codes.Unimplemented, "")
+    klog.Infof("received controller delete snapshot request %+v", protosanitizer.StripSecrets(req))
+    if (req == nil)
+    {
+        return nil, status.Errorf(codes.InvalidArgument, "request cannot be empty")
+    }
+    if (req.SnapshotId == "")
+    {
+        return nil, status.Error(codes.InvalidArgument, "snapshot ID is a required field")
+    }
+
+    volVars := make(map[string]string) // the snapshot ID is parsed as a JSON object of string fields
+    err := json.Unmarshal([]byte(req.SnapshotId), &volVars)
+    if (err != nil)
+    {
+        return nil, status.Error(codes.Internal, "snapshot ID not in JSON format")
+    }
+    volName := volVars["name"]
+    snapName := volVars["snapshot"]
+
+    ctxVars, _, _ := GetConnectionParams(volVars) // only the first result is used; it is passed to invokeCLI below
+
+    _, err = invokeCLI(ctxVars, []string{ "rm", volName+"@"+snapName }) // "rm" on the image named "<name>@<snapshot>"
+    if (err != nil)
+    {
+        return nil, err
+    }
+
+    return &csi.DeleteSnapshotResponse{}, nil
 }

 // ListSnapshots list the snapshots of a PV
 func (cs *ControllerServer) ListSnapshots(ctx context.Context, req *csi.ListSnapshotsRequest) (*csi.ListSnapshotsResponse, error)
 {
-    return nil, status.Error(codes.Unimplemented, "")
+    klog.Infof("received controller list snapshots request %+v", protosanitizer.StripSecrets(req))
+    if (req == nil)
+    {
+        return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
+    }
+
+    volVars := make(map[string]string) // NOTE(review): only SourceVolumeId is consulted; an empty one fails the parse below - confirm intended
+    err := json.Unmarshal([]byte(req.SourceVolumeId), &volVars)
+    if (err != nil)
+    {
+        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
+    }
+    volName := volVars["name"]
+    ctxVars, _, _ := GetConnectionParams(volVars)
+
+    inodeCfg, err := invokeList(ctxVars, volName+"@*", false) // expectExist=false: an empty listing is not an error
+    if (err != nil)
+    {
+        return nil, err
+    }
+
+    resp := &csi.ListSnapshotsResponse{} // pagination below presumably relies on the listing being sorted by name - TODO confirm
+    for _, ino := range inodeCfg
+    {
+        snapName := ino.Name[len(volName)+1:] // drop the "<volume>@" prefix; names below StartingToken are skipped
+        if (len(req.StartingToken) > 0 && snapName < req.StartingToken)
+        {
+        }
+        else if (req.MaxEntries == 0 || len(resp.Entries) < int(req.MaxEntries))
+        {
+            volVars["snapshot"] = snapName
+            snapIdJson, _ := json.Marshal(volVars) // snapshot ID = volume ID fields plus the "snapshot" key
+            resp.Entries = append(resp.Entries, &csi.ListSnapshotsResponse_Entry{
+                Snapshot: &csi.Snapshot{
+                    SizeBytes: int64(ino.Size),
+                    SnapshotId: string(snapIdJson),
+                    SourceVolumeId: req.SourceVolumeId,
+                    CreationTime: &timestamppb.Timestamp{ Seconds: int64(ino.CreateTs) },
+                    ReadyToUse: true,
+                },
+            })
+        }
+        else
+        {
+            resp.NextToken = snapName // page is full: the first snapshot left out becomes the resume token
+            break
+        }
+    }
+
+    return resp, nil
 }

-// ControllerExpandVolume resizes a volume
+// ControllerExpandVolume increases the size of a volume (never shrinks it) and returns the resulting image size
 func (cs *ControllerServer) ControllerExpandVolume(ctx context.Context, req *csi.ControllerExpandVolumeRequest) (*csi.ControllerExpandVolumeResponse, error)
 {
-    return nil, status.Error(codes.Unimplemented, "")
+    klog.Infof("received controller expand volume request %+v", protosanitizer.StripSecrets(req))
+    if (req == nil)
+    {
+        return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
+    }
+    if (req.VolumeId == "" || req.CapacityRange == nil || req.CapacityRange.RequiredBytes == 0)
+    {
+        return nil, status.Error(codes.InvalidArgument, "VolumeId, CapacityRange and RequiredBytes are required fields")
+    }
+
+    volVars := make(map[string]string) // the volume ID is parsed as a JSON object of string fields
+    err := json.Unmarshal([]byte(req.VolumeId), &volVars)
+    if (err != nil)
+    {
+        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
+    }
+    volName := volVars["name"]
+    ctxVars, _, _ := GetConnectionParams(volVars)
+
+    inodeCfg, err := invokeList(ctxVars, volName, true) // current image; its size decides whether a resize is needed
+    if (err != nil)
+    {
+        return nil, err
+    }
+
+    if (req.CapacityRange.RequiredBytes > 0 && inodeCfg[0].Size < uint64(req.CapacityRange.RequiredBytes))
+    {
+        sz := ((req.CapacityRange.RequiredBytes+4095)/4096)*4096 // round the requested size up to a multiple of 4096 bytes
+        _, err := invokeCLI(ctxVars, []string{ "modify", "--inc_size", "1", "--resize", fmt.Sprintf("%d", sz), volName })
+        if (err != nil)
+        {
+            return nil, err
+        }
+        inodeCfg, err = invokeList(ctxVars, volName, true) // re-read the image so the response reports its actual size
+        if (err != nil)
+        {
+            return nil, err
+        }
+    }
+
+    return &csi.ControllerExpandVolumeResponse{
+        CapacityBytes: int64(inodeCfg[0].Size),
+        NodeExpansionRequired: false,
+    }, nil
 }

 // ControllerGetVolume get volume info
--- a/csi/src/identityserver.go
+++ b/csi/src/identityserver.go
@@ -49,6 +49,13 @@ func (is *IdentityServer) GetPluginCapabilities(ctx context.Context, req *csi.Ge
                    },
                },
            },
+            {
+                Type: &csi.PluginCapability_VolumeExpansion_{
+                    VolumeExpansion: &csi.PluginCapability_VolumeExpansion{
+                        Type: csi.PluginCapability_VolumeExpansion_OFFLINE,
+                    },
+                },
+            },
        },
    }, nil
 }
--- a/csi/src/nodeserver.go
+++ b/csi/src/nodeserver.go
@@ -70,10 +70,10 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
    isBlock := req.GetVolumeCapability().GetBlock() != nil

    // Check that it's not already mounted
-    _, error := mount.IsNotMountPoint(ns.mounter, targetPath)
-    if (error != nil)
+    _, err := mount.IsNotMountPoint(ns.mounter, targetPath)
+    if (err != nil)
    {
-        if (os.IsNotExist(error))
+        if (os.IsNotExist(err))
        {
            if (isBlock)
            {
@@ -102,12 +102,12 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
        }
        else
        {
-            return nil, status.Error(codes.Internal, error.Error())
+            return nil, status.Error(codes.Internal, err.Error())
        }
    }

    ctxVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
+    err = json.Unmarshal([]byte(req.VolumeId), &ctxVars)
    if (err != nil)
    {
        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
@@ -147,70 +147,74 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
    }
    devicePath := strings.TrimSpace(stdoutStr)

-    // Check existing format
    diskMounter := &mount.SafeFormatAndMount{Interface: ns.mounter, Exec: utilexec.New()}
-    existingFormat, err := diskMounter.GetDiskFormat(devicePath)
-    if (err != nil)
-    {
-        klog.Errorf("failed to get disk format for path %s, error: %v", err)
-        // unmap NBD device
-        unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
-        if (unmapErr != nil)
-        {
-            klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
-        }
-        return nil, err
-    }
-
-    // Format the device (ext4 or xfs)
-    fsType := req.GetVolumeCapability().GetMount().GetFsType()
-    opt := req.GetVolumeCapability().GetMount().GetMountFlags()
-    opt = append(opt, "_netdev")
-    if ((req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY ||
-        req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_SINGLE_NODE_READER_ONLY) &&
-        !Contains(opt, "ro"))
-    {
-        opt = append(opt, "ro")
-    }
-    if (fsType == "xfs")
-    {
-        opt = append(opt, "nouuid")
-    }
-    readOnly := Contains(opt, "ro")
-    if (existingFormat == "" && !readOnly)
-    {
-        args := []string{}
-        switch fsType
-        {
-            case "ext4":
-                args = []string{"-m0", "-Enodiscard,lazy_itable_init=1,lazy_journal_init=1", devicePath}
-            case "xfs":
-                args = []string{"-K", devicePath}
-        }
-        if (len(args) > 0)
-        {
-            cmdOut, cmdErr := diskMounter.Exec.Command("mkfs."+fsType, args...).CombinedOutput()
-            if (cmdErr != nil)
-            {
-                klog.Errorf("failed to run mkfs error: %v, output: %v", cmdErr, string(cmdOut))
-                // unmap NBD device
-                unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
-                if (unmapErr != nil)
-                {
-                    klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
-                }
-                return nil, status.Error(codes.Internal, cmdErr.Error())
-            }
-        }
-    }
    if (isBlock)
    {
-        opt = append(opt, "bind")
-        err = diskMounter.Mount(devicePath, targetPath, fsType, opt)
+        err = diskMounter.Mount(devicePath, targetPath, "", []string{"bind"})
    }
    else
    {
+        var existingFormat string // separate declaration: ":=" below would shadow the function-level err
+        existingFormat, err = diskMounter.GetDiskFormat(devicePath)
+        if (err != nil)
+        {
+            klog.Errorf("failed to get disk format for path %s, error: %v", devicePath, err)
+            goto unmap
+        }
+
+        // Format the device (ext4 or xfs)
+        fsType := req.GetVolumeCapability().GetMount().GetFsType()
+        opt := req.GetVolumeCapability().GetMount().GetMountFlags()
+        opt = append(opt, "_netdev")
+        if ((req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY ||
+            req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_SINGLE_NODE_READER_ONLY) &&
+            !Contains(opt, "ro"))
+        {
+            opt = append(opt, "ro")
+        }
+        if (fsType == "xfs")
+        {
+            opt = append(opt, "nouuid")
+        }
+        readOnly := Contains(opt, "ro")
+        if (existingFormat == "" && !readOnly)
+        {
+            var cmdOut []byte
+            switch fsType
+            {
+                case "ext4":
+                    args := []string{"-m0", "-Enodiscard,lazy_itable_init=1,lazy_journal_init=1", devicePath}
+                    cmdOut, err = diskMounter.Exec.Command("mkfs.ext4", args...).CombinedOutput()
+                case "xfs":
+                    cmdOut, err = diskMounter.Exec.Command("mkfs.xfs", "-K", devicePath).CombinedOutput()
+            }
+            if (err != nil)
+            {
+                klog.Errorf("failed to run mkfs error: %v, output: %v", err, string(cmdOut))
+                goto unmap
+            }
+        }
+
+        err = diskMounter.FormatAndMount(devicePath, targetPath, fsType, opt)
+
+        // Try to run online resize on mount.
+        // FIXME: Implement online resize. It requires online resize support in vitastor-nbd.
+        if (err == nil && existingFormat != "" && !readOnly)
+        {
+            var cmdOut []byte
+            switch (fsType)
+            {
+                case "ext4":
+                    cmdOut, err = diskMounter.Exec.Command("resize2fs", devicePath).CombinedOutput()
+                case "xfs":
+                    cmdOut, err = diskMounter.Exec.Command("xfs_growfs", devicePath).CombinedOutput()
+            }
+            if (err != nil)
+            {
+                klog.Errorf("failed to run resizefs error: %v, output: %v", err, string(cmdOut))
+                goto unmap
+            }
+        }
    }
    if (err != nil)
    {
@@ -218,15 +222,18 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
            "failed to mount device path (%s) to path (%s) for volume (%s) error: %s",
            devicePath, targetPath, volName, err,
        )
-        // unmap NBD device
-        unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
-        if (unmapErr != nil)
-        {
-            klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
-        }
-        return nil, status.Error(codes.Internal, err.Error())
+        goto unmap
    }
    return &csi.NodePublishVolumeResponse{}, nil
+
+unmap:
+    // unmap NBD device
+    unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
+    if (unmapErr != nil)
+    {
+        klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
+    }
+    return nil, status.Error(codes.Internal, err.Error())
 }

 // NodeUnpublishVolume unmounts the volume from the target path
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,10 +1,10 @@
-vitastor (0.9.3-1) unstable; urgency=medium
+vitastor (1.2.0-1) unstable; urgency=medium

  * Bugfixes

 -- Vitaliy Filippov <vitalif@yourcmc.ru>  Fri, 03 Jun 2022 02:09:44 +0300

-vitastor (0.9.3-1) unstable; urgency=medium
+vitastor (1.2.0-1) unstable; urgency=medium

  * Implement NFS proxy
  * Add documentation
--- a/debian/control
+++ b/debian/control
@@ -2,7 +2,7 @@ Source: vitastor
 Section: admin
 Priority: optional
 Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
-Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev, libisal-dev
+Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev, libisal-dev, cmake, pkg-config
 Standards-Version: 4.5.0
 Homepage: https://vitastor.io/
 Rules-Requires-Root: no
--- a/debian/patched-qemu.Dockerfile
+++ b/debian/patched-qemu.Dockerfile
@@ -28,13 +28,19 @@ RUN apt-get --download-only source qemu

 ADD patches /root/vitastor/patches
 ADD src/qemu_driver.c /root/vitastor/src/qemu_driver.c
+
+#RUN set -e; \
+#    apt-get install -y wget; \
+#    wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg; \
+#    (echo deb http://vitastor.io/debian $REL main > /etc/apt/sources.list.d/vitastor.list); \
+#    (echo "APT::Install-Recommends false;" > /etc/apt/apt.conf) && \
+#    apt-get update; \
+#    apt-get install -y vitastor-client vitastor-client-dev quilt
+
 RUN set -e; \
-    apt-get install -y wget; \
-    wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg; \
-    (echo deb http://vitastor.io/debian $REL main > /etc/apt/sources.list.d/vitastor.list); \
-    (echo "APT::Install-Recommends false;" > /etc/apt/apt.conf) && \
+    dpkg -i /root/packages/vitastor-$REL/vitastor-client_*.deb /root/packages/vitastor-$REL/vitastor-client-dev_*.deb; \
    apt-get update; \
-    apt-get install -y vitastor-client vitastor-client-dev quilt; \
+    apt-get install -y quilt; \
    mkdir -p /root/packages/qemu-$REL; \
    rm -rf /root/packages/qemu-$REL/*; \
    cd /root/packages/qemu-$REL; \
@@ -48,7 +54,8 @@ RUN set -e; \
    quilt add block/vitastor.c; \
    cp /root/vitastor/src/qemu_driver.c block/vitastor.c; \
    quilt refresh; \
-    V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor3; \
+    V=$(head -n1 debian/changelog | perl -pe 's/5\.2\+dfsg-9/5.2+dfsg-11/; s/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor4; \
+    if [ "$REL" = bullseye ]; then V=${V}bullseye; fi; \
    DEBEMAIL="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v $V 'Plug Vitastor block driver'; \
    DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
    rm -rf /root/packages/qemu-$REL/qemu-*/
--- a/debian/vitastor.Dockerfile
+++ b/debian/vitastor.Dockerfile
@@ -35,8 +35,8 @@ RUN set -e -x; \
    mkdir -p /root/packages/vitastor-$REL; \
    rm -rf /root/packages/vitastor-$REL/*; \
    cd /root/packages/vitastor-$REL; \
-    cp -r /root/vitastor vitastor-0.9.3; \
-    cd vitastor-0.9.3; \
+    cp -r /root/vitastor vitastor-1.2.0; \
+    cd vitastor-1.2.0; \
    ln -s /root/fio-build/fio-*/ ./fio; \
    FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
    ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -49,8 +49,8 @@ RUN set -e -x; \
    rm -rf a b; \
    echo "dep:fio=$FIO" > debian/fio_version; \
    cd /root/packages/vitastor-$REL; \
-    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.9.3.orig.tar.xz vitastor-0.9.3; \
-    cd vitastor-0.9.3; \
+    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.2.0.orig.tar.xz vitastor-1.2.0; \
+    cd vitastor-1.2.0; \
    V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
    DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
    DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
--- a/docs/config.en.md
+++ b/docs/config.en.md
@@ -33,6 +33,7 @@ In the future, additional configuration methods may be added:

 - [Common](config/common.en.md)
 - [Network](config/network.en.md)
+- [Client](config/client.en.md)
 - [Global Disk Layout](config/layout-cluster.en.md)
 - [OSD Disk Layout](config/layout-osd.en.md)
 - [OSD Runtime Parameters](config/osd.en.md)
--- a/docs/config.ru.md
+++ b/docs/config.ru.md
@@ -36,6 +36,7 @@

 - [Общие](config/common.ru.md)
 - [Сеть](config/network.ru.md)
+- [Клиентский код](config/client.ru.md)
 - [Глобальные дисковые параметры](config/layout-cluster.ru.md)
 - [Дисковые параметры OSD](config/layout-osd.ru.md)
 - [Прочие параметры OSD](config/osd.ru.md)
--- a/docs/config/client.en.md
+++ b/docs/config/client.en.md
@@ -0,0 +1,103 @@
+[Documentation](../../README.md#documentation) → [Configuration](../config.en.md) → Client Parameters
+
+-----
+
+[Читать на русском](client.ru.md)
+
+# Client Parameters
+
+These parameters apply only to clients and affect their interaction with
+the cluster.
+
+- [client_max_dirty_bytes](#client_max_dirty_bytes)
+- [client_max_dirty_ops](#client_max_dirty_ops)
+- [client_enable_writeback](#client_enable_writeback)
+- [client_max_buffered_bytes](#client_max_buffered_bytes)
+- [client_max_buffered_ops](#client_max_buffered_ops)
+- [client_max_writeback_iodepth](#client_max_writeback_iodepth)
+
+## client_max_dirty_bytes
+
+- Type: integer
+- Default: 33554432
+- Can be changed online: yes
+
+Without [immediate_commit](layout-cluster.en.md#immediate_commit)=all this parameter sets the limit of "dirty"
+(not committed by fsync) data allowed by the client before forcing an
+additional fsync and committing the data. Also note that the client always
+holds a copy of uncommitted data in memory so this setting also affects
+RAM usage of clients.
+
+## client_max_dirty_ops
+
+- Type: integer
+- Default: 1024
+- Can be changed online: yes
+
+Same as client_max_dirty_bytes, but instead of total size, limits the number
+of uncommitted write operations.
+
+## client_enable_writeback
+
+- Type: boolean
+- Default: false
+- Can be changed online: yes
+
+This parameter enables client-side write buffering. This means that write
+requests are accumulated in memory for a short time before being sent to
+a Vitastor cluster, which allows sending them in parallel and increases the
+performance of some applications. Writes are buffered until the client forces
+a flush with fsync() or until the amount of buffered writes exceeds the
+limit.
+
+Write buffering significantly increases performance of some applications,
+for example, CrystalDiskMark under Windows (LOL :-D), but also any other
+applications if they do writes in one of two non-optimal ways: either if
+they do a lot of small (4 kb or so) sequential writes, or if they do a lot
+of small random writes, but without any parallelism or asynchrony, and also
+without calling fsync().
+
+With write buffering enabled, you can expect around 22000 T1Q1 random write
+iops in QEMU more or less regardless of the quality of your SSDs, and this
+number is in fact bound by QEMU itself rather than Vitastor (check it
+yourself by adding a "driver=null-co" disk in QEMU). Without write
+buffering, the current record is 9900 iops, but the number is usually
+even lower with non-ideal hardware, for example, it may be 5000 iops.
+
+Even when this parameter is enabled, write buffering isn't enabled until
+the client explicitly allows it, because enabling it without the client
+being aware of the fact that its writes may be buffered may lead to data
+loss. Because of this, older versions of clients don't support write
+buffering at all, newer versions of the QEMU driver allow write buffering
+only if it's enabled in disk settings with `-blockdev cache.direct=false`,
+and newer versions of FIO only allow write buffering if you don't specify
+`-direct=1`. NBD and NFS drivers allow write buffering by default.
+
+You can overcome this restriction too with the `client_writeback_allowed`
+parameter, but you shouldn't do that unless you **really** know what you
+are doing.
+
+## client_max_buffered_bytes
+
+- Type: integer
+- Default: 33554432
+- Can be changed online: yes
+
+Maximum total size of buffered writes which triggers write-back when reached.
+
+## client_max_buffered_ops
+
+- Type: integer
+- Default: 1024
+- Can be changed online: yes
+
+Maximum number of buffered writes which triggers write-back when reached.
+Multiple consecutive modified data regions are counted as 1 write here.
+
+## client_max_writeback_iodepth
+
+- Type: integer
+- Default: 256
+- Can be changed online: yes
+
+Maximum number of parallel writes when flushing buffered data to the server.
--- a/docs/config/client.ru.md
+++ b/docs/config/client.ru.md
@@ -0,0 +1,103 @@
+[Документация](../../README-ru.md#документация) → [Конфигурация](../config.ru.md) → Параметры клиентского кода
+
+-----
+
+[Read in English](client.en.md)
+
+# Параметры клиентского кода
+
+Данные параметры применяются только к клиентам Vitastor (QEMU, fio, NBD) и
+затрагивают логику их работы с кластером.
+
+- [client_max_dirty_bytes](#client_max_dirty_bytes)
+- [client_max_dirty_ops](#client_max_dirty_ops)
+- [client_enable_writeback](#client_enable_writeback)
+- [client_max_buffered_bytes](#client_max_buffered_bytes)
+- [client_max_buffered_ops](#client_max_buffered_ops)
+- [client_max_writeback_iodepth](#client_max_writeback_iodepth)
+
+## client_max_dirty_bytes
+
+- Тип: целое число
+- Значение по умолчанию: 33554432
+- Можно менять на лету: да
+
+При работе без [immediate_commit](layout-cluster.ru.md#immediate_commit)=all - это лимит объёма "грязных" (не
+зафиксированных fsync-ом) данных, при достижении которого клиент будет
+принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
+что в этом случае до момента fsync клиент хранит копию незафиксированных
+данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
+
+## client_max_dirty_ops
+
+- Тип: целое число
+- Значение по умолчанию: 1024
+- Можно менять на лету: да
+
+Аналогично client_max_dirty_bytes, но ограничивает количество
+незафиксированных операций записи вместо их общего объёма.
+
+## client_enable_writeback
+
+- Тип: булево (да/нет)
+- Значение по умолчанию: false
+- Можно менять на лету: да
+
+Данный параметр разрешает включать буферизацию записи в памяти. Буферизация
+означает, что операции записи отправляются на кластер Vitastor не сразу, а
+могут небольшое время накапливаться в памяти и сбрасываться сразу пакетами,
+до тех пор, пока либо не будет превышен лимит неотправленных записей, либо
+пока клиент не вызовет fsync.
+
+Буферизация значительно повышает производительность некоторых приложений,
+например, CrystalDiskMark в Windows (ха-ха :-D), но также и любых других,
+которые пишут на диск неоптимально: либо последовательно, но мелкими блоками
+(например, по 4 кб), либо случайно, но без параллелизма и без fsync - то
+есть, например, отправляя 128 операций записи в разные места диска, но не
+все сразу с помощью асинхронного I/O, а по одной.
+
+В QEMU с буферизацией записи можно ожидать показателя примерно 22000
+операций случайной записи в секунду в 1 поток и с глубиной очереди 1 (T1Q1)
+без fsync, почти вне зависимости от того, насколько хороши ваши диски - эта
+цифра упирается в сам QEMU. Без буферизации рекорд пока что - 9900 операций
+в секунду, но на железе похуже может быть и поменьше, например, 5000 операций
+в секунду.
+
+При этом, даже если данный параметр включён, буферизация не включается, если
+явно не разрешена клиентом, т.к. если клиент не знает, что запросы записи
+буферизуются, это может приводить к потере данных. Поэтому в старых версиях
+клиентских драйверов буферизация записи не включается вообще, в новых
+версиях QEMU-драйвера включается, только если разрешена опцией диска
+`-blockdev cache.direct=false`, а в fio - только если нет опции `-direct=1`.
+В NBD и NFS драйверах буферизация записи разрешена по умолчанию.
+
+Можно обойти и это ограничение с помощью параметра `client_writeback_allowed`,
+но делать так не надо, если только вы не уверены в том, что делаете, на все
+100%. :-)
+
+## client_max_buffered_bytes
+
+- Тип: целое число
+- Значение по умолчанию: 33554432
+- Можно менять на лету: да
+
+Максимальный общий размер буферизованных записей, при достижении которого
+начинается процесс сброса данных на сервер.
+
+## client_max_buffered_ops
+
+- Тип: целое число
+- Значение по умолчанию: 1024
+- Можно менять на лету: да
+
+Максимальное количество буферизованных записей, при достижении которого
+начинается процесс сброса данных на сервер. При этом несколько
+последовательных изменённых областей здесь считаются 1 записью.
+
+## client_max_writeback_iodepth
+
+- Тип: целое число
+- Значение по умолчанию: 256
+- Можно менять на лету: да
+
+Максимальное число параллельных операций записи при сбросе буферов на сервер.
--- a/docs/config/layout-cluster.en.md
+++ b/docs/config/layout-cluster.en.md
@@ -96,8 +96,9 @@ SSD cache or "media-cache" - for example, a lot of Seagate EXOS drives have
 it (they have internal SSD cache even though it's not stated in datasheets).

 Setting this parameter to "all" or "small" in OSD parameters requires enabling
-disable_journal_fsync and disable_meta_fsync, setting it to "all" also requires
-enabling disable_data_fsync.
+[disable_journal_fsync](layout-osd.en.md#disable_journal_fsync) and
+[disable_meta_fsync](layout-osd.en.md#disable_meta_fsync), setting it to
+"all" also requires enabling [disable_data_fsync](layout-osd.en.md#disable_data_fsync).

 TLDR: For optimal performance, set immediate_commit to "all" if you only use
 SSDs with supercapacitor-based power loss protection (nonvolatile
--- a/docs/config/layout-cluster.ru.md
+++ b/docs/config/layout-cluster.ru.md
@@ -103,8 +103,9 @@ HDD-дисках с внутренним SSD или "медиа" кэшем - н
 указано в спецификациях).

 Указание "all" или "small" в настройках / командной строке OSD требует
-включения disable_journal_fsync и disable_meta_fsync, значение "all" также
-требует включения disable_data_fsync.
+включения [disable_journal_fsync](layout-osd.ru.md#disable_journal_fsync) и
+[disable_meta_fsync](layout-osd.ru.md#disable_meta_fsync), значение "all"
+также требует включения [disable_data_fsync](layout-osd.ru.md#disable_data_fsync).

 Итого, вкратце: для оптимальной производительности установите
 immediate_commit в значение "all", если вы используете в кластере только SSD
--- a/docs/config/layout-osd.en.md
+++ b/docs/config/layout-osd.en.md
@@ -197,21 +197,22 @@ Must be equal or a multiple of [bitmap_granularity](layout-cluster.en.md#bitmap_

 Checksums increase metadata size by 4 bytes per each csum_block_size of data.

-Checksums are always a compromise:
+Checksums are always a tradeoff:
 1. You either sacrifice +1 GB RAM per 1 TB of data
 2. Or you raise csum_block_size, for example, to 32k and sacrifice
   50% random write iops due to checksum read-modify-write
 3. Or you turn off [inmemory_metadata](osd.en.md#inmemory_metadata) and
   sacrifice 50% random read iops due to checksum reads

-Option 1 (default) is recommended for all-flash setups because these usually
-have enough RAM.
+All-flash clusters usually have enough RAM to use default csum_block_size,
+which uses 1 GB RAM per 1 TB of data. HDD clusters usually don't.

-Option 2 is recommended for HDD-only setups. HDD-only setups usually do NOT
-have enough RAM for the default 4 KB csum_block_size.
+Thus, recommended setups are:
+1. All-flash, 1 GB RAM per 1 TB data: default (csum_block_size=4k)
+2. All-flash, less RAM: csum_block_size=4k + inmemory_metadata=false
+3. Hybrid HDD+SSD: csum_block_size=4k + inmemory_metadata=false
+4. HDD-only, faster random read: csum_block_size=32k
+5. HDD-only, faster random write: csum_block_size=4k +
+   inmemory_metadata=false + meta_io=cached

-Option 3 is recommended for SSD+HDD setups (because metadata SSDs will handle
-extra reads without any performance drop) and also *maybe* for NVMe all-flash
-setups when you don't have enough RAM (because NVMe drives have plenty
-of read iops to spare). You may also consider enabling
-[cached_read_meta](osd.en.md#cached_read_meta) in this case.
+See also [meta_io](osd.en.md#meta_io).
--- a/docs/config/layout-osd.ru.md
+++ b/docs/config/layout-osd.ru.md
@@ -220,17 +220,12 @@ csum_block_size данных.
   жертвуете 50% скорости случайного чтения из-за чтения контрольных сумм
   с диска

-Вариант 1 (при настройках по умолчанию) рекомендуется для SSD (All-Flash)
-кластеров, потому что памяти в них обычно хватает.
+Таким образом, рекомендуются следующие варианты настроек:
+1. All-flash, 1 ГБ памяти на 1 ТБ данных: по умолчанию (csum_block_size=4k)
+2. All-flash, меньше памяти: csum_block_size=4k + inmemory_metadata=false
+3. Гибридные HDD+SSD: csum_block_size=4k + inmemory_metadata=false
+4. Только HDD, быстрее случайное чтение: csum_block_size=32k
+5. Только HDD, быстрее случайная запись: csum_block_size=4k +
+   inmemory_metadata=false + meta_io=cached

-Вариант 2 рекомендуется для кластеров на одних жёстких дисках (без SSD
-под метаданные). На 4 кб блок контрольной суммы памяти в таких кластерах
-обычно НЕ хватает.
-
-Вариант 3 рекомендуется для гибридных кластеров (SSD+HDD), потому что
-скорости SSD под метаданными хватит, чтобы обработать дополнительные чтения
-без снижения производительности. Также вариант 3 *может* рекомендоваться
-для All-Flash кластеров на основе NVMe-дисков, когда памяти НЕ достаточно,
-потому что NVMe-диски имеют огромный запас производительности по чтению.
-В таких случаях, возможно, также имеет смысл включать параметр
-[cached_read_meta](osd.ru.md#cached_read_meta).
+Смотрите также [meta_io](osd.ru.md#meta_io).
--- a/docs/config/network.en.md
+++ b/docs/config/network.en.md
@@ -20,6 +20,7 @@ between clients, OSDs and etcd.
 - [rdma_max_msg](#rdma_max_msg)
 - [rdma_max_recv](#rdma_max_recv)
 - [rdma_max_send](#rdma_max_send)
+- [rdma_odp](#rdma_odp)
 - [peer_connect_interval](#peer_connect_interval)
 - [peer_connect_timeout](#peer_connect_timeout)
 - [osd_idle_timeout](#osd_idle_timeout)
@@ -30,7 +31,6 @@ between clients, OSDs and etcd.
 - [etcd_slow_timeout](#etcd_slow_timeout)
 - [etcd_keepalive_timeout](#etcd_keepalive_timeout)
 - [etcd_ws_keepalive_timeout](#etcd_ws_keepalive_timeout)
- [client_dirty_limit](#client_dirty_limit)

 ## tcp_header_buffer_size

@@ -69,11 +69,14 @@ but they are not connected to the cluster.
 - Type: string

 RDMA device name to use for Vitastor OSD communications (for example,
-"rocep5s0f0"). Please note that Vitastor RDMA requires Implicit On-Demand
-Paging (Implicit ODP) and Scatter/Gather (SG) support from the RDMA device
-to work. For example, Mellanox ConnectX-3 and older adapters don't have
-Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
-root to list available RDMA devices and their features.
+"rocep5s0f0"). Now Vitastor supports all adapters, even ones without
+ODP support, like Mellanox ConnectX-3 and non-Mellanox cards.
+
+Versions up to Vitastor 1.2.0 required ODP which is only present in
+Mellanox ConnectX >= 4. See also [rdma_odp](#rdma_odp).
+
+Run `ibv_devinfo -v` as root to list available RDMA devices and their
+features.

 Remember that you also have to configure your network switches if you use
 RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
@@ -148,6 +151,28 @@ less than `rdma_max_recv` so the receiving side doesn't run out of buffers.
 Doesn't affect memory usage - additional memory isn't allocated for send
 operations.

+## rdma_odp
+
+- Type: boolean
+- Default: false
+
+Use RDMA with On-Demand Paging. ODP is currently only available on Mellanox
+ConnectX-4 and newer adapters. ODP allows the RDMA adapter to use memory
+without registering it explicitly. This, in turn, allows skipping memory
+copying during sending. One would think this should improve performance, but
+**in reality** RDMA performance with ODP is **drastically** worse. Example
+3-node cluster with 8 NVMe in each node and 2*25 GBit/s ConnectX-6 RDMA network
+without ODP pushes 3950000 read iops, but only 239000 iops with ODP...
+
+This happens because Mellanox ODP implementation seems to be based on
+message retransmissions when the adapter doesn't know about the buffer yet -
+it likely uses standard "RNR retransmissions" (RNR = receiver not ready)
+which is generally slow in RDMA/RoCE networks. Here's a presentation about
+it from ISPASS-2021 conference: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
+
+ODP support is retained in the code just in case a good ODP implementation
+appears one day.
+
 ## peer_connect_interval

 - Type: seconds
@@ -240,17 +265,3 @@ etcd_report_interval to guarantee that keepalive actually works.

 etcd websocket ping interval required to keep the connection alive and
 detect disconnections quickly.
-
-## client_dirty_limit
-
- Type: integer
- Default: 33554432
- Can be changed online: yes
-
-Without immediate_commit=all this parameter sets the limit of "dirty"
-(not committed by fsync) data allowed by the client before forcing an
-additional fsync and committing the data. Also note that the client always
-holds a copy of uncommitted data in memory so this setting also affects
-RAM usage of clients.
-
-This parameter doesn't affect OSDs themselves.
--- a/docs/config/network.ru.md
+++ b/docs/config/network.ru.md
@@ -20,6 +20,7 @@
 - [rdma_max_msg](#rdma_max_msg)
 - [rdma_max_recv](#rdma_max_recv)
 - [rdma_max_send](#rdma_max_send)
+- [rdma_odp](#rdma_odp)
 - [peer_connect_interval](#peer_connect_interval)
 - [peer_connect_timeout](#peer_connect_timeout)
 - [osd_idle_timeout](#osd_idle_timeout)
@@ -30,7 +31,6 @@
 - [etcd_slow_timeout](#etcd_slow_timeout)
 - [etcd_keepalive_timeout](#etcd_keepalive_timeout)
 - [etcd_ws_keepalive_timeout](#etcd_ws_keepalive_timeout)
- [client_dirty_limit](#client_dirty_limit)

 ## tcp_header_buffer_size

@@ -72,12 +72,15 @@ RDMA может быть нужно только если у клиентов е
 - Тип: строка

 Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
-Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
-Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Например,
-адаптеры Mellanox ConnectX-3 и более старые не поддерживают Implicit ODP и
-потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
-суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
-параметры и возможности.
+Сейчас Vitastor поддерживает все модели адаптеров, включая те, у которых
+нет поддержки ODP, то есть вы можете использовать RDMA с ConnectX-3 и
+картами производства не Mellanox.
+
+Версии Vitastor до 1.2.0 включительно требовали ODP, который есть только
+на Mellanox ConnectX 4 и более новых. См. также [rdma_odp](#rdma_odp).
+
+Запустите `ibv_devinfo -v` от имени суперпользователя, чтобы посмотреть
+список доступных RDMA-устройств, их параметры и возможности.

 Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
 правильно настроить для него коммутаторы, иначе вы можете столкнуться с
@@ -156,6 +159,29 @@ OSD в любом случае согласовывают реальное зн
 Не влияет на потребление памяти - дополнительная память на операции отправки
 не выделяется.

+## rdma_odp
+
+- Тип: булево (да/нет)
+- Значение по умолчанию: false
+
+Использовать RDMA с On-Demand Paging. ODP - функция, доступная пока что
+исключительно на адаптерах Mellanox ConnectX-4 и более новых. ODP позволяет
+не регистрировать память для её использования RDMA-картой. Благодаря этому
+можно не копировать данные при отправке их в сеть и, казалось бы, это должно
+улучшать производительность - но **по факту** получается так, что
+производительность только ухудшается, причём сильно. Пример - на 3-узловом
+кластере с 8 NVMe в каждом узле и сетью 2*25 Гбит/с на чтение с RDMA без ODP
+удаётся снять 3950000 iops, а с ODP - всего 239000 iops...
+
+Это происходит из-за того, что реализация ODP у Mellanox неоптимальная и
+основана на повторной передаче сообщений, когда карте не известен буфер -
+вероятно, на стандартных "RNR retransmission" (RNR = receiver not ready).
+А данные повторные передачи в RDMA/RoCE - всегда очень медленная штука.
+Презентация на эту тему с конференции ISPASS-2021: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
+
+Возможность использования ODP сохранена в коде на случай, если вдруг в один
+прекрасный день появится хорошая реализация ODP.
+
 ## peer_connect_interval

 - Тип: секунды
@@ -251,17 +277,3 @@ etcd_report_interval, чтобы keepalive гарантированно рабо
 - Можно менять на лету: да

 Интервал проверки живости вебсокет-подключений к etcd.
-
-## client_dirty_limit
-
- Тип: целое число
- Значение по умолчанию: 33554432
- Можно менять на лету: да
-
-При работе без immediate_commit=all - это лимит объёма "грязных" (не
-зафиксированных fsync-ом) данных, при достижении которого клиент будет
-принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
-что в этом случае до момента fsync клиент хранит копию незафиксированных
-данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
-
-Параметр не влияет на сами OSD.
--- a/docs/config/osd.en.md
+++ b/docs/config/osd.en.md
@@ -11,6 +11,7 @@ initialization and can be changed - either with an OSD restart or, for some of
 them, even without restarting by updating configuration in etcd.

 - [etcd_report_interval](#etcd_report_interval)
+- [etcd_stats_interval](#etcd_stats_interval)
 - [run_primary](#run_primary)
 - [osd_network](#osd_network)
 - [bind_address](#bind_address)
@@ -31,9 +32,9 @@ them, even without restarting by updating configuration in etcd.
 - [max_flusher_count](#max_flusher_count)
 - [inmemory_metadata](#inmemory_metadata)
 - [inmemory_journal](#inmemory_journal)
- [cached_read_data](#cached_read_data)
- [cached_read_meta](#cached_read_meta)
- [cached_read_journal](#cached_read_journal)
+- [data_io](#data_io)
+- [meta_io](#meta_io)
+- [journal_io](#journal_io)
 - [journal_sector_buffer_count](#journal_sector_buffer_count)
 - [journal_no_same_sector_overwrites](#journal_no_same_sector_overwrites)
 - [throttle_small_writes](#throttle_small_writes)
@@ -56,11 +57,21 @@ them, even without restarting by updating configuration in etcd.
 - Type: seconds
 - Default: 5

-Interval at which OSDs report their state to etcd. Affects OSD lease time
+Interval at which OSDs report their liveness to etcd. Affects OSD lease time
 and thus the failover speed. Lease time is equal to this parameter value
 plus max_etcd_attempts * etcd_quick_timeout because it should be guaranteed
 that every OSD always refreshes its lease in time.

+## etcd_stats_interval
+
+- Type: seconds
+- Default: 30
+
+Interval at which OSDs report their statistics to etcd. Highly affects the
+imposed load on etcd, because statistics include a key for every OSD and
+for every PG. At the same time, low statistics intervals make `vitastor-cli`
+statistics more responsive.
+
 ## run_primary

 - Type: boolean
@@ -258,45 +269,59 @@ is typically very small because it's sufficient to have 16-32 MB journal
 for SSD OSDs. However, in theory it's possible that you'll want to turn it
 off for hybrid (HDD+SSD) OSDs with large journals on quick devices.

-## cached_read_data
+## data_io

- Type: boolean
- Default: false
+- Type: string
+- Default: direct

-Read data through Linux page cache, i.e. use a file descriptor opened without
-O_DIRECT for data reads. May improve read performance for frequently accessed
-data if it fits in RAM. Memory in page cache is shared by all processes and
-not accounted in OSD memory consumption.
+I/O mode for *data*. One of "direct", "cached" or "directsync". Corresponds
+to O_DIRECT, O_SYNC and O_DIRECT|O_SYNC, respectively.

-## cached_read_meta
+Choose "cached" to use Linux page cache. This may improve read performance
+for hot data and slower disks - HDDs and maybe SATA SSDs - but will slightly
+decrease write performance for fast disks because page cache is an overhead
+itself.

- Type: boolean
- Default: false
+Choose "directsync" to use [immediate_commit](layout-cluster.en.md#immediate_commit)
+(which requires disable_data_fsync) with drives having write-back cache
+which can't be turned off, for example, Intel Optane. Also note that *some*
+desktop SSDs (for example, HP EX950) may ignore O_SYNC thus making
+disable_data_fsync unsafe even with "directsync".

-Read metadata through Linux page cache. May be beneficial when checksums
-are enabled and [inmemory_metadata](#inmemory_metadata) is disabled, because
-in this case metadata blocks are read from disk to verify checksums on every
-read request and caching them may reduce this extra read load.
+## meta_io

-Absolutely pointless to enable with enabled inmemory_metadata because all
-metadata is kept in memory anyway, and likely pointless without checksums,
-because in that case, metadata blocks are read from disk only during journal
+- Type: string
+- Default: direct
+
+I/O mode for *metadata*. One of "direct", "cached" or "directsync".
+
+"cached" may improve read performance, but only under the following conditions:
+1. your drives are relatively slow (HDD, SATA SSD), and
+2. checksums are enabled, and
+3. [inmemory_metadata](#inmemory_metadata) is disabled.
+Under all these conditions, metadata blocks are read from disk on every
+read request to verify checksums and caching them may reduce this extra
+read load. Without (3) metadata is never read from the disk after starting,
+and without (2) metadata blocks are read from disk only during journal
 flushing.

-If the same device is used for data and metadata, enabling [cached_read_data](#cached_read_data)
-also enables this parameter, given that it isn't turned off explicitly.
+"directsync" has the same meaning as for [data_io](#data_io).

-## cached_read_journal
+If the same device is used for data and metadata, meta_io by default is set
+to the same value as [data_io](#data_io).

- Type: boolean
- Default: false
+## journal_io

-Read buffered data from journal through Linux page cache. Does not have sense
-without disabling [inmemory_journal](#inmemory_journal), which, again, is
-enabled by default.
+- Type: string
+- Default: direct

-If the same device is used for metadata and journal, enabling [cached_read_meta](#cached_read_meta)
-also enables this parameter, given that it isn't turned off explicitly.
+I/O mode for *journal*. One of "direct", "cached" or "directsync".
+
+Here, "cached" may only improve read performance for recent writes and
+only if [inmemory_journal](#inmemory_journal) is turned off.
+
+If the same device is used for metadata and journal, journal_io by default
+is set to the same value as [meta_io](#meta_io).

 ## journal_sector_buffer_count

--- a/docs/config/osd.ru.md
+++ b/docs/config/osd.ru.md
@@ -12,6 +12,7 @@
 изменения конфигурации в etcd.

 - [etcd_report_interval](#etcd_report_interval)
+- [etcd_stats_interval](#etcd_stats_interval)
 - [run_primary](#run_primary)
 - [osd_network](#osd_network)
 - [bind_address](#bind_address)
@@ -32,9 +33,9 @@
 - [max_flusher_count](#max_flusher_count)
 - [inmemory_metadata](#inmemory_metadata)
 - [inmemory_journal](#inmemory_journal)
- [cached_read_data](#cached_read_data)
- [cached_read_meta](#cached_read_meta)
- [cached_read_journal](#cached_read_journal)
+- [data_io](#data_io)
+- [meta_io](#meta_io)
+- [journal_io](#journal_io)
 - [journal_sector_buffer_count](#journal_sector_buffer_count)
 - [journal_no_same_sector_overwrites](#journal_no_same_sector_overwrites)
 - [throttle_small_writes](#throttle_small_writes)
@@ -57,11 +58,21 @@
 - Тип: секунды
 - Значение по умолчанию: 5

-Интервал, с которым OSD обновляет своё состояние в etcd. Значение параметра
-влияет на время резервации (lease) OSD и поэтому на скорость переключения
+Интервал, с которым OSD сообщает о том, что жив, в etcd. Значение параметра
+влияет на время резервации (lease) OSD и поэтому - на скорость переключения
 при падении OSD. Время lease равняется значению этого параметра плюс
 max_etcd_attempts * etcd_quick_timeout.

+## etcd_stats_interval
+
+- Тип: секунды
+- Значение по умолчанию: 30
+
+Интервал, с которым OSD обновляет свою статистику в etcd. Сильно влияет на
+создаваемую нагрузку на etcd, потому что статистика содержит по ключу на
+каждый OSD и на каждую PG. В то же время низкий интервал делает
+статистику, печатаемую `vitastor-cli`, отзывчивей.
+
 ## run_primary

 - Тип: булево (да/нет)
@@ -266,50 +277,62 @@ Flusher - это микро-поток (корутина), которая коп
 параметра может оказаться полезным для гибридных OSD (HDD+SSD) с большими
 журналами, расположенными на быстром по сравнению с HDD устройстве.

-## cached_read_data
+## data_io

- Тип: булево (да/нет)
- Значение по умолчанию: false
+- Тип: строка
+- Значение по умолчанию: direct

-Читать данные через системный кэш Linux (page cache), то есть, использовать
-для чтения данных файловый дескриптор, открытый без флага O_DIRECT. Может
-улучшить производительность чтения для часто используемых данных, если они
-помещаются в память. Память кэша разделяется между всеми процессами в
-системе и не учитывается в потреблении памяти процессом OSD.
+Режим ввода-вывода для *данных*. Одно из значений "direct", "cached" или
+"directsync", означающих O_DIRECT, O_SYNC и O_DIRECT|O_SYNC, соответственно.

-## cached_read_meta
+Выберите "cached", чтобы использовать системный кэш Linux (page cache) при
+чтении и записи. Это может улучшить скорость чтения горячих данных с
+относительно медленных дисков - HDD и, возможно, SATA SSD - но немного
+снижает производительность записи для быстрых дисков, так как кэш сам по
+себе тоже добавляет накладные расходы.

- Тип: булево (да/нет)
- Значение по умолчанию: false
+Выберите "directsync", если хотите задействовать
+[immediate_commit](layout-cluster.ru.md#immediate_commit) (требующий
+включения disable_data_fsync) на дисках с неотключаемым кэшем. Пример таких
+дисков - Intel Optane. При этом также стоит иметь в виду, что *некоторые*
+настольные SSD (например, HP EX950) игнорируют флаг O_SYNC, делая отключение
+fsync небезопасным даже с режимом "directsync".

-Читать метаданные через системный кэш Linux. Может быть полезно, когда
-включены контрольные суммы, а параметр [inmemory_metadata](#inmemory_metadata)
-отключён, так как в этом случае блоки метаданных читаются с диска при каждом
-запросе чтения для проверки контрольных сумм и их кэширование может снизить
-дополнительную нагрузку на диск.
+## meta_io

-Абсолютно бессмысленно включать данный параметр, если параметр
-inmemory_metadata включён (по умолчанию это так), и также вероятно
-бессмысленно включать его, если не включены контрольные суммы, так как в
-этом случае блоки метаданных читаются с диска только во время сброса
-журнала.
+- Тип: строка
+- Значение по умолчанию: direct

-Если одно и то же устройство используется для данных и метаданных, включение
-[cached_read_data](#cached_read_data) также включает данный параметр, при
-условии, что он не отключён явным образом.
+Режим ввода-вывода для *метаданных*. Одно из значений "direct", "cached" или
+"directsync".

-## cached_read_journal
+"cached" может улучшить скорость чтения, если:
+1. у вас медленные диски (HDD, SATA SSD)
+2. контрольные суммы включены
+3. параметр [inmemory_metadata](#inmemory_metadata) отключён.
+При этих условиях блоки метаданных читаются с диска при каждом запросе чтения
+для проверки контрольных сумм и их кэширование может снизить дополнительную
+нагрузку на диск. Без (3) метаданные никогда не читаются с диска после
+запуска OSD, а без (2) блоки метаданных читаются только при сбросе журнала.

- Тип: булево (да/нет)
- Значение по умолчанию: false
+Если одно и то же устройство используется для данных и метаданных, режим
+ввода-вывода метаданных по умолчанию устанавливается равным [data_io](#data_io).

-Читать буферизованные в журнале данные через системный кэш Linux. Не имеет
-смысла без отключения параметра [inmemory_journal](#inmemory_journal),
-который, опять же, по умолчанию включён.
+## journal_io
+
+- Тип: строка
+- Значение по умолчанию: direct
+
+Режим ввода-вывода для *журнала*. Одно из значений "direct", "cached" или
+"directsync".
+
+Здесь "cached" может улучшить скорость чтения только недавно записанных
+данных и только если параметр [inmemory_journal](#inmemory_journal)
+отключён.

 Если одно и то же устройство используется для метаданных и журнала,
-включение [cached_read_meta](#cached_read_meta) также включает данный
-параметр, при условии, что он не отключён явным образом.
+режим ввода-вывода журнала по умолчанию устанавливается равным
+[meta_io](#meta_io).

 ## journal_sector_buffer_count

--- a/docs/config/pool.en.md
+++ b/docs/config/pool.en.md
@@ -205,9 +205,8 @@ This parameter usually doesn't require to be changed.
 - Default: 131072

 Block size for this pool. The value from /vitastor/config/global is used when
-unspecified. If your cluster has OSDs with different block sizes then pool must
-be restricted by [osd_tags](#osd_tags) to only include OSDs with matching block
-size.
+unspecified. Only OSDs with matching block_size are used for each pool. If you
+want to further restrict OSDs for the pool, use [osd_tags](#osd_tags).

 Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-cluster.en.md#block_size).

@@ -216,10 +215,9 @@ Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-c
 - Type: integer
 - Default: 4096

-"Sector" size of virtual disks in this pool. The value from
-/vitastor/config/global is used when unspecified. Similar to block_size, the
-pool must be restricted by [osd_tags](#osd_tags) to only include OSDs with
-matching bitmap_granularity.
+"Sector" size of virtual disks in this pool. The value from /vitastor/config/global
+is used when unspecified. Similarly to block_size, only OSDs with matching
+bitmap_granularity are used for each pool.

 Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-cluster.en.md#bitmap_granularity).

@@ -229,10 +227,11 @@ Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-c
 - Default: none

 Immediate commit setting for this pool. The value from /vitastor/config/global
-is used when unspecified. Similar to block_size, the pool must be restricted by
-[osd_tags](#osd_tags) to only include OSDs with compatible immediate_commit.
-Compatible means that a pool with non-immediate commit will work with OSDs with
-immediate commit enabled, but not vice versa.
+is used when unspecified. Similarly to block_size, only OSDs with compatible
+immediate_commit are used for each pool. "Compatible" means that a pool with
+non-immediate commit will use OSDs with immediate commit enabled, but not vice
+versa. I.e., pools with "none" use all OSDs, pools with "small" only use OSDs
+with "all" or "small", and pools with "all" only use OSDs with "all".

 Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-cluster.en.md#immediate_commit).

--- a/docs/config/pool.ru.md
+++ b/docs/config/pool.ru.md
@@ -208,8 +208,9 @@ PG в Vitastor эферемерны, то есть вы можете менят

 Размер блока для данного пула. Если не задан, используется значение из
 /vitastor/config/global. Если в вашем кластере есть OSD с разными размерами
-блока, пул должен быть ограничен только OSD, блок которых равен блоку пула,
-с помощью [osd_tags](#osd_tags).
+блока, пул будет использовать только OSD с размером блока, равным размеру блока
+пула. Если вы хотите сильнее ограничить набор используемых для пула OSD -
+используйте [osd_tags](#osd_tags).

 О самом параметре читайте в разделе [Дисковые параметры уровня кластера](layout-cluster.ru.md#block_size).

@@ -219,9 +220,8 @@ PG в Vitastor эферемерны, то есть вы можете менят
 - По умолчанию: 4096

 Размер "сектора" виртуальных дисков в данном пуле. Если не задан, используется
-значение из /vitastor/config/global. Аналогично block_size, пул должен быть
-ограничен OSD со значением bitmap_granularity, равным значению пула, с помощью
-[osd_tags](#osd_tags).
+значение из /vitastor/config/global. Аналогично block_size, каждый пул будет
+использовать только OSD с совпадающей с пулом настройкой bitmap_granularity.

 О самом параметре читайте в разделе [Дисковые параметры уровня кластера](layout-cluster.ru.md#bitmap_granularity).

@@ -231,11 +231,13 @@ PG в Vitastor эферемерны, то есть вы можете менят
 - По умолчанию: none

 Настройка мгновенного коммита для данного пула. Если не задана, используется
-значение из /vitastor/config/global. Аналогично block_size, пул должен быть
-ограничен OSD со значением bitmap_granularity, совместимым со значением пула, с
-помощью [osd_tags](#osd_tags). Совместимость означает, что пул с отключенным
-мгновенным коммитом может работать на OSD с включённым мгновенным коммитом, но
-не наоборот.
+значение из /vitastor/config/global. Аналогично block_size, каждый пул будет
+использовать только OSD с *совместимыми* настройками immediate_commit.
+"Совместимыми" означает, что пул с отключенным мгновенным коммитом будет
+использовать OSD с включённым мгновенным коммитом, но не наоборот. То есть,
+пул со значением "none" будет использовать все OSD, пул со "small" будет
+использовать OSD с "all" или "small", а пул с "all" будет использовать только
+OSD с "all".

 О самом параметре читайте в разделе [Дисковые параметры уровня кластера](layout-cluster.ru.md#immediate_commit).

--- a/docs/config/src/client.en.md
+++ b/docs/config/src/client.en.md
@@ -0,0 +1,4 @@
+# Client Parameters
+
+These parameters apply only to clients and affect their interaction with
+the cluster.
--- a/docs/config/src/client.ru.md
+++ b/docs/config/src/client.ru.md
@@ -0,0 +1,4 @@
+# Параметры клиентского кода
+
+Данные параметры применяются только к клиентам Vitastor (QEMU, fio, NBD) и
+затрагивают логику их работы с кластером.
--- a/docs/config/src/client.yml
+++ b/docs/config/src/client.yml
@@ -0,0 +1,124 @@
+- name: client_max_dirty_bytes
+  type: int
+  default: 33554432
+  online: true
+  info: |
+    Without [immediate_commit](layout-cluster.en.md#immediate_commit)=all this parameter sets the limit of "dirty"
+    (not committed by fsync) data allowed by the client before forcing an
+    additional fsync and committing the data. Also note that the client always
+    holds a copy of uncommitted data in memory so this setting also affects
+    RAM usage of clients.
+  info_ru: |
+    При работе без [immediate_commit](layout-cluster.ru.md#immediate_commit)=all - это лимит объёма "грязных" (не
+    зафиксированных fsync-ом) данных, при достижении которого клиент будет
+    принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
+    что в этом случае до момента fsync клиент хранит копию незафиксированных
+    данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
+- name: client_max_dirty_ops
+  type: int
+  default: 1024
+  online: true
+  info: |
+    Same as client_max_dirty_bytes, but instead of total size, limits the number
+    of uncommitted write operations.
+  info_ru: |
+    Аналогично client_max_dirty_bytes, но ограничивает количество
+    незафиксированных операций записи вместо их общего объёма.
+- name: client_enable_writeback
+  type: bool
+  default: false
+  online: true
+  info: |
+    This parameter enables client-side write buffering. This means that write
+    requests are accumulated in memory for a short time before being sent to
+    a Vitastor cluster, which allows sending them in parallel and increasing
+    performance of some applications. Writes are buffered until client forces
+    a flush with fsync() or until the amount of buffered writes exceeds the
+    limit.
+
+    Write buffering significantly increases performance of some applications,
+    for example, CrystalDiskMark under Windows (LOL :-D), but also any other
+    applications if they do writes in one of two non-optimal ways: either if
+    they do a lot of small (4 kb or so) sequential writes, or if they do a lot
+    of small random writes, but without any parallelism or asynchrony, and also
+    without calling fsync().
+
+    With write buffering enabled, you can expect around 22000 T1Q1 random write
+    iops in QEMU more or less regardless of the quality of your SSDs, and this
+    number is in fact bound by QEMU itself rather than Vitastor (check it
+    yourself by adding a "driver=null-co" disk in QEMU). Without write
+    buffering, the current record is 9900 iops, but the number is usually
+    even lower with non-ideal hardware, for example, it may be 5000 iops.
+
+    Even when this parameter is enabled, write buffering isn't enabled until
+    the client explicitly allows it, because enabling it without the client
+    being aware of the fact that its writes may be buffered may lead to data
+    loss. Because of this, older versions of clients don't support write
+    buffering at all, newer versions of the QEMU driver allow write buffering
+    only if it's enabled in disk settings with `-blockdev cache.direct=false`,
+    and newer versions of FIO only allow write buffering if you don't specify
+    `-direct=1`. NBD and NFS drivers allow write buffering by default.
+
+    You can overcome this restriction too with the `client_writeback_allowed`
+    parameter, but you shouldn't do that unless you **really** know what you
+    are doing.
+  info_ru: |
+    Данный параметр разрешает включать буферизацию записи в памяти. Буферизация
+    означает, что операции записи отправляются на кластер Vitastor не сразу, а
+    могут небольшое время накапливаться в памяти и сбрасываться сразу пакетами,
+    до тех пор, пока либо не будет превышен лимит неотправленных записей, либо
+    пока клиент не вызовет fsync.
+
+    Буферизация значительно повышает производительность некоторых приложений,
+    например, CrystalDiskMark в Windows (ха-ха :-D), но также и любых других,
+    которые пишут на диск неоптимально: либо последовательно, но мелкими блоками
+    (например, по 4 кб), либо случайно, но без параллелизма и без fsync - то
+    есть, например, отправляя 128 операций записи в разные места диска, но не
+    все сразу с помощью асинхронного I/O, а по одной.
+
+    В QEMU с буферизацией записи можно ожидать показателя примерно 22000
+    операций случайной записи в секунду в 1 поток и с глубиной очереди 1 (T1Q1)
+    без fsync, почти вне зависимости от того, насколько хороши ваши диски - эта
+    цифра упирается в сам QEMU. Без буферизации рекорд пока что - 9900 операций
+    в секунду, но на железе похуже может быть и поменьше, например, 5000 операций
+    в секунду.
+
+    При этом, даже если данный параметр включён, буферизация не включается, если
+    явно не разрешена клиентом, т.к. если клиент не знает, что запросы записи
+    буферизуются, это может приводить к потере данных. Поэтому в старых версиях
+    клиентских драйверов буферизация записи не включается вообще, в новых
+    версиях QEMU-драйвера включается, только если разрешена опцией диска
+    `-blockdev cache.direct=false`, а в fio - только если нет опции `-direct=1`.
+    В NBD и NFS драйверах буферизация записи разрешена по умолчанию.
+
+    Можно обойти и это ограничение с помощью параметра `client_writeback_allowed`,
+    но делать так не надо, если только вы не уверены в том, что делаете, на все
+    100%. :-)
+- name: client_max_buffered_bytes
+  type: int
+  default: 33554432
+  online: true
+  info: |
+    Maximum total size of buffered writes which triggers write-back when reached.
+  info_ru: |
+    Максимальный общий размер буферизованных записей, при достижении которого
+    начинается процесс сброса данных на сервер.
+- name: client_max_buffered_ops
+  type: int
+  default: 1024
+  online: true
+  info: |
+    Maximum number of buffered writes which triggers write-back when reached.
+    Multiple consecutive modified data regions are counted as 1 write here.
+  info_ru: |
+    Максимальное количество буферизованных записей, при достижении которого
+    начинается процесс сброса данных на сервер. При этом несколько
+    последовательных изменённых областей здесь считаются 1 записью.
+- name: client_max_writeback_iodepth
+  type: int
+  default: 256
+  online: true
+  info: |
+    Maximum number of parallel writes when flushing buffered data to the server.
+  info_ru: |
+    Максимальное число параллельных операций записи при сбросе буферов на сервер.
--- a/docs/config/src/included.en.md
+++ b/docs/config/src/included.en.md
@@ -28,6 +28,8 @@

 {{../../config/network.en.md|indent=2}}

+{{../../config/client.en.md|indent=2}}
+
 {{../../config/layout-cluster.en.md|indent=2}}

 {{../../config/layout-osd.en.md|indent=2}}
--- a/docs/config/src/included.ru.md
+++ b/docs/config/src/included.ru.md
@@ -28,6 +28,8 @@

 {{../../config/network.ru.md|indent=2}}

+{{../../config/client.ru.md|indent=2}}
+
 {{../../config/layout-cluster.ru.md|indent=2}}

 {{../../config/layout-osd.ru.md|indent=2}}
--- a/docs/config/src/layout-cluster.yml
+++ b/docs/config/src/layout-cluster.yml
@@ -87,8 +87,9 @@
    it (they have internal SSD cache even though it's not stated in datasheets).

    Setting this parameter to "all" or "small" in OSD parameters requires enabling
-    disable_journal_fsync and disable_meta_fsync, setting it to "all" also requires
-    enabling disable_data_fsync.
+    [disable_journal_fsync](layout-osd.en.md#disable_journal_fsync) and
+    [disable_meta_fsync](layout-osd.en.md#disable_meta_fsync), setting it to
+    "all" also requires enabling [disable_data_fsync](layout-osd.en.md#disable_data_fsync).

    TLDR: For optimal performance, set immediate_commit to "all" if you only use
    SSDs with supercapacitor-based power loss protection (nonvolatile
@@ -140,8 +141,9 @@
    указано в спецификациях).

    Указание "all" или "small" в настройках / командной строке OSD требует
-    включения disable_journal_fsync и disable_meta_fsync, значение "all" также
-    требует включения disable_data_fsync.
+    включения [disable_journal_fsync](layout-osd.ru.md#disable_journal_fsync) и
+    [disable_meta_fsync](layout-osd.ru.md#disable_meta_fsync), значение "all"
+    также требует включения [disable_data_fsync](layout-osd.ru.md#disable_data_fsync).

    Итого, вкратце: для оптимальной производительности установите
    immediate_commit в значение "all", если вы используете в кластере только SSD
--- a/docs/config/src/layout-osd.yml
+++ b/docs/config/src/layout-osd.yml
@@ -228,24 +228,25 @@

    Checksums increase metadata size by 4 bytes per each csum_block_size of data.

-    Checksums are always a compromise:
+    Checksums are always a tradeoff:
    1. You either sacrifice +1 GB RAM per 1 TB of data
    2. Or you raise csum_block_size, for example, to 32k and sacrifice
       50% random write iops due to checksum read-modify-write
    3. Or you turn off [inmemory_metadata](osd.en.md#inmemory_metadata) and
       sacrifice 50% random read iops due to checksum reads

-    Option 1 (default) is recommended for all-flash setups because these usually
-    have enough RAM.
+    All-flash clusters usually have enough RAM to use default csum_block_size,
+    which uses 1 GB RAM per 1 TB of data. HDD clusters usually don't.

-    Option 2 is recommended for HDD-only setups. HDD-only setups usually do NOT
-    have enough RAM for the default 4 KB csum_block_size.
+    Thus, recommended setups are:
+    1. All-flash, 1 GB RAM per 1 TB data: default (csum_block_size=4k)
+    2. All-flash, less RAM: csum_block_size=4k + inmemory_metadata=false
+    3. Hybrid HDD+SSD: csum_block_size=4k + inmemory_metadata=false
+    4. HDD-only, faster random read: csum_block_size=32k
+    5. HDD-only, faster random write: csum_block_size=4k +
+       inmemory_metadata=false + meta_io=cached

-    Option 3 is recommended for SSD+HDD setups (because metadata SSDs will handle
-    extra reads without any performance drop) and also *maybe* for NVMe all-flash
-    setups when you don't have enough RAM (because NVMe drives have plenty
-    of read iops to spare). You may also consider enabling
-    [cached_read_meta](osd.en.md#cached_read_meta) in this case.
+    See also [meta_io](osd.en.md#meta_io).
  info_ru: |
    Размер блока расчёта контрольных сумм.

@@ -264,17 +265,12 @@
       жертвуете 50% скорости случайного чтения из-за чтения контрольных сумм
       с диска

-    Вариант 1 (при настройках по умолчанию) рекомендуется для SSD (All-Flash)
-    кластеров, потому что памяти в них обычно хватает.
+    Таким образом, рекомендуются следующие варианты настроек:
+    1. All-flash, 1 ГБ памяти на 1 ТБ данных: по умолчанию (csum_block_size=4k)
+    2. All-flash, меньше памяти: csum_block_size=4k + inmemory_metadata=false
+    3. Гибридные HDD+SSD: csum_block_size=4k + inmemory_metadata=false
+    4. Только HDD, быстрее случайное чтение: csum_block_size=32k
+    5. Только HDD, быстрее случайная запись: csum_block_size=4k +
+       inmemory_metadata=false + meta_io=cached

-    Вариант 2 рекомендуется для кластеров на одних жёстких дисках (без SSD
-    под метаданные). На 4 кб блок контрольной суммы памяти в таких кластерах
-    обычно НЕ хватает.
-
-    Вариант 3 рекомендуется для гибридных кластеров (SSD+HDD), потому что
-    скорости SSD под метаданными хватит, чтобы обработать дополнительные чтения
-    без снижения производительности. Также вариант 3 *может* рекомендоваться
-    для All-Flash кластеров на основе NVMe-дисков, когда памяти НЕ достаточно,
-    потому что NVMe-диски имеют огромный запас производительности по чтению.
-    В таких случаях, возможно, также имеет смысл включать параметр
-    [cached_read_meta](osd.ru.md#cached_read_meta).
+    Смотрите также [meta_io](osd.ru.md#meta_io).
--- a/docs/config/src/network.yml
+++ b/docs/config/src/network.yml
@@ -48,11 +48,14 @@
  type: string
  info: |
    RDMA device name to use for Vitastor OSD communications (for example,
-    "rocep5s0f0"). Please note that Vitastor RDMA requires Implicit On-Demand
-    Paging (Implicit ODP) and Scatter/Gather (SG) support from the RDMA device
-    to work. For example, Mellanox ConnectX-3 and older adapters don't have
-    Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
-    root to list available RDMA devices and their features.
+    "rocep5s0f0"). Now Vitastor supports all adapters, even ones without
+    ODP support, like Mellanox ConnectX-3 and non-Mellanox cards.
+
+    Versions up to Vitastor 1.2.0 required ODP which is only present in
+    Mellanox ConnectX >= 4. See also [rdma_odp](#rdma_odp).
+
+    Run `ibv_devinfo -v` as root to list available RDMA devices and their
+    features.

    Remember that you also have to configure your network switches if you use
    RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
@@ -61,12 +64,15 @@
    PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
  info_ru: |
    Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
-    Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
-    Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Например,
-    адаптеры Mellanox ConnectX-3 и более старые не поддерживают Implicit ODP и
-    потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
-    суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
-    параметры и возможности.
+    Сейчас Vitastor поддерживает все модели адаптеров, включая те, у которых
+    нет поддержки ODP, то есть вы можете использовать RDMA с ConnectX-3 и
+    картами производства не Mellanox.
+
+    Версии Vitastor до 1.2.0 включительно требовали ODP, который есть только
+    на Mellanox ConnectX 4 и более новых. См. также [rdma_odp](#rdma_odp).
+
+    Запустите `ibv_devinfo -v` от имени суперпользователя, чтобы посмотреть
+    список доступных RDMA-устройств, их параметры и возможности.

    Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
    правильно настроить для него коммутаторы, иначе вы можете столкнуться с
@@ -160,6 +166,45 @@
    у принимающей стороны в процессе работы не заканчивались буферы на приём.
    Не влияет на потребление памяти - дополнительная память на операции отправки
    не выделяется.
+- name: rdma_odp
+  type: bool
+  default: false
+  online: false
+  info: |
+    Use RDMA with On-Demand Paging. ODP is currently only available on Mellanox
+    ConnectX-4 and newer adapters. ODP removes the need to explicitly register
+    memory with the RDMA adapter. This, in turn, makes it possible to skip memory
+    copying during sending. One would think this should improve performance, but
+    **in reality** RDMA performance with ODP is **drastically** worse. Example
+    3-node cluster with 8 NVMe in each node and 2*25 GBit/s ConnectX-6 RDMA network
+    without ODP pushes 3950000 read iops, but only 239000 iops with ODP...
+
+    This happens because Mellanox ODP implementation seems to be based on
+    message retransmissions when the adapter doesn't know about the buffer yet -
+    it likely uses standard "RNR retransmissions" (RNR = receiver not ready)
+    which is generally slow in RDMA/RoCE networks. Here's a presentation about
+    it from ISPASS-2021 conference: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
+
+    ODP support is retained in the code just in case a good ODP implementation
+    appears one day.
+  info_ru: |
+    Использовать RDMA с On-Demand Paging. ODP - функция, доступная пока что
+    исключительно на адаптерах Mellanox ConnectX-4 и более новых. ODP позволяет
+    не регистрировать память для её использования RDMA-картой. Благодаря этому
+    можно не копировать данные при отправке их в сеть и, казалось бы, это должно
+    улучшать производительность - но **по факту** получается так, что
+    производительность только ухудшается, причём сильно. Пример - на 3-узловом
+    кластере с 8 NVMe в каждом узле и сетью 2*25 Гбит/с на чтение с RDMA без ODP
+    удаётся снять 3950000 iops, а с ODP - всего 239000 iops...
+
+    Это происходит из-за того, что реализация ODP у Mellanox неоптимальная и
+    основана на повторной передаче сообщений, когда карте не известен буфер -
+    вероятно, на стандартных "RNR retransmission" (RNR = receiver not ready).
+    А данные повторные передачи в RDMA/RoCE - всегда очень медленная штука.
+    Презентация на эту тему с конференции ISPASS-2021: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
+
+    Возможность использования ODP сохранена в коде на случай, если вдруг в один
+    прекрасный день появится хорошая реализация ODP.
 - name: peer_connect_interval
  type: sec
  min: 1
@@ -259,23 +304,3 @@
    detect disconnections quickly.
  info_ru: |
    Интервал проверки живости вебсокет-подключений к etcd.
- name: client_dirty_limit
-  type: int
-  default: 33554432
-  online: true
-  info: |
-    Without immediate_commit=all this parameter sets the limit of "dirty"
-    (not committed by fsync) data allowed by the client before forcing an
-    additional fsync and committing the data. Also note that the client always
-    holds a copy of uncommitted data in memory so this setting also affects
-    RAM usage of clients.
-
-    This parameter doesn't affect OSDs themselves.
-  info_ru: |
-    При работе без immediate_commit=all - это лимит объёма "грязных" (не
-    зафиксированных fsync-ом) данных, при достижении которого клиент будет
-    принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
-    что в этом случае до момента fsync клиент хранит копию незафиксированных
-    данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
-
-    Параметр не влияет на сами OSD.
--- a/docs/config/src/osd.yml
+++ b/docs/config/src/osd.yml
@@ -2,15 +2,28 @@
  type: sec
  default: 5
  info: |
-    Interval at which OSDs report their state to etcd. Affects OSD lease time
+    Interval at which OSDs report their liveness to etcd. Affects OSD lease time
    and thus the failover speed. Lease time is equal to this parameter value
    plus max_etcd_attempts * etcd_quick_timeout because it should be guaranteed
    that every OSD always refreshes its lease in time.
  info_ru: |
-    Интервал, с которым OSD обновляет своё состояние в etcd. Значение параметра
-    влияет на время резервации (lease) OSD и поэтому на скорость переключения
+    Интервал, с которым OSD сообщает о том, что жив, в etcd. Значение параметра
+    влияет на время резервации (lease) OSD и поэтому - на скорость переключения
    при падении OSD. Время lease равняется значению этого параметра плюс
    max_etcd_attempts * etcd_quick_timeout.
+- name: etcd_stats_interval
+  type: sec
+  default: 30
+  info: |
+    Interval at which OSDs report their statistics to etcd. Highly affects the
+    imposed load on etcd, because statistics include a key for every OSD and
+    for every PG. At the same time, low statistic intervals make `vitastor-cli`
+    statistics more responsive.
+  info_ru: |
+    Интервал, с которым OSD обновляет свою статистику в etcd. Сильно влияет на
+    создаваемую нагрузку на etcd, потому что статистика содержит по ключу на
+    каждый OSD и на каждую PG. В то же время низкий интервал делает
+    статистику, печатаемую `vitastor-cli`, отзывчивей.
 - name: run_primary
  type: bool
  default: true
@@ -260,70 +273,96 @@
    достаточно 16- или 32-мегабайтного журнала. Однако в теории отключение
    параметра может оказаться полезным для гибридных OSD (HDD+SSD) с большими
    журналами, расположенными на быстром по сравнению с HDD устройстве.
- name: cached_read_data
-  type: bool
-  default: false
+- name: data_io
+  type: string
+  default: direct
  info: |
-    Read data through Linux page cache, i.e. use a file descriptor opened without
-    O_DIRECT for data reads. May improve read performance for frequently accessed
-    data if it fits in RAM. Memory in page cache is shared by all processes and
-    not accounted in OSD memory consumption.
-  info_ru: |
-    Читать данные через системный кэш Linux (page cache), то есть, использовать
-    для чтения данных файловый дескриптор, открытый без флага O_DIRECT. Может
-    улучшить производительность чтения для часто используемых данных, если они
-    помещаются в память. Память кэша разделяется между всеми процессами в
-    системе и не учитывается в потреблении памяти процессом OSD.
- name: cached_read_meta
-  type: bool
-  default: false
-  info: |
-    Read metadata through Linux page cache. May be beneficial when checksums
-    are enabled and [inmemory_metadata](#inmemory_metadata) is disabled, because
-    in this case metadata blocks are read from disk to verify checksums on every
-    read request and caching them may reduce this extra read load.
+    I/O mode for *data*. One of "direct", "cached" or "directsync". Corresponds
+    to O_DIRECT, O_SYNC and O_DIRECT|O_SYNC, respectively.

-    Absolutely pointless to enable with enabled inmemory_metadata because all
-    metadata is kept in memory anyway, and likely pointless without checksums,
-    because in that case, metadata blocks are read from disk only during journal
+    Choose "cached" to use Linux page cache. This may improve read performance
+    for hot data and slower disks - HDDs and maybe SATA SSDs - but will slightly
+    decrease write performance for fast disks because page cache is an overhead
+    itself.
+
+    Choose "directsync" to use [immediate_commit](layout-cluster.en.md#immediate_commit)
+    (which requires disable_data_fsync) with drives having write-back cache
+    which can't be turned off, for example, Intel Optane. Also note that *some*
+    desktop SSDs (for example, HP EX950) may ignore O_SYNC thus making
+    disable_data_fsync unsafe even with "directsync".
+  info_ru: |
+    Режим ввода-вывода для *данных*. Одно из значений "direct", "cached" или
+    "directsync", означающих O_DIRECT, O_SYNC и O_DIRECT|O_SYNC, соответственно.
+
+    Выберите "cached", чтобы использовать системный кэш Linux (page cache) при
+    чтении и записи. Это может улучшить скорость чтения горячих данных с
+    относительно медленных дисков - HDD и, возможно, SATA SSD - но немного
+    снижает производительность записи для быстрых дисков, так как кэш сам по
+    себе тоже добавляет накладные расходы.
+
+    Выберите "directsync", если хотите задействовать
+    [immediate_commit](layout-cluster.ru.md#immediate_commit) (требующий
+    включения disable_data_fsync) на дисках с неотключаемым кэшем. Пример таких
+    дисков - Intel Optane. При этом также стоит иметь в виду, что *некоторые*
+    настольные SSD (например, HP EX950) игнорируют флаг O_SYNC, делая отключение
+    fsync небезопасным даже с режимом "directsync".
+- name: meta_io
+  type: string
+  default: direct
+  info: |
+    I/O mode for *metadata*. One of "direct", "cached" or "directsync".
+
+    "cached" may improve read performance, but only under the following conditions:
+    1. your drives are relatively slow (HDD, SATA SSD), and
+    2. checksums are enabled, and
+    3. [inmemory_metadata](#inmemory_metadata) is disabled.
+    Under all these conditions, metadata blocks are read from disk on every
+    read request to verify checksums and caching them may reduce this extra
+    read load. Without (3) metadata is never read from the disk after starting,
+    and without (2) metadata blocks are read from disk only during journal
    flushing.

-    If the same device is used for data and metadata, enabling [cached_read_data](#cached_read_data)
-    also enables this parameter, given that it isn't turned off explicitly.
+    "directsync" has the same meaning as in [data_io](#data_io).
+
+    If the same device is used for data and metadata, meta_io by default is set
+    to the same value as [data_io](#data_io).
  info_ru: |
-    Читать метаданные через системный кэш Linux. Может быть полезно, когда
-    включены контрольные суммы, а параметр [inmemory_metadata](#inmemory_metadata)
-    отключён, так как в этом случае блоки метаданных читаются с диска при каждом
-    запросе чтения для проверки контрольных сумм и их кэширование может снизить
-    дополнительную нагрузку на диск.
+    Режим ввода-вывода для *метаданных*. Одно из значений "direct", "cached" или
+    "directsync".

-    Абсолютно бессмысленно включать данный параметр, если параметр
-    inmemory_metadata включён (по умолчанию это так), и также вероятно
-    бессмысленно включать его, если не включены контрольные суммы, так как в
-    этом случае блоки метаданных читаются с диска только во время сброса
-    журнала.
+    "cached" может улучшить скорость чтения, если:
+    1. у вас медленные диски (HDD, SATA SSD)
+    2. контрольные суммы включены
+    3. параметр [inmemory_metadata](#inmemory_metadata) отключён.
+    При этих условиях блоки метаданных читаются с диска при каждом запросе чтения
+    для проверки контрольных сумм и их кэширование может снизить дополнительную
+    нагрузку на диск. Без (3) метаданные никогда не читаются с диска после
+    запуска OSD, а без (2) блоки метаданных читаются только при сбросе журнала.

-    Если одно и то же устройство используется для данных и метаданных, включение
-    [cached_read_data](#cached_read_data) также включает данный параметр, при
-    условии, что он не отключён явным образом.
- name: cached_read_journal
-  type: bool
-  default: false
+    Если одно и то же устройство используется для данных и метаданных, режим
+    ввода-вывода метаданных по умолчанию устанавливается равным [data_io](#data_io).
+- name: journal_io
+  type: string
+  default: direct
  info: |
-    Read buffered data from journal through Linux page cache. Does not have sense
-    without disabling [inmemory_journal](#inmemory_journal), which, again, is
-    enabled by default.
+    I/O mode for *journal*. One of "direct", "cached" or "directsync".

-    If the same device is used for metadata and journal, enabling [cached_read_meta](#cached_read_meta)
-    also enables this parameter, given that it isn't turned off explicitly.
+    Here, "cached" may only improve read performance for recent writes and
+    only if [inmemory_journal](#inmemory_journal) is turned off.
+
+    If the same device is used for metadata and journal, journal_io by default
+    is set to the same value as [meta_io](#meta_io).
  info_ru: |
-    Читать буферизованные в журнале данные через системный кэш Linux. Не имеет
-    смысла без отключения параметра [inmemory_journal](#inmemory_journal),
-    который, опять же, по умолчанию включён.
+    Режим ввода-вывода для *журнала*. Одно из значений "direct", "cached" или
+    "directsync".
+
+    Здесь "cached" может улучшить скорость чтения только недавно записанных
+    данных и только если параметр [inmemory_journal](#inmemory_journal)
+    отключён.

    Если одно и то же устройство используется для метаданных и журнала,
-    включение [cached_read_meta](#cached_read_meta) также включает данный
-    параметр, при условии, что он не отключён явным образом.
+    режим ввода-вывода журнала по умолчанию устанавливается равным
+    [meta_io](#meta_io).
 - name: journal_sector_buffer_count
  type: int
  default: 32
--- a/docs/installation/kubernetes.en.md
+++ b/docs/installation/kubernetes.en.md
@@ -17,4 +17,15 @@ and apply all `NNN-*.yaml` manifests to your Kubernetes installation:
 for i in ./???-*.yaml; do kubectl apply -f $i; done
 ```

-After that you'll be able to create PersistentVolumes. See example in [csi/deploy/example-pvc.yaml](../../csi/deploy/example-pvc.yaml).
+After that you'll be able to create PersistentVolumes.
+
+## Features
+
+Vitastor CSI supports:
+- Kubernetes starting with 1.20 (or 1.17 for older vitastor-csi <= 1.1.0)
+- Filesystem RWO (ReadWriteOnce) volumes. Example: [PVC](../../csi/deploy/example-pvc.yaml), [pod](../../csi/deploy/example-test-pod.yaml)
+- Raw block RWX (ReadWriteMany) volumes. Example: [PVC](../../csi/deploy/example-pvc-block.yaml), [pod](../../csi/deploy/example-test-pod-block.yaml)
+- Volume expansion
+- Volume snapshots. Example: [snapshot class](../../csi/deploy/example-snapshot-class.yaml), [snapshot](../../csi/deploy/example-snapshot.yaml), [clone](../../csi/deploy/example-snapshot-clone.yaml)
+
+Remember that to use snapshots with CSI you also have to install [Snapshot Controller and CRDs](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).
--- a/docs/installation/kubernetes.ru.md
+++ b/docs/installation/kubernetes.ru.md
@@ -17,4 +17,15 @@
 for i in ./???-*.yaml; do kubectl apply -f $i; done
 ```

-После этого вы сможете создавать PersistentVolume. Пример смотрите в файле [csi/deploy/example-pvc.yaml](../../csi/deploy/example-pvc.yaml).
+После этого вы сможете создавать PersistentVolume.
+
+## Возможности
+
+CSI-плагин Vitastor поддерживает:
+- Версии Kubernetes, начиная с 1.20 (или с 1.17 для более старых vitastor-csi <= 1.1.0)
+- Файловые RWO (ReadWriteOnce) тома. Пример: [PVC](../../csi/deploy/example-pvc.yaml), [под](../../csi/deploy/example-test-pod.yaml)
+- Сырые блочные RWX (ReadWriteMany) тома. Пример: [PVC](../../csi/deploy/example-pvc-block.yaml), [под](../../csi/deploy/example-test-pod-block.yaml)
+- Расширение размера томов
+- Снимки томов. Пример: [класс снимков](../../csi/deploy/example-snapshot-class.yaml), [снимок](../../csi/deploy/example-snapshot.yaml), [клон снимка](../../csi/deploy/example-snapshot-clone.yaml)
+
+Не забывайте, что для использования снимков нужно сначала установить [контроллер снимков и CRD](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).
--- a/docs/installation/packages.en.md
+++ b/docs/installation/packages.en.md
@@ -14,6 +14,8 @@
  - Debian 12 (Bookworm/Sid): `deb https://vitastor.io/debian bookworm main`
  - Debian 11 (Bullseye): `deb https://vitastor.io/debian bullseye main`
  - Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
+  - Add `-oldstable` to bookworm/bullseye/buster in this line to install the last
+    stable version from 0.9.x branch instead of 1.x
 - For Debian 10 (Buster) also enable backports repository:
  `deb http://deb.debian.org/debian buster-backports main`
 - Install packages: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu`
--- a/docs/installation/packages.ru.md
+++ b/docs/installation/packages.ru.md
@@ -14,6 +14,8 @@
  - Debian 12 (Bookworm/Sid): `deb https://vitastor.io/debian bookworm main`
  - Debian 11 (Bullseye): `deb https://vitastor.io/debian bullseye main`
  - Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
+  - Добавьте `-oldstable` к слову bookworm/bullseye/buster в этой строке, чтобы
+    установить последнюю стабильную версию из ветки 0.9.x вместо 1.x
 - Для Debian 10 (Buster) также включите репозиторий backports:
  `deb http://deb.debian.org/debian buster-backports main`
 - Установите пакеты: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu`
--- a/docs/installation/source.en.md
+++ b/docs/installation/source.en.md
@@ -21,7 +21,7 @@

 ## Basic instructions

-Download source, for example using git: `git clone --recurse-submodules https://yourcmc.ru/git/vitalif/vitastor/`
+Download source, for example using git: `git clone --recurse-submodules https://git.yourcmc.ru/vitalif/vitastor/`

 Get `fio` source and symlink it into `<vitastor>/fio`. If you don't want to build fio engine,
 you can disable it by passing `-DWITH_FIO=no` to cmake.
@@ -41,7 +41,7 @@ It's recommended to build the QEMU driver (qemu_driver.c) in-tree, as a part of
 QEMU build process. To do that:
 - Install vitastor client library headers (from source or from vitastor-client-dev package)
 - Take a corresponding patch from `patches/qemu-*-vitastor.patch` and apply it to QEMU source
- Copy `src/qemu_driver.c` to QEMU source directory as `block/block-vitastor.c`
+- Copy `src/qemu_driver.c` to QEMU source directory as `block/vitastor.c`
 - Build QEMU as usual

 But it is also possible to build it out-of-tree. To do that:
--- a/docs/installation/source.ru.md
+++ b/docs/installation/source.ru.md
@@ -21,7 +21,7 @@

 ## Базовая инструкция

-Скачайте исходные коды, например, из git: `git clone --recurse-submodules https://yourcmc.ru/git/vitalif/vitastor/`
+Скачайте исходные коды, например, из git: `git clone --recurse-submodules https://git.yourcmc.ru/vitalif/vitastor/`

 Скачайте исходные коды пакета `fio`, распакуйте их и создайте символическую ссылку на них
 в директории исходников Vitastor: `<vitastor>/fio`. Либо, если вы не хотите собирать плагин fio,
@@ -41,7 +41,7 @@ cmake .. && make -j8 install
 Драйвер QEMU (qemu_driver.c) рекомендуется собирать вместе с самим QEMU. Для этого:
 - Установите заголовки клиентской библиотеки Vitastor (из исходников или из пакета vitastor-client-dev)
 - Возьмите соответствующий патч из `patches/qemu-*-vitastor.patch` и примените его к исходникам QEMU
- Скопируйте [src/qemu_driver.c](../../src/qemu_driver.c) в директорию исходников QEMU как `block/block-vitastor.c`
+- Скопируйте [src/qemu_driver.c](../../src/qemu_driver.c) в директорию исходников QEMU как `block/vitastor.c`
 - Соберите QEMU как обычно

 Однако в целях отладки драйвер также можно собирать отдельно от QEMU. Для этого:
@@ -60,7 +60,7 @@ cmake .. && make -j8 install
      * Для QEMU 2.0+: `<qemu>/qapi-types.h` &rarr; `<vitastor>/qemu/b/qemu/qapi-types.h`
   - `config-host.h` и `qapi` нужны, т.к. в них содержатся автогенерируемые заголовки
 - Сконфигурируйте cmake Vitastor с `WITH_QEMU=yes` (`cmake .. -DWITH_QEMU=yes`) и, если вы
-  используете RHEL-подобый дистрибутив, также с `QEMU_PLUGINDIR=qemu-kvm`.
+  используете RHEL-подобный дистрибутив, также с `QEMU_PLUGINDIR=qemu-kvm`.
 - После этого в процессе сборки Vitastor также будет собираться подходящий для вашей
  версии QEMU `block-vitastor.so`.
 - Таким образом можно использовать драйвер даже с немодифицированным QEMU, но в этом случае
--- a/docs/intro/features.en.md
+++ b/docs/intro/features.en.md
@@ -29,8 +29,9 @@
 - Snapshots and copy-on-write image clones
 - [Write throttling to smooth random write workloads in SSD+HDD configurations](../config/osd.en.md#throttle_small_writes)
 - [RDMA/RoCEv2 support via libibverbs](../config/network.en.md#rdma_device)
- [Scrubbing without checksums](../config/osd.en.md#auto_scrub) (verification of copies)
+- [Scrubbing](../config/osd.en.md#auto_scrub) (verification of copies)
 - [Checksums](../config/layout-osd.en.md#data_csum_type)
+- [Client write-back cache](../config/client.en.md#client_enable_writeback)

 ## Plugins and tools

@@ -50,13 +51,15 @@

 The following features are planned for the future:

+- File system
+- Control plane optimisation
 - Other administrative tools
 - Web GUI
 - OpenNebula plugin
- iSCSI proxy
+- iSCSI and NVMeoF gateways
 - Multi-threaded client
 - Faster failover
+- S3
 - Tiered storage (SSD caching)
 - NVDIMM support
 - Compression (possibly)
- Read caching using system page cache (possibly)
--- a/docs/intro/features.ru.md
+++ b/docs/intro/features.ru.md
@@ -31,8 +31,9 @@
 - Снапшоты и copy-on-write клоны
 - [Сглаживание производительности случайной записи в SSD+HDD конфигурациях](../config/osd.ru.md#throttle_small_writes)
 - [Поддержка RDMA/RoCEv2 через libibverbs](../config/network.ru.md#rdma_device)
- [Фоновая проверка целостности без контрольных сумм](../config/osd.ru.md#auto_scrub) (сверка копий)
+- [Фоновая проверка целостности](../config/osd.ru.md#auto_scrub) (сверка копий)
 - [Контрольные суммы](../config/layout-osd.ru.md#data_csum_type)
+- [Буферизация записи на стороне клиента](../config/client.ru.md#client_enable_writeback)

 ## Драйверы и инструменты

@@ -50,12 +51,15 @@

 ## Планы развития

+- Файловая система
+- Оптимизация слоя управления
 - Другие инструменты администрирования
 - Web-интерфейс
 - Плагин для OpenNebula
- iSCSI-прокси
+- iSCSI и NVMeoF прокси
 - Многопоточный клиент
 - Более быстрое переключение при отказах
+- S3
 - Поддержка SSD-кэширования (tiered storage)
 - Поддержка NVDIMM
 - Возможно, сжатие
--- a/docs/usage/disk.en.md
+++ b/docs/usage/disk.en.md
@@ -102,7 +102,7 @@ checks the device cache status on start and tries to disable cache for SATA/SAS
 If it doesn't succeed it issues a warning in the system log.

 You can also pass other OSD options here as arguments and they'll be persisted
-in the superblock: cached_read_data, cached_read_meta, cached_read_journal,
+in the superblock: data_io, meta_io, journal_io,
 inmemory_metadata, inmemory_journal, max_write_iodepth,
 min_flusher_count, max_flusher_count, journal_sector_buffer_count,
 journal_no_same_sector_overwrites, throttle_small_writes, throttle_target_iops,
--- a/docs/usage/disk.ru.md
+++ b/docs/usage/disk.ru.md
@@ -103,8 +103,8 @@ vitastor-disk - инструмент командной строки для уп
 это не удаётся, в системный журнал выводится предупреждение.

 Вы можете передать данной команде и некоторые другие опции OSD в качестве аргументов
-и они тоже будут сохранены в суперблок: cached_read_data, cached_read_meta,
-cached_read_journal, inmemory_metadata, inmemory_journal, max_write_iodepth,
+и они тоже будут сохранены в суперблок: data_io, meta_io, journal_io,
+inmemory_metadata, inmemory_journal, max_write_iodepth,
 min_flusher_count, max_flusher_count, journal_sector_buffer_count,
 journal_no_same_sector_overwrites, throttle_small_writes, throttle_target_iops,
 throttle_target_mbs, throttle_target_parallelism, throttle_threshold_us.
--- a/docs/usage/qemu.en.md
+++ b/docs/usage/qemu.en.md
@@ -34,6 +34,20 @@ qemu-system-x86_64 -enable-kvm -m 1024 \
    -vnc 0.0.0.0:0
 ```

+With a separate I/O thread:
+
+```
+qemu-system-x86_64 -enable-kvm -m 1024 \
+    -object iothread,id=vitastor1 \
+    -blockdev '{"node-name":"drive-virtio-disk0","driver":"vitastor","image":"debian9",
+        "cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
+    -device 'virtio-blk-pci,iothread=vitastor1,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,
+        id=virtio-disk0,bootindex=1,write-cache=off' \
+    -vnc 0.0.0.0:0
+```
+
+You can also specify inode ID, pool and size manually instead of the `:image=<IMAGE>` option: `:pool=<POOL>:inode=<INODE>:size=<SIZE>`.
+
 ## qemu-img

 For qemu-img, you should use `vitastor:etcd_host=<HOST>:image=<IMAGE>` as filename.
@@ -84,37 +98,88 @@ This can be used for backups. Just note that exporting an image that is currentl
 is of course unsafe and doesn't produce a consistent result, so only export snapshots if you do this
 on a live VM.

+## vhost-user-blk
+
+QEMU, starting with 6.0, includes support for attaching disks via a separate
+userspace worker process, called `vhost-user-blk`. It usually has slightly (20-30 us)
+lower latency.
+
+Example commands to use it with Vitastor:
+
+```
+qemu-storage-daemon \
+    --daemonize \
+    --blockdev '{"node-name":"drive-virtio-disk1","driver":"vitastor","image":"testosd1","cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
+    --export type=vhost-user-blk,id=vitastor1,node-name=drive-virtio-disk1,addr.type=unix,addr.path=/run/vitastor1-user-blk.sock,writable=on,num-queues=1
+
+qemu-system-x86_64 -enable-kvm -m 2048 -M accel=kvm,memory-backend=mem \
+    -object memory-backend-memfd,id=mem,size=2G,share=on \
+    -chardev socket,id=vitastor1,reconnect=1,path=/run/vitastor1-user-blk.sock \
+    -device vhost-user-blk-pci,chardev=vitastor1,num-queues=1,config-wce=off \
+    -vnc 0.0.0.0:0
+```
+
+The memfd memory-backend is crucial: vhost-user-blk does not work without it.
+
 ## VDUSE

 Linux kernel, starting with version 5.15, supports a new interface for attaching virtual disks
 to the host - VDUSE (vDPA Device in Userspace). QEMU, starting with 7.2, has support for
 exporting QEMU block devices over this protocol using qemu-storage-daemon.

-VDUSE has the same problem as other FUSE-like interfaces in Linux: if a userspace process hangs,
-for example, if it loses connectivity with Vitastor cluster - active processes doing I/O may
-hang in the D state (uninterruptible sleep) and you won't be able to kill them even with kill -9.
-In this case reboot will be the only way to remove VDUSE devices from system.
+VDUSE is currently the best interface to attach Vitastor disks as kernel devices because:
+- It avoids data copies and thus achieves much better performance than [NBD](nbd.en.md)
+- It doesn't have the NBD timeout problem - the device doesn't die if an operation executes for too long
+- It doesn't have the hung device problem - if the userspace process dies, it can be restarted (!)
+  and the block device will continue operation
+- It doesn't seem to have a device number limit

-On the other hand, VDUSE is faster than [NBD](nbd.en.md), so you may prefer to use it if
-performance is important for you. Approximate performance numbers:
-direct fio benchmark - 115000 iops, NBD - 60000 iops, VDUSE - 90000 iops.
+Example performance comparison:
+
+|                      | direct fio  | NBD         | VDUSE       |
+|----------------------|-------------|-------------|-------------|
+| linear write         | 3.85 GB/s   | 1.12 GB/s   | 3.85 GB/s   |
+| 4k random write Q128 | 240000 iops | 120000 iops | 178000 iops |
+| 4k random write Q1   | 9500 iops   | 7620 iops   | 7640 iops   |
+| linear read          | 4.3 GB/s    | 1.8 GB/s    | 2.85 GB/s   |
+| 4k random read Q128  | 287000 iops | 140000 iops | 189000 iops |
+| 4k random read Q1    | 9600 iops   | 7640 iops   | 7780 iops   |

 To try VDUSE you need at least Linux 5.15, built with VDUSE support
-(CONFIG_VIRTIO_VDPA=m and CONFIG_VDPA_USER=m). Debian Linux kernels have these options
-disabled by now, so if you want to try it on Debian, use a kernel from Ubuntu
-[kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/) or Proxmox.
+(CONFIG_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).
+
+Debian Linux kernels have these options disabled by now, so if you want to try it on Debian,
+use a kernel from Ubuntu [kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/), Proxmox,
+or build modules for Debian kernel manually:
+
+```
+mkdir build
+cd build
+apt-get install linux-headers-`uname -r`
+apt-get build-dep linux-image-`uname -r`-unsigned
+apt-get source linux-image-`uname -r`-unsigned
+cd linux*/drivers/vdpa
+make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
+cat Module.symvers >> /lib/modules/`uname -r`/build/Module.symvers
+cd ../virtio
+make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
+depmod -a
+```
+
+You also need `vdpa` tool from the `iproute2` package.

 Commands to attach Vitastor image as a VDUSE device:

 ```
-modprobe vduse virtio-vdpa
+modprobe vduse
+modprobe virtio-vdpa
 qemu-storage-daemon --daemonize --blockdev '{"node-name":"test1","driver":"vitastor",\
  "etcd-host":"192.168.7.2:2379/v3","image":"testosd1","cache":{"direct":true,"no-flush":false},"discard":"unmap"}' \
  --export vduse-blk,id=test1,node-name=test1,name=test1,num-queues=16,queue-size=128,writable=true
 vdpa dev add name test1 mgmtdev vduse
 ```

-After running these commands /dev/vda device will appear in the system and you'll be able to
+After running these commands, `/dev/vda` device will appear in the system and you'll be able to
 use it as a normal disk.

 To remove the device:
--- a/docs/usage/qemu.ru.md
+++ b/docs/usage/qemu.ru.md
@@ -36,6 +36,18 @@ qemu-system-x86_64 -enable-kvm -m 1024 \
    -vnc 0.0.0.0:0
 ```

+С отдельным потоком ввода-вывода:
+
+```
+qemu-system-x86_64 -enable-kvm -m 1024 \
+    -object iothread,id=vitastor1 \
+    -blockdev '{"node-name":"drive-virtio-disk0","driver":"vitastor","image":"debian9",
+        "cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
+    -device 'virtio-blk-pci,iothread=vitastor1,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,
+        id=virtio-disk0,bootindex=1,write-cache=off' \
+    -vnc 0.0.0.0:0
+```
+
 Вместо `:image=<IMAGE>` также можно указывать номер инода, пул и размер: `:pool=<POOL>:inode=<INODE>:size=<SIZE>`.

 ## qemu-img
@@ -88,37 +100,89 @@ qemu-img rebase -u -b '' testimg.qcow2
 в то же время идёт запись, небезопасно - результат чтения не будет целостным. Так что если вы работаете
 с активными виртуальными машинами, экспортируйте только их снимки, но не сам образ.

+## vhost-user-blk
+
+QEMU, начиная с 6.0, позволяет подключать диски через отдельный рабочий процесс.
+Этот метод подключения называется `vhost-user-blk` и обычно имеет чуть меньшую
+задержку (ниже на 20-30 микросекунд, чем при обычном методе).
+
+Пример команд для использования vhost-user-blk с Vitastor:
+
+```
+qemu-storage-daemon \
+    --daemonize \
+    --blockdev '{"node-name":"drive-virtio-disk1","driver":"vitastor","image":"testosd1","cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
+    --export type=vhost-user-blk,id=vitastor1,node-name=drive-virtio-disk1,addr.type=unix,addr.path=/run/vitastor1-user-blk.sock,writable=on,num-queues=1
+
+qemu-system-x86_64 -enable-kvm -m 2048 -M accel=kvm,memory-backend=mem \
+    -object memory-backend-memfd,id=mem,size=2G,share=on \
+    -chardev socket,id=vitastor1,reconnect=1,path=/run/vitastor1-user-blk.sock \
+    -device vhost-user-blk-pci,chardev=vitastor1,num-queues=1,config-wce=off \
+    -vnc 0.0.0.0:0
+```
+
+Здесь критична опция memory-backend-memfd, vhost-user-blk без неё не работает.
+
 ## VDUSE

 В Linux, начиная с версии ядра 5.15, доступен новый интерфейс для подключения виртуальных дисков
 к системе - VDUSE (vDPA Device in Userspace), а в QEMU, начиная с версии 7.2, есть поддержка
 экспорта блочных устройств QEMU по этому протоколу через qemu-storage-daemon.

-VDUSE страдает общей проблемой FUSE-подобных интерфейсов в Linux: если пользовательский процесс
-подвиснет, например, если будет потеряна связь с кластером Vitastor - читающие/пишущие в кластер
-процессы могут "залипнуть" в состоянии D (непрерываемый сон) и их будет невозможно убить даже
-через kill -9. В этом случае удалить из системы устройство можно только перезагрузившись.
+VDUSE - на данный момент лучший интерфейс для подключения дисков Vitastor в виде блочных
+устройств на уровне ядра, ибо:
+- VDUSE не копирует данные и поэтому достигает значительно лучшей производительности, чем [NBD](nbd.ru.md)
+- Также оно не имеет проблемы NBD-таймаута - устройство не умирает, если операция выполняется слишком долго
+- Также оно не имеет проблемы подвисающих устройств - если процесс-обработчик умирает, его можно
+  перезапустить (!) и блочное устройство продолжит работать
+- По-видимому, у него нет предела числа подключаемых в систему устройств

-С другой стороны, VDUSE быстрее по сравнению с [NBD](nbd.ru.md), поэтому его может
-быть предпочтительно использовать там, где производительность важнее. Порядок показателей:
-прямое тестирование через fio - 115000 iops, NBD - 60000 iops, VDUSE - 90000 iops.
+Пример сравнения производительности:

-Чтобы использовать VDUSE, вам нужно ядро Linux версии хотя бы 5.15, собранное с поддержкой
-VDUSE (CONFIG_VIRTIO_VDPA=m и CONFIG_VDPA_USER=m). В ядрах в Debian Linux поддержка пока
-отключена - если хотите попробовать эту функцию на Debian, поставьте ядро из Ubuntu
-[kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/) или из Proxmox.
+|                          | Прямой fio  | NBD         | VDUSE       |
+|--------------------------|-------------|-------------|-------------|
+| линейная запись          | 3.85 GB/s   | 1.12 GB/s   | 3.85 GB/s   |
+| 4k случайная запись Q128 | 240000 iops | 120000 iops | 178000 iops |
+| 4k случайная запись Q1   | 9500 iops   | 7620 iops   | 7640 iops   |
+| линейное чтение          | 4.3 GB/s    | 1.8 GB/s    | 2.85 GB/s   |
+| 4k случайное чтение Q128 | 287000 iops | 140000 iops | 189000 iops |
+| 4k случайное чтение Q1   | 9600 iops   | 7640 iops   | 7780 iops   |
+
+Чтобы попробовать VDUSE, вам нужно ядро Linux как минимум версии 5.15, собранное с поддержкой
+VDUSE (CONFIG_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).
+
+В ядрах в Debian Linux поддержка пока отключена по умолчанию, так что чтобы попробовать VDUSE
+на Debian, поставьте ядро из Ubuntu [kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/),
+из Proxmox или соберите модули для ядра Debian вручную:
+
+```
+mkdir build
+cd build
+apt-get install linux-headers-`uname -r`
+apt-get build-dep linux-image-`uname -r`-unsigned
+apt-get source linux-image-`uname -r`-unsigned
+cd linux*/drivers/vdpa
+make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
+cat Module.symvers >> /lib/modules/`uname -r`/build/Module.symvers
+cd ../virtio
+make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
+depmod -a
+```
+
+Также вам понадобится консольная утилита `vdpa` из пакета `iproute2`.

 Команды для подключения виртуального диска через VDUSE:

 ```
-modprobe vduse virtio-vdpa
+modprobe vduse
+modprobe virtio-vdpa
 qemu-storage-daemon --daemonize --blockdev '{"node-name":"test1","driver":"vitastor",\
  "etcd-host":"192.168.7.2:2379/v3","image":"testosd1","cache":{"direct":true,"no-flush":false},"discard":"unmap"}' \
  --export vduse-blk,id=test1,node-name=test1,name=test1,num-queues=16,queue-size=128,writable=true
 vdpa dev add name test1 mgmtdev vduse
 ```

-После этого в системе появится устройство /dev/vda, которое можно будет использовать как
+После этого в системе появится устройство `/dev/vda`, которое можно будет использовать как
 обычный диск.

 Для удаления устройства из системы:
--- a/mon/90-vitastor.rules
+++ b/mon/90-vitastor.rules
@@ -3,5 +3,5 @@ SUBSYSTEM=="block", ENV{ID_PART_ENTRY_TYPE}=="e7009fac-a5a1-4d72-af72-53de130599
    IMPORT{program}="/usr/bin/vitastor-disk udev $devnode", \
    SYMLINK+="vitastor/$env{VITASTOR_ALIAS}"

-ENV{VITASTOR_OSD_NUM}!="", ACTION=="add", RUN{program}+="/usr/bin/systemctl enable --now vitastor-osd@$env{VITASTOR_OSD_NUM}"
-ENV{VITASTOR_OSD_NUM}!="", ACTION=="remove", RUN{program}+="/usr/bin/systemctl disable --now vitastor-osd@$env{VITASTOR_OSD_NUM}"
+ENV{VITASTOR_OSD_NUM}!="", ACTION=="add", RUN{program}+="/usr/bin/systemctl enable --now --no-block vitastor-osd@$env{VITASTOR_OSD_NUM}"
+ENV{VITASTOR_OSD_NUM}!="", ACTION=="remove", RUN{program}+="/usr/bin/systemctl disable --now --no-block vitastor-osd@$env{VITASTOR_OSD_NUM}"
--- a/mon/mon.js
+++ b/mon/mon.js
@@ -78,9 +78,15 @@ const etcd_tree = {
            disk_alignment: 4096,
            bitmap_granularity: 4096,
            immediate_commit: false, // 'all' or 'small'
+            // client - configurable online
+            client_max_dirty_bytes: 33554432,
+            client_max_dirty_ops: 1024,
+            client_enable_writeback: false,
+            client_max_buffered_bytes: 33554432,
+            client_max_buffered_ops: 1024,
+            client_max_writeback_iodepth: 256,
            // client and osd - configurable online
            log_level: 0,
-            client_dirty_limit: 33554432,
            peer_connect_interval: 5, // seconds. min: 1
            peer_connect_timeout: 5, // seconds. min: 1
            osd_idle_timeout: 5, // seconds. min: 1
@@ -93,6 +99,7 @@ const etcd_tree = {
            etcd_ws_keepalive_interval: 30, // seconds
            // osd
            etcd_report_interval: 5, // seconds
+            etcd_stats_interval: 30, // seconds
            run_primary: true,
            osd_network: null, // "192.168.7.0/24" or an array of masks
            bind_address: "0.0.0.0",
@@ -390,12 +397,13 @@ class Mon
        this.etcd_prefix = this.etcd_prefix.replace(/\/\/+/g, '/').replace(/^\/?(.*[^\/])\/?$/, '/$1');
        this.etcd_start_timeout = (config.etcd_start_timeout || 5) * 1000;
        this.state = JSON.parse(JSON.stringify(this.constructor.etcd_tree));
+        this.prev_stats = { osd_stats: {}, osd_diff: {} };
        this.signals_set = false;
-        this.stat_time = Date.now();
        this.ws = null;
        this.ws_alive = false;
        this.ws_keepalive_timer = null;
        this.on_stop_cb = () => this.on_stop(0).catch(console.error);
+        this.recheck_pgs_active = false;
    }

    parse_etcd_addresses(addrs)
@@ -539,10 +547,18 @@ class Mon
        {
            retries = 1;
        }
+        const tried = {};
        while (retries < 0 || retry < retries)
        {
            const cur_addr = this.pick_next_etcd();
            const base = 'ws'+cur_addr.substr(4);
+            let now = Date.now();
+            if (tried[base] && now-tried[base] < this.etcd_start_timeout)
+            {
+                await new Promise(ok => setTimeout(ok, this.etcd_start_timeout-(now-tried[base])));
+                now = Date.now();
+            }
+            tried[base] = now;
            const ok = await new Promise((ok, no) =>
            {
                const timer_id = setTimeout(() =>
@@ -677,8 +693,27 @@ class Mon
        });
    }

+    // Schedule save_last_clean() to run after a small timeout (1s) (to not spam etcd)
+    schedule_save_last_clean()
+    {
+        if (!this.save_last_clean_timer)
+        {
+            this.save_last_clean_timer = setTimeout(() =>
+            {
+                this.save_last_clean_timer = null;
+                this.save_last_clean().catch(this.die);
+            }, this.config.mon_change_timeout || 1000);
+        }
+    }
+
    async save_last_clean()
    {
+        if (this.save_last_clean_running)
+        {
+            this.schedule_save_last_clean();
+            return;
+        }
+        this.save_last_clean_running = true;
        // last_clean_pgs is used to avoid extra data move when observing a series of changes in the cluster
        const new_clean_pgs = { items: {} };
    next_pool:
@@ -715,6 +750,7 @@ class Mon
                value: b64(JSON.stringify(this.state.history.last_clean_pgs))
            } } ],
        }, this.etcd_start_timeout, 0);
+        this.save_last_clean_running = false;
    }

    get_mon_state()
@@ -1148,6 +1184,33 @@ class Mon
        }
    }

+    filter_osds_by_block_layout(flat_tree, block_size, bitmap_granularity, immediate_commit)
+    {
+        for (const host in flat_tree)
+        {
+            let found = 0;
+            for (const osd in flat_tree[host])
+            {
+                const osd_stat = this.state.osd.stats[osd];
+                if (osd_stat && (osd_stat.bs_block_size && osd_stat.bs_block_size != block_size ||
+                    osd_stat.bitmap_granularity && osd_stat.bitmap_granularity != bitmap_granularity ||
+                    osd_stat.immediate_commit == 'small' && immediate_commit == 'all' ||
+                    osd_stat.immediate_commit == 'none' && immediate_commit != 'none'))
+                {
+                    delete flat_tree[host][osd];
+                }
+                else
+                {
+                    found++;
+                }
+            }
+            if (!found)
+            {
+                delete flat_tree[host];
+            }
+        }
+    }
+
    get_affinity_osds(pool_cfg, up_osds, osd_tree)
    {
        let aff_osds = up_osds;
@@ -1161,6 +1224,12 @@ class Mon

    async recheck_pgs()
    {
+        if (this.recheck_pgs_active)
+        {
+            this.schedule_recheck();
+            return;
+        }
+        this.recheck_pgs_active = true;
        // Take configuration and state, check it against the stored configuration hash
        // Recalculate PGs and save them to etcd if the configuration is changed
        // FIXME: Do not change anything if the distribution is good and random enough and no PGs are degraded
@@ -1182,6 +1251,7 @@ class Mon
                    // Pool deleted. Delete all PGs, but first stop them.
                    if (!await this.stop_all_pgs(pool_id))
                    {
+                        this.recheck_pgs_active = false;
                        this.schedule_recheck();
                        return;
                    }
@@ -1208,6 +1278,12 @@ class Mon
                pool_tree = pool_tree ? pool_tree.children : [];
                pool_tree = LPOptimizer.flatten_tree(pool_tree, levels, pool_cfg.failure_domain, 'osd');
                this.filter_osds_by_tags(osd_tree, pool_tree, pool_cfg.osd_tags);
+                this.filter_osds_by_block_layout(
+                    pool_tree,
+                    pool_cfg.block_size || this.config.block_size || 131072,
+                    pool_cfg.bitmap_granularity || this.config.bitmap_granularity || 4096,
+                    pool_cfg.immediate_commit || this.config.immediate_commit || 'none'
+                );
                // These are for the purpose of building history.osd_sets
                const real_prev_pgs = [];
                let pg_history = [];
@@ -1244,9 +1320,16 @@ class Mon
                        // PG count changed. Need to bring all PGs down.
                        if (!await this.stop_all_pgs(pool_id))
                        {
+                            this.recheck_pgs_active = false;
                            this.schedule_recheck();
                            return;
                        }
+                    }
+                    if (prev_pgs.length != pool_cfg.pg_count)
+                    {
+                        // Scale PG count
+                        // Do it even if old_pg_count is already equal to pool_cfg.pg_count,
+                        // because last_clean_pgs may still contain the old number of PGs
                        const new_pg_history = [];
                        PGUtil.scale_pg_count(prev_pgs, real_prev_pgs, pg_history, new_pg_history, pool_cfg.pg_count);
                        pg_history = new_pg_history;
@@ -1348,6 +1431,7 @@ class Mon
                await this.save_pg_config(new_config_pgs);
            }
        }
+        this.recheck_pgs_active = false;
    }

    async save_pg_config(new_config_pgs, etcd_request = { compare: [], success: [] })
@@ -1397,7 +1481,6 @@ class Mon
    }

    // Schedule a recheck to run after a small timeout (1s)
-    // If already scheduled, cancel previous timer and schedule it again
    // This is required for multiple change events to trigger at most 1 recheck in 1s
    schedule_recheck()
    {
@@ -1411,15 +1494,15 @@ class Mon
        }
    }

-    derive_osd_stats(st, prev)
+    derive_osd_stats(st, prev, prev_diff)
    {
        const zero_stats = { op: { bps: 0n, iops: 0n, lat: 0n }, subop: { iops: 0n, lat: 0n }, recovery: { bps: 0n, iops: 0n } };
-        const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
-        if (!st || !st.time || prev && (prev.time || this.stat_time/1000) >= st.time)
+        const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {}, inode_stats: {} };
+        if (!st || !st.time || !prev || !prev.time || prev.time >= st.time)
        {
-            return diff;
+            return prev_diff || diff;
        }
-        const timediff = BigInt(st.time*1000 - (prev && prev.time*1000 || this.stat_time));
+        const timediff = BigInt(st.time*1000 - prev.time*1000);
        for (const op in st.op_stats||{})
        {
            const pr = prev && prev.op_stats && prev.op_stats[op];
@@ -1451,25 +1534,47 @@ class Mon
            if (n > 0)
                diff.recovery_stats[op] = { ...c, bps: b*1000n/timediff, iops: n*1000n/timediff };
        }
+        for (const pool_id in st.inode_stats||{})
+        {
+            const pool_diff = diff.inode_stats[pool_id] = {};
+            for (const inode_num in st.inode_stats[pool_id])
+            {
+                const inode_diff = diff.inode_stats[pool_id][inode_num] = {};
+                for (const op of [ 'read', 'write', 'delete' ])
+                {
+                    const c = st.inode_stats[pool_id][inode_num][op];
+                    const pr = prev && prev.inode_stats && prev.inode_stats[pool_id] &&
+                        prev.inode_stats[pool_id][inode_num] && prev.inode_stats[pool_id][inode_num][op];
+                    const n = BigInt(c.count||0) - BigInt(pr && pr.count||0);
+                    inode_diff[op] = {
+                        bps: (BigInt(c.bytes||0) - BigInt(pr && pr.bytes||0))*1000n/timediff,
+                        iops: n*1000n/timediff,
+                        lat: (BigInt(c.usec||0) - BigInt(pr && pr.usec||0))/(n || 1n),
+                    };
+                }
+            }
+        }
        return diff;
    }

-    sum_op_stats(timestamp, prev_stats)
+    sum_op_stats()
    {
-        const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
-        if (!prev_stats || prev_stats.timestamp >= timestamp)
+        for (const osd in this.state.osd.stats)
        {
-            return sum_diff;
+            const cur = { ...this.state.osd.stats[osd], inode_stats: this.state.osd.inodestats[osd]||{} };
+            this.prev_stats.osd_diff[osd] = this.derive_osd_stats(
+                cur, this.prev_stats.osd_stats[osd], this.prev_stats.osd_diff[osd]
+            );
+            this.prev_stats.osd_stats[osd] = cur;
        }
-        const tm = BigInt(timestamp - (prev_stats.timestamp || 0));
+        const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
        // Sum derived values instead of deriving summed
        for (const osd in this.state.osd.stats)
        {
-            const derived = this.derive_osd_stats(this.state.osd.stats[osd],
-                this.prev_stats && this.prev_stats.osd_stats && this.prev_stats.osd_stats[osd]);
-            for (const type in derived)
+            const derived = this.prev_stats.osd_diff[osd];
+            for (const type in sum_diff)
            {
-                for (const op in derived[type])
+                for (const op in derived[type]||{})
                {
                    for (const k in derived[type][op])
                    {
@@ -1497,10 +1602,14 @@ class Mon
                    break;
                }
            }
+            const pool_cfg = (this.state.config.pools[pool_id]||{});
            if (!object_size)
            {
-                object_size = (this.state.config.pools[pool_id]||{}).block_size ||
-                    this.config.block_size || 131072;
+                object_size = pool_cfg.block_size || this.config.block_size || 131072;
+            }
+            if (pool_cfg.scheme !== 'replicated')
+            {
+                object_size *= ((pool_cfg.pg_size||0) - (pool_cfg.parity_chunks||0));
            }
            object_size = BigInt(object_size);
            for (const pg_num in this.state.pg.stats[pool_id])
@@ -1522,14 +1631,14 @@ class Mon
        return { object_counts, object_bytes };
    }

-    sum_inode_stats(prev_stats, timestamp, prev_timestamp)
+    sum_inode_stats()
    {
        const inode_stats = {};
        const inode_stub = () => ({
            raw_used: 0n,
-            read: { count: 0n, usec: 0n, bytes: 0n },
-            write: { count: 0n, usec: 0n, bytes: 0n },
-            delete: { count: 0n, usec: 0n, bytes: 0n },
+            read: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
+            write: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
+            delete: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
        });
        const seen_pools = {};
        for (const pool_id in this.state.config.pools)
@@ -1581,11 +1690,25 @@ class Mon
                }
            }
        }
-        if (prev_stats && prev_timestamp >= timestamp)
+        for (const osd in this.prev_stats.osd_diff)
        {
-            prev_stats = null;
+            for (const pool_id in this.prev_stats.osd_diff[osd].inode_stats)
+            {
+                for (const inode_num in this.prev_stats.osd_diff[osd].inode_stats[pool_id])
+                {
+                    inode_stats[pool_id][inode_num] = inode_stats[pool_id][inode_num] || inode_stub();
+                    for (const op of [ 'read', 'write', 'delete' ])
+                    {
+                        const op_diff = this.prev_stats.osd_diff[osd].inode_stats[pool_id][inode_num][op] || {};
+                        const op_st = inode_stats[pool_id][inode_num][op];
+                        op_st.bps += op_diff.bps;
+                        op_st.iops += op_diff.iops;
+                        op_st.lat += op_diff.lat;
+                        op_st.n_osd = (op_st.n_osd || 0) + 1;
+                    }
+                }
+            }
        }
-        const tm = prev_stats ? BigInt(timestamp - prev_timestamp) : 0;
        for (const pool_id in inode_stats)
        {
            for (const inode_num in inode_stats[pool_id])
@@ -1594,11 +1717,12 @@ class Mon
                for (const op of [ 'read', 'write', 'delete' ])
                {
                    const op_st = inode_stats[pool_id][inode_num][op];
-                    const prev_st = prev_stats && prev_stats[pool_id] && prev_stats[pool_id][inode_num] && prev_stats[pool_id][inode_num][op];
-                    op_st.bps = prev_st ? (op_st.bytes - prev_st.bytes) * 1000n / tm : 0;
-                    op_st.iops = prev_st ? (op_st.count - prev_st.count) * 1000n / tm : 0;
-                    op_st.lat = prev_st ? (op_st.usec - prev_st.usec) / ((op_st.count - prev_st.count) || 1n) : 0;
-                    if (op_st.bps > 0 || op_st.iops > 0 || op_st.lat > 0)
+                    if (op_st.n_osd)
+                    {
+                        op_st.lat /= BigInt(op_st.n_osd);
+                        delete op_st.n_osd;
+                    }
+                    if (op_st.bps > 0 || op_st.iops > 0)
                        nonzero = true;
                }
                if (!nonzero && (!this.state.config.inode[pool_id] || !this.state.config.inode[pool_id][inode_num]))
@@ -1631,15 +1755,9 @@ class Mon
    async update_total_stats()
    {
        const txn = [];
-        const timestamp = Date.now();
        const { object_counts, object_bytes } = this.sum_object_counts();
-        let stats = this.sum_op_stats(timestamp, this.prev_stats);
-        let { inode_stats, seen_pools } = this.sum_inode_stats(
-            this.prev_stats ? this.prev_stats.inode_stats : null,
-            timestamp, this.prev_stats ? this.prev_stats.timestamp : null
-        );
-        this.prev_stats = { timestamp, inode_stats, osd_stats: { ...this.state.osd.stats } };
-        this.stat_time = Date.now();
+        let stats = this.sum_op_stats();
+        let { inode_stats, seen_pools } = this.sum_inode_stats();
        stats.object_counts = object_counts;
        stats.object_bytes = object_bytes;
        stats = this.serialize_bigints(stats);
@@ -1784,10 +1902,18 @@ class Mon
        {
            retries = 1;
        }
+        const tried = {};
        while (retries < 0 || retry < retries)
        {
            retry++;
            const base = this.pick_next_etcd();
+            let now = Date.now();
+            if (tried[base] && now-tried[base] < timeout)
+            {
+                await new Promise(ok => setTimeout(ok, timeout-(now-tried[base])));
+                now = Date.now();
+            }
+            tried[base] = now;
            const res = await POST(base+path, body, timeout);
            if (res.error)
            {
--- a/mon/package.json
+++ b/mon/package.json
@@ -1,6 +1,6 @@
 {
  "name": "vitastor-mon",
-  "version": "1.0.0",
+  "version": "1.2.0",
  "description": "Vitastor SDS monitor service",
  "main": "mon-main.js",
  "scripts": {
--- a/patches/cinder-vitastor.py
+++ b/patches/cinder-vitastor.py
@@ -50,7 +50,7 @@ from cinder.volume import configuration
 from cinder.volume import driver
 from cinder.volume import volume_utils

-VERSION = '0.9.3'
+VERSION = '1.2.0'

 LOG = logging.getLogger(__name__)

--- a/patches/qemu-2.12-vitastor.patch
+++ b/patches/qemu-2.12-vitastor.patch
@@ -0,0 +1,176 @@
+diff --git a/block/Makefile.objs b/block/Makefile.objs
+index d644bac60a..e404236291 100644
+--- a/block/Makefile.objs
+++ b/block/Makefile.objs
+@@ -19,6 +19,7 @@ block-obj-$(if $(CONFIG_LIBISCSI),y,n) += iscsi-opts.o
+ block-obj-$(CONFIG_LIBNFS) += nfs.o
+ block-obj-$(CONFIG_CURL) += curl.o
+ block-obj-$(CONFIG_RBD) += rbd.o
+block-obj-$(CONFIG_VITASTOR) += vitastor.o
+ block-obj-$(CONFIG_GLUSTERFS) += gluster.o
+ block-obj-$(CONFIG_VXHS) += vxhs.o
+ block-obj-$(CONFIG_LIBSSH2) += ssh.o
+@@ -39,6 +40,8 @@ curl.o-cflags      := $(CURL_CFLAGS)
+ curl.o-libs        := $(CURL_LIBS)
+ rbd.o-cflags       := $(RBD_CFLAGS)
+ rbd.o-libs         := $(RBD_LIBS)
+vitastor.o-cflags  := $(VITASTOR_CFLAGS)
+vitastor.o-libs    := $(VITASTOR_LIBS)
+ gluster.o-cflags   := $(GLUSTERFS_CFLAGS)
+ gluster.o-libs     := $(GLUSTERFS_LIBS)
+ vxhs.o-libs        := $(VXHS_LIBS)
+diff --git a/configure b/configure
+index 0a19b033bc..58b7fbf24c 100755
+--- a/configure
+++ b/configure
+@@ -398,6 +398,7 @@ trace_backends="log"
+ trace_file="trace"
+ spice=""
+ rbd=""
+vitastor=""
+ smartcard=""
+ libusb=""
+ usb_redir=""
+@@ -1213,6 +1214,10 @@ for opt do
+   ;;
+   --enable-rbd) rbd="yes"
+   ;;
+  --disable-vitastor) vitastor="no"
+  ;;
+  --enable-vitastor) vitastor="yes"
+  ;;
+   --disable-xfsctl) xfs="no"
+   ;;
+   --enable-xfsctl) xfs="yes"
+@@ -1601,6 +1606,7 @@ disabled with --disable-FEATURE, default is enabled if available:
+   vhost-crypto    vhost-crypto acceleration support
+   spice           spice
+   rbd             rados block device (rbd)
+  vitastor        vitastor block device
+   libiscsi        iscsi support
+   libnfs          nfs support
+   smartcard       smartcard support (libcacard)
+@@ -3594,6 +3600,27 @@ EOF
+   fi
+ fi
+ 
+##########################################
+# vitastor probe
+if test "$vitastor" != "no" ; then
+  cat > $TMPC <<EOF
+#include <vitastor_c.h>
+int main(void) {
+  vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  return 0;
+}
+EOF
+  vitastor_libs="-lvitastor_client"
+  if compile_prog "" "$vitastor_libs" ; then
+    vitastor=yes
+  else
+    if test "$vitastor" = "yes" ; then
+      feature_not_found "vitastor block device" "Install vitastor-client-dev"
+    fi
+    vitastor=no
+  fi
+fi
+
+ ##########################################
+ # libssh2 probe
+ min_libssh2_version=1.2.8
+@@ -5837,6 +5864,7 @@ echo "Trace output file $trace_file-<pid>"
+ fi
+ echo "spice support     $spice $(echo_version $spice $spice_protocol_version/$spice_server_version)"
+ echo "rbd support       $rbd"
+echo "vitastor support  $vitastor"
+ echo "xfsctl support    $xfs"
+ echo "smartcard support $smartcard"
+ echo "libusb            $libusb"
+@@ -6416,6 +6444,11 @@ if test "$rbd" = "yes" ; then
+   echo "RBD_CFLAGS=$rbd_cflags" >> $config_host_mak
+   echo "RBD_LIBS=$rbd_libs" >> $config_host_mak
+ fi
+if test "$vitastor" = "yes" ; then
+  echo "CONFIG_VITASTOR=m" >> $config_host_mak
+  echo "VITASTOR_CFLAGS=$vitastor_cflags" >> $config_host_mak
+  echo "VITASTOR_LIBS=$vitastor_libs" >> $config_host_mak
+fi
+ 
+ echo "CONFIG_COROUTINE_BACKEND=$coroutine" >> $config_host_mak
+ if test "$coroutine_pool" = "yes" ; then
+diff --git a/qapi/block-core.json b/qapi/block-core.json
+index c50517bff3..c780bb2c1c 100644
+--- a/qapi/block-core.json
+++ b/qapi/block-core.json
+@@ -2514,7 +2514,7 @@
+             'dmg', 'file', 'ftp', 'ftps', 'gluster', 'host_cdrom',
+             'host_device', 'http', 'https', 'iscsi', 'luks', 'nbd', 'nfs',
+             'null-aio', 'null-co', 'nvme', 'parallels', 'qcow', 'qcow2', 'qed',
+-            'quorum', 'raw', 'rbd', 'replication', 'sheepdog', 'ssh',
+            'quorum', 'raw', 'rbd', 'vitastor', 'replication', 'sheepdog', 'ssh',
+             'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] }
+ 
+ ##
+@@ -3217,6 +3217,28 @@
+             '*snap-id': 'uint32',
+             '*tag': 'str' } }
+ 
+##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image:       Image name
+# @inode:       Inode number
+# @pool:        Pool ID
+# @size:        Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host:   etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+  'data': { '*inode': 'uint64',
+            '*pool': 'uint64',
+            '*size': 'uint64',
+            '*image': 'str',
+            '*config-path': 'str',
+            '*etcd-host': 'str',
+            '*etcd-prefix': 'str' } }
+
+ ##
+ # @ReplicationMode:
+ #
+@@ -3547,6 +3569,7 @@
+       'rbd':        'BlockdevOptionsRbd',
+       'replication':'BlockdevOptionsReplication',
+       'sheepdog':   'BlockdevOptionsSheepdog',
+      'vitastor':   'BlockdevOptionsVitastor',
+       'ssh':        'BlockdevOptionsSsh',
+       'throttle':   'BlockdevOptionsThrottle',
+       'vdi':        'BlockdevOptionsGenericFormat',
+@@ -3991,6 +4014,17 @@
+             '*subformat':           'BlockdevVhdxSubformat',
+             '*block-state-zero':    'bool' } }
+ 
+##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+  'data': { 'location':         'BlockdevOptionsVitastor',
+            'size':             'size' } }
+
+ ##
+ # @BlockdevVpcSubformat:
+ #
+@@ -4074,6 +4108,7 @@
+       'rbd':            'BlockdevCreateOptionsRbd',
+       'replication':    'BlockdevCreateNotSupported',
+       'sheepdog':       'BlockdevCreateOptionsSheepdog',
+      'vitastor':       'BlockdevCreateOptionsVitastor',
+       'ssh':            'BlockdevCreateOptionsSsh',
+       'throttle':       'BlockdevCreateNotSupported',
+       'vdi':            'BlockdevCreateOptionsVdi',
--- a/patches/qemu-5.2-vitastor.patch
+++ b/patches/qemu-5.2-vitastor.patch
@@ -0,0 +1,181 @@
+Index: qemu-5.2+dfsg/qapi/block-core.json
+===================================================================
+--- qemu-5.2+dfsg.orig/qapi/block-core.json
+++ qemu-5.2+dfsg/qapi/block-core.json
+@@ -2831,7 +2831,7 @@
+             'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
+             'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
+             { 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' },
+-            'sheepdog',
+            'sheepdog', 'vitastor',
+             'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
+ 
+ ##
+@@ -3668,6 +3668,28 @@
+             '*tag': 'str' } }
+ 
+ ##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image:       Image name
+# @inode:       Inode number
+# @pool:        Pool ID
+# @size:        Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host:   etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+  'data': { '*inode': 'uint64',
+            '*pool': 'uint64',
+            '*size': 'uint64',
+            '*image': 'str',
+            '*config-path': 'str',
+            '*etcd-host': 'str',
+            '*etcd-prefix': 'str' } }
+
+##
+ # @ReplicationMode:
+ #
+ # An enumeration of replication modes.
+@@ -4015,6 +4037,7 @@
+       'replication': { 'type': 'BlockdevOptionsReplication',
+                        'if': 'defined(CONFIG_REPLICATION)' },
+       'sheepdog':   'BlockdevOptionsSheepdog',
+      'vitastor':   'BlockdevOptionsVitastor',
+       'ssh':        'BlockdevOptionsSsh',
+       'throttle':   'BlockdevOptionsThrottle',
+       'vdi':        'BlockdevOptionsGenericFormat',
+@@ -4404,6 +4427,17 @@
+             '*cluster-size' :   'size' } }
+ 
+ ##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+  'data': { 'location':         'BlockdevOptionsVitastor',
+            'size':             'size' } }
+
+##
+ # @BlockdevVmdkSubformat:
+ #
+ # Subformat options for VMDK images
+@@ -4665,6 +4699,7 @@
+       'qed':            'BlockdevCreateOptionsQed',
+       'rbd':            'BlockdevCreateOptionsRbd',
+       'sheepdog':       'BlockdevCreateOptionsSheepdog',
+      'vitastor':       'BlockdevCreateOptionsVitastor',
+       'ssh':            'BlockdevCreateOptionsSsh',
+       'vdi':            'BlockdevCreateOptionsVdi',
+       'vhdx':           'BlockdevCreateOptionsVhdx',
+Index: qemu-5.2+dfsg/block/meson.build
+===================================================================
+--- qemu-5.2+dfsg.orig/block/meson.build
+++ qemu-5.2+dfsg/block/meson.build
+@@ -76,6 +76,7 @@ foreach m : [
+   ['CONFIG_LIBNFS', 'nfs', libnfs, 'nfs.c'],
+   ['CONFIG_LIBSSH', 'ssh', libssh, 'ssh.c'],
+   ['CONFIG_RBD', 'rbd', rbd, 'rbd.c'],
+  ['CONFIG_VITASTOR', 'vitastor', vitastor, 'vitastor.c'],
+ ]
+   if config_host.has_key(m[0])
+     if enable_modules
+Index: qemu-5.2+dfsg/configure
+===================================================================
+--- qemu-5.2+dfsg.orig/configure
+++ qemu-5.2+dfsg/configure
+@@ -372,6 +372,7 @@ trace_backends="log"
+ trace_file="trace"
+ spice=""
+ rbd=""
+vitastor=""
+ smartcard=""
+ u2f="auto"
+ libusb=""
+@@ -1263,6 +1264,10 @@ for opt do
+   ;;
+   --enable-rbd) rbd="yes"
+   ;;
+  --disable-vitastor) vitastor="no"
+  ;;
+  --enable-vitastor) vitastor="yes"
+  ;;
+   --disable-xfsctl) xfs="no"
+   ;;
+   --enable-xfsctl) xfs="yes"
+@@ -1827,6 +1832,7 @@ disabled with --disable-FEATURE, default
+   vhost-vdpa      vhost-vdpa kernel backend support
+   spice           spice
+   rbd             rados block device (rbd)
+  vitastor        vitastor block device
+   libiscsi        iscsi support
+   libnfs          nfs support
+   smartcard       smartcard support (libcacard)
+@@ -3719,6 +3725,27 @@ EOF
+ fi
+ 
+ ##########################################
+# vitastor probe
+if test "$vitastor" != "no" ; then
+  cat > $TMPC <<EOF
+#include <vitastor_c.h>
+int main(void) {
+  vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  return 0;
+}
+EOF
+  vitastor_libs="-lvitastor_client"
+  if compile_prog "" "$vitastor_libs" ; then
+    vitastor=yes
+  else
+    if test "$vitastor" = "yes" ; then
+      feature_not_found "vitastor block device" "Install vitastor-client-dev"
+    fi
+    vitastor=no
+  fi
+fi
+
+##########################################
+ # libssh probe
+ if test "$libssh" != "no" ; then
+   if $pkg_config --exists libssh; then
+@@ -6456,6 +6483,10 @@ if test "$rbd" = "yes" ; then
+   echo "CONFIG_RBD=y" >> $config_host_mak
+   echo "RBD_LIBS=$rbd_libs" >> $config_host_mak
+ fi
+if test "$vitastor" = "yes" ; then
+  echo "CONFIG_VITASTOR=y" >> $config_host_mak
+  echo "VITASTOR_LIBS=$vitastor_libs" >> $config_host_mak
+fi
+ 
+ echo "CONFIG_COROUTINE_BACKEND=$coroutine" >> $config_host_mak
+ if test "$coroutine_pool" = "yes" ; then
+Index: qemu-5.2+dfsg/meson.build
+===================================================================
+--- qemu-5.2+dfsg.orig/meson.build
+++ qemu-5.2+dfsg/meson.build
+@@ -596,6 +596,10 @@ rbd = not_found
+ if 'CONFIG_RBD' in config_host
+   rbd = declare_dependency(link_args: config_host['RBD_LIBS'].split())
+ endif
+vitastor = not_found
+if 'CONFIG_VITASTOR' in config_host
+  vitastor = declare_dependency(link_args: config_host['VITASTOR_LIBS'].split())
+endif
+ glusterfs = not_found
+ if 'CONFIG_GLUSTERFS' in config_host
+   glusterfs = declare_dependency(compile_args: config_host['GLUSTERFS_CFLAGS'].split(),
+@@ -2145,6 +2149,7 @@ endif
+ # TODO: add back protocol and server version
+ summary_info += {'spice support':     config_host.has_key('CONFIG_SPICE')}
+ summary_info += {'rbd support':       config_host.has_key('CONFIG_RBD')}
+summary_info += {'vitastor support':  config_host.has_key('CONFIG_VITASTOR')}
+ summary_info += {'xfsctl support':    config_host.has_key('CONFIG_XFS')}
+ summary_info += {'smartcard support': config_host.has_key('CONFIG_SMARTCARD')}
+ summary_info += {'U2F support':       u2f.found()}
--- a/rpm/build-tarball.sh
+++ b/rpm/build-tarball.sh
@@ -24,4 +24,4 @@ rm fio
 mv fio-copy fio
 FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
 perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
-tar --transform 's#^#vitastor-0.9.3/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.9.3$(rpm --eval '%dist').tar.gz *
+tar --transform 's#^#vitastor-1.2.0/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.2.0$(rpm --eval '%dist').tar.gz *
--- a/rpm/qemu-kvm-4.2-el7.spec.patch
+++ b/rpm/qemu-kvm-4.2-el7.spec.patch
@@ -22,7 +22,7 @@
 Name: qemu-kvm
 Version: 4.2.0
 -Release: 29.vitastor%{?dist}.6
-+Release: 32.vitastor%{?dist}.6
+Release: 34.vitastor%{?dist}.6
 # Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
 Epoch: 15
 License: GPLv2 and GPLv2+ and CC-BY
--- a/rpm/qemu-kvm-4.2-el8.spec.patch
+++ b/rpm/qemu-kvm-4.2-el8.spec.patch
@@ -13,7 +13,7 @@
 Name: qemu-kvm
 Version: 4.2.0
 -Release: 29%{?dist}.6
-+Release: 32.vitastor%{?dist}.6
+Release: 33.vitastor%{?dist}.6
 # Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
 Epoch: 15
 License: GPLv2 and GPLv2+ and CC-BY
--- a/rpm/qemu-kvm-6.2-el8.spec.patch
+++ b/rpm/qemu-kvm-6.2-el8.spec.patch
@@ -0,0 +1,103 @@
+--- qemu-kvm-6.2.spec.orig	2023-07-18 13:52:57.636625440 +0000
+++ qemu-kvm-6.2.spec	2023-07-18 13:52:19.011683886 +0000
+@@ -73,6 +73,7 @@ Requires: %{name}-hw-usbredir = %{epoch}
+ %endif                                                           \
+ Requires: %{name}-block-iscsi = %{epoch}:%{version}-%{release}   \
+ Requires: %{name}-block-rbd = %{epoch}:%{version}-%{release}     \
+Requires: %{name}-block-vitastor = %{epoch}:%{version}-%{release}\
+ Requires: %{name}-block-ssh = %{epoch}:%{version}-%{release}
+ 
+ # Macro to properly setup RHEL/RHEV conflict handling
+@@ -83,7 +84,7 @@ Obsoletes: %1-rhev <= %{epoch}:%{version
+ Summary: QEMU is a machine emulator and virtualizer
+ Name: qemu-kvm
+ Version: 6.2.0
+-Release: 32%{?rcrel}%{?dist}
+Release: 32.vitastor%{?rcrel}%{?dist}
+ # Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
+ Epoch: 15
+ License: GPLv2 and GPLv2+ and CC-BY
+@@ -122,6 +123,7 @@ Source37: tests_data_acpi_pc_SSDT.dimmpx
+ Source38: tests_data_acpi_q35_FACP.slic
+ Source39: tests_data_acpi_q35_SSDT.dimmpxm
+ Source40: tests_data_acpi_virt_SSDT.memhp
+Source41: qemu-vitastor.c
+ 
+ Patch0001: 0001-redhat-Adding-slirp-to-the-exploded-tree.patch
+ Patch0005: 0005-Initial-redhat-build.patch
+@@ -652,6 +654,7 @@ Patch255: kvm-scsi-protect-req-aiocb-wit
+ Patch256: kvm-dma-helpers-prevent-dma_blk_cb-vs-dma_aio_cancel-rac.patch
+ # For bz#2090990 - qemu crash with error scsi_req_unref(SCSIRequest *): Assertion `req->refcount > 0' failed or scsi_dma_complete(void *, int): Assertion `r->req.aiocb != NULL' failed [8.7.0]
+ Patch257: kvm-virtio-scsi-reset-SCSI-devices-from-main-loop-thread.patch
+Patch258: qemu-6.2-vitastor.patch
+ 
+ BuildRequires: wget
+ BuildRequires: rpm-build
+@@ -689,6 +692,7 @@ BuildRequires: libcurl-devel
+ BuildRequires: libssh-devel
+ BuildRequires: librados-devel
+ BuildRequires: librbd-devel
+BuildRequires: vitastor-client-devel
+ %if %{have_gluster}
+ # For gluster block driver
+ BuildRequires: glusterfs-api-devel
+@@ -926,6 +930,14 @@ Install this package if you want to acce
+ using the rbd protocol.
+ 
+ 
+%package  block-vitastor
+Summary: QEMU Vitastor block driver
+Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
+
+%description block-vitastor
+This package provides the additional Vitastor block driver for QEMU.
+
+
+ %package  block-ssh
+ Summary: QEMU SSH block driver
+ Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
+@@ -979,6 +991,7 @@ This package provides usbredir support.
+ rm -fr slirp
+ mkdir slirp
+ %autopatch -p1
+cp %{SOURCE41} ./block/vitastor.c
+ 
+ %global qemu_kvm_build qemu_kvm_build
+ mkdir -p %{qemu_kvm_build}
+@@ -994,7 +1007,7 @@ cp -f %{SOURCE40} tests/data/acpi/virt/S
+ # --build-id option is used for giving info to the debug packages.
+ buildldflags="VL_LDFLAGS=-Wl,--build-id"
+ 
+-%global block_drivers_list qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle
+%global block_drivers_list qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle
+ 
+ %if 0%{have_gluster}
+     %global block_drivers_list %{block_drivers_list},gluster
+@@ -1149,9 +1162,7 @@ pushd %{qemu_kvm_build}
+   --firmwarepath=%{_prefix}/share/qemu-firmware \
+   --meson="git" \
+   --target-list="%{buildarch}" \
+-  --block-drv-rw-whitelist=%{block_drivers_list} \
+   --audio-drv-list= \
+-  --block-drv-ro-whitelist=vmdk,vhdx,vpc,https,ssh \
+   --with-coroutine=ucontext \
+   --with-git=git \
+   --tls-priority=@QEMU,SYSTEM \
+@@ -1197,6 +1208,7 @@ pushd %{qemu_kvm_build}
+ %endif
+   --enable-pie \
+   --enable-rbd \
+  --enable-vitastor \
+ %if 0%{have_librdma}
+   --enable-rdma \
+ %endif
+@@ -1794,6 +1806,9 @@ sh %{_sysconfdir}/sysconfig/modules/kvm.
+ %files block-rbd
+ %{_libdir}/qemu-kvm/block-rbd.so
+ 
+%files block-vitastor
+%{_libdir}/qemu-kvm/block-vitastor.so
+
+ %files block-ssh
+ %{_libdir}/qemu-kvm/block-ssh.so
+ 
--- a/rpm/qemu-kvm-7.2-el9.spec.patch
+++ b/rpm/qemu-kvm-7.2-el9.spec.patch
@@ -0,0 +1,93 @@
+--- qemu-kvm-7.2.spec.orig	2023-06-22 13:56:19.000000000 +0000
+++ qemu-kvm-7.2.spec	2023-07-18 07:55:22.347090196 +0000
+@@ -100,8 +100,6 @@
+ %endif
+ 
+ %global target_list %{kvm_target}-softmmu
+-%global block_drivers_rw_list qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,compress
+-%global block_drivers_ro_list vdi,vmdk,vhdx,vpc,https
+ %define qemudocdir %{_docdir}/%{name}
+ %global firmwaredirs "%{_datadir}/qemu-firmware:%{_datadir}/ipxe/qemu:%{_datadir}/seavgabios:%{_datadir}/seabios"
+ 
+@@ -126,6 +124,7 @@ Requires: %{name}-device-usb-host = %{ep
+ Requires: %{name}-device-usb-redirect = %{epoch}:%{version}-%{release}   \
+ %endif                                                           \
+ Requires: %{name}-block-rbd = %{epoch}:%{version}-%{release}     \
+Requires: %{name}-block-vitastor = %{epoch}:%{version}-%{release}\
+ Requires: %{name}-audio-pa = %{epoch}:%{version}-%{release}
+ 
+ # Since SPICE is removed from RHEL-9, the following Obsoletes:
+@@ -148,7 +147,7 @@ Obsoletes: %{name}-block-ssh <= %{epoch}
+ Summary: QEMU is a machine emulator and virtualizer
+ Name: qemu-kvm
+ Version: 7.2.0
+-Release: 14%{?rcrel}%{?dist}%{?cc_suffix}.1
+Release: 14.vitastor%{?rcrel}%{?dist}%{?cc_suffix}.1
+ # Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
+ # Epoch 15 used for RHEL 8
+ # Epoch 17 used for RHEL 9 (due to release versioning offset in RHEL 8.5)
+@@ -171,6 +170,7 @@ Source28: 95-kvm-memlock.conf
+ Source30: kvm-s390x.conf
+ Source31: kvm-x86.conf
+ Source36: README.tests
+Source37: qemu-vitastor.c
+ 
+ 
+ Patch0004: 0004-Initial-redhat-build.patch
+@@ -418,6 +418,7 @@ Patch134: kvm-target-i386-Fix-BZHI-instr
+ Patch135: kvm-intel-iommu-fail-DEVIOTLB_UNMAP-without-dt-mode.patch
+ # For bz#2203745 - Disk detach is unsuccessful while the guest is still booting [rhel-9.2.0.z]
+ Patch136: kvm-acpi-pcihp-allow-repeating-hot-unplug-requests.patch
+Patch137: qemu-7.2-vitastor.patch
+ 
+ %if %{have_clang}
+ BuildRequires: clang
+@@ -449,6 +450,7 @@ BuildRequires: libcurl-devel
+ %if %{have_block_rbd}
+ BuildRequires: librbd-devel
+ %endif
+BuildRequires: vitastor-client-devel
+ # We need both because the 'stap' binary is probed for by configure
+ BuildRequires: systemtap
+ BuildRequires: systemtap-sdt-devel
+@@ -642,6 +644,14 @@ using the rbd protocol.
+ %endif
+ 
+ 
+%package  block-vitastor
+Summary: QEMU Vitastor block driver
+Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
+
+%description block-vitastor
+This package provides the additional Vitastor block driver for QEMU.
+
+
+ %package  audio-pa
+ Summary: QEMU PulseAudio audio driver
+ Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
+@@ -719,6 +729,7 @@ This package provides usbredir support.
+ %prep
+ %setup -q -n qemu-%{version}%{?rcstr}
+ %autopatch -p1
+cp %{SOURCE37} ./block/vitastor.c
+ 
+ %global qemu_kvm_build qemu_kvm_build
+ mkdir -p %{qemu_kvm_build}
+@@ -946,6 +957,7 @@ run_configure \
+ %if %{have_block_rbd}
+   --enable-rbd \
+ %endif
+  --enable-vitastor \
+ %if %{have_librdma}
+   --enable-rdma \
+ %endif
+@@ -1426,6 +1438,9 @@ useradd -r -u 107 -g qemu -G kvm -d / -s
+ %files block-rbd
+ %{_libdir}/%{name}/block-rbd.so
+ %endif
+%files block-vitastor
+%{_libdir}/%{name}/block-vitastor.so
+
+ %files audio-pa
+ %{_libdir}/%{name}/audio-pa.so
+ 
--- a/rpm/vitastor-el7.Dockerfile
+++ b/rpm/vitastor-el7.Dockerfile
@@ -35,7 +35,7 @@ ADD . /root/vitastor
 RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
-    cp /root/vitastor-0.9.3.el7.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-1.2.0.el7.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
--- a/rpm/vitastor-el7.spec
+++ b/rpm/vitastor-el7.spec
@@ -1,11 +1,11 @@
 Name:           vitastor
-Version:        0.9.3
+Version:        1.2.0
 Release:        1%{?dist}
 Summary:        Vitastor, a fast software-defined clustered block storage

 License:        Vitastor Network Public License 1.1
 URL:            https://vitastor.io/
-Source0:        vitastor-0.9.3.el7.tar.gz
+Source0:        vitastor-1.2.0.el7.tar.gz

 BuildRequires:  liburing-devel >= 0.6
 BuildRequires:  gperftools-devel
--- a/rpm/vitastor-el8.Dockerfile
+++ b/rpm/vitastor-el8.Dockerfile
@@ -35,7 +35,7 @@ ADD . /root/vitastor
 RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
-    cp /root/vitastor-0.9.3.el8.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-1.2.0.el8.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
--- a/rpm/vitastor-el8.spec
+++ b/rpm/vitastor-el8.spec
@@ -1,11 +1,11 @@
 Name:           vitastor
-Version:        0.9.3
+Version:        1.2.0
 Release:        1%{?dist}
 Summary:        Vitastor, a fast software-defined clustered block storage

 License:        Vitastor Network Public License 1.1
 URL:            https://vitastor.io/
-Source0:        vitastor-0.9.3.el8.tar.gz
+Source0:        vitastor-1.2.0.el8.tar.gz

 BuildRequires:  liburing-devel >= 0.6
 BuildRequires:  gperftools-devel
--- a/rpm/vitastor-el9.Dockerfile
+++ b/rpm/vitastor-el9.Dockerfile
@@ -18,7 +18,7 @@ ADD . /root/vitastor
 RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
-    cp /root/vitastor-0.9.3.el9.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-1.2.0.el9.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
--- a/rpm/vitastor-el9.spec
+++ b/rpm/vitastor-el9.spec
@@ -1,11 +1,11 @@
 Name:           vitastor
-Version:        0.9.3
+Version:        1.2.0
 Release:        1%{?dist}
 Summary:        Vitastor, a fast software-defined clustered block storage

 License:        Vitastor Network Public License 1.1
 URL:            https://vitastor.io/
-Source0:        vitastor-0.9.3.el9.tar.gz
+Source0:        vitastor-1.2.0.el9.tar.gz

 BuildRequires:  liburing-devel >= 0.6
 BuildRequires:  gperftools-devel
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -16,10 +16,11 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
 	set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
 endif()

-add_definitions(-DVERSION="0.9.3")
-add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
+add_definitions(-DVERSION="1.2.0")
+add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
+add_link_options(-fno-omit-frame-pointer)
 if (${WITH_ASAN})
-	add_definitions(-fsanitize=address -fno-omit-frame-pointer)
+	add_definitions(-fsanitize=address)
 	add_link_options(-fsanitize=address -fno-omit-frame-pointer)
 endif (${WITH_ASAN})

@@ -137,6 +138,7 @@ endif (${WITH_FIO})
 add_library(vitastor_client SHARED
 	cluster_client.cpp
 	cluster_client_list.cpp
+	cluster_client_wb.cpp
 	vitastor_c.cpp
 	cli_common.cpp
 	cli_alloc_osd.cpp
@@ -179,6 +181,25 @@ target_link_libraries(vitastor-nbd
 	vitastor_client
 )

+# vitastor-kv and vitastor-kv-stress: both compile kv_db.cpp and link against vitastor_client
+add_executable(vitastor-kv
+	kv_cli.cpp
+	kv_db.cpp
+	kv_db.h
+)
+target_link_libraries(vitastor-kv
+	vitastor_client
+)
+
+add_executable(vitastor-kv-stress
+	kv_stress.cpp
+	kv_db.cpp
+	kv_db.h
+)
+target_link_libraries(vitastor-kv-stress
+	vitastor_client
+)
+
 # vitastor-nfs
 add_executable(vitastor-nfs
 	nfs_proxy.cpp
@@ -300,7 +321,7 @@ target_link_libraries(test_crc32
 add_executable(test_cluster_client
 	EXCLUDE_FROM_ALL
 	test_cluster_client.cpp
-	pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
+	pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp cluster_client_wb.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
 	etcd_state_client.cpp timerfd_manager.cpp str_util.cpp ../json11/json11.cpp
 )
 target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)
--- a/src/addr_util.cpp
+++ b/src/addr_util.cpp
@@ -19,8 +19,8 @@ bool string_to_addr(std::string str, bool parse_port, int default_port, struct s
        if (p != std::string::npos && !(str.length() > 0 && str[p-1] == ']')) // "[ipv6]" which contains ':'
        {
            char null_byte = 0;
-            int n = sscanf(str.c_str()+p+1, "%d%c", &default_port, &null_byte);
-            if (n != 1 || default_port >= 0x10000)
+            int scanned = sscanf(str.c_str()+p+1, "%d%c", &default_port, &null_byte);
+            if (scanned != 1 || default_port >= 0x10000)
                return false;
            str = str.substr(0, p);
        }
--- a/src/blockstore_disk.cpp
+++ b/src/blockstore_disk.cpp
@@ -45,13 +45,31 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
    meta_block_size = parse_size(config["meta_block_size"]);
    bitmap_granularity = parse_size(config["bitmap_granularity"]);
    meta_format = stoull_full(config["meta_format"]);
-    cached_read_data = config["cached_read_data"] == "true" || config["cached_read_data"] == "yes" || config["cached_read_data"] == "1";
-    cached_read_meta = cached_read_data && (meta_device == data_device || meta_device == "") &&
-        config.find("cached_read_meta") == config.end() ||
-        config["cached_read_meta"] == "true" || config["cached_read_meta"] == "yes" || config["cached_read_meta"] == "1";
-    cached_read_journal = cached_read_meta && (journal_device == meta_device || journal_device == "") &&
-        config.find("cached_read_journal") == config.end() ||
-        config["cached_read_journal"] == "true" || config["cached_read_journal"] == "yes" || config["cached_read_journal"] == "1";
+    if (config.find("data_io") == config.end() &&
+        config.find("meta_io") == config.end() &&
+        config.find("journal_io") == config.end())
+    {
+        bool cached_io_data = config["cached_io_data"] == "true" || config["cached_io_data"] == "yes" || config["cached_io_data"] == "1";
+        bool cached_io_meta = cached_io_data && (meta_device == data_device || meta_device == "") &&
+            config.find("cached_io_meta") == config.end() ||
+            config["cached_io_meta"] == "true" || config["cached_io_meta"] == "yes" || config["cached_io_meta"] == "1";
+        bool cached_io_journal = cached_io_meta && (journal_device == meta_device || journal_device == "") &&
+            config.find("cached_io_journal") == config.end() ||
+            config["cached_io_journal"] == "true" || config["cached_io_journal"] == "yes" || config["cached_io_journal"] == "1";
+        data_io = cached_io_data ? "cached" : "direct";
+        meta_io = cached_io_meta ? "cached" : "direct";
+        journal_io = cached_io_journal ? "cached" : "direct";
+    }
+    else
+    {
+        data_io = config.find("data_io") != config.end() ? config["data_io"] : "direct";
+        meta_io = config.find("meta_io") != config.end()
+            ? config["meta_io"]
+            : (meta_device == data_device || meta_device == "" ? data_io : "direct");
+        journal_io = config.find("journal_io") != config.end()
+            ? config["journal_io"]
+            : (journal_device == meta_device || journal_device == "" ? meta_io : "direct");
+    }
    if (config["data_csum_type"] == "crc32c")
    {
        data_csum_type = BLOCKSTORE_CSUM_CRC32C;
@@ -272,9 +290,19 @@ static void check_size(int fd, uint64_t *size, uint64_t *sectsize, std::string n
    }
 }

+static int bs_openmode(const std::string & mode) // Map an I/O mode name (data_io/meta_io/journal_io) to open(2) flags; callers OR in O_RDWR
+{
+    if (mode == "directsync")
+        return O_DIRECT|O_SYNC; // direct I/O combined with O_SYNC
+    else if (mode == "cached")
+        return O_SYNC; // O_SYNC without O_DIRECT, i.e. reads and writes go through the Linux page cache
+    else
+        return O_DIRECT; // "direct", "" and any other value fall back to plain O_DIRECT
+}
+
 void blockstore_disk_t::open_data()
 {
-    data_fd = open(data_device.c_str(), O_DIRECT|O_RDWR);
+    data_fd = open(data_device.c_str(), bs_openmode(data_io) | O_RDWR);
    if (data_fd == -1)
    {
        throw std::runtime_error("Failed to open data device "+data_device+": "+std::string(strerror(errno)));
@@ -295,25 +323,13 @@ void blockstore_disk_t::open_data()
    {
        throw std::runtime_error(std::string("Failed to lock data device: ") + strerror(errno));
    }
-    if (cached_read_data)
-    {
-        read_data_fd = open(data_device.c_str(), O_RDWR);
-        if (read_data_fd == -1)
-        {
-            throw std::runtime_error("Failed to open data device "+data_device+": "+std::string(strerror(errno)));
-        }
-    }
-    else
-    {
-        read_data_fd = data_fd;
-    }
 }

 void blockstore_disk_t::open_meta()
 {
-    if (meta_device != data_device)
+    if (meta_device != data_device || meta_io != data_io)
    {
-        meta_fd = open(meta_device.c_str(), O_DIRECT|O_RDWR);
+        meta_fd = open(meta_device.c_str(), bs_openmode(meta_io) | O_RDWR);
        if (meta_fd == -1)
        {
            throw std::runtime_error("Failed to open metadata device "+meta_device+": "+std::string(strerror(errno)));
@@ -323,22 +339,10 @@ void blockstore_disk_t::open_meta()
        {
            throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(meta_device_size));
        }
-        if (!disable_flock && flock(meta_fd, LOCK_EX|LOCK_NB) != 0)
+        if (!disable_flock && meta_device != data_device && flock(meta_fd, LOCK_EX|LOCK_NB) != 0)
        {
            throw std::runtime_error(std::string("Failed to lock metadata device: ") + strerror(errno));
        }
-        if (cached_read_meta)
-        {
-            read_meta_fd = open(meta_device.c_str(), O_RDWR);
-            if (read_meta_fd == -1)
-            {
-                throw std::runtime_error("Failed to open metadata device "+meta_device+": "+std::string(strerror(errno)));
-            }
-        }
-        else
-        {
-            read_meta_fd = meta_fd;
-        }
    }
    else
    {
@@ -357,35 +361,19 @@ void blockstore_disk_t::open_meta()
            ") is not a multiple of data device sector size ("+std::to_string(meta_device_sect)+")"
        );
    }
-    if (!cached_read_meta)
-    {
-        read_meta_fd = meta_fd;
-    }
-    else if (meta_device == data_device && cached_read_data)
-    {
-        read_meta_fd = read_data_fd;
-    }
-    else
-    {
-        read_meta_fd = open(meta_device.c_str(), O_RDWR);
-        if (read_meta_fd == -1)
-        {
-            throw std::runtime_error("Failed to open metadata device "+meta_device+": "+std::string(strerror(errno)));
-        }
-    }
 }

 void blockstore_disk_t::open_journal()
 {
-    if (journal_device != meta_device)
+    if (journal_device != meta_device || journal_io != meta_io)
    {
-        journal_fd = open(journal_device.c_str(), O_DIRECT|O_RDWR);
+        journal_fd = open(journal_device.c_str(), bs_openmode(journal_io) | O_RDWR);
        if (journal_fd == -1)
        {
            throw std::runtime_error("Failed to open journal device "+journal_device+": "+std::string(strerror(errno)));
        }
        check_size(journal_fd, &journal_device_size, &journal_device_sect, "journal device");
-        if (!disable_flock && flock(journal_fd, LOCK_EX|LOCK_NB) != 0)
+        if (!disable_flock && journal_device != meta_device && flock(journal_fd, LOCK_EX|LOCK_NB) != 0)
        {
            throw std::runtime_error(std::string("Failed to lock journal device: ") + strerror(errno));
        }
@@ -407,26 +395,6 @@ void blockstore_disk_t::open_journal()
            ") is not a multiple of journal device sector size ("+std::to_string(journal_device_sect)+")"
        );
    }
-    if (!cached_read_journal)
-    {
-        read_journal_fd = journal_fd;
-    }
-    else if (journal_device == meta_device && cached_read_meta)
-    {
-        read_journal_fd = read_meta_fd;
-    }
-    else if (journal_device == data_device && cached_read_data)
-    {
-        read_journal_fd = read_data_fd;
-    }
-    else
-    {
-        read_journal_fd = open(journal_device.c_str(), O_RDWR);
-        if (read_journal_fd == -1)
-        {
-            throw std::runtime_error("Failed to open journal device "+journal_device+": "+std::string(strerror(errno)));
-        }
-    }
 }

 void blockstore_disk_t::close_all()
@@ -437,12 +405,5 @@ void blockstore_disk_t::close_all()
        close(meta_fd);
    if (journal_fd >= 0 && journal_fd != meta_fd)
        close(journal_fd);
-    if (read_data_fd >= 0 && read_data_fd != data_fd)
-        close(read_data_fd);
-    if (read_meta_fd >= 0 && read_meta_fd != meta_fd)
-        close(read_meta_fd);
-    if (read_journal_fd >= 0 && read_journal_fd != journal_fd)
-        close(read_journal_fd);
    data_fd = meta_fd = journal_fd = -1;
-    read_data_fd = read_meta_fd = read_journal_fd = -1;
 }
--- a/src/blockstore_disk.h
+++ b/src/blockstore_disk.h
@@ -31,11 +31,11 @@ struct blockstore_disk_t
    uint32_t csum_block_size = 4096;
    // By default, Blockstore locks all opened devices exclusively. This option can be used to disable locking
    bool disable_flock = false;
-    // Use linux page cache for reads. If enabled, separate buffered FDs will be opened for reading
-    bool cached_read_data = false, cached_read_meta = false, cached_read_journal = false;
+    // I/O modes for data, metadata and journal: direct or "" = O_DIRECT, cached = O_SYNC, directsync = O_DIRECT|O_SYNC
+    // O_SYNC without O_DIRECT = use Linux page cache for reads and writes
+    std::string data_io, meta_io, journal_io;

    int meta_fd = -1, data_fd = -1, journal_fd = -1;
-    int read_meta_fd = -1, read_data_fd = -1, read_journal_fd = -1;
    uint64_t meta_offset, meta_device_sect, meta_device_size, meta_len, meta_format = 0;
    uint64_t data_offset, data_device_sect, data_device_size, data_len;
    uint64_t journal_offset, journal_device_sect, journal_device_size, journal_len;
--- a/src/blockstore_flush.cpp
+++ b/src/blockstore_flush.cpp
@@ -1087,7 +1087,7 @@ bool journal_flusher_co::read_dirty(int wait_base)
        data->iov = (struct iovec){ vi.buf, vi.len };
        data->callback = simple_callback_r;
        my_uring_prep_readv(
-            sqe, bs->dsk.read_data_fd, &data->iov, 1, bs->dsk.data_offset + old_clean_loc + vi.offset
+            sqe, bs->dsk.data_fd, &data->iov, 1, bs->dsk.data_offset + old_clean_loc + vi.offset
        );
        wait_count++;
        bs->find_holes(v, vi.offset, vi.offset+vi.len, [this, buf = (uint8_t*)vi.buf-vi.offset](int pos, bool alloc, uint32_t cur_start, uint32_t cur_end)
@@ -1119,7 +1119,7 @@ bool journal_flusher_co::read_dirty(int wait_base)
                data->iov = (struct iovec){ v[i].buf, (size_t)v[i].len };
                data->callback = simple_callback_rj;
                my_uring_prep_readv(
-                    sqe, bs->dsk.read_journal_fd, &data->iov, 1, bs->journal.offset + v[i].disk_offset
+                    sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset + v[i].disk_offset
                );
                wait_journal_count++;
            }
@@ -1212,7 +1212,7 @@ bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_
        data->callback = simple_callback_r;
        wr.submitted = true;
        my_uring_prep_readv(
-            sqe, bs->dsk.read_meta_fd, &data->iov, 1, bs->dsk.meta_offset + bs->dsk.meta_block_size + wr.sector
+            sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bs->dsk.meta_block_size + wr.sector
        );
        wait_count++;
    }
@@ -1372,7 +1372,8 @@ bool journal_flusher_co::trim_journal(int wait_base)
                    ? (uint32_t)JE_START_V1_SIZE : (uint32_t)JE_START_V2_SIZE),
                .reserved = 0,
                .journal_start = new_trim_pos,
-                .version = JOURNAL_VERSION_V2,
+                .version = (uint64_t)(!bs->dsk.data_csum_type && ((journal_entry_start*)flusher->journal_superblock)->version == JOURNAL_VERSION_V1
+                    ? JOURNAL_VERSION_V1 : JOURNAL_VERSION_V2),
                .data_csum_type = bs->dsk.data_csum_type,
                .csum_block_size = bs->dsk.csum_block_size,
            };
--- a/src/blockstore_impl.cpp
+++ b/src/blockstore_impl.cpp
@@ -384,6 +384,10 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
        ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
        return;
    }
+    if (op->opcode == BS_OP_SYNC)
+    {
+        unsynced_queued_ops = 0;
+    }
    init_op(op);
    submit_queue.push_back(op);
    ringloop->wakeup();
@@ -393,6 +397,7 @@ void blockstore_impl_t::init_op(blockstore_op_t *op)
 {
    // Call constructor without allocating memory. We'll call destructor before returning op back
    new ((void*)op->private_data) blockstore_op_private_t;
+    PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
    PRIV(op)->wait_for = 0;
    PRIV(op)->op_state = 0;
    PRIV(op)->pending_ops = 0;
--- a/src/blockstore_impl.h
+++ b/src/blockstore_impl.h
@@ -210,7 +210,7 @@ struct blockstore_op_private_t
    std::vector<copy_buffer_t> read_vec;

    // Sync, write
-    int min_flushed_journal_sector, max_flushed_journal_sector;
+    uint64_t min_flushed_journal_sector, max_flushed_journal_sector;

    // Write
    struct iovec iov_zerofill[3];
@@ -220,7 +220,6 @@ struct blockstore_op_private_t

    // Sync
    std::vector<obj_ver_id> sync_big_writes, sync_small_writes;
-    int sync_small_checked, sync_big_checked;
 };

 typedef uint32_t pool_id_t;
@@ -263,6 +262,8 @@ class blockstore_impl_t
    int throttle_target_parallelism = 1;
    // Minimum difference in microseconds between target and real execution times to throttle the response
    int throttle_threshold_us = 50;
+    // Maximum writes between automatically added fsync operations
+    uint64_t autosync_writes = 128;
    /******* END OF OPTIONS *******/

    struct ring_consumer_t ring_consumer;
@@ -273,7 +274,8 @@ class blockstore_impl_t
    blockstore_dirty_db_t dirty_db;
    std::vector<blockstore_op_t*> submit_queue;
    std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
-    int unsynced_big_write_count = 0;
+    int unsynced_big_write_count = 0, unstable_unsynced = 0;
+    int unsynced_queued_ops = 0;
    allocator *data_alloc = NULL;
    uint8_t *zero_object;

--- a/src/blockstore_init.cpp
+++ b/src/blockstore_init.cpp
@@ -65,7 +65,7 @@ int blockstore_init_meta::loop()
    GET_SQE();
    data->iov = { metadata_buffer, bs->dsk.meta_block_size };
    data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
-    my_uring_prep_readv(sqe, bs->dsk.read_meta_fd, &data->iov, 1, bs->dsk.meta_offset);
+    my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset);
    bs->ringloop->submit();
    submitted++;
 resume_1:
@@ -202,7 +202,7 @@ resume_2:
                data->iov = { bufs[i].buf, bufs[i].size };
                data->callback = [this, i](ring_data_t *data) { handle_event(data, i); };
                if (!zero_on_init)
-                    my_uring_prep_readv(sqe, bs->dsk.read_meta_fd, &data->iov, 1, bs->dsk.meta_offset + bufs[i].offset);
+                    my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bufs[i].offset);
                else
                {
                    // Fill metadata with zeroes
@@ -259,7 +259,7 @@ resume_2:
            GET_SQE();
            data->iov = { metadata_buffer, bs->dsk.meta_block_size };
            data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
-            my_uring_prep_readv(sqe, bs->dsk.read_meta_fd, &data->iov, 1, bs->dsk.meta_offset + (1+next_offset)*bs->dsk.meta_block_size);
+            my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + (1+next_offset)*bs->dsk.meta_block_size);
            submitted++;
 resume_5:
            if (submitted > 0)
@@ -467,7 +467,7 @@ int blockstore_init_journal::loop()
    data = ((ring_data_t*)sqe->user_data);
    data->iov = { submitted_buf, bs->journal.block_size };
    data->callback = simple_callback;
-    my_uring_prep_readv(sqe, bs->dsk.read_journal_fd, &data->iov, 1, bs->journal.offset);
+    my_uring_prep_readv(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset);
    bs->ringloop->submit();
    wait_count = 1;
 resume_1:
@@ -553,7 +553,7 @@ resume_1:
        }
        if (je_start->size == JE_START_V0_SIZE ||
            (je_start->version != JOURNAL_VERSION_V1 || je_start->size != JE_START_V1_SIZE) &&
-            (je_start->version != JOURNAL_VERSION_V2 || je_start->size != JE_START_V2_SIZE))
+            (je_start->version != JOURNAL_VERSION_V2 || je_start->size != JE_START_V2_SIZE && je_start->size != JE_START_V1_SIZE))
        {
            fprintf(
                stderr, "The code only supports journal versions 2 and 1, but it is %lu on disk."
@@ -562,7 +562,8 @@ resume_1:
            );
            exit(1);
        }
-        if (je_start->version == JOURNAL_VERSION_V1)
+        if (je_start->version == JOURNAL_VERSION_V1 ||
+            je_start->version == JOURNAL_VERSION_V2 && je_start->size == JE_START_V1_SIZE)
        {
            je_start->data_csum_type = 0;
            je_start->csum_block_size = 0;
@@ -607,7 +608,7 @@ resume_1:
                    end - journal_pos < JOURNAL_BUFFER_SIZE ? end - journal_pos : JOURNAL_BUFFER_SIZE,
                };
                data->callback = [this](ring_data_t *data1) { handle_event(data1); };
-                my_uring_prep_readv(sqe, bs->dsk.read_journal_fd, &data->iov, 1, bs->journal.offset + journal_pos);
+                my_uring_prep_readv(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset + journal_pos);
                bs->ringloop->submit();
            }
            while (done.size() > 0)
--- a/src/blockstore_journal.cpp
+++ b/src/blockstore_journal.cpp
@@ -145,6 +145,7 @@ journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type,
        journal.sector_info[journal.cur_sector].offset = journal.next_free;
        journal.in_sector_pos = 0;
        journal.next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
+        assert(journal.next_free != journal.used_start);
        memset(journal.inmemory
            ? (uint8_t*)journal.buffer + journal.sector_info[journal.cur_sector].offset
            : (uint8_t*)journal.sector_buf + journal.block_size*journal.cur_sector, 0, journal.block_size);
@@ -198,6 +199,7 @@ void blockstore_impl_t::prepare_journal_sector_write(int cur_sector, blockstore_
    priv->pending_ops++;
    if (!priv->min_flushed_journal_sector)
        priv->min_flushed_journal_sector = 1+cur_sector;
+    assert(priv->min_flushed_journal_sector <= journal.sector_count);
    priv->max_flushed_journal_sector = 1+cur_sector;
 }

--- a/src/blockstore_journal.h
+++ b/src/blockstore_journal.h
@@ -13,12 +13,6 @@
 #define JOURNAL_BUFFER_SIZE 4*1024*1024
 #define JOURNAL_ENTRY_HEADER_SIZE 16

-// We reserve some extra space for future stabilize requests during writes
-// FIXME: This value should be dynamic i.e. Blockstore ideally shouldn't allow
-// writing more than can be stabilized afterwards
-#define JOURNAL_STABILIZE_RESERVATION 65536
-#define JOURNAL_INSTANT_RESERVATION 131072
-
 // Journal entries
 // Journal entries are linked to each other by their crc32 value
 // The journal is almost a blockchain, because object versions constantly increase
--- a/src/blockstore_open.cpp
+++ b/src/blockstore_open.cpp
@@ -19,6 +19,10 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
    throttle_target_mbs = strtoull(config["throttle_target_mbs"].c_str(), NULL, 10);
    throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10);
    throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10);
+    if (config.find("autosync_writes") != config.end())
+    {
+        autosync_writes = strtoull(config["autosync_writes"].c_str(), NULL, 10);
+    }
    if (!max_flusher_count)
    {
        max_flusher_count = 256;
@@ -85,11 +89,13 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
        immediate_commit = IMMEDIATE_SMALL;
    }
    metadata_buf_size = strtoull(config["meta_buf_size"].c_str(), NULL, 10);
-    inmemory_meta = config["inmemory_metadata"] != "false";
+    inmemory_meta = config["inmemory_metadata"] != "false" && config["inmemory_metadata"] != "0" &&
+        config["inmemory_metadata"] != "no";
    journal.sector_count = strtoull(config["journal_sector_buffer_count"].c_str(), NULL, 10);
    journal.no_same_sector_overwrites = config["journal_no_same_sector_overwrites"] == "true" ||
        config["journal_no_same_sector_overwrites"] == "1" || config["journal_no_same_sector_overwrites"] == "yes";
-    journal.inmemory = config["inmemory_journal"] != "false";
+    journal.inmemory = config["inmemory_journal"] != "false" && config["inmemory_journal"] != "0" &&
+        config["inmemory_journal"] != "no";
    // Validate
    if (journal.sector_count < 2)
    {
--- a/src/blockstore_read.cpp
+++ b/src/blockstore_read.cpp
@@ -29,7 +29,7 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_
    PRIV(op)->pending_ops++;
    my_uring_prep_readv(
        sqe,
-        IS_JOURNAL(item_state) ? dsk.read_journal_fd : dsk.read_data_fd,
+        IS_JOURNAL(item_state) ? dsk.journal_fd : dsk.data_fd,
        &data->iov, 1,
        (IS_JOURNAL(item_state) ? dsk.journal_offset : dsk.data_offset) + offset
    );
@@ -348,7 +348,7 @@ bool blockstore_impl_t::read_checksum_block(blockstore_op_t *op, int rv_pos, uin
        .csum_buf = vi->csum_buf,
        .dyn_data = vi->dyn_data,
    };
-    int submit_fd = (vi->copy_flags & COPY_BUF_JOURNAL ? dsk.read_journal_fd : dsk.read_data_fd);
+    int submit_fd = (vi->copy_flags & COPY_BUF_JOURNAL ? dsk.journal_fd : dsk.data_fd);
    uint64_t submit_offset = (vi->copy_flags & COPY_BUF_JOURNAL ? journal.offset : dsk.data_offset);
    uint32_t d_pos = 0;
    for (int n_pos = 0; n_pos < n_iov; n_pos += IOV_MAX)
@@ -702,7 +702,7 @@ uint8_t* blockstore_impl_t::read_clean_meta_block(blockstore_op_t *op, uint64_t
    BS_SUBMIT_GET_SQE(sqe, data);
    data->iov = (struct iovec){ buf, dsk.meta_block_size };
    PRIV(op)->pending_ops++;
-    my_uring_prep_readv(sqe, dsk.read_meta_fd, &data->iov, 1, dsk.meta_offset + dsk.meta_block_size + sector);
+    my_uring_prep_readv(sqe, dsk.meta_fd, &data->iov, 1, dsk.meta_offset + dsk.meta_block_size + sector);
    data->callback = [this, op](ring_data_t *data) { handle_read_event(data, op); };
    // return pointer to checksums + bitmap
    return buf + pos + sizeof(clean_disk_entry);
--- a/src/blockstore_sync.cpp
+++ b/src/blockstore_sync.cpp
@@ -27,8 +27,6 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
        unsynced_big_write_count -= unsynced_big_writes.size();
        PRIV(op)->sync_big_writes.swap(unsynced_big_writes);
        PRIV(op)->sync_small_writes.swap(unsynced_small_writes);
-        PRIV(op)->sync_small_checked = 0;
-        PRIV(op)->sync_big_checked = 0;
        unsynced_big_writes.clear();
        unsynced_small_writes.clear();
        if (PRIV(op)->sync_big_writes.size() > 0)
@@ -88,14 +86,15 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
                auto & dirty_entry = dirty_db.at(sbw);
                uint64_t dyn_size = dsk.dirty_dyn_size(dirty_entry.offset, dirty_entry.len);
                if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
-                    left == 0 ? JOURNAL_STABILIZE_RESERVATION : 0))
+                    (unstable_writes.size()+unstable_unsynced)*journal.block_size))
                {
                    return 0;
                }
            }
        }
        else if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(),
-            sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, JOURNAL_STABILIZE_RESERVATION))
+            sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size,
+            (unstable_writes.size()+unstable_unsynced)*journal.block_size))
        {
            return 0;
        }
@@ -186,6 +185,11 @@ void blockstore_impl_t::ack_sync(blockstore_op_t *op)
        {
            mark_stable(dirty_it->first);
        }
+        else
+        {
+            unstable_unsynced--;
+            assert(unstable_unsynced >= 0);
+        }
        dirty_it++;
        while (dirty_it != dirty_db.end() && dirty_it->first.oid == it->oid)
        {
@@ -216,6 +220,11 @@ void blockstore_impl_t::ack_sync(blockstore_op_t *op)
            {
                mark_stable(*it);
            }
+            else
+            {
+                unstable_unsynced--;
+                assert(unstable_unsynced >= 0);
+            }
        }
    }
    op->retval = 0;
--- a/src/blockstore_write.cpp
+++ b/src/blockstore_write.cpp
@@ -21,7 +21,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
        dyn = calloc_or_die(1, dyn_size+sizeof(int));
        *((int*)dyn) = 1;
    }
-    uint8_t *dyn_ptr = (uint8_t*)(alloc_dyn_data ? dyn+sizeof(int) : &dyn);
+    uint8_t *dyn_ptr = (alloc_dyn_data ? (uint8_t*)dyn+sizeof(int) : (uint8_t*)&dyn);
    uint64_t version = 1;
    if (dirty_db.size() > 0)
    {
@@ -127,8 +127,9 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
            return false;
        }
    }
-    if (wait_big && !is_del && !deleted && op->len < dsk.data_block_size &&
-        immediate_commit != IMMEDIATE_ALL)
+    bool imm = (op->len < dsk.data_block_size ? (immediate_commit != IMMEDIATE_NONE) : (immediate_commit == IMMEDIATE_ALL));
+    if (wait_big && !is_del && !deleted && op->len < dsk.data_block_size && !imm ||
+        !imm && unsynced_queued_ops >= autosync_writes)
    {
        // Issue an additional sync so that the previous big write can reach the journal
        blockstore_op_t *sync_op = new blockstore_op_t;
@@ -139,6 +140,8 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
        };
        enqueue_op(sync_op);
    }
+    else if (!imm)
+        unsynced_queued_ops++;
 #ifdef BLOCKSTORE_DEBUG
    if (is_del)
        printf("Delete %lx:%lx v%lu\n", op->oid.inode, op->oid.stripe, op->version);
@@ -286,13 +289,18 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        printf("Restoring %lx:%lx version: v%lu -> v%lu\n", op->oid.inode, op->oid.stripe, op->version, PRIV(op)->real_version);
 #endif
        auto prev_it = dirty_it;
-        prev_it--;
-        if (prev_it->first.oid == op->oid && prev_it->first.version >= PRIV(op)->real_version)
+        if (prev_it != dirty_db.begin())
        {
-            // Original version is still invalid
-            // All subsequent writes to the same object must be canceled too
-            cancel_all_writes(op, dirty_it, -EEXIST);
-            return 2;
+            prev_it--;
+            if (prev_it->first.oid == op->oid && prev_it->first.version >= PRIV(op)->real_version)
+            {
+                // Original version is still invalid
+                // All subsequent writes to the same object must be canceled too
+                printf("Tried to write %lx:%lx v%lu after delete (old version v%lu), but already have v%lu\n",
+                    op->oid.inode, op->oid.stripe, PRIV(op)->real_version, op->version, prev_it->first.version);
+                cancel_all_writes(op, dirty_it, -EEXIST);
+                return 2;
+            }
        }
        op->version = PRIV(op)->real_version;
        PRIV(op)->real_version = 0;
@@ -312,7 +320,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        blockstore_journal_check_t space_check(this);
        if (!space_check.check_available(op, unsynced_big_write_count + 1,
            sizeof(journal_entry_big_write) + dsk.clean_dyn_size,
-            (dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION))
+            (unstable_writes.size()+unstable_unsynced)*journal.block_size))
        {
            return 0;
        }
@@ -378,7 +386,10 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
            sqe, dsk.data_fd, PRIV(op)->iov_zerofill, vcnt, dsk.data_offset + (loc << dsk.block_order) + op->offset - stripe_offset
        );
        PRIV(op)->pending_ops = 1;
-        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
+        if (immediate_commit != IMMEDIATE_ALL && !(dirty_it->second.state & BS_ST_INSTANT))
+        {
+            unstable_unsynced++;
+        }
        if (immediate_commit != IMMEDIATE_ALL)
        {
            // Increase the counter, but don't save into unsynced_writes yet (can't sync until the write is finished)
@@ -401,7 +412,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
                sizeof(journal_entry_big_write) + dsk.clean_dyn_size, 0)
            || !space_check.check_available(op, 1,
                sizeof(journal_entry_small_write) + dyn_size,
-                op->len + ((dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION)))
+                (unstable_writes.size()+unstable_unsynced)*journal.block_size))
        {
            return 0;
        }
@@ -415,16 +426,10 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        write_iodepth++;
        // Got SQEs. Prepare previous journal sector write if required
        auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
-        if (immediate_commit == IMMEDIATE_NONE)
+        if (immediate_commit == IMMEDIATE_NONE &&
+            !journal.entry_fits(sizeof(journal_entry_small_write) + dyn_size))
        {
-            if (!journal.entry_fits(sizeof(journal_entry_small_write) + dyn_size))
-            {
-                prepare_journal_sector_write(journal.cur_sector, op);
-            }
-            else
-            {
-                PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
-            }
+            prepare_journal_sector_write(journal.cur_sector, op);
        }
        // Then pre-fill journal entry
        journal_entry_small_write *je = (journal_entry_small_write*)prefill_single_journal_entry(
@@ -498,6 +503,11 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        if (journal.next_free >= journal.len)
        {
            journal.next_free = dsk.journal_block_size;
+            assert(journal.next_free != journal.used_start);
+        }
+        if (immediate_commit == IMMEDIATE_NONE && !(dirty_it->second.state & BS_ST_INSTANT))
+        {
+            unstable_unsynced++;
        }
        if (!PRIV(op)->pending_ops)
        {
@@ -537,7 +547,7 @@ resume_2:
        uint64_t dyn_size = dsk.dirty_dyn_size(op->offset, op->len);
        blockstore_journal_check_t space_check(this);
        if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
-            ((dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION)))
+            (unstable_writes.size()+unstable_unsynced)*journal.block_size))
        {
            return 0;
        }
@@ -581,14 +591,20 @@ resume_4:
 #endif
        bool is_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE;
        bool imm = is_big ? (immediate_commit == IMMEDIATE_ALL) : (immediate_commit != IMMEDIATE_NONE);
+        bool is_instant = ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT));
        if (imm)
        {
            auto & unstab = unstable_writes[op->oid];
            unstab = unstab < op->version ? op->version : unstab;
        }
+        else if (!is_instant)
+        {
+            unstable_unsynced--;
+            assert(unstable_unsynced >= 0);
+        }
        dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK)
            | (imm ? BS_ST_SYNCED : BS_ST_WRITTEN);
-        if (imm && ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT)))
+        if (imm && is_instant)
        {
            // Deletions and 'instant' operations are treated as immediately stable
            mark_stable(dirty_it->first);
@@ -734,7 +750,7 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
    });
    assert(dirty_it != dirty_db.end());
    blockstore_journal_check_t space_check(this);
-    if (!space_check.check_available(op, 1, sizeof(journal_entry_del), JOURNAL_INSTANT_RESERVATION))
+    if (!space_check.check_available(op, 1, sizeof(journal_entry_del), (unstable_writes.size()+unstable_unsynced)*journal.block_size))
    {
        return 0;
    }
@@ -750,17 +766,11 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
    }
    write_iodepth++;
    // Prepare journal sector write
-    if (immediate_commit == IMMEDIATE_NONE)
+    if (immediate_commit == IMMEDIATE_NONE &&
+        (dsk.journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del) &&
+        journal.sector_info[journal.cur_sector].dirty)
    {
-        if ((dsk.journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del) &&
-            journal.sector_info[journal.cur_sector].dirty)
-        {
-            prepare_journal_sector_write(journal.cur_sector, op);
-        }
-        else
-        {
-            PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
-        }
+        prepare_journal_sector_write(journal.cur_sector, op);
    }
    // Pre-fill journal entry
    journal_entry_del *je = (journal_entry_del*)prefill_single_journal_entry(
--- a/src/cli.cpp
+++ b/src/cli.cpp
@@ -17,7 +17,7 @@
 static const char *exe_name = NULL;

 static const char* help_text =
-    "Vitastor command-line tool\n"
+    "Vitastor command-line tool " VERSION "\n"
    "(c) Vitaliy Filippov, 2019+ (VNPL-1.1)\n"
    "\n"
    "COMMANDS:\n"
@@ -331,7 +331,7 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
    {
        // Create client
        json11::Json cfg_j = cfg;
-        p->ringloop = new ring_loop_t(512);
+        p->ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
        p->epmgr = new epoll_manager_t(p->ringloop);
        p->cli = new cluster_client_t(p->ringloop, p->epmgr->tfd, cfg_j);
        // Smaller timeout by default for more interactiveness
@@ -349,6 +349,7 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
                p->ringloop->wait();
        }
        // Destroy the client
+        p->cli->flush();
        delete p->cli;
        delete p->epmgr;
        delete p->ringloop;
@@ -357,6 +358,8 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
        p->ringloop = NULL;
    }
    // Print result
+    fflush(stderr);
+    fflush(stdout);
    if (p->json_output && !result.data.is_null())
    {
        printf("%s\n", result.data.dump().c_str());
--- a/src/cli_alloc_osd.cpp
+++ b/src/cli_alloc_osd.cpp
@@ -77,8 +77,8 @@ struct alloc_osd_t
                    std::string key = base64_decode(kv["key"].string_value());
                    osd_num_t cur_osd;
                    char null_byte = 0;
-                    sscanf(key.c_str() + parent->cli->st_cli.etcd_prefix.length(), "/osd/stats/%lu%c", &cur_osd, &null_byte);
-                    if (!cur_osd || null_byte != 0)
+                    int scanned = sscanf(key.c_str() + parent->cli->st_cli.etcd_prefix.length(), "/osd/stats/%lu%c", &cur_osd, &null_byte);
+                    if (scanned != 1 || !cur_osd)
                    {
                        fprintf(stderr, "Invalid key in etcd: %s\n", key.c_str());
                        continue;
--- a/src/cli_df.cpp
+++ b/src/cli_df.cpp
@@ -67,8 +67,8 @@ resume_1:
            // pool ID
            pool_id_t pool_id;
            char null_byte = 0;
-            sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(), "/pool/stats/%u%c", &pool_id, &null_byte);
-            if (!pool_id || pool_id >= POOL_ID_MAX || null_byte != 0)
+            int scanned = sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(), "/pool/stats/%u%c", &pool_id, &null_byte);
+            if (scanned != 1 || !pool_id || pool_id >= POOL_ID_MAX)
            {
                fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
                continue;
@@ -82,8 +82,8 @@ resume_1:
            // osd ID
            osd_num_t osd_num;
            char null_byte = 0;
-            sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(), "/osd/stats/%lu%c", &osd_num, &null_byte);
-            if (!osd_num || osd_num >= POOL_ID_MAX || null_byte != 0)
+            int scanned = sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(), "/osd/stats/%lu%c", &osd_num, &null_byte);
+            if (scanned != 1 || !osd_num || osd_num >= POOL_ID_MAX)
            {
                fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
                continue;
@@ -109,7 +109,7 @@ resume_1:
            }
            for (auto pg_per_pair: pg_per_osd)
            {
-                uint64_t pg_free = osd_free[pg_per_pair.first] * pool_cfg.pg_count / pg_per_pair.second;
+                uint64_t pg_free = osd_free[pg_per_pair.first] * pool_cfg.real_pg_count / pg_per_pair.second;
                if (pool_avail > pg_free)
                {
                    pool_avail = pg_free;
@@ -124,8 +124,10 @@ resume_1:
                pool_avail *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
            }
            pool_stats[pool_cfg.id] = json11::Json::object {
+                { "id", (uint64_t)pool_cfg.id },
                { "name", pool_cfg.name },
                { "pg_count", pool_cfg.pg_count },
+                { "real_pg_count", pool_cfg.real_pg_count },
                { "scheme", pool_cfg.scheme == POOL_SCHEME_REPLICATED ? "replicated" : "ec" },
                { "scheme_name", pool_cfg.scheme == POOL_SCHEME_REPLICATED
                    ? std::to_string(pool_cfg.pg_size)+"/"+std::to_string(pool_cfg.pg_minsize)
@@ -176,7 +178,7 @@ resume_1:
            { "title", "SCHEME" },
        });
        cols.push_back(json11::Json::object{
-            { "key", "pg_count" },
+            { "key", "pg_count_fmt" },
            { "title", "PGS" },
        });
        cols.push_back(json11::Json::object{
@@ -205,6 +207,9 @@ resume_1:
            double raw_to = kv.second["raw_to_usable"].number_value();
            if (raw_to < 0.000001 && raw_to > -0.000001)
                raw_to = 1;
+            kv.second["pg_count_fmt"] = kv.second["real_pg_count"] == kv.second["pg_count"]
+                ? kv.second["real_pg_count"].as_string()
+                : kv.second["real_pg_count"].as_string()+"->"+kv.second["pg_count"].as_string();
            kv.second["total_fmt"] = format_size(kv.second["total_raw"].uint64_value() / raw_to);
            kv.second["used_fmt"] = format_size(kv.second["used_raw"].uint64_value() / raw_to);
            kv.second["max_avail_fmt"] = format_size(kv.second["max_available"].uint64_value());
--- a/src/cli_ls.cpp
+++ b/src/cli_ls.cpp
@@ -56,14 +56,15 @@ struct image_lister_t
            {
                continue;
            }
-            auto & pool_cfg = parent->cli->st_cli.pool_config.at(INODE_POOL(ic.second.num));
+            auto pool_it = parent->cli->st_cli.pool_config.find(INODE_POOL(ic.second.num));
+            bool good_pool = pool_it != parent->cli->st_cli.pool_config.end();
            auto item = json11::Json::object {
                { "name", ic.second.name },
                { "size", ic.second.size },
                { "used_size", 0 },
                { "readonly", ic.second.readonly },
                { "pool_id", (uint64_t)INODE_POOL(ic.second.num) },
-                { "pool_name", pool_cfg.name },
+                { "pool_name", good_pool ? pool_it->second.name : "? (ID:"+std::to_string(INODE_POOL(ic.second.num))+")" },
                { "inode_num", INODE_NO_POOL(ic.second.num) },
                { "inode_id", ic.second.num },
            };
@@ -132,8 +133,8 @@ resume_1:
            // pool ID
            pool_id_t pool_id;
            char null_byte = 0;
-            sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(), "/pool/stats/%u%c", &pool_id, &null_byte);
-            if (!pool_id || pool_id >= POOL_ID_MAX || null_byte != 0)
+            int scanned = sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(), "/pool/stats/%u%c", &pool_id, &null_byte);
+            if (scanned != 1 || !pool_id || pool_id >= POOL_ID_MAX)
            {
                fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
                continue;
@@ -148,9 +149,9 @@ resume_1:
            pool_id_t pool_id;
            inode_t only_inode_num;
            char null_byte = 0;
-            sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(),
+            int scanned = sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(),
                "/inode/stats/%u/%lu%c", &pool_id, &only_inode_num, &null_byte);
-            if (!pool_id || pool_id >= POOL_ID_MAX || INODE_POOL(only_inode_num) != 0 || null_byte != 0)
+            if (scanned != 2 || !pool_id || pool_id >= POOL_ID_MAX || INODE_POOL(only_inode_num) != 0)
            {
                fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
                continue;
@@ -173,7 +174,7 @@ resume_1:
                    { "size", 0 },
                    { "readonly", false },
                    { "pool_id", (uint64_t)INODE_POOL(inode_num) },
-                    { "pool_name", pool_it == parent->cli->st_cli.pool_config.end()
+                    { "pool_name", pool_it != parent->cli->st_cli.pool_config.end()
                        ? (pool_it->second.name == "" ? "<Unnamed>" : pool_it->second.name) : "?" },
                    { "inode_num", INODE_NO_POOL(inode_num) },
                    { "inode_id", inode_num },
@@ -247,6 +248,8 @@ resume_1:
        if (state == 1)
            goto resume_1;
        get_list();
+        if (state == 100)
+            return;
        if (show_stats)
        {
 resume_1:
@@ -269,7 +272,7 @@ resume_1:
            { "key", "name" },
            { "title", "NAME" },
        });
-        if (!list_pool_id)
+        if (list_pool_name == "")
        {
            cols.push_back(json11::Json::object{
                { "key", "pool_name" },
@@ -376,16 +379,18 @@ resume_1:

 std::string print_table(json11::Json items, json11::Json header, bool use_esc)
 {
+    int header_sizes[header.array_items().size()];
    std::vector<int> sizes;
    for (int i = 0; i < header.array_items().size(); i++)
    {
-        sizes.push_back(header[i]["title"].string_value().length());
+        header_sizes[i] = utf8_length(header[i]["title"].string_value());
+        sizes.push_back(header_sizes[i]);
    }
    for (auto & item: items.array_items())
    {
        for (int i = 0; i < header.array_items().size(); i++)
        {
-            int l = item[header[i]["key"].string_value()].as_string().length();
+            int l = utf8_length(item[header[i]["key"].string_value()].as_string());
            sizes[i] = sizes[i] < l ? l : sizes[i];
        }
    }
@@ -397,7 +402,7 @@ std::string print_table(json11::Json items, json11::Json header, bool use_esc)
            // Separator
            str += "  ";
        }
-        int pad = sizes[i]-header[i]["title"].string_value().length();
+        int pad = sizes[i]-header_sizes[i];
        if (header[i]["right"].bool_value())
        {
            // Align right
@@ -425,7 +430,7 @@ std::string print_table(json11::Json items, json11::Json header, bool use_esc)
                // Separator
                str += "  ";
            }
-            int pad = sizes[i] - item[header[i]["key"].string_value()].as_string().length();
+            int pad = sizes[i] - utf8_length(item[header[i]["key"].string_value()].as_string());
            if (header[i]["right"].bool_value())
            {
                // Align right
--- a/src/cli_merge.cpp
+++ b/src/cli_merge.cpp
@@ -53,6 +53,7 @@ struct snap_merger_t
    std::map<inode_t, std::vector<uint64_t>> layer_lists;
    std::map<inode_t, uint64_t> layer_block_size;
    std::map<inode_t, uint64_t> layer_list_pos;
+    std::vector<snap_rw_op_t*> continue_rwo, continue_rwo2;
    int in_flight = 0;
    uint64_t last_fsync_offset = 0;
    uint64_t last_written_offset = 0;
@@ -304,6 +305,12 @@ struct snap_merger_t
        oit = merge_offsets.begin();
    resume_5:
        // Now read, overwrite and optionally delete offsets one by one
+        continue_rwo2.swap(continue_rwo);
+        for (auto rwo: continue_rwo2)
+        {
+            next_write(rwo);
+        }
+        continue_rwo2.clear();
        while (in_flight < parent->iodepth*parent->parallel_osds &&
            oit != merge_offsets.end() && !rwo_error.size())
        {
@@ -464,7 +471,8 @@ struct snap_merger_t
                rwo->error_offset = op->offset;
                rwo->error_read = true;
            }
-            next_write(rwo);
+            continue_rwo.push_back(rwo);
+            parent->ringloop->wakeup();
        };
        parent->cli->execute(op);
    }
@@ -544,11 +552,9 @@ struct snap_merger_t
            }
            // Increment CAS version
            rwo->op.version = subop->version;
-            if (use_cas)
-                next_write(rwo);
-            else
-                autofree_op(rwo);
            delete subop;
+            continue_rwo.push_back(rwo);
+            parent->ringloop->wakeup();
        };
        parent->cli->execute(subop);
    }
--- a/src/cli_modify.cpp
+++ b/src/cli_modify.cpp
@@ -13,7 +13,7 @@ struct image_changer_t
    std::string image_name;
    std::string new_name;
    uint64_t new_size = 0;
-    bool force_size = false;
+    bool force_size = false, inc_size = false;
    bool set_readonly = false, set_readwrite = false, force = false;
    // interval between fsyncs
    int fsync_interval = 128;
@@ -81,14 +81,14 @@ struct image_changer_t
        }
        if ((!set_readwrite || !cfg.readonly) &&
            (!set_readonly || cfg.readonly) &&
-            (!new_size && !force_size || cfg.size == new_size) &&
+            (!new_size && !force_size || cfg.size == new_size || cfg.size >= new_size && inc_size) &&
            (new_name == "" || new_name == image_name))
        {
            result = (cli_result_t){ .text = "No change" };
            state = 100;
            return;
        }
-        if (new_size != 0 || force_size)
+        if ((new_size != 0 || force_size) && (cfg.size < new_size || !inc_size))
        {
            if (cfg.size >= new_size)
            {
@@ -233,6 +233,7 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_modify(json11::Json cfg)
    changer->new_name = cfg["rename"].string_value();
    changer->new_size = parse_size(cfg["resize"].as_string());
    changer->force_size = cfg["force_size"].bool_value();
+    changer->inc_size = cfg["inc_size"].bool_value();
    changer->force = cfg["force"].bool_value();
    changer->set_readonly = cfg["readonly"].bool_value();
    changer->set_readwrite = cfg["readwrite"].bool_value();
--- a/src/cli_rm.cpp
+++ b/src/cli_rm.cpp
@@ -384,8 +384,8 @@ resume_100:
                pool_id_t pool_id = 0;
                inode_t inode = 0;
                char null_byte = 0;
-                sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.length()+13, "%u/%lu%c", &pool_id, &inode, &null_byte);
-                if (!inode || null_byte != 0)
+                int scanned = sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.length()+13, "%u/%lu%c", &pool_id, &inode, &null_byte);
+                if (scanned != 2 || !inode)
                {
                    result = (cli_result_t){ .err = EIO, .text = "Bad key returned from etcd: "+kv.key };
                    state = 100;
--- a/src/cli_status.cpp
+++ b/src/cli_status.cpp
@@ -132,8 +132,8 @@ resume_2:
            auto kv = parent->cli->st_cli.parse_etcd_kv(osd_stats[i]);
            osd_num_t stat_osd_num = 0;
            char null_byte = 0;
-            sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.size(), "/osd/stats/%lu%c", &stat_osd_num, &null_byte);
-            if (!stat_osd_num || null_byte != 0)
+            int scanned = sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.size(), "/osd/stats/%lu%c", &stat_osd_num, &null_byte);
+            if (scanned != 1 || !stat_osd_num)
            {
                fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
                continue;
@@ -158,12 +158,7 @@ resume_2:
        for (auto & pool_pair: parent->cli->st_cli.pool_config)
        {
            auto & pool_cfg = pool_pair.second;
-            bool active = true;
-            if (pool_cfg.pg_config.size() != pool_cfg.pg_count)
-            {
-                active = false;
-                pgs_by_state["offline"] += pool_cfg.pg_count-pool_cfg.pg_config.size();
-            }
+            bool active = pool_cfg.real_pg_count > 0;
            pool_count++;
            for (auto pg_it = pool_cfg.pg_config.begin(); pg_it != pool_cfg.pg_config.end(); pg_it++)
            {
--- a/src/cluster_client.cpp
+++ b/src/cluster_client.cpp
@@ -3,21 +3,13 @@

 #include <stdexcept>
 #include <assert.h>
-#include "cluster_client.h"
+#include "cluster_client_impl.h"
+#include "http_client.h" // json_is_true

-#define SCRAP_BUFFER_SIZE 4*1024*1024
-#define PART_SENT 1
-#define PART_DONE 2
-#define PART_ERROR 4
-#define PART_RETRY 8
-#define CACHE_DIRTY 1
-#define CACHE_FLUSHING 2
-#define CACHE_REPEATING 3
-#define OP_FLUSH_BUFFER 0x02
-#define OP_IMMEDIATE_COMMIT 0x04
-
-cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
+cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json config)
 {
+    wb = new writeback_cache_t();
+
    cli_config = config.object_items();
    file_config = osd_messenger_t::read_config(config);
    config = osd_messenger_t::merge_configs(cli_config, file_config, etcd_global_config, {});
@@ -37,20 +29,14 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
            continue_lists();
            continue_raw_ops(peer_osd);
        }
-        else if (dirty_buffers.size())
+        else
        {
            // peer_osd just dropped connection
            // determine WHICH dirty_buffers are now obsolete and repeat them
-            for (auto & wr: dirty_buffers)
+            if (wb->repeat_ops_for(this, peer_osd) > 0)
            {
-                if (affects_osd(wr.first.inode, wr.first.stripe, wr.second.len, peer_osd) &&
-                    wr.second.state != CACHE_REPEATING)
-                {
-                    // FIXME: Flush in larger parts
-                    flush_buffer(wr.first, &wr.second);
-                }
+                continue_ops();
            }
-            continue_ops();
        }
    };
    msgr.exec_op = [this](osd_op_t *op)
@@ -78,16 +64,14 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd

 cluster_client_t::~cluster_client_t()
 {
-    for (auto bp: dirty_buffers)
-    {
-        free(bp.second.buf);
-    }
-    dirty_buffers.clear();
+    msgr.repeer_pgs = [](osd_num_t){}; // swap the repeer hook for a no-op; presumably so the old hook can't touch wb during teardown - TODO confirm
    if (ringloop)
    {
        ringloop->unregister_consumer(&consumer);
    }
    free(scrap_buffer);
+    delete wb; // wb is created with new in the constructor
+    wb = NULL; // leave no dangling pointer behind
 }

 cluster_op_t::~cluster_op_t()
@@ -136,6 +120,19 @@ void cluster_client_t::init_msgr()
    }
 }

+void cluster_client_t::unshift_op(cluster_op_t *op)
+{
+    // Prepend <op> to the doubly-linked operation queue
+    cluster_op_t *old_head = op_queue_head;
+    op->next = old_head;
+    op_queue_head = op;
+    if (old_head)
+        old_head->prev = op;
+    else
+        op_queue_tail = op; // the queue was empty, so <op> is the tail too
+    inc_wait(op->opcode, op->flags, op->next, 1); // apply a +1 increment to the ops behind it (filtered by inc_wait)
+}
+
 void cluster_client_t::calc_wait(cluster_op_t *op)
 {
    op->prev_wait = 0;
@@ -156,7 +153,7 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
    {
        for (auto prev = op->prev; prev; prev = prev->prev)
        {
-            if (prev->opcode == OSD_OP_SYNC || prev->opcode == OSD_OP_WRITE && !(prev->flags & OP_IMMEDIATE_COMMIT))
+            if (prev->opcode == OSD_OP_SYNC || prev->opcode == OSD_OP_WRITE && (!(prev->flags & OP_IMMEDIATE_COMMIT) || enable_writeback))
            {
                op->prev_wait++;
            }
@@ -166,68 +163,58 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
    }
    else /* if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP) */
    {
-        for (auto prev = op_queue_head; prev && prev != op; prev = prev->next)
-        {
-            if (prev->opcode == OSD_OP_WRITE && (prev->flags & OP_FLUSH_BUFFER))
-            {
-                op->prev_wait++;
-            }
-            else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ ||
-                prev->opcode == OSD_OP_READ_BITMAP || prev->opcode == OSD_OP_READ_CHAIN_BITMAP)
-            {
-                // Flushes are always in the beginning (we're scanning from the beginning of the queue)
-                break;
-            }
-        }
-        if (!op->prev_wait)
-            continue_rw(op);
+        continue_rw(op);
    }
 }

 void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *next, int inc)
 {
-    if (opcode == OSD_OP_WRITE)
+    if (opcode != OSD_OP_WRITE && opcode != OSD_OP_SYNC) // other opcodes leave prev_wait of the following ops untouched
    {
-        while (next)
-        {
-            auto n2 = next->next;
-            if (next->opcode == OSD_OP_SYNC && !(flags & OP_IMMEDIATE_COMMIT) ||
-                next->opcode == OSD_OP_WRITE && (flags & OP_FLUSH_BUFFER) && !(next->flags & OP_FLUSH_BUFFER) ||
-                (next->opcode == OSD_OP_READ || next->opcode == OSD_OP_READ_BITMAP ||
-                    next->opcode == OSD_OP_READ_CHAIN_BITMAP) && (flags & OP_FLUSH_BUFFER))
-            {
-                next->prev_wait += inc;
-                assert(next->prev_wait >= 0);
-                if (!next->prev_wait)
-                {
-                    if (next->opcode == OSD_OP_SYNC)
-                        continue_sync(next);
-                    else
-                        continue_rw(next);
-                }
-            }
-            next = n2;
-        }
+        return;
    }
-    else if (opcode == OSD_OP_SYNC)
+    cluster_op_t *bh_ops_local[32], **bh_ops = bh_ops_local; // ops to be resumed after the scan; starts as an on-stack array of 32
+    int bh_op_count = 0, bh_op_max = 32;
+    while (next)
    {
-        while (next)
+        auto n2 = next->next; // saved first; the iteration advances to it at the end
+        if (opcode == OSD_OP_WRITE
+            ? (next->opcode == OSD_OP_SYNC && (!(flags & OP_IMMEDIATE_COMMIT) || enable_writeback) ||
+                next->opcode == OSD_OP_WRITE && (flags & OP_FLUSH_BUFFER) && !(next->flags & OP_FLUSH_BUFFER))
+            : (next->opcode == OSD_OP_SYNC || next->opcode == OSD_OP_WRITE))
        {
-            auto n2 = next->next;
-            if (next->opcode == OSD_OP_SYNC || next->opcode == OSD_OP_WRITE)
+            next->prev_wait += inc;
+            assert(next->prev_wait >= 0);
+            if (!next->prev_wait)
            {
-                next->prev_wait += inc;
-                assert(next->prev_wait >= 0);
-                if (!next->prev_wait)
+                // Kind of std::vector with local "small vector optimisation"
+                if (bh_op_count >= bh_op_max)
                {
-                    if (next->opcode == OSD_OP_SYNC)
-                        continue_sync(next);
-                    else
-                        continue_rw(next);
+                    bh_op_max *= 2; // out of room: double the capacity and switch to a heap array
+                    cluster_op_t **n = (cluster_op_t**)malloc_or_die(sizeof(cluster_op_t*) * bh_op_max);
+                    memcpy(n, bh_ops, sizeof(cluster_op_t*) * bh_op_count);
+                    if (bh_ops != bh_ops_local) // only a heap-allocated array is freed, never the on-stack one
+                    {
+                        free(bh_ops);
+                    }
+                    bh_ops = n;
                }
+                bh_ops[bh_op_count++] = next;
            }
-            next = n2;
        }
+        next = n2;
+    }
+    for (int i = 0; i < bh_op_count; i++) // resume the collected ops only now, after the whole queue has been scanned
+    {
+        cluster_op_t *next = bh_ops[i];
+        if (next->opcode == OSD_OP_SYNC)
+            continue_sync(next);
+        else
+            continue_rw(next);
+    }
+    if (bh_ops != bh_ops_local) // release the heap array if the list outgrew the on-stack one
+    {
+        free(bh_ops);
    }
 }

@@ -245,13 +232,37 @@ void cluster_client_t::erase_op(cluster_op_t *op)
        op_queue_tail = op->prev;
    op->next = op->prev = NULL;
    if (flags & OP_FLUSH_BUFFER)
+    {
+        // Completed flushes change writeback buffer states,
+        // so the callback should be run before inc_wait()
+        // which may continue following SYNCs, but these SYNCs
+        // should know about the changed buffer state
+        // This is ugly but this is the way we do it
        std::function<void(cluster_op_t*)>(op->callback)(op);
-    if (!(flags & OP_IMMEDIATE_COMMIT))
+    }
+    if (!(flags & OP_IMMEDIATE_COMMIT) || enable_writeback)
+    {
        inc_wait(opcode, flags, next, -1);
-    // Call callback at the end to avoid inconsistencies in prev_wait
-    // if the callback adds more operations itself
+    }
    if (!(flags & OP_FLUSH_BUFFER))
+    {
+        // Call callback at the end to avoid inconsistencies in prev_wait
+        // if the callback adds more operations itself
        std::function<void(cluster_op_t*)>(op->callback)(op);
+    }
+    if (flags & OP_FLUSH_BUFFER)
+    {
+        int i = 0;
+        while (i < wb->writeback_overflow.size() && wb->writebacks_active < client_max_writeback_iodepth)
+        {
+            execute_internal(wb->writeback_overflow[i]);
+            i++;
+        }
+        if (i > 0)
+        {
+            wb->writeback_overflow.erase(wb->writeback_overflow.begin(), wb->writeback_overflow.begin()+i);
+        }
+    }
 }

 void cluster_client_t::continue_ops(bool up_retry)
@@ -295,6 +306,7 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_co
 {
    this->etcd_global_config = etcd_global_config;
    config = osd_messenger_t::merge_configs(cli_config, file_config, etcd_global_config, {});
+    // client_max_dirty_bytes/client_dirty_limit
    if (config.find("client_max_dirty_bytes") != config.end())
    {
        client_max_dirty_bytes = config["client_max_dirty_bytes"].uint64_value();
@@ -310,11 +322,34 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_co
    {
        client_max_dirty_bytes = DEFAULT_CLIENT_MAX_DIRTY_BYTES;
    }
+    // client_max_dirty_ops
    client_max_dirty_ops = config["client_max_dirty_ops"].uint64_value();
    if (!client_max_dirty_ops)
    {
        client_max_dirty_ops = DEFAULT_CLIENT_MAX_DIRTY_OPS;
    }
+    // client_enable_writeback
+    enable_writeback = json_is_true(config["client_enable_writeback"]) &&
+        json_is_true(config["client_writeback_allowed"]);
+    // client_max_buffered_bytes
+    client_max_buffered_bytes = config["client_max_buffered_bytes"].uint64_value();
+    if (!client_max_buffered_bytes)
+    {
+        client_max_buffered_bytes = DEFAULT_CLIENT_MAX_BUFFERED_BYTES;
+    }
+    // client_max_buffered_ops
+    client_max_buffered_ops = config["client_max_buffered_ops"].uint64_value();
+    if (!client_max_buffered_ops)
+    {
+        client_max_buffered_ops = DEFAULT_CLIENT_MAX_BUFFERED_OPS;
+    }
+    // client_max_writeback_iodepth
+    client_max_writeback_iodepth = config["client_max_writeback_iodepth"].uint64_value();
+    if (!client_max_writeback_iodepth)
+    {
+        client_max_writeback_iodepth = DEFAULT_CLIENT_MAX_WRITEBACK_IODEPTH;
+    }
+    // up_wait_retry_interval
    up_wait_retry_interval = config["up_wait_retry_interval"].uint64_value();
    if (!up_wait_retry_interval)
    {
@@ -374,6 +409,8 @@ void cluster_client_t::on_change_hook(std::map<std::string, etcd_kv_t> & changes

 bool cluster_client_t::get_immediate_commit(uint64_t inode)
 {
+    if (enable_writeback)
+        return false;
    pool_id_t pool_id = INODE_POOL(inode);
    if (!pool_id)
        return true;
@@ -408,6 +445,41 @@ void cluster_client_t::on_ready(std::function<void(void)> fn)
    }
 }

+bool cluster_client_t::flush()
+{
+    if (ringloop)
+    {
+        // Blocking mode: queue a SYNC and run the event loop until its callback fires
+        bool finished = false;
+        cluster_op_t *sync_op = new cluster_op_t;
+        sync_op->opcode = OSD_OP_SYNC;
+        sync_op->callback = [&finished](cluster_op_t *done_op)
+        {
+            delete done_op;
+            finished = true;
+        };
+        execute(sync_op);
+        while (!finished)
+        {
+            ringloop->loop();
+            if (finished)
+                break;
+            ringloop->wait();
+        }
+        return true;
+    }
+    // No ringloop: nothing to wait on here, so only start the work and report the queue state
+    if (wb->writeback_queue.size())
+    {
+        wb->start_writebacks(this, 0);
+        cluster_op_t *sync_op = new cluster_op_t;
+        sync_op->opcode = OSD_OP_SYNC;
+        sync_op->callback = [](cluster_op_t *done_op) { delete done_op; };
+        execute(sync_op);
+    }
+    return !op_queue_head; // true only when no operations are left in the queue
+}
+
 /**
 * How writes are synced when immediate_commit is false
 *
@@ -428,6 +500,9 @@ void cluster_client_t::on_ready(std::function<void(void)> fn)
 * 3) if yes, send all SYNCs. otherwise, leave current SYNC as is.
 * 4) if any of them fail due to disconnected peers, repeat SYNC after repeating all writes
 * 5) if any of them fail due to other errors, fail the SYNC operation
+ *
+ * If writeback caching is turned on and writeback limit is not exhausted:
+ * data is just copied and the write is confirmed to the client.
 */
 void cluster_client_t::execute(cluster_op_t *op)
 {
@@ -443,67 +518,73 @@ void cluster_client_t::execute(cluster_op_t *op)
        offline_ops.push_back(op);
        return;
    }
+    op->flags = op->flags & OSD_OP_IGNORE_READONLY; // the only allowed flag
+    execute_internal(op);
+}
+
+void cluster_client_t::execute_internal(cluster_op_t *op)
+{
    op->cur_inode = op->inode;
    op->retval = 0;
-    op->flags = op->flags & OSD_OP_IGNORE_READONLY; // single allowed flag
-    if (op->opcode != OSD_OP_SYNC)
+    // check alignment, readonly flag and so on
+    if (!check_rw(op))
    {
-        pool_id_t pool_id = INODE_POOL(op->cur_inode);
-        if (!pool_id)
+        return;
+    }
+    if (op->opcode == OSD_OP_WRITE && enable_writeback && !(op->flags & OP_FLUSH_BUFFER) &&
+        !op->version /* no CAS writeback */)
+    {
+        if (wb->writebacks_active >= client_max_writeback_iodepth)
        {
-            op->retval = -EINVAL;
-            std::function<void(cluster_op_t*)>(op->callback)(op);
+            // Writeback queue is full, postpone the operation
+            wb->writeback_overflow.push_back(op);
            return;
        }
-        auto pool_it = st_cli.pool_config.find(pool_id);
-        if (pool_it == st_cli.pool_config.end() || pool_it->second.real_pg_count == 0)
+        // Just copy and acknowledge the operation
+        wb->copy_write(op, CACHE_DIRTY);
+        while (wb->writeback_bytes + op->len > client_max_buffered_bytes || wb->writeback_queue_size > client_max_buffered_ops)
        {
-            // Pools are loaded, but this one is unknown
-            op->retval = -EINVAL;
-            std::function<void(cluster_op_t*)>(op->callback)(op);
-            return;
-        }
-        // Check alignment
-        if (!op->len && (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP || op->opcode == OSD_OP_WRITE) ||
-            op->offset % pool_it->second.bitmap_granularity || op->len % pool_it->second.bitmap_granularity)
-        {
-            op->retval = -EINVAL;
-            std::function<void(cluster_op_t*)>(op->callback)(op);
-            return;
-        }
-        if (pool_it->second.immediate_commit == IMMEDIATE_ALL)
-        {
-            op->flags |= OP_IMMEDIATE_COMMIT;
+            // Initiate some writeback (asynchronously)
+            wb->start_writebacks(this, 1);
        }
+        op->retval = op->len;
+        std::function<void(cluster_op_t*)>(op->callback)(op);
+        return;
    }
    if (op->opcode == OSD_OP_WRITE && !(op->flags & OP_IMMEDIATE_COMMIT))
    {
+        if (!(op->flags & OP_FLUSH_BUFFER) && !op->version /* no CAS write-repeat */)
+        {
+            wb->copy_write(op, CACHE_WRITTEN);
+        }
        if (dirty_bytes >= client_max_dirty_bytes || dirty_ops >= client_max_dirty_ops)
        {
            // Push an extra SYNC operation to flush previous writes
            cluster_op_t *sync_op = new cluster_op_t;
            sync_op->opcode = OSD_OP_SYNC;
+            sync_op->flags = OP_FLUSH_BUFFER;
            sync_op->callback = [](cluster_op_t* sync_op)
            {
                delete sync_op;
            };
-            sync_op->prev = op_queue_tail;
-            if (op_queue_tail)
-            {
-                op_queue_tail->next = sync_op;
-                op_queue_tail = sync_op;
-            }
-            else
-                op_queue_tail = op_queue_head = sync_op;
-            dirty_bytes = 0;
-            dirty_ops = 0;
-            calc_wait(sync_op);
+            execute_internal(sync_op);
        }
        dirty_bytes += op->len;
        dirty_ops++;
    }
    else if (op->opcode == OSD_OP_SYNC)
    {
+        // Flush the whole write-back queue first
+        if (!(op->flags & OP_FLUSH_BUFFER) && wb->writeback_overflow.size() > 0)
+        {
+            // Earlier operations are still postponed in writeback_overflow, so queue this SYNC behind them
+            wb->writeback_overflow.push_back(op);
+            return;
+        }
+        if (wb->writeback_queue.size())
+        {
+            wb->start_writebacks(this, 0);
+        }
        dirty_bytes = 0;
        dirty_ops = 0;
    }
@@ -515,7 +596,7 @@ void cluster_client_t::execute(cluster_op_t *op)
    }
    else
        op_queue_tail = op_queue_head = op;
-    if (!(op->flags & OP_IMMEDIATE_COMMIT))
+    if (!(op->flags & OP_IMMEDIATE_COMMIT) || enable_writeback)
        calc_wait(op);
    else
    {
@@ -526,6 +607,52 @@ void cluster_client_t::execute(cluster_op_t *op)
    }
 }

+bool cluster_client_t::check_rw(cluster_op_t *op) // Validates <op>; on failure sets op->retval, runs op->callback and returns false
+{
+    if (op->opcode == OSD_OP_SYNC)
+    {
+        return true; // none of the checks below are applied to SYNC
+    }
+    pool_id_t pool_id = INODE_POOL(op->cur_inode);
+    if (!pool_id)
+    {
+        op->retval = -EINVAL;
+        std::function<void(cluster_op_t*)>(op->callback)(op); // the callback is invoked through a temporary copy
+        return false;
+    }
+    auto pool_it = st_cli.pool_config.find(pool_id);
+    if (pool_it == st_cli.pool_config.end() || pool_it->second.real_pg_count == 0)
+    {
+        // Pools are loaded, but this one is unknown
+        op->retval = -EINVAL;
+        std::function<void(cluster_op_t*)>(op->callback)(op);
+        return false;
+    }
+    // Check alignment: reads and writes must not be empty, offset and length must be multiples of bitmap_granularity
+    if (!op->len && (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP || op->opcode == OSD_OP_WRITE) ||
+        op->offset % pool_it->second.bitmap_granularity || op->len % pool_it->second.bitmap_granularity)
+    {
+        op->retval = -EINVAL;
+        std::function<void(cluster_op_t*)>(op->callback)(op);
+        return false;
+    }
+    if (pool_it->second.immediate_commit == IMMEDIATE_ALL)
+    {
+        op->flags |= OP_IMMEDIATE_COMMIT; // side effect of a passing check: the op is marked according to the pool setting
+    }
+    if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE) && !(op->flags & OSD_OP_IGNORE_READONLY))
+    {
+        auto ino_it = st_cli.inode_config.find(op->inode); // looked up by op->inode, not op->cur_inode
+        if (ino_it != st_cli.inode_config.end() && ino_it->second.readonly)
+        {
+            op->retval = -EROFS; // modifying a read-only inode is rejected with EROFS
+            std::function<void(cluster_op_t*)>(op->callback)(op);
+            return false;
+        }
+    }
+    return true;
+}
+
 void cluster_client_t::execute_raw(osd_num_t osd_num, osd_op_t *op)
 {
    auto fd_it = msgr.osd_peer_fds.find(osd_num);
@@ -543,114 +670,6 @@ void cluster_client_t::execute_raw(osd_num_t osd_num, osd_op_t *op)
    }
 }

-void cluster_client_t::copy_write(cluster_op_t *op, std::map<object_id, cluster_buffer_t> & dirty_buffers)
-{
-    // Save operation for replay when one of PGs goes out of sync
-    // (primary OSD drops our connection in this case)
-    auto dirty_it = dirty_buffers.lower_bound((object_id){
-        .inode = op->inode,
-        .stripe = op->offset,
-    });
-    while (dirty_it != dirty_buffers.begin())
-    {
-        dirty_it--;
-        if (dirty_it->first.inode != op->inode ||
-            (dirty_it->first.stripe + dirty_it->second.len) <= op->offset)
-        {
-            dirty_it++;
-            break;
-        }
-    }
-    uint64_t pos = op->offset, len = op->len, iov_idx = 0, iov_pos = 0;
-    while (len > 0)
-    {
-        uint64_t new_len = 0;
-        if (dirty_it == dirty_buffers.end())
-        {
-            new_len = len;
-        }
-        else if (dirty_it->first.inode != op->inode || dirty_it->first.stripe > pos)
-        {
-            new_len = dirty_it->first.stripe - pos;
-            if (new_len > len)
-            {
-                new_len = len;
-            }
-        }
-        if (new_len > 0)
-        {
-            dirty_it = dirty_buffers.emplace_hint(dirty_it, (object_id){
-                .inode = op->inode,
-                .stripe = pos,
-            }, (cluster_buffer_t){
-                .buf = malloc_or_die(new_len),
-                .len = new_len,
-            });
-        }
-        // FIXME: Split big buffers into smaller ones on overwrites. But this will require refcounting
-        dirty_it->second.state = CACHE_DIRTY;
-        uint64_t cur_len = (dirty_it->first.stripe + dirty_it->second.len - pos);
-        if (cur_len > len)
-        {
-            cur_len = len;
-        }
-        while (cur_len > 0 && iov_idx < op->iov.count)
-        {
-            unsigned iov_len = (op->iov.buf[iov_idx].iov_len - iov_pos);
-            if (iov_len <= cur_len)
-            {
-                memcpy((uint8_t*)dirty_it->second.buf + pos - dirty_it->first.stripe,
-                    (uint8_t*)op->iov.buf[iov_idx].iov_base + iov_pos, iov_len);
-                pos += iov_len;
-                len -= iov_len;
-                cur_len -= iov_len;
-                iov_pos = 0;
-                iov_idx++;
-            }
-            else
-            {
-                memcpy((uint8_t*)dirty_it->second.buf + pos - dirty_it->first.stripe,
-                    (uint8_t*)op->iov.buf[iov_idx].iov_base + iov_pos, cur_len);
-                pos += cur_len;
-                len -= cur_len;
-                iov_pos += cur_len;
-                cur_len = 0;
-            }
-        }
-        dirty_it++;
-    }
-}
-
-void cluster_client_t::flush_buffer(const object_id & oid, cluster_buffer_t *wr)
-{
-    wr->state = CACHE_REPEATING;
-    cluster_op_t *op = new cluster_op_t;
-    op->flags = OSD_OP_IGNORE_READONLY|OP_FLUSH_BUFFER;
-    op->opcode = OSD_OP_WRITE;
-    op->cur_inode = op->inode = oid.inode;
-    op->offset = oid.stripe;
-    op->len = wr->len;
-    op->iov.push_back(wr->buf, wr->len);
-    op->callback = [wr](cluster_op_t* op)
-    {
-        if (wr->state == CACHE_REPEATING)
-        {
-            wr->state = CACHE_DIRTY;
-        }
-        delete op;
-    };
-    op->next = op_queue_head;
-    if (op_queue_head)
-    {
-        op_queue_head->prev = op;
-        op_queue_head = op;
-    }
-    else
-        op_queue_tail = op_queue_head = op;
-    inc_wait(op->opcode, op->flags, op->next, 1);
-    continue_rw(op);
-}
-
 int cluster_client_t::continue_rw(cluster_op_t *op)
 {
    if (op->state == 0)
@@ -659,27 +678,7 @@ int cluster_client_t::continue_rw(cluster_op_t *op)
        goto resume_1;
    else if (op->state == 2)
        goto resume_2;
-    else if (op->state == 3)
-        goto resume_3;
 resume_0:
-    if (op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE)
-    {
-        if (!(op->flags & OSD_OP_IGNORE_READONLY))
-        {
-            auto ino_it = st_cli.inode_config.find(op->inode);
-            if (ino_it != st_cli.inode_config.end() && ino_it->second.readonly)
-            {
-                op->retval = -EINVAL;
-                erase_op(op);
-                return 1;
-            }
-        }
-        if (op->opcode == OSD_OP_WRITE && !(op->flags & OP_IMMEDIATE_COMMIT) && !(op->flags & OP_FLUSH_BUFFER))
-        {
-            copy_write(op, dirty_buffers);
-        }
-    }
-resume_1:
    // Slice the operation into parts
    slice_rw(op);
    op->needs_reslice = false;
@@ -690,9 +689,9 @@ resume_1:
        erase_op(op);
        return 1;
    }
-resume_2:
+resume_1:
    // Send unsent parts, if they're not subject to change
-    op->state = 3;
+    op->state = 2;
    if (op->needs_reslice)
    {
        for (int i = 0; i < op->parts.size(); i++)
@@ -702,7 +701,7 @@ resume_2:
                op->retval = -EPIPE;
            }
        }
-        goto resume_3;
+        goto resume_2;
    }
    for (int i = 0; i < op->parts.size(); i++)
    {
@@ -723,18 +722,18 @@ resume_2:
                        });
                    }
                }
-                op->state = 2;
+                op->state = 1;
            }
        }
    }
-    if (op->state == 2)
+    if (op->state == 1)
    {
        return 0;
    }
-resume_3:
+resume_2:
    if (op->inflight_count > 0)
    {
-        op->state = 3;
+        op->state = 2;
        return 0;
    }
    if (op->done_count >= op->parts.size())
@@ -762,7 +761,7 @@ resume_3:
                op->cur_inode = ino_it->second.parent_id;
                op->parts.clear();
                op->done_count = 0;
-                goto resume_1;
+                goto resume_0;
            }
        }
        op->retval = op->len;
@@ -774,7 +773,8 @@ resume_3:
        erase_op(op);
        return 1;
    }
-    else if (op->retval != 0 && op->retval != -EPIPE && op->retval != -EIO && op->retval != -ENOSPC)
+    else if (op->retval != 0 && !(op->flags & OP_FLUSH_BUFFER) &&
+        op->retval != -EPIPE && op->retval != -EIO && op->retval != -ENOSPC)
    {
        // Fatal error (neither -EPIPE, -EIO nor -ENOSPC)
        // FIXME: Add a parameter to allow to not wait for EIOs (incomplete or corrupted objects) to heal
@@ -789,7 +789,7 @@ resume_3:
        {
            op->parts.clear();
            op->done_count = 0;
-            goto resume_1;
+            goto resume_0;
        }
        else
        {
@@ -800,7 +800,7 @@ resume_3:
                    op->parts[i].flags = PART_RETRY;
                }
            }
-            goto resume_2;
+            goto resume_1;
        }
    }
    return 0;
@@ -874,6 +874,11 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
    int iov_idx = 0;
    size_t iov_pos = 0;
    int i = 0;
+    // We also have to return reads from CACHE_REPEATING buffers - they are not
+    // guaranteed to be present on target OSDs at the moment of repeating.
+    // We're also free to return data from other cached buffers, simply
+    // because it's faster.
+    bool dirty_copied = wb->read_from_cache(op, pool_cfg.bitmap_granularity);
    for (uint64_t stripe = first_stripe; stripe <= last_stripe; stripe += pg_block_size)
    {
        pg_num_t pg_num = (stripe/pool_cfg.pg_stripe_size) % pool_cfg.real_pg_count + 1; // like map_to_pg()
@@ -881,7 +886,8 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
        uint64_t end = (op->offset + op->len) > (stripe + pg_block_size)
            ? (stripe + pg_block_size) : (op->offset + op->len);
        op->parts[i].iov.reset();
-        if (op->cur_inode != op->inode)
+        op->parts[i].flags = 0;
+        if (op->cur_inode != op->inode || op->opcode == OSD_OP_READ && dirty_copied)
        {
            // Read remaining parts from upper layers
            uint64_t prev = begin, cur = begin;
@@ -918,7 +924,10 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
            else
                add_iov(cur-prev, skip_prev, op, iov_idx, iov_pos, op->parts[i].iov, scrap_buffer, scrap_buffer_size);
            if (end == begin)
+            {
                op->done_count++;
+                op->parts[i].flags = PART_DONE;
+            }
        }
        else if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_DELETE)
        {
@@ -930,7 +939,6 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
            op->opcode == OSD_OP_DELETE ? 0 : (uint32_t)(end - begin);
        op->parts[i].pg_num = pg_num;
        op->parts[i].osd_num = 0;
-        op->parts[i].flags = 0;
        i++;
    }
 }
@@ -1042,13 +1050,7 @@ int cluster_client_t::continue_sync(cluster_op_t *op)
            do_it++;
    }
    // Post sync to affected OSDs
-    for (auto & prev_op: dirty_buffers)
-    {
-        if (prev_op.second.state == CACHE_DIRTY)
-        {
-            prev_op.second.state = CACHE_FLUSHING;
-        }
-    }
+    wb->fsync_start();
    op->parts.resize(dirty_osds.size());
    op->retval = 0;
    {
@@ -1073,13 +1075,7 @@ resume_1:
    }
    if (op->retval != 0)
    {
-        for (auto uw_it = dirty_buffers.begin(); uw_it != dirty_buffers.end(); uw_it++)
-        {
-            if (uw_it->second.state == CACHE_FLUSHING)
-            {
-                uw_it->second.state = CACHE_DIRTY;
-            }
-        }
+        wb->fsync_error();
        if (op->retval == -EPIPE || op->retval == -EIO || op->retval == -ENOSPC)
        {
            // Retry later
@@ -1093,16 +1089,7 @@ resume_1:
    }
    else
    {
-        for (auto uw_it = dirty_buffers.begin(); uw_it != dirty_buffers.end(); )
-        {
-            if (uw_it->second.state == CACHE_FLUSHING)
-            {
-                free(uw_it->second.buf);
-                dirty_buffers.erase(uw_it++);
-            }
-            else
-                uw_it++;
-        }
+        wb->fsync_ok();
    }
    erase_op(op);
    return 1;
@@ -1165,7 +1152,7 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
                osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
            );
        }
-        else
+        else if (log_level > 0)
        {
            fprintf(
                stderr, "%s operation failed on OSD %lu: retval=%ld (expected %d)\n",
--- a/src/cluster_client.h
+++ b/src/cluster_client.h
@@ -8,6 +8,9 @@

 #define DEFAULT_CLIENT_MAX_DIRTY_BYTES 32*1024*1024
 #define DEFAULT_CLIENT_MAX_DIRTY_OPS 1024
+#define DEFAULT_CLIENT_MAX_BUFFERED_BYTES (32*1024*1024) // parenthesized so the macro is safe inside larger expressions
+#define DEFAULT_CLIENT_MAX_BUFFERED_OPS 1024 // fallback used when client_max_buffered_ops is unset or 0
+#define DEFAULT_CLIENT_MAX_WRITEBACK_IODEPTH 256 // fallback used when client_max_writeback_iodepth is unset or 0
 #define INODE_LIST_DONE 1
 #define INODE_LIST_HAS_UNSTABLE 2
 #define OSD_OP_READ_BITMAP OSD_OP_SEC_READ_BMP
@@ -64,17 +67,12 @@ protected:
    cluster_op_t *prev = NULL, *next = NULL;
    int prev_wait = 0;
    friend class cluster_client_t;
-};
-
-struct cluster_buffer_t
-{
-    void *buf;
-    uint64_t len;
-    int state;
+    friend class writeback_cache_t;
 };

 struct inode_list_t;
 struct inode_list_osd_t;
+class writeback_cache_t;

 // FIXME: Split into public and private interfaces
 class cluster_client_t
@@ -83,16 +81,23 @@ class cluster_client_t
    ring_loop_t *ringloop;

    std::map<pool_id_t, uint64_t> pg_counts;
-    // FIXME: Implement inmemory_commit mode. Note that it requires to return overlapping reads from memory.
+    // client_max_dirty_* is actually "max unsynced", for the case when immediate_commit is off
    uint64_t client_max_dirty_bytes = 0;
    uint64_t client_max_dirty_ops = 0;
+    // writeback improves (1) small consecutive writes and (2) Q1 writes without fsync
+    bool enable_writeback = false;
+    // client_max_buffered_* is the real "dirty limit" - maximum amount of writes buffered in memory
+    uint64_t client_max_buffered_bytes = 0;
+    uint64_t client_max_buffered_ops = 0;
+    uint64_t client_max_writeback_iodepth = 0;
+
    int log_level;
    int up_wait_retry_interval = 500; // ms

    int retry_timeout_id = 0;
    std::vector<cluster_op_t*> offline_ops;
    cluster_op_t *op_queue_head = NULL, *op_queue_tail = NULL;
-    std::map<object_id, cluster_buffer_t> dirty_buffers;
+    writeback_cache_t *wb = NULL;
    std::set<osd_num_t> dirty_osds;
    uint64_t dirty_bytes = 0, dirty_ops = 0;

@@ -116,16 +121,16 @@ public:
    json11::Json::object cli_config, file_config, etcd_global_config;
    json11::Json::object config;

-    cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
+    cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json config);
    ~cluster_client_t();
    void execute(cluster_op_t *op);
    void execute_raw(osd_num_t osd_num, osd_op_t *op);
    bool is_ready();
    void on_ready(std::function<void(void)> fn);
+    bool flush();

    bool get_immediate_commit(uint64_t inode);

-    static void copy_write(cluster_op_t *op, std::map<object_id, cluster_buffer_t> & dirty_buffers);
    void continue_ops(bool up_retry = false);
    inode_list_t *list_inode_start(inode_t inode,
        std::function<void(inode_list_t* lst, std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)> callback);
@@ -138,12 +143,14 @@ public:

 protected:
    bool affects_osd(uint64_t inode, uint64_t offset, uint64_t len, osd_num_t osd);
-    void flush_buffer(const object_id & oid, cluster_buffer_t *wr);
    void on_load_config_hook(json11::Json::object & config);
    void on_load_pgs_hook(bool success);
    void on_change_hook(std::map<std::string, etcd_kv_t> & changes);
    void on_change_osd_state_hook(uint64_t peer_osd);
+    void execute_internal(cluster_op_t *op);
+    void unshift_op(cluster_op_t *op);
    int continue_rw(cluster_op_t *op);
+    bool check_rw(cluster_op_t *op);
    void slice_rw(cluster_op_t *op);
    bool try_send(cluster_op_t *op, int i);
    int continue_sync(cluster_op_t *op);
@@ -157,4 +164,6 @@ protected:
    void continue_listing(inode_list_t *lst);
    void send_list(inode_list_osd_t *cur_list);
    void continue_raw_ops(osd_num_t peer_osd);
+
+    friend class writeback_cache_t;
 };
--- a/src/cluster_client_impl.h
+++ b/src/cluster_client_impl.h
@@ -0,0 +1,57 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+
+#pragma once
+
+#include "cluster_client.h"
+
+#define SCRAP_BUFFER_SIZE (4*1024*1024) // parenthesized so the macro is safe inside larger expressions
+#define PART_SENT 1 // PART_*: bit flags stored in the flags field of an operation part
+#define PART_DONE 2
+#define PART_ERROR 4
+#define PART_RETRY 8
+#define CACHE_DIRTY 1 // CACHE_*: values of cluster_buffer_t::state; DIRTY = buffered for writeback
+#define CACHE_WRITTEN 2
+#define CACHE_FLUSHING 3
+#define CACHE_REPEATING 4 // set by flush_buffers() while the buffer is being (re)sent
+#define OP_FLUSH_BUFFER 0x02 // OP_*: internal bits in cluster_op_t::flags; FLUSH_BUFFER marks ops created by the client itself
+#define OP_IMMEDIATE_COMMIT 0x04
+
+struct cluster_buffer_t // One cached byte range; its inode and start offset live in the object_id key of dirty_buffers
+{
+    uint8_t *buf; // start of the data; may point into the middle of a shared allocation after a split
+    uint64_t len; // length of the range in bytes
+    int state; // one of the CACHE_* values
+    uint64_t flush_id; // id of the flush that put the buffer into CACHE_REPEATING; reset to 0 when that flush completes
+    uint64_t *refcnt; // reference counter shared by all pieces of one allocation; it is also the pointer passed to free()
+};
+
+typedef std::map<object_id, cluster_buffer_t>::iterator dirty_buf_it_t;
+
+class writeback_cache_t // Client-side store of written data: kept for repeating writes and, optionally, for write-back buffering
+{
+public:
+    uint64_t writeback_bytes = 0; // bytes accounted to CACHE_DIRTY buffers
+    int writeback_queue_size = 0; // number of runs of adjacent CACHE_DIRTY buffers waiting for writeback
+    int writebacks_active = 0; // flush operations started by flush_buffers() that have not completed yet
+    uint64_t last_flush_id = 0; // last id handed out by flush_buffers()
+
+    std::map<object_id, cluster_buffer_t> dirty_buffers; // keyed by inode and start offset (stored in .stripe)
+    std::vector<cluster_op_t*> writeback_overflow; // operations postponed by cluster_client_t::execute_internal()
+    std::vector<object_id> writeback_queue; // writeback candidates; may hold more entries than writeback_queue_size
+    std::multimap<uint64_t, uint64_t*> flushed_buffers; // flush_id => refcnt
+
+    ~writeback_cache_t();
+    dirty_buf_it_t find_dirty(uint64_t inode, uint64_t offset); // first buffer of <inode> ending after <offset>
+    bool is_left_merged(dirty_buf_it_t dirty_it); // is the previous buffer adjacent and CACHE_DIRTY?
+    bool is_right_merged(dirty_buf_it_t dirty_it); // is the next buffer adjacent and CACHE_DIRTY?
+    bool is_merged(const dirty_buf_it_t & dirty_it); // is_left_merged() || is_right_merged()
+    void copy_write(cluster_op_t *op, int state); // copy the data of a write into the cache with the given CACHE_* state
+    int repeat_ops_for(cluster_client_t *cli, osd_num_t peer_osd); // re-send buffers affecting <peer_osd>, returns the number of flushes started
+    void start_writebacks(cluster_client_t *cli, int count); // start up to <count> writebacks, 0 = no limit
+    bool read_from_cache(cluster_op_t *op, uint32_t bitmap_granularity); // called by slice_rw(); defined outside this view
+    void flush_buffers(cluster_client_t *cli, dirty_buf_it_t from_it, dirty_buf_it_t to_it); // send [from_it, to_it) as one write
+    void fsync_start(); // called by continue_sync() before the SYNC is posted to OSDs
+    void fsync_error(); // called by continue_sync() when the SYNC failed
+    void fsync_ok(); // called by continue_sync() when the SYNC succeeded
+};
--- a/src/cluster_client_wb.cpp
+++ b/src/cluster_client_wb.cpp
@@ -0,0 +1,498 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+
+#include <cassert>
+
+#include "cluster_client_impl.h"
+
+writeback_cache_t::~writeback_cache_t() // Releases the references that dirty_buffers entries hold on their allocations
+{
+    for (auto & bp: dirty_buffers)
+    {
+        if (!--(*bp.second.refcnt)) // split buffers share one allocation, only the last reference frees it
+        {
+            free(bp.second.refcnt); // refcnt is allocated with the buffer
+        }
+    }
+    dirty_buffers.clear(); // NOTE(review): references counted in flushed_buffers are not released here - confirm no flush can be in flight
+}
+
+dirty_buf_it_t writeback_cache_t::find_dirty(uint64_t inode, uint64_t offset) // Finds the first buffer of <inode> that ends after <offset>
+{
+    auto dirty_it = dirty_buffers.lower_bound((object_id){ // first buffer starting at or after (inode, offset)
+        .inode = inode,
+        .stripe = offset,
+    });
+    while (dirty_it != dirty_buffers.begin()) // step back over preceding buffers that still cover <offset>
+    {
+        dirty_it--;
+        if (dirty_it->first.inode != inode ||
+            (dirty_it->first.stripe + dirty_it->second.len) <= offset)
+        {
+            dirty_it++; // the previous buffer does not reach <offset>: undo the step and stop
+            break;
+        }
+    }
+    return dirty_it; // may be end() or a buffer that starts after <offset> (even of another inode) when nothing overlaps
+}
+
+bool writeback_cache_t::is_left_merged(dirty_buf_it_t dirty_it) // True if the previous map entry is an adjacent CACHE_DIRTY buffer
+{
+    if (dirty_it != dirty_buffers.begin()) // the first entry has no left neighbour
+    {
+        auto prev_it = dirty_it;
+        prev_it--;
+        if (prev_it->first.inode == dirty_it->first.inode &&
+            prev_it->first.stripe+prev_it->second.len == dirty_it->first.stripe && // ends exactly where this one starts
+            prev_it->second.state == CACHE_DIRTY) // only the neighbour's state is checked, not the state of <dirty_it>
+        {
+            return true;
+        }
+    }
+    return false;
+}
+
+bool writeback_cache_t::is_right_merged(dirty_buf_it_t dirty_it) // True if the next map entry is an adjacent CACHE_DIRTY buffer
+{
+    auto next_it = dirty_it; // <dirty_it> must be dereferenceable (not end())
+    next_it++;
+    if (next_it != dirty_buffers.end() &&
+        next_it->first.inode == dirty_it->first.inode &&
+        next_it->first.stripe == dirty_it->first.stripe+dirty_it->second.len && // starts exactly where this one ends
+        next_it->second.state == CACHE_DIRTY) // only the neighbour's state is checked, not the state of <dirty_it>
+    {
+        return true;
+    }
+    return false;
+}
+
+bool writeback_cache_t::is_merged(const dirty_buf_it_t & dirty_it) // True if the buffer has an adjacent CACHE_DIRTY neighbour on either side
+{
+    return is_left_merged(dirty_it) || is_right_merged(dirty_it);
+}
+
+void writeback_cache_t::copy_write(cluster_op_t *op, int state) // Copies the data of write <op> into the cache as a buffer in <state>
+{
+    // With CACHE_WRITTEN the data is saved for replay when one of PGs goes out of sync
+    // (primary OSD drops our connection in this case); with CACHE_DIRTY it is saved for
+    // writeback (write buffering). Cached ranges overlapped by <op> are trimmed, split or removed first.
+    if (op->len == 0)
+    {
+        return;
+    }
+    auto dirty_it = find_dirty(op->inode, op->offset);
+    auto new_end = op->offset + op->len;
+    while (dirty_it != dirty_buffers.end() &&
+        dirty_it->first.inode == op->inode &&
+        dirty_it->first.stripe < op->offset+op->len)
+    {
+        assert(dirty_it->first.stripe + dirty_it->second.len > op->offset);
+        // Remove overlapping part(s) of buffers
+        auto old_end = dirty_it->first.stripe + dirty_it->second.len;
+        if (dirty_it->first.stripe < op->offset)
+        {
+            if (old_end > new_end)
+            {
+                // Split into end and start
+                dirty_it->second.len = op->offset - dirty_it->first.stripe;
+                dirty_it = dirty_buffers.emplace_hint(dirty_it, (object_id){
+                    .inode = op->inode,
+                    .stripe = new_end,
+                }, (cluster_buffer_t){
+                    .buf = dirty_it->second.buf + new_end - dirty_it->first.stripe,
+                    .len = old_end - new_end,
+                    .state = dirty_it->second.state,
+                    .flush_id = dirty_it->second.flush_id,
+                    .refcnt = dirty_it->second.refcnt,
+                });
+                (*dirty_it->second.refcnt)++; // both pieces now reference the same allocation
+                if (dirty_it->second.state == CACHE_DIRTY)
+                {
+                    writeback_bytes -= op->len;
+                    writeback_queue_size++;
+                }
+                break;
+            }
+            else
+            {
+                // Only leave the beginning
+                if (dirty_it->second.state == CACHE_DIRTY)
+                {
+                    writeback_bytes -= old_end - op->offset;
+                    if (is_left_merged(dirty_it) && !is_right_merged(dirty_it))
+                    {
+                        writeback_queue_size++;
+                    }
+                }
+                dirty_it->second.len = op->offset - dirty_it->first.stripe;
+                dirty_it++;
+            }
+        }
+        else if (old_end > new_end)
+        {
+            // Only leave the end
+            if (dirty_it->second.state == CACHE_DIRTY)
+            {
+                writeback_bytes -= new_end - dirty_it->first.stripe;
+                if (!is_left_merged(dirty_it) && is_right_merged(dirty_it))
+                {
+                    writeback_queue_size++;
+                }
+            }
+            auto new_dirty_it = dirty_buffers.emplace_hint(dirty_it, (object_id){
+                .inode = op->inode,
+                .stripe = new_end,
+            }, (cluster_buffer_t){
+                .buf = dirty_it->second.buf + new_end - dirty_it->first.stripe,
+                .len = old_end - new_end,
+                .state = dirty_it->second.state,
+                .flush_id = dirty_it->second.flush_id,
+                .refcnt = dirty_it->second.refcnt,
+            });
+            dirty_buffers.erase(dirty_it); // the reference moves to the re-keyed entry, so refcnt stays as is
+            dirty_it = new_dirty_it;
+            break;
+        }
+        else
+        {
+            // Remove the whole buffer: its bytes always stop being dirty, a queue entry only goes away with a standalone buffer
+            if (dirty_it->second.state == CACHE_DIRTY)
+            {
+                writeback_bytes -= dirty_it->second.len;
+                assert(is_merged(dirty_it) || writeback_queue_size > 0);
+                writeback_queue_size -= is_merged(dirty_it) ? 0 : 1;
+            }
+            if (!--(*dirty_it->second.refcnt))
+            {
+                free(dirty_it->second.refcnt);
+            }
+            dirty_buffers.erase(dirty_it++);
+        }
+    }
+    // Overlapping buffers are removed, just insert the new one
+    uint64_t *refcnt = (uint64_t*)malloc_or_die(sizeof(uint64_t) + op->len); // one allocation: counter first, data right after it
+    uint8_t *buf = (uint8_t*)refcnt + sizeof(uint64_t);
+    *refcnt = 1;
+    dirty_it = dirty_buffers.emplace_hint(dirty_it, (object_id){
+        .inode = op->inode,
+        .stripe = op->offset,
+    }, (cluster_buffer_t){
+        .buf = buf,
+        .len = op->len,
+        .state = state,
+        .refcnt = refcnt,
+    });
+    if (state == CACHE_DIRTY)
+    {
+        writeback_bytes += op->len;
+        // Track consecutive write-back operations
+        if (!is_merged(dirty_it))
+        {
+            // <writeback_queue> is OK to contain more than actual number of consecutive
+            // requests as long as it doesn't miss anything. But <writeback_queue_size>
+            // is always calculated correctly.
+            writeback_queue_size++;
+            writeback_queue.push_back((object_id){
+                .inode = op->inode,
+                .stripe = op->offset,
+            });
+        }
+    }
+    uint64_t pos = 0;
+    for (uint64_t iov_idx = 0; pos < op->len && iov_idx < op->iov.count; iov_idx++)
+    {
+        auto & iov = op->iov.buf[iov_idx];
+        uint64_t cur_len = iov.iov_len < op->len - pos ? iov.iov_len : op->len - pos; // never copy past the op->len bytes of <buf>
+        memcpy(buf + pos, iov.iov_base, cur_len);
+        pos += cur_len;
+    }
+}
+
+int writeback_cache_t::repeat_ops_for(cluster_client_t *cli, osd_num_t peer_osd) // Re-sends cached buffers affecting <peer_osd>; returns the number of flushes started
+{
+    int repeated = 0;
+    if (dirty_buffers.size())
+    {
+        // peer_osd just dropped connection
+        // determine WHICH dirty_buffers are now obsolete and repeat them
+        for (auto wr_it = dirty_buffers.begin(), flush_it = wr_it, last_it = wr_it; ; ) // flush_it = start of the open run, last_it = previous entry
+        {
+            bool end = wr_it == dirty_buffers.end();
+            bool flush_this = !end && wr_it->second.state != CACHE_REPEATING && // buffers already being repeated are skipped
+                cli->affects_osd(wr_it->first.inode, wr_it->first.stripe, wr_it->second.len, peer_osd);
+            if (flush_it != wr_it && (end || !flush_this || // close the open run when the current entry cannot extend it
+                wr_it->first.inode != flush_it->first.inode ||
+                wr_it->first.stripe != last_it->first.stripe+last_it->second.len)) // ...or is not contiguous with the previous one
+            {
+                repeated++;
+                flush_buffers(cli, flush_it, wr_it); // one write operation for the whole contiguous run [flush_it, wr_it)
+                flush_it = wr_it;
+            }
+            if (end)
+                break;
+            last_it = wr_it;
+            wr_it++;
+            if (!flush_this)
+                flush_it = wr_it; // keep the run empty while entries are skipped
+        }
+    }
+    return repeated;
+}
+
+void writeback_cache_t::flush_buffers(cluster_client_t *cli, dirty_buf_it_t from_it, dirty_buf_it_t to_it) // Sends the contiguous, non-empty run [from_it, to_it) as one write
+{
+    auto prev_it = to_it; // requires from_it != to_it: the last buffer of the run is the one before <to_it>
+    prev_it--;
+    bool is_writeback = from_it->second.state == CACHE_DIRTY; // must be read before the states are overwritten below
+    cluster_op_t *op = new cluster_op_t;
+    op->flags = OSD_OP_IGNORE_READONLY|OP_FLUSH_BUFFER;
+    op->opcode = OSD_OP_WRITE;
+    op->cur_inode = op->inode = from_it->first.inode;
+    op->offset = from_it->first.stripe;
+    op->len = prev_it->first.stripe + prev_it->second.len - from_it->first.stripe;
+    uint64_t calc_len = 0; // 64-bit so that the sum cannot wrap before it is compared with op->len
+    uint64_t flush_id = ++last_flush_id;
+    for (auto it = from_it; it != to_it; it++)
+    {
+        it->second.state = CACHE_REPEATING;
+        it->second.flush_id = flush_id;
+        (*it->second.refcnt)++; // the in-flight operation holds its own reference, tracked in flushed_buffers
+        flushed_buffers.emplace(flush_id, it->second.refcnt);
+        op->iov.push_back(it->second.buf, it->second.len);
+        calc_len += it->second.len;
+    }
+    assert(calc_len == op->len);
+    writebacks_active++;
+    op->callback = [this, flush_id](cluster_op_t* op)
+    {
+        // Buffer flushes should be always retried, regardless of the error,
+        // so they should never result in an error here
+        assert(op->retval == op->len);
+        for (auto fl_it = flushed_buffers.lower_bound(flush_id); // lower_bound: find() may return any of the equal keys
+            fl_it != flushed_buffers.end() && fl_it->first == flush_id; )
+        {
+            if (!--(*fl_it->second)) // refcnt
+            {
+                free(fl_it->second);
+            }
+            flushed_buffers.erase(fl_it++);
+        }
+        for (auto dirty_it = find_dirty(op->inode, op->offset);
+            dirty_it != dirty_buffers.end() && dirty_it->first.inode == op->inode &&
+            dirty_it->first.stripe < op->offset+op->len; dirty_it++)
+        {
+            if (dirty_it->second.flush_id == flush_id && dirty_it->second.state == CACHE_REPEATING) // skip buffers replaced or re-flushed meanwhile
+            {
+                dirty_it->second.flush_id = 0;
+                dirty_it->second.state = CACHE_WRITTEN;
+            }
+        }
+        delete op;
+        writebacks_active--;
+        // We can't call execute_internal because it affects an invalid copy of the list here
+        // (erase_op remembers `next` after writeback callback)
+    };
+    if (is_writeback)
+    {
+        cli->execute_internal(op); // regular write-back goes through the normal path
+    }
+    else
+    {
+        // Insert repeated flushes into the beginning
+        cli->unshift_op(op);
+        cli->continue_rw(op);
+    }
+}
+
+// Start flushing queued dirty regions through <cli>.
+// At most <count> flushes are started; count == 0 means no limit.
+// For every processed queue entry the dirty buffer found by find_dirty() is extended in both
+// directions over neighbouring CACHE_DIRTY buffers of the same inode that are contiguous
+// by offset, and the whole range is passed to flush_buffers().
+// Queue entries that no longer resolve to a CACHE_DIRTY buffer of the requested inode are
+// dropped from the queue without starting anything.
+void writeback_cache_t::start_writebacks(cluster_client_t *cli, int count)
+{
+    if (!writeback_queue.size())
+    {
+        return;
+    }
+    // Work on a private copy: writeback_queue is left empty while processing and
+    // anything that gets added to it in the meantime is merged back at the end
+    std::vector<object_id> queue_copy;
+    queue_copy.swap(writeback_queue);
+    int started = 0, i = 0;
+    for (i = 0; i < queue_copy.size() && (!count || started < count); i++)
+    {
+        object_id & req = queue_copy[i];
+        auto dirty_it = find_dirty(req.inode, req.stripe);
+        if (dirty_it == dirty_buffers.end() ||
+            dirty_it->first.inode != req.inode ||
+            dirty_it->second.state != CACHE_DIRTY)
+        {
+            // Nothing to flush for this entry (it doesn't count towards <started>)
+            continue;
+        }
+        // Extend the range backwards while the previous buffer is dirty, belongs to
+        // the same inode and ends exactly at <off>
+        auto from_it = dirty_it;
+        uint64_t off = dirty_it->first.stripe;
+        while (from_it != dirty_buffers.begin())
+        {
+            from_it--;
+            if (from_it->second.state != CACHE_DIRTY ||
+                from_it->first.inode != req.inode ||
+                from_it->first.stripe+from_it->second.len != off)
+            {
+                // Stepped one buffer too far, go back to the last matching one
+                from_it++;
+                break;
+            }
+            off = from_it->first.stripe;
+        }
+        // Extend the range forwards in the same way; <off> now tracks the end offset
+        off = dirty_it->first.stripe + dirty_it->second.len;
+        auto to_it = dirty_it;
+        to_it++;
+        while (to_it != dirty_buffers.end())
+        {
+            if (to_it->second.state != CACHE_DIRTY ||
+                to_it->first.inode != req.inode ||
+                to_it->first.stripe != off)
+            {
+                break;
+            }
+            off = to_it->first.stripe + to_it->second.len;
+            to_it++;
+        }
+        started++;
+        assert(writeback_queue_size > 0);
+        // NOTE(review): the counter is decremented once per started flush, not per
+        // consumed queue entry - confirm this matches how it is incremented
+        writeback_queue_size--;
+        // <off> is the end of the range, so this subtracts the total range length
+        writeback_bytes -= off - from_it->first.stripe;
+        flush_buffers(cli, from_it, to_it);
+    }
+    // Remove the consumed entries (started and skipped), keep the rest and append
+    // everything that was queued while we were running
+    queue_copy.erase(queue_copy.begin(), queue_copy.begin()+i);
+    if (writeback_queue.size())
+    {
+        queue_copy.insert(queue_copy.end(), writeback_queue.begin(), writeback_queue.end());
+    }
+    queue_copy.swap(writeback_queue);
+}
+
+// Put <len> bytes from <buf>, which hold the data for absolute offset <offset>, into <op>.
+// For OSD_OP_READ the bytes are copied into the matching positions of op->iov, where the
+// first iov element corresponds to op->offset. For every opcode the bits covering
+// [offset, offset+len) are set in op->bitmap_buf, one bit per <bitmap_granularity> bytes,
+// counted from op->offset.
+static void copy_to_op(cluster_op_t *op, uint64_t offset, uint8_t *buf, uint64_t len, uint32_t bitmap_granularity)
+{
+    if (op->opcode == OSD_OP_READ)
+    {
+        // Not OSD_OP_READ_BITMAP or OSD_OP_READ_CHAIN_BITMAP
+        int iov_idx = 0;
+        // Absolute offset of the beginning of the current iov element
+        uint64_t cur_offset = op->offset;
+        // Skip iov elements that end at or before <offset>
+        while (iov_idx < op->iov.count && cur_offset+op->iov.buf[iov_idx].iov_len <= offset)
+        {
+            cur_offset += op->iov.buf[iov_idx].iov_len;
+            iov_idx++;
+        }
+        // Copy into every iov element that intersects [offset, offset+len)
+        while (iov_idx < op->iov.count && cur_offset < offset+len)
+        {
+            auto & v = op->iov.buf[iov_idx];
+            // [begin, end) = intersection of this iov element with the copied range
+            auto begin = (cur_offset < offset ? offset : cur_offset);
+            auto end = (cur_offset+v.iov_len > offset+len ? offset+len : cur_offset+v.iov_len);
+            memcpy(
+                (uint8_t*)v.iov_base + begin - cur_offset,
+                buf + (cur_offset <= offset ? 0 : cur_offset-offset),
+                end - begin
+            );
+            cur_offset += v.iov_len;
+            iov_idx++;
+        }
+    }
+    // Set bitmap bits
+    int start_bit = (offset-op->offset)/bitmap_granularity;
+    int end_bit = (offset-op->offset+len)/bitmap_granularity;
+    for (int bit = start_bit; bit < end_bit;)
+    {
+        if (!(bit%8) && bit <= end_bit-8)
+        {
+            // Byte-aligned position with at least 8 bits left: set the whole byte at once
+            ((uint8_t*)op->bitmap_buf)[bit/8] = 0xFF;
+            bit += 8;
+        }
+        else
+        {
+            ((uint8_t*)op->bitmap_buf)[bit/8] |= (1 << (bit%8));
+            bit++;
+        }
+    }
+}
+
+// Serve a read operation from cached buffers where possible.
+// Handles OSD_OP_READ, OSD_OP_READ_BITMAP and OSD_OP_READ_CHAIN_BITMAP; other opcodes are
+// left untouched. Every cached buffer of op->cur_inode that intersects
+// [op->offset, op->offset+op->len) is examined in steps of <bitmap_granularity> bytes.
+// Positions whose bit is already set in op->bitmap_buf are skipped, all other positions are
+// passed to copy_to_op(), which copies the data (for OSD_OP_READ) and sets the bits.
+// Returns true if at least one fragment was passed to copy_to_op().
+bool writeback_cache_t::read_from_cache(cluster_op_t *op, uint32_t bitmap_granularity)
+{
+    bool dirty_copied = false;
+    if (dirty_buffers.size() && (op->opcode == OSD_OP_READ ||
+        op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP))
+    {
+        // We also have to return reads from CACHE_REPEATING buffers - they are not
+        // guaranteed to be present on target OSDs at the moment of repeating
+        // And we're also free to return data from other cached buffers just
+        // because it's faster
+        auto dirty_it = find_dirty(op->cur_inode, op->offset);
+        while (dirty_it != dirty_buffers.end() && dirty_it->first.inode == op->cur_inode &&
+            dirty_it->first.stripe < op->offset+op->len)
+        {
+            // [begin, end) = intersection of the cached buffer with the requested range
+            uint64_t begin = dirty_it->first.stripe, end = dirty_it->first.stripe + dirty_it->second.len;
+            if (begin < op->offset)
+                begin = op->offset;
+            if (end > op->offset+op->len)
+                end = op->offset+op->len;
+            // [prev, cur) is the current fragment, skip_prev tells if it has to be skipped.
+            // Starting with skip_prev = true makes the first position open a new fragment
+            bool skip_prev = true;
+            uint64_t cur = begin, prev = begin;
+            while (cur < end)
+            {
+                unsigned bmp_loc = (cur - op->offset)/bitmap_granularity;
+                bool skip = (((*((uint8_t*)op->bitmap_buf + bmp_loc/8)) >> (bmp_loc%8)) & 0x1);
+                if (skip_prev != skip)
+                {
+                    // The fragment [prev, cur) ends here. It has to be copied only if it is
+                    // a non-skipped one, i.e. when we go from "copy" to "skip". Checking the
+                    // new state instead would copy the already filled fragment and lose
+                    // the one that really has to be copied
+                    if (cur > prev && !skip_prev)
+                    {
+                        // Copy data
+                        dirty_copied = true;
+                        copy_to_op(op, prev, dirty_it->second.buf + prev - dirty_it->first.stripe, cur-prev, bitmap_granularity);
+                    }
+                    skip_prev = skip;
+                    prev = cur;
+                }
+                cur += bitmap_granularity;
+            }
+            assert(cur > prev);
+            // Handle the last fragment which reaches the end of the intersection
+            if (!skip_prev)
+            {
+                // Copy data
+                dirty_copied = true;
+                copy_to_op(op, prev, dirty_it->second.buf + prev - dirty_it->first.stripe, cur-prev, bitmap_granularity);
+            }
+            dirty_it++;
+        }
+    }
+    return dirty_copied;
+}
+
+// Move every cached buffer that is currently CACHE_WRITTEN into CACHE_FLUSHING state.
+// Buffers in any other state are left as they are.
+void writeback_cache_t::fsync_start()
+{
+    for (auto buf_it = dirty_buffers.begin(); buf_it != dirty_buffers.end(); buf_it++)
+    {
+        if (buf_it->second.state != CACHE_WRITTEN)
+        {
+            continue;
+        }
+        buf_it->second.state = CACHE_FLUSHING;
+    }
+}
+
+// Revert fsync_start(): every cached buffer that is in CACHE_FLUSHING state
+// goes back to CACHE_WRITTEN. Buffers in any other state are left as they are.
+void writeback_cache_t::fsync_error()
+{
+    for (auto buf_it = dirty_buffers.begin(); buf_it != dirty_buffers.end(); buf_it++)
+    {
+        if (buf_it->second.state != CACHE_FLUSHING)
+        {
+            continue;
+        }
+        buf_it->second.state = CACHE_WRITTEN;
+    }
+}
+
+// Remove every cached buffer that is in CACHE_FLUSHING state from dirty_buffers.
+// The cache's reference to each removed buffer is dropped, and the refcnt pointer
+// is freed when the counter reaches zero.
+void writeback_cache_t::fsync_ok()
+{
+    auto next_it = dirty_buffers.begin();
+    while (next_it != dirty_buffers.end())
+    {
+        // Advance first so that erasing the current entry doesn't affect the iteration
+        auto cur_it = next_it++;
+        if (cur_it->second.state != CACHE_FLUSHING)
+        {
+            continue;
+        }
+        if (!--(*cur_it->second.refcnt))
+        {
+            free(cur_it->second.refcnt);
+        }
+        dirty_buffers.erase(cur_it);
+    }
+}
--- a/src/disk_tool.cpp
+++ b/src/disk_tool.cpp
@@ -5,7 +5,7 @@
 #include "str_util.h"

 static const char *help_text =
-    "Vitastor disk management tool\n"
+    "Vitastor disk management tool " VERSION "\n"
    "(c) Vitaliy Filippov, 2022+ (VNPL-1.1)\n"
    "\n"
    "COMMANDS:\n"
@@ -74,7 +74,7 @@ static const char *help_text =
    "  If it doesn't succeed it issues a warning in the system log.\n"
    "  \n"
    "  You can also pass other OSD options here as arguments and they'll be persisted\n"
-    "  in the superblock: cached_read_data, cached_read_meta, cached_read_journal,\n"
+    "  in the superblock: data_io, meta_io, journal_io,\n"
    "  inmemory_metadata, inmemory_journal, max_write_iodepth,\n"
    "  min_flusher_count, max_flusher_count, journal_sector_buffer_count,\n"
    "  journal_no_same_sector_overwrites, throttle_small_writes, throttle_target_iops,\n"
@@ -229,7 +229,7 @@ int main(int argc, char *argv[])
        {
            self.options["allow_data_loss"] = "1";
        }
-        else if (argv[i][0] == '-' && argv[i][1] == '-')
+        else if (argv[i][0] == '-' && argv[i][1] == '-' && i < argc-1)
        {
            char *key = argv[i]+2;
            self.options[key] = argv[++i];
--- a/src/disk_tool_journal.cpp
+++ b/src/disk_tool_journal.cpp
@@ -320,7 +320,7 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
        if (journal_calc_data_pos != sw.data_offset)
        {
            printf(json ? ",\"bad_loc\":true,\"calc_loc\":\"0x%lx\""
-                : " (mismatched, calculated = %lu)", journal_pos);
+                : " (mismatched, calculated = %08lx)", journal_pos);
        }
        uint32_t data_csum_size = (!je_start.csum_block_size
            ? 0
--- a/Show More
+++ b/Show More