WIP Send additional fsyncs from secondary OSDs to primaries to prevent lockups

2023-09-16 17:54:28 +03:00
116 changed files with 742 additions and 2812 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)

 project(vitastor)

-set(VERSION "1.3.1")
+set(VERSION "1.0.0")

 add_subdirectory(src)
--- a/README-ru.md
+++ b/README-ru.md
@@ -50,7 +50,6 @@ Vitastor поддерживает QEMU-драйвер, протоколы NBD и
  - Параметры
    - [Общие](docs/config/common.ru.md)
    - [Сетевые](docs/config/network.ru.md)
-    - [Клиентский код](docs/config/client.en.md)
    - [Глобальные дисковые параметры](docs/config/layout-cluster.ru.md)
    - [Дисковые параметры OSD](docs/config/layout-osd.ru.md)
    - [Прочие параметры OSD](docs/config/osd.ru.md)
--- a/README.md
+++ b/README.md
@@ -50,7 +50,6 @@ Read more details below in the documentation.
  - Parameter Reference
    - [Common](docs/config/common.en.md)
    - [Network](docs/config/network.en.md)
-    - [Client](docs/config/client.en.md)
    - [Global Disk Layout](docs/config/layout-cluster.en.md)
    - [OSD Disk Layout](docs/config/layout-osd.en.md)
    - [OSD Runtime Parameters](docs/config/osd.en.md)
--- a/csi/Dockerfile
+++ b/csi/Dockerfile
@@ -1,15 +1,14 @@
 # Compile stage
-FROM golang:bookworm AS build
+FROM golang:buster AS build

 ADD go.sum go.mod /app/
 RUN cd /app; CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go mod download -x
 ADD . /app
-RUN perl -i -e '$/ = undef; while(<>) { s/\n\s*(\{\s*\n)/$1\n/g; s/\}(\s*\n\s*)else\b/$1} else/g; print; }' `find /app -name '*.go'` && \
-    cd /app && \
-    CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o vitastor-csi
+RUN perl -i -e '$/ = undef; while(<>) { s/\n\s*(\{\s*\n)/$1\n/g; s/\}(\s*\n\s*)else\b/$1} else/g; print; }' `find /app -name '*.go'`
+RUN cd /app; CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o vitastor-csi

 # Final stage
-FROM debian:bookworm
+FROM debian:buster

 LABEL maintainers="Vitaliy Filippov <vitalif@yourcmc.ru>"
 LABEL description="Vitastor CSI Driver"
@@ -19,30 +18,19 @@ ENV CSI_ENDPOINT=""

 RUN apt-get update && \
    apt-get install -y wget && \
+    (echo deb http://deb.debian.org/debian buster-backports main > /etc/apt/sources.list.d/backports.list) && \
    (echo "APT::Install-Recommends false;" > /etc/apt/apt.conf) && \
    apt-get update && \
-    apt-get install -y e2fsprogs xfsprogs kmod iproute2 \
-        # dependencies of qemu-storage-daemon
-        libnuma1 liburing2 libglib2.0-0 libfuse3-3 libaio1 libzstd1 libnettle8 \
-        libgmp10 libhogweed6 libp11-kit0 libidn2-0 libunistring2 libtasn1-6 libpcre2-8-0 libffi8 && \
+    apt-get install -y e2fsprogs xfsprogs kmod && \
    apt-get clean && \
    (echo options nbd nbds_max=128 > /etc/modprobe.d/nbd.conf)

 COPY --from=build /app/vitastor-csi /bin/

-RUN (echo deb http://vitastor.io/debian bookworm main > /etc/apt/sources.list.d/vitastor.list) && \
-    ((echo 'Package: *'; echo 'Pin: origin "vitastor.io"'; echo 'Pin-Priority: 1000') > /etc/apt/preferences.d/vitastor.pref) && \
+RUN (echo deb http://vitastor.io/debian buster main > /etc/apt/sources.list.d/vitastor.list) && \
    wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg && \
    apt-get update && \
    apt-get install -y vitastor-client && \
-    apt-get download qemu-system-common && \
-    apt-get download qemu-block-extra && \
-    dpkg -x qemu-system-common*.deb tmp1 && \
-    dpkg -x qemu-block-extra*.deb tmp1 && \
-    cp -a tmp1/usr/bin/qemu-storage-daemon /usr/bin/ && \
-    mkdir -p /usr/lib/x86_64-linux-gnu/qemu && \
-    cp -a tmp1/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so /usr/lib/x86_64-linux-gnu/qemu/ && \
-    rm -rf tmp1 *.deb && \
    apt-get clean

 ENTRYPOINT ["/bin/vitastor-csi"]
--- a/csi/Makefile
+++ b/csi/Makefile
@@ -1,4 +1,4 @@
-VERSION ?= v1.3.1
+VERSION ?= v1.0.0

 all: build push

--- a/csi/deploy/001-csi-config-map.yaml
+++ b/csi/deploy/001-csi-config-map.yaml
@@ -2,7 +2,6 @@
 apiVersion: v1
 kind: ConfigMap
 data:
-  # You can add multiple configuration files here to use a multi-cluster setup
  vitastor.conf: |-
    {"etcd_address":"http://192.168.7.2:2379","etcd_prefix":"/vitastor"}
 metadata:
--- a/csi/deploy/004-csi-nodeplugin.yaml
+++ b/csi/deploy/004-csi-nodeplugin.yaml
@@ -49,7 +49,7 @@ spec:
            capabilities:
              add: ["SYS_ADMIN"]
            allowPrivilegeEscalation: true
-          image: vitalif/vitastor-csi:v1.3.1
+          image: vitalif/vitastor-csi:v1.0.0
          args:
            - "--node=$(NODE_ID)"
            - "--endpoint=$(CSI_ENDPOINT)"
@@ -82,8 +82,6 @@ spec:
              name: host-sys
            - mountPath: /run/mount
              name: host-mount
-            - mountPath: /run/vitastor-csi
-              name: run-vitastor-csi
            - mountPath: /lib/modules
              name: lib-modules
              readOnly: true
@@ -134,9 +132,6 @@ spec:
        - name: host-mount
          hostPath:
            path: /run/mount
-        - name: run-vitastor-csi
-          hostPath:
-            path: /run/vitastor-csi
        - name: lib-modules
          hostPath:
            path: /lib/modules
--- a/csi/deploy/005-csi-provisioner-rbac.yaml
+++ b/csi/deploy/005-csi-provisioner-rbac.yaml
@@ -35,13 +35,10 @@ rules:
    verbs: ["get", "list", "watch"]
  - apiGroups: ["snapshot.storage.k8s.io"]
    resources: ["volumesnapshots"]
-    verbs: ["get", "list", "patch"]
-  - apiGroups: ["snapshot.storage.k8s.io"]
-    resources: ["volumesnapshots/status"]
-    verbs: ["get", "list", "patch"]
+    verbs: ["get", "list"]
  - apiGroups: ["snapshot.storage.k8s.io"]
    resources: ["volumesnapshotcontents"]
-    verbs: ["create", "get", "list", "watch", "update", "delete", "patch"]
+    verbs: ["create", "get", "list", "watch", "update", "delete"]
  - apiGroups: ["snapshot.storage.k8s.io"]
    resources: ["volumesnapshotclasses"]
    verbs: ["get", "list", "watch"]
@@ -56,7 +53,7 @@ rules:
    verbs: ["get", "list", "watch"]
  - apiGroups: ["snapshot.storage.k8s.io"]
    resources: ["volumesnapshotcontents/status"]
-    verbs: ["update", "patch"]
+    verbs: ["update"]
  - apiGroups: [""]
    resources: ["configmaps"]
    verbs: ["get"]
--- a/csi/deploy/007-csi-provisioner.yaml
+++ b/csi/deploy/007-csi-provisioner.yaml
@@ -23,11 +23,6 @@ metadata:
  name: csi-vitastor-provisioner
 spec:
  replicas: 3
-  strategy:
-    type: RollingUpdate
-    rollingUpdate:
-      maxUnavailable: 1
-      maxSurge: 0
  selector:
    matchLabels:
      app: csi-vitastor-provisioner
@@ -51,7 +46,7 @@ spec:
      priorityClassName: system-cluster-critical
      containers:
        - name: csi-provisioner
-          image: k8s.gcr.io/sig-storage/csi-provisioner:v3.0.0
+          image: k8s.gcr.io/sig-storage/csi-provisioner:v2.2.0
          args:
            - "--csi-address=$(ADDRESS)"
            - "--v=5"
@@ -121,7 +116,7 @@ spec:
            privileged: true
            capabilities:
              add: ["SYS_ADMIN"]
-          image: vitalif/vitastor-csi:v1.3.1
+          image: vitalif/vitastor-csi:v1.0.0
          args:
            - "--node=$(NODE_ID)"
            - "--endpoint=$(CSI_ENDPOINT)"
--- a/csi/deploy/009-storage-class.yaml
+++ b/csi/deploy/009-storage-class.yaml
@@ -12,6 +12,8 @@ parameters:
  etcdVolumePrefix: ""
  poolId: "1"
  # you can choose other configuration file if you have it in the config map
-  # different etcd URLs and prefixes should also be put in the config
  #configPath: "/etc/vitastor/vitastor.conf"
-allowVolumeExpansion: true
+  # you can also specify etcdUrl here, e.g. to connect to another Vitastor cluster
+  # multiple etcd URLs may be specified, separated by commas
+  #etcdUrl: "http://192.168.7.2:2379"
+  #etcdPrefix: "/vitastor"
--- a/csi/deploy/example-snapshot-class.yaml
+++ b/csi/deploy/example-snapshot-class.yaml
@@ -1,7 +0,0 @@
-apiVersion: snapshot.storage.k8s.io/v1
-kind: VolumeSnapshotClass
-metadata:
-  name: vitastor-snapclass
-driver: csi.vitastor.io
-deletionPolicy: Delete
-parameters:
--- a/csi/deploy/example-snapshot-clone.yaml
+++ b/csi/deploy/example-snapshot-clone.yaml
@@ -1,16 +0,0 @@
---
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: test-vitastor-clone
-spec:
-  storageClassName: vitastor
-  dataSource:
-    name: snap1
-    kind: VolumeSnapshot
-    apiGroup: snapshot.storage.k8s.io
-  accessModes:
-    - ReadWriteOnce
-  resources:
-    requests:
-      storage: 10Gi
--- a/csi/deploy/example-snapshot.yaml
+++ b/csi/deploy/example-snapshot.yaml
@@ -1,8 +0,0 @@
-apiVersion: snapshot.storage.k8s.io/v1
-kind: VolumeSnapshot
-metadata:
-  name: snap1
-spec:
-  volumeSnapshotClassName: vitastor-snapclass
-  source:
-    persistentVolumeClaimName: test-vitastor-pvc
--- a/csi/go.mod
+++ b/csi/go.mod
@@ -9,7 +9,6 @@ require (
 	golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb
 	golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
 	google.golang.org/grpc v1.33.1
-	google.golang.org/protobuf v1.24.0
 	k8s.io/klog v1.0.0
 	k8s.io/utils v0.0.0-20210305010621-2afb4311ab10
 )
--- a/csi/src/config.go
+++ b/csi/src/config.go
@@ -5,7 +5,7 @@ package vitastor

 const (
    vitastorCSIDriverName    = "csi.vitastor.io"
-    vitastorCSIDriverVersion = "1.3.1"
+    vitastorCSIDriverVersion = "1.0.0"
 )

 // Config struct fills the parameters of request or user input
--- a/csi/src/controllerserver.go
+++ b/csi/src/controllerserver.go
@@ -20,7 +20,6 @@ import (

    "google.golang.org/grpc/codes"
    "google.golang.org/grpc/status"
-    "google.golang.org/protobuf/types/known/timestamppb"

    "github.com/container-storage-interface/spec/lib/go/csi"
 )
@@ -46,7 +45,6 @@ type InodeConfig struct
    ParentPool uint64 `json:"parent_pool,omitempty"`
    ParentId uint64 `json:"parent_id,omitempty"`
    Readonly bool `json:"readonly,omitempty"`
-    CreateTs uint64 `json:"create_ts,omitempty"`
 }

 type ControllerServer struct
@@ -62,7 +60,7 @@ func NewControllerServer(driver *Driver) *ControllerServer
    }
 }

-func GetConnectionParams(params map[string]string) (map[string]string, error)
+func GetConnectionParams(params map[string]string) (map[string]string, []string, string)
 {
    ctxVars := make(map[string]string)
    configPath := params["configPath"]
@@ -75,58 +73,71 @@ func GetConnectionParams(params map[string]string) (map[string]string, error)
        ctxVars["configPath"] = configPath
    }
    config := make(map[string]interface{})
-    configFD, err := os.Open(configPath)
-    if (err != nil)
+    if configFD, err := os.Open(configPath); err == nil
    {
-        return nil, err
+        defer configFD.Close()
+        data, _ := ioutil.ReadAll(configFD)
+        json.Unmarshal(data, &config)
    }
-    defer configFD.Close()
-    data, _ := ioutil.ReadAll(configFD)
-    json.Unmarshal(data, &config)
-    // Check etcd URL in the config, but do not use the explicit etcdUrl
-    // parameter for CLI calls, otherwise users won't be able to later
-    // change them - storage class parameters are saved in volume IDs
+    // Use etcdUrl / etcdPrefix from the parameters if given, otherwise fall back to the config file
    var etcdUrl []string
-    switch config["etcd_address"].(type)
+    if (params["etcdUrl"] != "")
    {
-    case string:
-        url := strings.TrimSpace(config["etcd_address"].(string))
-        if (url != "")
-        {
-            etcdUrl = strings.Split(url, ",")
-        }
-    case []string:
-        etcdUrl = config["etcd_address"].([]string)
+        ctxVars["etcdUrl"] = params["etcdUrl"]
+        etcdUrl = strings.Split(params["etcdUrl"], ",")
    }
    if (len(etcdUrl) == 0)
    {
-        return nil, status.Error(codes.InvalidArgument, "etcd_address is missing in "+configPath)
+        switch config["etcd_address"].(type)
+        {
+        case string:
+            etcdUrl = strings.Split(config["etcd_address"].(string), ",")
+        case []string:
+            etcdUrl = config["etcd_address"].([]string)
+        }
    }
-    return ctxVars, nil
-}
-
-func system(program string, args ...string) ([]byte, error)
-{
-    c := exec.Command(program, args...)
-    var stdout, stderr bytes.Buffer
-    c.Stdout, c.Stderr = &stdout, &stderr
-    err := c.Run()
-    if (err != nil)
+    etcdPrefix := params["etcdPrefix"]
+    if (etcdPrefix == "")
    {
-        stdoutStr, stderrStr := string(stdout.Bytes()), string(stderr.Bytes())
-        klog.Errorf(program+" "+strings.Join(args, " ")+" failed: %s, status %s\n", stdoutStr+stderrStr, err)
-        return nil, status.Error(codes.Internal, stdoutStr+stderrStr+" (status "+err.Error()+")")
+        etcdPrefix, _ = config["etcd_prefix"].(string)
+        if (etcdPrefix == "")
+        {
+            etcdPrefix = "/vitastor"
+        }
    }
-    return stdout.Bytes(), nil
+    else
+    {
+        ctxVars["etcdPrefix"] = etcdPrefix
+    }
+    return ctxVars, etcdUrl, etcdPrefix
 }

 func invokeCLI(ctxVars map[string]string, args []string) ([]byte, error)
 {
+    if (ctxVars["etcdUrl"] != "")
+    {
+        args = append(args, "--etcd_address", ctxVars["etcdUrl"])
+    }
+    if (ctxVars["etcdPrefix"] != "")
+    {
+        args = append(args, "--etcd_prefix", ctxVars["etcdPrefix"])
+    }
    if (ctxVars["configPath"] != "")
    {
        args = append(args, "--config_path", ctxVars["configPath"])
    }
-    return system("/usr/bin/vitastor-cli", args...)
+    c := exec.Command("/usr/bin/vitastor-cli", args...)
+    var stdout, stderr bytes.Buffer
+    c.Stdout = &stdout
+    c.Stderr = &stderr
+    err := c.Run()
+    stderrStr := string(stderr.Bytes())
+    if (err != nil)
+    {
+        klog.Errorf("vitastor-cli %s failed: %s, status %s\n", strings.Join(args, " "), stderrStr, err)
+        return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
+    }
+    return stdout.Bytes(), nil
 }

 // Create the volume
@@ -161,49 +172,33 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
        volSize = ((capRange.GetRequiredBytes() + MB - 1) / MB) * MB
    }

-    ctxVars, err := GetConnectionParams(req.Parameters)
-    if (err != nil)
+    ctxVars, etcdUrl, _ := GetConnectionParams(req.Parameters)
+    if (len(etcdUrl) == 0)
    {
-        return nil, err
-    }
-
-    args := []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) }
-
-    // Support creation from snapshot
-    var src *csi.VolumeContentSource
-    if (req.VolumeContentSource.GetSnapshot() != nil)
-    {
-        snapId := req.VolumeContentSource.GetSnapshot().GetSnapshotId()
-        if (snapId != "")
-        {
-            snapVars := make(map[string]string)
-            err := json.Unmarshal([]byte(snapId), &snapVars)
-            if (err != nil)
-            {
-                return nil, status.Error(codes.Internal, "volume ID not in JSON format")
-            }
-            args = append(args, "--parent", snapVars["name"]+"@"+snapVars["snapshot"])
-            src = &csi.VolumeContentSource{
-                Type: &csi.VolumeContentSource_Snapshot{
-                    Snapshot: &csi.VolumeContentSource_SnapshotSource{
-                        SnapshotId: snapId,
-                    },
-                },
-            }
-        }
+        return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
    }

    // Create image using vitastor-cli
-    _, err = invokeCLI(ctxVars, args)
+    _, err := invokeCLI(ctxVars, []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) })
    if (err != nil)
    {
        if (strings.Index(err.Error(), "already exists") > 0)
        {
-            inodeCfg, err := invokeList(ctxVars, volName, true)
+            stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", volName })
            if (err != nil)
            {
                return nil, err
            }
+            var inodeCfg []InodeConfig
+            err = json.Unmarshal(stat, &inodeCfg)
+            if (err != nil)
+            {
+                return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
+            }
+            if (len(inodeCfg) == 0)
+            {
+                return nil, status.Error(codes.Internal, "vitastor-cli create said that image already exists, but ls can't find it")
+            }
            if (inodeCfg[0].Size < uint64(volSize))
            {
                return nil, status.Error(codes.Internal, "image "+volName+" is already created, but size is less than expected")
@@ -222,7 +217,6 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
            // Ugly, but VolumeContext isn't passed to DeleteVolume :-(
            VolumeId: string(volumeIdJson),
            CapacityBytes: volSize,
-            ContentSource: src,
        },
    }, nil
 }
@@ -236,19 +230,15 @@ func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVol
        return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
    }

-    volVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.VolumeId), &volVars)
+    ctxVars := make(map[string]string)
+    err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
    if (err != nil)
    {
        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
    }
-    volName := volVars["name"]
+    volName := ctxVars["name"]

-    ctxVars, err := GetConnectionParams(volVars)
-    if (err != nil)
-    {
-        return nil, err
-    }
+    ctxVars, _, _ = GetConnectionParams(ctxVars)

    _, err = invokeCLI(ctxVars, []string{ "rm", volName })
    if (err != nil)
@@ -354,8 +344,6 @@ func (cs *ControllerServer) ControllerGetCapabilities(ctx context.Context, req *
        csi.ControllerServiceCapability_RPC_LIST_VOLUMES,
        csi.ControllerServiceCapability_RPC_EXPAND_VOLUME,
        csi.ControllerServiceCapability_RPC_CREATE_DELETE_SNAPSHOT,
-        csi.ControllerServiceCapability_RPC_LIST_SNAPSHOTS,
-        // TODO: csi.ControllerServiceCapability_RPC_CLONE_VOLUME,
    } {
        controllerServerCapabilities = append(controllerServerCapabilities, functionControllerServerCapabilities(capability))
    }
@@ -365,226 +353,28 @@ func (cs *ControllerServer) ControllerGetCapabilities(ctx context.Context, req *
    }, nil
 }

-func invokeList(ctxVars map[string]string, pattern string, expectExist bool) ([]InodeConfig, error)
-{
-    stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", pattern })
-    if (err != nil)
-    {
-        return nil, err
-    }
-    var inodeCfg []InodeConfig
-    err = json.Unmarshal(stat, &inodeCfg)
-    if (err != nil)
-    {
-        return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
-    }
-    if (expectExist && len(inodeCfg) == 0)
-    {
-        return nil, status.Error(codes.Internal, "Can't find expected image "+pattern+" via vitastor-cli ls")
-    }
-    return inodeCfg, nil
-}
-
 // CreateSnapshot create snapshot of an existing PV
 func (cs *ControllerServer) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequest) (*csi.CreateSnapshotResponse, error)
 {
-    klog.Infof("received controller create snapshot request %+v", protosanitizer.StripSecrets(req))
-    if (req == nil)
-    {
-        return nil, status.Errorf(codes.InvalidArgument, "request cannot be empty")
-    }
-    if (req.SourceVolumeId == "" || req.Name == "")
-    {
-        return nil, status.Error(codes.InvalidArgument, "source volume ID and snapshot name are required fields")
-    }
-
-    // snapshot name
-    snapName := req.Name
-
-    // req.VolumeId is an ugly json string in our case :)
-    ctxVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.SourceVolumeId), &ctxVars)
-    if (err != nil)
-    {
-        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
-    }
-    volName := ctxVars["name"]
-
-    // Create image using vitastor-cli
-    _, err = invokeCLI(ctxVars, []string{ "create", "--snapshot", snapName, volName })
-    if (err != nil && strings.Index(err.Error(), "already exists") <= 0)
-    {
-        return nil, err
-    }
-
-    // Check created snapshot
-    inodeCfg, err := invokeList(ctxVars, volName+"@"+snapName, true)
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    // Use ugly JSON snapshot ID again, DeleteSnapshot doesn't have context :-(
-    ctxVars["snapshot"] = snapName
-    snapIdJson, _ := json.Marshal(ctxVars)
-    return &csi.CreateSnapshotResponse{
-        Snapshot: &csi.Snapshot{
-            SizeBytes: int64(inodeCfg[0].Size),
-            SnapshotId: string(snapIdJson),
-            SourceVolumeId: req.SourceVolumeId,
-            CreationTime: &timestamppb.Timestamp{ Seconds: int64(inodeCfg[0].CreateTs) },
-            ReadyToUse: true,
-        },
-    }, nil
+    return nil, status.Error(codes.Unimplemented, "")
 }

 // DeleteSnapshot delete provided snapshot of a PV
 func (cs *ControllerServer) DeleteSnapshot(ctx context.Context, req *csi.DeleteSnapshotRequest) (*csi.DeleteSnapshotResponse, error)
 {
-    klog.Infof("received controller delete snapshot request %+v", protosanitizer.StripSecrets(req))
-    if (req == nil)
-    {
-        return nil, status.Errorf(codes.InvalidArgument, "request cannot be empty")
-    }
-    if (req.SnapshotId == "")
-    {
-        return nil, status.Error(codes.InvalidArgument, "snapshot ID is a required field")
-    }
-
-    volVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.SnapshotId), &volVars)
-    if (err != nil)
-    {
-        return nil, status.Error(codes.Internal, "snapshot ID not in JSON format")
-    }
-    volName := volVars["name"]
-    snapName := volVars["snapshot"]
-
-    ctxVars, err := GetConnectionParams(volVars)
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    _, err = invokeCLI(ctxVars, []string{ "rm", volName+"@"+snapName })
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    return &csi.DeleteSnapshotResponse{}, nil
+    return nil, status.Error(codes.Unimplemented, "")
 }

 // ListSnapshots list the snapshots of a PV
 func (cs *ControllerServer) ListSnapshots(ctx context.Context, req *csi.ListSnapshotsRequest) (*csi.ListSnapshotsResponse, error)
 {
-    klog.Infof("received controller list snapshots request %+v", protosanitizer.StripSecrets(req))
-    if (req == nil)
-    {
-        return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
-    }
-
-    volVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.SourceVolumeId), &volVars)
-    if (err != nil)
-    {
-        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
-    }
-    volName := volVars["name"]
-    ctxVars, err := GetConnectionParams(volVars)
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    inodeCfg, err := invokeList(ctxVars, volName+"@*", false)
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    resp := &csi.ListSnapshotsResponse{}
-    for _, ino := range inodeCfg
-    {
-        snapName := ino.Name[len(volName)+1:]
-        if (len(req.StartingToken) > 0 && snapName < req.StartingToken)
-        {
-        }
-        else if (req.MaxEntries == 0 || len(resp.Entries) < int(req.MaxEntries))
-        {
-            volVars["snapshot"] = snapName
-            snapIdJson, _ := json.Marshal(volVars)
-            resp.Entries = append(resp.Entries, &csi.ListSnapshotsResponse_Entry{
-                Snapshot: &csi.Snapshot{
-                    SizeBytes: int64(ino.Size),
-                    SnapshotId: string(snapIdJson),
-                    SourceVolumeId: req.SourceVolumeId,
-                    CreationTime: &timestamppb.Timestamp{ Seconds: int64(ino.CreateTs) },
-                    ReadyToUse: true,
-                },
-            })
-        }
-        else
-        {
-            resp.NextToken = snapName
-            break
-        }
-    }
-
-    return resp, nil
+    return nil, status.Error(codes.Unimplemented, "")
 }

-// ControllerExpandVolume increases the size of a volume
+// ControllerExpandVolume is not implemented; it always returns codes.Unimplemented
 func (cs *ControllerServer) ControllerExpandVolume(ctx context.Context, req *csi.ControllerExpandVolumeRequest) (*csi.ControllerExpandVolumeResponse, error)
 {
-    klog.Infof("received controller expand volume request %+v", protosanitizer.StripSecrets(req))
-    if (req == nil)
-    {
-        return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
-    }
-    if (req.VolumeId == "" || req.CapacityRange == nil || req.CapacityRange.RequiredBytes == 0)
-    {
-        return nil, status.Error(codes.InvalidArgument, "VolumeId, CapacityRange and RequiredBytes are required fields")
-    }
-
-    volVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.VolumeId), &volVars)
-    if (err != nil)
-    {
-        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
-    }
-    volName := volVars["name"]
-    ctxVars, err := GetConnectionParams(volVars)
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    inodeCfg, err := invokeList(ctxVars, volName, true)
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    if (req.CapacityRange.RequiredBytes > 0 && inodeCfg[0].Size < uint64(req.CapacityRange.RequiredBytes))
-    {
-        sz := ((req.CapacityRange.RequiredBytes+4095)/4096)*4096
-        _, err := invokeCLI(ctxVars, []string{ "modify", "--inc_size", "1", "--resize", fmt.Sprintf("%d", sz), volName })
-        if (err != nil)
-        {
-            return nil, err
-        }
-        inodeCfg, err = invokeList(ctxVars, volName, true)
-        if (err != nil)
-        {
-            return nil, err
-        }
-    }
-
-    return &csi.ControllerExpandVolumeResponse{
-        CapacityBytes: int64(inodeCfg[0].Size),
-        NodeExpansionRequired: false,
-    }, nil
+    return nil, status.Error(codes.Unimplemented, "")
 }

 // ControllerGetVolume get volume info
--- a/csi/src/identityserver.go
+++ b/csi/src/identityserver.go
@@ -49,13 +49,6 @@ func (is *IdentityServer) GetPluginCapabilities(ctx context.Context, req *csi.Ge
                    },
                },
            },
-            {
-                Type: &csi.PluginCapability_VolumeExpansion_{
-                    VolumeExpansion: &csi.PluginCapability_VolumeExpansion{
-                        Type: csi.PluginCapability_VolumeExpansion_OFFLINE,
-                    },
-                },
-            },
        },
    }, nil
 }
--- a/csi/src/nodeserver.go
+++ b/csi/src/nodeserver.go
@@ -5,14 +5,11 @@ package vitastor

 import (
    "context"
-    "errors"
-    "encoding/json"
    "os"
    "os/exec"
-    "path/filepath"
-    "strconv"
+    "encoding/json"
    "strings"
-    "syscall"
+    "bytes"

    "google.golang.org/grpc/codes"
    "google.golang.org/grpc/status"
@@ -28,91 +25,16 @@ import (
 type NodeServer struct
 {
    *Driver
-    useVduse bool
-    stateDir string
    mounter mount.Interface
 }

-type DeviceState struct
-{
-    ConfigPath string `json:"configPath"`
-    VdpaId     string `json:"vdpaId"`
-    Image      string `json:"image"`
-    Blockdev   string `json:"blockdev"`
-    Readonly   bool   `json:"readonly"`
-    PidFile    string `json:"pidFile"`
-}
-
 // NewNodeServer create new instance node
 func NewNodeServer(driver *Driver) *NodeServer
 {
-    stateDir := os.Getenv("STATE_DIR")
-    if (stateDir == "")
-    {
-        stateDir = "/run/vitastor-csi"
-    }
-    if (stateDir[len(stateDir)-1] != '/')
-    {
-        stateDir += "/"
-    }
-    ns := &NodeServer{
+    return &NodeServer{
        Driver: driver,
-        useVduse: checkVduseSupport(),
-        stateDir: stateDir,
        mounter: mount.New(""),
    }
-    if (ns.useVduse)
-    {
-        ns.restoreVduseDaemons()
-    }
-    return ns
-}
-
-func checkVduseSupport() bool
-{
-    // Check VDUSE support (vdpa, vduse, virtio-vdpa kernel modules)
-    vduse := true
-    for _, mod := range []string{"vdpa", "vduse", "virtio-vdpa"}
-    {
-        _, err := os.Stat("/sys/module/"+mod)
-        if (err != nil)
-        {
-            if (!errors.Is(err, os.ErrNotExist))
-            {
-                klog.Errorf("failed to check /sys/module/%s: %v", mod, err)
-            }
-            c := exec.Command("/sbin/modprobe", mod)
-            c.Stdout = os.Stderr
-            c.Stderr = os.Stderr
-            err := c.Run()
-            if (err != nil)
-            {
-                klog.Errorf("/sbin/modprobe %s failed: %v", mod, err)
-                vduse = false
-                break
-            }
-        }
-    }
-    // Check that vdpa tool functions
-    if (vduse)
-    {
-        c := exec.Command("/sbin/vdpa", "-j", "dev")
-        c.Stderr = os.Stderr
-        err := c.Run()
-        if (err != nil)
-        {
-            klog.Errorf("/sbin/vdpa -j dev failed: %v", err)
-            vduse = false
-        }
-    }
-    if (!vduse)
-    {
-        klog.Errorf(
-            "Your host apparently has no VDUSE support. VDUSE support disabled, NBD will be used to map devices."+
-            " For VDUSE you need at least Linux 5.15 and the following kernel modules: vdpa, virtio-vdpa, vduse.",
-        )
-    }
-    return vduse
 }

 // NodeStageVolume mounts the volume to a staging path on the node.
@@ -139,303 +61,6 @@ func Contains(list []string, s string) bool
    return false
 }

-func (ns *NodeServer) mapNbd(volName string, ctxVars map[string]string, readonly bool) (string, error)
-{
-    // Map NBD device
-    // FIXME: Check if already mapped
-    args := []string{
-        "map", "--image", volName,
-    }
-    if (ctxVars["configPath"] != "")
-    {
-        args = append(args, "--config_path", ctxVars["configPath"])
-    }
-    if (readonly)
-    {
-        args = append(args, "--readonly", "1")
-    }
-    dev, err := system("/usr/bin/vitastor-nbd", args...)
-    return strings.TrimSpace(string(dev)), err
-}
-
-func (ns *NodeServer) unmapNbd(devicePath string)
-{
-    // unmap NBD device
-    unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
-    if (unmapErr != nil)
-    {
-        klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
-    }
-}
-
-func findByPidFile(pidFile string) (*os.Process, error)
-{
-    pidBuf, err := os.ReadFile(pidFile)
-    if (err != nil)
-    {
-        return nil, err
-    }
-    pid, err := strconv.ParseInt(strings.TrimSpace(string(pidBuf)), 0, 64)
-    if (err != nil)
-    {
-        return nil, err
-    }
-    proc, err := os.FindProcess(int(pid))
-    if (err != nil)
-    {
-        return nil, err
-    }
-    return proc, nil
-}
-
-func killByPidFile(pidFile string) error
-{
-    proc, err := findByPidFile(pidFile)
-    if (err != nil)
-    {
-        return err
-    }
-    return proc.Signal(syscall.SIGTERM)
-}
-
-func startStorageDaemon(vdpaId, volName, pidFile, configPath string, readonly bool) error
-{
-    // Start qemu-storage-daemon
-    blockSpec := map[string]interface{}{
-        "node-name": "disk1",
-        "driver": "vitastor",
-        "image": volName,
-        "cache": map[string]bool{
-            "direct": true,
-            "no-flush": false,
-        },
-        "discard": "unmap",
-    }
-    if (configPath != "")
-    {
-        blockSpec["config-path"] = configPath
-    }
-    blockSpecJson, _ := json.Marshal(blockSpec)
-    writable := "true"
-    if (readonly)
-    {
-        writable = "false"
-    }
-    _, err := system(
-        "/usr/bin/qemu-storage-daemon", "--daemonize", "--pidfile", pidFile, "--blockdev", string(blockSpecJson),
-        "--export", "vduse-blk,id="+vdpaId+",node-name=disk1,name="+vdpaId+",num-queues=16,queue-size=128,writable="+writable,
-    )
-    return err
-}
-
-func (ns *NodeServer) mapVduse(volName string, ctxVars map[string]string, readonly bool) (string, string, error)
-{
-    // Generate state file
-    stateFd, err := os.CreateTemp(ns.stateDir, "vitastor-vduse-*.json")
-    if (err != nil)
-    {
-        return "", "", status.Error(codes.Internal, err.Error())
-    }
-    stateFile := stateFd.Name()
-    stateFd.Close()
-    vdpaId := filepath.Base(stateFile)
-    vdpaId = vdpaId[0:len(vdpaId)-5] // remove ".json"
-    pidFile := ns.stateDir + vdpaId + ".pid"
-    // Map VDUSE device via qemu-storage-daemon
-    err = startStorageDaemon(vdpaId, volName, pidFile, ctxVars["configPath"], readonly)
-    if (err == nil)
-    {
-        // Add device to VDPA bus
-        _, err = system("/sbin/vdpa", "-j", "dev", "add", "name", vdpaId, "mgmtdev", "vduse")
-        if (err == nil)
-        {
-            // Find block device name
-            matches, err := filepath.Glob("/sys/bus/vdpa/devices/"+vdpaId+"/virtio*/block/*")
-            if (err == nil && len(matches) == 0)
-            {
-                err = errors.New("/sys/bus/vdpa/devices/"+vdpaId+"/virtio*/block/* is not found")
-            }
-            if (err == nil)
-            {
-                blockdev := "/dev/"+filepath.Base(matches[0])
-                _, err = os.Stat(blockdev)
-                if (err == nil)
-                {
-                    // Generate state file
-                    stateJSON, _ := json.Marshal(&DeviceState{
-                        ConfigPath: ctxVars["configPath"],
-                        VdpaId:     vdpaId,
-                        Image:      volName,
-                        Blockdev:   blockdev,
-                        Readonly:   readonly,
-                        PidFile:    pidFile,
-                    })
-                    err = os.WriteFile(stateFile, stateJSON, 0600)
-                    if (err == nil)
-                    {
-                        return blockdev, vdpaId, nil
-                    }
-                }
-            }
-            if (err != nil)
-            {
-                err = status.Error(codes.Internal, err.Error())
-            }
-        }
-        if (err != nil)
-        {
-            killErr := killByPidFile(pidFile)
-            if (killErr != nil)
-            {
-                klog.Errorf("Failed to kill started qemu-storage-daemon: %v", killErr)
-            }
-            os.Remove(stateFile)
-            os.Remove(pidFile)
-        }
-    }
-    return "", "", err
-}
-
-func (ns *NodeServer) unmapVduse(devicePath string)
-{
-    if (len(devicePath) < 6 || devicePath[0:6] != "/dev/v")
-    {
-        klog.Errorf("%s does not start with /dev/v", devicePath)
-        return
-    }
-    vduseDev, err := os.Readlink("/sys/block/"+devicePath[5:])
-    if (err != nil)
-    {
-        klog.Errorf("%s is not a symbolic link to VDUSE device (../devices/virtual/vduse/xxx): %v", devicePath, err)
-        return
-    }
-    vdpaId := ""
-    p := strings.Index(vduseDev, "/vduse/")
-    if (p >= 0)
-    {
-        vduseDev = vduseDev[p+7:]
-        p = strings.Index(vduseDev, "/")
-        if (p >= 0)
-        {
-            vdpaId = vduseDev[0:p]
-        }
-    }
-    if (vdpaId == "")
-    {
-        klog.Errorf("%s is not a symbolic link to VDUSE device (../devices/virtual/vduse/xxx), but is %v", devicePath, vduseDev)
-        return
-    }
-    ns.unmapVduseById(vdpaId)
-}
-
-func (ns *NodeServer) unmapVduseById(vdpaId string)
-{
-    _, err := os.Stat("/sys/bus/vdpa/devices/"+vdpaId)
-    if (err != nil)
-    {
-        klog.Errorf("failed to stat /sys/bus/vdpa/devices/"+vdpaId+": %v", err)
-    }
-    else
-    {
-        _, _ = system("/sbin/vdpa", "-j", "dev", "del", vdpaId)
-    }
-    stateFile := ns.stateDir + vdpaId + ".json"
-    os.Remove(stateFile)
-    pidFile := ns.stateDir + vdpaId + ".pid"
-    _, err = os.Stat(pidFile)
-    if (os.IsNotExist(err))
-    {
-        // ok, already killed
-    }
-    else if (err != nil)
-    {
-        klog.Errorf("Failed to stat %v: %v", pidFile, err)
-        return
-    }
-    else
-    {
-        err = killByPidFile(pidFile)
-        if (err != nil)
-        {
-            klog.Errorf("Failed to kill started qemu-storage-daemon: %v", err)
-        }
-        os.Remove(pidFile)
-    }
-}
-
-func (ns *NodeServer) restoreVduseDaemons()
-{
-    pattern := ns.stateDir+"vitastor-vduse-*.json"
-    matches, err := filepath.Glob(pattern)
-    if (err != nil)
-    {
-        klog.Errorf("failed to list %s: %v", pattern, err)
-    }
-    if (len(matches) == 0)
-    {
-        return
-    }
-    devList := make(map[string]interface{})
-    // example output: {"dev":{"test1":{"type":"block","mgmtdev":"vduse","vendor_id":0,"max_vqs":16,"max_vq_size":128}}}
-    devListJSON, err := system("/sbin/vdpa", "-j", "dev", "list")
-    if (err != nil)
-    {
-        return
-    }
-    err = json.Unmarshal(devListJSON, &devList)
-    devs, ok := devList["dev"].(map[string]interface{})
-    if (err != nil || !ok)
-    {
-        klog.Errorf("/sbin/vdpa -j dev list returned bad JSON (error %v): %v", err, string(devListJSON))
-        return
-    }
-    for _, stateFile := range matches
-    {
-        vdpaId := filepath.Base(stateFile)
-        vdpaId = vdpaId[0:len(vdpaId)-5]
-        // Check if VDPA device is still added to the bus
-        if (devs[vdpaId] != nil)
-        {
-            // Check if the storage daemon is still active
-            pidFile := ns.stateDir + vdpaId + ".pid"
-            exists := false
-            proc, err := findByPidFile(pidFile)
-            if (err == nil)
-            {
-                exists = proc.Signal(syscall.Signal(0)) == nil
-            }
-            if (!exists)
-            {
-                // Restart daemon
-                stateJSON, err := os.ReadFile(stateFile)
-                if (err != nil)
-                {
-                    klog.Warningf("error reading state file %v: %v", stateFile, err)
-                }
-                else
-                {
-                    var state DeviceState
-                    err := json.Unmarshal(stateJSON, &state)
-                    if (err != nil)
-                    {
-                        klog.Warningf("state file %v contains invalid JSON (error %v): %v", stateFile, err, string(stateJSON))
-                    }
-                    else
-                    {
-                        klog.Warningf("restarting storage daemon for volume %v (VDPA ID %v)", state.Image, vdpaId)
-                        _ = startStorageDaemon(vdpaId, state.Image, pidFile, state.ConfigPath, state.Readonly)
-                    }
-                }
-            }
-        }
-        else
-        {
-            // Unused, clean it up
-            ns.unmapVduseById(vdpaId)
-        }
-    }
-}
-
 // NodePublishVolume mounts the volume mounted to the staging path to the target path
 func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolumeRequest) (*csi.NodePublishVolumeResponse, error)
 {
@@ -445,10 +70,10 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
    isBlock := req.GetVolumeCapability().GetBlock() != nil

    // Check that it's not already mounted
-    _, err := mount.IsNotMountPoint(ns.mounter, targetPath)
-    if (err != nil)
+    _, error := mount.IsNotMountPoint(ns.mounter, targetPath)
+    if (error != nil)
    {
-        if (os.IsNotExist(err))
+        if (os.IsNotExist(error))
        {
            if (isBlock)
            {
@@ -477,106 +102,115 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
        }
        else
        {
-            return nil, status.Error(codes.Internal, err.Error())
+            return nil, status.Error(codes.Internal, error.Error())
        }
    }

    ctxVars := make(map[string]string)
-    err = json.Unmarshal([]byte(req.VolumeId), &ctxVars)
+    err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
    if (err != nil)
    {
        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
    }
    volName := ctxVars["name"]

-    _, err = GetConnectionParams(ctxVars)
-    if (err != nil)
+    _, etcdUrl, etcdPrefix := GetConnectionParams(ctxVars)
+    if (len(etcdUrl) == 0)
    {
-        return nil, err
+        return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
    }

-    var devicePath, vdpaId string
-    if (!ns.useVduse)
+    // Map NBD device
+    // FIXME: Check if already mapped
+    args := []string{
+        "map", "--etcd_address", strings.Join(etcdUrl, ","),
+        "--etcd_prefix", etcdPrefix,
+        "--image", volName,
+    };
+    if (ctxVars["configPath"] != "")
    {
-        devicePath, err = ns.mapNbd(volName, ctxVars, req.GetReadonly())
+        args = append(args, "--config_path", ctxVars["configPath"])
    }
-    else
+    if (req.GetReadonly())
    {
-        devicePath, vdpaId, err = ns.mapVduse(volName, ctxVars, req.GetReadonly())
+        args = append(args, "--readonly", "1")
    }
+    c := exec.Command("/usr/bin/vitastor-nbd", args...)
+    var stdout, stderr bytes.Buffer
+    c.Stdout, c.Stderr = &stdout, &stderr
+    err = c.Run()
+    stdoutStr, stderrStr := string(stdout.Bytes()), string(stderr.Bytes())
    if (err != nil)
    {
-        return nil, err
+        klog.Errorf("vitastor-nbd map failed: %s, status %s\n", stdoutStr+stderrStr, err)
+        return nil, status.Error(codes.Internal, stdoutStr+stderrStr+" (status "+err.Error()+")")
    }
+    devicePath := strings.TrimSpace(stdoutStr)

+    // Check existing format
    diskMounter := &mount.SafeFormatAndMount{Interface: ns.mounter, Exec: utilexec.New()}
+    existingFormat, err := diskMounter.GetDiskFormat(devicePath)
+    if (err != nil)
+    {
+        klog.Errorf("failed to get disk format for path %s, error: %v", devicePath, err)
+        // best-effort unmap so that the just-mapped NBD device is not leaked on failure
+        unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
+        if (unmapErr != nil)
+        {
+            klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
+        }
+        return nil, err
+    }
+
+    // Format the device (ext4 or xfs)
+    fsType := req.GetVolumeCapability().GetMount().GetFsType()
+    opt := req.GetVolumeCapability().GetMount().GetMountFlags()
+    opt = append(opt, "_netdev")
+    if ((req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY ||
+        req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_SINGLE_NODE_READER_ONLY) &&
+        !Contains(opt, "ro"))
+    {
+        opt = append(opt, "ro")
+    }
+    if (fsType == "xfs")
+    {
+        opt = append(opt, "nouuid")
+    }
+    readOnly := Contains(opt, "ro")
+    if (existingFormat == "" && !readOnly)
+    {
+        args := []string{}
+        switch fsType
+        {
+            case "ext4":
+                args = []string{"-m0", "-Enodiscard,lazy_itable_init=1,lazy_journal_init=1", devicePath}
+            case "xfs":
+                args = []string{"-K", devicePath}
+        }
+        if (len(args) > 0)
+        {
+            cmdOut, cmdErr := diskMounter.Exec.Command("mkfs."+fsType, args...).CombinedOutput()
+            if (cmdErr != nil)
+            {
+                klog.Errorf("failed to run mkfs error: %v, output: %v", cmdErr, string(cmdOut))
+                // unmap NBD device
+                unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
+                if (unmapErr != nil)
+                {
+                    klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
+                }
+                return nil, status.Error(codes.Internal, cmdErr.Error())
+            }
+        }
+    }
    if (isBlock)
    {
-        err = diskMounter.Mount(devicePath, targetPath, "", []string{"bind"})
+        opt = append(opt, "bind")
+        err = diskMounter.Mount(devicePath, targetPath, fsType, opt)
    }
    else
    {
-        // Check existing format
-        existingFormat, err := diskMounter.GetDiskFormat(devicePath)
-        if (err != nil)
-        {
-            klog.Errorf("failed to get disk format for path %s, error: %v", err)
-            goto unmap
-        }
-
-        // Format the device (ext4 or xfs)
-        fsType := req.GetVolumeCapability().GetMount().GetFsType()
-        opt := req.GetVolumeCapability().GetMount().GetMountFlags()
-        opt = append(opt, "_netdev")
-        if ((req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY ||
-            req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_SINGLE_NODE_READER_ONLY) &&
-            !Contains(opt, "ro"))
-        {
-            opt = append(opt, "ro")
-        }
-        if (fsType == "xfs")
-        {
-            opt = append(opt, "nouuid")
-        }
-        readOnly := Contains(opt, "ro")
-        if (existingFormat == "" && !readOnly)
-        {
-            var cmdOut []byte
-            switch fsType
-            {
-                case "ext4":
-                    args := []string{"-m0", "-Enodiscard,lazy_itable_init=1,lazy_journal_init=1", devicePath}
-                    cmdOut, err = diskMounter.Exec.Command("mkfs.ext4", args...).CombinedOutput()
-                case "xfs":
-                    cmdOut, err = diskMounter.Exec.Command("mkfs.xfs", "-K", devicePath).CombinedOutput()
-            }
-            if (err != nil)
-            {
-                klog.Errorf("failed to run mkfs error: %v, output: %v", err, string(cmdOut))
-                goto unmap
-            }
-        }
-
        err = diskMounter.FormatAndMount(devicePath, targetPath, fsType, opt)
-
-        // Try to run online resize on mount.
-        // FIXME: Implement online resize. It requires online resize support in vitastor-nbd.
-        if (err == nil && existingFormat != "" && !readOnly)
-        {
-            var cmdOut []byte
-            switch (fsType)
-            {
-                case "ext4":
-                    cmdOut, err = diskMounter.Exec.Command("resize2fs", devicePath).CombinedOutput()
-                case "xfs":
-                    cmdOut, err = diskMounter.Exec.Command("xfs_growfs", devicePath).CombinedOutput()
-            }
-            if (err != nil)
-            {
-                klog.Errorf("failed to run resizefs error: %v, output: %v", err, string(cmdOut))
-                goto unmap
-            }
-        }
    }
    if (err != nil)
    {
@@ -584,20 +218,15 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
            "failed to mount device path (%s) to path (%s) for volume (%s) error: %s",
            devicePath, targetPath, volName, err,
        )
-        goto unmap
+        // unmap NBD device
+        unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
+        if (unmapErr != nil)
+        {
+            klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
+        }
+        return nil, status.Error(codes.Internal, err.Error())
    }
    return &csi.NodePublishVolumeResponse{}, nil
-
-unmap:
-    if (!ns.useVduse || len(devicePath) >= 8 && devicePath[0:8] == "/dev/nbd")
-    {
-        ns.unmapNbd(devicePath)
-    }
-    else
-    {
-        ns.unmapVduseById(vdpaId)
-    }
-    return nil, status.Error(codes.Internal, err.Error())
 }

 // NodeUnpublishVolume unmounts the volume from the target path
@@ -616,10 +245,7 @@ func (ns *NodeServer) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpu
    }
    if (devicePath == "")
    {
-        // volume not mounted
-        klog.Warningf("%s is not a mountpoint, deleting", targetPath)
-        os.Remove(targetPath)
-        return &csi.NodeUnpublishVolumeResponse{}, nil
+        return nil, status.Error(codes.NotFound, "Volume not mounted")
    }
    // unmount
    err = mount.CleanupMountPoint(targetPath, ns.mounter, false)
@@ -630,13 +256,10 @@ func (ns *NodeServer) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpu
    // unmap NBD device
    if (refCount == 1)
    {
-        if (!ns.useVduse)
+        unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
+        if (unmapErr != nil)
        {
-            ns.unmapNbd(devicePath)
-        }
-        else
-        {
-            ns.unmapVduse(devicePath)
+            klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
        }
    }
    return &csi.NodeUnpublishVolumeResponse{}, nil
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,10 +1,10 @@
-vitastor (1.3.1-1) unstable; urgency=medium
+vitastor (1.0.0-1) unstable; urgency=medium

  * Bugfixes

 -- Vitaliy Filippov <vitalif@yourcmc.ru>  Fri, 03 Jun 2022 02:09:44 +0300

-vitastor (0.7.0-1) unstable; urgency=medium
+vitastor (1.0.0-1) unstable; urgency=medium

  * Implement NFS proxy
  * Add documentation
--- a/debian/control
+++ b/debian/control
@@ -2,7 +2,7 @@ Source: vitastor
 Section: admin
 Priority: optional
 Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
-Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev, libisal-dev, cmake, pkg-config
+Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev, libisal-dev
 Standards-Version: 4.5.0
 Homepage: https://vitastor.io/
 Rules-Requires-Root: no
--- a/debian/patched-qemu.Dockerfile
+++ b/debian/patched-qemu.Dockerfile
@@ -54,8 +54,7 @@ RUN set -e; \
    quilt add block/vitastor.c; \
    cp /root/vitastor/src/qemu_driver.c block/vitastor.c; \
    quilt refresh; \
-    V=$(head -n1 debian/changelog | perl -pe 's/5\.2\+dfsg-9/5.2+dfsg-11/; s/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor4; \
-    if [ "$REL" = bullseye ]; then V=${V}bullseye; fi; \
+    V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor3; \
    DEBEMAIL="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v $V 'Plug Vitastor block driver'; \
    DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
    rm -rf /root/packages/qemu-$REL/qemu-*/
--- a/debian/vitastor.Dockerfile
+++ b/debian/vitastor.Dockerfile
@@ -35,8 +35,8 @@ RUN set -e -x; \
    mkdir -p /root/packages/vitastor-$REL; \
    rm -rf /root/packages/vitastor-$REL/*; \
    cd /root/packages/vitastor-$REL; \
-    cp -r /root/vitastor vitastor-1.3.1; \
-    cd vitastor-1.3.1; \
+    cp -r /root/vitastor vitastor-1.0.0; \
+    cd vitastor-1.0.0; \
    ln -s /root/fio-build/fio-*/ ./fio; \
    FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
    ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -49,8 +49,8 @@ RUN set -e -x; \
    rm -rf a b; \
    echo "dep:fio=$FIO" > debian/fio_version; \
    cd /root/packages/vitastor-$REL; \
-    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.3.1.orig.tar.xz vitastor-1.3.1; \
-    cd vitastor-1.3.1; \
+    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.0.0.orig.tar.xz vitastor-1.0.0; \
+    cd vitastor-1.0.0; \
    V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
    DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
    DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
--- a/docs/config.en.md
+++ b/docs/config.en.md
@@ -33,7 +33,6 @@ In the future, additional configuration methods may be added:

 - [Common](config/common.en.md)
 - [Network](config/network.en.md)
- [Client](config/client.en.md)
 - [Global Disk Layout](config/layout-cluster.en.md)
 - [OSD Disk Layout](config/layout-osd.en.md)
 - [OSD Runtime Parameters](config/osd.en.md)
--- a/docs/config.ru.md
+++ b/docs/config.ru.md
@@ -36,7 +36,6 @@

 - [Общие](config/common.ru.md)
 - [Сеть](config/network.ru.md)
- [Клиентский код](config/client.ru.md)
 - [Глобальные дисковые параметры](config/layout-cluster.ru.md)
 - [Дисковые параметры OSD](config/layout-osd.ru.md)
 - [Прочие параметры OSD](config/osd.ru.md)
--- a/docs/config/client.en.md
+++ b/docs/config/client.en.md
@@ -1,137 +0,0 @@
-[Documentation](../../README.md#documentation) → [Configuration](../config.en.md) → Client Parameters
-
-----
-
-[Читать на русском](client.ru.md)
-
-# Client Parameters
-
-These parameters apply only to clients and affect their interaction with
-the cluster.
-
- [client_max_dirty_bytes](#client_max_dirty_bytes)
- [client_max_dirty_ops](#client_max_dirty_ops)
- [client_enable_writeback](#client_enable_writeback)
- [client_max_buffered_bytes](#client_max_buffered_bytes)
- [client_max_buffered_ops](#client_max_buffered_ops)
- [client_max_writeback_iodepth](#client_max_writeback_iodepth)
- [nbd_timeout](#nbd_timeout)
- [nbd_max_devices](#nbd_max_devices)
- [nbd_max_part](#nbd_max_part)
-
-## client_max_dirty_bytes
-
- Type: integer
- Default: 33554432
- Can be changed online: yes
-
-Without [immediate_commit](layout-cluster.en.md#immediate_commit)=all this parameter sets the limit of "dirty"
-(not committed by fsync) data allowed by the client before forcing an
-additional fsync and committing the data. Also note that the client always
-holds a copy of uncommitted data in memory so this setting also affects
-RAM usage of clients.
-
-## client_max_dirty_ops
-
- Type: integer
- Default: 1024
- Can be changed online: yes
-
-Same as client_max_dirty_bytes, but instead of total size, limits the number
-of uncommitted write operations.
-
-## client_enable_writeback
-
- Type: boolean
- Default: false
- Can be changed online: yes
-
-This parameter enables client-side write buffering. This means that write
-requests are accumulated in memory for a short time before being sent to
-a Vitastor cluster, which allows sending them in parallel and increases
-performance of some applications. Writes are buffered until the client forces
-a flush with fsync() or until the amount of buffered writes exceeds the
-limit.
-
-Write buffering significantly increases performance of some applications,
-for example, CrystalDiskMark under Windows (LOL :-D), but also any other
-applications if they do writes in one of two non-optimal ways: either if
-they do a lot of small (4 kb or so) sequential writes, or if they do a lot
-of small random writes, but without any parallelism or asynchrony, and also
-without calling fsync().
-
-With write buffering enabled, you can expect around 22000 T1Q1 random write
-iops in QEMU more or less regardless of the quality of your SSDs, and this
-number is in fact bound by QEMU itself rather than Vitastor (check it
-yourself by adding a "driver=null-co" disk in QEMU). Without write
-buffering, the current record is 9900 iops, but the number is usually
-even lower with non-ideal hardware, for example, it may be 5000 iops.
-
-Even when this parameter is enabled, write buffering isn't enabled until
-the client explicitly allows it, because enabling it without the client
-being aware of the fact that its writes may be buffered may lead to data
-loss. Because of this, older versions of clients don't support write
-buffering at all, newer versions of the QEMU driver allow write buffering
-only if it's enabled in disk settings with `-blockdev cache.direct=false`,
-and newer versions of FIO only allow write buffering if you don't specify
-`-direct=1`. NBD and NFS drivers allow write buffering by default.
-
-You can overcome this restriction too with the `client_writeback_allowed`
-parameter, but you shouldn't do that unless you **really** know what you
-are doing.
-
-## client_max_buffered_bytes
-
- Type: integer
- Default: 33554432
- Can be changed online: yes
-
-Maximum total size of buffered writes which triggers write-back when reached.
-
-## client_max_buffered_ops
-
- Type: integer
- Default: 1024
- Can be changed online: yes
-
-Maximum number of buffered writes which triggers write-back when reached.
-Multiple consecutive modified data regions are counted as 1 write here.
-
-## client_max_writeback_iodepth
-
- Type: integer
- Default: 256
- Can be changed online: yes
-
-Maximum number of parallel writes when flushing buffered data to the server.
-
-## nbd_timeout
-
- Type: seconds
- Default: 300
-
-Timeout for I/O operations for [NBD](../usage/nbd.en.md). If an operation
-executes for longer than this timeout, including when your cluster is just
-temporarily down for more than timeout, the NBD device will detach by itself
-(and possibly break the mounted file system).
-
-You can set timeout to 0 to never detach, but in that case you won't be
-able to remove the kernel device at all if the NBD process dies - you'll have
-to reboot the host.
-
-## nbd_max_devices
-
- Type: integer
- Default: 64
-
-Maximum number of NBD devices in the system. This value is passed as
-`nbds_max` parameter for the nbd kernel module when vitastor-nbd autoloads it.
-
-## nbd_max_part
-
- Type: integer
- Default: 3
-
-Maximum number of partitions per NBD device. This value is passed as
-`max_part` parameter for the nbd kernel module when vitastor-nbd autoloads it.
-Note that (nbds_max)*(1+max_part) usually can't exceed 256.
--- a/docs/config/client.ru.md
+++ b/docs/config/client.ru.md
@@ -1,137 +0,0 @@
-[Документация](../../README-ru.md#документация) → [Конфигурация](../config.ru.md) → Параметры клиентского кода
-
-----
-
-[Read in English](client.en.md)
-
-# Параметры клиентского кода
-
-Данные параметры применяются только к клиентам Vitastor (QEMU, fio, NBD) и
-затрагивают логику их работы с кластером.
-
- [client_max_dirty_bytes](#client_max_dirty_bytes)
- [client_max_dirty_ops](#client_max_dirty_ops)
- [client_enable_writeback](#client_enable_writeback)
- [client_max_buffered_bytes](#client_max_buffered_bytes)
- [client_max_buffered_ops](#client_max_buffered_ops)
- [client_max_writeback_iodepth](#client_max_writeback_iodepth)
- [nbd_timeout](#nbd_timeout)
- [nbd_max_devices](#nbd_max_devices)
- [nbd_max_part](#nbd_max_part)
-
-## client_max_dirty_bytes
-
- Тип: целое число
- Значение по умолчанию: 33554432
- Можно менять на лету: да
-
-При работе без [immediate_commit](layout-cluster.ru.md#immediate_commit)=all - это лимит объёма "грязных" (не
-зафиксированных fsync-ом) данных, при достижении которого клиент будет
-принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
-что в этом случае до момента fsync клиент хранит копию незафиксированных
-данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
-
-## client_max_dirty_ops
-
- Тип: целое число
- Значение по умолчанию: 1024
- Можно менять на лету: да
-
-Аналогично client_max_dirty_bytes, но ограничивает количество
-незафиксированных операций записи вместо их общего объёма.
-
-## client_enable_writeback
-
- Тип: булево (да/нет)
- Значение по умолчанию: false
- Можно менять на лету: да
-
-Данный параметр разрешает включать буферизацию записи в памяти. Буферизация
-означает, что операции записи отправляются на кластер Vitastor не сразу, а
-могут небольшое время накапливаться в памяти и сбрасываться сразу пакетами,
-до тех пор, пока либо не будет превышен лимит неотправленных записей, либо
-пока клиент не вызовет fsync.
-
-Буферизация значительно повышает производительность некоторых приложений,
-например, CrystalDiskMark в Windows (ха-ха :-D), но также и любых других,
-которые пишут на диск неоптимально: либо последовательно, но мелкими блоками
-(например, по 4 кб), либо случайно, но без параллелизма и без fsync - то
-есть, например, отправляя 128 операций записи в разные места диска, но не
-все сразу с помощью асинхронного I/O, а по одной.
-
-В QEMU с буферизацией записи можно ожидать показателя примерно 22000
-операций случайной записи в секунду в 1 поток и с глубиной очереди 1 (T1Q1)
-без fsync, почти вне зависимости от того, насколько хороши ваши диски - эта
-цифра упирается в сам QEMU. Без буферизации рекорд пока что - 9900 операций
-в секунду, но на железе похуже может быть и поменьше, например, 5000 операций
-в секунду.
-
-При этом, даже если данный параметр включён, буферизация не включается, если
-явно не разрешена клиентом, т.к. если клиент не знает, что запросы записи
-буферизуются, это может приводить к потере данных. Поэтому в старых версиях
-клиентских драйверов буферизация записи не включается вообще, в новых
-версиях QEMU-драйвера включается, только если разрешена опцией диска
-`-blockdev cache.direct=false`, а в fio - только если нет опции `-direct=1`.
-В NBD и NFS драйверах буферизация записи разрешена по умолчанию.
-
-Можно обойти и это ограничение с помощью параметра `client_writeback_allowed`,
-но делать так не надо, если только вы не уверены в том, что делаете, на все
-100%. :-)
-
-## client_max_buffered_bytes
-
- Тип: целое число
- Значение по умолчанию: 33554432
- Можно менять на лету: да
-
-Максимальный общий размер буферизованных записей, при достижении которого
-начинается процесс сброса данных на сервер.
-
-## client_max_buffered_ops
-
- Тип: целое число
- Значение по умолчанию: 1024
- Можно менять на лету: да
-
-Максимальное количество буферизованных записей, при достижении которого
-начинается процесс сброса данных на сервер. При этом несколько
-последовательных изменённых областей здесь считаются 1 записью.
-
-## client_max_writeback_iodepth
-
- Тип: целое число
- Значение по умолчанию: 256
- Можно менять на лету: да
-
-Максимальное число параллельных операций записи при сбросе буферов на сервер.
-
-## nbd_timeout
-
- Тип: секунды
- Значение по умолчанию: 300
-
-Таймаут для операций чтения/записи через [NBD](../usage/nbd.ru.md). Если
-операция выполняется дольше таймаута, включая временную недоступность
-кластера на время, большее таймаута, NBD-устройство отключится само собой
-(и, возможно, сломает примонтированную ФС).
-
-Вы можете установить таймаут в 0, чтобы никогда не отключать устройство по
-таймауту, но в этом случае вы вообще не сможете удалить устройство, если
-процесс NBD умрёт - вам придётся перезагружать сервер.
-
-## nbd_max_devices
-
- Тип: целое число
- Значение по умолчанию: 64
-
-Максимальное число NBD-устройств в системе. Данное значение передаётся
-модулю ядра nbd как параметр `nbds_max`, когда его загружает vitastor-nbd.
-
-## nbd_max_part
-
- Тип: целое число
- Значение по умолчанию: 3
-
-Максимальное число разделов на одном NBD-устройстве. Данное значение передаётся
-модулю ядра nbd как параметр `max_part`, когда его загружает vitastor-nbd.
-Имейте в виду, что (nbds_max)*(1+max_part) обычно не может превышать 256.
--- a/docs/config/network.en.md
+++ b/docs/config/network.en.md
@@ -20,7 +20,6 @@ between clients, OSDs and etcd.
 - [rdma_max_msg](#rdma_max_msg)
 - [rdma_max_recv](#rdma_max_recv)
 - [rdma_max_send](#rdma_max_send)
- [rdma_odp](#rdma_odp)
 - [peer_connect_interval](#peer_connect_interval)
 - [peer_connect_timeout](#peer_connect_timeout)
 - [osd_idle_timeout](#osd_idle_timeout)
@@ -31,6 +30,7 @@ between clients, OSDs and etcd.
 - [etcd_slow_timeout](#etcd_slow_timeout)
 - [etcd_keepalive_timeout](#etcd_keepalive_timeout)
 - [etcd_ws_keepalive_timeout](#etcd_ws_keepalive_timeout)
+- [client_dirty_limit](#client_dirty_limit)

 ## tcp_header_buffer_size

@@ -69,14 +69,11 @@ but they are not connected to the cluster.
 - Type: string

 RDMA device name to use for Vitastor OSD communications (for example,
-"rocep5s0f0"). Now Vitastor supports all adapters, even ones without
-ODP support, like Mellanox ConnectX-3 and non-Mellanox cards.
-
-Versions up to Vitastor 1.2.0 required ODP which is only present in
-Mellanox ConnectX >= 4. See also [rdma_odp](#rdma_odp).
-
-Run `ibv_devinfo -v` as root to list available RDMA devices and their
-features.
+"rocep5s0f0"). Please note that Vitastor RDMA requires Implicit On-Demand
+Paging (Implicit ODP) and Scatter/Gather (SG) support from the RDMA device
+to work. For example, Mellanox ConnectX-3 and older adapters don't have
+Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
+root to list available RDMA devices and their features.

 Remember that you also have to configure your network switches if you use
 RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
@@ -151,28 +148,6 @@ less than `rdma_max_recv` so the receiving side doesn't run out of buffers.
 Doesn't affect memory usage - additional memory isn't allocated for send
 operations.

-## rdma_odp
-
- Type: boolean
- Default: false
-
-Use RDMA with On-Demand Paging. ODP is currently only available on Mellanox
-ConnectX-4 and newer adapters. ODP allows to not register memory explicitly
-for RDMA adapter to be able to use it. This, in turn, allows to skip memory
-copying during sending. One would think this should improve performance, but
-**in reality** RDMA performance with ODP is **drastically** worse. Example
-3-node cluster with 8 NVMe in each node and 2*25 GBit/s ConnectX-6 RDMA network
-without ODP pushes 3950000 read iops, but only 239000 iops with ODP...
-
-This happens because Mellanox ODP implementation seems to be based on
-message retransmissions when the adapter doesn't know about the buffer yet -
-it likely uses standard "RNR retransmissions" (RNR = receiver not ready)
-which is generally slow in RDMA/RoCE networks. Here's a presentation about
-it from ISPASS-2021 conference: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
-
-ODP support is retained in the code just in case a good ODP implementation
-appears one day.
-
 ## peer_connect_interval

 - Type: seconds
@@ -265,3 +240,17 @@ etcd_report_interval to guarantee that keepalive actually works.

 etcd websocket ping interval required to keep the connection alive and
 detect disconnections quickly.
+
+## client_dirty_limit
+
+- Type: integer
+- Default: 33554432
+- Can be changed online: yes
+
+Without immediate_commit=all this parameter sets the limit of "dirty"
+(not committed by fsync) data allowed by the client before forcing an
+additional fsync and committing the data. Also note that the client always
+holds a copy of uncommitted data in memory so this setting also affects
+RAM usage of clients.
+
+This parameter doesn't affect OSDs themselves.
--- a/docs/config/network.ru.md
+++ b/docs/config/network.ru.md
@@ -20,7 +20,6 @@
 - [rdma_max_msg](#rdma_max_msg)
 - [rdma_max_recv](#rdma_max_recv)
 - [rdma_max_send](#rdma_max_send)
- [rdma_odp](#rdma_odp)
 - [peer_connect_interval](#peer_connect_interval)
 - [peer_connect_timeout](#peer_connect_timeout)
 - [osd_idle_timeout](#osd_idle_timeout)
@@ -31,6 +30,7 @@
 - [etcd_slow_timeout](#etcd_slow_timeout)
 - [etcd_keepalive_timeout](#etcd_keepalive_timeout)
 - [etcd_ws_keepalive_timeout](#etcd_ws_keepalive_timeout)
+- [client_dirty_limit](#client_dirty_limit)

 ## tcp_header_buffer_size

@@ -72,15 +72,12 @@ RDMA может быть нужно только если у клиентов е
 - Тип: строка

 Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
-Сейчас Vitastor поддерживает все модели адаптеров, включая те, у которых
-нет поддержки ODP, то есть вы можете использовать RDMA с ConnectX-3 и
-картами производства не Mellanox.
-
-Версии Vitastor до 1.2.0 включительно требовали ODP, который есть только
-на Mellanox ConnectX 4 и более новых. См. также [rdma_odp](#rdma_odp).
-
-Запустите `ibv_devinfo -v` от имени суперпользователя, чтобы посмотреть
-список доступных RDMA-устройств, их параметры и возможности.
+Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
+Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Например,
+адаптеры Mellanox ConnectX-3 и более старые не поддерживают Implicit ODP и
+потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
+суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
+параметры и возможности.

 Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
 правильно настроить для него коммутаторы, иначе вы можете столкнуться с
@@ -159,29 +156,6 @@ OSD в любом случае согласовывают реальное зн
 Не влияет на потребление памяти - дополнительная память на операции отправки
 не выделяется.

-## rdma_odp
-
- Тип: булево (да/нет)
- Значение по умолчанию: false
-
-Использовать RDMA с On-Demand Paging. ODP - функция, доступная пока что
-исключительно на адаптерах Mellanox ConnectX-4 и более новых. ODP позволяет
-не регистрировать память для её использования RDMA-картой. Благодаря этому
-можно не копировать данные при отправке их в сеть и, казалось бы, это должно
-улучшать производительность - но **по факту** получается так, что
-производительность только ухудшается, причём сильно. Пример - на 3-узловом
-кластере с 8 NVMe в каждом узле и сетью 2*25 Гбит/с на чтение с RDMA без ODP
-удаётся снять 3950000 iops, а с ODP - всего 239000 iops...
-
-Это происходит из-за того, что реализация ODP у Mellanox неоптимальная и
-основана на повторной передаче сообщений, когда карте не известен буфер -
-вероятно, на стандартных "RNR retransmission" (RNR = receiver not ready).
-А данные повторные передачи в RDMA/RoCE - всегда очень медленная штука.
-Презентация на эту тему с конференции ISPASS-2021: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
-
-Возможность использования ODP сохранена в коде на случай, если вдруг в один
-прекрасный день появится хорошая реализация ODP.
-
 ## peer_connect_interval

 - Тип: секунды
@@ -277,3 +251,17 @@ etcd_report_interval, чтобы keepalive гарантированно рабо
 - Можно менять на лету: да

 Интервал проверки живости вебсокет-подключений к etcd.
+
+## client_dirty_limit
+
+- Тип: целое число
+- Значение по умолчанию: 33554432
+- Можно менять на лету: да
+
+При работе без immediate_commit=all - это лимит объёма "грязных" (не
+зафиксированных fsync-ом) данных, при достижении которого клиент будет
+принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
+что в этом случае до момента fsync клиент хранит копию незафиксированных
+данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
+
+Параметр не влияет на сами OSD.
--- a/docs/config/osd.en.md
+++ b/docs/config/osd.en.md
@@ -11,7 +11,6 @@ initialization and can be changed - either with an OSD restart or, for some of
 them, even without restarting by updating configuration in etcd.

 - [etcd_report_interval](#etcd_report_interval)
- [etcd_stats_interval](#etcd_stats_interval)
 - [run_primary](#run_primary)
 - [osd_network](#osd_network)
 - [bind_address](#bind_address)
@@ -57,21 +56,11 @@ them, even without restarting by updating configuration in etcd.
 - Type: seconds
 - Default: 5

-Interval at which OSDs report their liveness to etcd. Affects OSD lease time
+Interval at which OSDs report their state to etcd. Affects OSD lease time
 and thus the failover speed. Lease time is equal to this parameter value
 plus max_etcd_attempts * etcd_quick_timeout because it should be guaranteed
 that every OSD always refreshes its lease in time.

-## etcd_stats_interval
-
- Type: seconds
- Default: 30
-
-Interval at which OSDs report their statistics to etcd. Highly affects the
-imposed load on etcd, because statistics include a key for every OSD and
-for every PG. At the same time, low statistic intervals make `vitastor-cli`
-statistics more responsive.
-
 ## run_primary

 - Type: boolean
--- a/docs/config/osd.ru.md
+++ b/docs/config/osd.ru.md
@@ -12,7 +12,6 @@
 изменения конфигурации в etcd.

 - [etcd_report_interval](#etcd_report_interval)
- [etcd_stats_interval](#etcd_stats_interval)
 - [run_primary](#run_primary)
 - [osd_network](#osd_network)
 - [bind_address](#bind_address)
@@ -58,21 +57,11 @@
 - Тип: секунды
 - Значение по умолчанию: 5

-Интервал, с которым OSD сообщает о том, что жив, в etcd. Значение параметра
-влияет на время резервации (lease) OSD и поэтому - на скорость переключения
+Интервал, с которым OSD обновляет своё состояние в etcd. Значение параметра
+влияет на время резервации (lease) OSD и поэтому на скорость переключения
 при падении OSD. Время lease равняется значению этого параметра плюс
 max_etcd_attempts * etcd_quick_timeout.

-## etcd_stats_interval
-
- Тип: секунды
- Значение по умолчанию: 30
-
-Интервал, с которым OSD обновляет свою статистику в etcd. Сильно влияет на
-создаваемую нагрузку на etcd, потому что статистика содержит по ключу на
-каждый OSD и на каждую PG. В то же время низкий интервал делает
-статистику, печатаемую `vitastor-cli`, отзывчивей.
-
 ## run_primary

 - Тип: булево (да/нет)
--- a/docs/config/src/client.en.md
+++ b/docs/config/src/client.en.md
@@ -1,4 +0,0 @@
-# Client Parameters
-
-These parameters apply only to Vitastor clients (QEMU, fio, NBD and so on) and
-affect their interaction with the cluster.
--- a/docs/config/src/client.ru.md
+++ b/docs/config/src/client.ru.md
@@ -1,4 +0,0 @@
-# Параметры клиентского кода
-
-Данные параметры применяются только к клиентам Vitastor (QEMU, fio, NBD и т.п.) и
-затрагивают логику их работы с кластером.
--- a/docs/config/src/client.yml
+++ b/docs/config/src/client.yml
@@ -1,168 +0,0 @@
- name: client_max_dirty_bytes
-  type: int
-  default: 33554432
-  online: true
-  info: |
-    Without [immediate_commit](layout-cluster.en.md#immediate_commit)=all this parameter sets the limit of "dirty"
-    (not committed by fsync) data allowed by the client before forcing an
-    additional fsync and committing the data. Also note that the client always
-    holds a copy of uncommitted data in memory so this setting also affects
-    RAM usage of clients.
-  info_ru: |
-    При работе без [immediate_commit](layout-cluster.ru.md#immediate_commit)=all - это лимит объёма "грязных" (не
-    зафиксированных fsync-ом) данных, при достижении которого клиент будет
-    принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
-    что в этом случае до момента fsync клиент хранит копию незафиксированных
-    данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
- name: client_max_dirty_ops
-  type: int
-  default: 1024
-  online: true
-  info: |
-    Same as client_max_dirty_bytes, but instead of total size, limits the number
-    of uncommitted write operations.
-  info_ru: |
-    Аналогично client_max_dirty_bytes, но ограничивает количество
-    незафиксированных операций записи вместо их общего объёма.
- name: client_enable_writeback
-  type: bool
-  default: false
-  online: true
-  info: |
-    This parameter enables client-side write buffering. This means that write
-    requests are accumulated in memory for a short time before being sent to
-    a Vitastor cluster which allows to send them in parallel and increase
-    performance of some applications. Writes are buffered until client forces
-    a flush with fsync() or until the amount of buffered writes exceeds the
-    limit.
-
-    Write buffering significantly increases performance of some applications,
-    for example, CrystalDiskMark under Windows (LOL :-D), but also any other
-    applications if they do writes in one of two non-optimal ways: either if
-    they do a lot of small (4 kb or so) sequential writes, or if they do a lot
-    of small random writes, but without any parallelism or asynchrony, and also
-    without calling fsync().
-
-    With write buffering enabled, you can expect around 22000 T1Q1 random write
-    iops in QEMU more or less regardless of the quality of your SSDs, and this
-    number is in fact bound by QEMU itself rather than Vitastor (check it
-    yourself by adding a "driver=null-co" disk in QEMU). Without write
-    buffering, the current record is 9900 iops, but the number is usually
-    even lower with non-ideal hardware, for example, it may be 5000 iops.
-
-    Even when this parameter is enabled, write buffering isn't enabled until
-    the client explicitly allows it, because enabling it without the client
-    being aware of the fact that his writes may be buffered may lead to data
-    loss. Because of this, older versions of clients don't support write
-    buffering at all, newer versions of the QEMU driver allow write buffering
-    only if it's enabled in disk settings with `-blockdev cache.direct=false`,
-    and newer versions of FIO only allow write buffering if you don't specify
-    `-direct=1`. NBD and NFS drivers allow write buffering by default.
-
-    You can overcome this restriction too with the `client_writeback_allowed`
-    parameter, but you shouldn't do that unless you **really** know what you
-    are doing.
-  info_ru: |
-    Данный параметр разрешает включать буферизацию записи в памяти. Буферизация
-    означает, что операции записи отправляются на кластер Vitastor не сразу, а
-    могут небольшое время накапливаться в памяти и сбрасываться сразу пакетами,
-    до тех пор, пока либо не будет превышен лимит неотправленных записей, либо
-    пока клиент не вызовет fsync.
-
-    Буферизация значительно повышает производительность некоторых приложений,
-    например, CrystalDiskMark в Windows (ха-ха :-D), но также и любых других,
-    которые пишут на диск неоптимально: либо последовательно, но мелкими блоками
-    (например, по 4 кб), либо случайно, но без параллелизма и без fsync - то
-    есть, например, отправляя 128 операций записи в разные места диска, но не
-    все сразу с помощью асинхронного I/O, а по одной.
-
-    В QEMU с буферизацией записи можно ожидать показателя примерно 22000
-    операций случайной записи в секунду в 1 поток и с глубиной очереди 1 (T1Q1)
-    без fsync, почти вне зависимости от того, насколько хороши ваши диски - эта
-    цифра упирается в сам QEMU. Без буферизации рекорд пока что - 9900 операций
-    в секунду, но на железе похуже может быть и поменьше, например, 5000 операций
-    в секунду.
-
-    При этом, даже если данный параметр включён, буферизация не включается, если
-    явно не разрешена клиентом, т.к. если клиент не знает, что запросы записи
-    буферизуются, это может приводить к потере данных. Поэтому в старых версиях
-    клиентских драйверов буферизация записи не включается вообще, в новых
-    версиях QEMU-драйвера включается, только если разрешена опцией диска
-    `-blockdev cache.direct=false`, а в fio - только если нет опции `-direct=1`.
-    В NBD и NFS драйверах буферизация записи разрешена по умолчанию.
-
-    Можно обойти и это ограничение с помощью параметра `client_writeback_allowed`,
-    но делать так не надо, если только вы не уверены в том, что делаете, на все
-    100%. :-)
- name: client_max_buffered_bytes
-  type: int
-  default: 33554432
-  online: true
-  info: |
-    Maximum total size of buffered writes which triggers write-back when reached.
-  info_ru: |
-    Максимальный общий размер буферизованных записей, при достижении которого
-    начинается процесс сброса данных на сервер.
- name: client_max_buffered_ops
-  type: int
-  default: 1024
-  online: true
-  info: |
-    Maximum number of buffered writes which triggers write-back when reached.
-    Multiple consecutive modified data regions are counted as 1 write here.
-  info_ru: |
-    Максимальное количество буферизованных записей, при достижении которого
-    начинается процесс сброса данных на сервер. При этом несколько
-    последовательных изменённых областей здесь считаются 1 записью.
- name: client_max_writeback_iodepth
-  type: int
-  default: 256
-  online: true
-  info: |
-    Maximum number of parallel writes when flushing buffered data to the server.
-  info_ru: |
-    Максимальное число параллельных операций записи при сбросе буферов на сервер.
- name: nbd_timeout
-  type: sec
-  default: 300
-  online: false
-  info: |
-    Timeout for I/O operations for [NBD](../usage/nbd.en.md). If an operation
-    executes for longer than this timeout, including when your cluster is just
-    temporarily down for more than timeout, the NBD device will detach by itself
-    (and possibly break the mounted file system).
-
-    You can set timeout to 0 to never detach, but in that case you won't be
-    able to remove the kernel device at all if the NBD process dies - you'll have
-    to reboot the host.
-  info_ru: |
-    Таймаут для операций чтения/записи через [NBD](../usage/nbd.ru.md). Если
-    операция выполняется дольше таймаута, включая временную недоступность
-    кластера на время, большее таймаута, NBD-устройство отключится само собой
-    (и, возможно, сломает примонтированную ФС).
-
-    Вы можете установить таймаут в 0, чтобы никогда не отключать устройство по
-    таймауту, но в этом случае вы вообще не сможете удалить устройство, если
-    процесс NBD умрёт - вам придётся перезагружать сервер.
- name: nbd_max_devices
-  type: int
-  default: 64
-  online: false
-  info: |
-    Maximum number of NBD devices in the system. This value is passed as
-    `nbds_max` parameter for the nbd kernel module when vitastor-nbd autoloads it.
-  info_ru: |
-    Максимальное число NBD-устройств в системе. Данное значение передаётся
-    модулю ядра nbd как параметр `nbds_max`, когда его загружает vitastor-nbd.
- name: nbd_max_part
-  type: int
-  default: 3
-  online: false
-  info: |
-    Maximum number of partitions per NBD device. This value is passed as
-    `max_part` parameter for the nbd kernel module when vitastor-nbd autoloads it.
-    Note that (nbds_max)*(1+max_part) usually can't exceed 256.
-  info_ru: |
-    Максимальное число разделов на одном NBD-устройстве. Данное значение передаётся
-    модулю ядра nbd как параметр `max_part`, когда его загружает vitastor-nbd.
-    Имейте в виду, что (nbds_max)*(1+max_part) обычно не может превышать 256.
--- a/docs/config/src/included.en.md
+++ b/docs/config/src/included.en.md
@@ -28,8 +28,6 @@

 {{../../config/network.en.md|indent=2}}

-{{../../config/client.en.md|indent=2}}
-
 {{../../config/layout-cluster.en.md|indent=2}}

 {{../../config/layout-osd.en.md|indent=2}}
--- a/docs/config/src/included.ru.md
+++ b/docs/config/src/included.ru.md
@@ -28,8 +28,6 @@

 {{../../config/network.ru.md|indent=2}}

-{{../../config/client.ru.md|indent=2}}
-
 {{../../config/layout-cluster.ru.md|indent=2}}

 {{../../config/layout-osd.ru.md|indent=2}}
--- a/docs/config/src/network.yml
+++ b/docs/config/src/network.yml
@@ -48,14 +48,11 @@
  type: string
  info: |
    RDMA device name to use for Vitastor OSD communications (for example,
-    "rocep5s0f0"). Now Vitastor supports all adapters, even ones without
-    ODP support, like Mellanox ConnectX-3 and non-Mellanox cards.
-
-    Versions up to Vitastor 1.2.0 required ODP which is only present in
-    Mellanox ConnectX >= 4. See also [rdma_odp](#rdma_odp).
-
-    Run `ibv_devinfo -v` as root to list available RDMA devices and their
-    features.
+    "rocep5s0f0"). Please note that Vitastor RDMA requires Implicit On-Demand
+    Paging (Implicit ODP) and Scatter/Gather (SG) support from the RDMA device
+    to work. For example, Mellanox ConnectX-3 and older adapters don't have
+    Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
+    root to list available RDMA devices and their features.

    Remember that you also have to configure your network switches if you use
    RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
@@ -64,15 +61,12 @@
    PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
  info_ru: |
    Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
-    Сейчас Vitastor поддерживает все модели адаптеров, включая те, у которых
-    нет поддержки ODP, то есть вы можете использовать RDMA с ConnectX-3 и
-    картами производства не Mellanox.
-
-    Версии Vitastor до 1.2.0 включительно требовали ODP, который есть только
-    на Mellanox ConnectX 4 и более новых. См. также [rdma_odp](#rdma_odp).
-
-    Запустите `ibv_devinfo -v` от имени суперпользователя, чтобы посмотреть
-    список доступных RDMA-устройств, их параметры и возможности.
+    Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
+    Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Например,
+    адаптеры Mellanox ConnectX-3 и более старые не поддерживают Implicit ODP и
+    потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
+    суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
+    параметры и возможности.

    Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
    правильно настроить для него коммутаторы, иначе вы можете столкнуться с
@@ -166,45 +160,6 @@
    у принимающей стороны в процессе работы не заканчивались буферы на приём.
    Не влияет на потребление памяти - дополнительная память на операции отправки
    не выделяется.
- name: rdma_odp
-  type: bool
-  default: false
-  online: false
-  info: |
-    Use RDMA with On-Demand Paging. ODP is currently only available on Mellanox
-    ConnectX-4 and newer adapters. ODP allows to not register memory explicitly
-    for RDMA adapter to be able to use it. This, in turn, allows to skip memory
-    copying during sending. One would think this should improve performance, but
-    **in reality** RDMA performance with ODP is **drastically** worse. Example
-    3-node cluster with 8 NVMe in each node and 2*25 GBit/s ConnectX-6 RDMA network
-    without ODP pushes 3950000 read iops, but only 239000 iops with ODP...
-
-    This happens because Mellanox ODP implementation seems to be based on
-    message retransmissions when the adapter doesn't know about the buffer yet -
-    it likely uses standard "RNR retransmissions" (RNR = receiver not ready)
-    which is generally slow in RDMA/RoCE networks. Here's a presentation about
-    it from ISPASS-2021 conference: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
-
-    ODP support is retained in the code just in case a good ODP implementation
-    appears one day.
-  info_ru: |
-    Использовать RDMA с On-Demand Paging. ODP - функция, доступная пока что
-    исключительно на адаптерах Mellanox ConnectX-4 и более новых. ODP позволяет
-    не регистрировать память для её использования RDMA-картой. Благодаря этому
-    можно не копировать данные при отправке их в сеть и, казалось бы, это должно
-    улучшать производительность - но **по факту** получается так, что
-    производительность только ухудшается, причём сильно. Пример - на 3-узловом
-    кластере с 8 NVMe в каждом узле и сетью 2*25 Гбит/с на чтение с RDMA без ODP
-    удаётся снять 3950000 iops, а с ODP - всего 239000 iops...
-
-    Это происходит из-за того, что реализация ODP у Mellanox неоптимальная и
-    основана на повторной передаче сообщений, когда карте не известен буфер -
-    вероятно, на стандартных "RNR retransmission" (RNR = receiver not ready).
-    А данные повторные передачи в RDMA/RoCE - всегда очень медленная штука.
-    Презентация на эту тему с конференции ISPASS-2021: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
-
-    Возможность использования ODP сохранена в коде на случай, если вдруг в один
-    прекрасный день появится хорошая реализация ODP.
 - name: peer_connect_interval
  type: sec
  min: 1
@@ -304,3 +259,23 @@
    detect disconnections quickly.
  info_ru: |
    Интервал проверки живости вебсокет-подключений к etcd.
+- name: client_dirty_limit
+  type: int
+  default: 33554432
+  online: true
+  info: |
+    Without immediate_commit=all this parameter sets the limit of "dirty"
+    (not committed by fsync) data allowed by the client before forcing an
+    additional fsync and committing the data. Also note that the client always
+    holds a copy of uncommitted data in memory so this setting also affects
+    RAM usage of clients.
+
+    This parameter doesn't affect OSDs themselves.
+  info_ru: |
+    При работе без immediate_commit=all - это лимит объёма "грязных" (не
+    зафиксированных fsync-ом) данных, при достижении которого клиент будет
+    принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
+    что в этом случае до момента fsync клиент хранит копию незафиксированных
+    данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
+
+    Параметр не влияет на сами OSD.
--- a/docs/config/src/osd.yml
+++ b/docs/config/src/osd.yml
@@ -2,28 +2,15 @@
  type: sec
  default: 5
  info: |
-    Interval at which OSDs report their liveness to etcd. Affects OSD lease time
+    Interval at which OSDs report their state to etcd. Affects OSD lease time
    and thus the failover speed. Lease time is equal to this parameter value
    plus max_etcd_attempts * etcd_quick_timeout because it should be guaranteed
    that every OSD always refreshes its lease in time.
  info_ru: |
-    Интервал, с которым OSD сообщает о том, что жив, в etcd. Значение параметра
-    влияет на время резервации (lease) OSD и поэтому - на скорость переключения
+    Интервал, с которым OSD обновляет своё состояние в etcd. Значение параметра
+    влияет на время резервации (lease) OSD и поэтому на скорость переключения
    при падении OSD. Время lease равняется значению этого параметра плюс
    max_etcd_attempts * etcd_quick_timeout.
- name: etcd_stats_interval
-  type: sec
-  default: 30
-  info: |
-    Interval at which OSDs report their statistics to etcd. Highly affects the
-    imposed load on etcd, because statistics include a key for every OSD and
-    for every PG. At the same time, low statistic intervals make `vitastor-cli`
-    statistics more responsive.
-  info_ru: |
-    Интервал, с которым OSD обновляет свою статистику в etcd. Сильно влияет на
-    создаваемую нагрузку на etcd, потому что статистика содержит по ключу на
-    каждый OSD и на каждую PG. В то же время низкий интервал делает
-    статистику, печатаемую `vitastor-cli`, отзывчивей.
 - name: run_primary
  type: bool
  default: true
--- a/docs/installation/kubernetes.en.md
+++ b/docs/installation/kubernetes.en.md
@@ -17,26 +17,4 @@ and apply all `NNN-*.yaml` manifests to your Kubernetes installation:
 for i in ./???-*.yaml; do kubectl apply -f $i; done
 ```

-After that you'll be able to create PersistentVolumes.
-
-**Important:** For best experience, use Linux kernel at least 5.15 with [VDUSE](../usage/qemu.en.md#vduse)
-kernel modules enabled (vdpa, vduse, virtio-vdpa). If your distribution doesn't
-have them pre-built - build them yourself ([instructions](../usage/qemu.en.md#vduse)),
-I promise it's worth it :-). When VDUSE is unavailable, CSI driver uses [NBD](../usage/nbd.en.md)
-to map Vitastor devices. NBD is slower and prone to timeout issues: if Vitastor
-cluster becomes unresponsible for more than [nbd_timeout](../config/client.en.md#nbd_timeout),
-the NBD device detaches and breaks pods using it.
-
-## Features
-
-Vitastor CSI supports:
- Kubernetes starting with 1.20 (or 1.17 for older vitastor-csi <= 1.1.0)
- Filesystem RWO (ReadWriteOnce) volumes. Example: [PVC](../../csi/deploy/example-pvc.yaml), [pod](../../csi/deploy/example-test-pod.yaml)
- Raw block RWX (ReadWriteMany) volumes. Example: [PVC](../../csi/deploy/example-pvc-block.yaml), [pod](../../csi/deploy/example-test-pod-block.yaml)
- Volume expansion
- Volume snapshots. Example: [snapshot class](../../csi/deploy/example-snapshot-class.yaml), [snapshot](../../csi/deploy/example-snapshot.yaml), [clone](../../csi/deploy/example-snapshot-clone.yaml)
- [VDUSE](../usage/qemu.en.md#vduse) (preferred) and [NBD](../usage/nbd.en.md) device mapping methods
- Upgrades with VDUSE - new handler processes are restarted when CSI pods are restarted themselves
- Multiple clusters by using multiple configuration files in ConfigMap.
-
-Remember that to use snapshots with CSI you also have to install [Snapshot Controller and CRDs](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).
+After that you'll be able to create PersistentVolumes. See the example in [csi/deploy/example-pvc.yaml](../../csi/deploy/example-pvc.yaml).
--- a/docs/installation/kubernetes.ru.md
+++ b/docs/installation/kubernetes.ru.md
@@ -17,26 +17,4 @@
 for i in ./???-*.yaml; do kubectl apply -f $i; done
 ```

-После этого вы сможете создавать PersistentVolume.
-
-**Важно:** Лучше всего использовать ядро Linux версии не менее 5.15 с включёнными модулями
-[VDUSE](../usage/qemu.ru.md#vduse) (vdpa, vduse, virtio-vdpa). Если в вашем дистрибутиве
-они не собраны из коробки - соберите их сами, обещаю, что это стоит того ([инструкция](../usage/qemu.ru.md#vduse)) :-).
-Когда VDUSE недоступно, CSI-плагин использует [NBD](../usage/nbd.ru.md) для подключения
-дисков, а NBD медленнее и имеет проблему таймаута - если кластер остаётся недоступным
-дольше, чем [nbd_timeout](../config/client.ru.md#nbd_timeout), NBD-устройство отключается
-и ломает поды, использующие его.
-
-## Возможности
-
-CSI-плагин Vitastor поддерживает:
- Версии Kubernetes, начиная с 1.20 (или с 1.17 для более старых vitastor-csi <= 1.1.0)
- Файловые RWO (ReadWriteOnce) тома. Пример: [PVC](../../csi/deploy/example-pvc.yaml), [под](../../csi/deploy/example-test-pod.yaml)
- Сырые блочные RWX (ReadWriteMany) тома. Пример: [PVC](../../csi/deploy/example-pvc-block.yaml), [под](../../csi/deploy/example-test-pod-block.yaml)
- Расширение размера томов
- Снимки томов. Пример: [класс снимков](../../csi/deploy/example-snapshot-class.yaml), [снимок](../../csi/deploy/example-snapshot.yaml), [клон снимка](../../csi/deploy/example-snapshot-clone.yaml)
- Способы подключения устройств [VDUSE](../usage/qemu.ru.md#vduse) (предпочитаемый) и [NBD](../usage/nbd.ru.md)
- Обновление при использовании VDUSE - новые процессы-обработчики устройств успешно перезапускаются вместе с самими подами CSI
- Несколько кластеров через задание нескольких файлов конфигурации в ConfigMap.
-
-Не забывайте, что для использования снимков нужно сначала установить [контроллер снимков и CRD](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).
+После этого вы сможете создавать PersistentVolume. Пример смотрите в файле [csi/deploy/example-pvc.yaml](../../csi/deploy/example-pvc.yaml).
--- a/docs/installation/proxmox.en.md
+++ b/docs/installation/proxmox.en.md
@@ -6,10 +6,10 @@

 # Proxmox VE

-To enable Vitastor support in Proxmox Virtual Environment (6.4-8.1 are supported):
+To enable Vitastor support in Proxmox Virtual Environment (6.4-8.0 are supported):

 - Add the corresponding Vitastor Debian repository into sources.list on Proxmox hosts:
-  bookworm for 8.1, pve8.0 for 8.0, bullseye for 7.4, pve7.3 for 7.3, pve7.2 for 7.2, pve7.1 for 7.1, buster for 6.4
+  bookworm for 8.0, bullseye for 7.4, pve7.3 for 7.3, pve7.2 for 7.2, pve7.1 for 7.1, buster for 6.4
 - Install vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* or see note) packages from Vitastor repository
 - Define storage in `/etc/pve/storage.cfg` (see below)
 - Block network access from VMs to Vitastor network (to OSDs and etcd),
--- a/docs/installation/proxmox.ru.md
+++ b/docs/installation/proxmox.ru.md
@@ -6,10 +6,10 @@

 # Proxmox VE

-Чтобы подключить Vitastor к Proxmox Virtual Environment (поддерживаются версии 6.4-8.1):
+Чтобы подключить Vitastor к Proxmox Virtual Environment (поддерживаются версии 6.4-8.0):

 - Добавьте соответствующий Debian-репозиторий Vitastor в sources.list на хостах Proxmox:
-  bookworm для 8.1, pve8.0 для 8.0, bullseye для 7.4, pve7.3 для 7.3, pve7.2 для 7.2, pve7.1 для 7.1, buster для 6.4
+  bookworm для 8.0, bullseye для 7.4, pve7.3 для 7.3, pve7.2 для 7.2, pve7.1 для 7.1, buster для 6.4
 - Установите пакеты vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* или см. сноску) из репозитория Vitastor
 - Определите тип хранилища в `/etc/pve/storage.cfg` (см. ниже)
 - Обязательно заблокируйте доступ от виртуальных машин к сети Vitastor (OSD и etcd), т.к. Vitastor (пока) не поддерживает аутентификацию
--- a/docs/intro/architecture.ru.md
+++ b/docs/intro/architecture.ru.md
@@ -54,8 +54,7 @@
  виртуальные диски, их снимки и клоны.
 - **Драйвер QEMU** — подключаемый модуль QEMU, позволяющий QEMU/KVM виртуальным машинам работать
  с виртуальными дисками Vitastor напрямую из пространства пользователя с помощью клиентской
-  библиотеки, без необходимости отображения дисков в виде блочных устройств. Тот же драйвер
-  позволяет подключать диски в систему через [VDUSE](../usage/qemu.ru.md#vduse).
+  библиотеки, без необходимости отображения дисков в виде блочных устройств.
 - **vitastor-nbd** — утилита, позволяющая монтировать образы Vitastor в виде блочных устройств
  с помощью NBD (Network Block Device), на самом деле скорее работающего как "BUSE"
  (Block Device In Userspace). Модуля ядра Linux для выполнения той же задачи в Vitastor нет
--- a/docs/intro/features.en.md
+++ b/docs/intro/features.en.md
@@ -31,7 +31,6 @@
 - [RDMA/RoCEv2 support via libibverbs](../config/network.en.md#rdma_device)
 - [Scrubbing](../config/osd.en.md#auto_scrub) (verification of copies)
 - [Checksums](../config/layout-osd.en.md#data_csum_type)
- [Client write-back cache](../config/client.en.md#client_enable_writeback)

 ## Plugins and tools

@@ -51,15 +50,13 @@

 The following features are planned for the future:

- File system
- Control plane optimisation
 - Other administrative tools
 - Web GUI
 - OpenNebula plugin
- iSCSI and NVMeoF gateways
+- iSCSI proxy
 - Multi-threaded client
 - Faster failover
- S3
 - Tiered storage (SSD caching)
 - NVDIMM support
 - Compression (possibly)
+- Read caching using system page cache (possibly)
--- a/docs/intro/features.ru.md
+++ b/docs/intro/features.ru.md
@@ -33,7 +33,6 @@
 - [Поддержка RDMA/RoCEv2 через libibverbs](../config/network.ru.md#rdma_device)
 - [Фоновая проверка целостности](../config/osd.ru.md#auto_scrub) (сверка копий)
 - [Контрольные суммы](../config/layout-osd.ru.md#data_csum_type)
- [Буферизация записи на стороне клиента](../config/client.ru.md#client_enable_writeback)

 ## Драйверы и инструменты

@@ -51,15 +50,12 @@

 ## Планы развития

- Файловая система
- Оптимизация слоя управления
 - Другие инструменты администрирования
 - Web-интерфейс
 - Плагин для OpenNebula
- iSCSI и NVMeoF прокси
+- iSCSI-прокси
 - Многопоточный клиент
 - Более быстрое переключение при отказах
- S3
 - Поддержка SSD-кэширования (tiered storage)
 - Поддержка NVDIMM
 - Возможно, сжатие
--- a/docs/usage/cli.en.md
+++ b/docs/usage/cli.en.md
@@ -28,8 +28,7 @@ It supports the following commands:
 Global options:

 ```
--config_file FILE   Path to Vitastor configuration file
--etcd_address URL   Etcd connection address
+--etcd_address ADDR  Etcd connection address
 --iodepth N          Send N operations in parallel to each OSD when possible (default 32)
 --parallel_osds M    Work with M osds in parallel when possible (default 4)
 --progress 1|0       Report progress (default 1)
--- a/docs/usage/cli.ru.md
+++ b/docs/usage/cli.ru.md
@@ -27,8 +27,7 @@ vitastor-cli - интерфейс командной строки для адм
 Глобальные опции:

 ```
--config_file FILE   Путь к файлу конфигурации Vitastor
--etcd_address URL   Адрес соединения с etcd
+--etcd_address ADDR  Адрес соединения с etcd
 --iodepth N          Отправлять параллельно N операций на каждый OSD (по умолчанию 32)
 --parallel_osds M    Работать параллельно с M OSD (по умолчанию 4)
 --progress 1|0       Печатать прогресс выполнения (по умолчанию 1)
--- a/docs/usage/nbd.en.md
+++ b/docs/usage/nbd.en.md
@@ -11,25 +11,25 @@ NBD stands for "Network Block Device", but in fact it also functions as "BUSE"
 NBD slighly lowers the performance due to additional overhead, but performance still
 remains decent (see an example [here](../performance/comparison1.en.md#vitastor-0-4-0-nbd)).

-See also [VDUSE](qemu.en.md#vduse) as a better alternative to NBD.
+Vitastor Kubernetes CSI driver is based on NBD.

-Vitastor Kubernetes CSI driver uses NBD when VDUSE is unavailable.
+See also [VDUSE](qemu.en.md#vduse).

 ## Map image

 To create a local block device for a Vitastor image run:

 ```
-vitastor-nbd map --image testimg
+vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
 ```

 It will output a block device name like /dev/nbd0 which you can then use as a normal disk.

 You can also use `--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--image <IMAGE>` if you want.

-vitastor-nbd supports all usual Vitastor configuration options like `--config_file <path_to_config>` plus NBD-specific:
+Additional options for map command:

-* `--nbd_timeout 300` \
+* `--nbd_timeout 30` \
  Timeout for I/O operations in seconds after exceeding which the kernel stops
  the device. You can set it to 0 to disable the timeout, but beware that you
  won't be able to stop the device at all if vitastor-nbd process dies.
@@ -44,9 +44,6 @@ vitastor-nbd supports all usual Vitastor configuration options like `--config_fi
 * `--foreground 1` \
  Stay in foreground, do not daemonize.

-Note that `nbd_timeout`, `nbd_max_devices` and `nbd_max_part` options may also be specified
-in `/etc/vitastor/vitastor.conf` or in other configuration file specified with `--config_file`.
-
 ## Unmap image

 To unmap the device run:
--- a/docs/usage/nbd.ru.md
+++ b/docs/usage/nbd.ru.md
@@ -14,16 +14,16 @@ NBD на данный момент необходимо, чтобы монтир
 NBD немного снижает производительность из-за дополнительных копирований памяти,
 но она всё равно остаётся на неплохом уровне (см. для примера [тест](../performance/comparison1.ru.md#vitastor-0-4-0-nbd)).

-Смотрите также [VDUSE](qemu.ru.md#vduse), как лучшую альтернативу NBD.
+CSI-драйвер Kubernetes Vitastor основан на NBD.

-CSI-драйвер Kubernetes Vitastor использует NBD, когда VDUSE недоступен.
+Смотрите также [VDUSE](qemu.ru.md#vduse).

 ## Подключить устройство

 Чтобы создать локальное блочное устройство для образа, выполните команду:

 ```
-vitastor-nbd map --image testimg
+vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
 ```

 Команда напечатает название блочного устройства вида /dev/nbd0, которое потом можно
@@ -32,8 +32,7 @@ vitastor-nbd map --image testimg
 Для обращения по номеру инода, аналогично другим командам, можно использовать опции
 `--pool <POOL> --inode <INODE> --size <SIZE>` вместо `--image testimg`.

-vitastor-nbd поддерживает все обычные опции Vitastor, например, `--config_file <path_to_config>`,
-плюс специфичные для NBD:
+Дополнительные опции для команды подключения NBD-устройства:

 * `--nbd_timeout 30` \
  Максимальное время выполнения любой операции чтения/записи в секундах, при
@@ -54,10 +53,6 @@ vitastor-nbd поддерживает все обычные опции Vitastor,
 * `--foreground 1` \
  Не уводить процесс в фоновый режим.

-Обратите внимание, что опции `nbd_timeout`, `nbd_max_devices` и `nbd_max_part` можно
-также задавать в `/etc/vitastor/vitastor.conf` или в другом файле конфигурации,
-заданном опцией `--config_file`.
-
 ## Отключить устройство

 Для отключения устройства выполните:
--- a/docs/usage/nfs.en.md
+++ b/docs/usage/nfs.en.md
@@ -23,7 +23,7 @@ balancer or any failover method you want to in that case.
 vitastor-nfs usage:

 ```
-vitastor-nfs [STANDARD OPTIONS] [OTHER OPTIONS]
+vitastor-nfs [--etcd_address ADDR] [OTHER OPTIONS]

 --subdir <DIR>    export images prefixed <DIR>/ (default empty - export all images)
 --portmap 0       do not listen on port 111 (portmap/rpcbind, requires root)
--- a/docs/usage/nfs.ru.md
+++ b/docs/usage/nfs.ru.md
@@ -22,7 +22,7 @@
 Использование vitastor-nfs:

 ```
-vitastor-nfs [СТАНДАРТНЫЕ ОПЦИИ] [ДРУГИЕ ОПЦИИ]
+vitastor-nfs [--etcd_address ADDR] [ДРУГИЕ ОПЦИИ]

 --subdir <DIR>    экспортировать "поддиректорию" - образы с префиксом имени <DIR>/ (по умолчанию пусто - экспортировать все образы)
 --portmap 0       отключить сервис portmap/rpcbind на порту 111 (по умолчанию включён и требует root привилегий)
--- a/docs/usage/qemu.en.md
+++ b/docs/usage/qemu.en.md
@@ -34,20 +34,6 @@ qemu-system-x86_64 -enable-kvm -m 1024 \
    -vnc 0.0.0.0:0
 ```

-With a separate I/O thread:
-
-```
-qemu-system-x86_64 -enable-kvm -m 1024 \
-    -object iothread,id=vitastor1 \
-    -blockdev '{"node-name":"drive-virtio-disk0","driver":"vitastor","image":"debian9",
-        "cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
-    -device 'virtio-blk-pci,iothread=vitastor1,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,
-        id=virtio-disk0,bootindex=1,write-cache=off' \
-    -vnc 0.0.0.0:0
-```
-
-You can also specify inode ID, pool and size manually instead of `:image=<IMAGE>` option: `:pool=<POOL>:inode=<INODE>:size=<SIZE>`.
-
 ## qemu-img

 For qemu-img, you should use `vitastor:etcd_host=<HOST>:image=<IMAGE>` as filename.
@@ -98,75 +84,25 @@ This can be used for backups. Just note that exporting an image that is currentl
 is of course unsafe and doesn't produce a consistent result, so only export snapshots if you do this
 on a live VM.

-## vhost-user-blk
-
-QEMU, starting with 6.0, includes support for attaching disks via a separate
-userspace worker process, called `vhost-user-blk`. It usually has slightly (20-30 us)
-lower latency.
-
-Example commands to use it with Vitastor:
-
-```
-qemu-storage-daemon \
-    --daemonize \
-    --blockdev '{"node-name":"drive-virtio-disk1","driver":"vitastor","image":"testosd1","cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
-    --export type=vhost-user-blk,id=vitastor1,node-name=drive-virtio-disk1,addr.type=unix,addr.path=/run/vitastor1-user-blk.sock,writable=on,num-queues=1
-
-qemu-system-x86_64 -enable-kvm -m 2048 -M accel=kvm,memory-backend=mem \
-    -object memory-backend-memfd,id=mem,size=2G,share=on \
-    -chardev socket,id=vitastor1,reconnect=1,path=/run/vitastor1-user-blk.sock \
-    -device vhost-user-blk-pci,chardev=vitastor1,num-queues=1,config-wce=off \
-    -vnc 0.0.0.0:0
-```
-
-memfd memory-backend is crucial, vhost-user-blk does not work without it.
-
 ## VDUSE

 Linux kernel, starting with version 5.15, supports a new interface for attaching virtual disks
 to the host - VDUSE (vDPA Device in Userspace). QEMU, starting with 7.2, has support for
 exporting QEMU block devices over this protocol using qemu-storage-daemon.

-VDUSE is currently the best interface to attach Vitastor disks as kernel devices because:
- It avoids data copies and thus achieves much better performance than [NBD](nbd.en.md)
- It doesn't have NBD timeout problem - the device doesn't die if an operation executes for too long
- It doesn't have hung device problem - if the userspace process dies it can be restarted (!)
-  and block device will continue operation
- It doesn't seem to have the device number limit
+VDUSE has the same problem as other FUSE-like interfaces in Linux: if the userspace process hangs,
+for example, if it loses connectivity with the Vitastor cluster, active processes doing I/O may
+hang in the D state (uninterruptible sleep) and you won't be able to kill them even with kill -9.
+In this case, a reboot will be the only way to remove VDUSE devices from the system.

-Example performance comparison:
-
-|                      | direct fio  | NBD         | VDUSE       |
-|----------------------|-------------|-------------|-------------|
-| linear write         | 3.85 GB/s   | 1.12 GB/s   | 3.85 GB/s   |
-| 4k random write Q128 | 240000 iops | 120000 iops | 178000 iops |
-| 4k random write Q1   | 9500 iops   | 7620 iops   | 7640 iops   |
-| linear read          | 4.3 GB/s    | 1.8 GB/s    | 2.85 GB/s   |
-| 4k random read Q128  | 287000 iops | 140000 iops | 189000 iops |
-| 4k random read Q1    | 9600 iops   | 7640 iops   | 7780 iops   |
+On the other hand, VDUSE is faster than [NBD](nbd.en.md), so you may prefer to use it if
+performance is important for you. Approximate performance numbers:
+direct fio benchmark - 115000 iops, NBD - 60000 iops, VDUSE - 90000 iops.

 To try VDUSE you need at least Linux 5.15, built with VDUSE support
-(CONFIG_VIRTIO_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).
-
-Debian Linux kernels have these options disabled by now, so if you want to try it on Debian,
-use a kernel from Ubuntu [kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/), Proxmox,
-or build modules for Debian kernel manually:
-
-```
-mkdir build
-cd build
-apt-get install linux-headers-`uname -r`
-apt-get build-dep linux-image-`uname -r`-unsigned
-apt-get source linux-image-`uname -r`-unsigned
-cd linux*/drivers/vdpa
-make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
-cat Module.symvers >> /lib/modules/`uname -r`/build/Module.symvers
-cd ../virtio
-make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
-depmod -a
-```
-
-You also need `vdpa` tool from the `iproute2` package.
+(CONFIG_VIRTIO_VDPA=m and CONFIG_VDPA_USER=m). Debian Linux kernels have these options
+disabled for now, so if you want to try it on Debian, use a kernel from Ubuntu
+[kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/) or Proxmox.

 Commands to attach Vitastor image as a VDUSE device:

@@ -179,7 +115,7 @@ qemu-storage-daemon --daemonize --blockdev '{"node-name":"test1","driver":"vitas
 vdpa dev add name test1 mgmtdev vduse
 ```

-After running these commands, `/dev/vda` device will appear in the system and you'll be able to
+After running these commands, the /dev/vda device will appear in the system and you'll be able to
 use it as a normal disk.

 To remove the device:
--- a/docs/usage/qemu.ru.md
+++ b/docs/usage/qemu.ru.md
@@ -36,18 +36,6 @@ qemu-system-x86_64 -enable-kvm -m 1024 \
    -vnc 0.0.0.0:0
 ```

-С отдельным потоком ввода-вывода:
-
-```
-qemu-system-x86_64 -enable-kvm -m 1024 \
-    -object iothread,id=vitastor1 \
-    -blockdev '{"node-name":"drive-virtio-disk0","driver":"vitastor","image":"debian9",
-        "cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
-    -device 'virtio-blk-pci,iothread=vitastor1,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,
-        id=virtio-disk0,bootindex=1,write-cache=off' \
-    -vnc 0.0.0.0:0
-```
-
 Вместо `:image=<IMAGE>` также можно указывать номер инода, пул и размер: `:pool=<POOL>:inode=<INODE>:size=<SIZE>`.

 ## qemu-img
@@ -100,76 +88,25 @@ qemu-img rebase -u -b '' testimg.qcow2
 в то же время идёт запись, небезопасно - результат чтения не будет целостным. Так что если вы работаете
 с активными виртуальными машинами, экспортируйте только их снимки, но не сам образ.

-## vhost-user-blk
-
-QEMU, начиная с 6.0, позволяет подключать диски через отдельный рабочий процесс.
-Этот метод подключения называется `vhost-user-blk` и обычно имеет чуть меньшую
-задержку (ниже на 20-30 микросекунд, чем при обычном методе).
-
-Пример команд для использования vhost-user-blk с Vitastor:
-
-```
-qemu-storage-daemon \
-    --daemonize \
-    --blockdev '{"node-name":"drive-virtio-disk1","driver":"vitastor","image":"testosd1","cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
-    --export type=vhost-user-blk,id=vitastor1,node-name=drive-virtio-disk1,addr.type=unix,addr.path=/run/vitastor1-user-blk.sock,writable=on,num-queues=1
-
-qemu-system-x86_64 -enable-kvm -m 2048 -M accel=kvm,memory-backend=mem \
-    -object memory-backend-memfd,id=mem,size=2G,share=on \
-    -chardev socket,id=vitastor1,reconnect=1,path=/run/vitastor1-user-blk.sock \
-    -device vhost-user-blk-pci,chardev=vitastor1,num-queues=1,config-wce=off \
-    -vnc 0.0.0.0:0
-```
-
-Здесь критична опция memory-backend-memfd, vhost-user-blk без неё не работает.
-
 ## VDUSE

 В Linux, начиная с версии ядра 5.15, доступен новый интерфейс для подключения виртуальных дисков
 к системе - VDUSE (vDPA Device in Userspace), а в QEMU, начиная с версии 7.2, есть поддержка
 экспорта блочных устройств QEMU по этому протоколу через qemu-storage-daemon.

-VDUSE - на данный момент лучший интерфейс для подключения дисков Vitastor в виде блочных
-устройств на уровне ядра, ибо:
- VDUSE не копирует данные и поэтому достигает значительно лучшей производительности, чем [NBD](nbd.ru.md)
- Также оно не имеет проблемы NBD-таймаута - устройство не умирает, если операция выполняется слишком долго
- Также оно не имеет проблемы подвисающих устройств - если процесс-обработчик умирает, его можно
-  перезапустить (!) и блочное устройство продолжит работать
- По-видимому, у него нет предела числа подключаемых в систему устройств
+VDUSE страдает общей проблемой FUSE-подобных интерфейсов в Linux: если пользовательский процесс
+подвиснет, например, если будет потеряна связь с кластером Vitastor - читающие/пишущие в кластер
+процессы могут "залипнуть" в состоянии D (непрерываемый сон) и их будет невозможно убить даже
+через kill -9. В этом случае удалить из системы устройство можно только перезагрузившись.

-Пример сравнения производительности:
+С другой стороны, VDUSE быстрее по сравнению с [NBD](nbd.ru.md), поэтому его может
+быть предпочтительно использовать там, где производительность важнее. Порядок показателей:
+прямое тестирование через fio - 115000 iops, NBD - 60000 iops, VDUSE - 90000 iops.

-|                          | Прямой fio  | NBD         | VDUSE       |
-|--------------------------|-------------|-------------|-------------|
-| линейная запись          | 3.85 GB/s   | 1.12 GB/s   | 3.85 GB/s   |
-| 4k случайная запись Q128 | 240000 iops | 120000 iops | 178000 iops |
-| 4k случайная запись Q1   | 9500 iops   | 7620 iops   | 7640 iops   |
-| линейное чтение          | 4.3 GB/s    | 1.8 GB/s    | 2.85 GB/s   |
-| 4k случайное чтение Q128 | 287000 iops | 140000 iops | 189000 iops |
-| 4k случайное чтение Q1   | 9600 iops   | 7640 iops   | 7780 iops   |
-
-Чтобы попробовать VDUSE, вам нужно ядро Linux как минимум версии 5.15, собранное с поддержкой
-VDUSE (CONFIG_VIRTIO_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).
-
-В ядрах в Debian Linux поддержка пока отключена по умолчанию, так что чтобы попробовать VDUSE
-на Debian, поставьте ядро из Ubuntu [kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/),
-из Proxmox или соберите модули для ядра Debian вручную:
-
-```
-mkdir build
-cd build
-apt-get install linux-headers-`uname -r`
-apt-get build-dep linux-image-`uname -r`-unsigned
-apt-get source linux-image-`uname -r`-unsigned
-cd linux*/drivers/vdpa
-make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
-cat Module.symvers >> /lib/modules/`uname -r`/build/Module.symvers
-cd ../virtio
-make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
-depmod -a
-```
-
-Также вам понадобится консольная утилита `vdpa` из пакета `iproute2`.
+Чтобы использовать VDUSE, вам нужно ядро Linux версии хотя бы 5.15, собранное с поддержкой
+VDUSE (CONFIG_VIRTIO_VDPA=m и CONFIG_VDPA_USER=m). В ядрах в Debian Linux поддержка пока
+отключена - если хотите попробовать эту функцию на Debian, поставьте ядро из Ubuntu
+[kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/) или из Proxmox.

 Команды для подключения виртуального диска через VDUSE:

@@ -182,7 +119,7 @@ qemu-storage-daemon --daemonize --blockdev '{"node-name":"test1","driver":"vitas
 vdpa dev add name test1 mgmtdev vduse
 ```

-После этого в системе появится устройство `/dev/vda`, которое можно будет использовать как
+После этого в системе появится устройство /dev/vda, которое можно будет использовать как
 обычный диск.

 Для удаления устройства из системы:
--- a/mon/90-vitastor.rules
+++ b/mon/90-vitastor.rules
@@ -3,5 +3,5 @@ SUBSYSTEM=="block", ENV{ID_PART_ENTRY_TYPE}=="e7009fac-a5a1-4d72-af72-53de130599
    IMPORT{program}="/usr/bin/vitastor-disk udev $devnode", \
    SYMLINK+="vitastor/$env{VITASTOR_ALIAS}"

-ENV{VITASTOR_OSD_NUM}!="", ACTION=="add", RUN{program}+="/usr/bin/systemctl enable --now --no-block vitastor-osd@$env{VITASTOR_OSD_NUM}"
-ENV{VITASTOR_OSD_NUM}!="", ACTION=="remove", RUN{program}+="/usr/bin/systemctl disable --now --no-block vitastor-osd@$env{VITASTOR_OSD_NUM}"
+ENV{VITASTOR_OSD_NUM}!="", ACTION=="add", RUN{program}+="/usr/bin/systemctl enable --now vitastor-osd@$env{VITASTOR_OSD_NUM}"
+ENV{VITASTOR_OSD_NUM}!="", ACTION=="remove", RUN{program}+="/usr/bin/systemctl disable --now vitastor-osd@$env{VITASTOR_OSD_NUM}"
--- a/mon/mon.js
+++ b/mon/mon.js
@@ -99,7 +99,6 @@ const etcd_tree = {
            etcd_ws_keepalive_interval: 30, // seconds
            // osd
            etcd_report_interval: 5, // seconds
-            etcd_stats_interval: 30, // seconds
            run_primary: true,
            osd_network: null, // "192.168.7.0/24" or an array of masks
            bind_address: "0.0.0.0",
@@ -397,13 +396,12 @@ class Mon
        this.etcd_prefix = this.etcd_prefix.replace(/\/\/+/g, '/').replace(/^\/?(.*[^\/])\/?$/, '/$1');
        this.etcd_start_timeout = (config.etcd_start_timeout || 5) * 1000;
        this.state = JSON.parse(JSON.stringify(this.constructor.etcd_tree));
-        this.prev_stats = { osd_stats: {}, osd_diff: {} };
        this.signals_set = false;
+        this.stat_time = Date.now();
        this.ws = null;
        this.ws_alive = false;
        this.ws_keepalive_timer = null;
        this.on_stop_cb = () => this.on_stop(0).catch(console.error);
-        this.recheck_pgs_active = false;
    }

    parse_etcd_addresses(addrs)
@@ -553,9 +551,9 @@ class Mon
            const cur_addr = this.pick_next_etcd();
            const base = 'ws'+cur_addr.substr(4);
            let now = Date.now();
-            if (tried[base] && now-tried[base] < this.etcd_start_timeout)
+            if (tried[base] && now-tried[base] < timeout)
            {
-                await new Promise(ok => setTimeout(ok, this.etcd_start_timeout-(now-tried[base])));
+                await new Promise(ok => setTimeout(ok, timeout-(now-tried[base])));
                now = Date.now();
            }
            tried[base] = now;
@@ -693,27 +691,8 @@ class Mon
        });
    }

-    // Schedule save_last_clean() to to run after a small timeout (1s) (to not spam etcd)
-    schedule_save_last_clean()
-    {
-        if (!this.save_last_clean_timer)
-        {
-            this.save_last_clean_timer = setTimeout(() =>
-            {
-                this.save_last_clean_timer = null;
-                this.save_last_clean().catch(this.die);
-            }, this.config.mon_change_timeout || 1000);
-        }
-    }
-
    async save_last_clean()
    {
-        if (this.save_last_clean_running)
-        {
-            this.schedule_save_last_clean();
-            return;
-        }
-        this.save_last_clean_running = true;
        // last_clean_pgs is used to avoid extra data move when observing a series of changes in the cluster
        const new_clean_pgs = { items: {} };
    next_pool:
@@ -750,7 +729,6 @@ class Mon
                value: b64(JSON.stringify(this.state.history.last_clean_pgs))
            } } ],
        }, this.etcd_start_timeout, 0);
-        this.save_last_clean_running = false;
    }

    get_mon_state()
@@ -1224,12 +1202,6 @@ class Mon

    async recheck_pgs()
    {
-        if (this.recheck_pgs_active)
-        {
-            this.schedule_recheck();
-            return;
-        }
-        this.recheck_pgs_active = true;
        // Take configuration and state, check it against the stored configuration hash
        // Recalculate PGs and save them to etcd if the configuration is changed
        // FIXME: Do not change anything if the distribution is good and random enough and no PGs are degraded
@@ -1251,7 +1223,6 @@ class Mon
                    // Pool deleted. Delete all PGs, but first stop them.
                    if (!await this.stop_all_pgs(pool_id))
                    {
-                        this.recheck_pgs_active = false;
                        this.schedule_recheck();
                        return;
                    }
@@ -1320,16 +1291,9 @@ class Mon
                        // PG count changed. Need to bring all PGs down.
                        if (!await this.stop_all_pgs(pool_id))
                        {
-                            this.recheck_pgs_active = false;
                            this.schedule_recheck();
                            return;
                        }
-                    }
-                    if (prev_pgs.length != pool_cfg.pg_count)
-                    {
-                        // Scale PG count
-                        // Do it even if old_pg_count is already equal to pool_cfg.pg_count,
-                        // because last_clean_pgs may still contain the old number of PGs
                        const new_pg_history = [];
                        PGUtil.scale_pg_count(prev_pgs, real_prev_pgs, pg_history, new_pg_history, pool_cfg.pg_count);
                        pg_history = new_pg_history;
@@ -1431,7 +1395,6 @@ class Mon
                await this.save_pg_config(new_config_pgs);
            }
        }
-        this.recheck_pgs_active = false;
    }

    async save_pg_config(new_config_pgs, etcd_request = { compare: [], success: [] })
@@ -1481,6 +1444,7 @@ class Mon
    }

    // Schedule a recheck to run after a small timeout (1s)
+    // If already scheduled, cancel previous timer and schedule it again
    // This is required for multiple change events to trigger at most 1 recheck in 1s
    schedule_recheck()
    {
@@ -1494,15 +1458,15 @@ class Mon
        }
    }

-    derive_osd_stats(st, prev, prev_diff)
+    derive_osd_stats(st, prev)
    {
        const zero_stats = { op: { bps: 0n, iops: 0n, lat: 0n }, subop: { iops: 0n, lat: 0n }, recovery: { bps: 0n, iops: 0n } };
-        const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {}, inode_stats: {} };
-        if (!st || !st.time || !prev || !prev.time || prev.time >= st.time)
+        const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
+        if (!st || !st.time || prev && (prev.time || this.stat_time/1000) >= st.time)
        {
-            return prev_diff || diff;
+            return diff;
        }
-        const timediff = BigInt(st.time*1000 - prev.time*1000);
+        const timediff = BigInt(st.time*1000 - (prev && prev.time*1000 || this.stat_time));
        for (const op in st.op_stats||{})
        {
            const pr = prev && prev.op_stats && prev.op_stats[op];
@@ -1534,47 +1498,25 @@ class Mon
            if (n > 0)
                diff.recovery_stats[op] = { ...c, bps: b*1000n/timediff, iops: n*1000n/timediff };
        }
-        for (const pool_id in st.inode_stats||{})
-        {
-            const pool_diff = diff.inode_stats[pool_id] = {};
-            for (const inode_num in st.inode_stats[pool_id])
-            {
-                const inode_diff = diff.inode_stats[pool_id][inode_num] = {};
-                for (const op of [ 'read', 'write', 'delete' ])
-                {
-                    const c = st.inode_stats[pool_id][inode_num][op];
-                    const pr = prev && prev.inode_stats && prev.inode_stats[pool_id] &&
-                        prev.inode_stats[pool_id][inode_num] && prev.inode_stats[pool_id][inode_num][op];
-                    const n = BigInt(c.count||0) - BigInt(pr && pr.count||0);
-                    inode_diff[op] = {
-                        bps: (BigInt(c.bytes||0) - BigInt(pr && pr.bytes||0))*1000n/timediff,
-                        iops: n*1000n/timediff,
-                        lat: (BigInt(c.usec||0) - BigInt(pr && pr.usec||0))/(n || 1n),
-                    };
-                }
-            }
-        }
        return diff;
    }

-    sum_op_stats()
+    sum_op_stats(timestamp, prev_stats)
    {
-        for (const osd in this.state.osd.stats)
-        {
-            const cur = { ...this.state.osd.stats[osd], inode_stats: this.state.osd.inodestats[osd]||{} };
-            this.prev_stats.osd_diff[osd] = this.derive_osd_stats(
-                cur, this.prev_stats.osd_stats[osd], this.prev_stats.osd_diff[osd]
-            );
-            this.prev_stats.osd_stats[osd] = cur;
-        }
        const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
+        if (!prev_stats || prev_stats.timestamp >= timestamp)
+        {
+            return sum_diff;
+        }
+        const tm = BigInt(timestamp - (prev_stats.timestamp || 0));
        // Sum derived values instead of deriving summed
        for (const osd in this.state.osd.stats)
        {
-            const derived = this.prev_stats.osd_diff[osd];
-            for (const type in sum_diff)
+            const derived = this.derive_osd_stats(this.state.osd.stats[osd],
+                this.prev_stats && this.prev_stats.osd_stats && this.prev_stats.osd_stats[osd]);
+            for (const type in derived)
            {
-                for (const op in derived[type]||{})
+                for (const op in derived[type])
                {
                    for (const k in derived[type][op])
                    {
@@ -1631,14 +1573,14 @@ class Mon
        return { object_counts, object_bytes };
    }

-    sum_inode_stats()
+    sum_inode_stats(prev_stats, timestamp, prev_timestamp)
    {
        const inode_stats = {};
        const inode_stub = () => ({
            raw_used: 0n,
-            read: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
-            write: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
-            delete: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
+            read: { count: 0n, usec: 0n, bytes: 0n },
+            write: { count: 0n, usec: 0n, bytes: 0n },
+            delete: { count: 0n, usec: 0n, bytes: 0n },
        });
        const seen_pools = {};
        for (const pool_id in this.state.config.pools)
@@ -1690,25 +1632,11 @@ class Mon
                }
            }
        }
-        for (const osd in this.prev_stats.osd_diff)
+        if (prev_stats && prev_timestamp >= timestamp)
        {
-            for (const pool_id in this.prev_stats.osd_diff[osd].inode_stats)
-            {
-                for (const inode_num in this.prev_stats.osd_diff[osd].inode_stats[pool_id])
-                {
-                    inode_stats[pool_id][inode_num] = inode_stats[pool_id][inode_num] || inode_stub();
-                    for (const op of [ 'read', 'write', 'delete' ])
-                    {
-                        const op_diff = this.prev_stats.osd_diff[osd].inode_stats[pool_id][inode_num][op] || {};
-                        const op_st = inode_stats[pool_id][inode_num][op];
-                        op_st.bps += op_diff.bps;
-                        op_st.iops += op_diff.iops;
-                        op_st.lat += op_diff.lat;
-                        op_st.n_osd = (op_st.n_osd || 0) + 1;
-                    }
-                }
-            }
+            prev_stats = null;
        }
+        const tm = prev_stats ? BigInt(timestamp - prev_timestamp) : 0;
        for (const pool_id in inode_stats)
        {
            for (const inode_num in inode_stats[pool_id])
@@ -1717,12 +1645,11 @@ class Mon
                for (const op of [ 'read', 'write', 'delete' ])
                {
                    const op_st = inode_stats[pool_id][inode_num][op];
-                    if (op_st.n_osd)
-                    {
-                        op_st.lat /= BigInt(op_st.n_osd);
-                        delete op_st.n_osd;
-                    }
-                    if (op_st.bps > 0 || op_st.iops > 0)
+                    const prev_st = prev_stats && prev_stats[pool_id] && prev_stats[pool_id][inode_num] && prev_stats[pool_id][inode_num][op];
+                    op_st.bps = prev_st ? (op_st.bytes - prev_st.bytes) * 1000n / tm : 0;
+                    op_st.iops = prev_st ? (op_st.count - prev_st.count) * 1000n / tm : 0;
+                    op_st.lat = prev_st ? (op_st.usec - prev_st.usec) / ((op_st.count - prev_st.count) || 1n) : 0;
+                    if (op_st.bps > 0 || op_st.iops > 0 || op_st.lat > 0)
                        nonzero = true;
                }
                if (!nonzero && (!this.state.config.inode[pool_id] || !this.state.config.inode[pool_id][inode_num]))
@@ -1755,9 +1682,15 @@ class Mon
    async update_total_stats()
    {
        const txn = [];
+        const timestamp = Date.now();
        const { object_counts, object_bytes } = this.sum_object_counts();
-        let stats = this.sum_op_stats();
-        let { inode_stats, seen_pools } = this.sum_inode_stats();
+        let stats = this.sum_op_stats(timestamp, this.prev_stats);
+        let { inode_stats, seen_pools } = this.sum_inode_stats(
+            this.prev_stats ? this.prev_stats.inode_stats : null,
+            timestamp, this.prev_stats ? this.prev_stats.timestamp : null
+        );
+        this.prev_stats = { timestamp, inode_stats, osd_stats: { ...this.state.osd.stats } };
+        this.stat_time = Date.now();
        stats.object_counts = object_counts;
        stats.object_bytes = object_bytes;
        stats = this.serialize_bigints(stats);
--- a/mon/package.json
+++ b/mon/package.json
@@ -1,6 +1,6 @@
 {
  "name": "vitastor-mon",
-  "version": "1.3.1",
+  "version": "1.0.0",
  "description": "Vitastor SDS monitor service",
  "main": "mon-main.js",
  "scripts": {
--- a/patches/cinder-vitastor.py
+++ b/patches/cinder-vitastor.py
@@ -50,7 +50,7 @@ from cinder.volume import configuration
 from cinder.volume import driver
 from cinder.volume import volume_utils

-VERSION = '1.3.1'
+VERSION = '1.0.0'

 LOG = logging.getLogger(__name__)

--- a/patches/pve-qemu-8.1-vitastor.patch
+++ b/patches/pve-qemu-8.1-vitastor.patch
@@ -1,190 +0,0 @@
-Index: pve-qemu-kvm-8.1.2/block/meson.build
-===================================================================
--- pve-qemu-kvm-8.1.2.orig/block/meson.build
-+++ pve-qemu-kvm-8.1.2/block/meson.build
-@@ -123,6 +123,7 @@ foreach m : [
-   [libnfs, 'nfs', files('nfs.c')],
-   [libssh, 'ssh', files('ssh.c')],
-   [rbd, 'rbd', files('rbd.c')],
-+  [vitastor, 'vitastor', files('vitastor.c')],
- ]
-   if m[0].found()
-     module_ss = ss.source_set()
-Index: pve-qemu-kvm-8.1.2/meson.build
-===================================================================
--- pve-qemu-kvm-8.1.2.orig/meson.build
-+++ pve-qemu-kvm-8.1.2/meson.build
-@@ -1303,6 +1303,26 @@ if not get_option('rbd').auto() or have_
-   endif
- endif
- 
-+vitastor = not_found
-+if not get_option('vitastor').auto() or have_block
-+  libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
-+    required: get_option('vitastor'))
-+  if libvitastor_client.found()
-+    if cc.links('''
-+      #include <vitastor_c.h>
-+      int main(void) {
-+        vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
-+        return 0;
-+      }''', dependencies: libvitastor_client)
-+      vitastor = declare_dependency(dependencies: libvitastor_client)
-+    elif get_option('vitastor').enabled()
-+      error('could not link libvitastor_client')
-+    else
-+      warning('could not link libvitastor_client, disabling')
-+    endif
-+  endif
-+endif
-+
- glusterfs = not_found
- glusterfs_ftruncate_has_stat = false
- glusterfs_iocb_has_stat = false
-@@ -2123,6 +2143,7 @@ if numa.found()
- endif
- config_host_data.set('CONFIG_OPENGL', opengl.found())
- config_host_data.set('CONFIG_RBD', rbd.found())
-+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
- config_host_data.set('CONFIG_RDMA', rdma.found())
- config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
- config_host_data.set('CONFIG_SDL', sdl.found())
-@@ -4298,6 +4319,7 @@ summary_info += {'fdt support':       fd
- summary_info += {'libcap-ng support': libcap_ng}
- summary_info += {'bpf support':       libbpf}
- summary_info += {'rbd support':       rbd}
-+summary_info += {'vitastor support':  vitastor}
- summary_info += {'smartcard support': cacard}
- summary_info += {'U2F support':       u2f}
- summary_info += {'libusb':            libusb}
-Index: pve-qemu-kvm-8.1.2/meson_options.txt
-===================================================================
--- pve-qemu-kvm-8.1.2.orig/meson_options.txt
-+++ pve-qemu-kvm-8.1.2/meson_options.txt
-@@ -186,6 +186,8 @@ option('lzo', type : 'feature', value :
-        description: 'lzo compression support')
- option('rbd', type : 'feature', value : 'auto',
-        description: 'Ceph block device driver')
-+option('vitastor', type : 'feature', value : 'auto',
-+       description: 'Vitastor block device driver')
- option('opengl', type : 'feature', value : 'auto',
-        description: 'OpenGL support')
- option('rdma', type : 'feature', value : 'auto',
-Index: pve-qemu-kvm-8.1.2/qapi/block-core.json
-===================================================================
--- pve-qemu-kvm-8.1.2.orig/qapi/block-core.json
-+++ pve-qemu-kvm-8.1.2/qapi/block-core.json
-@@ -3403,7 +3403,7 @@
-             'raw', 'rbd',
-             { 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
-             'pbs',
-            'ssh', 'throttle', 'vdi', 'vhdx',
-+            'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
-             { 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
-             { 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
-             { 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
-@@ -4465,6 +4465,28 @@
-             '*server': ['InetSocketAddressBase'] } }
- 
- ##
-+# @BlockdevOptionsVitastor:
-+#
-+# Driver specific block device options for vitastor
-+#
-+# @image:       Image name
-+# @inode:       Inode number
-+# @pool:        Pool ID
-+# @size:        Desired image size in bytes
-+# @config-path: Path to Vitastor configuration
-+# @etcd-host:   etcd connection address(es)
-+# @etcd-prefix: etcd key/value prefix
-+##
-+{ 'struct': 'BlockdevOptionsVitastor',
-+  'data': { '*inode': 'uint64',
-+            '*pool': 'uint64',
-+            '*size': 'uint64',
-+            '*image': 'str',
-+            '*config-path': 'str',
-+            '*etcd-host': 'str',
-+            '*etcd-prefix': 'str' } }
-+
-+##
- # @ReplicationMode:
- #
- # An enumeration of replication modes.
-@@ -4923,6 +4945,7 @@
-       'throttle':   'BlockdevOptionsThrottle',
-       'vdi':        'BlockdevOptionsGenericFormat',
-       'vhdx':       'BlockdevOptionsGenericFormat',
-+      'vitastor':   'BlockdevOptionsVitastor',
-       'virtio-blk-vfio-pci':
-                     { 'type': 'BlockdevOptionsVirtioBlkVfioPci',
-                       'if': 'CONFIG_BLKIO' },
-@@ -5360,6 +5383,17 @@
-             '*encrypt' :        'RbdEncryptionCreateOptions' } }
- 
- ##
-+# @BlockdevCreateOptionsVitastor:
-+#
-+# Driver specific image creation options for Vitastor.
-+#
-+# @size: Size of the virtual disk in bytes
-+##
-+{ 'struct': 'BlockdevCreateOptionsVitastor',
-+  'data': { 'location':         'BlockdevOptionsVitastor',
-+            'size':             'size' } }
-+
-+##
- # @BlockdevVmdkSubformat:
- #
- # Subformat options for VMDK images
-@@ -5581,6 +5615,7 @@
-       'ssh':            'BlockdevCreateOptionsSsh',
-       'vdi':            'BlockdevCreateOptionsVdi',
-       'vhdx':           'BlockdevCreateOptionsVhdx',
-+      'vitastor':       'BlockdevCreateOptionsVitastor',
-       'vmdk':           'BlockdevCreateOptionsVmdk',
-       'vpc':            'BlockdevCreateOptionsVpc'
-   } }
-Index: pve-qemu-kvm-8.1.2/scripts/ci/org.centos/stream/8/x86_64/configure
-===================================================================
--- pve-qemu-kvm-8.1.2.orig/scripts/ci/org.centos/stream/8/x86_64/configure
-+++ pve-qemu-kvm-8.1.2/scripts/ci/org.centos/stream/8/x86_64/configure
-@@ -30,7 +30,7 @@
- --with-suffix="qemu-kvm" \
- --firmwarepath=/usr/share/qemu-firmware \
- --target-list="x86_64-softmmu" \
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
-+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
- --audio-drv-list="" \
- --block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
- --with-coroutine=ucontext \
-@@ -176,6 +176,7 @@
- --enable-opengl \
- --enable-pie \
- --enable-rbd \
-+--enable-vitastor \
- --enable-rdma \
- --enable-seccomp \
- --enable-snappy \
-Index: pve-qemu-kvm-8.1.2/scripts/meson-buildoptions.sh
-===================================================================
--- pve-qemu-kvm-8.1.2.orig/scripts/meson-buildoptions.sh
-+++ pve-qemu-kvm-8.1.2/scripts/meson-buildoptions.sh
-@@ -153,6 +153,7 @@ meson_options_help() {
-   printf "%s\n" '  qed             qed image format support'
-   printf "%s\n" '  qga-vss         build QGA VSS support (broken with MinGW)'
-   printf "%s\n" '  rbd             Ceph block device driver'
-+  printf "%s\n" '  vitastor        Vitastor block device driver'
-   printf "%s\n" '  rdma            Enable RDMA-based migration'
-   printf "%s\n" '  replication     replication support'
-   printf "%s\n" '  sdl             SDL user interface'
-@@ -416,6 +417,8 @@ _meson_option_parse() {
-     --disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
-     --enable-rbd) printf "%s" -Drbd=enabled ;;
-     --disable-rbd) printf "%s" -Drbd=disabled ;;
-+    --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
-+    --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
-     --enable-rdma) printf "%s" -Drdma=enabled ;;
-     --disable-rdma) printf "%s" -Drdma=disabled ;;
-     --enable-replication) printf "%s" -Dreplication=enabled ;;
--- a/patches/qemu-8.1-vitastor.patch
+++ b/patches/qemu-8.1-vitastor.patch
@@ -1,190 +0,0 @@
-diff --git a/block/meson.build b/block/meson.build
-index 529fc172c6..d542dc0609 100644
--- a/block/meson.build
-+++ b/block/meson.build
-@@ -110,6 +110,7 @@ foreach m : [
-   [libnfs, 'nfs', files('nfs.c')],
-   [libssh, 'ssh', files('ssh.c')],
-   [rbd, 'rbd', files('rbd.c')],
-+  [vitastor, 'vitastor', files('vitastor.c')],
- ]
-   if m[0].found()
-     module_ss = ss.source_set()
-diff --git a/meson.build b/meson.build
-index a9c4f28247..8496cf13f1 100644
--- a/meson.build
-+++ b/meson.build
-@@ -1303,6 +1303,26 @@ if not get_option('rbd').auto() or have_block
-   endif
- endif
- 
-+vitastor = not_found
-+if not get_option('vitastor').auto() or have_block
-+  libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
-+    required: get_option('vitastor'))
-+  if libvitastor_client.found()
-+    if cc.links('''
-+      #include <vitastor_c.h>
-+      int main(void) {
-+        vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
-+        return 0;
-+      }''', dependencies: libvitastor_client)
-+      vitastor = declare_dependency(dependencies: libvitastor_client)
-+    elif get_option('vitastor').enabled()
-+      error('could not link libvitastor_client')
-+    else
-+      warning('could not link libvitastor_client, disabling')
-+    endif
-+  endif
-+endif
-+
- glusterfs = not_found
- glusterfs_ftruncate_has_stat = false
- glusterfs_iocb_has_stat = false
-@@ -2119,6 +2139,7 @@ if numa.found()
- endif
- config_host_data.set('CONFIG_OPENGL', opengl.found())
- config_host_data.set('CONFIG_RBD', rbd.found())
-+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
- config_host_data.set('CONFIG_RDMA', rdma.found())
- config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
- config_host_data.set('CONFIG_SDL', sdl.found())
-@@ -4286,6 +4307,7 @@ summary_info += {'fdt support':       fdt_opt == 'disabled' ? false : fdt_opt}
- summary_info += {'libcap-ng support': libcap_ng}
- summary_info += {'bpf support':       libbpf}
- summary_info += {'rbd support':       rbd}
-+summary_info += {'vitastor support':  vitastor}
- summary_info += {'smartcard support': cacard}
- summary_info += {'U2F support':       u2f}
- summary_info += {'libusb':            libusb}
-diff --git a/meson_options.txt b/meson_options.txt
-index ae6d8f469d..e3d9f8404d 100644
--- a/meson_options.txt
-+++ b/meson_options.txt
-@@ -186,6 +186,8 @@ option('lzo', type : 'feature', value : 'auto',
-        description: 'lzo compression support')
- option('rbd', type : 'feature', value : 'auto',
-        description: 'Ceph block device driver')
-+option('vitastor', type : 'feature', value : 'auto',
-+       description: 'Vitastor block device driver')
- option('opengl', type : 'feature', value : 'auto',
-        description: 'OpenGL support')
- option('rdma', type : 'feature', value : 'auto',
-diff --git a/qapi/block-core.json b/qapi/block-core.json
-index 2b1d493d6e..90673fdbdc 100644
--- a/qapi/block-core.json
-+++ b/qapi/block-core.json
-@@ -3146,7 +3146,7 @@
-             'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum',
-             'raw', 'rbd',
-             { 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
-            'ssh', 'throttle', 'vdi', 'vhdx',
-+            'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
-             { 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
-             { 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
-             { 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
-@@ -4196,6 +4196,28 @@
-             '*key-secret': 'str',
-             '*server': ['InetSocketAddressBase'] } }
- 
-+##
-+# @BlockdevOptionsVitastor:
-+#
-+# Driver specific block device options for vitastor
-+#
-+# @image:       Image name
-+# @inode:       Inode number
-+# @pool:        Pool ID
-+# @size:        Desired image size in bytes
-+# @config-path: Path to Vitastor configuration
-+# @etcd-host:   etcd connection address(es)
-+# @etcd-prefix: etcd key/value prefix
-+##
-+{ 'struct': 'BlockdevOptionsVitastor',
-+  'data': { '*inode': 'uint64',
-+            '*pool': 'uint64',
-+            '*size': 'uint64',
-+            '*image': 'str',
-+            '*config-path': 'str',
-+            '*etcd-host': 'str',
-+            '*etcd-prefix': 'str' } }
-+
- ##
- # @ReplicationMode:
- #
-@@ -4654,6 +4676,7 @@
-       'throttle':   'BlockdevOptionsThrottle',
-       'vdi':        'BlockdevOptionsGenericFormat',
-       'vhdx':       'BlockdevOptionsGenericFormat',
-+      'vitastor':   'BlockdevOptionsVitastor',
-       'virtio-blk-vfio-pci':
-                     { 'type': 'BlockdevOptionsVirtioBlkVfioPci',
-                       'if': 'CONFIG_BLKIO' },
-@@ -5089,6 +5112,17 @@
-             '*cluster-size' :   'size',
-             '*encrypt' :        'RbdEncryptionCreateOptions' } }
- 
-+##
-+# @BlockdevCreateOptionsVitastor:
-+#
-+# Driver specific image creation options for Vitastor.
-+#
-+# @size: Size of the virtual disk in bytes
-+##
-+{ 'struct': 'BlockdevCreateOptionsVitastor',
-+  'data': { 'location':         'BlockdevOptionsVitastor',
-+            'size':             'size' } }
-+
- ##
- # @BlockdevVmdkSubformat:
- #
-@@ -5311,6 +5345,7 @@
-       'ssh':            'BlockdevCreateOptionsSsh',
-       'vdi':            'BlockdevCreateOptionsVdi',
-       'vhdx':           'BlockdevCreateOptionsVhdx',
-+      'vitastor':       'BlockdevCreateOptionsVitastor',
-       'vmdk':           'BlockdevCreateOptionsVmdk',
-       'vpc':            'BlockdevCreateOptionsVpc'
-   } }
-diff --git a/scripts/ci/org.centos/stream/8/x86_64/configure b/scripts/ci/org.centos/stream/8/x86_64/configure
-index d02b09a4b9..f0b5fbfef3 100755
--- a/scripts/ci/org.centos/stream/8/x86_64/configure
-+++ b/scripts/ci/org.centos/stream/8/x86_64/configure
-@@ -30,7 +30,7 @@
- --with-suffix="qemu-kvm" \
- --firmwarepath=/usr/share/qemu-firmware \
- --target-list="x86_64-softmmu" \
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
-+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
- --audio-drv-list="" \
- --block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
- --with-coroutine=ucontext \
-@@ -176,6 +176,7 @@
- --enable-opengl \
- --enable-pie \
- --enable-rbd \
-+--enable-vitastor \
- --enable-rdma \
- --enable-seccomp \
- --enable-snappy \
-diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
-index d7020af175..94958eb6fa 100644
--- a/scripts/meson-buildoptions.sh
-+++ b/scripts/meson-buildoptions.sh
-@@ -153,6 +153,7 @@ meson_options_help() {
-   printf "%s\n" '  qed             qed image format support'
-   printf "%s\n" '  qga-vss         build QGA VSS support (broken with MinGW)'
-   printf "%s\n" '  rbd             Ceph block device driver'
-+  printf "%s\n" '  vitastor        Vitastor block device driver'
-   printf "%s\n" '  rdma            Enable RDMA-based migration'
-   printf "%s\n" '  replication     replication support'
-   printf "%s\n" '  sdl             SDL user interface'
-@@ -416,6 +417,8 @@ _meson_option_parse() {
-     --disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
-     --enable-rbd) printf "%s" -Drbd=enabled ;;
-     --disable-rbd) printf "%s" -Drbd=disabled ;;
-+    --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
-+    --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
-     --enable-rdma) printf "%s" -Drdma=enabled ;;
-     --disable-rdma) printf "%s" -Drdma=disabled ;;
-     --enable-replication) printf "%s" -Dreplication=enabled ;;
--- a/rpm/build-tarball.sh
+++ b/rpm/build-tarball.sh
@@ -24,4 +24,4 @@ rm fio
 mv fio-copy fio
 FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
 perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
-tar --transform 's#^#vitastor-1.3.1/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.3.1$(rpm --eval '%dist').tar.gz *
+tar --transform 's#^#vitastor-1.0.0/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.0.0$(rpm --eval '%dist').tar.gz *
--- a/rpm/vitastor-el7.Dockerfile
+++ b/rpm/vitastor-el7.Dockerfile
@@ -15,7 +15,6 @@ RUN yumdownloader --disablerepo=centos-sclo-rh --source fio
 RUN rpm --nomd5 -i fio*.src.rpm
 RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
 RUN cd ~/rpmbuild/SPECS && yum-builddep -y fio.spec
-RUN yum -y install cmake3

 ADD https://vitastor.io/rpms/liburing-el7/liburing-0.7-2.el7.src.rpm /root

@@ -36,7 +35,7 @@ ADD . /root/vitastor
 RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
-    cp /root/vitastor-1.3.1.el7.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-1.0.0.el7.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
--- a/rpm/vitastor-el7.spec
+++ b/rpm/vitastor-el7.spec
@@ -1,11 +1,11 @@
 Name:           vitastor
-Version:        1.3.1
+Version:        1.0.0
 Release:        1%{?dist}
 Summary:        Vitastor, a fast software-defined clustered block storage

 License:        Vitastor Network Public License 1.1
 URL:            https://vitastor.io/
-Source0:        vitastor-1.3.1.el7.tar.gz
+Source0:        vitastor-1.0.0.el7.tar.gz

 BuildRequires:  liburing-devel >= 0.6
 BuildRequires:  gperftools-devel
@@ -16,7 +16,7 @@ BuildRequires:  jerasure-devel
 BuildRequires:  libisa-l-devel
 BuildRequires:  gf-complete-devel
 BuildRequires:  libibverbs-devel
-BuildRequires:  cmake3
+BuildRequires:  cmake
 Requires:       vitastor-osd = %{version}-%{release}
 Requires:       vitastor-mon = %{version}-%{release}
 Requires:       vitastor-client = %{version}-%{release}
@@ -94,7 +94,7 @@ Vitastor fio drivers for benchmarking.

 %build
 . /opt/rh/devtoolset-9/enable
-%cmake3 .
+%cmake .
 %make_build


--- a/rpm/vitastor-el8.Dockerfile
+++ b/rpm/vitastor-el8.Dockerfile
@@ -35,7 +35,7 @@ ADD . /root/vitastor
 RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
-    cp /root/vitastor-1.3.1.el8.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-1.0.0.el8.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
--- a/rpm/vitastor-el8.spec
+++ b/rpm/vitastor-el8.spec
@@ -1,11 +1,11 @@
 Name:           vitastor
-Version:        1.3.1
+Version:        1.0.0
 Release:        1%{?dist}
 Summary:        Vitastor, a fast software-defined clustered block storage

 License:        Vitastor Network Public License 1.1
 URL:            https://vitastor.io/
-Source0:        vitastor-1.3.1.el8.tar.gz
+Source0:        vitastor-1.0.0.el8.tar.gz

 BuildRequires:  liburing-devel >= 0.6
 BuildRequires:  gperftools-devel
--- a/rpm/vitastor-el9.Dockerfile
+++ b/rpm/vitastor-el9.Dockerfile
@@ -18,7 +18,7 @@ ADD . /root/vitastor
 RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
-    cp /root/vitastor-1.3.1.el9.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-1.0.0.el9.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
--- a/rpm/vitastor-el9.spec
+++ b/rpm/vitastor-el9.spec
@@ -1,11 +1,11 @@
 Name:           vitastor
-Version:        1.3.1
+Version:        1.0.0
 Release:        1%{?dist}
 Summary:        Vitastor, a fast software-defined clustered block storage

 License:        Vitastor Network Public License 1.1
 URL:            https://vitastor.io/
-Source0:        vitastor-1.3.1.el9.tar.gz
+Source0:        vitastor-1.0.0.el9.tar.gz

 BuildRequires:  liburing-devel >= 0.6
 BuildRequires:  gperftools-devel
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -16,11 +16,10 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
 	set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
 endif()

-add_definitions(-DVERSION="1.3.1")
-add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
-add_link_options(-fno-omit-frame-pointer)
+add_definitions(-DVERSION="1.0.0")
+add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
 if (${WITH_ASAN})
-	add_definitions(-fsanitize=address)
+	add_definitions(-fsanitize=address -fno-omit-frame-pointer)
 	add_link_options(-fsanitize=address -fno-omit-frame-pointer)
 endif (${WITH_ASAN})

--- a/src/blockstore.cpp
+++ b/src/blockstore.cpp
@@ -82,3 +82,8 @@ uint32_t blockstore_t::get_bitmap_granularity()
 {
    return impl->get_bitmap_granularity();
 }
+
+bool blockstore_t::wants_fsync() // thin wrapper around the implementation object
+{
+    return impl->wants_fsync(); // the implementation's answer is returned unchanged
+}
--- a/src/blockstore.h
+++ b/src/blockstore.h
@@ -226,4 +226,7 @@ public:
    uint64_t get_journal_size();

    uint32_t get_bitmap_granularity();
+
+    // Returns true if writing can stall due to a lack of fsync
+    bool wants_fsync();
 };
--- a/src/blockstore_flush.cpp
+++ b/src/blockstore_flush.cpp
@@ -1372,8 +1372,7 @@ bool journal_flusher_co::trim_journal(int wait_base)
                    ? (uint32_t)JE_START_V1_SIZE : (uint32_t)JE_START_V2_SIZE),
                .reserved = 0,
                .journal_start = new_trim_pos,
-                .version = (uint64_t)(!bs->dsk.data_csum_type && ((journal_entry_start*)flusher->journal_superblock)->version == JOURNAL_VERSION_V1
-                    ? JOURNAL_VERSION_V1 : JOURNAL_VERSION_V2),
+                .version = JOURNAL_VERSION_V2,
                .data_csum_type = bs->dsk.data_csum_type,
                .csum_block_size = bs->dsk.csum_block_size,
            };
--- a/src/blockstore_impl.cpp
+++ b/src/blockstore_impl.cpp
@@ -167,7 +167,7 @@ void blockstore_impl_t::loop()
                // wait for all big writes to complete, submit data device fsync
                // wait for the data device fsync to complete, then submit journal writes for big writes
                // then submit an fsync operation
-                if (has_writes)
+                if (0 && has_writes)
                {
                    // Can't submit SYNC before previous writes
                    continue;
@@ -734,3 +734,15 @@ void blockstore_impl_t::disk_error_abort(const char *op, int retval, int expecte
    fprintf(stderr, "Disk %s failed: result is %d, expected %d. Can't continue, sorry :-(\n", op, retval, expected);
    exit(1);
 }
+
+bool blockstore_impl_t::wants_fsync()
+{
+    if (unstable_writes.size() == 0 || journal_fsync_feedback_limit == 0)
+    {
+        return false; // nothing unstable, or feedback disabled by a zero limit
+    }
+    const uint64_t free_bytes = journal.next_free >= journal.used_start
+        ? (journal.len - journal.next_free + journal.used_start - journal.block_size)
+        : (journal.used_start - journal.next_free);
+    return (journal.len - free_bytes) >= journal_fsync_feedback_limit; // occupied bytes vs. limit
+}
--- a/src/blockstore_impl.h
+++ b/src/blockstore_impl.h
@@ -264,6 +264,8 @@ class blockstore_impl_t
    int throttle_threshold_us = 50;
    // Maximum writes between automatically added fsync operations
    uint64_t autosync_writes = 128;
+    // Occupied journal space in bytes (journal.len minus free space) at which to start sending fsync feedback to primary OSDs; 0 disables it
+    uint64_t journal_fsync_feedback_limit = 0;
    /******* END OF OPTIONS *******/

    struct ring_consumer_t ring_consumer;
@@ -274,7 +276,7 @@ class blockstore_impl_t
    blockstore_dirty_db_t dirty_db;
    std::vector<blockstore_op_t*> submit_queue;
    std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
-    int unsynced_big_write_count = 0, unstable_unsynced = 0;
+    int unsynced_big_write_count = 0;
    int unsynced_queued_ops = 0;
    allocator *data_alloc = NULL;
    uint8_t *zero_object;
@@ -433,4 +435,6 @@ public:
    inline uint64_t get_free_block_count() { return data_alloc->get_free_count(); }
    inline uint32_t get_bitmap_granularity() { return dsk.disk_alignment; }
    inline uint64_t get_journal_size() { return dsk.journal_len; }
+
+    bool wants_fsync();
 };
--- a/src/blockstore_init.cpp
+++ b/src/blockstore_init.cpp
@@ -553,7 +553,7 @@ resume_1:
        }
        if (je_start->size == JE_START_V0_SIZE ||
            (je_start->version != JOURNAL_VERSION_V1 || je_start->size != JE_START_V1_SIZE) &&
-            (je_start->version != JOURNAL_VERSION_V2 || je_start->size != JE_START_V2_SIZE && je_start->size != JE_START_V1_SIZE))
+            (je_start->version != JOURNAL_VERSION_V2 || je_start->size != JE_START_V2_SIZE))
        {
            fprintf(
                stderr, "The code only supports journal versions 2 and 1, but it is %lu on disk."
@@ -562,8 +562,7 @@ resume_1:
            );
            exit(1);
        }
-        if (je_start->version == JOURNAL_VERSION_V1 ||
-            je_start->version == JOURNAL_VERSION_V2 && je_start->size == JE_START_V1_SIZE)
+        if (je_start->version == JOURNAL_VERSION_V1)
        {
            je_start->data_csum_type = 0;
            je_start->csum_block_size = 0;
--- a/src/blockstore_journal.cpp
+++ b/src/blockstore_journal.cpp
@@ -144,10 +144,7 @@ journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type,
        journal.sector_info[journal.cur_sector].written = false;
        journal.sector_info[journal.cur_sector].offset = journal.next_free;
        journal.in_sector_pos = 0;
-        auto next_next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
-        // double check that next_free doesn't cross used_start from the left
-        assert(journal.next_free >= journal.used_start || next_next_free < journal.used_start);
-        journal.next_free = next_next_free;
+        journal.next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
        memset(journal.inmemory
            ? (uint8_t*)journal.buffer + journal.sector_info[journal.cur_sector].offset
            : (uint8_t*)journal.sector_buf + journal.block_size*journal.cur_sector, 0, journal.block_size);
--- a/src/blockstore_journal.h
+++ b/src/blockstore_journal.h
@@ -13,6 +13,12 @@
 #define JOURNAL_BUFFER_SIZE 4*1024*1024
 #define JOURNAL_ENTRY_HEADER_SIZE 16

+// We reserve some extra space for future stabilize requests during writes
+// FIXME: This value should be dynamic i.e. Blockstore ideally shouldn't allow
+// writing more than can be stabilized afterwards
+#define JOURNAL_STABILIZE_RESERVATION 65536
+#define JOURNAL_INSTANT_RESERVATION 131072
+
 // Journal entries
 // Journal entries are linked to each other by their crc32 value
 // The journal is almost a blockchain, because object versions constantly increase
--- a/src/blockstore_open.cpp
+++ b/src/blockstore_open.cpp
@@ -4,6 +4,25 @@
 #include <sys/file.h>
 #include "blockstore_impl.h"

+// Derive journal_fsync_feedback_limit from journal_min_free_bytes / journal_min_free_percent
+static uint64_t parse_fsync_feedback(blockstore_config_t & config, uint64_t journal_len)
+{
+    const bool has_bytes = config.find("journal_min_free_bytes") != config.end();
+    const bool has_percent = config.find("journal_min_free_percent") != config.end();
+    if (!has_bytes && !has_percent)
+    {
+        return 90 * journal_len / 100; // neither option set: default to 90% of journal_len
+    }
+    // A non-zero byte value takes precedence over the percentage
+    const uint64_t limit_bytes = strtoull(config["journal_min_free_bytes"].c_str(), NULL, 10);
+    if (limit_bytes)
+    {
+        return limit_bytes;
+    }
+    const uint64_t limit_percent = strtoull(config["journal_min_free_percent"].c_str(), NULL, 10);
+    return limit_percent * journal_len / 100;
+}
+
 void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
 {
    // Online-configurable options:
@@ -53,6 +72,8 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
    }
    if (!init)
    {
+        // has to be parsed after dsk.parse_config(), thus repeated here for online update
+        journal_fsync_feedback_limit = parse_fsync_feedback(config, journal.len);
        return;
    }
    // Offline-configurable options:
@@ -96,6 +117,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
        config["journal_no_same_sector_overwrites"] == "1" || config["journal_no_same_sector_overwrites"] == "yes";
    journal.inmemory = config["inmemory_journal"] != "false" && config["inmemory_journal"] != "0" &&
        config["inmemory_journal"] != "no";
+    journal_fsync_feedback_limit = parse_fsync_feedback(config, journal.len);
    // Validate
    if (journal.sector_count < 2)
    {
--- a/src/blockstore_sync.cpp
+++ b/src/blockstore_sync.cpp
@@ -16,7 +16,6 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
 {
    if (immediate_commit == IMMEDIATE_ALL)
    {
-        // We can return immediately because sync is only dequeued after all previous writes
        op->retval = 0;
        FINISH_OP(op);
        return 2;
@@ -86,15 +85,14 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
                auto & dirty_entry = dirty_db.at(sbw);
                uint64_t dyn_size = dsk.dirty_dyn_size(dirty_entry.offset, dirty_entry.len);
                if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
-                    (unstable_writes.size()+unstable_unsynced)*journal.block_size))
+                    left == 0 ? JOURNAL_STABILIZE_RESERVATION : 0))
                {
                    return 0;
                }
            }
        }
        else if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(),
-            sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size,
-            (unstable_writes.size()+unstable_unsynced)*journal.block_size))
+            sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, JOURNAL_STABILIZE_RESERVATION))
        {
            return 0;
        }
@@ -185,11 +183,6 @@ void blockstore_impl_t::ack_sync(blockstore_op_t *op)
        {
            mark_stable(dirty_it->first);
        }
-        else
-        {
-            unstable_unsynced--;
-            assert(unstable_unsynced >= 0);
-        }
        dirty_it++;
        while (dirty_it != dirty_db.end() && dirty_it->first.oid == it->oid)
        {
@@ -220,11 +213,6 @@ void blockstore_impl_t::ack_sync(blockstore_op_t *op)
            {
                mark_stable(*it);
            }
-            else
-            {
-                unstable_unsynced--;
-                assert(unstable_unsynced >= 0);
-            }
        }
    }
    op->retval = 0;
--- a/src/blockstore_write.cpp
+++ b/src/blockstore_write.cpp
@@ -21,7 +21,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
        dyn = calloc_or_die(1, dyn_size+sizeof(int));
        *((int*)dyn) = 1;
    }
-    uint8_t *dyn_ptr = (alloc_dyn_data ? (uint8_t*)dyn+sizeof(int) : (uint8_t*)&dyn);
+    uint8_t *dyn_ptr = (uint8_t*)(alloc_dyn_data ? dyn+sizeof(int) : &dyn);
    uint64_t version = 1;
    if (dirty_db.size() > 0)
    {
@@ -289,18 +289,13 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        printf("Restoring %lx:%lx version: v%lu -> v%lu\n", op->oid.inode, op->oid.stripe, op->version, PRIV(op)->real_version);
 #endif
        auto prev_it = dirty_it;
-        if (prev_it != dirty_db.begin())
+        prev_it--;
+        if (prev_it->first.oid == op->oid && prev_it->first.version >= PRIV(op)->real_version)
        {
-            prev_it--;
-            if (prev_it->first.oid == op->oid && prev_it->first.version >= PRIV(op)->real_version)
-            {
-                // Original version is still invalid
-                // All subsequent writes to the same object must be canceled too
-                printf("Tried to write %lx:%lx v%lu after delete (old version v%lu), but already have v%lu\n",
-                    op->oid.inode, op->oid.stripe, PRIV(op)->real_version, op->version, prev_it->first.version);
-                cancel_all_writes(op, dirty_it, -EEXIST);
-                return 2;
-            }
+            // Original version is still invalid
+            // All subsequent writes to the same object must be canceled too
+            cancel_all_writes(op, dirty_it, -EEXIST);
+            return 2;
        }
        op->version = PRIV(op)->real_version;
        PRIV(op)->real_version = 0;
@@ -320,7 +315,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        blockstore_journal_check_t space_check(this);
        if (!space_check.check_available(op, unsynced_big_write_count + 1,
            sizeof(journal_entry_big_write) + dsk.clean_dyn_size,
-            (unstable_writes.size()+unstable_unsynced)*journal.block_size))
+            (dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION))
        {
            return 0;
        }
@@ -386,10 +381,6 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
            sqe, dsk.data_fd, PRIV(op)->iov_zerofill, vcnt, dsk.data_offset + (loc << dsk.block_order) + op->offset - stripe_offset
        );
        PRIV(op)->pending_ops = 1;
-        if (!(dirty_it->second.state & BS_ST_INSTANT))
-        {
-            unstable_unsynced++;
-        }
        if (immediate_commit != IMMEDIATE_ALL)
        {
            // Increase the counter, but don't save into unsynced_writes yet (can't sync until the write is finished)
@@ -412,7 +403,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
                sizeof(journal_entry_big_write) + dsk.clean_dyn_size, 0)
            || !space_check.check_available(op, 1,
                sizeof(journal_entry_small_write) + dyn_size,
-                op->len + (unstable_writes.size()+unstable_unsynced)*journal.block_size))
+                op->len + ((dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION)))
        {
            return 0;
        }
@@ -462,8 +453,6 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
                exit(1);
            }
        }
-        // double check that next_free doesn't cross used_start from the left
-        assert(journal.next_free >= journal.used_start || next_next_free < journal.used_start);
        journal.next_free = next_next_free;
        je->oid = op->oid;
        je->version = op->version;
@@ -501,15 +490,10 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        }
        dirty_it->second.location = journal.next_free;
        dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SUBMITTED;
-        next_next_free = journal.next_free + op->len;
-        if (next_next_free >= journal.len)
-            next_next_free = dsk.journal_block_size;
-        // double check that next_free doesn't cross used_start from the left
-        assert(journal.next_free >= journal.used_start || next_next_free < journal.used_start);
-        journal.next_free = next_next_free;
-        if (!(dirty_it->second.state & BS_ST_INSTANT))
+        journal.next_free += op->len;
+        if (journal.next_free >= journal.len)
        {
-            unstable_unsynced++;
+            journal.next_free = dsk.journal_block_size;
        }
        if (!PRIV(op)->pending_ops)
        {
@@ -549,7 +533,7 @@ resume_2:
        uint64_t dyn_size = dsk.dirty_dyn_size(op->offset, op->len);
        blockstore_journal_check_t space_check(this);
        if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
-            (unstable_writes.size()+unstable_unsynced)*journal.block_size))
+            ((dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION)))
        {
            return 0;
        }
@@ -593,20 +577,14 @@ resume_4:
 #endif
        bool is_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE;
        bool imm = is_big ? (immediate_commit == IMMEDIATE_ALL) : (immediate_commit != IMMEDIATE_NONE);
-        bool is_instant = ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT));
        if (imm)
        {
            auto & unstab = unstable_writes[op->oid];
            unstab = unstab < op->version ? op->version : unstab;
-            if (!is_instant)
-            {
-                unstable_unsynced--;
-                assert(unstable_unsynced >= 0);
-            }
        }
        dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK)
            | (imm ? BS_ST_SYNCED : BS_ST_WRITTEN);
-        if (imm && is_instant)
+        if (imm && ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT)))
        {
            // Deletions and 'instant' operations are treated as immediately stable
            mark_stable(dirty_it->first);
@@ -752,7 +730,7 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
    });
    assert(dirty_it != dirty_db.end());
    blockstore_journal_check_t space_check(this);
-    if (!space_check.check_available(op, 1, sizeof(journal_entry_del), (unstable_writes.size()+unstable_unsynced)*journal.block_size))
+    if (!space_check.check_available(op, 1, sizeof(journal_entry_del), JOURNAL_INSTANT_RESERVATION))
    {
        return 0;
    }
--- a/src/cli.cpp
+++ b/src/cli.cpp
@@ -17,7 +17,7 @@
 static const char *exe_name = NULL;

 static const char* help_text =
-    "Vitastor command-line tool " VERSION "\n"
+    "Vitastor command-line tool\n"
    "(c) Vitaliy Filippov, 2019+ (VNPL-1.1)\n"
    "\n"
    "COMMANDS:\n"
@@ -116,8 +116,7 @@ static const char* help_text =
    "Use vitastor-cli --help <command> for command details or vitastor-cli --help --all for all details.\n"
    "\n"
    "GLOBAL OPTIONS:\n"
-    "  --config_file FILE  Path to Vitastor configuration file\n"
-    "  --etcd_address URL  Etcd connection address\n"
+    "  --etcd_address <etcd_address>\n"
    "  --iodepth N         Send N operations in parallel to each OSD when possible (default 32)\n"
    "  --parallel_osds M   Work with M osds in parallel when possible (default 4)\n"
    "  --progress 1|0      Report progress (default 1)\n"
@@ -332,7 +331,7 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
    {
        // Create client
        json11::Json cfg_j = cfg;
-        p->ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
+        p->ringloop = new ring_loop_t(512);
        p->epmgr = new epoll_manager_t(p->ringloop);
        p->cli = new cluster_client_t(p->ringloop, p->epmgr->tfd, cfg_j);
        // Smaller timeout by default for more interactiveness
--- a/src/cli_df.cpp
+++ b/src/cli_df.cpp
@@ -109,7 +109,7 @@ resume_1:
            }
            for (auto pg_per_pair: pg_per_osd)
            {
-                uint64_t pg_free = osd_free[pg_per_pair.first] * pool_cfg.real_pg_count / pg_per_pair.second;
+                uint64_t pg_free = osd_free[pg_per_pair.first] * pool_cfg.pg_count / pg_per_pair.second;
                if (pool_avail > pg_free)
                {
                    pool_avail = pg_free;
@@ -124,10 +124,8 @@ resume_1:
                pool_avail *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
            }
            pool_stats[pool_cfg.id] = json11::Json::object {
-                { "id", (uint64_t)pool_cfg.id },
                { "name", pool_cfg.name },
                { "pg_count", pool_cfg.pg_count },
-                { "real_pg_count", pool_cfg.real_pg_count },
                { "scheme", pool_cfg.scheme == POOL_SCHEME_REPLICATED ? "replicated" : "ec" },
                { "scheme_name", pool_cfg.scheme == POOL_SCHEME_REPLICATED
                    ? std::to_string(pool_cfg.pg_size)+"/"+std::to_string(pool_cfg.pg_minsize)
@@ -178,7 +176,7 @@ resume_1:
            { "title", "SCHEME" },
        });
        cols.push_back(json11::Json::object{
-            { "key", "pg_count_fmt" },
+            { "key", "pg_count" },
            { "title", "PGS" },
        });
        cols.push_back(json11::Json::object{
@@ -207,9 +205,6 @@ resume_1:
            double raw_to = kv.second["raw_to_usable"].number_value();
            if (raw_to < 0.000001 && raw_to > -0.000001)
                raw_to = 1;
-            kv.second["pg_count_fmt"] = kv.second["real_pg_count"] == kv.second["pg_count"]
-                ? kv.second["real_pg_count"].as_string()
-                : kv.second["real_pg_count"].as_string()+"->"+kv.second["pg_count"].as_string();
            kv.second["total_fmt"] = format_size(kv.second["total_raw"].uint64_value() / raw_to);
            kv.second["used_fmt"] = format_size(kv.second["used_raw"].uint64_value() / raw_to);
            kv.second["max_avail_fmt"] = format_size(kv.second["max_available"].uint64_value());
--- a/src/cli_ls.cpp
+++ b/src/cli_ls.cpp
@@ -174,7 +174,7 @@ resume_1:
                    { "size", 0 },
                    { "readonly", false },
                    { "pool_id", (uint64_t)INODE_POOL(inode_num) },
-                    { "pool_name", pool_it != parent->cli->st_cli.pool_config.end()
+                    { "pool_name", pool_it == parent->cli->st_cli.pool_config.end()
                        ? (pool_it->second.name == "" ? "<Unnamed>" : pool_it->second.name) : "?" },
                    { "inode_num", INODE_NO_POOL(inode_num) },
                    { "inode_id", inode_num },
--- a/src/cli_status.cpp
+++ b/src/cli_status.cpp
@@ -158,7 +158,12 @@ resume_2:
        for (auto & pool_pair: parent->cli->st_cli.pool_config)
        {
            auto & pool_cfg = pool_pair.second;
-            bool active = pool_cfg.real_pg_count > 0;
+            bool active = true;
+            if (pool_cfg.pg_config.size() != pool_cfg.pg_count)
+            {
+                active = false;
+                pgs_by_state["offline"] += pool_cfg.pg_count-pool_cfg.pg_config.size();
+            }
            pool_count++;
            for (auto pg_it = pool_cfg.pg_config.begin(); pg_it != pool_cfg.pg_config.end(); pg_it++)
            {
--- a/src/cluster_client.cpp
+++ b/src/cluster_client.cpp
@@ -64,7 +64,7 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd

 cluster_client_t::~cluster_client_t()
 {
-    msgr.repeer_pgs = [](osd_num_t){};
+    msgr.repeer_pgs = [this](osd_num_t){};
    if (ringloop)
    {
        ringloop->unregister_consumer(&consumer);
@@ -169,52 +169,46 @@ void cluster_client_t::calc_wait(cluster_op_t *op)

 void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *next, int inc)
 {
-    if (opcode != OSD_OP_WRITE && opcode != OSD_OP_SYNC)
+    if (opcode == OSD_OP_WRITE) // Adds `inc` to prev_wait of matching ops chained via ->next, starting at `next`; opcodes other than WRITE/SYNC change nothing
    {
-        return;
-    }
-    cluster_op_t *bh_ops_local[32], **bh_ops = bh_ops_local;
-    int bh_op_count = 0, bh_op_max = 32;
-    while (next)
-    {
-        auto n2 = next->next;
-        if (opcode == OSD_OP_WRITE
-            ? (next->opcode == OSD_OP_SYNC && (!(flags & OP_IMMEDIATE_COMMIT) || enable_writeback) ||
-                next->opcode == OSD_OP_WRITE && (flags & OP_FLUSH_BUFFER) && !(next->flags & OP_FLUSH_BUFFER))
-            : (next->opcode == OSD_OP_SYNC || next->opcode == OSD_OP_WRITE))
+        while (next)
        {
-            next->prev_wait += inc;
-            assert(next->prev_wait >= 0);
-            if (!next->prev_wait)
+            auto n2 = next->next; // successor is read before `next` may be resumed below
+            if (next->opcode == OSD_OP_SYNC && (!(flags & OP_IMMEDIATE_COMMIT) || enable_writeback) || // a later SYNC is affected unless flags has OP_IMMEDIATE_COMMIT and enable_writeback is off
+                next->opcode == OSD_OP_WRITE && (flags & OP_FLUSH_BUFFER) && !(next->flags & OP_FLUSH_BUFFER)) // a later WRITE is affected only when flags has OP_FLUSH_BUFFER and that write does not
            {
-                // Kind of std::vector with local "small vector optimisation"
-                if (bh_op_count >= bh_op_max)
+                next->prev_wait += inc; // inc is a signed delta
+                assert(next->prev_wait >= 0);
+                if (!next->prev_wait) // counter reached zero: resume the op immediately
                {
-                    bh_op_max *= 2;
-                    cluster_op_t **n = (cluster_op_t**)malloc_or_die(sizeof(cluster_op_t*) * bh_op_max);
-                    memcpy(n, bh_ops, sizeof(cluster_op_t*) * bh_op_count);
-                    if (bh_ops != bh_ops_local)
-                    {
-                        free(bh_ops);
-                    }
-                    bh_ops = n;
+                    if (next->opcode == OSD_OP_SYNC)
+                        continue_sync(next); // NOTE(review): runs while the chain is still being walked and only n2 was saved; the removed code deferred these calls until after the walk - confirm n2 cannot be unlinked or freed here
+                    else
+                        continue_rw(next);
                }
-                bh_ops[bh_op_count++] = next;
            }
+            next = n2;
        }
-        next = n2;
    }
-    for (int i = 0; i < bh_op_count; i++)
+    else if (opcode == OSD_OP_SYNC) // SYNC case: every later SYNC and WRITE is affected, regardless of flags
    {
-        cluster_op_t *next = bh_ops[i];
-        if (next->opcode == OSD_OP_SYNC)
-            continue_sync(next);
-        else
-            continue_rw(next);
-    }
-    if (bh_ops != bh_ops_local)
-    {
-        free(bh_ops);
+        while (next)
+        {
+            auto n2 = next->next; // successor is read before `next` may be resumed below
+            if (next->opcode == OSD_OP_SYNC || next->opcode == OSD_OP_WRITE)
+            {
+                next->prev_wait += inc;
+                assert(next->prev_wait >= 0);
+                if (!next->prev_wait) // counter reached zero: resume the op immediately
+                {
+                    if (next->opcode == OSD_OP_SYNC)
+                        continue_sync(next); // NOTE(review): same in-walk resumption as in the WRITE branch - confirm it is safe
+                    else
+                        continue_rw(next);
+                }
+            }
+            next = n2;
+        }
    }
 }

@@ -454,7 +448,7 @@ bool cluster_client_t::flush()
            wb->start_writebacks(this, 0);
            cluster_op_t *sync = new cluster_op_t;
            sync->opcode = OSD_OP_SYNC;
-            sync->callback = [](cluster_op_t *sync)
+            sync->callback = [this](cluster_op_t *sync)
            {
                delete sync;
            };
@@ -465,7 +459,7 @@ bool cluster_client_t::flush()
    bool sync_done = false;
    cluster_op_t *sync = new cluster_op_t;
    sync->opcode = OSD_OP_SYNC;
-    sync->callback = [&sync_done](cluster_op_t *sync)
+    sync->callback = [this, &sync_done](cluster_op_t *sync)
    {
        delete sync;
        sync_done = true;
--- a/src/cluster_client_wb.cpp
+++ b/src/cluster_client_wb.cpp
@@ -263,7 +263,7 @@ void writeback_cache_t::flush_buffers(cluster_client_t *cli, dirty_buf_it_t from
    }
    assert(calc_len == op->len);
    writebacks_active++;
-    op->callback = [this, flush_id](cluster_op_t* op)
+    op->callback = [this, cli, flush_id](cluster_op_t* op)
    {
        // Buffer flushes should be always retried, regardless of the error,
        // so they should never result in an error here
@@ -383,7 +383,7 @@ static void copy_to_op(cluster_op_t *op, uint64_t offset, uint8_t *buf, uint64_t
            auto begin = (cur_offset < offset ? offset : cur_offset);
            auto end = (cur_offset+v.iov_len > offset+len ? offset+len : cur_offset+v.iov_len);
            memcpy(
-                (uint8_t*)v.iov_base + begin - cur_offset,
+                v.iov_base + begin - cur_offset,
                buf + (cur_offset <= offset ? 0 : cur_offset-offset),
                end - begin
            );
--- a/src/disk_tool.cpp
+++ b/src/disk_tool.cpp
@@ -5,7 +5,7 @@
 #include "str_util.h"

 static const char *help_text =
-    "Vitastor disk management tool " VERSION "\n"
+    "Vitastor disk management tool\n"
    "(c) Vitaliy Filippov, 2022+ (VNPL-1.1)\n"
    "\n"
    "COMMANDS:\n"
@@ -229,7 +229,7 @@ int main(int argc, char *argv[])
        {
            self.options["allow_data_loss"] = "1";
        }
-        else if (argv[i][0] == '-' && argv[i][1] == '-' && i < argc-1)
+        else if (argv[i][0] == '-' && argv[i][1] == '-')
        {
            char *key = argv[i]+2;
            self.options[key] = argv[++i];
--- a/src/disk_tool_journal.cpp
+++ b/src/disk_tool_journal.cpp
@@ -320,7 +320,7 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
        if (journal_calc_data_pos != sw.data_offset)
        {
            printf(json ? ",\"bad_loc\":true,\"calc_loc\":\"0x%lx\""
-                : " (mismatched, calculated = %08lx)", journal_pos);
+                : " (mismatched, calculated = %lu)", journal_pos);
        }
        uint32_t data_csum_size = (!je_start.csum_block_size
            ? 0
--- a/src/disk_tool_resize.cpp
+++ b/src/disk_tool_resize.cpp
@@ -245,7 +245,7 @@ int disk_tool_t::resize_copy_data()
    {
        iodepth = 32;
    }
-    ringloop = new ring_loop_t(iodepth < RINGLOOP_DEFAULT_SIZE ? RINGLOOP_DEFAULT_SIZE : iodepth);
+    ringloop = new ring_loop_t(iodepth < 512 ? 512 : iodepth);
    dsk.data_fd = open(dsk.data_device.c_str(), O_DIRECT|O_RDWR);
    if (dsk.data_fd < 0)
    {
--- a/src/epoll_manager.cpp
+++ b/src/epoll_manager.cpp
@@ -23,24 +23,19 @@ epoll_manager_t::epoll_manager_t(ring_loop_t *ringloop)

    tfd = new timerfd_manager_t([this](int fd, bool wr, std::function<void(int, int)> handler) { set_fd_handler(fd, wr, handler); });

-    if (ringloop)
+    consumer.loop = [this]()
    {
-        consumer.loop = [this]()
-        {
-            if (pending)
-                handle_uring_event();
-        };
-        ringloop->register_consumer(&consumer);
-        handle_uring_event();
-    }
+        if (pending)
+            handle_epoll_events();
+    };
+    ringloop->register_consumer(&consumer);
+
+    handle_epoll_events();
 }

 epoll_manager_t::~epoll_manager_t()
 {
-    if (ringloop)
-    {
-        ringloop->unregister_consumer(&consumer);
-    }
+    ringloop->unregister_consumer(&consumer);
    if (tfd)
    {
        delete tfd;
@@ -49,11 +44,6 @@ epoll_manager_t::~epoll_manager_t()
    close(epoll_fd);
 }

-int epoll_manager_t::get_fd()
-{
-    return epoll_fd;
-}
-
 void epoll_manager_t::set_fd_handler(int fd, bool wr, std::function<void(int, int)> handler)
 {
    if (handler != NULL)
@@ -85,7 +75,7 @@ void epoll_manager_t::set_fd_handler(int fd, bool wr, std::function<void(int, in
    }
 }

-void epoll_manager_t::handle_uring_event()
+void epoll_manager_t::handle_epoll_events()
 {
    io_uring_sqe *sqe = ringloop->get_sqe();
    if (!sqe)
@@ -105,20 +95,14 @@ void epoll_manager_t::handle_uring_event()
        {
            throw std::runtime_error(std::string("epoll failed: ") + strerror(-data->res));
        }
-        handle_uring_event();
+        handle_epoll_events();
    };
    ringloop->submit();
-    handle_events(0);
-}
-
-void epoll_manager_t::handle_events(int timeout)
-{
    int nfds;
    epoll_event events[MAX_EPOLL_EVENTS];
    do
    {
-        nfds = epoll_wait(epoll_fd, events, MAX_EPOLL_EVENTS, timeout);
-        timeout = 0;
+        nfds = epoll_wait(epoll_fd, events, MAX_EPOLL_EVENTS, 0);
        for (int i = 0; i < nfds; i++)
        {
            auto cb_it = epoll_handlers.find(events[i].data.fd);
--- a/src/epoll_manager.h
+++ b/src/epoll_manager.h
@@ -15,14 +15,11 @@ class epoll_manager_t
    ring_consumer_t consumer;
    ring_loop_t *ringloop;
    std::map<int, std::function<void(int, int)>> epoll_handlers;
-
-    void handle_uring_event();
 public:
    epoll_manager_t(ring_loop_t *ringloop);
    ~epoll_manager_t();
-    int get_fd();
    void set_fd_handler(int fd, bool wr, std::function<void(int, int)> handler);
-    void handle_events(int timeout);
+    void handle_epoll_events();

    timerfd_manager_t *tfd;
 };
--- a/src/fio_cluster.cpp
+++ b/src/fio_cluster.cpp
@@ -32,7 +32,6 @@
 struct sec_data
 {
    vitastor_c *cli = NULL;
-    bool epoll_based = false;
    void *watch = NULL;
    bool last_sync = false;
    /* The list of completed io_u structs. */
@@ -59,7 +58,6 @@ struct sec_options
    int rdma_port_num = 0;
    int rdma_gid_index = 0;
    int rdma_mtu = 0;
-    int no_io_uring = 0;
 };

 static struct fio_option options[] = {
@@ -195,16 +193,6 @@ static struct fio_option options[] = {
        .category = FIO_OPT_C_ENGINE,
        .group  = FIO_OPT_G_FILENAME,
    },
-    {
-        .name   = "no_io_uring",
-        .lname  = "Disable io_uring",
-        .type   = FIO_OPT_BOOL,
-        .off1   = offsetof(struct sec_options, no_io_uring),
-        .help   = "Use epoll and plain sendmsg/recvmsg instead of io_uring (slower)",
-        .def    = "0",
-        .category = FIO_OPT_C_ENGINE,
-        .group  = FIO_OPT_G_FILENAME,
-    },
    {
        .name = NULL,
    },
@@ -293,17 +281,7 @@ static int sec_setup(struct thread_data *td)
        opt_push(options, "log_level", std::to_string(o->cluster_log).c_str());
    // allow writeback caching if -direct is not set
    opt_push(options, "client_writeback_allowed", td->o.odirect ? "0" : "1");
-    bsd->cli = o->no_io_uring ? NULL : vitastor_c_create_uring_json((const char**)options.data(), options.size());
-    bsd->epoll_based = false;
-    if (!bsd->cli)
-    {
-        if (o->no_io_uring)
-            fprintf(stderr, "vitastor: io_uring disabled - I/O will be slower\n");
-        else
-            fprintf(stderr, "vitastor: failed to create io_uring: %s - I/O will be slower\n", strerror(errno));
-        bsd->cli = vitastor_c_create_epoll_json((const char**)options.data(), options.size());
-        bsd->epoll_based = true;
-    }
+    bsd->cli = vitastor_c_create_uring_json((const char**)options.data(), options.size());
    for (auto opt: options)
        free(opt);
    options.clear();
@@ -311,24 +289,12 @@ static int sec_setup(struct thread_data *td)
    {
        bsd->watch = NULL;
        vitastor_c_watch_inode(bsd->cli, o->image, watch_callback, bsd);
-        if (!bsd->epoll_based)
+        while (true)
        {
-            while (true)
-            {
-                vitastor_c_uring_handle_events(bsd->cli);
-                if (bsd->watch)
-                    break;
-                vitastor_c_uring_wait_events(bsd->cli);
-            }
-        }
-        else
-        {
-            while (true)
-            {
-                if (bsd->watch)
-                    break;
-                vitastor_c_epoll_handle_events(bsd->cli, 1000);
-            }
+            vitastor_c_uring_handle_events(bsd->cli);
+            if (bsd->watch)
+                break;
+            vitastor_c_uring_wait_events(bsd->cli);
        }
        td->files[0]->real_file_size = vitastor_c_inode_get_size(bsd->watch);
        if (!vitastor_c_inode_get_num(bsd->watch) ||
@@ -471,24 +437,12 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
 static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int max, const struct timespec *t)
 {
    sec_data *bsd = (sec_data*)td->io_ops_data;
-    if (!bsd->epoll_based)
+    while (true) // loop until bsd->completed holds at least `min` entries; `max` and `t` are not consulted
    {
-        while (true)
-        {
-            vitastor_c_uring_handle_events(bsd->cli);
-            if (bsd->completed.size() >= min)
-                break;
-            vitastor_c_uring_wait_events(bsd->cli);
-        }
-    }
-    else
-    {
-        while (true)
-        {
-            if (bsd->completed.size() >= min)
-                break;
-            vitastor_c_epoll_handle_events(bsd->cli, 1000);
-        }
+        vitastor_c_uring_handle_events(bsd->cli); // presumably runs the callbacks that append to bsd->completed - confirm
+        if (bsd->completed.size() >= min)
+            break;
+        vitastor_c_uring_wait_events(bsd->cli); // not enough completions yet; presumably blocks until new ring events arrive - confirm
    }
    return bsd->completed.size();
 }
--- a/src/fio_engine.cpp
+++ b/src/fio_engine.cpp
@@ -130,7 +130,7 @@ static int bs_init(struct thread_data *td)
                config[p.first] = p.second.dump();
        }
    }
-    bsd->ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
+    bsd->ringloop = new ring_loop_t(512);
    bsd->epmgr = new epoll_manager_t(bsd->ringloop);
    bsd->bs = new blockstore_t(config, bsd->ringloop, bsd->epmgr->tfd);
    while (1)
--- a/src/messenger.cpp
+++ b/src/messenger.cpp
@@ -22,7 +22,7 @@ void osd_messenger_t::init()
    {
        rdma_context = msgr_rdma_context_t::create(
            rdma_device != "" ? rdma_device.c_str() : NULL,
-            rdma_port_num, rdma_gid_index, rdma_mtu, rdma_odp, log_level
+            rdma_port_num, rdma_gid_index, rdma_mtu, log_level
        );
        if (!rdma_context)
        {
@@ -167,7 +167,6 @@ void osd_messenger_t::parse_config(const json11::Json & config)
    this->rdma_max_msg = config["rdma_max_msg"].uint64_value();
    if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
        this->rdma_max_msg = 129*1024;
-    this->rdma_odp = config["rdma_odp"].bool_value();
 #endif
    this->receive_buffer_size = (uint32_t)config["tcp_header_buffer_size"].uint64_value();
    if (!this->receive_buffer_size || this->receive_buffer_size > 1024*1024*1024)
@@ -396,24 +395,27 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
            },
        },
    };
+    json11::Json::object payload;
+    if (this->osd_num)
+    {
+        payload["osd_num"] = this->osd_num;
+    }
 #ifdef WITH_RDMA
    if (rdma_context)
    {
        cl->rdma_conn = msgr_rdma_connection_t::create(rdma_context, rdma_max_send, rdma_max_recv, rdma_max_sge, rdma_max_msg);
        if (cl->rdma_conn)
        {
-            json11::Json payload = json11::Json::object {
-                { "connect_rdma", cl->rdma_conn->addr.to_string() },
-                { "rdma_max_msg", cl->rdma_conn->max_msg },
-            };
-            std::string payload_str = payload.dump();
-            op->req.show_conf.json_len = payload_str.size();
-            op->buf = malloc_or_die(payload_str.size());
-            op->iov.push_back(op->buf, payload_str.size());
-            memcpy(op->buf, payload_str.c_str(), payload_str.size());
+            payload["connect_rdma"] = cl->rdma_conn->addr.to_string();
+            payload["rdma_max_msg"] = cl->rdma_conn->max_msg;
        }
    }
 #endif
+    std::string payload_str = json11::Json(payload).dump();
+    op->req.show_conf.json_len = payload_str.size();
+    op->buf = malloc_or_die(payload_str.size());
+    op->iov.push_back(op->buf, payload_str.size());
+    memcpy(op->buf, payload_str.c_str(), payload_str.size());
    op->callback = [this, cl](osd_op_t *op)
    {
        std::string json_err;
@@ -491,14 +493,7 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
                    fprintf(stderr, "Connected to OSD %lu using RDMA\n", cl->osd_num);
                }
                cl->peer_state = PEER_RDMA;
-                tfd->set_fd_handler(cl->peer_fd, false, [this](int peer_fd, int epoll_events)
-                {
-                    // Do not miss the disconnection!
-                    if (epoll_events & EPOLLRDHUP)
-                    {
-                        handle_peer_epoll(peer_fd, epoll_events);
-                    }
-                });
+                tfd->set_fd_handler(cl->peer_fd, false, NULL);
                // Add the initial receive request
                try_recv_rdma(cl);
            }
--- a/src/messenger.h
+++ b/src/messenger.h
@@ -131,7 +131,6 @@ protected:
    msgr_rdma_context_t *rdma_context = NULL;
    uint64_t rdma_max_sge = 0, rdma_max_send = 0, rdma_max_recv = 0;
    uint64_t rdma_max_msg = 0;
-    bool rdma_odp = false;
 #endif

    std::vector<int> read_ready_clients;
@@ -198,9 +197,7 @@ protected:
    void handle_reply_ready(osd_op_t *op);

 #ifdef WITH_RDMA
-    void try_send_rdma(osd_client_t *cl);
-    void try_send_rdma_odp(osd_client_t *cl);
-    void try_send_rdma_nodp(osd_client_t *cl);
+    bool try_send_rdma(osd_client_t *cl);
    bool try_recv_rdma(osd_client_t *cl);
    void handle_rdma_events();
 #endif
--- a/src/msgr_rdma.cpp
+++ b/src/msgr_rdma.cpp
@@ -47,29 +47,11 @@ msgr_rdma_connection_t::~msgr_rdma_connection_t()
    if (qp)
        ibv_destroy_qp(qp);
    if (recv_buffers.size())
-    {
        for (auto b: recv_buffers)
-        {
-            if (b.mr)
-                ibv_dereg_mr(b.mr);
-            free(b.buf);
-        }
-        recv_buffers.clear();
-    }
-    if (send_out.mr)
-    {
-        ibv_dereg_mr(send_out.mr);
-        send_out.mr = NULL;
-    }
-    if (send_out.buf)
-    {
-        free(send_out.buf);
-        send_out.buf = NULL;
-    }
-    send_out_size = 0;
+            free(b);
 }

-msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level)
+msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, int log_level)
 {
    int res;
    ibv_device **dev_list = NULL;
@@ -154,27 +136,21 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
            fprintf(stderr, "Couldn't query RDMA device for its features\n");
            goto cleanup;
        }
-        ctx->odp = odp;
-        if (ctx->odp &&
-            (!(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT) ||
+        if (!(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT) ||
            !(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT_IMPLICIT) ||
            !(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_SEND) ||
-            !(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_RECV)))
+            !(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_RECV))
        {
-            ctx->odp = false;
-            if (log_level > 0)
-                fprintf(stderr, "The RDMA device isn't implicit ODP (On-Demand Paging) capable, disabling it\n");
+            fprintf(stderr, "The RDMA device isn't implicit ODP (On-Demand Paging) capable or does not support RC send and receive with ODP\n");
+            goto cleanup;
        }
    }

-    if (ctx->odp)
+    ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
+    if (!ctx->mr)
    {
-        ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
-        if (!ctx->mr)
-        {
-            fprintf(stderr, "Couldn't register RDMA memory region\n");
-            goto cleanup;
-        }
+        fprintf(stderr, "Couldn't register RDMA memory region\n");
+        goto cleanup;
    }

    ctx->channel = ibv_create_comp_channel(ctx->context);
@@ -389,34 +365,12 @@ static void try_send_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
    cl->rdma_conn->cur_send++;
 }

-static int try_send_rdma_copy(osd_client_t *cl, uint8_t *dst, int dst_len)
-{
-    auto rc = cl->rdma_conn;
-    int total_dst_len = dst_len;
-    while (dst_len > 0 && rc->send_pos < cl->send_list.size())
-    {
-        iovec & iov = cl->send_list[rc->send_pos];
-        uint32_t len = (uint32_t)(iov.iov_len-rc->send_buf_pos < dst_len
-            ? iov.iov_len-rc->send_buf_pos : dst_len);
-        memcpy(dst, iov.iov_base+rc->send_buf_pos, len);
-        dst += len;
-        dst_len -= len;
-        rc->send_buf_pos += len;
-        if (rc->send_buf_pos >= iov.iov_len)
-        {
-            rc->send_pos++;
-            rc->send_buf_pos = 0;
-        }
-    }
-    return total_dst_len-dst_len;
-}
-
-void osd_messenger_t::try_send_rdma_odp(osd_client_t *cl)
+bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
 {
    auto rc = cl->rdma_conn;
    if (!cl->send_list.size() || rc->cur_send >= rc->max_send)
    {
-        return;
+        return true;
    }
    uint64_t op_size = 0, op_sge = 0;
    ibv_sge sge[rc->max_sge];
@@ -454,70 +408,15 @@ void osd_messenger_t::try_send_rdma_odp(osd_client_t *cl)
        rc->send_sizes.push_back(op_size);
        try_send_rdma_wr(cl, sge, op_sge);
    }
+    return true;
 }

-void osd_messenger_t::try_send_rdma_nodp(osd_client_t *cl)
-{
-    auto rc = cl->rdma_conn;
-    if (!rc->send_out_size)
-    {
-        // Allocate send ring buffer, if not yet
-        rc->send_out_size = rc->max_msg*rdma_max_send;
-        rc->send_out.buf = malloc_or_die(rc->send_out_size);
-        if (!rdma_context->odp)
-        {
-            rc->send_out.mr = ibv_reg_mr(rdma_context->pd, rc->send_out.buf, rc->send_out_size, 0);
-            if (!rc->send_out.mr)
-            {
-                fprintf(stderr, "Failed to register RDMA memory region: %s\n", strerror(errno));
-                exit(1);
-            }
-        }
-    }
-    // Copy data into the buffer and send it
-    uint8_t *dst = NULL;
-    int dst_len = 0;
-    int copied = 1;
-    while (!rc->send_out_full && copied > 0 && rc->cur_send < rc->max_send)
-    {
-        dst = (uint8_t*)rc->send_out.buf + rc->send_out_pos;
-        dst_len = (rc->send_out_pos < rc->send_out_size ? rc->send_out_size-rc->send_out_pos : rc->send_done_pos-rc->send_out_pos);
-        if (dst_len > rc->max_msg)
-            dst_len = rc->max_msg;
-        copied = try_send_rdma_copy(cl, dst, dst_len);
-        if (copied > 0)
-        {
-            rc->send_out_pos += copied;
-            if (rc->send_out_pos == rc->send_out_size)
-                rc->send_out_pos = 0;
-            assert(rc->send_out_pos < rc->send_out_size);
-            if (rc->send_out_pos >= rc->send_done_pos)
-                rc->send_out_full = true;
-            ibv_sge sge = {
-                .addr = (uintptr_t)dst,
-                .length = (uint32_t)copied,
-                .lkey = rdma_context->odp ? rdma_context->mr->lkey : rc->send_out.mr->lkey,
-            };
-            try_send_rdma_wr(cl, &sge, 1);
-            rc->send_sizes.push_back(copied);
-        }
-    }
-}
-
-void osd_messenger_t::try_send_rdma(osd_client_t *cl)
-{
-    if (rdma_context->odp)
-        try_send_rdma_odp(cl);
-    else
-        try_send_rdma_nodp(cl);
-}
-
-static void try_recv_rdma_wr(osd_client_t *cl, msgr_rdma_buf_t b)
+static void try_recv_rdma_wr(osd_client_t *cl, void *buf)
 {
    ibv_sge sge = {
-        .addr = (uintptr_t)b.buf,
+        .addr = (uintptr_t)buf,
        .length = (uint32_t)cl->rdma_conn->max_msg,
-        .lkey = cl->rdma_conn->ctx->odp ? cl->rdma_conn->ctx->mr->lkey : b.mr->lkey,
+        .lkey = cl->rdma_conn->ctx->mr->lkey,
    };
    ibv_recv_wr *bad_wr = NULL;
    ibv_recv_wr wr = {
@@ -539,19 +438,9 @@ bool osd_messenger_t::try_recv_rdma(osd_client_t *cl)
    auto rc = cl->rdma_conn;
    while (rc->cur_recv < rc->max_recv)
    {
-        msgr_rdma_buf_t b;
-        b.buf = malloc_or_die(rc->max_msg);
-        if (!rdma_context->odp)
-        {
-            b.mr = ibv_reg_mr(rdma_context->pd, b.buf, rc->max_msg, IBV_ACCESS_LOCAL_WRITE);
-            if (!b.mr)
-            {
-                fprintf(stderr, "Failed to register RDMA memory region: %s\n", strerror(errno));
-                exit(1);
-            }
-        }
-        rc->recv_buffers.push_back(b);
-        try_recv_rdma_wr(cl, b);
+        void *buf = malloc_or_die(rc->max_msg);
+        rc->recv_buffers.push_back(buf);
+        try_recv_rdma_wr(cl, buf);
    }
    return true;
 }
@@ -603,7 +492,7 @@ void osd_messenger_t::handle_rdma_events()
            if (!is_send)
            {
                rc->cur_recv--;
-                if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf].buf, wc[i].byte_len))
+                if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf], wc[i].byte_len))
                {
                    // handle_read_buffer may stop the client
                    continue;
@@ -616,14 +505,6 @@ void osd_messenger_t::handle_rdma_events()
                rc->cur_send--;
                uint64_t sent_size = rc->send_sizes.at(0);
                rc->send_sizes.erase(rc->send_sizes.begin(), rc->send_sizes.begin()+1);
-                if (!rdma_context->odp)
-                {
-                    rc->send_done_pos += sent_size;
-                    rc->send_out_full = false;
-                    if (rc->send_done_pos == rc->send_out_size)
-                        rc->send_done_pos = 0;
-                    assert(rc->send_done_pos < rc->send_out_size);
-                }
                int send_pos = 0, send_buf_pos = 0;
                while (sent_size > 0)
                {
--- a/src/msgr_rdma.h
+++ b/src/msgr_rdma.h
@@ -23,7 +23,6 @@ struct msgr_rdma_context_t
    ibv_device *dev = NULL;
    ibv_device_attr_ex attrx;
    ibv_pd *pd = NULL;
-    bool odp = false;
    ibv_mr *mr = NULL;
    ibv_comp_channel *channel = NULL;
    ibv_cq *cq = NULL;
@@ -36,16 +35,10 @@ struct msgr_rdma_context_t
    int max_cqe = 0;
    int used_max_cqe = 0;

-    static msgr_rdma_context_t *create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level);
+    static msgr_rdma_context_t *create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, int log_level);
    ~msgr_rdma_context_t();
 };

-struct msgr_rdma_buf_t
-{
-    void *buf = NULL;
-    ibv_mr *mr = NULL;
-};
-
 struct msgr_rdma_connection_t
 {
    msgr_rdma_context_t *ctx = NULL;
@@ -57,11 +50,8 @@ struct msgr_rdma_connection_t

    int send_pos = 0, send_buf_pos = 0;
    int next_recv_buf = 0;
-    std::vector<msgr_rdma_buf_t> recv_buffers;
+    std::vector<void*> recv_buffers;
    std::vector<uint64_t> send_sizes;
-    msgr_rdma_buf_t send_out;
-    int send_out_pos = 0, send_done_pos = 0, send_out_size = 0;
-    bool send_out_full = false;

    ~msgr_rdma_connection_t();
    static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge, uint32_t max_msg);
--- a/src/msgr_send.cpp
+++ b/src/msgr_send.cpp
@@ -3,7 +3,6 @@

 #define _XOPEN_SOURCE
 #include <limits.h>
-#include <sys/epoll.h>

 #include "messenger.h"

@@ -120,9 +119,9 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
            try_send(cl);
        }
    }
-    else
+    else if (cl->write_msg.msg_iovlen > 0 || !try_send(cl))
    {
-        if ((cl->write_msg.msg_iovlen > 0 || !try_send(cl)) && (cl->write_state == 0))
+        if (cl->write_state == 0)
        {
            cl->write_state = CL_WRITE_READY;
            write_ready_clients.push_back(cur_op->peer_fd);
@@ -284,14 +283,7 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
                fprintf(stderr, "Successfully connected with client %d using RDMA\n", cl->peer_fd);
            }
            cl->peer_state = PEER_RDMA;
-            tfd->set_fd_handler(cl->peer_fd, false, [this](int peer_fd, int epoll_events)
-            {
-                // Do not miss the disconnection!
-                if (epoll_events & EPOLLRDHUP)
-                {
-                    handle_peer_epoll(peer_fd, epoll_events);
-                }
-            });
+            tfd->set_fd_handler(cl->peer_fd, false, NULL);
            // Add the initial receive request
            try_recv_rdma(cl);
        }
--- a/src/nbd_proxy.cpp
+++ b/src/nbd_proxy.cpp
@@ -30,7 +30,7 @@ protected:
    std::string image_name;
    uint64_t inode = 0;
    uint64_t device_size = 0;
-    int nbd_timeout = 300;
+    int nbd_timeout = 30;
    int nbd_max_devices = 64;
    int nbd_max_part = 3;
    inode_watch_t *watch = NULL;
@@ -135,16 +135,14 @@ public:
            "  %s unmap /dev/nbd0\n"
            "  %s ls [--json]\n"
            "OPTIONS:\n"
-            "  All usual Vitastor config options like --config_file <path_to_config> plus NBD-specific:\n"
-            "  --nbd_timeout 300\n"
+            "  All usual Vitastor config options like --etcd_address <etcd_address> plus NBD-specific:\n"
+            "  --nbd_timeout 30\n"
            "    Timeout for I/O operations in seconds after exceeding which the kernel stops\n"
            "    the device. You can set it to 0 to disable the timeout, but beware that you\n"
            "    won't be able to stop the device at all if vitastor-nbd process dies.\n"
            "  --nbd_max_devices 64 --nbd_max_part 3\n"
            "    Options for the \"nbd\" kernel module when modprobing it (nbds_max and max_part).\n"
            "    note that maximum allowed (nbds_max)*(1+max_part) is 256.\n"
-            "    Note that nbd_timeout, nbd_max_devices and nbd_max_part options may also be specified\n"
-            "    in /etc/vitastor/vitastor.conf or in other configuration file specified with --config_file.\n"
            "  --logfile /path/to/log/file.txt\n"
            "    Wite log messages to the specified file instead of dropping them (in background mode)\n"
            "    or printing them to the standard output (in foreground mode).\n"
@@ -206,29 +204,20 @@ public:
                exit(1);
            }
        }
-        auto file_config = osd_messenger_t::read_config(cfg);
-        if (file_config["nbd_max_devices"].is_number() || file_config["nbd_max_devices"].is_string())
+        if (cfg["nbd_max_devices"].is_number() || cfg["nbd_max_devices"].is_string())
        {
-            nbd_max_devices = file_config["nbd_max_devices"].uint64_value();
+            nbd_max_devices = cfg["nbd_max_devices"].uint64_value();
        }
-        if (file_config["nbd_max_part"].is_number() || file_config["nbd_max_part"].is_string())
+        if (cfg["nbd_max_part"].is_number() || cfg["nbd_max_part"].is_string())
        {
-            nbd_max_part = file_config["nbd_max_part"].uint64_value();
+            nbd_max_part = cfg["nbd_max_part"].uint64_value();
        }
-        if (file_config["nbd_timeout"].is_number() || file_config["nbd_timeout"].is_string())
+        if (cfg["nbd_timeout"].is_number() || cfg["nbd_timeout"].is_string())
        {
-            nbd_timeout = file_config["nbd_timeout"].uint64_value();
-        }
-        if (cfg["client_writeback_allowed"].is_null())
-        {
-            // NBD is always aware of fsync, so we allow write-back cache
-            // by default if it's enabled
-            auto obj = cfg.object_items();
-            obj["client_writeback_allowed"] = true;
-            cfg = obj;
+            nbd_timeout = cfg["nbd_timeout"].uint64_value();
        }
        // Create client
-        ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
+        ringloop = new ring_loop_t(512);
        epmgr = new epoll_manager_t(ringloop);
        cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
        if (!inode)
@@ -275,7 +264,7 @@ public:
            int i = 0;
            while (true)
            {
-                int r = run_nbd(sockfd, i, device_size, NBD_FLAG_SEND_FLUSH, nbd_timeout, bg);
+                int r = run_nbd(sockfd, i, device_size, NBD_FLAG_SEND_FLUSH, 30, bg);
                if (r == 0)
                {
                    printf("/dev/nbd%d\n", i);
--- a/src/nfs_proxy.cpp
+++ b/src/nfs_proxy.cpp
@@ -56,7 +56,7 @@ json11::Json::object nfs_proxy_t::parse_args(int narg, const char *args[])
                "(c) Vitaliy Filippov, 2021-2022 (VNPL-1.1)\n"
                "\n"
                "USAGE:\n"
-                "  %s [STANDARD OPTIONS] [OTHER OPTIONS]\n"
+                "  %s [--etcd_address ADDR] [OTHER OPTIONS]\n"
                "  --subdir <DIR>    export images prefixed <DIR>/ (default empty - export all images)\n"
                "  --portmap 0       do not listen on port 111 (portmap/rpcbind, requires root)\n"
                "  --bind <IP>       bind service to <IP> address (default 0.0.0.0)\n"
@@ -65,9 +65,8 @@ json11::Json::object nfs_proxy_t::parse_args(int narg, const char *args[])
                "  --pool <POOL>     use <POOL> as default pool for new files (images)\n"
                "  --foreground 1    stay in foreground, do not daemonize\n"
                "\n"
-                "NFS proxy is stateless if you use immediate_commit=all in your cluster and if\n"
-                "you do not use client_enable_writeback=true, so you can freely use multiple\n"
-                "NFS proxies with L3 load balancing in this case.\n"
+                "NFS proxy is stateless if you use immediate_commit=all in your cluster, so\n"
+                "you can freely use multiple NFS proxies with L3 load balancing in this case.\n"
                "\n"
                "Example start and mount commands for a custom NFS port:\n"
                "  %s --etcd_address 192.168.5.10:2379 --portmap 0 --port 2050 --pool testpool\n"
@@ -115,16 +114,8 @@ void nfs_proxy_t::run(json11::Json cfg)
        if (name_prefix.size())
            name_prefix += "/";
    }
-    if (cfg["client_writeback_allowed"].is_null())
-    {
-        // NFS is always aware of fsync, so we allow write-back cache
-        // by default if it's enabled
-        auto obj = cfg.object_items();
-        obj["client_writeback_allowed"] = true;
-        cfg = obj;
-    }
    // Create client
-    ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
+    ringloop = new ring_loop_t(512);
    epmgr = new epoll_manager_t(ringloop);
    cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
    cmd = new cli_tool_t();
--- a/src/osd.cpp
+++ b/src/osd.cpp
@@ -160,9 +160,6 @@ void osd_t::parse_config(bool init)
        etcd_report_interval = config["etcd_report_interval"].uint64_value();
        if (etcd_report_interval <= 0)
            etcd_report_interval = 5;
-        etcd_stats_interval = config["etcd_stats_interval"].uint64_value();
-        if (etcd_stats_interval <= 0)
-            etcd_stats_interval = 30;
        readonly = json_is_true(config["readonly"]);
        run_primary = !json_is_false(config["run_primary"]);
        allow_test_ops = json_is_true(config["allow_test_ops"]);
@@ -187,6 +184,14 @@ void osd_t::parse_config(bool init)
        // Allow to set it to 0
        autosync_writes = config["autosync_writes"].uint64_value();
    }
+    if (!config["fsync_feedback_repeat_interval"].is_null())
+    {
+        fsync_feedback_repeat_interval = config["fsync_feedback_repeat_interval"].uint64_value();
+    }
+    if (!fsync_feedback_repeat_interval)
+    {
+        fsync_feedback_repeat_interval = 500; // ms
+    }
    if (!config["client_queue_depth"].is_null())
    {
        client_queue_depth = config["client_queue_depth"].uint64_value();
@@ -541,15 +546,11 @@ void osd_t::print_slow()
                }
                else if (op->req.hdr.opcode == OSD_OP_SEC_STABILIZE || op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK)
                {
-                    for (uint64_t i = 0; i < op->req.sec_stab.len && i < sizeof(obj_ver_id)*12; i += sizeof(obj_ver_id))
+                    for (uint64_t i = 0; i < op->req.sec_stab.len; i += sizeof(obj_ver_id))
                    {
                        obj_ver_id *ov = (obj_ver_id*)((uint8_t*)op->buf + i);
                        bufprintf(i == 0 ? " %lx:%lx v%lu" : ", %lx:%lx v%lu", ov->oid.inode, ov->oid.stripe, ov->version);
                    }
-                    if (op->req.sec_stab.len > sizeof(obj_ver_id)*12)
-                    {
-                        bufprintf(", ... (%lu items)", op->req.sec_stab.len/sizeof(obj_ver_id));
-                    }
                }
                else if (op->req.hdr.opcode == OSD_OP_SEC_LIST)
                {
--- a/src/osd.h
+++ b/src/osd.h
@@ -93,7 +93,6 @@ class osd_t

    json11::Json::object cli_config, file_config, etcd_global_config, etcd_osd_config, config;
    int etcd_report_interval = 5;
-    int etcd_stats_interval = 30;

    bool readonly = false;
    osd_num_t osd_num = 1; // OSD numbers start with 1
@@ -123,6 +122,7 @@ class osd_t
    uint32_t scrub_list_limit = 1000;
    bool scrub_find_best = true;
    uint64_t scrub_ec_max_bruteforce = 100;
+    uint64_t fsync_feedback_repeat_interval = 500;

    // cluster state

@@ -167,6 +167,8 @@ class osd_t
    uint64_t unstable_write_count = 0;
    std::map<osd_object_id_t, uint64_t> unstable_writes;
    std::deque<osd_op_t*> syncs_in_progress;
+    std::map<int, timespec> unstable_write_osds;
+    int fsync_feedback_timer_id = -1;

    // client & peer I/O

@@ -258,6 +260,7 @@ class osd_t
    void exec_show_config(osd_op_t *cur_op);
    void exec_secondary(osd_op_t *cur_op);
    void secondary_op_callback(osd_op_t *cur_op);
+    void fsync_feedback();

    // primary ops
    void autosync();
--- a/src/osd_cluster.cpp
+++ b/src/osd_cluster.cpp
@@ -429,18 +429,14 @@ void osd_t::acquire_lease()
        create_osd_state();
    });
    printf(
-        "[OSD %lu] reporting to etcd at %s every %d seconds (statistics every %d seconds)\n", this->osd_num,
+        "[OSD %lu] reporting to etcd at %s every %d seconds\n", this->osd_num,
        (config["etcd_address"].is_string() ? config["etcd_address"].string_value() : config["etcd_address"].dump()).c_str(),
-        etcd_report_interval, etcd_stats_interval
+        etcd_report_interval
    );
    tfd->set_timer(etcd_report_interval*1000, true, [this](int timer_id)
    {
        renew_lease(false);
    });
-    tfd->set_timer(etcd_stats_interval*1000, true, [this](int timer_id)
-    {
-        report_statistics();
-    });
 }

 // Report "up" state once, then keep it alive using the lease
@@ -545,6 +541,7 @@ void osd_t::renew_lease(bool reload)
        else
        {
            etcd_failed_attempts = 0;
+            report_statistics();
            // Reload PGs
            if (reload && run_primary)
            {
--- a/Show More
+++ b/Show More