forked from vitalif/vitastor

Compare commits: hotfix-1.0 ... csi-ext4-n (106 commits)
CMakeLists.txt:

@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
 
 project(vitastor)
 
-set(VERSION "1.0.0")
+set(VERSION "1.3.1")
 
 add_subdirectory(src)
README-ru.md:

@@ -50,6 +50,7 @@ Vitastor поддерживает QEMU-драйвер, протоколы NBD и
 - Параметры
   - [Общие](docs/config/common.ru.md)
   - [Сетевые](docs/config/network.ru.md)
+  - [Клиентский код](docs/config/client.en.md)
   - [Глобальные дисковые параметры](docs/config/layout-cluster.ru.md)
   - [Дисковые параметры OSD](docs/config/layout-osd.ru.md)
   - [Прочие параметры OSD](docs/config/osd.ru.md)
README.md:

@@ -50,6 +50,7 @@ Read more details below in the documentation.
 - Parameter Reference
   - [Common](docs/config/common.en.md)
   - [Network](docs/config/network.en.md)
+  - [Client](docs/config/client.en.md)
   - [Global Disk Layout](docs/config/layout-cluster.en.md)
   - [OSD Disk Layout](docs/config/layout-osd.en.md)
   - [OSD Runtime Parameters](docs/config/osd.en.md)
csi/Dockerfile:

@@ -1,14 +1,15 @@
 # Compile stage
-FROM golang:buster AS build
+FROM golang:bookworm AS build
 
 ADD go.sum go.mod /app/
 RUN cd /app; CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go mod download -x
 ADD . /app
-RUN perl -i -e '$/ = undef; while(<>) { s/\n\s*(\{\s*\n)/$1\n/g; s/\}(\s*\n\s*)else\b/$1} else/g; print; }' `find /app -name '*.go'`
-RUN cd /app; CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o vitastor-csi
+RUN perl -i -e '$/ = undef; while(<>) { s/\n\s*(\{\s*\n)/$1\n/g; s/\}(\s*\n\s*)else\b/$1} else/g; print; }' `find /app -name '*.go'` && \
+    cd /app && \
+    CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o vitastor-csi
 
 # Final stage
-FROM debian:buster
+FROM debian:bookworm
 
 LABEL maintainers="Vitaliy Filippov <vitalif@yourcmc.ru>"
 LABEL description="Vitastor CSI Driver"

@@ -18,19 +19,30 @@ ENV CSI_ENDPOINT=""
 
 RUN apt-get update && \
     apt-get install -y wget && \
-    (echo deb http://deb.debian.org/debian buster-backports main > /etc/apt/sources.list.d/backports.list) && \
     (echo "APT::Install-Recommends false;" > /etc/apt/apt.conf) && \
     apt-get update && \
-    apt-get install -y e2fsprogs xfsprogs kmod && \
+    apt-get install -y e2fsprogs xfsprogs kmod iproute2 \
+    # dependencies of qemu-storage-daemon
+    libnuma1 liburing2 libglib2.0-0 libfuse3-3 libaio1 libzstd1 libnettle8 \
+    libgmp10 libhogweed6 libp11-kit0 libidn2-0 libunistring2 libtasn1-6 libpcre2-8-0 libffi8 && \
     apt-get clean && \
    (echo options nbd nbds_max=128 > /etc/modprobe.d/nbd.conf)
 
 COPY --from=build /app/vitastor-csi /bin/
 
-RUN (echo deb http://vitastor.io/debian buster main > /etc/apt/sources.list.d/vitastor.list) && \
+RUN (echo deb http://vitastor.io/debian bookworm main > /etc/apt/sources.list.d/vitastor.list) && \
+    ((echo 'Package: *'; echo 'Pin: origin "vitastor.io"'; echo 'Pin-Priority: 1000') > /etc/apt/preferences.d/vitastor.pref) && \
     wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg && \
     apt-get update && \
     apt-get install -y vitastor-client && \
+    wget https://vitastor.io/archive/qemu/qemu-bookworm-8.1.2%2Bds-1%2Bvitastor1/qemu-utils_8.1.2%2Bds-1%2Bvitastor1_amd64.deb && \
+    wget https://vitastor.io/archive/qemu/qemu-bookworm-8.1.2%2Bds-1%2Bvitastor1/qemu-block-extra_8.1.2%2Bds-1%2Bvitastor1_amd64.deb && \
+    dpkg -x qemu-utils*.deb tmp1 && \
+    dpkg -x qemu-block-extra*.deb tmp1 && \
+    cp -a tmp1/usr/bin/qemu-storage-daemon /usr/bin/ && \
+    mkdir -p /usr/lib/x86_64-linux-gnu/qemu && \
+    cp -a tmp1/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so /usr/lib/x86_64-linux-gnu/qemu/ && \
+    rm -rf tmp1 *.deb && \
     apt-get clean
 
 ENTRYPOINT ["/bin/vitastor-csi"]
csi/Makefile:

@@ -1,4 +1,4 @@
-VERSION ?= v1.0.0
+VERSION ?= v1.3.1
 
 all: build push
 
CSI ConfigMap manifest (csi/deploy):

@@ -2,6 +2,7 @@
 apiVersion: v1
 kind: ConfigMap
 data:
+  # You can add multiple configuration files here to use a multi-cluster setup
   vitastor.conf: |-
     {"etcd_address":"http://192.168.7.2:2379","etcd_prefix":"/vitastor"}
 metadata:
CSI node plugin DaemonSet manifest (csi/deploy):

@@ -49,7 +49,7 @@ spec:
             capabilities:
               add: ["SYS_ADMIN"]
             allowPrivilegeEscalation: true
-          image: vitalif/vitastor-csi:v1.0.0
+          image: vitalif/vitastor-csi:v1.3.1
           args:
             - "--node=$(NODE_ID)"
             - "--endpoint=$(CSI_ENDPOINT)"

@@ -82,6 +82,8 @@ spec:
               name: host-sys
             - mountPath: /run/mount
               name: host-mount
+            - mountPath: /run/vitastor-csi
+              name: run-vitastor-csi
             - mountPath: /lib/modules
               name: lib-modules
               readOnly: true

@@ -132,6 +134,9 @@ spec:
         - name: host-mount
           hostPath:
             path: /run/mount
+        - name: run-vitastor-csi
+          hostPath:
+            path: /run/vitastor-csi
         - name: lib-modules
           hostPath:
             path: /lib/modules
CSI provisioner RBAC manifest (csi/deploy):

@@ -35,10 +35,13 @@ rules:
     verbs: ["get", "list", "watch"]
   - apiGroups: ["snapshot.storage.k8s.io"]
     resources: ["volumesnapshots"]
-    verbs: ["get", "list"]
+    verbs: ["get", "list", "patch"]
+  - apiGroups: ["snapshot.storage.k8s.io"]
+    resources: ["volumesnapshots/status"]
+    verbs: ["get", "list", "patch"]
   - apiGroups: ["snapshot.storage.k8s.io"]
     resources: ["volumesnapshotcontents"]
-    verbs: ["create", "get", "list", "watch", "update", "delete"]
+    verbs: ["create", "get", "list", "watch", "update", "delete", "patch"]
   - apiGroups: ["snapshot.storage.k8s.io"]
     resources: ["volumesnapshotclasses"]
     verbs: ["get", "list", "watch"]

@@ -53,7 +56,7 @@ rules:
     verbs: ["get", "list", "watch"]
   - apiGroups: ["snapshot.storage.k8s.io"]
     resources: ["volumesnapshotcontents/status"]
-    verbs: ["update"]
+    verbs: ["update", "patch"]
   - apiGroups: [""]
     resources: ["configmaps"]
     verbs: ["get"]
CSI provisioner Deployment manifest (csi/deploy):

@@ -23,6 +23,11 @@ metadata:
   name: csi-vitastor-provisioner
 spec:
   replicas: 3
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxUnavailable: 1
+      maxSurge: 0
   selector:
     matchLabels:
       app: csi-vitastor-provisioner

@@ -46,7 +51,7 @@ spec:
       priorityClassName: system-cluster-critical
       containers:
         - name: csi-provisioner
-          image: k8s.gcr.io/sig-storage/csi-provisioner:v2.2.0
+          image: k8s.gcr.io/sig-storage/csi-provisioner:v3.0.0
           args:
             - "--csi-address=$(ADDRESS)"
             - "--v=5"

@@ -116,7 +121,7 @@ spec:
             privileged: true
             capabilities:
               add: ["SYS_ADMIN"]
-          image: vitalif/vitastor-csi:v1.0.0
+          image: vitalif/vitastor-csi:v1.3.1
           args:
             - "--node=$(NODE_ID)"
             - "--endpoint=$(CSI_ENDPOINT)"
CSI StorageClass manifest (csi/deploy):

@@ -12,8 +12,6 @@ parameters:
   etcdVolumePrefix: ""
   poolId: "1"
   # you can choose other configuration file if you have it in the config map
+  # different etcd URLs and prefixes should also be put in the config
   #configPath: "/etc/vitastor/vitastor.conf"
-  # you can also specify etcdUrl here, maybe to connect to another Vitastor cluster
-  # multiple etcdUrls may be specified, delimited by comma
-  #etcdUrl: "http://192.168.7.2:2379"
-  #etcdPrefix: "/vitastor"
+allowVolumeExpansion: true
csi/deploy/example-snapshot-class.yaml (new file, 7 lines):

@@ -0,0 +1,7 @@
+apiVersion: snapshot.storage.k8s.io/v1
+kind: VolumeSnapshotClass
+metadata:
+  name: vitastor-snapclass
+driver: csi.vitastor.io
+deletionPolicy: Delete
+parameters:
csi/deploy/example-snapshot-clone.yaml (new file, 16 lines):

@@ -0,0 +1,16 @@
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: test-vitastor-clone
+spec:
+  storageClassName: vitastor
+  dataSource:
+    name: snap1
+    kind: VolumeSnapshot
+    apiGroup: snapshot.storage.k8s.io
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 10Gi
csi/deploy/example-snapshot.yaml (new file, 8 lines):

@@ -0,0 +1,8 @@
+apiVersion: snapshot.storage.k8s.io/v1
+kind: VolumeSnapshot
+metadata:
+  name: snap1
+spec:
+  volumeSnapshotClassName: vitastor-snapclass
+  source:
+    persistentVolumeClaimName: test-vitastor-pvc
csi/go.mod:

@@ -9,6 +9,7 @@ require (
     golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb
     golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
     google.golang.org/grpc v1.33.1
+    google.golang.org/protobuf v1.24.0
     k8s.io/klog v1.0.0
     k8s.io/utils v0.0.0-20210305010621-2afb4311ab10
 )
Driver version constants (csi/src):

@@ -5,7 +5,7 @@ package vitastor
 
 const (
     vitastorCSIDriverName = "csi.vitastor.io"
-    vitastorCSIDriverVersion = "1.0.0"
+    vitastorCSIDriverVersion = "1.3.1"
 )
 
 // Config struct fills the parameters of request or user input
csi/src/controllerserver.go:

@@ -20,6 +20,7 @@ import (
 
     "google.golang.org/grpc/codes"
     "google.golang.org/grpc/status"
+    "google.golang.org/protobuf/types/known/timestamppb"
 
     "github.com/container-storage-interface/spec/lib/go/csi"
 )

@@ -45,6 +46,7 @@ type InodeConfig struct
     ParentPool uint64 `json:"parent_pool,omitempty"`
     ParentId uint64 `json:"parent_id,omitempty"`
     Readonly bool `json:"readonly,omitempty"`
+    CreateTs uint64 `json:"create_ts,omitempty"`
 }
 
 type ControllerServer struct

@@ -60,7 +62,7 @@ func NewControllerServer(driver *Driver) *ControllerServer
     }
 }
 
-func GetConnectionParams(params map[string]string) (map[string]string, []string, string)
+func GetConnectionParams(params map[string]string) (map[string]string, error)
 {
     ctxVars := make(map[string]string)
     configPath := params["configPath"]
@@ -73,71 +75,69 @@ func GetConnectionParams(params map[string]string) (map[string]string, []string,
         ctxVars["configPath"] = configPath
     }
     config := make(map[string]interface{})
-    if configFD, err := os.Open(configPath); err == nil
+    configFD, err := os.Open(configPath)
+    if (err != nil)
     {
-        defer configFD.Close()
-        data, _ := ioutil.ReadAll(configFD)
-        json.Unmarshal(data, &config)
+        return nil, err
     }
-    // Try to load prefix & etcd URL from the config
+    defer configFD.Close()
+    data, _ := ioutil.ReadAll(configFD)
+    json.Unmarshal(data, &config)
+    // Check etcd URL in the config, but do not use the explicit etcdUrl
+    // parameter for CLI calls, otherwise users won't be able to later
+    // change them - storage class parameters are saved in volume IDs
     var etcdUrl []string
-    if (params["etcdUrl"] != "")
-    {
-        ctxVars["etcdUrl"] = params["etcdUrl"]
-        etcdUrl = strings.Split(params["etcdUrl"], ",")
-    }
-    if (len(etcdUrl) == 0)
-    {
-        switch config["etcd_address"].(type)
-        {
-        case string:
-            etcdUrl = strings.Split(config["etcd_address"].(string), ",")
-        case []string:
-            etcdUrl = config["etcd_address"].([]string)
-        }
-    }
-    etcdPrefix := params["etcdPrefix"]
-    if (etcdPrefix == "")
-    {
-        etcdPrefix, _ = config["etcd_prefix"].(string)
-        if (etcdPrefix == "")
-        {
-            etcdPrefix = "/vitastor"
-        }
-    }
-    else
-    {
-        ctxVars["etcdPrefix"] = etcdPrefix
-    }
-    return ctxVars, etcdUrl, etcdPrefix
+    switch config["etcd_address"].(type)
+    {
+    case string:
+        url := strings.TrimSpace(config["etcd_address"].(string))
+        if (url != "")
+        {
+            etcdUrl = strings.Split(url, ",")
+        }
+    case []string:
+        etcdUrl = config["etcd_address"].([]string)
+    case []interface{}:
+        for _, url := range config["etcd_address"].([]interface{})
+        {
+            s, ok := url.(string)
+            if (ok)
+            {
+                etcdUrl = append(etcdUrl, s)
+            }
+        }
+    }
+    if (len(etcdUrl) == 0)
+    {
+        return nil, status.Error(codes.InvalidArgument, "etcd_address is missing in "+configPath)
+    }
+    return ctxVars, nil
 }
 
+func system(program string, args ...string) ([]byte, []byte, error)
+{
+    klog.Infof("Running "+program+" "+strings.Join(args, " "))
+    c := exec.Command(program, args...)
+    var stdout, stderr bytes.Buffer
+    c.Stdout, c.Stderr = &stdout, &stderr
+    err := c.Run()
+    if (err != nil)
+    {
+        stdoutStr, stderrStr := string(stdout.Bytes()), string(stderr.Bytes())
+        klog.Errorf(program+" "+strings.Join(args, " ")+" failed: %s, status %s\n", stdoutStr+stderrStr, err)
+        return nil, nil, status.Error(codes.Internal, stdoutStr+stderrStr+" (status "+err.Error()+")")
+    }
+    return stdout.Bytes(), stderr.Bytes(), nil
+}
+
 func invokeCLI(ctxVars map[string]string, args []string) ([]byte, error)
 {
-    if (ctxVars["etcdUrl"] != "")
-    {
-        args = append(args, "--etcd_address", ctxVars["etcdUrl"])
-    }
-    if (ctxVars["etcdPrefix"] != "")
-    {
-        args = append(args, "--etcd_prefix", ctxVars["etcdPrefix"])
-    }
     if (ctxVars["configPath"] != "")
     {
         args = append(args, "--config_path", ctxVars["configPath"])
     }
-    c := exec.Command("/usr/bin/vitastor-cli", args...)
-    var stdout, stderr bytes.Buffer
-    c.Stdout = &stdout
-    c.Stderr = &stderr
-    err := c.Run()
-    stderrStr := string(stderr.Bytes())
-    if (err != nil)
-    {
-        klog.Errorf("vitastor-cli %s failed: %s, status %s\n", strings.Join(args, " "), stderrStr, err)
-        return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
-    }
-    return stdout.Bytes(), nil
+    stdout, _, err := system("/usr/bin/vitastor-cli", args...)
+    return stdout, err
 }
 
 // Create the volume
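The new []interface{} case above is worth a note. A standalone sketch (not from this repository) of why it is needed: encoding/json decodes a JSON array inside a map[string]interface{} as []interface{}, never as []string, so the pre-existing []string case could not match a plain config file.

package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	config := make(map[string]interface{})
	raw := `{"etcd_address":["http://10.0.0.1:2379","http://10.0.0.2:2379"]}`
	json.Unmarshal([]byte(raw), &config)
	switch v := config["etcd_address"].(type) {
	case []string:
		// Never reached: encoding/json does not produce []string here.
		fmt.Println("[]string:", v)
	case []interface{}:
		// This is the branch that actually fires for a JSON array.
		fmt.Println("[]interface{}:", v)
	}
}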
@@ -172,32 +172,48 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
         volSize = ((capRange.GetRequiredBytes() + MB - 1) / MB) * MB
     }
 
-    ctxVars, etcdUrl, _ := GetConnectionParams(req.Parameters)
-    if (len(etcdUrl) == 0)
-    {
-        return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
-    }
-
-    // Create image using vitastor-cli
-    _, err := invokeCLI(ctxVars, []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) })
-    if (err != nil)
-    {
-        if (strings.Index(err.Error(), "already exists") > 0)
-        {
-            stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", volName })
-            if (err != nil)
-            {
-                return nil, err
-            }
-            var inodeCfg []InodeConfig
-            err = json.Unmarshal(stat, &inodeCfg)
-            if (err != nil)
-            {
-                return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
-            }
-            if (len(inodeCfg) == 0)
-            {
-                return nil, status.Error(codes.Internal, "vitastor-cli create said that image already exists, but ls can't find it")
-            }
+    ctxVars, err := GetConnectionParams(req.Parameters)
+    if (err != nil)
+    {
+        return nil, err
+    }
+
+    args := []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) }
+
+    // Support creation from snapshot
+    var src *csi.VolumeContentSource
+    if (req.VolumeContentSource.GetSnapshot() != nil)
+    {
+        snapId := req.VolumeContentSource.GetSnapshot().GetSnapshotId()
+        if (snapId != "")
+        {
+            snapVars := make(map[string]string)
+            err := json.Unmarshal([]byte(snapId), &snapVars)
+            if (err != nil)
+            {
+                return nil, status.Error(codes.Internal, "volume ID not in JSON format")
+            }
+            args = append(args, "--parent", snapVars["name"]+"@"+snapVars["snapshot"])
+            src = &csi.VolumeContentSource{
+                Type: &csi.VolumeContentSource_Snapshot{
+                    Snapshot: &csi.VolumeContentSource_SnapshotSource{
+                        SnapshotId: snapId,
+                    },
+                },
+            }
+        }
+    }
+
+    // Create image using vitastor-cli
+    _, err = invokeCLI(ctxVars, args)
+    if (err != nil)
+    {
+        if (strings.Index(err.Error(), "already exists") > 0)
+        {
+            inodeCfg, err := invokeList(ctxVars, volName, true)
+            if (err != nil)
+            {
+                return nil, err
+            }
             if (inodeCfg[0].Size < uint64(volSize))
             {

@@ -217,6 +233,7 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
             // Ugly, but VolumeContext isn't passed to DeleteVolume :-(
             VolumeId: string(volumeIdJson),
             CapacityBytes: volSize,
+            ContentSource: src,
         },
     }, nil
 }
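As the "Ugly, but VolumeContext isn't passed to DeleteVolume" comment hints, the driver smuggles its connection context through the CSI volume ID itself. A minimal standalone sketch of that round-trip (the key names follow the diff; the values are made up):

package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	// CreateVolume: encode name + connection context into the volume ID.
	ctxVars := map[string]string{
		"name":       "pvc-example",                 // hypothetical volume name
		"configPath": "/etc/vitastor/vitastor.conf", // optional, as in the diff
	}
	volumeIdJson, _ := json.Marshal(ctxVars)

	// DeleteVolume: decode it back, since no VolumeContext is available there.
	volVars := make(map[string]string)
	if err := json.Unmarshal(volumeIdJson, &volVars); err != nil {
		panic("volume ID not in JSON format")
	}
	fmt.Println(volVars["name"]) // pvc-example
}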
@@ -230,15 +247,19 @@ func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVol
         return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
     }
 
-    ctxVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
+    volVars := make(map[string]string)
+    err := json.Unmarshal([]byte(req.VolumeId), &volVars)
     if (err != nil)
     {
         return nil, status.Error(codes.Internal, "volume ID not in JSON format")
     }
-    volName := ctxVars["name"]
+    volName := volVars["name"]
 
-    ctxVars, _, _ = GetConnectionParams(ctxVars)
+    ctxVars, err := GetConnectionParams(volVars)
+    if (err != nil)
+    {
+        return nil, err
+    }
 
     _, err = invokeCLI(ctxVars, []string{ "rm", volName })
     if (err != nil)

@@ -344,6 +365,8 @@ func (cs *ControllerServer) ControllerGetCapabilities(ctx context.Context, req *
         csi.ControllerServiceCapability_RPC_LIST_VOLUMES,
         csi.ControllerServiceCapability_RPC_EXPAND_VOLUME,
         csi.ControllerServiceCapability_RPC_CREATE_DELETE_SNAPSHOT,
+        csi.ControllerServiceCapability_RPC_LIST_SNAPSHOTS,
+        // TODO: csi.ControllerServiceCapability_RPC_CLONE_VOLUME,
     } {
         controllerServerCapabilities = append(controllerServerCapabilities, functionControllerServerCapabilities(capability))
     }
@@ -353,28 +376,226 @@ func (cs *ControllerServer) ControllerGetCapabilities(ctx context.Context, req *
     }, nil
 }
 
+func invokeList(ctxVars map[string]string, pattern string, expectExist bool) ([]InodeConfig, error)
+{
+    stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", pattern })
+    if (err != nil)
+    {
+        return nil, err
+    }
+    var inodeCfg []InodeConfig
+    err = json.Unmarshal(stat, &inodeCfg)
+    if (err != nil)
+    {
+        return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
+    }
+    if (expectExist && len(inodeCfg) == 0)
+    {
+        return nil, status.Error(codes.Internal, "Can't find expected image "+pattern+" via vitastor-cli ls")
+    }
+    return inodeCfg, nil
+}
+
 // CreateSnapshot create snapshot of an existing PV
 func (cs *ControllerServer) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequest) (*csi.CreateSnapshotResponse, error)
 {
-    return nil, status.Error(codes.Unimplemented, "")
+    klog.Infof("received controller create snapshot request %+v", protosanitizer.StripSecrets(req))
+    if (req == nil)
+    {
+        return nil, status.Errorf(codes.InvalidArgument, "request cannot be empty")
+    }
+    if (req.SourceVolumeId == "" || req.Name == "")
+    {
+        return nil, status.Error(codes.InvalidArgument, "source volume ID and snapshot name are required fields")
+    }
+
+    // snapshot name
+    snapName := req.Name
+
+    // req.VolumeId is an ugly json string in our case :)
+    ctxVars := make(map[string]string)
+    err := json.Unmarshal([]byte(req.SourceVolumeId), &ctxVars)
+    if (err != nil)
+    {
+        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
+    }
+    volName := ctxVars["name"]
+
+    // Create image using vitastor-cli
+    _, err = invokeCLI(ctxVars, []string{ "create", "--snapshot", snapName, volName })
+    if (err != nil && strings.Index(err.Error(), "already exists") <= 0)
+    {
+        return nil, err
+    }
+
+    // Check created snapshot
+    inodeCfg, err := invokeList(ctxVars, volName+"@"+snapName, true)
+    if (err != nil)
+    {
+        return nil, err
+    }
+
+    // Use ugly JSON snapshot ID again, DeleteSnapshot doesn't have context :-(
+    ctxVars["snapshot"] = snapName
+    snapIdJson, _ := json.Marshal(ctxVars)
+    return &csi.CreateSnapshotResponse{
+        Snapshot: &csi.Snapshot{
+            SizeBytes: int64(inodeCfg[0].Size),
+            SnapshotId: string(snapIdJson),
+            SourceVolumeId: req.SourceVolumeId,
+            CreationTime: &timestamppb.Timestamp{ Seconds: int64(inodeCfg[0].CreateTs) },
+            ReadyToUse: true,
+        },
+    }, nil
 }
 
 // DeleteSnapshot delete provided snapshot of a PV
 func (cs *ControllerServer) DeleteSnapshot(ctx context.Context, req *csi.DeleteSnapshotRequest) (*csi.DeleteSnapshotResponse, error)
 {
-    return nil, status.Error(codes.Unimplemented, "")
+    klog.Infof("received controller delete snapshot request %+v", protosanitizer.StripSecrets(req))
+    if (req == nil)
+    {
+        return nil, status.Errorf(codes.InvalidArgument, "request cannot be empty")
+    }
+    if (req.SnapshotId == "")
+    {
+        return nil, status.Error(codes.InvalidArgument, "snapshot ID is a required field")
+    }
+
+    volVars := make(map[string]string)
+    err := json.Unmarshal([]byte(req.SnapshotId), &volVars)
+    if (err != nil)
+    {
+        return nil, status.Error(codes.Internal, "snapshot ID not in JSON format")
+    }
+    volName := volVars["name"]
+    snapName := volVars["snapshot"]
+
+    ctxVars, err := GetConnectionParams(volVars)
+    if (err != nil)
+    {
+        return nil, err
+    }
+
+    _, err = invokeCLI(ctxVars, []string{ "rm", volName+"@"+snapName })
+    if (err != nil)
+    {
+        return nil, err
+    }
+
+    return &csi.DeleteSnapshotResponse{}, nil
 }
 
 // ListSnapshots list the snapshots of a PV
 func (cs *ControllerServer) ListSnapshots(ctx context.Context, req *csi.ListSnapshotsRequest) (*csi.ListSnapshotsResponse, error)
 {
-    return nil, status.Error(codes.Unimplemented, "")
+    klog.Infof("received controller list snapshots request %+v", protosanitizer.StripSecrets(req))
+    if (req == nil)
+    {
+        return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
+    }
+
+    volVars := make(map[string]string)
+    err := json.Unmarshal([]byte(req.SourceVolumeId), &volVars)
+    if (err != nil)
+    {
+        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
+    }
+    volName := volVars["name"]
+    ctxVars, err := GetConnectionParams(volVars)
+    if (err != nil)
+    {
+        return nil, err
+    }
+
+    inodeCfg, err := invokeList(ctxVars, volName+"@*", false)
+    if (err != nil)
+    {
+        return nil, err
+    }
+
+    resp := &csi.ListSnapshotsResponse{}
+    for _, ino := range inodeCfg
+    {
+        snapName := ino.Name[len(volName)+1:]
+        if (len(req.StartingToken) > 0 && snapName < req.StartingToken)
+        {
+        }
+        else if (req.MaxEntries == 0 || len(resp.Entries) < int(req.MaxEntries))
+        {
+            volVars["snapshot"] = snapName
+            snapIdJson, _ := json.Marshal(volVars)
+            resp.Entries = append(resp.Entries, &csi.ListSnapshotsResponse_Entry{
+                Snapshot: &csi.Snapshot{
+                    SizeBytes: int64(ino.Size),
+                    SnapshotId: string(snapIdJson),
+                    SourceVolumeId: req.SourceVolumeId,
+                    CreationTime: &timestamppb.Timestamp{ Seconds: int64(ino.CreateTs) },
+                    ReadyToUse: true,
+                },
+            })
+        }
+        else
+        {
+            resp.NextToken = snapName
+            break
+        }
+    }
+
+    return resp, nil
 }
 
-// ControllerExpandVolume resizes a volume
+// ControllerExpandVolume increases the size of a volume
 func (cs *ControllerServer) ControllerExpandVolume(ctx context.Context, req *csi.ControllerExpandVolumeRequest) (*csi.ControllerExpandVolumeResponse, error)
 {
-    return nil, status.Error(codes.Unimplemented, "")
+    klog.Infof("received controller expand volume request %+v", protosanitizer.StripSecrets(req))
+    if (req == nil)
+    {
+        return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
+    }
+    if (req.VolumeId == "" || req.CapacityRange == nil || req.CapacityRange.RequiredBytes == 0)
+    {
+        return nil, status.Error(codes.InvalidArgument, "VolumeId, CapacityRange and RequiredBytes are required fields")
+    }
+
+    volVars := make(map[string]string)
+    err := json.Unmarshal([]byte(req.VolumeId), &volVars)
+    if (err != nil)
+    {
+        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
+    }
+    volName := volVars["name"]
+    ctxVars, err := GetConnectionParams(volVars)
+    if (err != nil)
+    {
+        return nil, err
+    }
+
+    inodeCfg, err := invokeList(ctxVars, volName, true)
+    if (err != nil)
+    {
+        return nil, err
+    }
+
+    if (req.CapacityRange.RequiredBytes > 0 && inodeCfg[0].Size < uint64(req.CapacityRange.RequiredBytes))
+    {
+        sz := ((req.CapacityRange.RequiredBytes+4095)/4096)*4096
+        _, err := invokeCLI(ctxVars, []string{ "modify", "--inc_size", "1", "--resize", fmt.Sprintf("%d", sz), volName })
+        if (err != nil)
+        {
+            return nil, err
+        }
+        inodeCfg, err = invokeList(ctxVars, volName, true)
+        if (err != nil)
+        {
+            return nil, err
+        }
+    }
+
+    return &csi.ControllerExpandVolumeResponse{
+        CapacityBytes: int64(inodeCfg[0].Size),
+        NodeExpansionRequired: false,
+    }, nil
 }
 
 // ControllerGetVolume get volume info
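CreationTime in both CreateSnapshot and ListSnapshots is built from the new CreateTs field. A small standalone sketch of that conversion (assuming create_ts is a Unix timestamp in seconds, which is what the code implies):

package main

import (
	"fmt"

	"google.golang.org/protobuf/types/known/timestamppb"
)

func main() {
	// Assumed: value as reported by vitastor-cli ls --json in "create_ts".
	createTs := uint64(1700000000)
	ts := &timestamppb.Timestamp{Seconds: int64(createTs)}
	fmt.Println(ts.AsTime().UTC()) // 2023-11-14 22:13:20 +0000 UTC
}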
csi/src/identityserver.go:

@@ -49,6 +49,13 @@ func (is *IdentityServer) GetPluginCapabilities(ctx context.Context, req *csi.Ge
                 },
             },
         },
+        {
+            Type: &csi.PluginCapability_VolumeExpansion_{
+                VolumeExpansion: &csi.PluginCapability_VolumeExpansion{
+                    Type: csi.PluginCapability_VolumeExpansion_OFFLINE,
+                },
+            },
+        },
     }, nil
 }
csi/src/nodeserver.go:

@@ -5,11 +5,15 @@ package vitastor
 
 import (
     "context"
+    "errors"
+    "encoding/json"
+    "fmt"
     "os"
     "os/exec"
-    "encoding/json"
+    "path/filepath"
+    "strconv"
     "strings"
-    "bytes"
+    "syscall"
 
     "google.golang.org/grpc/codes"
     "google.golang.org/grpc/status"
@@ -25,16 +29,91 @@ import (
 type NodeServer struct
 {
     *Driver
+    useVduse bool
+    stateDir string
     mounter mount.Interface
 }
 
+type DeviceState struct
+{
+    ConfigPath string `json:"configPath"`
+    VdpaId string `json:"vdpaId"`
+    Image string `json:"image"`
+    Blockdev string `json:"blockdev"`
+    Readonly bool `json:"readonly"`
+    PidFile string `json:"pidFile"`
+}
+
 // NewNodeServer create new instance node
 func NewNodeServer(driver *Driver) *NodeServer
 {
-    return &NodeServer{
+    stateDir := os.Getenv("STATE_DIR")
+    if (stateDir == "")
+    {
+        stateDir = "/run/vitastor-csi"
+    }
+    if (stateDir[len(stateDir)-1] != '/')
+    {
+        stateDir += "/"
+    }
+    ns := &NodeServer{
         Driver: driver,
+        useVduse: checkVduseSupport(),
+        stateDir: stateDir,
         mounter: mount.New(""),
     }
+    if (ns.useVduse)
+    {
+        ns.restoreVduseDaemons()
+    }
+    return ns
+}
+
+func checkVduseSupport() bool
+{
+    // Check VDUSE support (vdpa, vduse, virtio-vdpa kernel modules)
+    vduse := true
+    for _, mod := range []string{"vdpa", "vduse", "virtio-vdpa"}
+    {
+        _, err := os.Stat("/sys/module/"+mod)
+        if (err != nil)
+        {
+            if (!errors.Is(err, os.ErrNotExist))
+            {
+                klog.Errorf("failed to check /sys/module/%s: %v", mod, err)
+            }
+            c := exec.Command("/sbin/modprobe", mod)
+            c.Stdout = os.Stderr
+            c.Stderr = os.Stderr
+            err := c.Run()
+            if (err != nil)
+            {
+                klog.Errorf("/sbin/modprobe %s failed: %v", mod, err)
+                vduse = false
+                break
+            }
+        }
+    }
+    // Check that vdpa tool functions
+    if (vduse)
+    {
+        c := exec.Command("/sbin/vdpa", "-j", "dev")
+        c.Stderr = os.Stderr
+        err := c.Run()
+        if (err != nil)
+        {
+            klog.Errorf("/sbin/vdpa -j dev failed: %v", err)
+            vduse = false
+        }
+    }
+    if (!vduse)
+    {
+        klog.Errorf(
+            "Your host apparently has no VDUSE support. VDUSE support disabled, NBD will be used to map devices."+
+            " For VDUSE you need at least Linux 5.15 and the following kernel modules: vdpa, virtio-vdpa, vduse.",
+        )
+    }
+    return vduse
 }
 
 // NodeStageVolume mounts the volume to a staging path on the node.
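The stateDir normalization above matters later: the VDPA ID, the state file and the pid file are all derived from a single os.CreateTemp name. A standalone sketch of that naming scheme (the concrete path is illustrative):

package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

func main() {
	// os.CreateTemp(ns.stateDir, "vitastor-vduse-*.json") yields something like:
	stateFile := "/run/vitastor-csi/vitastor-vduse-1234567890.json" // hypothetical
	// Same result as the diff's vdpaId[0:len(vdpaId)-5] ("remove .json").
	vdpaId := strings.TrimSuffix(filepath.Base(stateFile), ".json")
	pidFile := filepath.Dir(stateFile) + "/" + vdpaId + ".pid"
	fmt.Println(vdpaId)  // vitastor-vduse-1234567890
	fmt.Println(pidFile) // /run/vitastor-csi/vitastor-vduse-1234567890.pid
}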
@@ -61,6 +140,303 @@ func Contains(list []string, s string) bool
     return false
 }
 
+func (ns *NodeServer) mapNbd(volName string, ctxVars map[string]string, readonly bool) (string, error)
+{
+    // Map NBD device
+    // FIXME: Check if already mapped
+    args := []string{
+        "map", "--image", volName,
+    }
+    if (ctxVars["configPath"] != "")
+    {
+        args = append(args, "--config_path", ctxVars["configPath"])
+    }
+    if (readonly)
+    {
+        args = append(args, "--readonly", "1")
+    }
+    stdout, stderr, err := system("/usr/bin/vitastor-nbd", args...)
+    dev := strings.TrimSpace(string(stdout))
+    if (dev == "")
+    {
+        return "", fmt.Errorf("vitastor-nbd did not return the name of NBD device. output: %s", stderr)
+    }
+    return dev, err
+}
+
+func (ns *NodeServer) unmapNbd(devicePath string)
+{
+    // unmap NBD device
+    unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
+    if (unmapErr != nil)
+    {
+        klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
+    }
+}
+
+func findByPidFile(pidFile string) (*os.Process, error)
+{
+    klog.Infof("killing process with PID from file %s", pidFile)
+    pidBuf, err := os.ReadFile(pidFile)
+    if (err != nil)
+    {
+        return nil, err
+    }
+    pid, err := strconv.ParseInt(strings.TrimSpace(string(pidBuf)), 0, 64)
+    if (err != nil)
+    {
+        return nil, err
+    }
+    proc, err := os.FindProcess(int(pid))
+    if (err != nil)
+    {
+        return nil, err
+    }
+    return proc, nil
+}
+
+func killByPidFile(pidFile string) error
+{
+    proc, err := findByPidFile(pidFile)
+    if (err != nil)
+    {
+        return err
+    }
+    return proc.Signal(syscall.SIGTERM)
+}
+
+func startStorageDaemon(vdpaId, volName, pidFile, configPath string, readonly bool) error
+{
+    // Start qemu-storage-daemon
+    blockSpec := map[string]interface{}{
+        "node-name": "disk1",
+        "driver": "vitastor",
+        "image": volName,
+        "cache": map[string]bool{
+            "direct": true,
+            "no-flush": false,
+        },
+        "discard": "unmap",
+    }
+    if (configPath != "")
+    {
+        blockSpec["config-path"] = configPath
+    }
+    blockSpecJson, _ := json.Marshal(blockSpec)
+    writable := "true"
+    if (readonly)
+    {
+        writable = "false"
+    }
+    _, _, err := system(
+        "/usr/bin/qemu-storage-daemon", "--daemonize", "--pidfile", pidFile, "--blockdev", string(blockSpecJson),
+        "--export", "vduse-blk,id="+vdpaId+",node-name=disk1,name="+vdpaId+",num-queues=16,queue-size=128,writable="+writable,
+    )
+    return err
+}
+
+func (ns *NodeServer) mapVduse(volName string, ctxVars map[string]string, readonly bool) (string, string, error)
+{
+    // Generate state file
+    stateFd, err := os.CreateTemp(ns.stateDir, "vitastor-vduse-*.json")
+    if (err != nil)
+    {
+        return "", "", err
+    }
+    stateFile := stateFd.Name()
+    stateFd.Close()
+    vdpaId := filepath.Base(stateFile)
+    vdpaId = vdpaId[0:len(vdpaId)-5] // remove ".json"
+    pidFile := ns.stateDir + vdpaId + ".pid"
+    // Map VDUSE device via qemu-storage-daemon
+    err = startStorageDaemon(vdpaId, volName, pidFile, ctxVars["configPath"], readonly)
+    if (err == nil)
+    {
+        // Add device to VDPA bus
+        _, _, err = system("/sbin/vdpa", "-j", "dev", "add", "name", vdpaId, "mgmtdev", "vduse")
+        if (err == nil)
+        {
+            // Find block device name
+            var matches []string
+            matches, err = filepath.Glob("/sys/bus/vdpa/devices/"+vdpaId+"/virtio*/block/*")
+            if (err == nil && len(matches) == 0)
+            {
+                err = errors.New("/sys/bus/vdpa/devices/"+vdpaId+"/virtio*/block/* is not found")
+            }
+            if (err == nil)
+            {
+                blockdev := "/dev/"+filepath.Base(matches[0])
+                _, err = os.Stat(blockdev)
+                if (err == nil)
+                {
+                    // Generate state file
+                    stateJSON, _ := json.Marshal(&DeviceState{
+                        ConfigPath: ctxVars["configPath"],
+                        VdpaId: vdpaId,
+                        Image: volName,
+                        Blockdev: blockdev,
+                        Readonly: readonly,
+                        PidFile: pidFile,
+                    })
+                    err = os.WriteFile(stateFile, stateJSON, 0600)
+                    if (err == nil)
+                    {
+                        return blockdev, vdpaId, nil
+                    }
+                }
+            }
+        }
+        killErr := killByPidFile(pidFile)
+        if (killErr != nil)
+        {
+            klog.Errorf("Failed to kill started qemu-storage-daemon: %v", killErr)
+        }
+        os.Remove(stateFile)
+        os.Remove(pidFile)
+    }
+    return "", "", err
+}
+
+func (ns *NodeServer) unmapVduse(devicePath string)
+{
+    if (len(devicePath) < 6 || devicePath[0:6] != "/dev/v")
+    {
+        klog.Errorf("%s does not start with /dev/v", devicePath)
+        return
+    }
+    vduseDev, err := os.Readlink("/sys/block/"+devicePath[5:])
+    if (err != nil)
+    {
+        klog.Errorf("%s is not a symbolic link to VDUSE device (../devices/virtual/vduse/xxx): %v", devicePath, err)
+        return
+    }
+    vdpaId := ""
+    p := strings.Index(vduseDev, "/vduse/")
+    if (p >= 0)
+    {
+        vduseDev = vduseDev[p+7:]
+        p = strings.Index(vduseDev, "/")
+        if (p >= 0)
+        {
+            vdpaId = vduseDev[0:p]
+        }
+    }
+    if (vdpaId == "")
+    {
+        klog.Errorf("%s is not a symbolic link to VDUSE device (../devices/virtual/vduse/xxx), but is %v", devicePath, vduseDev)
+        return
+    }
+    ns.unmapVduseById(vdpaId)
+}
+
+func (ns *NodeServer) unmapVduseById(vdpaId string)
+{
+    _, err := os.Stat("/sys/bus/vdpa/devices/"+vdpaId)
+    if (err != nil)
+    {
+        klog.Errorf("failed to stat /sys/bus/vdpa/devices/"+vdpaId+": %v", err)
+    }
+    else
+    {
+        _, _, _ = system("/sbin/vdpa", "-j", "dev", "del", vdpaId)
+    }
+    stateFile := ns.stateDir + vdpaId + ".json"
+    os.Remove(stateFile)
+    pidFile := ns.stateDir + vdpaId + ".pid"
+    _, err = os.Stat(pidFile)
+    if (os.IsNotExist(err))
+    {
+        // ok, already killed
+    }
+    else if (err != nil)
+    {
+        klog.Errorf("Failed to stat %v: %v", pidFile, err)
+        return
+    }
+    else
+    {
+        err = killByPidFile(pidFile)
+        if (err != nil)
+        {
+            klog.Errorf("Failed to kill started qemu-storage-daemon: %v", err)
+        }
+        os.Remove(pidFile)
+    }
+}
+
+func (ns *NodeServer) restoreVduseDaemons()
+{
+    pattern := ns.stateDir+"vitastor-vduse-*.json"
+    matches, err := filepath.Glob(pattern)
+    if (err != nil)
+    {
+        klog.Errorf("failed to list %s: %v", pattern, err)
+    }
+    if (len(matches) == 0)
+    {
+        return
+    }
+    devList := make(map[string]interface{})
+    // example output: {"dev":{"test1":{"type":"block","mgmtdev":"vduse","vendor_id":0,"max_vqs":16,"max_vq_size":128}}}
+    devListJSON, _, err := system("/sbin/vdpa", "-j", "dev", "list")
+    if (err != nil)
+    {
+        return
+    }
+    err = json.Unmarshal(devListJSON, &devList)
+    devs, ok := devList["dev"].(map[string]interface{})
+    if (err != nil || !ok)
+    {
+        klog.Errorf("/sbin/vdpa -j dev list returned bad JSON (error %v): %v", err, string(devListJSON))
+        return
+    }
+    for _, stateFile := range matches
+    {
+        vdpaId := filepath.Base(stateFile)
+        vdpaId = vdpaId[0:len(vdpaId)-5]
+        // Check if VDPA device is still added to the bus
+        if (devs[vdpaId] != nil)
+        {
+            // Check if the storage daemon is still active
+            pidFile := ns.stateDir + vdpaId + ".pid"
+            exists := false
+            proc, err := findByPidFile(pidFile)
+            if (err == nil)
+            {
+                exists = proc.Signal(syscall.Signal(0)) == nil
+            }
+            if (!exists)
+            {
+                // Restart daemon
+                stateJSON, err := os.ReadFile(stateFile)
+                if (err != nil)
+                {
+                    klog.Warningf("error reading state file %v: %v", stateFile, err)
+                }
+                else
+                {
+                    var state DeviceState
+                    err := json.Unmarshal(stateJSON, &state)
+                    if (err != nil)
+                    {
+                        klog.Warningf("state file %v contains invalid JSON (error %v): %v", stateFile, err, string(stateJSON))
+                    }
+                    else
+                    {
+                        klog.Warningf("restarting storage daemon for volume %v (VDPA ID %v)", state.Image, vdpaId)
+                        _ = startStorageDaemon(vdpaId, state.Image, pidFile, state.ConfigPath, state.Readonly)
+                    }
+                }
+            }
+        }
+        else
+        {
+            // Unused, clean it up
+            ns.unmapVduseById(vdpaId)
+        }
+    }
+}
+
 // NodePublishVolume mounts the volume mounted to the staging path to the target path
 func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolumeRequest) (*csi.NodePublishVolumeResponse, error)
 {
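restoreVduseDaemons above probes daemon liveness with proc.Signal(syscall.Signal(0)); on Linux os.FindProcess always succeeds even for dead PIDs, so signal 0 is the real existence test. A standalone sketch:

package main

import (
	"fmt"
	"os"
	"syscall"
)

func main() {
	// os.FindProcess never fails on Linux, even for nonexistent PIDs...
	proc, _ := os.FindProcess(os.Getpid())
	// ...so sending signal 0 (a no-op) is what actually probes existence.
	alive := proc.Signal(syscall.Signal(0)) == nil
	fmt.Println("alive:", alive) // true: we are checking our own PID
}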
@@ -70,10 +446,10 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
     isBlock := req.GetVolumeCapability().GetBlock() != nil
 
     // Check that it's not already mounted
-    _, error := mount.IsNotMountPoint(ns.mounter, targetPath)
-    if (error != nil)
+    _, err := mount.IsNotMountPoint(ns.mounter, targetPath)
+    if (err != nil)
     {
-        if (os.IsNotExist(error))
+        if (os.IsNotExist(err))
         {
             if (isBlock)
             {

@@ -81,13 +457,13 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
                 if (err != nil)
                 {
                     klog.Errorf("failed to create block device mount target %s with error: %v", targetPath, err)
-                    return nil, status.Error(codes.Internal, err.Error())
+                    return nil, err
                 }
                 err = pathFile.Close()
                 if (err != nil)
                 {
                     klog.Errorf("failed to close %s with error: %v", targetPath, err)
-                    return nil, status.Error(codes.Internal, err.Error())
+                    return nil, err
                 }
             }
             else

@@ -96,70 +472,57 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
             if (err != nil)
             {
                 klog.Errorf("failed to create fs mount target %s with error: %v", targetPath, err)
-                return nil, status.Error(codes.Internal, err.Error())
+                return nil, err
             }
         }
     }
     else
     {
-        return nil, status.Error(codes.Internal, error.Error())
+        return nil, err
     }
 }
 
     ctxVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
+    err = json.Unmarshal([]byte(req.VolumeId), &ctxVars)
     if (err != nil)
     {
         return nil, status.Error(codes.Internal, "volume ID not in JSON format")
     }
     volName := ctxVars["name"]
 
-    _, etcdUrl, etcdPrefix := GetConnectionParams(ctxVars)
-    if (len(etcdUrl) == 0)
-    {
-        return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
-    }
-
-    // Map NBD device
-    // FIXME: Check if already mapped
-    args := []string{
-        "map", "--etcd_address", strings.Join(etcdUrl, ","),
-        "--etcd_prefix", etcdPrefix,
-        "--image", volName,
-    };
-    if (ctxVars["configPath"] != "")
-    {
-        args = append(args, "--config_path", ctxVars["configPath"])
-    }
-    if (req.GetReadonly())
-    {
-        args = append(args, "--readonly", "1")
-    }
-    c := exec.Command("/usr/bin/vitastor-nbd", args...)
-    var stdout, stderr bytes.Buffer
-    c.Stdout, c.Stderr = &stdout, &stderr
-    err = c.Run()
-    stdoutStr, stderrStr := string(stdout.Bytes()), string(stderr.Bytes())
-    if (err != nil)
-    {
-        klog.Errorf("vitastor-nbd map failed: %s, status %s\n", stdoutStr+stderrStr, err)
-        return nil, status.Error(codes.Internal, stdoutStr+stderrStr+" (status "+err.Error()+")")
-    }
-    devicePath := strings.TrimSpace(stdoutStr)
-
-    // Check existing format
+    _, err = GetConnectionParams(ctxVars)
+    if (err != nil)
+    {
+        return nil, err
+    }
+
+    var devicePath, vdpaId string
+    if (!ns.useVduse)
+    {
+        devicePath, err = ns.mapNbd(volName, ctxVars, req.GetReadonly())
+    }
+    else
+    {
+        devicePath, vdpaId, err = ns.mapVduse(volName, ctxVars, req.GetReadonly())
+    }
+    if (err != nil)
+    {
+        return nil, err
+    }
+
     diskMounter := &mount.SafeFormatAndMount{Interface: ns.mounter, Exec: utilexec.New()}
+    if (isBlock)
+    {
+        err = diskMounter.Mount(devicePath, targetPath, "", []string{"bind"})
+    }
+    else
+    {
+        // Check existing format
     existingFormat, err := diskMounter.GetDiskFormat(devicePath)
     if (err != nil)
     {
         klog.Errorf("failed to get disk format for path %s, error: %v", err)
-        // unmap NBD device
-        unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
-        if (unmapErr != nil)
-        {
-            klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
-        }
-        return nil, err
+        goto unmap
     }

@@ -176,41 +539,49 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
         {
             opt = append(opt, "nouuid")
         }
+        if (fsType == "ext4")
+        {
+            opt = append(opt, "relatime","lazytime")
+        }
         readOnly := Contains(opt, "ro")
         if (existingFormat == "" && !readOnly)
         {
-            args := []string{}
+            var cmdOut []byte
             switch fsType
             {
             case "ext4":
-                args = []string{"-m0", "-Enodiscard,lazy_itable_init=1,lazy_journal_init=1", devicePath}
+                args := []string{"-m0", "-Enodiscard,lazy_itable_init=1,lazy_journal_init=1", devicePath}
+                cmdOut, err = diskMounter.Exec.Command("mkfs.ext4", args...).CombinedOutput()
             case "xfs":
-                args = []string{"-K", devicePath}
+                cmdOut, err = diskMounter.Exec.Command("mkfs.xfs", "-K", devicePath).CombinedOutput()
             }
-            if (len(args) > 0)
+            if (err != nil)
             {
-                cmdOut, cmdErr := diskMounter.Exec.Command("mkfs."+fsType, args...).CombinedOutput()
-                if (cmdErr != nil)
-                {
-                    klog.Errorf("failed to run mkfs error: %v, output: %v", cmdErr, string(cmdOut))
-                    // unmap NBD device
-                    unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
-                    if (unmapErr != nil)
-                    {
-                        klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
-                    }
-                    return nil, status.Error(codes.Internal, cmdErr.Error())
-                }
+                klog.Errorf("failed to run mkfs error: %v, output: %v", err, string(cmdOut))
+                goto unmap
             }
         }
-        if (isBlock)
-        {
-            opt = append(opt, "bind")
-            err = diskMounter.Mount(devicePath, targetPath, fsType, opt)
-        }
-        else
-        {
         err = diskMounter.FormatAndMount(devicePath, targetPath, fsType, opt)
+
+        // Try to run online resize on mount.
+        // FIXME: Implement online resize. It requires online resize support in vitastor-nbd.
+        if (err == nil && existingFormat != "" && !readOnly)
+        {
+            var cmdOut []byte
+            switch (fsType)
+            {
+            case "ext4":
+                cmdOut, err = diskMounter.Exec.Command("resize2fs", devicePath).CombinedOutput()
+            case "xfs":
+                cmdOut, err = diskMounter.Exec.Command("xfs_growfs", devicePath).CombinedOutput()
+            }
+            if (err != nil)
+            {
+                klog.Errorf("failed to run resizefs error: %v, output: %v", err, string(cmdOut))
+                goto unmap
+            }
+        }
     }
     if (err != nil)
     {

@@ -218,15 +589,20 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
             "failed to mount device path (%s) to path (%s) for volume (%s) error: %s",
             devicePath, targetPath, volName, err,
         )
-        // unmap NBD device
-        unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
-        if (unmapErr != nil)
-        {
-            klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
-        }
+        goto unmap
-        return nil, status.Error(codes.Internal, err.Error())
|
|
||||||
}
|
}
|
||||||
return &csi.NodePublishVolumeResponse{}, nil
|
return &csi.NodePublishVolumeResponse{}, nil
|
||||||
|
|
||||||
|
unmap:
|
||||||
|
if (!ns.useVduse || len(devicePath) >= 8 && devicePath[0:8] == "/dev/nbd")
|
||||||
|
{
|
||||||
|
ns.unmapNbd(devicePath)
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
ns.unmapVduseById(vdpaId)
|
||||||
|
}
|
||||||
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// NodeUnpublishVolume unmounts the volume from the target path
|
// NodeUnpublishVolume unmounts the volume from the target path
|
||||||
@@ -241,25 +617,31 @@ func (ns *NodeServer) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpu
|
|||||||
{
|
{
|
||||||
return nil, status.Error(codes.NotFound, "Target path not found")
|
return nil, status.Error(codes.NotFound, "Target path not found")
|
||||||
}
|
}
|
||||||
return nil, status.Error(codes.Internal, err.Error())
|
return nil, err
|
||||||
}
|
}
|
||||||
if (devicePath == "")
|
if (devicePath == "")
|
||||||
{
|
{
|
||||||
return nil, status.Error(codes.NotFound, "Volume not mounted")
|
// volume not mounted
|
||||||
|
klog.Warningf("%s is not a mountpoint, deleting", targetPath)
|
||||||
|
os.Remove(targetPath)
|
||||||
|
return &csi.NodeUnpublishVolumeResponse{}, nil
|
||||||
}
|
}
|
||||||
// unmount
|
// unmount
|
||||||
err = mount.CleanupMountPoint(targetPath, ns.mounter, false)
|
err = mount.CleanupMountPoint(targetPath, ns.mounter, false)
|
||||||
if (err != nil)
|
if (err != nil)
|
||||||
{
|
{
|
||||||
return nil, status.Error(codes.Internal, err.Error())
|
return nil, err
|
||||||
}
|
}
|
||||||
// unmap NBD device
|
// unmap NBD device
|
||||||
if (refCount == 1)
|
if (refCount == 1)
|
||||||
{
|
{
|
||||||
unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
|
if (!ns.useVduse)
|
||||||
if (unmapErr != nil)
|
|
||||||
{
|
{
|
||||||
klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
|
ns.unmapNbd(devicePath)
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
ns.unmapVduse(devicePath)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return &csi.NodeUnpublishVolumeResponse{}, nil
|
return &csi.NodeUnpublishVolumeResponse{}, nil
|
||||||
|
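The change above replaces three copies of inline `vitastor-nbd unmap` cleanup with a single `unmap` label that also understands VDUSE devices. A minimal self-contained sketch of that pattern (the helper bodies and function signatures here are illustrative stand-ins, not the driver's real implementations):

```go
package main

import "fmt"

type NodeServer struct{ useVduse bool }

func (ns *NodeServer) unmapNbd(dev string)      { fmt.Println("nbd unmap:", dev) }
func (ns *NodeServer) unmapVduseById(id string) { fmt.Println("vduse unmap:", id) }

// publish maps a device, tries to mount it, and tears the mapping down
// again on any failure - one exit path instead of three copies.
func (ns *NodeServer) publish(mapDev func() (string, string, error), mnt func(string) error) (err error) {
    var devicePath, vdpaId string
    devicePath, vdpaId, err = mapDev()
    if err != nil {
        return err // nothing mapped yet, nothing to clean up
    }
    if err = mnt(devicePath); err != nil {
        goto unmap
    }
    return nil
unmap:
    // NBD mappings always live at /dev/nbdN; anything else came from VDUSE.
    if !ns.useVduse || len(devicePath) >= 8 && devicePath[0:8] == "/dev/nbd" {
        ns.unmapNbd(devicePath)
    } else {
        ns.unmapVduseById(vdpaId)
    }
    return err
}

func main() {
    ns := &NodeServer{}
    err := ns.publish(
        func() (string, string, error) { return "/dev/nbd0", "", nil },
        func(dev string) error { return fmt.Errorf("mount %s failed", dev) },
    )
    fmt.Println("result:", err)
}
```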
debian/changelog (vendored):

@@ -1,10 +1,10 @@
-vitastor (1.0.0-1) unstable; urgency=medium
+vitastor (1.3.1-1) unstable; urgency=medium
 
   * Bugfixes
 
  -- Vitaliy Filippov <vitalif@yourcmc.ru>  Fri, 03 Jun 2022 02:09:44 +0300
 
-vitastor (1.0.0-1) unstable; urgency=medium
+vitastor (0.7.0-1) unstable; urgency=medium
 
   * Implement NFS proxy
   * Add documentation
debian/control (vendored):

@@ -2,7 +2,7 @@ Source: vitastor
 Section: admin
 Priority: optional
 Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
-Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev, libisal-dev
+Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev, libisal-dev, cmake, pkg-config
 Standards-Version: 4.5.0
 Homepage: https://vitastor.io/
 Rules-Requires-Root: no
debian/patched-qemu.Dockerfile (vendored):

@@ -7,7 +7,7 @@ ARG REL=
 
 WORKDIR /root
 
-RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" ]; then \
+RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" -o "$REL" = "bookworm" ]; then \
         echo "deb http://deb.debian.org/debian $REL-backports main" >> /etc/apt/sources.list; \
         echo >> /etc/apt/preferences; \
         echo 'Package: *' >> /etc/apt/preferences; \
@@ -45,7 +45,7 @@ RUN set -e; \
     rm -rf /root/packages/qemu-$REL/*; \
     cd /root/packages/qemu-$REL; \
     dpkg-source -x /root/qemu*.dsc; \
-    QEMU_VER=$(ls -d qemu*/ | perl -pe 's!^.*(\d+\.\d+).*!$1!'); \
+    QEMU_VER=$(ls -d qemu*/ | perl -pe 's!^.*?(\d+\.\d+).*!$1!'); \
    D=$(ls -d qemu*/); \
    cp /root/vitastor/patches/qemu-$QEMU_VER-vitastor.patch ./qemu-*/debian/patches; \
    echo qemu-$QEMU_VER-vitastor.patch >> $D/debian/patches/series; \
@@ -54,7 +54,8 @@ RUN set -e; \
    quilt add block/vitastor.c; \
    cp /root/vitastor/src/qemu_driver.c block/vitastor.c; \
    quilt refresh; \
-   V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor3; \
+   V=$(head -n1 debian/changelog | perl -pe 's/5\.2\+dfsg-9/5.2+dfsg-11/; s/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor4; \
+   if [ "$REL" = bullseye ]; then V=${V}bullseye; fi; \
    DEBEMAIL="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v $V 'Plug Vitastor block driver'; \
    DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
    rm -rf /root/packages/qemu-$REL/qemu-*/
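The `.*` → `.*?` tweak in the `QEMU_VER=` line matters for three-component versions: a greedy `.*` lets the regex engine capture the *last* `\d+\.\d+` in the directory name, while the lazy `.*?` anchors on the first. With a hypothetical `qemu-5.2.0+dfsg/` directory:

```
$ echo 'qemu-5.2.0+dfsg/' | perl -pe 's!^.*(\d+\.\d+).*!$1!'
2.0
$ echo 'qemu-5.2.0+dfsg/' | perl -pe 's!^.*?(\d+\.\d+).*!$1!'
5.2
```

Only the lazy form yields the `5.2` needed to pick the right `qemu-5.2-vitastor.patch` file.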
debian/vitastor.Dockerfile (vendored):

@@ -35,8 +35,8 @@ RUN set -e -x; \
    mkdir -p /root/packages/vitastor-$REL; \
    rm -rf /root/packages/vitastor-$REL/*; \
    cd /root/packages/vitastor-$REL; \
-   cp -r /root/vitastor vitastor-1.0.0; \
-   cd vitastor-1.0.0; \
+   cp -r /root/vitastor vitastor-1.3.1; \
+   cd vitastor-1.3.1; \
    ln -s /root/fio-build/fio-*/ ./fio; \
    FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
    ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -49,8 +49,8 @@ RUN set -e -x; \
    rm -rf a b; \
    echo "dep:fio=$FIO" > debian/fio_version; \
    cd /root/packages/vitastor-$REL; \
-   tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.0.0.orig.tar.xz vitastor-1.0.0; \
-   cd vitastor-1.0.0; \
+   tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.3.1.orig.tar.xz vitastor-1.3.1; \
+   cd vitastor-1.3.1; \
    V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
    DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
    DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
@@ -33,6 +33,7 @@ In the future, additional configuration methods may be added:
 
 - [Common](config/common.en.md)
 - [Network](config/network.en.md)
+- [Client](config/client.en.md)
 - [Global Disk Layout](config/layout-cluster.en.md)
 - [OSD Disk Layout](config/layout-osd.en.md)
 - [OSD Runtime Parameters](config/osd.en.md)

@@ -36,6 +36,7 @@
 
 - [Общие](config/common.ru.md)
 - [Сеть](config/network.ru.md)
+- [Клиентский код](config/client.ru.md)
 - [Глобальные дисковые параметры](config/layout-cluster.ru.md)
 - [Дисковые параметры OSD](config/layout-osd.ru.md)
 - [Прочие параметры OSD](config/osd.ru.md)
docs/config/client.en.md (new file, 137 lines):

@@ -0,0 +1,137 @@
[Documentation](../../README.md#documentation) → [Configuration](../config.en.md) → Client Parameters

-----

[Читать на русском](client.ru.md)

# Client Parameters

These parameters apply only to clients and affect their interaction with
the cluster.

- [client_max_dirty_bytes](#client_max_dirty_bytes)
- [client_max_dirty_ops](#client_max_dirty_ops)
- [client_enable_writeback](#client_enable_writeback)
- [client_max_buffered_bytes](#client_max_buffered_bytes)
- [client_max_buffered_ops](#client_max_buffered_ops)
- [client_max_writeback_iodepth](#client_max_writeback_iodepth)
- [nbd_timeout](#nbd_timeout)
- [nbd_max_devices](#nbd_max_devices)
- [nbd_max_part](#nbd_max_part)

## client_max_dirty_bytes

- Type: integer
- Default: 33554432
- Can be changed online: yes

Without [immediate_commit](layout-cluster.en.md#immediate_commit)=all this parameter sets the limit of "dirty"
(not committed by fsync) data allowed by the client before forcing an
additional fsync and committing the data. Also note that the client always
holds a copy of uncommitted data in memory, so this setting also affects
RAM usage of clients.

## client_max_dirty_ops

- Type: integer
- Default: 1024
- Can be changed online: yes

Same as client_max_dirty_bytes, but instead of total size, limits the number
of uncommitted write operations.

## client_enable_writeback

- Type: boolean
- Default: false
- Can be changed online: yes

This parameter enables client-side write buffering. This means that write
requests are accumulated in memory for a short time before being sent to
a Vitastor cluster, which allows sending them in parallel and increases
performance of some applications. Writes are buffered until the client forces
a flush with fsync() or until the amount of buffered writes exceeds the
limit.

Write buffering significantly increases performance of some applications,
for example, CrystalDiskMark under Windows (LOL :-D), but also any other
applications if they do writes in one of two non-optimal ways: either if
they do a lot of small (4 kb or so) sequential writes, or if they do a lot
of small random writes, but without any parallelism or asynchrony, and also
without calling fsync().

With write buffering enabled, you can expect around 22000 T1Q1 random write
iops in QEMU more or less regardless of the quality of your SSDs, and this
number is in fact bound by QEMU itself rather than Vitastor (check it
yourself by adding a "driver=null-co" disk in QEMU). Without write
buffering, the current record is 9900 iops, but the number is usually
even lower with non-ideal hardware, for example, it may be 5000 iops.

Even when this parameter is enabled, write buffering isn't enabled until
the client explicitly allows it, because enabling it without the client
being aware of the fact that its writes may be buffered may lead to data
loss. Because of this, older versions of clients don't support write
buffering at all, newer versions of the QEMU driver allow write buffering
only if it's enabled in disk settings with `-blockdev cache.direct=false`,
and newer versions of FIO only allow write buffering if you don't specify
`-direct=1`. NBD and NFS drivers allow write buffering by default.

You can also overcome this restriction with the `client_writeback_allowed`
parameter, but you shouldn't do that unless you **really** know what you
are doing.

## client_max_buffered_bytes

- Type: integer
- Default: 33554432
- Can be changed online: yes

Maximum total size of buffered writes which triggers write-back when reached.

## client_max_buffered_ops

- Type: integer
- Default: 1024
- Can be changed online: yes

Maximum number of buffered writes which triggers write-back when reached.
Multiple consecutive modified data regions are counted as 1 write here.

## client_max_writeback_iodepth

- Type: integer
- Default: 256
- Can be changed online: yes

Maximum number of parallel writes when flushing buffered data to the server.
|
||||||
|
|
||||||
|
- Type: seconds
|
||||||
|
- Default: 300
|
||||||
|
|
||||||
|
Timeout for I/O operations for [NBD](../usage/nbd.en.md). If an operation
|
||||||
|
executes for longer than this timeout, including when your cluster is just
|
||||||
|
temporarily down for more than timeout, the NBD device will detach by itself
|
||||||
|
(and possibly break the mounted file system).
|
||||||
|
|
||||||
|
You can set timeout to 0 to never detach, but in that case you won't be
|
||||||
|
able to remove the kernel device at all if the NBD process dies - you'll have
|
||||||
|
to reboot the host.
|
||||||
|
|
||||||
|
## nbd_max_devices
|
||||||
|
|
||||||
|
- Type: integer
|
||||||
|
- Default: 64
|
||||||
|
|
||||||
|
Maximum number of NBD devices in the system. This value is passed as
|
||||||
|
`nbds_max` parameter for the nbd kernel module when vitastor-nbd autoloads it.
|
||||||
|
|
||||||
|
## nbd_max_part
|
||||||
|
|
||||||
|
- Type: integer
|
||||||
|
- Default: 3
|
||||||
|
|
||||||
|
Maximum number of partitions per NBD device. This value is passed as
|
||||||
|
`max_part` parameter for the nbd kernel module when vitastor-nbd autoloads it.
|
||||||
|
Note that (nbds_max)*(1+max_part) usually can't exceed 256.
|
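For reference, both `nbd_*` limits only matter at the moment vitastor-nbd first loads the kernel module; the mapping itself is the same `vitastor-nbd map` call used by the CSI driver earlier in this diff (the image name is a placeholder):

```
vitastor-nbd map --image testimg
# module autoload is then roughly equivalent to:
#   modprobe nbd nbds_max=64 max_part=3
```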
docs/config/client.ru.md (new file, 137 lines):

@@ -0,0 +1,137 @@
[Документация](../../README-ru.md#документация) → [Конфигурация](../config.ru.md) → Параметры клиентского кода

-----

[Read in English](client.en.md)

# Параметры клиентского кода

Данные параметры применяются только к клиентам Vitastor (QEMU, fio, NBD) и
затрагивают логику их работы с кластером.

- [client_max_dirty_bytes](#client_max_dirty_bytes)
- [client_max_dirty_ops](#client_max_dirty_ops)
- [client_enable_writeback](#client_enable_writeback)
- [client_max_buffered_bytes](#client_max_buffered_bytes)
- [client_max_buffered_ops](#client_max_buffered_ops)
- [client_max_writeback_iodepth](#client_max_writeback_iodepth)
- [nbd_timeout](#nbd_timeout)
- [nbd_max_devices](#nbd_max_devices)
- [nbd_max_part](#nbd_max_part)

## client_max_dirty_bytes

- Тип: целое число
- Значение по умолчанию: 33554432
- Можно менять на лету: да

При работе без [immediate_commit](layout-cluster.ru.md#immediate_commit)=all - это лимит объёма "грязных" (не
зафиксированных fsync-ом) данных, при достижении которого клиент будет
принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
что в этом случае до момента fsync клиент хранит копию незафиксированных
данных в памяти, то есть, настройка влияет на потребление памяти клиентами.

## client_max_dirty_ops

- Тип: целое число
- Значение по умолчанию: 1024
- Можно менять на лету: да

Аналогично client_max_dirty_bytes, но ограничивает количество
незафиксированных операций записи вместо их общего объёма.

## client_enable_writeback

- Тип: булево (да/нет)
- Значение по умолчанию: false
- Можно менять на лету: да

Данный параметр разрешает включать буферизацию записи в памяти. Буферизация
означает, что операции записи отправляются на кластер Vitastor не сразу, а
могут небольшое время накапливаться в памяти и сбрасываться сразу пакетами,
до тех пор, пока либо не будет превышен лимит неотправленных записей, либо
пока клиент не вызовет fsync.

Буферизация значительно повышает производительность некоторых приложений,
например, CrystalDiskMark в Windows (ха-ха :-D), но также и любых других,
которые пишут на диск неоптимально: либо последовательно, но мелкими блоками
(например, по 4 кб), либо случайно, но без параллелизма и без fsync - то
есть, например, отправляя 128 операций записи в разные места диска, но не
все сразу с помощью асинхронного I/O, а по одной.

В QEMU с буферизацией записи можно ожидать показателя примерно 22000
операций случайной записи в секунду в 1 поток и с глубиной очереди 1 (T1Q1)
без fsync, почти вне зависимости от того, насколько хороши ваши диски - эта
цифра упирается в сам QEMU. Без буферизации рекорд пока что - 9900 операций
в секунду, но на железе похуже может быть и поменьше, например, 5000 операций
в секунду.

При этом, даже если данный параметр включён, буферизация не включается, если
явно не разрешена клиентом, т.к. если клиент не знает, что запросы записи
буферизуются, это может приводить к потере данных. Поэтому в старых версиях
клиентских драйверов буферизация записи не включается вообще, в новых
версиях QEMU-драйвера включается, только если разрешена опцией диска
`-blockdev cache.direct=false`, а в fio - только если нет опции `-direct=1`.
В NBD и NFS драйверах буферизация записи разрешена по умолчанию.

Можно обойти и это ограничение с помощью параметра `client_writeback_allowed`,
но делать так не надо, если только вы не уверены в том, что делаете, на все
100%. :-)

## client_max_buffered_bytes

- Тип: целое число
- Значение по умолчанию: 33554432
- Можно менять на лету: да

Максимальный общий размер буферизованных записей, при достижении которого
начинается процесс сброса данных на сервер.

## client_max_buffered_ops

- Тип: целое число
- Значение по умолчанию: 1024
- Можно менять на лету: да

Максимальное количество буферизованных записей, при достижении которого
начинается процесс сброса данных на сервер. При этом несколько
последовательных изменённых областей здесь считаются 1 записью.

## client_max_writeback_iodepth

- Тип: целое число
- Значение по умолчанию: 256
- Можно менять на лету: да

Максимальное число параллельных операций записи при сбросе буферов на сервер.

## nbd_timeout

- Тип: секунды
- Значение по умолчанию: 300

Таймаут для операций чтения/записи через [NBD](../usage/nbd.ru.md). Если
операция выполняется дольше таймаута, включая временную недоступность
кластера на время, большее таймаута, NBD-устройство отключится само собой
(и, возможно, сломает примонтированную ФС).

Вы можете установить таймаут в 0, чтобы никогда не отключать устройство по
таймауту, но в этом случае вы вообще не сможете удалить устройство, если
процесс NBD умрёт - вам придётся перезагружать сервер.

## nbd_max_devices

- Тип: целое число
- Значение по умолчанию: 64

Максимальное число NBD-устройств в системе. Данное значение передаётся
модулю ядра nbd как параметр `nbds_max`, когда его загружает vitastor-nbd.

## nbd_max_part

- Тип: целое число
- Значение по умолчанию: 3

Максимальное число разделов на одном NBD-устройстве. Данное значение передаётся
модулю ядра nbd как параметр `max_part`, когда его загружает vitastor-nbd.
Имейте в виду, что (nbds_max)*(1+max_part) обычно не может превышать 256.
@@ -96,8 +96,9 @@ SSD cache or "media-cache" - for example, a lot of Seagate EXOS drives have
 it (they have internal SSD cache even though it's not stated in datasheets).
 
 Setting this parameter to "all" or "small" in OSD parameters requires enabling
-disable_journal_fsync and disable_meta_fsync, setting it to "all" also requires
-enabling disable_data_fsync.
+[disable_journal_fsync](layout-osd.en.md#disable_journal_fsync) and
+[disable_meta_fsync](layout-osd.en.md#disable_meta_fsync), setting it to
+"all" also requires enabling [disable_data_fsync](layout-osd.en.md#disable_data_fsync).
 
 TLDR: For optimal performance, set immediate_commit to "all" if you only use
 SSDs with supercapacitor-based power loss protection (nonvolatile
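As a sketch, an all-SSD-with-PLP cluster that wants immediate_commit=all combines a global setting with the three per-OSD fsync switches named above (key names as documented; where exactly you place them — vitastor.conf or etcd — depends on your deployment):

```
# /vitastor/config/global (or /etc/vitastor/vitastor.conf):
{ "immediate_commit": "all" }

# required on every OSD for "all" to be safe:
{ "disable_data_fsync": true, "disable_meta_fsync": true, "disable_journal_fsync": true }
```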
@@ -103,8 +103,9 @@ HDD-дисках с внутренним SSD или "медиа" кэшем - н
 указано в спецификациях).
 
 Указание "all" или "small" в настройках / командной строке OSD требует
-включения disable_journal_fsync и disable_meta_fsync, значение "all" также
-требует включения disable_data_fsync.
+включения [disable_journal_fsync](layout-osd.ru.md#disable_journal_fsync) и
+[disable_meta_fsync](layout-osd.ru.md#disable_meta_fsync), значение "all"
+также требует включения [disable_data_fsync](layout-osd.ru.md#disable_data_fsync).
 
 Итого, вкратце: для оптимальной производительности установите
 immediate_commit в значение "all", если вы используете в кластере только SSD

@@ -213,6 +213,6 @@ Thus, recommended setups are:
 3. Hybrid HDD+SSD: csum_block_size=4k + inmemory_metadata=false
 4. HDD-only, faster random read: csum_block_size=32k
 5. HDD-only, faster random write: csum_block_size=4k +
-   inmemory_metadata=false + cached_io_meta=true
+   inmemory_metadata=false + meta_io=cached
 
-See also [cached_io_meta](osd.en.md#cached_io_meta).
+See also [meta_io](osd.en.md#meta_io).

@@ -226,6 +226,6 @@ csum_block_size данных.
 3. Гибридные HDD+SSD: csum_block_size=4k + inmemory_metadata=false
 4. Только HDD, быстрее случайное чтение: csum_block_size=32k
 5. Только HDD, быстрее случайная запись: csum_block_size=4k +
-   inmemory_metadata=false + cached_io_meta=true
+   inmemory_metadata=false + meta_io=cached
 
-Смотрите также [cached_io_meta](osd.ru.md#cached_io_meta).
+Смотрите также [meta_io](osd.ru.md#meta_io).
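Spelled out as OSD configuration, recommended setup 5 ("HDD-only, faster random write") from the list above is simply (a sketch):

```
{ "csum_block_size": 4096, "inmemory_metadata": false, "meta_io": "cached" }
```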
@@ -20,6 +20,7 @@ between clients, OSDs and etcd.
 - [rdma_max_msg](#rdma_max_msg)
 - [rdma_max_recv](#rdma_max_recv)
 - [rdma_max_send](#rdma_max_send)
+- [rdma_odp](#rdma_odp)
 - [peer_connect_interval](#peer_connect_interval)
 - [peer_connect_timeout](#peer_connect_timeout)
 - [osd_idle_timeout](#osd_idle_timeout)
@@ -30,7 +31,6 @@ between clients, OSDs and etcd.
 - [etcd_slow_timeout](#etcd_slow_timeout)
 - [etcd_keepalive_timeout](#etcd_keepalive_timeout)
 - [etcd_ws_keepalive_timeout](#etcd_ws_keepalive_timeout)
-- [client_dirty_limit](#client_dirty_limit)
 
 ## tcp_header_buffer_size
 
@@ -69,11 +69,14 @@ but they are not connected to the cluster.
 - Type: string
 
 RDMA device name to use for Vitastor OSD communications (for example,
-"rocep5s0f0"). Please note that Vitastor RDMA requires Implicit On-Demand
-Paging (Implicit ODP) and Scatter/Gather (SG) support from the RDMA device
-to work. For example, Mellanox ConnectX-3 and older adapters don't have
-Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
-root to list available RDMA devices and their features.
+"rocep5s0f0"). Now Vitastor supports all adapters, even ones without
+ODP support, like Mellanox ConnectX-3 and non-Mellanox cards.
+
+Versions up to Vitastor 1.2.0 required ODP which is only present in
+Mellanox ConnectX >= 4. See also [rdma_odp](#rdma_odp).
+
+Run `ibv_devinfo -v` as root to list available RDMA devices and their
+features.
 
 Remember that you also have to configure your network switches if you use
 RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
@@ -148,6 +151,28 @@ less than `rdma_max_recv` so the receiving side doesn't run out of buffers.
 Doesn't affect memory usage - additional memory isn't allocated for send
 operations.
 
+## rdma_odp
+
+- Type: boolean
+- Default: false
+
+Use RDMA with On-Demand Paging. ODP is currently only available on Mellanox
+ConnectX-4 and newer adapters. ODP allows not registering memory explicitly
+for the RDMA adapter to be able to use it. This, in turn, allows skipping
+memory copying during sending. One would think this should improve performance,
+but **in reality** RDMA performance with ODP is **drastically** worse. Example:
+a 3-node cluster with 8 NVMe in each node and a 2*25 GBit/s ConnectX-6 RDMA network
+without ODP pushes 3950000 read iops, but only 239000 iops with ODP...
+
+This happens because the Mellanox ODP implementation seems to be based on
+message retransmissions when the adapter doesn't know about the buffer yet -
+it likely uses standard "RNR retransmissions" (RNR = receiver not ready)
+which is generally slow in RDMA/RoCE networks. Here's a presentation about
+it from the ISPASS-2021 conference: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
+
+ODP support is retained in the code just in case a good ODP implementation
+appears one day.
+
 ## peer_connect_interval
 
 - Type: seconds
@@ -240,17 +265,3 @@ etcd_report_interval to guarantee that keepalive actually works.
 
 etcd websocket ping interval required to keep the connection alive and
 detect disconnections quickly.
-
-## client_dirty_limit
-
-- Type: integer
-- Default: 33554432
-- Can be changed online: yes
-
-Without immediate_commit=all this parameter sets the limit of "dirty"
-(not committed by fsync) data allowed by the client before forcing an
-additional fsync and committing the data. Also note that the client always
-holds a copy of uncommitted data in memory so this setting also affects
-RAM usage of clients.
-
-This parameter doesn't affect OSDs themselves.
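So a typical RDMA configuration keeps ODP at its (off) default; for example (the device name is the example value from the text above):

```
{ "rdma_device": "rocep5s0f0", "rdma_odp": false }
```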
@@ -20,6 +20,7 @@
 - [rdma_max_msg](#rdma_max_msg)
 - [rdma_max_recv](#rdma_max_recv)
 - [rdma_max_send](#rdma_max_send)
+- [rdma_odp](#rdma_odp)
 - [peer_connect_interval](#peer_connect_interval)
 - [peer_connect_timeout](#peer_connect_timeout)
 - [osd_idle_timeout](#osd_idle_timeout)
@@ -30,7 +31,6 @@
 - [etcd_slow_timeout](#etcd_slow_timeout)
 - [etcd_keepalive_timeout](#etcd_keepalive_timeout)
 - [etcd_ws_keepalive_timeout](#etcd_ws_keepalive_timeout)
-- [client_dirty_limit](#client_dirty_limit)
 
 ## tcp_header_buffer_size
 
@@ -72,12 +72,15 @@ RDMA может быть нужно только если у клиентов е
 - Тип: строка
 
 Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
-Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
-Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Например,
-адаптеры Mellanox ConnectX-3 и более старые не поддерживают Implicit ODP и
-потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
-суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
-параметры и возможности.
+Сейчас Vitastor поддерживает все модели адаптеров, включая те, у которых
+нет поддержки ODP, то есть вы можете использовать RDMA с ConnectX-3 и
+картами производства не Mellanox.
+
+Версии Vitastor до 1.2.0 включительно требовали ODP, который есть только
+на Mellanox ConnectX 4 и более новых. См. также [rdma_odp](#rdma_odp).
+
+Запустите `ibv_devinfo -v` от имени суперпользователя, чтобы посмотреть
+список доступных RDMA-устройств, их параметры и возможности.
 
 Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
 правильно настроить для него коммутаторы, иначе вы можете столкнуться с
@@ -156,6 +159,29 @@ OSD в любом случае согласовывают реальное зн
 Не влияет на потребление памяти - дополнительная память на операции отправки
 не выделяется.
 
+## rdma_odp
+
+- Тип: булево (да/нет)
+- Значение по умолчанию: false
+
+Использовать RDMA с On-Demand Paging. ODP - функция, доступная пока что
+исключительно на адаптерах Mellanox ConnectX-4 и более новых. ODP позволяет
+не регистрировать память для её использования RDMA-картой. Благодаря этому
+можно не копировать данные при отправке их в сеть и, казалось бы, это должно
+улучшать производительность - но **по факту** получается так, что
+производительность только ухудшается, причём сильно. Пример - на 3-узловом
+кластере с 8 NVMe в каждом узле и сетью 2*25 Гбит/с на чтение с RDMA без ODP
+удаётся снять 3950000 iops, а с ODP - всего 239000 iops...
+
+Это происходит из-за того, что реализация ODP у Mellanox неоптимальная и
+основана на повторной передаче сообщений, когда карте не известен буфер -
+вероятно, на стандартных "RNR retransmission" (RNR = receiver not ready).
+А данные повторные передачи в RDMA/RoCE - всегда очень медленная штука.
+Презентация на эту тему с конференции ISPASS-2021: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
+
+Возможность использования ODP сохранена в коде на случай, если вдруг в один
+прекрасный день появится хорошая реализация ODP.
+
 ## peer_connect_interval
 
 - Тип: секунды
@@ -251,17 +277,3 @@ etcd_report_interval, чтобы keepalive гарантированно рабо
 - Можно менять на лету: да
 
 Интервал проверки живости вебсокет-подключений к etcd.
-
-## client_dirty_limit
-
-- Тип: целое число
-- Значение по умолчанию: 33554432
-- Можно менять на лету: да
-
-При работе без immediate_commit=all - это лимит объёма "грязных" (не
-зафиксированных fsync-ом) данных, при достижении которого клиент будет
-принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
-что в этом случае до момента fsync клиент хранит копию незафиксированных
-данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
-
-Параметр не влияет на сами OSD.
@@ -11,6 +11,7 @@ initialization and can be changed - either with an OSD restart or, for some of
 them, even without restarting by updating configuration in etcd.
 
 - [etcd_report_interval](#etcd_report_interval)
+- [etcd_stats_interval](#etcd_stats_interval)
 - [run_primary](#run_primary)
 - [osd_network](#osd_network)
 - [bind_address](#bind_address)
@@ -31,9 +32,9 @@ them, even without restarting by updating configuration in etcd.
 - [max_flusher_count](#max_flusher_count)
 - [inmemory_metadata](#inmemory_metadata)
 - [inmemory_journal](#inmemory_journal)
-- [cached_io_data](#cached_io_data)
-- [cached_io_meta](#cached_io_meta)
-- [cached_io_journal](#cached_io_journal)
+- [data_io](#data_io)
+- [meta_io](#meta_io)
+- [journal_io](#journal_io)
 - [journal_sector_buffer_count](#journal_sector_buffer_count)
 - [journal_no_same_sector_overwrites](#journal_no_same_sector_overwrites)
 - [throttle_small_writes](#throttle_small_writes)
@@ -56,11 +57,21 @@ them, even without restarting by updating configuration in etcd.
 - Type: seconds
 - Default: 5
 
-Interval at which OSDs report their state to etcd. Affects OSD lease time
+Interval at which OSDs report their liveness to etcd. Affects OSD lease time
 and thus the failover speed. Lease time is equal to this parameter value
 plus max_etcd_attempts * etcd_quick_timeout because it should be guaranteed
 that every OSD always refreshes its lease in time.
+
+## etcd_stats_interval
+
+- Type: seconds
+- Default: 30
+
+Interval at which OSDs report their statistics to etcd. Highly affects the
+imposed load on etcd, because statistics include a key for every OSD and
+for every PG. At the same time, low statistic intervals make `vitastor-cli`
+statistics more responsive.
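For example, on a large cluster you might keep the liveness interval short but report statistics less often to unload etcd (values here are illustrative, not recommendations):

```
{ "etcd_report_interval": 5, "etcd_stats_interval": 60 }
```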
 ## run_primary
 
 - Type: boolean
@@ -258,47 +269,59 @@ is typically very small because it's sufficient to have 16-32 MB journal
 for SSD OSDs. However, in theory it's possible that you'll want to turn it
 off for hybrid (HDD+SSD) OSDs with large journals on quick devices.
 
-## cached_io_data
+## data_io
 
-- Type: boolean
-- Default: false
+- Type: string
+- Default: direct
 
-Read and write *data* through Linux page cache, i.e. use a file descriptor
-opened with O_SYNC, but without O_DIRECT for I/O. May improve read
-performance for hot data and slower disks - HDDs and maybe SATA SSDs.
-Not recommended for desktop SSDs without capacitors because O_SYNC flushes
-disk cache on every write.
+I/O mode for *data*. One of "direct", "cached" or "directsync". Corresponds
+to O_DIRECT, O_SYNC and O_DIRECT|O_SYNC, respectively.
+
+Choose "cached" to use Linux page cache. This may improve read performance
+for hot data and slower disks - HDDs and maybe SATA SSDs - but will slightly
+decrease write performance for fast disks because page cache is an overhead
+itself.
+
+Choose "directsync" to use [immediate_commit](layout-cluster.en.md#immediate_commit)
+(which requires disable_data_fsync) with drives having write-back cache
+which can't be turned off, for example, Intel Optane. Also note that *some*
+desktop SSDs (for example, HP EX950) may ignore O_SYNC thus making
+disable_data_fsync unsafe even with "directsync".
 
-## cached_io_meta
+## meta_io
 
-- Type: boolean
-- Default: false
+- Type: string
+- Default: direct
 
-Read and write *metadata* through Linux page cache. May improve read
-performance only if your drives are relatively slow (HDD, SATA SSD), and
-only if checksums are enabled and [inmemory_metadata](#inmemory_metadata)
-is disabled, because in this case metadata blocks are read from disk
-on every read request to verify checksums and caching them may reduce this
-extra read load.
-
-Absolutely pointless to enable with enabled inmemory_metadata because all
-metadata is kept in memory anyway, and likely pointless without checksums,
-because in that case, metadata blocks are read from disk only during journal
+I/O mode for *metadata*. One of "direct", "cached" or "directsync".
+
+"cached" may improve read performance, but only under the following conditions:
+1. your drives are relatively slow (HDD, SATA SSD), and
+2. checksums are enabled, and
+3. [inmemory_metadata](#inmemory_metadata) is disabled.
+Under all these conditions, metadata blocks are read from disk on every
+read request to verify checksums and caching them may reduce this extra
+read load. Without (3) metadata is never read from the disk after starting,
+and without (2) metadata blocks are read from disk only during journal
 flushing.
 
-If the same device is used for data and metadata, enabling [cached_io_data](#cached_io_data)
-also enables this parameter, given that it isn't turned off explicitly.
+"directsync" is the same as above.
+
+If the same device is used for data and metadata, meta_io by default is set
+to the same value as [data_io](#data_io).
 
-## cached_io_journal
+## journal_io
 
-- Type: boolean
-- Default: false
+- Type: string
+- Default: direct
 
-Read and write *journal* through Linux page cache. May improve read
-performance if [inmemory_journal](#inmemory_journal) is turned off.
+I/O mode for *journal*. One of "direct", "cached" or "directsync".
+
+Here, "cached" may only improve read performance for recent writes and
+only if [inmemory_journal](#inmemory_journal) is turned off.
+
+If the same device is used for metadata and journal, journal_io by default
+is set to the same value as [meta_io](#meta_io).
 
-If the same device is used for metadata and journal, enabling [cached_io_meta](#cached_io_meta)
-also enables this parameter, given that it isn't turned off explicitly.
 
 ## journal_sector_buffer_count
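As a sketch, the hybrid HDD+SSD tuning referenced from the checksum recommendations maps onto the new parameters like this (data and journal left at their defaults for clarity):

```
{ "inmemory_metadata": false, "meta_io": "cached", "data_io": "direct", "journal_io": "direct" }
```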
@@ -12,6 +12,7 @@
 изменения конфигурации в etcd.
 
 - [etcd_report_interval](#etcd_report_interval)
+- [etcd_stats_interval](#etcd_stats_interval)
 - [run_primary](#run_primary)
 - [osd_network](#osd_network)
 - [bind_address](#bind_address)
@@ -32,9 +33,9 @@
 - [max_flusher_count](#max_flusher_count)
 - [inmemory_metadata](#inmemory_metadata)
 - [inmemory_journal](#inmemory_journal)
-- [cached_io_data](#cached_io_data)
-- [cached_io_meta](#cached_io_meta)
-- [cached_io_journal](#cached_io_journal)
+- [data_io](#data_io)
+- [meta_io](#meta_io)
+- [journal_io](#journal_io)
 - [journal_sector_buffer_count](#journal_sector_buffer_count)
 - [journal_no_same_sector_overwrites](#journal_no_same_sector_overwrites)
 - [throttle_small_writes](#throttle_small_writes)
@@ -57,11 +58,21 @@
 - Тип: секунды
 - Значение по умолчанию: 5
 
-Интервал, с которым OSD обновляет своё состояние в etcd. Значение параметра
-влияет на время резервации (lease) OSD и поэтому на скорость переключения
+Интервал, с которым OSD сообщает о том, что жив, в etcd. Значение параметра
+влияет на время резервации (lease) OSD и поэтому - на скорость переключения
 при падении OSD. Время lease равняется значению этого параметра плюс
 max_etcd_attempts * etcd_quick_timeout.
+
+## etcd_stats_interval
+
+- Тип: секунды
+- Значение по умолчанию: 30
+
+Интервал, с которым OSD обновляет свою статистику в etcd. Сильно влияет на
+создаваемую нагрузку на etcd, потому что статистика содержит по ключу на
+каждый OSD и на каждую PG. В то же время низкий интервал делает
+статистику, печатаемую `vitastor-cli`, отзывчивей.
 
 ## run_primary
 
 - Тип: булево (да/нет)
@@ -266,51 +277,62 @@ Flusher - это микро-поток (корутина), которая коп
 параметра может оказаться полезным для гибридных OSD (HDD+SSD) с большими
 журналами, расположенными на быстром по сравнению с HDD устройстве.
 
-## cached_io_data
+## data_io
 
-- Тип: булево (да/нет)
-- Значение по умолчанию: false
+- Тип: строка
+- Значение по умолчанию: direct
 
-Читать и записывать *данные* через системный кэш Linux (page cache), то есть,
-использовать для данных файловый дескриптор, открытый без флага O_DIRECT, но
-с флагом O_SYNC. Может улучшить скорость чтения для относительно медленных
-дисков - HDD и, возможно, SATA SSD. Не рекомендуется для потребительских
-SSD без конденсаторов, так как O_SYNC сбрасывает кэш диска при каждой записи.
+Режим ввода-вывода для *данных*. Одно из значений "direct", "cached" или
+"directsync", означающих O_DIRECT, O_SYNC и O_DIRECT|O_SYNC, соответственно.
+
+Выберите "cached", чтобы использовать системный кэш Linux (page cache) при
+чтении и записи. Это может улучшить скорость чтения горячих данных с
+относительно медленных дисков - HDD и, возможно, SATA SSD - но немного
+снижает производительность записи для быстрых дисков, так как кэш сам по
+себе тоже добавляет накладные расходы.
+
+Выберите "directsync", если хотите задействовать
+[immediate_commit](layout-cluster.ru.md#immediate_commit) (требующий
+включения disable_data_fsync) на дисках с неотключаемым кэшем. Пример таких
+дисков - Intel Optane. При этом также стоит иметь в виду, что *некоторые*
+настольные SSD (например, HP EX950) игнорируют флаг O_SYNC, делая отключение
+fsync небезопасным даже с режимом "directsync".
 
-## cached_io_meta
+## meta_io
 
-- Тип: булево (да/нет)
-- Значение по умолчанию: false
+- Тип: строка
+- Значение по умолчанию: direct
 
-Читать и записывать *метаданные* через системный кэш Linux. Может улучшить
-скорость чтения, если у вас медленные диски, и только если контрольные суммы
-включены, а параметр [inmemory_metadata](#inmemory_metadata) отключён, так
-как в этом случае блоки метаданных читаются с диска при каждом запросе чтения
+Режим ввода-вывода для *метаданных*. Одно из значений "direct", "cached" или
+"directsync".
+
+"cached" может улучшить скорость чтения, если:
+1. у вас медленные диски (HDD, SATA SSD)
+2. контрольные суммы включены
+3. параметр [inmemory_metadata](#inmemory_metadata) отключён.
+При этих условиях блоки метаданных читаются с диска при каждом запросе чтения
 для проверки контрольных сумм и их кэширование может снизить дополнительную
-нагрузку на диск.
+нагрузку на диск. Без (3) метаданные никогда не читаются с диска после
+запуска OSD, а без (2) блоки метаданных читаются только при сбросе журнала.
 
-Абсолютно бессмысленно включать данный параметр, если параметр
-inmemory_metadata включён (по умолчанию это так), и также вероятно
-бессмысленно включать его, если не включены контрольные суммы, так как в
-этом случае блоки метаданных читаются с диска только во время сброса
-журнала.
-
-Если одно и то же устройство используется для данных и метаданных, включение
-[cached_io_data](#cached_io_data) также включает данный параметр, при
-условии, что он не отключён явным образом.
+Если одно и то же устройство используется для данных и метаданных, режим
+ввода-вывода метаданных по умолчанию устанавливается равным [data_io](#data_io).
 
-## cached_io_journal
+## journal_io
 
-- Тип: булево (да/нет)
-- Значение по умолчанию: false
+- Тип: строка
+- Значение по умолчанию: direct
 
-Читать и записывать *журнал* через системный кэш Linux. Может улучшить
-скорость чтения, если параметр [inmemory_journal](#inmemory_journal)
+Режим ввода-вывода для *журнала*. Одно из значений "direct", "cached" или
+"directsync".
+
+Здесь "cached" может улучшить скорость чтения только недавно записанных
+данных и только если параметр [inmemory_journal](#inmemory_journal)
 отключён.
 
-Если одно и то же устройство используется для метаданных и журнала,
-включение [cached_io_meta](#cached_io_meta) также включает данный
-параметр, при условии, что он не отключён явным образом.
+Если одно и то же устройство используется для метаданных и журнала,
+режим ввода-вывода журнала по умолчанию устанавливается равным
+[meta_io](#meta_io).
 
 ## journal_sector_buffer_count
@@ -205,9 +205,8 @@ This parameter usually doesn't require to be changed.
|
|||||||
- Default: 131072
|
- Default: 131072
|
||||||
|
|
||||||
Block size for this pool. The value from /vitastor/config/global is used when
|
Block size for this pool. The value from /vitastor/config/global is used when
|
||||||
unspecified. If your cluster has OSDs with different block sizes then pool must
|
unspecified. Only OSDs with matching block_size are used for each pool. If you
|
||||||
be restricted by [osd_tags](#osd_tags) to only include OSDs with matching block
|
want to further restrict OSDs for the pool, use [osd_tags](#osd_tags).
|
||||||
size.
|
|
||||||
|
|
||||||
Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-cluster.en.md#block_size).
|
Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-cluster.en.md#block_size).
|
||||||
|
|
||||||
@@ -216,10 +215,9 @@ Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-c
 - Type: integer
 - Default: 4096

-"Sector" size of virtual disks in this pool. The value from
-/vitastor/config/global is used when unspecified. Similar to block_size, the
-pool must be restricted by [osd_tags](#osd_tags) to only include OSDs with
-matching bitmap_granularity.
+"Sector" size of virtual disks in this pool. The value from /vitastor/config/global
+is used when unspecified. Similarly to block_size, only OSDs with matching
+bitmap_granularity are used for each pool.

 Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-cluster.en.md#bitmap_granularity).
@@ -229,10 +227,11 @@ Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-c
 - Default: none

 Immediate commit setting for this pool. The value from /vitastor/config/global
-is used when unspecified. Similar to block_size, the pool must be restricted by
-[osd_tags](#osd_tags) to only include OSDs with compatible immediate_commit.
-Compatible means that a pool with non-immediate commit will work with OSDs with
-immediate commit enabled, but not vice versa.
+is used when unspecified. Similarly to block_size, only OSDs with compatible
+immediate_commit are used for each pool. "Compatible" means that a pool with
+non-immediate commit will use OSDs with immediate commit enabled, but not vice
+versa. I.e., pools with "none" use all OSDs, pools with "small" only use OSDs
+with "all" or "small", and pools with "all" only use OSDs with "all".

 Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-cluster.en.md#immediate_commit).
@@ -208,8 +208,9 @@ PG в Vitastor эфемерны, то есть вы можете менять

 Размер блока для данного пула. Если не задан, используется значение из
 /vitastor/config/global. Если в вашем кластере есть OSD с разными размерами
-блока, пул должен быть ограничен только OSD, блок которых равен блоку пула,
-с помощью [osd_tags](#osd_tags).
+блока, пул будет использовать только OSD с размером блока, равным размеру блока
+пула. Если вы хотите сильнее ограничить набор используемых для пула OSD -
+используйте [osd_tags](#osd_tags).

 О самом параметре читайте в разделе [Дисковые параметры уровня кластера](layout-cluster.ru.md#block_size).

@@ -219,9 +220,8 @@ PG в Vitastor эфемерны, то есть вы можете менять
 - По умолчанию: 4096

 Размер "сектора" виртуальных дисков в данном пуле. Если не задан, используется
-значение из /vitastor/config/global. Аналогично block_size, пул должен быть
-ограничен OSD со значением bitmap_granularity, равным значению пула, с помощью
-[osd_tags](#osd_tags).
+значение из /vitastor/config/global. Аналогично block_size, каждый пул будет
+использовать только OSD с совпадающей с пулом настройкой bitmap_granularity.

 О самом параметре читайте в разделе [Дисковые параметры уровня кластера](layout-cluster.ru.md#bitmap_granularity).

@@ -231,11 +231,13 @@ PG в Vitastor эфемерны, то есть вы можете менять
 - По умолчанию: none

 Настройка мгновенного коммита для данного пула. Если не задана, используется
-значение из /vitastor/config/global. Аналогично block_size, пул должен быть
-ограничен OSD со значением bitmap_granularity, совместимым со значением пула, с
-помощью [osd_tags](#osd_tags). Совместимость означает, что пул с отключенным
-мгновенным коммитом может работать на OSD с включённым мгновенным коммитом, но
-не наоборот.
+значение из /vitastor/config/global. Аналогично block_size, каждый пул будет
+использовать только OSD с *совместимыми* настройками immediate_commit.
+"Совместимыми" означает, что пул с отключенным мгновенным коммитом будет
+использовать OSD с включённым мгновенным коммитом, но не наоборот. То есть,
+пул со значением "none" будет использовать все OSD, пул со "small" будет
+использовать OSD с "all" или "small", а пул с "all" будет использовать только
+OSD с "all".

 О самом параметре читайте в разделе [Дисковые параметры уровня кластера](layout-cluster.ru.md#immediate_commit).
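For illustration, pool definitions like the one described above live in the `/vitastor/config/pools` etcd key. A minimal sketch follows; the pool id, name, etcd endpoint and tag are placeholders, not values from this changeset:

```
# Hypothetical pool with explicit layout parameters.
# block_size/bitmap_granularity/immediate_commit select matching OSDs
# as described above; osd_tags restricts the candidate set further.
etcdctl --endpoints=http://10.115.0.10:2379 put /vitastor/config/pools \
  '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":1,
    "pg_count":256,"block_size":131072,"bitmap_granularity":4096,
    "immediate_commit":"all","osd_tags":["hdd"]}}'
```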
docs/config/src/client.en.md (new file, +4)
@@ -0,0 +1,4 @@
+# Client Parameters
+
+These parameters apply only to Vitastor clients (QEMU, fio, NBD and so on) and
+affect their interaction with the cluster.

docs/config/src/client.ru.md (new file, +4)
@@ -0,0 +1,4 @@
+# Параметры клиентского кода
+
+Данные параметры применяются только к клиентам Vitastor (QEMU, fio, NBD и т.п.) и
+затрагивают логику их работы с кластером.

docs/config/src/client.yml (new file, +168)
@@ -0,0 +1,168 @@
+- name: client_max_dirty_bytes
+  type: int
+  default: 33554432
+  online: true
+  info: |
+    Without [immediate_commit](layout-cluster.en.md#immediate_commit)=all this parameter sets the limit of "dirty"
+    (not committed by fsync) data allowed by the client before forcing an
+    additional fsync and committing the data. Also note that the client always
+    holds a copy of uncommitted data in memory so this setting also affects
+    RAM usage of clients.
+  info_ru: |
+    При работе без [immediate_commit](layout-cluster.ru.md#immediate_commit)=all - это лимит объёма "грязных" (не
+    зафиксированных fsync-ом) данных, при достижении которого клиент будет
+    принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
+    что в этом случае до момента fsync клиент хранит копию незафиксированных
+    данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
+- name: client_max_dirty_ops
+  type: int
+  default: 1024
+  online: true
+  info: |
+    Same as client_max_dirty_bytes, but instead of total size, limits the number
+    of uncommitted write operations.
+  info_ru: |
+    Аналогично client_max_dirty_bytes, но ограничивает количество
+    незафиксированных операций записи вместо их общего объёма.
+- name: client_enable_writeback
+  type: bool
+  default: false
+  online: true
+  info: |
+    This parameter enables client-side write buffering. This means that write
+    requests are accumulated in memory for a short time before being sent to
+    a Vitastor cluster which allows to send them in parallel and increase
+    performance of some applications. Writes are buffered until the client forces
+    a flush with fsync() or until the amount of buffered writes exceeds the
+    limit.
+
+    Write buffering significantly increases performance of some applications,
+    for example, CrystalDiskMark under Windows (LOL :-D), but also any other
+    applications if they do writes in one of two non-optimal ways: either if
+    they do a lot of small (4 kb or so) sequential writes, or if they do a lot
+    of small random writes, but without any parallelism or asynchrony, and also
+    without calling fsync().
+
+    With write buffering enabled, you can expect around 22000 T1Q1 random write
+    iops in QEMU more or less regardless of the quality of your SSDs, and this
+    number is in fact bound by QEMU itself rather than Vitastor (check it
+    yourself by adding a "driver=null-co" disk in QEMU). Without write
+    buffering, the current record is 9900 iops, but the number is usually
+    even lower with non-ideal hardware, for example, it may be 5000 iops.
+
+    Even when this parameter is enabled, write buffering isn't enabled until
+    the client explicitly allows it, because enabling it without the client
+    being aware of the fact that its writes may be buffered may lead to data
+    loss. Because of this, older versions of clients don't support write
+    buffering at all, newer versions of the QEMU driver allow write buffering
+    only if it's enabled in disk settings with `-blockdev cache.direct=false`,
+    and newer versions of FIO only allow write buffering if you don't specify
+    `-direct=1`. NBD and NFS drivers allow write buffering by default.
+
+    You can overcome this restriction too with the `client_writeback_allowed`
+    parameter, but you shouldn't do that unless you **really** know what you
+    are doing.
+  info_ru: |
+    Данный параметр разрешает включать буферизацию записи в памяти. Буферизация
+    означает, что операции записи отправляются на кластер Vitastor не сразу, а
+    могут небольшое время накапливаться в памяти и сбрасываться сразу пакетами,
+    до тех пор, пока либо не будет превышен лимит неотправленных записей, либо
+    пока клиент не вызовет fsync.
+
+    Буферизация значительно повышает производительность некоторых приложений,
+    например, CrystalDiskMark в Windows (ха-ха :-D), но также и любых других,
+    которые пишут на диск неоптимально: либо последовательно, но мелкими блоками
+    (например, по 4 кб), либо случайно, но без параллелизма и без fsync - то
+    есть, например, отправляя 128 операций записи в разные места диска, но не
+    все сразу с помощью асинхронного I/O, а по одной.
+
+    В QEMU с буферизацией записи можно ожидать показателя примерно 22000
+    операций случайной записи в секунду в 1 поток и с глубиной очереди 1 (T1Q1)
+    без fsync, почти вне зависимости от того, насколько хороши ваши диски - эта
+    цифра упирается в сам QEMU. Без буферизации рекорд пока что - 9900 операций
+    в секунду, но на железе похуже может быть и поменьше, например, 5000 операций
+    в секунду.
+
+    При этом, даже если данный параметр включён, буферизация не включается, если
+    явно не разрешена клиентом, т.к. если клиент не знает, что запросы записи
+    буферизуются, это может приводить к потере данных. Поэтому в старых версиях
+    клиентских драйверов буферизация записи не включается вообще, в новых
+    версиях QEMU-драйвера включается, только если разрешена опцией диска
+    `-blockdev cache.direct=false`, а в fio - только если нет опции `-direct=1`.
+    В NBD и NFS драйверах буферизация записи разрешена по умолчанию.
+
+    Можно обойти и это ограничение с помощью параметра `client_writeback_allowed`,
+    но делать так не надо, если только вы не уверены в том, что делаете, на все
+    100%. :-)
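To make the two-sided opt-in above concrete, here is a hedged end-to-end sketch. The etcd endpoint, image name and the exact QEMU/fio invocations are illustrative assumptions, not part of this changeset; verify the disk syntax against your QEMU version:

```
# 1. Allow client-side buffering (could also go into /etc/vitastor/vitastor.conf):
etcdctl --endpoints=http://10.115.0.10:2379 put /vitastor/config/global \
  '{"client_enable_writeback":true}'

# 2. QEMU only buffers when the disk itself permits it (cache.direct=false):
qemu-system-x86_64 -m 2G \
  -blockdev driver=vitastor,node-name=disk0,image=testimg,cache.direct=false \
  -device virtio-blk-pci,drive=disk0

# 3. fio only buffers when -direct=1 is NOT given:
fio -thread -ioengine=libfio_vitastor.so -name=test -bs=4k -rw=randwrite \
  -etcd=10.115.0.10:2379/v3 -image=testimg -size=1G
```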
+- name: client_max_buffered_bytes
+  type: int
+  default: 33554432
+  online: true
+  info: |
+    Maximum total size of buffered writes which triggers write-back when reached.
+  info_ru: |
+    Максимальный общий размер буферизованных записей, при достижении которого
+    начинается процесс сброса данных на сервер.
+- name: client_max_buffered_ops
+  type: int
+  default: 1024
+  online: true
+  info: |
+    Maximum number of buffered writes which triggers write-back when reached.
+    Multiple consecutive modified data regions are counted as 1 write here.
+  info_ru: |
+    Максимальное количество буферизованных записей, при достижении которого
+    начинается процесс сброса данных на сервер. При этом несколько
+    последовательных изменённых областей здесь считаются 1 записью.
+- name: client_max_writeback_iodepth
+  type: int
+  default: 256
+  online: true
+  info: |
+    Maximum number of parallel writes when flushing buffered data to the server.
+  info_ru: |
+    Максимальное число параллельных операций записи при сбросе буферов на сервер.
+- name: nbd_timeout
+  type: sec
+  default: 300
+  online: false
+  info: |
+    Timeout for I/O operations for [NBD](../usage/nbd.en.md). If an operation
+    executes for longer than this timeout, including when your cluster is just
+    temporarily down for more than timeout, the NBD device will detach by itself
+    (and possibly break the mounted file system).
+
+    You can set timeout to 0 to never detach, but in that case you won't be
+    able to remove the kernel device at all if the NBD process dies - you'll have
+    to reboot the host.
+  info_ru: |
+    Таймаут для операций чтения/записи через [NBD](../usage/nbd.ru.md). Если
+    операция выполняется дольше таймаута, включая временную недоступность
+    кластера на время, большее таймаута, NBD-устройство отключится само собой
+    (и, возможно, сломает примонтированную ФС).
+
+    Вы можете установить таймаут в 0, чтобы никогда не отключать устройство по
+    таймауту, но в этом случае вы вообще не сможете удалить устройство, если
+    процесс NBD умрёт - вам придётся перезагружать сервер.
+- name: nbd_max_devices
+  type: int
+  default: 64
+  online: false
+  info: |
+    Maximum number of NBD devices in the system. This value is passed as
+    `nbds_max` parameter for the nbd kernel module when vitastor-nbd autoloads it.
+  info_ru: |
+    Максимальное число NBD-устройств в системе. Данное значение передаётся
+    модулю ядра nbd как параметр `nbds_max`, когда его загружает vitastor-nbd.
+- name: nbd_max_part
+  type: int
+  default: 3
+  online: false
+  info: |
+    Maximum number of partitions per NBD device. This value is passed as
+    `max_part` parameter for the nbd kernel module when vitastor-nbd autoloads it.
+    Note that (nbds_max)*(1+max_part) usually can't exceed 256.
+  info_ru: |
+    Максимальное число разделов на одном NBD-устройстве. Данное значение передаётся
+    модулю ядра nbd как параметр `max_part`, когда его загружает vitastor-nbd.
+    Имейте в виду, что (nbds_max)*(1+max_part) обычно не может превышать 256.
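Putting the new client parameters together, a client host's configuration could look like the sketch below. The keys come straight from the client.yml above; the values merely repeat the defaults and are illustrative only (vitastor.conf is JSON):

```
# Hypothetical /etc/vitastor/vitastor.conf for a client/NBD host
cat > /etc/vitastor/vitastor.conf <<'EOF'
{
  "etcd_address": "10.115.0.10:2379/v3",
  "client_max_dirty_bytes": 33554432,
  "client_max_dirty_ops": 1024,
  "client_enable_writeback": true,
  "client_max_buffered_bytes": 33554432,
  "client_max_buffered_ops": 1024,
  "client_max_writeback_iodepth": 256,
  "nbd_timeout": 300,
  "nbd_max_devices": 64,
  "nbd_max_part": 3
}
EOF
```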
@@ -28,6 +28,8 @@
 {{../../config/network.en.md|indent=2}}

+{{../../config/client.en.md|indent=2}}
+
 {{../../config/layout-cluster.en.md|indent=2}}

 {{../../config/layout-osd.en.md|indent=2}}

@@ -28,6 +28,8 @@
 {{../../config/network.ru.md|indent=2}}

+{{../../config/client.ru.md|indent=2}}
+
 {{../../config/layout-cluster.ru.md|indent=2}}

 {{../../config/layout-osd.ru.md|indent=2}}
@@ -87,8 +87,9 @@
 it (they have internal SSD cache even though it's not stated in datasheets).

 Setting this parameter to "all" or "small" in OSD parameters requires enabling
-disable_journal_fsync and disable_meta_fsync, setting it to "all" also requires
-enabling disable_data_fsync.
+[disable_journal_fsync](layout-osd.en.yml#disable_journal_fsync) and
+[disable_meta_fsync](layout-osd.en.yml#disable_meta_fsync), setting it to
+"all" also requires enabling [disable_data_fsync](layout-osd.en.yml#disable_data_fsync).

 TLDR: For optimal performance, set immediate_commit to "all" if you only use
 SSDs with supercapacitor-based power loss protection (nonvolatile
@@ -140,8 +141,9 @@
 указано в спецификациях).

 Указание "all" или "small" в настройках / командной строке OSD требует
-включения disable_journal_fsync и disable_meta_fsync, значение "all" также
-требует включения disable_data_fsync.
+включения [disable_journal_fsync](layout-osd.ru.yml#disable_journal_fsync) и
+[disable_meta_fsync](layout-osd.ru.yml#disable_meta_fsync), значение "all"
+также требует включения [disable_data_fsync](layout-osd.ru.yml#disable_data_fsync).

 Итого, вкратце: для оптимальной производительности установите
 immediate_commit в значение "all", если вы используете в кластере только SSD
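For illustration, a hedged sketch of applying the rule above cluster-wide; the endpoint is a placeholder, and disabling fsync is only safe with power-loss-protected drives:

```
# Declare that all OSDs commit immediately...
etcdctl --endpoints=http://10.115.0.10:2379 put /vitastor/config/global \
  '{"immediate_commit":"all"}'
# ...which, per the text above, requires the fsync switches on every OSD,
# e.g. in /etc/vitastor/vitastor.conf on OSD hosts:
#   {"disable_data_fsync":true,"disable_meta_fsync":true,"disable_journal_fsync":true}
```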
@@ -244,9 +244,9 @@
   3. Hybrid HDD+SSD: csum_block_size=4k + inmemory_metadata=false
   4. HDD-only, faster random read: csum_block_size=32k
   5. HDD-only, faster random write: csum_block_size=4k +
-     inmemory_metadata=false + cached_io_meta=true
+     inmemory_metadata=false + meta_io=cached

-  See also [cached_io_meta](osd.en.md#cached_io_meta).
+  See also [meta_io](osd.en.md#meta_io).
 info_ru: |
   Размер блока расчёта контрольных сумм.
@@ -271,6 +271,6 @@
   3. Гибридные HDD+SSD: csum_block_size=4k + inmemory_metadata=false
   4. Только HDD, быстрее случайное чтение: csum_block_size=32k
   5. Только HDD, быстрее случайная запись: csum_block_size=4k +
-     inmemory_metadata=false + cached_io_meta=true
+     inmemory_metadata=false + meta_io=cached

-  Смотрите также [cached_io_meta](osd.ru.md#cached_io_meta).
+  Смотрите также [meta_io](osd.ru.md#meta_io).
@@ -48,11 +48,14 @@
   type: string
   info: |
     RDMA device name to use for Vitastor OSD communications (for example,
-    "rocep5s0f0"). Please note that Vitastor RDMA requires Implicit On-Demand
-    Paging (Implicit ODP) and Scatter/Gather (SG) support from the RDMA device
-    to work. For example, Mellanox ConnectX-3 and older adapters don't have
-    Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
-    root to list available RDMA devices and their features.
+    "rocep5s0f0"). Now Vitastor supports all adapters, even ones without
+    ODP support, like Mellanox ConnectX-3 and non-Mellanox cards.
+
+    Versions up to Vitastor 1.2.0 required ODP which is only present in
+    Mellanox ConnectX >= 4. See also [rdma_odp](#rdma_odp).
+
+    Run `ibv_devinfo -v` as root to list available RDMA devices and their
+    features.

     Remember that you also have to configure your network switches if you use
     RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
@@ -61,12 +64,15 @@
     PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
   info_ru: |
     Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
-    Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
-    Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Например,
-    адаптеры Mellanox ConnectX-3 и более старые не поддерживают Implicit ODP и
-    потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
-    суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
-    параметры и возможности.
+    Сейчас Vitastor поддерживает все модели адаптеров, включая те, у которых
+    нет поддержки ODP, то есть вы можете использовать RDMA с ConnectX-3 и
+    картами производства не Mellanox.
+
+    Версии Vitastor до 1.2.0 включительно требовали ODP, который есть только
+    на Mellanox ConnectX 4 и более новых. См. также [rdma_odp](#rdma_odp).
+
+    Запустите `ibv_devinfo -v` от имени суперпользователя, чтобы посмотреть
+    список доступных RDMA-устройств, их параметры и возможности.

     Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
     правильно настроить для него коммутаторы, иначе вы можете столкнуться с
@@ -160,6 +166,45 @@
     у принимающей стороны в процессе работы не заканчивались буферы на приём.
     Не влияет на потребление памяти - дополнительная память на операции отправки
     не выделяется.
+- name: rdma_odp
+  type: bool
+  default: false
+  online: false
+  info: |
+    Use RDMA with On-Demand Paging. ODP is currently only available on Mellanox
+    ConnectX-4 and newer adapters. ODP allows to not register memory explicitly
+    for RDMA adapter to be able to use it. This, in turn, allows to skip memory
+    copying during sending. One would think this should improve performance, but
+    **in reality** RDMA performance with ODP is **drastically** worse. Example
+    3-node cluster with 8 NVMe in each node and 2*25 GBit/s ConnectX-6 RDMA network
+    without ODP pushes 3950000 read iops, but only 239000 iops with ODP...
+
+    This happens because Mellanox ODP implementation seems to be based on
+    message retransmissions when the adapter doesn't know about the buffer yet -
+    it likely uses standard "RNR retransmissions" (RNR = receiver not ready)
+    which is generally slow in RDMA/RoCE networks. Here's a presentation about
+    it from the ISPASS-2021 conference: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
+
+    ODP support is retained in the code just in case a good ODP implementation
+    appears one day.
+  info_ru: |
+    Использовать RDMA с On-Demand Paging. ODP - функция, доступная пока что
+    исключительно на адаптерах Mellanox ConnectX-4 и более новых. ODP позволяет
+    не регистрировать память для её использования RDMA-картой. Благодаря этому
+    можно не копировать данные при отправке их в сеть и, казалось бы, это должно
+    улучшать производительность - но **по факту** получается так, что
+    производительность только ухудшается, причём сильно. Пример - на 3-узловом
+    кластере с 8 NVMe в каждом узле и сетью 2*25 Гбит/с на чтение с RDMA без ODP
+    удаётся снять 3950000 iops, а с ODP - всего 239000 iops...
+
+    Это происходит из-за того, что реализация ODP у Mellanox неоптимальная и
+    основана на повторной передаче сообщений, когда карте не известен буфер -
+    вероятно, на стандартных "RNR retransmission" (RNR = receiver not ready).
+    А данные повторные передачи в RDMA/RoCE - всегда очень медленная штука.
+    Презентация на эту тему с конференции ISPASS-2021: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
+
+    Возможность использования ODP сохранена в коде на случай, если вдруг в один
+    прекрасный день появится хорошая реализация ODP.
 - name: peer_connect_interval
   type: sec
   min: 1
@@ -259,23 +304,3 @@
     detect disconnections quickly.
   info_ru: |
     Интервал проверки живости вебсокет-подключений к etcd.
-- name: client_dirty_limit
-  type: int
-  default: 33554432
-  online: true
-  info: |
-    Without immediate_commit=all this parameter sets the limit of "dirty"
-    (not committed by fsync) data allowed by the client before forcing an
-    additional fsync and committing the data. Also note that the client always
-    holds a copy of uncommitted data in memory so this setting also affects
-    RAM usage of clients.
-
-    This parameter doesn't affect OSDs themselves.
-  info_ru: |
-    При работе без immediate_commit=all - это лимит объёма "грязных" (не
-    зафиксированных fsync-ом) данных, при достижении которого клиент будет
-    принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
-    что в этом случае до момента fsync клиент хранит копию незафиксированных
-    данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
-
-    Параметр не влияет на сами OSD.
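A hedged configuration sketch tying the RDMA parameters together; the device name repeats the example from the text above, everything else is illustrative:

```
cat > /etc/vitastor/vitastor.conf <<'EOF'
{
  "etcd_address": "10.115.0.10:2379/v3",
  "rdma_device": "rocep5s0f0",
  "rdma_odp": false
}
EOF
# Check what your adapters actually support before picking a device:
ibv_devinfo -v | less
```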
@@ -2,15 +2,28 @@
   type: sec
   default: 5
   info: |
-    Interval at which OSDs report their state to etcd. Affects OSD lease time
+    Interval at which OSDs report their liveness to etcd. Affects OSD lease time
     and thus the failover speed. Lease time is equal to this parameter value
     plus max_etcd_attempts * etcd_quick_timeout because it should be guaranteed
     that every OSD always refreshes its lease in time.
   info_ru: |
-    Интервал, с которым OSD обновляет своё состояние в etcd. Значение параметра
-    влияет на время резервации (lease) OSD и поэтому на скорость переключения
+    Интервал, с которым OSD сообщает о том, что жив, в etcd. Значение параметра
+    влияет на время резервации (lease) OSD и поэтому - на скорость переключения
     при падении OSD. Время lease равняется значению этого параметра плюс
     max_etcd_attempts * etcd_quick_timeout.
+- name: etcd_stats_interval
+  type: sec
+  default: 30
+  info: |
+    Interval at which OSDs report their statistics to etcd. Highly affects the
+    imposed load on etcd, because statistics include a key for every OSD and
+    for every PG. At the same time, low statistic intervals make `vitastor-cli`
+    statistics more responsive.
+  info_ru: |
+    Интервал, с которым OSD обновляет свою статистику в etcd. Сильно влияет на
+    создаваемую нагрузку на etcd, потому что статистика содержит по ключу на
+    каждый OSD и на каждую PG. В то же время низкий интервал делает
+    статистику, печатаемую `vitastor-cli`, отзывчивей.
 - name: run_primary
   type: bool
   default: true
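As a sketch of the trade-off just described (endpoint and values illustrative):

```
# Keep the default liveness interval, but report statistics less often
# to reduce the load on etcd at the cost of less fresh vitastor-cli output
etcdctl --endpoints=http://10.115.0.10:2379 put /vitastor/config/global \
  '{"etcd_report_interval":5,"etcd_stats_interval":60}'
```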
@@ -260,73 +273,96 @@
     достаточно 16- или 32-мегабайтного журнала. Однако в теории отключение
     параметра может оказаться полезным для гибридных OSD (HDD+SSD) с большими
     журналами, расположенными на быстром по сравнению с HDD устройстве.
-- name: cached_io_data
-  type: bool
-  default: false
+- name: data_io
+  type: string
+  default: direct
   info: |
-    Read and write *data* through Linux page cache, i.e. use a file descriptor
-    opened with O_SYNC, but without O_DIRECT for I/O. May improve read
-    performance for hot data and slower disks - HDDs and maybe SATA SSDs.
-    Not recommended for desktop SSDs without capacitors because O_SYNC flushes
-    disk cache on every write.
-  info_ru: |
-    Читать и записывать *данные* через системный кэш Linux (page cache), то есть,
-    использовать для данных файловый дескриптор, открытый без флага O_DIRECT, но
-    с флагом O_SYNC. Может улучшить скорость чтения для относительно медленных
-    дисков - HDD и, возможно, SATA SSD. Не рекомендуется для потребительских
-    SSD без конденсаторов, так как O_SYNC сбрасывает кэш диска при каждой записи.
-- name: cached_io_meta
-  type: bool
-  default: false
-  info: |
-    Read and write *metadata* through Linux page cache. May improve read
-    performance only if your drives are relatively slow (HDD, SATA SSD), and
-    only if checksums are enabled and [inmemory_metadata](#inmemory_metadata)
-    is disabled, because in this case metadata blocks are read from disk
-    on every read request to verify checksums and caching them may reduce this
-    extra read load.
+    I/O mode for *data*. One of "direct", "cached" or "directsync". Corresponds
+    to O_DIRECT, O_SYNC and O_DIRECT|O_SYNC, respectively.

-    Absolutely pointless to enable with enabled inmemory_metadata because all
-    metadata is kept in memory anyway, and likely pointless without checksums,
-    because in that case, metadata blocks are read from disk only during journal
+    Choose "cached" to use Linux page cache. This may improve read performance
+    for hot data and slower disks - HDDs and maybe SATA SSDs - but will slightly
+    decrease write performance for fast disks because page cache is an overhead
+    itself.
+
+    Choose "directsync" to use [immediate_commit](layout-cluster.en.md#immediate_commit)
+    (which requires disable_data_fsync) with drives having write-back cache
+    which can't be turned off, for example, Intel Optane. Also note that *some*
+    desktop SSDs (for example, HP EX950) may ignore O_SYNC thus making
+    disable_data_fsync unsafe even with "directsync".
+  info_ru: |
+    Режим ввода-вывода для *данных*. Одно из значений "direct", "cached" или
+    "directsync", означающих O_DIRECT, O_SYNC и O_DIRECT|O_SYNC, соответственно.
+
+    Выберите "cached", чтобы использовать системный кэш Linux (page cache) при
+    чтении и записи. Это может улучшить скорость чтения горячих данных с
+    относительно медленных дисков - HDD и, возможно, SATA SSD - но немного
+    снижает производительность записи для быстрых дисков, так как кэш сам по
+    себе тоже добавляет накладные расходы.
+
+    Выберите "directsync", если хотите задействовать
+    [immediate_commit](layout-cluster.ru.md#immediate_commit) (требующий
+    включения disable_data_fsync) на дисках с неотключаемым кэшем. Пример таких
+    дисков - Intel Optane. При этом также стоит иметь в виду, что *некоторые*
+    настольные SSD (например, HP EX950) игнорируют флаг O_SYNC, делая отключение
+    fsync небезопасным даже с режимом "directsync".
+- name: meta_io
+  type: string
+  default: direct
+  info: |
+    I/O mode for *metadata*. One of "direct", "cached" or "directsync".
+
+    "cached" may improve read performance, but only under the following conditions:
+    1. your drives are relatively slow (HDD, SATA SSD), and
+    2. checksums are enabled, and
+    3. [inmemory_metadata](#inmemory_metadata) is disabled.
+    Under all these conditions, metadata blocks are read from disk on every
+    read request to verify checksums and caching them may reduce this extra
+    read load. Without (3) metadata is never read from the disk after starting,
+    and without (2) metadata blocks are read from disk only during journal
     flushing.

-    If the same device is used for data and metadata, enabling [cached_io_data](#cached_io_data)
-    also enables this parameter, given that it isn't turned off explicitly.
+    "directsync" is the same as above.
+
+    If the same device is used for data and metadata, meta_io by default is set
+    to the same value as [data_io](#data_io).
   info_ru: |
-    Читать и записывать *метаданные* через системный кэш Linux. Может улучшить
-    скорость чтения, если у вас медленные диски, и только если контрольные суммы
-    включены, а параметр [inmemory_metadata](#inmemory_metadata) отключён, так
-    как в этом случае блоки метаданных читаются с диска при каждом запросе чтения
+    Режим ввода-вывода для *метаданных*. Одно из значений "direct", "cached" или
+    "directsync".
+
+    "cached" может улучшить скорость чтения, если:
+    1. у вас медленные диски (HDD, SATA SSD)
+    2. контрольные суммы включены
+    3. параметр [inmemory_metadata](#inmemory_metadata) отключён.
+    При этих условиях блоки метаданных читаются с диска при каждом запросе чтения
     для проверки контрольных сумм и их кэширование может снизить дополнительную
-    нагрузку на диск.
+    нагрузку на диск. Без (3) метаданные никогда не читаются с диска после
+    запуска OSD, а без (2) блоки метаданных читаются только при сбросе журнала.

-    Абсолютно бессмысленно включать данный параметр, если параметр
-    inmemory_metadata включён (по умолчанию это так), и также вероятно
-    бессмысленно включать его, если не включены контрольные суммы, так как в
-    этом случае блоки метаданных читаются с диска только во время сброса
-    журнала.
-
-    Если одно и то же устройство используется для данных и метаданных, включение
-    [cached_io_data](#cached_io_data) также включает данный параметр, при
-    условии, что он не отключён явным образом.
-- name: cached_io_journal
-  type: bool
-  default: false
+    Если одно и то же устройство используется для данных и метаданных, режим
+    ввода-вывода метаданных по умолчанию устанавливается равным [data_io](#data_io).
+- name: journal_io
+  type: string
+  default: direct
   info: |
-    Read and write *journal* through Linux page cache. May improve read
-    performance if [inmemory_journal](#inmemory_journal) is turned off.
+    I/O mode for *journal*. One of "direct", "cached" or "directsync".

-    If the same device is used for metadata and journal, enabling [cached_io_meta](#cached_io_meta)
-    also enables this parameter, given that it isn't turned off explicitly.
+    Here, "cached" may only improve read performance for recent writes and
+    only if [inmemory_journal](#inmemory_journal) is turned off.
+
+    If the same device is used for metadata and journal, journal_io by default
+    is set to the same value as [meta_io](#meta_io).
   info_ru: |
-    Читать и записывать *журнал* через системный кэш Linux. Может улучшить
-    скорость чтения, если параметр [inmemory_journal](#inmemory_journal)
+    Режим ввода-вывода для *журнала*. Одно из значений "direct", "cached" или
+    "directsync".
+
+    Здесь "cached" может улучшить скорость чтения только недавно записанных
+    данных и только если параметр [inmemory_journal](#inmemory_journal)
     отключён.

     Если одно и то же устройство используется для метаданных и журнала,
-    включение [cached_io_meta](#cached_io_meta) также включает данный
-    параметр, при условии, что он не отключён явным образом.
+    режим ввода-вывода журнала по умолчанию устанавливается равным
+    [meta_io](#meta_io).
 - name: journal_sector_buffer_count
   type: int
   default: 32
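As an illustration, recommendation 5 from the checksum section above (HDD-only, faster random write) could translate into an OSD configuration like this sketch; all keys appear in this changeset, the values are illustrative:

```
# Hypothetical /etc/vitastor/vitastor.conf for HDD-only OSDs
cat > /etc/vitastor/vitastor.conf <<'EOF'
{
  "etcd_address": "10.115.0.10:2379/v3",
  "data_csum_type": "crc32c",
  "csum_block_size": 4096,
  "inmemory_metadata": false,
  "data_io": "direct",
  "meta_io": "cached",
  "journal_io": "direct"
}
EOF
```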
@@ -17,4 +17,26 @@ and apply all `NNN-*.yaml` manifests to your Kubernetes installation:
 for i in ./???-*.yaml; do kubectl apply -f $i; done
 ```

-After that you'll be able to create PersistentVolumes. See example in [csi/deploy/example-pvc.yaml](../../csi/deploy/example-pvc.yaml).
+After that you'll be able to create PersistentVolumes.
+
+**Important:** For best experience, use Linux kernel at least 5.15 with [VDUSE](../usage/qemu.en.md#vduse)
+kernel modules enabled (vdpa, vduse, virtio-vdpa). If your distribution doesn't
+have them pre-built - build them yourself ([instructions](../usage/qemu.en.md#vduse)),
+I promise it's worth it :-). When VDUSE is unavailable, CSI driver uses [NBD](../usage/nbd.en.md)
+to map Vitastor devices. NBD is slower and prone to timeout issues: if Vitastor
+cluster becomes unresponsive for more than [nbd_timeout](../config/client.en.md#nbd_timeout),
+the NBD device detaches and breaks pods using it.
+
+## Features
+
+Vitastor CSI supports:
+- Kubernetes starting with 1.20 (or 1.17 for older vitastor-csi <= 1.1.0)
+- Filesystem RWO (ReadWriteOnce) volumes. Example: [PVC](../../csi/deploy/example-pvc.yaml), [pod](../../csi/deploy/example-test-pod.yaml)
+- Raw block RWX (ReadWriteMany) volumes. Example: [PVC](../../csi/deploy/example-pvc-block.yaml), [pod](../../csi/deploy/example-test-pod-block.yaml)
+- Volume expansion
+- Volume snapshots. Example: [snapshot class](../../csi/deploy/example-snapshot-class.yaml), [snapshot](../../csi/deploy/example-snapshot.yaml), [clone](../../csi/deploy/example-snapshot-clone.yaml)
+- [VDUSE](../usage/qemu.en.md#vduse) (preferred) and [NBD](../usage/nbd.en.md) device mapping methods
+- Upgrades with VDUSE - new handler processes are restarted when CSI pods are restarted themselves
+- Multiple clusters by using multiple configuration files in ConfigMap.
+
+Remember that to use snapshots with CSI you also have to install [Snapshot Controller and CRDs](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).
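For orientation, a PVC against the deployed manifests might look like the sketch below; the storage class name is an assumption (use whatever your deployed StorageClass is called), and the authoritative sample is the csi/deploy/example-pvc.yaml referenced above:

```
kubectl apply -f - <<'EOF'
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: test-vitastor-pvc
spec:
  storageClassName: vitastor
  accessModes: [ "ReadWriteOnce" ]
  resources:
    requests:
      storage: 10Gi
EOF
```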
@@ -17,4 +17,26 @@
 for i in ./???-*.yaml; do kubectl apply -f $i; done
 ```

-После этого вы сможете создавать PersistentVolume. Пример смотрите в файле [csi/deploy/example-pvc.yaml](../../csi/deploy/example-pvc.yaml).
+После этого вы сможете создавать PersistentVolume.
+
+**Важно:** Лучше всего использовать ядро Linux версии не менее 5.15 с включёнными модулями
+[VDUSE](../usage/qemu.ru.md#vduse) (vdpa, vduse, virtio-vdpa). Если в вашем дистрибутиве
+они не собраны из коробки - соберите их сами, обещаю, что это стоит того ([инструкция](../usage/qemu.ru.md#vduse)) :-).
+Когда VDUSE недоступно, CSI-плагин использует [NBD](../usage/nbd.ru.md) для подключения
+дисков, а NBD медленнее и имеет проблему таймаута - если кластер остаётся недоступным
+дольше, чем [nbd_timeout](../config/client.ru.md#nbd_timeout), NBD-устройство отключается
+и ломает поды, использующие его.
+
+## Возможности
+
+CSI-плагин Vitastor поддерживает:
+- Версии Kubernetes, начиная с 1.20 (или с 1.17 для более старых vitastor-csi <= 1.1.0)
+- Файловые RWO (ReadWriteOnce) тома. Пример: [PVC](../../csi/deploy/example-pvc.yaml), [под](../../csi/deploy/example-test-pod.yaml)
+- Сырые блочные RWX (ReadWriteMany) тома. Пример: [PVC](../../csi/deploy/example-pvc-block.yaml), [под](../../csi/deploy/example-test-pod-block.yaml)
+- Расширение размера томов
+- Снимки томов. Пример: [класс снимков](../../csi/deploy/example-snapshot-class.yaml), [снимок](../../csi/deploy/example-snapshot.yaml), [клон снимка](../../csi/deploy/example-snapshot-clone.yaml)
+- Способы подключения устройств [VDUSE](../usage/qemu.ru.md#vduse) (предпочитаемый) и [NBD](../usage/nbd.ru.md)
+- Обновление при использовании VDUSE - новые процессы-обработчики устройств успешно перезапускаются вместе с самими подами CSI
+- Несколько кластеров через задание нескольких файлов конфигурации в ConfigMap.
+
+Не забывайте, что для использования снимков нужно сначала установить [контроллер снимков и CRD](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).
@@ -18,7 +18,7 @@
   stable version from 0.9.x branch instead of 1.x
 - For Debian 10 (Buster) also enable backports repository:
   `deb http://deb.debian.org/debian buster-backports main`
-- Install packages: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu`
+- Install packages: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu-system-x86`

 ## CentOS

@@ -18,7 +18,7 @@
   установить последнюю стабильную версию из ветки 0.9.x вместо 1.x
 - Для Debian 10 (Buster) также включите репозиторий backports:
   `deb http://deb.debian.org/debian buster-backports main`
-- Установите пакеты: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu`
+- Установите пакеты: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu-system-x86`

 ## CentOS
@@ -6,10 +6,10 @@
 # Proxmox VE

-To enable Vitastor support in Proxmox Virtual Environment (6.4-8.0 are supported):
+To enable Vitastor support in Proxmox Virtual Environment (6.4-8.1 are supported):

 - Add the corresponding Vitastor Debian repository into sources.list on Proxmox hosts:
-  bookworm for 8.0, bullseye for 7.4, pve7.3 for 7.3, pve7.2 for 7.2, pve7.1 for 7.1, buster for 6.4
+  bookworm for 8.1, pve8.0 for 8.0, bullseye for 7.4, pve7.3 for 7.3, pve7.2 for 7.2, pve7.1 for 7.1, buster for 6.4
 - Install vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* or see note) packages from Vitastor repository
 - Define storage in `/etc/pve/storage.cfg` (see below)
 - Block network access from VMs to Vitastor network (to OSDs and etcd),
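The storage definition itself appears later in the document ("see below"); as a rough sketch of its shape only - the option names here are assumptions, verify them against the real sample:

```
# Hypothetical /etc/pve/storage.cfg fragment
vitastor: vitastor
        vitastor_pool testpool
        vitastor_config_path /etc/pve/vitastor.conf
```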
@@ -6,10 +6,10 @@
 # Proxmox VE

-Чтобы подключить Vitastor к Proxmox Virtual Environment (поддерживаются версии 6.4-8.0):
+Чтобы подключить Vitastor к Proxmox Virtual Environment (поддерживаются версии 6.4-8.1):

 - Добавьте соответствующий Debian-репозиторий Vitastor в sources.list на хостах Proxmox:
-  bookworm для 8.0, bullseye для 7.4, pve7.3 для 7.3, pve7.2 для 7.2, pve7.1 для 7.1, buster для 6.4
+  bookworm для 8.1, pve8.0 для 8.0, bullseye для 7.4, pve7.3 для 7.3, pve7.2 для 7.2, pve7.1 для 7.1, buster для 6.4
 - Установите пакеты vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* или см. сноску) из репозитория Vitastor
 - Определите тип хранилища в `/etc/pve/storage.cfg` (см. ниже)
 - Обязательно заблокируйте доступ от виртуальных машин к сети Vitastor (OSD и etcd), т.к. Vitastor (пока) не поддерживает аутентификацию
@@ -54,7 +54,8 @@
 виртуальные диски, их снимки и клоны.
 - **Драйвер QEMU** — подключаемый модуль QEMU, позволяющий QEMU/KVM виртуальным машинам работать
   с виртуальными дисками Vitastor напрямую из пространства пользователя с помощью клиентской
-  библиотеки, без необходимости отображения дисков в виде блочных устройств.
+  библиотеки, без необходимости отображения дисков в виде блочных устройств. Тот же драйвер
+  позволяет подключать диски в систему через [VDUSE](../usage/qemu.ru.md#vduse).
 - **vitastor-nbd** — утилита, позволяющая монтировать образы Vitastor в виде блочных устройств
   с помощью NBD (Network Block Device), на самом деле скорее работающего как "BUSE"
   (Block Device In Userspace). Модуля ядра Linux для выполнения той же задачи в Vitastor нет
@@ -31,6 +31,7 @@
 - [RDMA/RoCEv2 support via libibverbs](../config/network.en.md#rdma_device)
 - [Scrubbing](../config/osd.en.md#auto_scrub) (verification of copies)
 - [Checksums](../config/layout-osd.en.md#data_csum_type)
+- [Client write-back cache](../config/client.en.md#client_enable_writeback)

 ## Plugins and tools

@@ -50,13 +51,15 @@
 The following features are planned for the future:

+- File system
+- Control plane optimisation
 - Other administrative tools
 - Web GUI
 - OpenNebula plugin
-- iSCSI proxy
+- iSCSI and NVMeoF gateways
 - Multi-threaded client
 - Faster failover
+- S3
 - Tiered storage (SSD caching)
 - NVDIMM support
 - Compression (possibly)
-- Read caching using system page cache (possibly)
@@ -33,6 +33,7 @@
 - [Поддержка RDMA/RoCEv2 через libibverbs](../config/network.ru.md#rdma_device)
 - [Фоновая проверка целостности](../config/osd.ru.md#auto_scrub) (сверка копий)
 - [Контрольные суммы](../config/layout-osd.ru.md#data_csum_type)
+- [Буферизация записи на стороне клиента](../config/client.ru.md#client_enable_writeback)

 ## Драйверы и инструменты

@@ -50,12 +51,15 @@
 ## Планы развития

+- Файловая система
+- Оптимизация слоя управления
 - Другие инструменты администрирования
 - Web-интерфейс
 - Плагин для OpenNebula
-- iSCSI-прокси
+- iSCSI и NVMeoF прокси
 - Многопоточный клиент
 - Более быстрое переключение при отказах
+- S3
 - Поддержка SSD-кэширования (tiered storage)
 - Поддержка NVDIMM
 - Возможно, сжатие
@@ -28,7 +28,8 @@ It supports the following commands:
 Global options:

 ```
---etcd_address ADDR  Etcd connection address
+--config_file FILE   Path to Vitastor configuration file
+--etcd_address URL   Etcd connection address
 --iodepth N          Send N operations in parallel to each OSD when possible (default 32)
 --parallel_osds M    Work with M osds in parallel when possible (default 4)
 --progress 1|0       Report progress (default 1)

@@ -27,7 +27,8 @@ vitastor-cli - интерфейс командной строки для адм
 Глобальные опции:

 ```
---etcd_address ADDR  Адрес соединения с etcd
+--config_file FILE   Путь к файлу конфигурации Vitastor
+--etcd_address URL   Адрес соединения с etcd
 --iodepth N          Отправлять параллельно N операций на каждый OSD (по умолчанию 32)
 --parallel_osds M    Работать параллельно с M OSD (по умолчанию 4)
 --progress 1|0       Печатать прогресс выполнения (по умолчанию 1)
@@ -17,6 +17,7 @@ It supports the following commands:
 - [purge](#purge)
 - [read-sb](#read-sb)
 - [write-sb](#write-sb)
+- [update-sb](#update-sb)
 - [udev](#udev)
 - [exec-osd](#exec-osd)
 - [pre-exec](#pre-exec)
@@ -182,6 +183,14 @@ Try to read Vitastor OSD superblock from `<device>` and print it in JSON format.

 Read JSON from STDIN and write it into Vitastor OSD superblock on `<device>`.

+## update-sb
+
+`vitastor-disk update-sb <device> [--force] [--<parameter> <value>] [...]`
+
+Read Vitastor OSD superblock from `<device>`, update the specified parameters in it and write it back.
+
+`--force` allows to ignore validation errors.
+
 ## udev

 `vitastor-disk udev <device>`
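A hedged usage sketch of the new subcommand; the device path and the particular superblock field are illustrative, any parameter accepted by write-sb should work the same way:

```
# Inspect the superblock first, then change one field in place
vitastor-disk read-sb /dev/sda1
vitastor-disk update-sb /dev/sda1 --immediate_commit all
```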
@@ -17,6 +17,7 @@ vitastor-disk - инструмент командной строки для уп
 - [purge](#purge)
 - [read-sb](#read-sb)
 - [write-sb](#write-sb)
+- [update-sb](#update-sb)
 - [udev](#udev)
 - [exec-osd](#exec-osd)
 - [pre-exec](#pre-exec)
@@ -187,6 +188,15 @@ throttle_target_mbs, throttle_target_parallelism, throttle_threshold_us.

 Прочитать JSON со стандартного ввода и записать его в суперблок OSD на диск `<device>`.

+## update-sb
+
+`vitastor-disk update-sb <device> [--force] [--<параметр> <значение>] [...]`
+
+Прочитать суперблок OSD с диска `<device>`, изменить в нём заданные параметры и записать обратно.
+
+Опция `--force` позволяет читать суперблок, даже если он считается некорректным
+из-за ошибок валидации.
+
 ## udev

 `vitastor-disk udev <device>`
@@ -11,25 +11,25 @@ NBD stands for "Network Block Device", but in fact it also functions as "BUSE"
NBD slightly lowers the performance due to additional overhead, but performance still
remains decent (see an example [here](../performance/comparison1.en.md#vitastor-0-4-0-nbd)).

-Vitastor Kubernetes CSI driver is based on NBD.
+See also [VDUSE](qemu.en.md#vduse) as a better alternative to NBD.

-See also [VDUSE](qemu.en.md#vduse).
+Vitastor Kubernetes CSI driver uses NBD when VDUSE is unavailable.

## Map image

To create a local block device for a Vitastor image run:

```
-vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
+vitastor-nbd map --image testimg
```

It will output a block device name like /dev/nbd0 which you can then use as a normal disk.

You can also use `--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--image <IMAGE>` if you want.

-Additional options for map command:
+vitastor-nbd supports all usual Vitastor configuration options like `--config_file <path_to_config>` plus NBD-specific:

-* `--nbd_timeout 30` \
+* `--nbd_timeout 300` \
  Timeout for I/O operations in seconds after exceeding which the kernel stops
  the device. You can set it to 0 to disable the timeout, but beware that you
  won't be able to stop the device at all if the vitastor-nbd process dies.
@@ -44,6 +44,9 @@ Additional options for map command:
* `--foreground 1` \
  Stay in foreground, do not daemonize.

+Note that `nbd_timeout`, `nbd_max_devices` and `nbd_max_part` options may also be specified
+in `/etc/vitastor/vitastor.conf` or in another configuration file specified with `--config_file`.
+
## Unmap image

To unmap the device run:
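Since the new wording points readers at the configuration file, here is a minimal sketch of what `/etc/vitastor/vitastor.conf` could contain (the key names come from the text above; the values and the etcd address are our own illustration):

```
{
  "etcd_address": "10.115.0.10:2379/v3",
  "nbd_timeout": 300,
  "nbd_max_devices": 64,
  "nbd_max_part": 3
}
```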
@@ -14,16 +14,16 @@ NBD на данный момент необходимо, чтобы монтир
NBD slightly lowers performance because of additional memory copies,
but it still remains decent (see an example [test](../performance/comparison1.ru.md#vitastor-0-4-0-nbd)).

-The Vitastor Kubernetes CSI driver is based on NBD.
+See also [VDUSE](qemu.ru.md#vduse) as a better alternative to NBD.

-See also [VDUSE](qemu.ru.md#vduse).
+The Vitastor Kubernetes CSI driver uses NBD when VDUSE is unavailable.

## Map a device

To create a local block device for an image, run:

```
-vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
+vitastor-nbd map --image testimg
```

The command will print the name of a block device like /dev/nbd0, which you can then
@@ -32,7 +32,8 @@ vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
To address the image by inode number, as with other commands, you can use the options
`--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--image testimg`.

-Additional options for the NBD map command:
+vitastor-nbd supports all usual Vitastor options, for example `--config_file <path_to_config>`,
+plus NBD-specific ones:

* `--nbd_timeout 30` \
  Maximum execution time of any read/write operation in seconds, after
@@ -53,6 +54,10 @@ vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
* `--foreground 1` \
  Do not daemonize the process.

+Note that the `nbd_timeout`, `nbd_max_devices` and `nbd_max_part` options can also
+be set in `/etc/vitastor/vitastor.conf` or in another configuration file
+specified with the `--config_file` option.
+
## Unmap the device

To unmap the device, run:
@@ -23,7 +23,7 @@ balancer or any failover method you want to in that case.
vitastor-nfs usage:

```
-vitastor-nfs [--etcd_address ADDR] [OTHER OPTIONS]
+vitastor-nfs [STANDARD OPTIONS] [OTHER OPTIONS]

--subdir <DIR>   export images prefixed <DIR>/ (default empty - export all images)
--portmap 0      do not listen on port 111 (portmap/rpcbind, requires root)
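As a usage illustration built only from the options shown above (the image-name prefix is our own example):

```
# Export only the images named backups/*, without requiring root for portmap:
vitastor-nfs --subdir backups --portmap 0
```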
@@ -22,7 +22,7 @@
vitastor-nfs usage:

```
-vitastor-nfs [--etcd_address ADDR] [OTHER OPTIONS]
+vitastor-nfs [STANDARD OPTIONS] [OTHER OPTIONS]

--subdir <DIR>   export a "subdirectory" - images whose names are prefixed with <DIR>/ (default empty - export all images)
--portmap 0      disable the portmap/rpcbind service on port 111 (enabled by default and requires root privileges)
@@ -34,6 +34,20 @@ qemu-system-x86_64 -enable-kvm -m 1024 \
-vnc 0.0.0.0:0
```

+With a separate I/O thread:
+
+```
+qemu-system-x86_64 -enable-kvm -m 1024 \
+-object iothread,id=vitastor1 \
+-blockdev '{"node-name":"drive-virtio-disk0","driver":"vitastor","image":"debian9",
+"cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
+-device 'virtio-blk-pci,iothread=vitastor1,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,
+id=virtio-disk0,bootindex=1,write-cache=off' \
+-vnc 0.0.0.0:0
+```
+
You can also specify inode ID, pool and size manually instead of `:image=<IMAGE>` option: `:pool=<POOL>:inode=<INODE>:size=<SIZE>`.

## qemu-img

For qemu-img, you should use `vitastor:etcd_host=<HOST>:image=<IMAGE>` as filename.
@@ -84,25 +98,75 @@ This can be used for backups. Just note that exporting an image that is currentl
is of course unsafe and doesn't produce a consistent result, so only export snapshots if you do this
on a live VM.

+## vhost-user-blk
+
+QEMU, starting with 6.0, includes support for attaching disks via a separate
+userspace worker process called `vhost-user-blk`. It usually has slightly (20-30 us)
+lower latency.
+
+Example commands to use it with Vitastor:
+
+```
+qemu-storage-daemon \
+--daemonize \
+--blockdev '{"node-name":"drive-virtio-disk1","driver":"vitastor","image":"testosd1","cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
+--export type=vhost-user-blk,id=vitastor1,node-name=drive-virtio-disk1,addr.type=unix,addr.path=/run/vitastor1-user-blk.sock,writable=on,num-queues=1
+
+qemu-system-x86_64 -enable-kvm -m 2048 -M accel=kvm,memory-backend=mem \
+-object memory-backend-memfd,id=mem,size=2G,share=on \
+-chardev socket,id=vitastor1,reconnect=1,path=/run/vitastor1-user-blk.sock \
+-device vhost-user-blk-pci,chardev=vitastor1,num-queues=1,config-wce=off \
+-vnc 0.0.0.0:0
+```
+
+The memfd memory backend is crucial: vhost-user-blk does not work without it.
+
## VDUSE

Linux kernel, starting with version 5.15, supports a new interface for attaching virtual disks
to the host - VDUSE (vDPA Device in Userspace). QEMU, starting with 7.2, has support for
exporting QEMU block devices over this protocol using qemu-storage-daemon.

-VDUSE has the same problem as other FUSE-like interfaces in Linux: if a userspace process hangs,
-for example, if it loses connectivity with Vitastor cluster - active processes doing I/O may
-hang in the D state (uninterruptible sleep) and you won't be able to kill them even with kill -9.
-In this case reboot will be the only way to remove VDUSE devices from system.
+VDUSE is currently the best interface to attach Vitastor disks as kernel devices because:
+- It avoids data copies and thus achieves much better performance than [NBD](nbd.en.md)
+- It doesn't have the NBD timeout problem - the device doesn't die if an operation executes for too long
+- It doesn't have the hung device problem - if the userspace process dies it can be restarted (!)
+  and the block device will continue operation
+- It doesn't seem to have a device number limit

-On the other hand, VDUSE is faster than [NBD](nbd.en.md), so you may prefer to use it if
-performance is important for you. Approximate performance numbers:
-direct fio benchmark - 115000 iops, NBD - 60000 iops, VDUSE - 90000 iops.
+Example performance comparison:
+
+|                      | direct fio  | NBD         | VDUSE       |
+|----------------------|-------------|-------------|-------------|
+| linear write         | 3.85 GB/s   | 1.12 GB/s   | 3.85 GB/s   |
+| 4k random write Q128 | 240000 iops | 120000 iops | 178000 iops |
+| 4k random write Q1   | 9500 iops   | 7620 iops   | 7640 iops   |
+| linear read          | 4.3 GB/s    | 1.8 GB/s    | 2.85 GB/s   |
+| 4k random read Q128  | 287000 iops | 140000 iops | 189000 iops |
+| 4k random read Q1    | 9600 iops   | 7640 iops   | 7780 iops   |

To try VDUSE you need at least Linux 5.15, built with VDUSE support
-(CONFIG_VIRTIO_VDPA=m and CONFIG_VDPA_USER=m). Debian Linux kernels have these options
-disabled by now, so if you want to try it on Debian, use a kernel from Ubuntu
-[kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/) or Proxmox.
+(CONFIG_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).
+
+Debian Linux kernels still have these options disabled, so if you want to try it on Debian,
+use a kernel from Ubuntu [kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/), Proxmox,
+or build the modules for the Debian kernel manually:
+
+```
+mkdir build
+cd build
+apt-get install linux-headers-`uname -r`
+apt-get build-dep linux-image-`uname -r`-unsigned
+apt-get source linux-image-`uname -r`-unsigned
+cd linux*/drivers/vdpa
+make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
+cat Module.symvers >> /lib/modules/`uname -r`/build/Module.symvers
+cd ../virtio
+make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
+depmod -a
+```
+
+You also need the `vdpa` tool from the `iproute2` package.
+
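Once built and installed, the modules presumably still have to be loaded before a VDUSE device can be created; a sketch (the module names are inferred from the config options above, not from this diff):

```
modprobe virtio-vdpa
modprobe vduse
```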
Commands to attach Vitastor image as a VDUSE device:

@@ -115,7 +179,7 @@ qemu-storage-daemon --daemonize --blockdev '{"node-name":"test1","driver":"vitas
vdpa dev add name test1 mgmtdev vduse
```

-After running these commands /dev/vda device will appear in the system and you'll be able to
+After running these commands, the `/dev/vda` device will appear in the system and you'll be able to
use it as a normal disk.

To remove the device:
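The hunk ends before the removal commands themselves; under the same naming as in the attach example they would presumably be (an assumption, not shown in this diff) the reverse of the add, i.e. deleting the vDPA device and then stopping the qemu-storage-daemon process:

```
vdpa dev del test1
```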
@@ -36,6 +36,18 @@ qemu-system-x86_64 -enable-kvm -m 1024 \
-vnc 0.0.0.0:0
```

+With a separate I/O thread:
+
+```
+qemu-system-x86_64 -enable-kvm -m 1024 \
+-object iothread,id=vitastor1 \
+-blockdev '{"node-name":"drive-virtio-disk0","driver":"vitastor","image":"debian9",
+"cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
+-device 'virtio-blk-pci,iothread=vitastor1,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,
+id=virtio-disk0,bootindex=1,write-cache=off' \
+-vnc 0.0.0.0:0
+```
+
Instead of `:image=<IMAGE>` you can also specify the inode number, pool and size: `:pool=<POOL>:inode=<INODE>:size=<SIZE>`.

## qemu-img
@@ -88,25 +100,76 @@ qemu-img rebase -u -b '' testimg.qcow2
while it is being written to at the same time is unsafe - the read result will not be consistent. So if you are working
with live virtual machines, export only their snapshots, not the image itself.

+## vhost-user-blk
+
+QEMU, starting with 6.0, allows attaching disks via a separate worker process.
+This attachment method is called `vhost-user-blk` and usually has slightly lower
+latency (20-30 microseconds less than the normal method).
+
+Example commands to use vhost-user-blk with Vitastor:
+
+```
+qemu-storage-daemon \
+--daemonize \
+--blockdev '{"node-name":"drive-virtio-disk1","driver":"vitastor","image":"testosd1","cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
+--export type=vhost-user-blk,id=vitastor1,node-name=drive-virtio-disk1,addr.type=unix,addr.path=/run/vitastor1-user-blk.sock,writable=on,num-queues=1
+
+qemu-system-x86_64 -enable-kvm -m 2048 -M accel=kvm,memory-backend=mem \
+-object memory-backend-memfd,id=mem,size=2G,share=on \
+-chardev socket,id=vitastor1,reconnect=1,path=/run/vitastor1-user-blk.sock \
+-device vhost-user-blk-pci,chardev=vitastor1,num-queues=1,config-wce=off \
+-vnc 0.0.0.0:0
+```
+
+The memory-backend-memfd option is crucial here: vhost-user-blk does not work without it.
+
## VDUSE

Linux, starting with kernel version 5.15, provides a new interface for attaching virtual disks
to the system - VDUSE (vDPA Device in Userspace), and QEMU, starting with version 7.2, supports
exporting QEMU block devices over this protocol via qemu-storage-daemon.

-VDUSE suffers from the common problem of FUSE-like interfaces in Linux: if the userspace process
-hangs, for example when the connection to the Vitastor cluster is lost, processes reading from or writing
-to the cluster may get stuck in the D state (uninterruptible sleep) and become impossible to kill even
-with kill -9. In that case the device can only be removed from the system by rebooting.
+VDUSE is currently the best interface for attaching Vitastor disks as kernel-level block
+devices, because:
+- VDUSE copies no data and therefore achieves much better performance than [NBD](nbd.ru.md)
+- It also doesn't have the NBD timeout problem - the device doesn't die if an operation takes too long
+- It also doesn't have the hung device problem - if the handler process dies, it can be
+  restarted (!) and the block device will keep working
+- It apparently has no limit on the number of devices attached to the system

-On the other hand, VDUSE is faster than [NBD](nbd.ru.md), so it may be
-preferable where performance matters more. Rough numbers:
-direct fio testing - 115000 iops, NBD - 60000 iops, VDUSE - 90000 iops.
+Example performance comparison:
+
+|                      | Direct fio  | NBD         | VDUSE       |
+|----------------------|-------------|-------------|-------------|
+| linear write         | 3.85 GB/s   | 1.12 GB/s   | 3.85 GB/s   |
+| 4k random write Q128 | 240000 iops | 120000 iops | 178000 iops |
+| 4k random write Q1   | 9500 iops   | 7620 iops   | 7640 iops   |
+| linear read          | 4.3 GB/s    | 1.8 GB/s    | 2.85 GB/s   |
+| 4k random read Q128  | 287000 iops | 140000 iops | 189000 iops |
+| 4k random read Q1    | 9600 iops   | 7640 iops   | 7780 iops   |

-To use VDUSE you need a Linux kernel of at least version 5.15 built with VDUSE support
-(CONFIG_VIRTIO_VDPA=m and CONFIG_VDPA_USER=m). In Debian Linux kernels the support is
-disabled for now - if you want to try this feature on Debian, install a kernel from Ubuntu
-[kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/) or from Proxmox.
+To try VDUSE you need a Linux kernel of at least version 5.15 built with VDUSE support
+(CONFIG_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).
+
+In Debian Linux kernels the support is still disabled by default, so to try VDUSE
+on Debian, install a kernel from Ubuntu [kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/),
+from Proxmox, or build the modules for the Debian kernel manually:
+
+```
+mkdir build
+cd build
+apt-get install linux-headers-`uname -r`
+apt-get build-dep linux-image-`uname -r`-unsigned
+apt-get source linux-image-`uname -r`-unsigned
+cd linux*/drivers/vdpa
+make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
+cat Module.symvers >> /lib/modules/`uname -r`/build/Module.symvers
+cd ../virtio
+make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
+depmod -a
+```
+
+You will also need the `vdpa` command-line tool from the `iproute2` package.

Commands to attach a virtual disk via VDUSE:

@@ -119,7 +182,7 @@ qemu-storage-daemon --daemonize --blockdev '{"node-name":"test1","driver":"vitas
vdpa dev add name test1 mgmtdev vduse
```

-After this, the /dev/vda device will appear in the system and you will be able to use it as
+After this, the `/dev/vda` device will appear in the system and you will be able to use it as
a normal disk.

To remove the device from the system:
@@ -3,5 +3,5 @@ SUBSYSTEM=="block", ENV{ID_PART_ENTRY_TYPE}=="e7009fac-a5a1-4d72-af72-53de130599
IMPORT{program}="/usr/bin/vitastor-disk udev $devnode", \
SYMLINK+="vitastor/$env{VITASTOR_ALIAS}"

-ENV{VITASTOR_OSD_NUM}!="", ACTION=="add", RUN{program}+="/usr/bin/systemctl enable --now vitastor-osd@$env{VITASTOR_OSD_NUM}"
-ENV{VITASTOR_OSD_NUM}!="", ACTION=="remove", RUN{program}+="/usr/bin/systemctl disable --now vitastor-osd@$env{VITASTOR_OSD_NUM}"
+ENV{VITASTOR_OSD_NUM}!="", ACTION=="add", RUN{program}+="/usr/bin/systemctl enable --now --no-block vitastor-osd@$env{VITASTOR_OSD_NUM}"
+ENV{VITASTOR_OSD_NUM}!="", ACTION=="remove", RUN{program}+="/usr/bin/systemctl disable --now --no-block vitastor-osd@$env{VITASTOR_OSD_NUM}"
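The `--no-block` flag makes systemctl enqueue the start/stop job and return immediately instead of waiting for it to finish, so a slow OSD start cannot stall udev event processing from inside the RUN handler.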
mon/mon.js
@@ -78,9 +78,15 @@ const etcd_tree = {
disk_alignment: 4096,
bitmap_granularity: 4096,
immediate_commit: false, // 'all' or 'small'
+// client - configurable online
+client_max_dirty_bytes: 33554432,
+client_max_dirty_ops: 1024,
+client_enable_writeback: false,
+client_max_buffered_bytes: 33554432,
+client_max_buffered_ops: 1024,
+client_max_writeback_iodepth: 256,
// client and osd - configurable online
log_level: 0,
-client_dirty_limit: 33554432,
peer_connect_interval: 5, // seconds. min: 1
peer_connect_timeout: 5, // seconds. min: 1
osd_idle_timeout: 5, // seconds. min: 1
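Since these new client knobs are marked "configurable online", they can presumably be changed at runtime through the global configuration key in etcd; a hedged sketch (the `/vitastor/config/global` key path and the endpoint are our assumptions, not from this diff):

```
etcdctl --endpoints=http://10.115.0.10:2379 put /vitastor/config/global \
  '{"client_enable_writeback":true,"client_max_dirty_bytes":67108864}'
```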
@@ -93,6 +99,7 @@ const etcd_tree = {
etcd_ws_keepalive_interval: 30, // seconds
// osd
etcd_report_interval: 5, // seconds
+etcd_stats_interval: 30, // seconds
run_primary: true,
osd_network: null, // "192.168.7.0/24" or an array of masks
bind_address: "0.0.0.0",
@@ -390,12 +397,13 @@ class Mon
this.etcd_prefix = this.etcd_prefix.replace(/\/\/+/g, '/').replace(/^\/?(.*[^\/])\/?$/, '/$1');
this.etcd_start_timeout = (config.etcd_start_timeout || 5) * 1000;
this.state = JSON.parse(JSON.stringify(this.constructor.etcd_tree));
+this.prev_stats = { osd_stats: {}, osd_diff: {} };
this.signals_set = false;
-this.stat_time = Date.now();
this.ws = null;
this.ws_alive = false;
this.ws_keepalive_timer = null;
this.on_stop_cb = () => this.on_stop(0).catch(console.error);
+this.recheck_pgs_active = false;
}

parse_etcd_addresses(addrs)
@@ -545,9 +553,9 @@ class Mon
const cur_addr = this.pick_next_etcd();
const base = 'ws'+cur_addr.substr(4);
let now = Date.now();
-if (tried[base] && now-tried[base] < timeout)
+if (tried[base] && now-tried[base] < this.etcd_start_timeout)
{
-await new Promise(ok => setTimeout(ok, timeout-(now-tried[base])));
+await new Promise(ok => setTimeout(ok, this.etcd_start_timeout-(now-tried[base])));
now = Date.now();
}
tried[base] = now;
@@ -685,8 +693,27 @@ class Mon
});
}

+// Schedule save_last_clean() to run after a small timeout (1s) (to not spam etcd)
+schedule_save_last_clean()
+{
+if (!this.save_last_clean_timer)
+{
+this.save_last_clean_timer = setTimeout(() =>
+{
+this.save_last_clean_timer = null;
+this.save_last_clean().catch(this.die);
+}, this.config.mon_change_timeout || 1000);
+}
+}
+
async save_last_clean()
{
+if (this.save_last_clean_running)
+{
+this.schedule_save_last_clean();
+return;
+}
+this.save_last_clean_running = true;
// last_clean_pgs is used to avoid extra data move when observing a series of changes in the cluster
const new_clean_pgs = { items: {} };
next_pool:
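Taken together, the timer and the `save_last_clean_running` flag form a classic debounce-plus-reentrancy guard: bursts of cluster change events collapse into at most one etcd write per `mon_change_timeout` interval, and a save requested while one is already in flight is deferred through the scheduler rather than run concurrently.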
@@ -723,6 +750,7 @@ class Mon
value: b64(JSON.stringify(this.state.history.last_clean_pgs))
} } ],
}, this.etcd_start_timeout, 0);
+this.save_last_clean_running = false;
}

get_mon_state()
@@ -1156,6 +1184,33 @@ class Mon
}
}

+filter_osds_by_block_layout(flat_tree, block_size, bitmap_granularity, immediate_commit)
+{
+for (const host in flat_tree)
+{
+let found = 0;
+for (const osd in flat_tree[host])
+{
+const osd_stat = this.state.osd.stats[osd];
+if (osd_stat && (osd_stat.bs_block_size && osd_stat.bs_block_size != block_size ||
+osd_stat.bitmap_granularity && osd_stat.bitmap_granularity != bitmap_granularity ||
+osd_stat.immediate_commit == 'small' && immediate_commit == 'all' ||
+osd_stat.immediate_commit == 'none' && immediate_commit != 'none'))
+{
+delete flat_tree[host][osd];
+}
+else
+{
+found++;
+}
+}
+if (!found)
+{
+delete flat_tree[host];
+}
+}
+}
+
get_affinity_osds(pool_cfg, up_osds, osd_tree)
{
let aff_osds = up_osds;
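In plain terms: an OSD is excluded from a pool's tree when its reported on-disk layout cannot serve the pool - a different block size or bitmap granularity, or a weaker commit mode than the pool demands ('small' cannot satisfy 'all', and only 'none' satisfies 'none') - and a host is dropped entirely once none of its OSDs qualify.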
@@ -1169,6 +1224,12 @@ class Mon

async recheck_pgs()
{
+if (this.recheck_pgs_active)
+{
+this.schedule_recheck();
+return;
+}
+this.recheck_pgs_active = true;
// Take configuration and state, check it against the stored configuration hash
// Recalculate PGs and save them to etcd if the configuration is changed
// FIXME: Do not change anything if the distribution is good and random enough and no PGs are degraded
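This is the same guard pattern as in save_last_clean(): a recheck requested while one is already running is re-queued through schedule_recheck() instead of running concurrently, which is why every early `return` below (and the normal exit) also has to clear `recheck_pgs_active`.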
@@ -1190,6 +1251,7 @@ class Mon
// Pool deleted. Delete all PGs, but first stop them.
if (!await this.stop_all_pgs(pool_id))
{
+this.recheck_pgs_active = false;
this.schedule_recheck();
return;
}
@@ -1216,6 +1278,12 @@ class Mon
pool_tree = pool_tree ? pool_tree.children : [];
pool_tree = LPOptimizer.flatten_tree(pool_tree, levels, pool_cfg.failure_domain, 'osd');
this.filter_osds_by_tags(osd_tree, pool_tree, pool_cfg.osd_tags);
+this.filter_osds_by_block_layout(
+pool_tree,
+pool_cfg.block_size || this.config.block_size || 131072,
+pool_cfg.bitmap_granularity || this.config.bitmap_granularity || 4096,
+pool_cfg.immediate_commit || this.config.immediate_commit || 'none'
+);
// These are for the purpose of building history.osd_sets
const real_prev_pgs = [];
let pg_history = [];
@@ -1252,9 +1320,16 @@ class Mon
// PG count changed. Need to bring all PGs down.
if (!await this.stop_all_pgs(pool_id))
{
+this.recheck_pgs_active = false;
this.schedule_recheck();
return;
}
+}
+if (prev_pgs.length != pool_cfg.pg_count)
+{
+// Scale PG count
+// Do it even if old_pg_count is already equal to pool_cfg.pg_count,
+// because last_clean_pgs may still contain the old number of PGs
const new_pg_history = [];
PGUtil.scale_pg_count(prev_pgs, real_prev_pgs, pg_history, new_pg_history, pool_cfg.pg_count);
pg_history = new_pg_history;
@@ -1356,6 +1431,7 @@ class Mon
await this.save_pg_config(new_config_pgs);
}
}
+this.recheck_pgs_active = false;
}

async save_pg_config(new_config_pgs, etcd_request = { compare: [], success: [] })
@@ -1405,7 +1481,6 @@ class Mon
}

// Schedule a recheck to run after a small timeout (1s)
-// If already scheduled, cancel previous timer and schedule it again
// This is required for multiple change events to trigger at most 1 recheck in 1s
schedule_recheck()
{
@@ -1419,15 +1494,15 @@ class Mon
}
}

-derive_osd_stats(st, prev)
+derive_osd_stats(st, prev, prev_diff)
{
const zero_stats = { op: { bps: 0n, iops: 0n, lat: 0n }, subop: { iops: 0n, lat: 0n }, recovery: { bps: 0n, iops: 0n } };
-const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
-if (!st || !st.time || prev && (prev.time || this.stat_time/1000) >= st.time)
+const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {}, inode_stats: {} };
+if (!st || !st.time || !prev || !prev.time || prev.time >= st.time)
{
-return diff;
+return prev_diff || diff;
}
-const timediff = BigInt(st.time*1000 - (prev && prev.time*1000 || this.stat_time));
+const timediff = BigInt(st.time*1000 - prev.time*1000);
for (const op in st.op_stats||{})
{
const pr = prev && prev.op_stats && prev.op_stats[op];
@@ -1459,25 +1534,47 @@ class Mon
if (n > 0)
diff.recovery_stats[op] = { ...c, bps: b*1000n/timediff, iops: n*1000n/timediff };
}
+for (const pool_id in st.inode_stats||{})
+{
+const pool_diff = diff.inode_stats[pool_id] = {};
+for (const inode_num in st.inode_stats[pool_id])
+{
+const inode_diff = diff.inode_stats[pool_id][inode_num] = {};
+for (const op of [ 'read', 'write', 'delete' ])
+{
+const c = st.inode_stats[pool_id][inode_num][op];
+const pr = prev && prev.inode_stats && prev.inode_stats[pool_id] &&
+prev.inode_stats[pool_id][inode_num] && prev.inode_stats[pool_id][inode_num][op];
+const n = BigInt(c.count||0) - BigInt(pr && pr.count||0);
+inode_diff[op] = {
+bps: (BigInt(c.bytes||0) - BigInt(pr && pr.bytes||0))*1000n/timediff,
+iops: n*1000n/timediff,
+lat: (BigInt(c.usec||0) - BigInt(pr && pr.usec||0))/(n || 1n),
+};
+}
+}
+}
return diff;
}

-sum_op_stats(timestamp, prev_stats)
+sum_op_stats()
{
-const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
-if (!prev_stats || prev_stats.timestamp >= timestamp)
+for (const osd in this.state.osd.stats)
{
-return sum_diff;
+const cur = { ...this.state.osd.stats[osd], inode_stats: this.state.osd.inodestats[osd]||{} };
+this.prev_stats.osd_diff[osd] = this.derive_osd_stats(
+cur, this.prev_stats.osd_stats[osd], this.prev_stats.osd_diff[osd]
+);
+this.prev_stats.osd_stats[osd] = cur;
}
-const tm = BigInt(timestamp - (prev_stats.timestamp || 0));
+const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
// Sum derived values instead of deriving summed
for (const osd in this.state.osd.stats)
{
-const derived = this.derive_osd_stats(this.state.osd.stats[osd],
-this.prev_stats && this.prev_stats.osd_stats && this.prev_stats.osd_stats[osd]);
-for (const type in derived)
+const derived = this.prev_stats.osd_diff[osd];
+for (const type in sum_diff)
{
-for (const op in derived[type])
+for (const op in derived[type]||{})
{
for (const k in derived[type][op])
{
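The net effect of this rework: per-OSD rate derivation happens exactly once per reporting cycle, keyed by the OSD's own report timestamps rather than the monitor's wall clock, and the last good diff (`prev_diff`) is reused when an OSD has not reported fresh data, so the summed totals no longer dip to zero between reports.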
@@ -1534,14 +1631,14 @@ class Mon
return { object_counts, object_bytes };
}

-sum_inode_stats(prev_stats, timestamp, prev_timestamp)
+sum_inode_stats()
{
const inode_stats = {};
const inode_stub = () => ({
raw_used: 0n,
-read: { count: 0n, usec: 0n, bytes: 0n },
-write: { count: 0n, usec: 0n, bytes: 0n },
-delete: { count: 0n, usec: 0n, bytes: 0n },
+read: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
+write: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
+delete: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
});
const seen_pools = {};
for (const pool_id in this.state.config.pools)
@@ -1593,11 +1690,25 @@ class Mon
}
}
}
-if (prev_stats && prev_timestamp >= timestamp)
+for (const osd in this.prev_stats.osd_diff)
{
-prev_stats = null;
+for (const pool_id in this.prev_stats.osd_diff[osd].inode_stats)
+{
+for (const inode_num in this.prev_stats.osd_diff[osd].inode_stats[pool_id])
+{
+inode_stats[pool_id][inode_num] = inode_stats[pool_id][inode_num] || inode_stub();
+for (const op of [ 'read', 'write', 'delete' ])
+{
+const op_diff = this.prev_stats.osd_diff[osd].inode_stats[pool_id][inode_num][op] || {};
+const op_st = inode_stats[pool_id][inode_num][op];
+op_st.bps += op_diff.bps;
+op_st.iops += op_diff.iops;
+op_st.lat += op_diff.lat;
+op_st.n_osd = (op_st.n_osd || 0) + 1;
+}
+}
+}
}
-const tm = prev_stats ? BigInt(timestamp - prev_timestamp) : 0;
for (const pool_id in inode_stats)
{
for (const inode_num in inode_stats[pool_id])
@@ -1606,11 +1717,12 @@ class Mon
for (const op of [ 'read', 'write', 'delete' ])
{
const op_st = inode_stats[pool_id][inode_num][op];
-const prev_st = prev_stats && prev_stats[pool_id] && prev_stats[pool_id][inode_num] && prev_stats[pool_id][inode_num][op];
-op_st.bps = prev_st ? (op_st.bytes - prev_st.bytes) * 1000n / tm : 0;
-op_st.iops = prev_st ? (op_st.count - prev_st.count) * 1000n / tm : 0;
-op_st.lat = prev_st ? (op_st.usec - prev_st.usec) / ((op_st.count - prev_st.count) || 1n) : 0;
-if (op_st.bps > 0 || op_st.iops > 0 || op_st.lat > 0)
+if (op_st.n_osd)
+{
+op_st.lat /= BigInt(op_st.n_osd);
+delete op_st.n_osd;
+}
+if (op_st.bps > 0 || op_st.iops > 0)
nonzero = true;
}
if (!nonzero && (!this.state.config.inode[pool_id] || !this.state.config.inode[pool_id][inode_num]))
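Note the deliberate simplification here: per-inode bps and iops are true sums over the OSDs that reported the inode, while latency is only an arithmetic mean of the per-OSD latencies (hence the division by `n_osd`), not a count-weighted average - cheap to compute and usually close enough for monitoring.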
@@ -1643,15 +1755,9 @@ class Mon
async update_total_stats()
{
const txn = [];
-const timestamp = Date.now();
const { object_counts, object_bytes } = this.sum_object_counts();
-let stats = this.sum_op_stats(timestamp, this.prev_stats);
-let { inode_stats, seen_pools } = this.sum_inode_stats(
-this.prev_stats ? this.prev_stats.inode_stats : null,
-timestamp, this.prev_stats ? this.prev_stats.timestamp : null
-);
-this.prev_stats = { timestamp, inode_stats, osd_stats: { ...this.state.osd.stats } };
-this.stat_time = Date.now();
+let stats = this.sum_op_stats();
+let { inode_stats, seen_pools } = this.sum_inode_stats();
stats.object_counts = object_counts;
stats.object_bytes = object_bytes;
stats = this.serialize_bigints(stats);
@@ -1,6 +1,6 @@
{
"name": "vitastor-mon",
-"version": "1.0.0",
+"version": "1.3.1",
"description": "Vitastor SDS monitor service",
"main": "mon-main.js",
"scripts": {
@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils

-VERSION = '1.0.0'
+VERSION = '1.3.1'

LOG = logging.getLogger(__name__)

|
190
patches/pve-qemu-8.1-vitastor.patch
Normal file
190
patches/pve-qemu-8.1-vitastor.patch
Normal file
@@ -0,0 +1,190 @@
|
|||||||
|
Index: pve-qemu-kvm-8.1.2/block/meson.build
|
||||||
|
===================================================================
|
||||||
|
--- pve-qemu-kvm-8.1.2.orig/block/meson.build
|
||||||
|
+++ pve-qemu-kvm-8.1.2/block/meson.build
|
||||||
|
@@ -123,6 +123,7 @@ foreach m : [
|
||||||
|
[libnfs, 'nfs', files('nfs.c')],
|
||||||
|
[libssh, 'ssh', files('ssh.c')],
|
||||||
|
[rbd, 'rbd', files('rbd.c')],
|
||||||
|
+ [vitastor, 'vitastor', files('vitastor.c')],
|
||||||
|
]
|
||||||
|
if m[0].found()
|
||||||
|
module_ss = ss.source_set()
|
||||||
|
Index: pve-qemu-kvm-8.1.2/meson.build
|
||||||
|
===================================================================
|
||||||
|
--- pve-qemu-kvm-8.1.2.orig/meson.build
|
||||||
|
+++ pve-qemu-kvm-8.1.2/meson.build
|
||||||
|
@@ -1303,6 +1303,26 @@ if not get_option('rbd').auto() or have_
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
+vitastor = not_found
|
||||||
|
+if not get_option('vitastor').auto() or have_block
|
||||||
|
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
|
||||||
|
+ required: get_option('vitastor'))
|
||||||
|
+ if libvitastor_client.found()
|
||||||
|
+ if cc.links('''
|
||||||
|
+ #include <vitastor_c.h>
|
||||||
|
+ int main(void) {
|
||||||
|
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||||
|
+ return 0;
|
||||||
|
+ }''', dependencies: libvitastor_client)
|
||||||
|
+ vitastor = declare_dependency(dependencies: libvitastor_client)
|
||||||
|
+ elif get_option('vitastor').enabled()
|
||||||
|
+ error('could not link libvitastor_client')
|
||||||
|
+ else
|
||||||
|
+ warning('could not link libvitastor_client, disabling')
|
||||||
|
+ endif
|
||||||
|
+ endif
|
||||||
|
+endif
|
||||||
|
+
|
||||||
|
glusterfs = not_found
|
||||||
|
glusterfs_ftruncate_has_stat = false
|
||||||
|
glusterfs_iocb_has_stat = false
|
||||||
|
@@ -2123,6 +2143,7 @@ if numa.found()
|
||||||
|
endif
|
||||||
|
config_host_data.set('CONFIG_OPENGL', opengl.found())
|
||||||
|
config_host_data.set('CONFIG_RBD', rbd.found())
|
||||||
|
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
|
||||||
|
config_host_data.set('CONFIG_RDMA', rdma.found())
|
||||||
|
config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
|
||||||
|
config_host_data.set('CONFIG_SDL', sdl.found())
|
||||||
|
@@ -4298,6 +4319,7 @@ summary_info += {'fdt support': fd
|
||||||
|
summary_info += {'libcap-ng support': libcap_ng}
|
||||||
|
summary_info += {'bpf support': libbpf}
|
||||||
|
summary_info += {'rbd support': rbd}
|
||||||
|
+summary_info += {'vitastor support': vitastor}
|
||||||
|
summary_info += {'smartcard support': cacard}
|
||||||
|
summary_info += {'U2F support': u2f}
|
||||||
|
summary_info += {'libusb': libusb}
|
||||||
|
Index: pve-qemu-kvm-8.1.2/meson_options.txt
|
||||||
|
===================================================================
|
||||||
|
--- pve-qemu-kvm-8.1.2.orig/meson_options.txt
|
||||||
|
+++ pve-qemu-kvm-8.1.2/meson_options.txt
|
||||||
|
@@ -186,6 +186,8 @@ option('lzo', type : 'feature', value :
|
||||||
|
description: 'lzo compression support')
|
||||||
|
option('rbd', type : 'feature', value : 'auto',
|
||||||
|
description: 'Ceph block device driver')
|
||||||
|
+option('vitastor', type : 'feature', value : 'auto',
|
||||||
|
+ description: 'Vitastor block device driver')
|
||||||
|
option('opengl', type : 'feature', value : 'auto',
|
||||||
|
description: 'OpenGL support')
|
||||||
|
option('rdma', type : 'feature', value : 'auto',
|
||||||
|
Index: pve-qemu-kvm-8.1.2/qapi/block-core.json
|
||||||
|
===================================================================
|
||||||
|
--- pve-qemu-kvm-8.1.2.orig/qapi/block-core.json
|
||||||
|
+++ pve-qemu-kvm-8.1.2/qapi/block-core.json
|
||||||
|
@@ -3403,7 +3403,7 @@
|
||||||
|
'raw', 'rbd',
|
||||||
|
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
|
||||||
|
'pbs',
|
||||||
|
- 'ssh', 'throttle', 'vdi', 'vhdx',
|
||||||
|
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
|
||||||
|
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
|
||||||
|
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
|
||||||
|
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
|
||||||
|
@@ -4465,6 +4465,28 @@
|
||||||
|
'*server': ['InetSocketAddressBase'] } }
|
||||||
|
|
||||||
|
##
|
||||||
|
+# @BlockdevOptionsVitastor:
|
||||||
|
+#
|
||||||
|
+# Driver specific block device options for vitastor
|
||||||
|
+#
|
||||||
|
+# @image: Image name
|
||||||
|
+# @inode: Inode number
|
||||||
|
+# @pool: Pool ID
|
||||||
|
+# @size: Desired image size in bytes
|
||||||
|
+# @config-path: Path to Vitastor configuration
|
||||||
|
+# @etcd-host: etcd connection address(es)
|
||||||
|
+# @etcd-prefix: etcd key/value prefix
|
||||||
|
+##
|
||||||
|
+{ 'struct': 'BlockdevOptionsVitastor',
|
||||||
|
+ 'data': { '*inode': 'uint64',
|
||||||
|
+ '*pool': 'uint64',
|
||||||
|
+ '*size': 'uint64',
|
||||||
|
+ '*image': 'str',
|
||||||
|
+ '*config-path': 'str',
|
||||||
|
+ '*etcd-host': 'str',
|
||||||
|
+ '*etcd-prefix': 'str' } }
|
||||||
|
+
|
||||||
|
+##
|
||||||
|
# @ReplicationMode:
|
||||||
|
#
|
||||||
|
# An enumeration of replication modes.
|
||||||
|
@@ -4923,6 +4945,7 @@
|
||||||
|
'throttle': 'BlockdevOptionsThrottle',
|
||||||
|
'vdi': 'BlockdevOptionsGenericFormat',
|
||||||
|
'vhdx': 'BlockdevOptionsGenericFormat',
|
||||||
|
+ 'vitastor': 'BlockdevOptionsVitastor',
|
||||||
|
'virtio-blk-vfio-pci':
|
||||||
|
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
|
||||||
|
'if': 'CONFIG_BLKIO' },
|
||||||
|
@@ -5360,6 +5383,17 @@
|
||||||
|
'*encrypt' : 'RbdEncryptionCreateOptions' } }
|
||||||
|
|
||||||
|
##
|
||||||
|
+# @BlockdevCreateOptionsVitastor:
|
||||||
|
+#
|
||||||
|
+# Driver specific image creation options for Vitastor.
|
||||||
|
+#
|
||||||
|
+# @size: Size of the virtual disk in bytes
|
||||||
|
+##
|
||||||
|
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
||||||
|
+ 'data': { 'location': 'BlockdevOptionsVitastor',
|
||||||
|
+ 'size': 'size' } }
|
||||||
|
+
|
||||||
|
+##
|
||||||
|
# @BlockdevVmdkSubformat:
|
||||||
|
#
|
||||||
|
# Subformat options for VMDK images
|
||||||
|
@@ -5581,6 +5615,7 @@
|
||||||
|
'ssh': 'BlockdevCreateOptionsSsh',
|
||||||
|
'vdi': 'BlockdevCreateOptionsVdi',
|
||||||
|
'vhdx': 'BlockdevCreateOptionsVhdx',
|
||||||
|
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
|
||||||
|
'vmdk': 'BlockdevCreateOptionsVmdk',
|
||||||
|
'vpc': 'BlockdevCreateOptionsVpc'
|
||||||
|
} }
|
||||||
|
Index: pve-qemu-kvm-8.1.2/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||||
|
===================================================================
|
||||||
|
--- pve-qemu-kvm-8.1.2.orig/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||||
|
+++ pve-qemu-kvm-8.1.2/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||||
|
@@ -30,7 +30,7 @@
|
||||||
|
--with-suffix="qemu-kvm" \
|
||||||
|
--firmwarepath=/usr/share/qemu-firmware \
|
||||||
|
--target-list="x86_64-softmmu" \
|
||||||
|
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
||||||
|
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
||||||
|
--audio-drv-list="" \
|
||||||
|
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
|
||||||
|
--with-coroutine=ucontext \
|
||||||
|
@@ -176,6 +176,7 @@
|
||||||
|
--enable-opengl \
|
||||||
|
--enable-pie \
|
||||||
|
--enable-rbd \
|
||||||
|
+--enable-vitastor \
|
||||||
|
--enable-rdma \
|
||||||
|
--enable-seccomp \
|
||||||
|
--enable-snappy \
|
||||||
|
Index: pve-qemu-kvm-8.1.2/scripts/meson-buildoptions.sh
|
||||||
|
===================================================================
|
||||||
|
--- pve-qemu-kvm-8.1.2.orig/scripts/meson-buildoptions.sh
|
||||||
|
+++ pve-qemu-kvm-8.1.2/scripts/meson-buildoptions.sh
|
||||||
|
@@ -153,6 +153,7 @@ meson_options_help() {
|
||||||
|
printf "%s\n" ' qed qed image format support'
|
||||||
|
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
|
||||||
|
printf "%s\n" ' rbd Ceph block device driver'
|
||||||
|
+ printf "%s\n" ' vitastor Vitastor block device driver'
|
||||||
|
printf "%s\n" ' rdma Enable RDMA-based migration'
|
||||||
|
printf "%s\n" ' replication replication support'
|
||||||
|
printf "%s\n" ' sdl SDL user interface'
|
||||||
|
@@ -416,6 +417,8 @@ _meson_option_parse() {
|
||||||
|
--disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
|
||||||
|
--enable-rbd) printf "%s" -Drbd=enabled ;;
|
||||||
|
--disable-rbd) printf "%s" -Drbd=disabled ;;
|
||||||
|
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
|
||||||
|
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
|
||||||
|
--enable-rdma) printf "%s" -Drdma=enabled ;;
|
||||||
|
--disable-rdma) printf "%s" -Drdma=disabled ;;
|
||||||
|
--enable-replication) printf "%s" -Dreplication=enabled ;;
|
190
patches/qemu-8.1-vitastor.patch
Normal file
190
patches/qemu-8.1-vitastor.patch
Normal file
@@ -0,0 +1,190 @@
|
|||||||
|
diff --git a/block/meson.build b/block/meson.build
|
||||||
|
index 529fc172c6..d542dc0609 100644
|
||||||
|
--- a/block/meson.build
|
||||||
|
+++ b/block/meson.build
|
||||||
|
@@ -110,6 +110,7 @@ foreach m : [
|
||||||
|
[libnfs, 'nfs', files('nfs.c')],
|
||||||
|
[libssh, 'ssh', files('ssh.c')],
|
||||||
|
[rbd, 'rbd', files('rbd.c')],
|
||||||
|
+ [vitastor, 'vitastor', files('vitastor.c')],
|
||||||
|
]
|
||||||
|
if m[0].found()
|
||||||
|
module_ss = ss.source_set()
|
||||||
|
diff --git a/meson.build b/meson.build
|
||||||
|
index a9c4f28247..8496cf13f1 100644
|
||||||
|
--- a/meson.build
|
||||||
|
+++ b/meson.build
|
||||||
|
@@ -1303,6 +1303,26 @@ if not get_option('rbd').auto() or have_block
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
+vitastor = not_found
|
||||||
|
+if not get_option('vitastor').auto() or have_block
|
||||||
|
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
|
||||||
|
+ required: get_option('vitastor'))
|
||||||
|
+ if libvitastor_client.found()
|
||||||
|
+ if cc.links('''
|
||||||
|
+ #include <vitastor_c.h>
|
||||||
|
+ int main(void) {
|
||||||
|
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||||
|
+ return 0;
|
||||||
|
+ }''', dependencies: libvitastor_client)
|
||||||
|
+ vitastor = declare_dependency(dependencies: libvitastor_client)
|
||||||
|
+ elif get_option('vitastor').enabled()
|
||||||
|
+ error('could not link libvitastor_client')
|
||||||
|
+ else
|
||||||
|
+ warning('could not link libvitastor_client, disabling')
|
||||||
|
+ endif
|
||||||
|
+ endif
|
||||||
|
+endif
|
||||||
|
+
|
||||||
|
glusterfs = not_found
|
||||||
|
glusterfs_ftruncate_has_stat = false
|
||||||
|
glusterfs_iocb_has_stat = false
|
||||||
|
@@ -2119,6 +2139,7 @@ if numa.found()
|
||||||
|
endif
|
||||||
|
config_host_data.set('CONFIG_OPENGL', opengl.found())
|
||||||
|
config_host_data.set('CONFIG_RBD', rbd.found())
|
||||||
|
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
|
||||||
|
config_host_data.set('CONFIG_RDMA', rdma.found())
|
||||||
|
config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
|
||||||
|
config_host_data.set('CONFIG_SDL', sdl.found())
|
||||||
|
@@ -4286,6 +4307,7 @@ summary_info += {'fdt support': fdt_opt == 'disabled' ? false : fdt_opt}
|
||||||
|
summary_info += {'libcap-ng support': libcap_ng}
|
||||||
|
summary_info += {'bpf support': libbpf}
|
||||||
|
summary_info += {'rbd support': rbd}
|
||||||
|
+summary_info += {'vitastor support': vitastor}
|
||||||
|
summary_info += {'smartcard support': cacard}
|
||||||
|
summary_info += {'U2F support': u2f}
|
||||||
|
summary_info += {'libusb': libusb}
|
||||||
|
diff --git a/meson_options.txt b/meson_options.txt
|
||||||
|
index ae6d8f469d..e3d9f8404d 100644
|
||||||
|
--- a/meson_options.txt
|
||||||
|
+++ b/meson_options.txt
|
||||||
|
@@ -186,6 +186,8 @@ option('lzo', type : 'feature', value : 'auto',
|
||||||
|
description: 'lzo compression support')
|
||||||
|
option('rbd', type : 'feature', value : 'auto',
|
||||||
|
description: 'Ceph block device driver')
|
||||||
|
+option('vitastor', type : 'feature', value : 'auto',
|
||||||
|
+ description: 'Vitastor block device driver')
|
||||||
|
option('opengl', type : 'feature', value : 'auto',
|
||||||
|
description: 'OpenGL support')
|
||||||
|
option('rdma', type : 'feature', value : 'auto',
|
||||||
|
diff --git a/qapi/block-core.json b/qapi/block-core.json
|
||||||
|
index 2b1d493d6e..90673fdbdc 100644
|
||||||
|
--- a/qapi/block-core.json
|
||||||
|
+++ b/qapi/block-core.json
|
||||||
|
@@ -3146,7 +3146,7 @@
|
||||||
|
'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum',
|
||||||
|
'raw', 'rbd',
|
||||||
|
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
|
||||||
|
- 'ssh', 'throttle', 'vdi', 'vhdx',
|
||||||
|
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
|
||||||
|
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
|
||||||
|
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
|
||||||
|
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
|
||||||
|
@@ -4196,6 +4196,28 @@
|
||||||
|
'*key-secret': 'str',
|
||||||
|
'*server': ['InetSocketAddressBase'] } }
|
||||||
|
|
||||||
|
+##
|
||||||
|
+# @BlockdevOptionsVitastor:
|
||||||
|
+#
|
||||||
|
+# Driver specific block device options for vitastor
|
||||||
|
+#
|
||||||
|
+# @image: Image name
|
||||||
|
+# @inode: Inode number
|
||||||
|
+# @pool: Pool ID
|
||||||
|
+# @size: Desired image size in bytes
|
||||||
|
+# @config-path: Path to Vitastor configuration
|
||||||
|
+# @etcd-host: etcd connection address(es)
|
||||||
|
+# @etcd-prefix: etcd key/value prefix
|
||||||
|
+##
|
||||||
|
+{ 'struct': 'BlockdevOptionsVitastor',
|
||||||
|
+ 'data': { '*inode': 'uint64',
|
||||||
|
+ '*pool': 'uint64',
|
||||||
|
+ '*size': 'uint64',
|
||||||
|
+ '*image': 'str',
|
||||||
|
+ '*config-path': 'str',
|
||||||
|
+ '*etcd-host': 'str',
|
||||||
|
+ '*etcd-prefix': 'str' } }
|
||||||
|
+
|
||||||
|
##
|
||||||
|
# @ReplicationMode:
|
||||||
|
#
|
||||||
|
@@ -4654,6 +4676,7 @@
|
||||||
|
'throttle': 'BlockdevOptionsThrottle',
|
||||||
|
'vdi': 'BlockdevOptionsGenericFormat',
|
||||||
|
'vhdx': 'BlockdevOptionsGenericFormat',
|
||||||
|
+ 'vitastor': 'BlockdevOptionsVitastor',
|
||||||
|
'virtio-blk-vfio-pci':
|
||||||
|
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
|
||||||
|
'if': 'CONFIG_BLKIO' },
|
||||||
|
@@ -5089,6 +5112,17 @@
|
||||||
|
'*cluster-size' : 'size',
|
||||||
|
'*encrypt' : 'RbdEncryptionCreateOptions' } }
|
||||||
|
|
||||||
|
+##
|
||||||
|
+# @BlockdevCreateOptionsVitastor:
|
||||||
|
+#
|
||||||
|
+# Driver specific image creation options for Vitastor.
|
||||||
|
+#
|
||||||
|
+# @size: Size of the virtual disk in bytes
|
||||||
|
+##
|
||||||
|
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
||||||
|
+ 'data': { 'location': 'BlockdevOptionsVitastor',
|
||||||
|
+ 'size': 'size' } }
|
||||||
|
+
|
||||||
|
##
|
||||||
|
# @BlockdevVmdkSubformat:
|
||||||
|
#
|
||||||
|
@@ -5311,6 +5345,7 @@
|
||||||
|
'ssh': 'BlockdevCreateOptionsSsh',
|
||||||
|
'vdi': 'BlockdevCreateOptionsVdi',
|
||||||
|
'vhdx': 'BlockdevCreateOptionsVhdx',
|
||||||
|
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
|
||||||
|
'vmdk': 'BlockdevCreateOptionsVmdk',
|
||||||
|
'vpc': 'BlockdevCreateOptionsVpc'
|
||||||
|
} }
|
||||||
|
diff --git a/scripts/ci/org.centos/stream/8/x86_64/configure b/scripts/ci/org.centos/stream/8/x86_64/configure
index d02b09a4b9..f0b5fbfef3 100755
--- a/scripts/ci/org.centos/stream/8/x86_64/configure
+++ b/scripts/ci/org.centos/stream/8/x86_64/configure
@@ -30,7 +30,7 @@
 --with-suffix="qemu-kvm" \
 --firmwarepath=/usr/share/qemu-firmware \
 --target-list="x86_64-softmmu" \
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
 --audio-drv-list="" \
 --block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
 --with-coroutine=ucontext \
@@ -176,6 +176,7 @@
 --enable-opengl \
 --enable-pie \
 --enable-rbd \
+--enable-vitastor \
 --enable-rdma \
 --enable-seccomp \
 --enable-snappy \
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index d7020af175..94958eb6fa 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -153,6 +153,7 @@ meson_options_help() {
   printf "%s\n" '  qed             qed image format support'
   printf "%s\n" '  qga-vss         build QGA VSS support (broken with MinGW)'
   printf "%s\n" '  rbd             Ceph block device driver'
+  printf "%s\n" '  vitastor        Vitastor block device driver'
   printf "%s\n" '  rdma            Enable RDMA-based migration'
   printf "%s\n" '  replication     replication support'
   printf "%s\n" '  sdl             SDL user interface'
@@ -416,6 +417,8 @@ _meson_option_parse() {
     --disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
     --enable-rbd) printf "%s" -Drbd=enabled ;;
     --disable-rbd) printf "%s" -Drbd=disabled ;;
+    --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
+    --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
     --enable-rdma) printf "%s" -Drdma=enabled ;;
     --disable-rdma) printf "%s" -Drdma=disabled ;;
     --enable-replication) printf "%s" -Dreplication=enabled ;;
@@ -24,4 +24,4 @@ rm fio
 mv fio-copy fio
 FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
 perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
-tar --transform 's#^#vitastor-1.0.0/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.0.0$(rpm --eval '%dist').tar.gz *
+tar --transform 's#^#vitastor-1.3.1/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.3.1$(rpm --eval '%dist').tar.gz *
@@ -15,6 +15,7 @@ RUN yumdownloader --disablerepo=centos-sclo-rh --source fio
 RUN rpm --nomd5 -i fio*.src.rpm
 RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
 RUN cd ~/rpmbuild/SPECS && yum-builddep -y fio.spec
+RUN yum -y install cmake3

 ADD https://vitastor.io/rpms/liburing-el7/liburing-0.7-2.el7.src.rpm /root

@@ -35,7 +36,7 @@ ADD . /root/vitastor
 RUN set -e; \
     cd /root/vitastor/rpm; \
     sh build-tarball.sh; \
-    cp /root/vitastor-1.0.0.el7.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-1.3.1.el7.tar.gz ~/rpmbuild/SOURCES; \
     cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
     cd ~/rpmbuild/SPECS/; \
     rpmbuild -ba vitastor.spec; \
@@ -1,11 +1,11 @@
 Name: vitastor
-Version: 1.0.0
+Version: 1.3.1
 Release: 1%{?dist}
 Summary: Vitastor, a fast software-defined clustered block storage

 License: Vitastor Network Public License 1.1
 URL: https://vitastor.io/
-Source0: vitastor-1.0.0.el7.tar.gz
+Source0: vitastor-1.3.1.el7.tar.gz

 BuildRequires: liburing-devel >= 0.6
 BuildRequires: gperftools-devel
@@ -16,7 +16,7 @@ BuildRequires: jerasure-devel
 BuildRequires: libisa-l-devel
 BuildRequires: gf-complete-devel
 BuildRequires: libibverbs-devel
-BuildRequires: cmake
+BuildRequires: cmake3
 Requires: vitastor-osd = %{version}-%{release}
 Requires: vitastor-mon = %{version}-%{release}
 Requires: vitastor-client = %{version}-%{release}
@@ -94,7 +94,7 @@ Vitastor fio drivers for benchmarking.

 %build
 . /opt/rh/devtoolset-9/enable
-%cmake .
+%cmake3 .
 %make_build

@@ -35,7 +36,7 @@ ADD . /root/vitastor
 RUN set -e; \
     cd /root/vitastor/rpm; \
     sh build-tarball.sh; \
-    cp /root/vitastor-1.0.0.el8.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-1.3.1.el8.tar.gz ~/rpmbuild/SOURCES; \
     cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
     cd ~/rpmbuild/SPECS/; \
     rpmbuild -ba vitastor.spec; \
@@ -1,11 +1,11 @@
 Name: vitastor
-Version: 1.0.0
+Version: 1.3.1
 Release: 1%{?dist}
 Summary: Vitastor, a fast software-defined clustered block storage

 License: Vitastor Network Public License 1.1
 URL: https://vitastor.io/
-Source0: vitastor-1.0.0.el8.tar.gz
+Source0: vitastor-1.3.1.el8.tar.gz

 BuildRequires: liburing-devel >= 0.6
 BuildRequires: gperftools-devel
@@ -18,7 +18,7 @@ ADD . /root/vitastor
 RUN set -e; \
     cd /root/vitastor/rpm; \
     sh build-tarball.sh; \
-    cp /root/vitastor-1.0.0.el9.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-1.3.1.el9.tar.gz ~/rpmbuild/SOURCES; \
     cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
     cd ~/rpmbuild/SPECS/; \
     rpmbuild -ba vitastor.spec; \
@@ -1,11 +1,11 @@
 Name: vitastor
-Version: 1.0.0
+Version: 1.3.1
 Release: 1%{?dist}
 Summary: Vitastor, a fast software-defined clustered block storage

 License: Vitastor Network Public License 1.1
 URL: https://vitastor.io/
-Source0: vitastor-1.0.0.el9.tar.gz
+Source0: vitastor-1.3.1.el9.tar.gz

 BuildRequires: liburing-devel >= 0.6
 BuildRequires: gperftools-devel
@@ -16,10 +16,11 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
     set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
 endif()

-add_definitions(-DVERSION="1.0.0")
-add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
+add_definitions(-DVERSION="1.3.1")
+add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
+add_link_options(-fno-omit-frame-pointer)
 if (${WITH_ASAN})
-    add_definitions(-fsanitize=address -fno-omit-frame-pointer)
+    add_definitions(-fsanitize=address)
     add_link_options(-fsanitize=address -fno-omit-frame-pointer)
 endif (${WITH_ASAN})

@@ -137,6 +138,7 @@ endif (${WITH_FIO})
 add_library(vitastor_client SHARED
     cluster_client.cpp
     cluster_client_list.cpp
+    cluster_client_wb.cpp
    vitastor_c.cpp
     cli_common.cpp
     cli_alloc_osd.cpp
@@ -300,7 +302,7 @@ target_link_libraries(test_crc32
 add_executable(test_cluster_client
     EXCLUDE_FROM_ALL
     test_cluster_client.cpp
-    pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
+    pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp cluster_client_wb.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
     etcd_state_client.cpp timerfd_manager.cpp str_util.cpp ../json11/json11.cpp
 )
 target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)
@@ -45,13 +45,31 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
     meta_block_size = parse_size(config["meta_block_size"]);
     bitmap_granularity = parse_size(config["bitmap_granularity"]);
     meta_format = stoull_full(config["meta_format"]);
-    cached_io_data = config["cached_io_data"] == "true" || config["cached_io_data"] == "yes" || config["cached_io_data"] == "1";
-    cached_io_meta = cached_io_data && (meta_device == data_device || meta_device == "") &&
-        config.find("cached_io_meta") == config.end() ||
-        config["cached_io_meta"] == "true" || config["cached_io_meta"] == "yes" || config["cached_io_meta"] == "1";
-    cached_io_journal = cached_io_meta && (journal_device == meta_device || journal_device == "") &&
-        config.find("cached_io_journal") == config.end() ||
-        config["cached_io_journal"] == "true" || config["cached_io_journal"] == "yes" || config["cached_io_journal"] == "1";
+    if (config.find("data_io") == config.end() &&
+        config.find("meta_io") == config.end() &&
+        config.find("journal_io") == config.end())
+    {
+        bool cached_io_data = config["cached_io_data"] == "true" || config["cached_io_data"] == "yes" || config["cached_io_data"] == "1";
+        bool cached_io_meta = cached_io_data && (meta_device == data_device || meta_device == "") &&
+            config.find("cached_io_meta") == config.end() ||
+            config["cached_io_meta"] == "true" || config["cached_io_meta"] == "yes" || config["cached_io_meta"] == "1";
+        bool cached_io_journal = cached_io_meta && (journal_device == meta_device || journal_device == "") &&
+            config.find("cached_io_journal") == config.end() ||
+            config["cached_io_journal"] == "true" || config["cached_io_journal"] == "yes" || config["cached_io_journal"] == "1";
+        data_io = cached_io_data ? "cached" : "direct";
+        meta_io = cached_io_meta ? "cached" : "direct";
+        journal_io = cached_io_journal ? "cached" : "direct";
+    }
+    else
+    {
+        data_io = config.find("data_io") != config.end() ? config["data_io"] : "direct";
+        meta_io = config.find("meta_io") != config.end()
+            ? config["meta_io"]
+            : (meta_device == data_device || meta_device == "" ? data_io : "direct");
+        journal_io = config.find("journal_io") != config.end()
+            ? config["journal_io"]
+            : (journal_device == meta_device || journal_device == "" ? meta_io : "direct");
+    }
     if (config["data_csum_type"] == "crc32c")
     {
         data_csum_type = BLOCKSTORE_CSUM_CRC32C;
@@ -272,9 +290,19 @@ static void check_size(int fd, uint64_t *size, uint64_t *sectsize, std::string n
     }
 }

+static int bs_openmode(const std::string & mode)
+{
+    if (mode == "directsync")
+        return O_DIRECT|O_SYNC;
+    else if (mode == "cached")
+        return O_SYNC;
+    else
+        return O_DIRECT;
+}
+
 void blockstore_disk_t::open_data()
 {
-    data_fd = open(data_device.c_str(), (cached_io_data ? O_SYNC : O_DIRECT) | O_RDWR);
+    data_fd = open(data_device.c_str(), bs_openmode(data_io) | O_RDWR);
     if (data_fd == -1)
     {
         throw std::runtime_error("Failed to open data device "+data_device+": "+std::string(strerror(errno)));
@@ -299,9 +327,9 @@ void blockstore_disk_t::open_data()

 void blockstore_disk_t::open_meta()
 {
-    if (meta_device != data_device || cached_io_meta != cached_io_data)
+    if (meta_device != data_device || meta_io != data_io)
     {
-        meta_fd = open(meta_device.c_str(), (cached_io_meta ? O_SYNC : O_DIRECT) | O_RDWR);
+        meta_fd = open(meta_device.c_str(), bs_openmode(meta_io) | O_RDWR);
         if (meta_fd == -1)
         {
             throw std::runtime_error("Failed to open metadata device "+meta_device+": "+std::string(strerror(errno)));
@@ -337,9 +365,9 @@ void blockstore_disk_t::open_meta()

 void blockstore_disk_t::open_journal()
 {
-    if (journal_device != meta_device || cached_io_journal != cached_io_meta)
+    if (journal_device != meta_device || journal_io != meta_io)
     {
-        journal_fd = open(journal_device.c_str(), (cached_io_journal ? O_SYNC : O_DIRECT) | O_RDWR);
+        journal_fd = open(journal_device.c_str(), bs_openmode(journal_io) | O_RDWR);
         if (journal_fd == -1)
         {
             throw std::runtime_error("Failed to open journal device "+journal_device+": "+std::string(strerror(errno)));
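The hunks above replace three booleans with three mode strings and a shared flag-mapping helper. A minimal standalone sketch of that behaviour, including the default cascade where meta_io inherits data_io (and journal_io inherits meta_io) when the devices are shared; names shortened, not the real structure:

// Sketch: mode strings map to open(2) flags; "cached" keeps the page cache
// but opens with O_SYNC so completed writes are still durable.
#include <fcntl.h>
#include <cstdio>
#include <string>

static int bs_openmode(const std::string & mode)
{
    if (mode == "directsync")
        return O_DIRECT|O_SYNC; // bypass page cache AND sync every write
    else if (mode == "cached")
        return O_SYNC;          // use the Linux page cache
    else
        return O_DIRECT;        // default: direct I/O
}

int main()
{
    std::string data_io = "cached", data_device = "/dev/sda", meta_device = "";
    // metadata on the same device inherits the data I/O mode
    std::string meta_io = (meta_device == data_device || meta_device == "") ? data_io : "direct";
    printf("data flags=%x, meta flags=%x\n", bs_openmode(data_io), bs_openmode(meta_io));
}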
@@ -31,8 +31,9 @@ struct blockstore_disk_t
     uint32_t csum_block_size = 4096;
     // By default, Blockstore locks all opened devices exclusively. This option can be used to disable locking
     bool disable_flock = false;
-    // Use Linux page cache for reads and writes, i.e. open FDs with O_SYNC instead of O_DIRECT
-    bool cached_io_data = false, cached_io_meta = false, cached_io_journal = false;
+    // I/O modes for data, metadata and journal: direct or "" = O_DIRECT, cached = O_SYNC, directsync = O_DIRECT|O_SYNC
+    // O_SYNC without O_DIRECT = use Linux page cache for reads and writes
+    std::string data_io, meta_io, journal_io;

     int meta_fd = -1, data_fd = -1, journal_fd = -1;
     uint64_t meta_offset, meta_device_sect, meta_device_size, meta_len, meta_format = 0;
@@ -1372,7 +1372,8 @@ bool journal_flusher_co::trim_journal(int wait_base)
             ? (uint32_t)JE_START_V1_SIZE : (uint32_t)JE_START_V2_SIZE),
         .reserved = 0,
         .journal_start = new_trim_pos,
-        .version = JOURNAL_VERSION_V2,
+        .version = (uint64_t)(!bs->dsk.data_csum_type && ((journal_entry_start*)flusher->journal_superblock)->version == JOURNAL_VERSION_V1
+            ? JOURNAL_VERSION_V1 : JOURNAL_VERSION_V2),
         .data_csum_type = bs->dsk.data_csum_type,
         .csum_block_size = bs->dsk.csum_block_size,
     };
@@ -384,6 +384,10 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
         ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
         return;
     }
+    if (op->opcode == BS_OP_SYNC)
+    {
+        unsynced_queued_ops = 0;
+    }
     init_op(op);
     submit_queue.push_back(op);
     ringloop->wakeup();
@@ -262,6 +262,8 @@ class blockstore_impl_t
     int throttle_target_parallelism = 1;
     // Minimum difference in microseconds between target and real execution times to throttle the response
     int throttle_threshold_us = 50;
+    // Maximum writes between automatically added fsync operations
+    uint64_t autosync_writes = 128;
     /******* END OF OPTIONS *******/

     struct ring_consumer_t ring_consumer;
@@ -272,7 +274,8 @@ class blockstore_impl_t
     blockstore_dirty_db_t dirty_db;
     std::vector<blockstore_op_t*> submit_queue;
     std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
-    int unsynced_big_write_count = 0;
+    int unsynced_big_write_count = 0, unstable_unsynced = 0;
+    int unsynced_queued_ops = 0;
     allocator *data_alloc = NULL;
     uint8_t *zero_object;

@@ -553,7 +553,7 @@ resume_1:
     }
     if (je_start->size == JE_START_V0_SIZE ||
         (je_start->version != JOURNAL_VERSION_V1 || je_start->size != JE_START_V1_SIZE) &&
-        (je_start->version != JOURNAL_VERSION_V2 || je_start->size != JE_START_V2_SIZE))
+        (je_start->version != JOURNAL_VERSION_V2 || je_start->size != JE_START_V2_SIZE && je_start->size != JE_START_V1_SIZE))
     {
         fprintf(
             stderr, "The code only supports journal versions 2 and 1, but it is %lu on disk."
@@ -562,7 +562,8 @@ resume_1:
         );
         exit(1);
     }
-    if (je_start->version == JOURNAL_VERSION_V1)
+    if (je_start->version == JOURNAL_VERSION_V1 ||
+        je_start->version == JOURNAL_VERSION_V2 && je_start->size == JE_START_V1_SIZE)
     {
         je_start->data_csum_type = 0;
         je_start->csum_block_size = 0;
|
|||||||
resume:
|
resume:
|
||||||
while (pos < bs->journal.block_size)
|
while (pos < bs->journal.block_size)
|
||||||
{
|
{
|
||||||
journal_entry *je = (journal_entry*)((uint8_t*)buf + proc_pos - done_pos + pos);
|
auto buf_pos = proc_pos - done_pos + pos;
|
||||||
if (je->magic != JOURNAL_MAGIC || je_crc32(je) != je->crc32 ||
|
journal_entry *je = (journal_entry*)((uint8_t*)buf + buf_pos);
|
||||||
|
if (je->magic != JOURNAL_MAGIC || buf_pos+je->size > len || je_crc32(je) != je->crc32 ||
|
||||||
je->type < JE_MIN || je->type > JE_MAX || started && je->crc32_prev != crc32_last)
|
je->type < JE_MIN || je->type > JE_MAX || started && je->crc32_prev != crc32_last)
|
||||||
{
|
{
|
||||||
if (pos == 0)
|
if (pos == 0)
|
||||||
|
@@ -144,7 +144,10 @@ journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type,
|
|||||||
journal.sector_info[journal.cur_sector].written = false;
|
journal.sector_info[journal.cur_sector].written = false;
|
||||||
journal.sector_info[journal.cur_sector].offset = journal.next_free;
|
journal.sector_info[journal.cur_sector].offset = journal.next_free;
|
||||||
journal.in_sector_pos = 0;
|
journal.in_sector_pos = 0;
|
||||||
journal.next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
|
auto next_next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
|
||||||
|
// double check that next_free doesn't cross used_start from the left
|
||||||
|
assert(journal.next_free >= journal.used_start || next_next_free < journal.used_start);
|
||||||
|
journal.next_free = next_next_free;
|
||||||
memset(journal.inmemory
|
memset(journal.inmemory
|
||||||
? (uint8_t*)journal.buffer + journal.sector_info[journal.cur_sector].offset
|
? (uint8_t*)journal.buffer + journal.sector_info[journal.cur_sector].offset
|
||||||
: (uint8_t*)journal.sector_buf + journal.block_size*journal.cur_sector, 0, journal.block_size);
|
: (uint8_t*)journal.sector_buf + journal.block_size*journal.cur_sector, 0, journal.block_size);
|
||||||
|
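The new assert encodes the circular-buffer invariant of the journal: the write head (next_free) may wrap past the end of the journal, but it must never cross the tail (used_start, the oldest untrimmed data) from the left. A standalone model of that check, under the assumption of a simplified ring with fixed block-sized steps:

// Sketch: advancing the journal head with the wraparound invariant.
#include <cassert>
#include <cstdint>

static uint64_t advance(uint64_t next_free, uint64_t used_start, uint64_t block, uint64_t len)
{
    uint64_t next_next_free = (next_free + block) < len ? next_free + block : block;
    // either the head is still at/after the tail, or, having wrapped,
    // it stays strictly to the left of the tail
    assert(next_free >= used_start || next_next_free < used_start);
    return next_next_free;
}

int main()
{
    uint64_t head = 4096, tail = 12288;
    head = advance(head, tail, 4096, 16384); // ok: head becomes 8192, still left of the tail
    // advance(8192, 12288, 4096, 16384) would land exactly on the tail and
    // trip the assert: the journal is out of space, not merely wrapping.
}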
@@ -13,12 +13,6 @@
 #define JOURNAL_BUFFER_SIZE 4*1024*1024
 #define JOURNAL_ENTRY_HEADER_SIZE 16

-// We reserve some extra space for future stabilize requests during writes
-// FIXME: This value should be dynamic i.e. Blockstore ideally shouldn't allow
-// writing more than can be stabilized afterwards
-#define JOURNAL_STABILIZE_RESERVATION 65536
-#define JOURNAL_INSTANT_RESERVATION 131072
-
 // Journal entries
 // Journal entries are linked to each other by their crc32 value
 // The journal is almost a blockchain, because object versions constantly increase
@@ -19,6 +19,10 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
     throttle_target_mbs = strtoull(config["throttle_target_mbs"].c_str(), NULL, 10);
     throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10);
     throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10);
+    if (config.find("autosync_writes") != config.end())
+    {
+        autosync_writes = strtoull(config["autosync_writes"].c_str(), NULL, 10);
+    }
     if (!max_flusher_count)
     {
         max_flusher_count = 256;
@@ -86,14 +86,15 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
             auto & dirty_entry = dirty_db.at(sbw);
             uint64_t dyn_size = dsk.dirty_dyn_size(dirty_entry.offset, dirty_entry.len);
             if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
-                left == 0 ? JOURNAL_STABILIZE_RESERVATION : 0))
+                (unstable_writes.size()+unstable_unsynced)*journal.block_size))
             {
                 return 0;
             }
         }
     }
     else if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(),
-        sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, JOURNAL_STABILIZE_RESERVATION))
+        sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size,
+        (unstable_writes.size()+unstable_unsynced)*journal.block_size))
     {
         return 0;
     }
|
|||||||
{
|
{
|
||||||
mark_stable(dirty_it->first);
|
mark_stable(dirty_it->first);
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
unstable_unsynced--;
|
||||||
|
assert(unstable_unsynced >= 0);
|
||||||
|
}
|
||||||
dirty_it++;
|
dirty_it++;
|
||||||
while (dirty_it != dirty_db.end() && dirty_it->first.oid == it->oid)
|
while (dirty_it != dirty_db.end() && dirty_it->first.oid == it->oid)
|
||||||
{
|
{
|
||||||
@@ -214,6 +220,11 @@ void blockstore_impl_t::ack_sync(blockstore_op_t *op)
|
|||||||
{
|
{
|
||||||
mark_stable(*it);
|
mark_stable(*it);
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
unstable_unsynced--;
|
||||||
|
assert(unstable_unsynced >= 0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
op->retval = 0;
|
op->retval = 0;
|
||||||
|
@@ -21,7 +21,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
|||||||
dyn = calloc_or_die(1, dyn_size+sizeof(int));
|
dyn = calloc_or_die(1, dyn_size+sizeof(int));
|
||||||
*((int*)dyn) = 1;
|
*((int*)dyn) = 1;
|
||||||
}
|
}
|
||||||
uint8_t *dyn_ptr = (uint8_t*)(alloc_dyn_data ? dyn+sizeof(int) : &dyn);
|
uint8_t *dyn_ptr = (alloc_dyn_data ? (uint8_t*)dyn+sizeof(int) : (uint8_t*)&dyn);
|
||||||
uint64_t version = 1;
|
uint64_t version = 1;
|
||||||
if (dirty_db.size() > 0)
|
if (dirty_db.size() > 0)
|
||||||
{
|
{
|
||||||
@@ -127,8 +127,9 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (wait_big && !is_del && !deleted && op->len < dsk.data_block_size &&
|
bool imm = (op->len < dsk.data_block_size ? (immediate_commit != IMMEDIATE_NONE) : (immediate_commit == IMMEDIATE_ALL));
|
||||||
immediate_commit != IMMEDIATE_ALL)
|
if (wait_big && !is_del && !deleted && op->len < dsk.data_block_size && !imm ||
|
||||||
|
!imm && unsynced_queued_ops >= autosync_writes)
|
||||||
{
|
{
|
||||||
// Issue an additional sync so that the previous big write can reach the journal
|
// Issue an additional sync so that the previous big write can reach the journal
|
||||||
blockstore_op_t *sync_op = new blockstore_op_t;
|
blockstore_op_t *sync_op = new blockstore_op_t;
|
||||||
@@ -139,6 +140,8 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
|||||||
};
|
};
|
||||||
enqueue_op(sync_op);
|
enqueue_op(sync_op);
|
||||||
}
|
}
|
||||||
|
else if (!imm)
|
||||||
|
unsynced_queued_ops++;
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
if (is_del)
|
if (is_del)
|
||||||
printf("Delete %lx:%lx v%lu\n", op->oid.inode, op->oid.stripe, op->version);
|
printf("Delete %lx:%lx v%lu\n", op->oid.inode, op->oid.stripe, op->version);
|
||||||
@@ -317,7 +320,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
|||||||
blockstore_journal_check_t space_check(this);
|
blockstore_journal_check_t space_check(this);
|
||||||
if (!space_check.check_available(op, unsynced_big_write_count + 1,
|
if (!space_check.check_available(op, unsynced_big_write_count + 1,
|
||||||
sizeof(journal_entry_big_write) + dsk.clean_dyn_size,
|
sizeof(journal_entry_big_write) + dsk.clean_dyn_size,
|
||||||
(dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION))
|
(unstable_writes.size()+unstable_unsynced)*journal.block_size))
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@@ -383,6 +386,10 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
|||||||
sqe, dsk.data_fd, PRIV(op)->iov_zerofill, vcnt, dsk.data_offset + (loc << dsk.block_order) + op->offset - stripe_offset
|
sqe, dsk.data_fd, PRIV(op)->iov_zerofill, vcnt, dsk.data_offset + (loc << dsk.block_order) + op->offset - stripe_offset
|
||||||
);
|
);
|
||||||
PRIV(op)->pending_ops = 1;
|
PRIV(op)->pending_ops = 1;
|
||||||
|
if (!(dirty_it->second.state & BS_ST_INSTANT))
|
||||||
|
{
|
||||||
|
unstable_unsynced++;
|
||||||
|
}
|
||||||
if (immediate_commit != IMMEDIATE_ALL)
|
if (immediate_commit != IMMEDIATE_ALL)
|
||||||
{
|
{
|
||||||
// Increase the counter, but don't save into unsynced_writes yet (can't sync until the write is finished)
|
// Increase the counter, but don't save into unsynced_writes yet (can't sync until the write is finished)
|
||||||
@@ -405,7 +412,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
|||||||
sizeof(journal_entry_big_write) + dsk.clean_dyn_size, 0)
|
sizeof(journal_entry_big_write) + dsk.clean_dyn_size, 0)
|
||||||
|| !space_check.check_available(op, 1,
|
|| !space_check.check_available(op, 1,
|
||||||
sizeof(journal_entry_small_write) + dyn_size,
|
sizeof(journal_entry_small_write) + dyn_size,
|
||||||
op->len + ((dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION)))
|
op->len + (unstable_writes.size()+unstable_unsynced)*journal.block_size))
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@@ -455,6 +462,8 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
|||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// double check that next_free doesn't cross used_start from the left
|
||||||
|
assert(journal.next_free >= journal.used_start || next_next_free < journal.used_start);
|
||||||
journal.next_free = next_next_free;
|
journal.next_free = next_next_free;
|
||||||
je->oid = op->oid;
|
je->oid = op->oid;
|
||||||
je->version = op->version;
|
je->version = op->version;
|
||||||
@@ -492,10 +501,15 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
|||||||
}
|
}
|
||||||
dirty_it->second.location = journal.next_free;
|
dirty_it->second.location = journal.next_free;
|
||||||
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SUBMITTED;
|
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SUBMITTED;
|
||||||
journal.next_free += op->len;
|
next_next_free = journal.next_free + op->len;
|
||||||
if (journal.next_free >= journal.len)
|
if (next_next_free >= journal.len)
|
||||||
|
next_next_free = dsk.journal_block_size;
|
||||||
|
// double check that next_free doesn't cross used_start from the left
|
||||||
|
assert(journal.next_free >= journal.used_start || next_next_free < journal.used_start);
|
||||||
|
journal.next_free = next_next_free;
|
||||||
|
if (!(dirty_it->second.state & BS_ST_INSTANT))
|
||||||
{
|
{
|
||||||
journal.next_free = dsk.journal_block_size;
|
unstable_unsynced++;
|
||||||
}
|
}
|
||||||
if (!PRIV(op)->pending_ops)
|
if (!PRIV(op)->pending_ops)
|
||||||
{
|
{
|
||||||
@@ -535,7 +549,7 @@ resume_2:
|
|||||||
uint64_t dyn_size = dsk.dirty_dyn_size(op->offset, op->len);
|
uint64_t dyn_size = dsk.dirty_dyn_size(op->offset, op->len);
|
||||||
blockstore_journal_check_t space_check(this);
|
blockstore_journal_check_t space_check(this);
|
||||||
if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
|
if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
|
||||||
((dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION)))
|
(unstable_writes.size()+unstable_unsynced)*journal.block_size))
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@@ -579,14 +593,20 @@ resume_4:
|
|||||||
#endif
|
#endif
|
||||||
bool is_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE;
|
bool is_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE;
|
||||||
bool imm = is_big ? (immediate_commit == IMMEDIATE_ALL) : (immediate_commit != IMMEDIATE_NONE);
|
bool imm = is_big ? (immediate_commit == IMMEDIATE_ALL) : (immediate_commit != IMMEDIATE_NONE);
|
||||||
|
bool is_instant = ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT));
|
||||||
if (imm)
|
if (imm)
|
||||||
{
|
{
|
||||||
auto & unstab = unstable_writes[op->oid];
|
auto & unstab = unstable_writes[op->oid];
|
||||||
unstab = unstab < op->version ? op->version : unstab;
|
unstab = unstab < op->version ? op->version : unstab;
|
||||||
|
if (!is_instant)
|
||||||
|
{
|
||||||
|
unstable_unsynced--;
|
||||||
|
assert(unstable_unsynced >= 0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK)
|
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK)
|
||||||
| (imm ? BS_ST_SYNCED : BS_ST_WRITTEN);
|
| (imm ? BS_ST_SYNCED : BS_ST_WRITTEN);
|
||||||
if (imm && ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT)))
|
if (imm && is_instant)
|
||||||
{
|
{
|
||||||
// Deletions and 'instant' operations are treated as immediately stable
|
// Deletions and 'instant' operations are treated as immediately stable
|
||||||
mark_stable(dirty_it->first);
|
mark_stable(dirty_it->first);
|
||||||
@@ -732,7 +752,7 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
|
|||||||
});
|
});
|
||||||
assert(dirty_it != dirty_db.end());
|
assert(dirty_it != dirty_db.end());
|
||||||
blockstore_journal_check_t space_check(this);
|
blockstore_journal_check_t space_check(this);
|
||||||
if (!space_check.check_available(op, 1, sizeof(journal_entry_del), JOURNAL_INSTANT_RESERVATION))
|
if (!space_check.check_available(op, 1, sizeof(journal_entry_del), (unstable_writes.size()+unstable_unsynced)*journal.block_size))
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@@ -17,7 +17,7 @@
|
|||||||
static const char *exe_name = NULL;
|
static const char *exe_name = NULL;
|
||||||
|
|
||||||
static const char* help_text =
|
static const char* help_text =
|
||||||
"Vitastor command-line tool\n"
|
"Vitastor command-line tool " VERSION "\n"
|
||||||
"(c) Vitaliy Filippov, 2019+ (VNPL-1.1)\n"
|
"(c) Vitaliy Filippov, 2019+ (VNPL-1.1)\n"
|
||||||
"\n"
|
"\n"
|
||||||
"COMMANDS:\n"
|
"COMMANDS:\n"
|
||||||
@@ -116,7 +116,8 @@ static const char* help_text =
|
|||||||
"Use vitastor-cli --help <command> for command details or vitastor-cli --help --all for all details.\n"
|
"Use vitastor-cli --help <command> for command details or vitastor-cli --help --all for all details.\n"
|
||||||
"\n"
|
"\n"
|
||||||
"GLOBAL OPTIONS:\n"
|
"GLOBAL OPTIONS:\n"
|
||||||
" --etcd_address <etcd_address>\n"
|
" --config_file FILE Path to Vitastor configuration file\n"
|
||||||
|
" --etcd_address URL Etcd connection address\n"
|
||||||
" --iodepth N Send N operations in parallel to each OSD when possible (default 32)\n"
|
" --iodepth N Send N operations in parallel to each OSD when possible (default 32)\n"
|
||||||
" --parallel_osds M Work with M osds in parallel when possible (default 4)\n"
|
" --parallel_osds M Work with M osds in parallel when possible (default 4)\n"
|
||||||
" --progress 1|0 Report progress (default 1)\n"
|
" --progress 1|0 Report progress (default 1)\n"
|
||||||
@@ -331,7 +332,7 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
|
|||||||
{
|
{
|
||||||
// Create client
|
// Create client
|
||||||
json11::Json cfg_j = cfg;
|
json11::Json cfg_j = cfg;
|
||||||
p->ringloop = new ring_loop_t(512);
|
p->ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
|
||||||
p->epmgr = new epoll_manager_t(p->ringloop);
|
p->epmgr = new epoll_manager_t(p->ringloop);
|
||||||
p->cli = new cluster_client_t(p->ringloop, p->epmgr->tfd, cfg_j);
|
p->cli = new cluster_client_t(p->ringloop, p->epmgr->tfd, cfg_j);
|
||||||
// Smaller timeout by default for more interactiveness
|
// Smaller timeout by default for more interactiveness
|
||||||
@@ -349,6 +350,7 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
|
|||||||
p->ringloop->wait();
|
p->ringloop->wait();
|
||||||
}
|
}
|
||||||
// Destroy the client
|
// Destroy the client
|
||||||
|
p->cli->flush();
|
||||||
delete p->cli;
|
delete p->cli;
|
||||||
delete p->epmgr;
|
delete p->epmgr;
|
||||||
delete p->ringloop;
|
delete p->ringloop;
|
||||||
|
@@ -109,7 +109,7 @@ resume_1:
|
|||||||
}
|
}
|
||||||
for (auto pg_per_pair: pg_per_osd)
|
for (auto pg_per_pair: pg_per_osd)
|
||||||
{
|
{
|
||||||
uint64_t pg_free = osd_free[pg_per_pair.first] * pool_cfg.pg_count / pg_per_pair.second;
|
uint64_t pg_free = osd_free[pg_per_pair.first] * pool_cfg.real_pg_count / pg_per_pair.second;
|
||||||
if (pool_avail > pg_free)
|
if (pool_avail > pg_free)
|
||||||
{
|
{
|
||||||
pool_avail = pg_free;
|
pool_avail = pg_free;
|
||||||
@@ -124,8 +124,10 @@ resume_1:
|
|||||||
pool_avail *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
|
pool_avail *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
|
||||||
}
|
}
|
||||||
pool_stats[pool_cfg.id] = json11::Json::object {
|
pool_stats[pool_cfg.id] = json11::Json::object {
|
||||||
|
{ "id", (uint64_t)pool_cfg.id },
|
||||||
{ "name", pool_cfg.name },
|
{ "name", pool_cfg.name },
|
||||||
{ "pg_count", pool_cfg.pg_count },
|
{ "pg_count", pool_cfg.pg_count },
|
||||||
|
{ "real_pg_count", pool_cfg.real_pg_count },
|
||||||
{ "scheme", pool_cfg.scheme == POOL_SCHEME_REPLICATED ? "replicated" : "ec" },
|
{ "scheme", pool_cfg.scheme == POOL_SCHEME_REPLICATED ? "replicated" : "ec" },
|
||||||
{ "scheme_name", pool_cfg.scheme == POOL_SCHEME_REPLICATED
|
{ "scheme_name", pool_cfg.scheme == POOL_SCHEME_REPLICATED
|
||||||
? std::to_string(pool_cfg.pg_size)+"/"+std::to_string(pool_cfg.pg_minsize)
|
? std::to_string(pool_cfg.pg_size)+"/"+std::to_string(pool_cfg.pg_minsize)
|
||||||
@@ -176,7 +178,7 @@ resume_1:
|
|||||||
{ "title", "SCHEME" },
|
{ "title", "SCHEME" },
|
||||||
});
|
});
|
||||||
cols.push_back(json11::Json::object{
|
cols.push_back(json11::Json::object{
|
||||||
{ "key", "pg_count" },
|
{ "key", "pg_count_fmt" },
|
||||||
{ "title", "PGS" },
|
{ "title", "PGS" },
|
||||||
});
|
});
|
||||||
cols.push_back(json11::Json::object{
|
cols.push_back(json11::Json::object{
|
||||||
@@ -205,6 +207,9 @@ resume_1:
|
|||||||
double raw_to = kv.second["raw_to_usable"].number_value();
|
double raw_to = kv.second["raw_to_usable"].number_value();
|
||||||
if (raw_to < 0.000001 && raw_to > -0.000001)
|
if (raw_to < 0.000001 && raw_to > -0.000001)
|
||||||
raw_to = 1;
|
raw_to = 1;
|
||||||
|
kv.second["pg_count_fmt"] = kv.second["real_pg_count"] == kv.second["pg_count"]
|
||||||
|
? kv.second["real_pg_count"].as_string()
|
||||||
|
: kv.second["real_pg_count"].as_string()+"->"+kv.second["pg_count"].as_string();
|
||||||
kv.second["total_fmt"] = format_size(kv.second["total_raw"].uint64_value() / raw_to);
|
kv.second["total_fmt"] = format_size(kv.second["total_raw"].uint64_value() / raw_to);
|
||||||
kv.second["used_fmt"] = format_size(kv.second["used_raw"].uint64_value() / raw_to);
|
kv.second["used_fmt"] = format_size(kv.second["used_raw"].uint64_value() / raw_to);
|
||||||
kv.second["max_avail_fmt"] = format_size(kv.second["max_available"].uint64_value());
|
kv.second["max_avail_fmt"] = format_size(kv.second["max_available"].uint64_value());
|
||||||
|
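The pool-df hunks above make the PGS column show both PG counts while a pool is being resized. The formatting rule in isolation, using plain integers instead of json11 values:

// Sketch: "real->configured" is shown only while the counts differ.
#include <cstdint>
#include <cstdio>
#include <string>

static std::string pg_count_fmt(uint64_t real_pg_count, uint64_t pg_count)
{
    return real_pg_count == pg_count
        ? std::to_string(real_pg_count)
        : std::to_string(real_pg_count)+"->"+std::to_string(pg_count);
}

int main()
{
    printf("%s\n", pg_count_fmt(256, 256).c_str()); // "256"
    printf("%s\n", pg_count_fmt(128, 256).c_str()); // "128->256" (resize in progress)
}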
@@ -53,6 +53,7 @@ struct snap_merger_t
     std::map<inode_t, std::vector<uint64_t>> layer_lists;
     std::map<inode_t, uint64_t> layer_block_size;
     std::map<inode_t, uint64_t> layer_list_pos;
+    std::vector<snap_rw_op_t*> continue_rwo, continue_rwo2;
     int in_flight = 0;
     uint64_t last_fsync_offset = 0;
     uint64_t last_written_offset = 0;
@@ -304,6 +305,12 @@ struct snap_merger_t
         oit = merge_offsets.begin();
     resume_5:
         // Now read, overwrite and optionally delete offsets one by one
+        continue_rwo2.swap(continue_rwo);
+        for (auto rwo: continue_rwo2)
+        {
+            next_write(rwo);
+        }
+        continue_rwo2.clear();
         while (in_flight < parent->iodepth*parent->parallel_osds &&
             oit != merge_offsets.end() && !rwo_error.size())
         {
@@ -464,7 +471,8 @@ struct snap_merger_t
                 rwo->error_offset = op->offset;
                 rwo->error_read = true;
             }
-            next_write(rwo);
+            continue_rwo.push_back(rwo);
+            parent->ringloop->wakeup();
         };
         parent->cli->execute(op);
     }
@@ -544,11 +552,9 @@ struct snap_merger_t
             }
             // Increment CAS version
             rwo->op.version = subop->version;
-            if (use_cas)
-                next_write(rwo);
-            else
-                autofree_op(rwo);
             delete subop;
+            continue_rwo.push_back(rwo);
+            parent->ringloop->wakeup();
         };
         parent->cli->execute(subop);
     }
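The snap_merger change follows a common event-loop pattern: completion callbacks no longer call next_write() directly (which can recurse into the client mid-completion), they only queue the operation and wake the loop; the resume_5 step then resumes everything queued. A minimal generic model of that deferral, with hypothetical names and no real I/O:

// Sketch: defer continuations from callbacks to the next loop iteration.
#include <cstdio>
#include <vector>

struct request { int step = 0; };
static std::vector<request*> continue_queue;

static void on_complete(request *r)
{
    continue_queue.push_back(r); // defer instead of recursing
}

static void loop_iteration()
{
    std::vector<request*> batch;
    batch.swap(continue_queue); // swap so requests re-queued now run next iteration
    for (auto r: batch)
    {
        r->step++; // resume the request's state machine here
        printf("request resumed at step %d\n", r->step);
    }
}

int main()
{
    request r;
    on_complete(&r);
    loop_iteration(); // prints: request resumed at step 1
}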
@@ -158,12 +158,7 @@ resume_2:
     for (auto & pool_pair: parent->cli->st_cli.pool_config)
     {
         auto & pool_cfg = pool_pair.second;
-        bool active = true;
-        if (pool_cfg.pg_config.size() != pool_cfg.pg_count)
-        {
-            active = false;
-            pgs_by_state["offline"] += pool_cfg.pg_count-pool_cfg.pg_config.size();
-        }
+        bool active = pool_cfg.real_pg_count > 0;
         pool_count++;
         for (auto pg_it = pool_cfg.pg_config.begin(); pg_it != pool_cfg.pg_config.end(); pg_it++)
         {
@@ -3,21 +3,13 @@

 #include <stdexcept>
 #include <assert.h>
-#include "cluster_client.h"
-
-#define SCRAP_BUFFER_SIZE 4*1024*1024
-#define PART_SENT 1
-#define PART_DONE 2
-#define PART_ERROR 4
-#define PART_RETRY 8
-#define CACHE_DIRTY 1
-#define CACHE_FLUSHING 2
-#define CACHE_REPEATING 3
-#define OP_FLUSH_BUFFER 0x02
-#define OP_IMMEDIATE_COMMIT 0x04
+#include "cluster_client_impl.h"
+#include "http_client.h" // json_is_true

 cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
 {
+    wb = new writeback_cache_t();
+
     cli_config = config.object_items();
     file_config = osd_messenger_t::read_config(config);
     config = osd_messenger_t::merge_configs(cli_config, file_config, etcd_global_config, {});
@@ -37,21 +29,15 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
             continue_lists();
             continue_raw_ops(peer_osd);
         }
-        else if (dirty_buffers.size())
+        else
         {
             // peer_osd just dropped connection
             // determine WHICH dirty_buffers are now obsolete and repeat them
-            for (auto & wr: dirty_buffers)
+            if (wb->repeat_ops_for(this, peer_osd) > 0)
             {
-                if (affects_osd(wr.first.inode, wr.first.stripe, wr.second.len, peer_osd) &&
-                    wr.second.state != CACHE_REPEATING)
-                {
-                    // FIXME: Flush in larger parts
-                    flush_buffer(wr.first, &wr.second);
-                }
+                continue_ops();
             }
-            continue_ops();
         }
+    }
     };
     msgr.exec_op = [this](osd_op_t *op)
     {
@@ -78,16 +64,14 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd

 cluster_client_t::~cluster_client_t()
 {
-    for (auto bp: dirty_buffers)
-    {
-        free(bp.second.buf);
-    }
-    dirty_buffers.clear();
+    msgr.repeer_pgs = [](osd_num_t){};
     if (ringloop)
     {
         ringloop->unregister_consumer(&consumer);
     }
     free(scrap_buffer);
+    delete wb;
+    wb = NULL;
 }

 cluster_op_t::~cluster_op_t()
@@ -136,6 +120,19 @@ void cluster_client_t::init_msgr()
     }
 }

+void cluster_client_t::unshift_op(cluster_op_t *op)
+{
+    op->next = op_queue_head;
+    if (op_queue_head)
+    {
+        op_queue_head->prev = op;
+        op_queue_head = op;
+    }
+    else
+        op_queue_tail = op_queue_head = op;
+    inc_wait(op->opcode, op->flags, op->next, 1);
+}
+
 void cluster_client_t::calc_wait(cluster_op_t *op)
 {
     op->prev_wait = 0;
@@ -156,7 +153,7 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
     {
         for (auto prev = op->prev; prev; prev = prev->prev)
         {
-            if (prev->opcode == OSD_OP_SYNC || prev->opcode == OSD_OP_WRITE && !(prev->flags & OP_IMMEDIATE_COMMIT))
+            if (prev->opcode == OSD_OP_SYNC || prev->opcode == OSD_OP_WRITE && (!(prev->flags & OP_IMMEDIATE_COMMIT) || enable_writeback))
             {
                 op->prev_wait++;
             }
@@ -166,68 +163,58 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
     }
     else /* if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP) */
     {
-        for (auto prev = op_queue_head; prev && prev != op; prev = prev->next)
-        {
-            if (prev->opcode == OSD_OP_WRITE && (prev->flags & OP_FLUSH_BUFFER))
-            {
-                op->prev_wait++;
-            }
-            else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ ||
-                prev->opcode == OSD_OP_READ_BITMAP || prev->opcode == OSD_OP_READ_CHAIN_BITMAP)
-            {
-                // Flushes are always in the beginning (we're scanning from the beginning of the queue)
-                break;
-            }
-        }
-        if (!op->prev_wait)
-            continue_rw(op);
+        continue_rw(op);
     }
 }

 void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *next, int inc)
 {
-    if (opcode == OSD_OP_WRITE)
+    if (opcode != OSD_OP_WRITE && opcode != OSD_OP_SYNC)
     {
-        while (next)
-        {
-            auto n2 = next->next;
-            if (next->opcode == OSD_OP_SYNC && !(flags & OP_IMMEDIATE_COMMIT) ||
-                next->opcode == OSD_OP_WRITE && (flags & OP_FLUSH_BUFFER) && !(next->flags & OP_FLUSH_BUFFER) ||
-                (next->opcode == OSD_OP_READ || next->opcode == OSD_OP_READ_BITMAP ||
-                next->opcode == OSD_OP_READ_CHAIN_BITMAP) && (flags & OP_FLUSH_BUFFER))
-            {
-                next->prev_wait += inc;
-                assert(next->prev_wait >= 0);
-                if (!next->prev_wait)
-                {
-                    if (next->opcode == OSD_OP_SYNC)
-                        continue_sync(next);
-                    else
-                        continue_rw(next);
-                }
-            }
-            next = n2;
-        }
+        return;
     }
-    else if (opcode == OSD_OP_SYNC)
+    cluster_op_t *bh_ops_local[32], **bh_ops = bh_ops_local;
+    int bh_op_count = 0, bh_op_max = 32;
+    while (next)
     {
-        while (next)
+        auto n2 = next->next;
+        if (opcode == OSD_OP_WRITE
+            ? (next->opcode == OSD_OP_SYNC && (!(flags & OP_IMMEDIATE_COMMIT) || enable_writeback) ||
+                next->opcode == OSD_OP_WRITE && (flags & OP_FLUSH_BUFFER) && !(next->flags & OP_FLUSH_BUFFER))
+            : (next->opcode == OSD_OP_SYNC || next->opcode == OSD_OP_WRITE))
         {
-            auto n2 = next->next;
-            if (next->opcode == OSD_OP_SYNC || next->opcode == OSD_OP_WRITE)
+            next->prev_wait += inc;
+            assert(next->prev_wait >= 0);
+            if (!next->prev_wait)
             {
-                next->prev_wait += inc;
-                assert(next->prev_wait >= 0);
-                if (!next->prev_wait)
-                {
-                    if (next->opcode == OSD_OP_SYNC)
-                        continue_sync(next);
-                    else
-                        continue_rw(next);
-                }
+                // Kind of std::vector with local "small vector optimisation"
+                if (bh_op_count >= bh_op_max)
+                {
+                    bh_op_max *= 2;
+                    cluster_op_t **n = (cluster_op_t**)malloc_or_die(sizeof(cluster_op_t*) * bh_op_max);
+                    memcpy(n, bh_ops, sizeof(cluster_op_t*) * bh_op_count);
+                    if (bh_ops != bh_ops_local)
+                    {
+                        free(bh_ops);
+                    }
+                    bh_ops = n;
+                }
+                bh_ops[bh_op_count++] = next;
             }
-            next = n2;
         }
+        next = n2;
+    }
+    for (int i = 0; i < bh_op_count; i++)
+    {
+        cluster_op_t *next = bh_ops[i];
+        if (next->opcode == OSD_OP_SYNC)
+            continue_sync(next);
+        else
+            continue_rw(next);
+    }
+    if (bh_ops != bh_ops_local)
+    {
+        free(bh_ops);
     }
 }
|
|||||||
op_queue_tail = op->prev;
|
op_queue_tail = op->prev;
|
||||||
op->next = op->prev = NULL;
|
op->next = op->prev = NULL;
|
||||||
if (flags & OP_FLUSH_BUFFER)
|
if (flags & OP_FLUSH_BUFFER)
|
||||||
|
{
|
||||||
|
// Completed flushes change writeback buffer states,
|
||||||
|
// so the callback should be run before inc_wait()
|
||||||
|
// which may continue following SYNCs, but these SYNCs
|
||||||
|
// should know about the changed buffer state
|
||||||
|
// This is ugly but this is the way we do it
|
||||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||||
if (!(flags & OP_IMMEDIATE_COMMIT))
|
}
|
||||||
|
if (!(flags & OP_IMMEDIATE_COMMIT) || enable_writeback)
|
||||||
|
{
|
||||||
inc_wait(opcode, flags, next, -1);
|
inc_wait(opcode, flags, next, -1);
|
||||||
|
}
|
||||||
|
if (!(flags & OP_FLUSH_BUFFER))
|
||||||
|
{
|
||||||
// Call callback at the end to avoid inconsistencies in prev_wait
|
// Call callback at the end to avoid inconsistencies in prev_wait
|
||||||
// if the callback adds more operations itself
|
// if the callback adds more operations itself
|
||||||
if (!(flags & OP_FLUSH_BUFFER))
|
|
||||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||||
}
|
}
|
||||||
|
if (flags & OP_FLUSH_BUFFER)
|
||||||
|
{
|
||||||
|
int i = 0;
|
||||||
|
while (i < wb->writeback_overflow.size() && wb->writebacks_active < client_max_writeback_iodepth)
|
||||||
|
{
|
||||||
|
execute_internal(wb->writeback_overflow[i]);
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
if (i > 0)
|
||||||
|
{
|
||||||
|
wb->writeback_overflow.erase(wb->writeback_overflow.begin(), wb->writeback_overflow.begin()+i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void cluster_client_t::continue_ops(bool up_retry)
|
void cluster_client_t::continue_ops(bool up_retry)
|
||||||
{
|
{
|
||||||
@@ -295,6 +306,7 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_config)
 {
     this->etcd_global_config = etcd_global_config;
     config = osd_messenger_t::merge_configs(cli_config, file_config, etcd_global_config, {});
+    // client_max_dirty_bytes/client_dirty_limit
     if (config.find("client_max_dirty_bytes") != config.end())
     {
         client_max_dirty_bytes = config["client_max_dirty_bytes"].uint64_value();
@@ -310,11 +322,34 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_config)
     {
         client_max_dirty_bytes = DEFAULT_CLIENT_MAX_DIRTY_BYTES;
     }
+    // client_max_dirty_ops
     client_max_dirty_ops = config["client_max_dirty_ops"].uint64_value();
     if (!client_max_dirty_ops)
     {
         client_max_dirty_ops = DEFAULT_CLIENT_MAX_DIRTY_OPS;
     }
+    // client_enable_writeback
+    enable_writeback = json_is_true(config["client_enable_writeback"]) &&
+        json_is_true(config["client_writeback_allowed"]);
+    // client_max_buffered_bytes
+    client_max_buffered_bytes = config["client_max_buffered_bytes"].uint64_value();
+    if (!client_max_buffered_bytes)
+    {
+        client_max_buffered_bytes = DEFAULT_CLIENT_MAX_BUFFERED_BYTES;
+    }
+    // client_max_buffered_ops
+    client_max_buffered_ops = config["client_max_buffered_ops"].uint64_value();
+    if (!client_max_buffered_ops)
+    {
+        client_max_buffered_ops = DEFAULT_CLIENT_MAX_BUFFERED_OPS;
+    }
+    // client_max_writeback_iodepth
+    client_max_writeback_iodepth = config["client_max_writeback_iodepth"].uint64_value();
+    if (!client_max_writeback_iodepth)
+    {
+        client_max_writeback_iodepth = DEFAULT_CLIENT_MAX_WRITEBACK_IODEPTH;
+    }
+    // up_wait_retry_interval
     up_wait_retry_interval = config["up_wait_retry_interval"].uint64_value();
     if (!up_wait_retry_interval)
     {
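Note that writeback is only enabled when both client_enable_writeback and client_writeback_allowed parse as true: the first expresses user intent, the second is an extra switch that the embedding program is apparently expected to set when it can guarantee flushing. A hedged configuration sketch using json11 (the JSON library the client already uses; include path and values are illustrative, the values just repeat the defaults):

    #include "json11/json11.hpp"

    // Options consumed by on_load_config_hook() above:
    json11::Json::object client_config = {
        { "client_enable_writeback", true },
        { "client_writeback_allowed", true },
        { "client_max_buffered_bytes", 32*1024*1024 },
        { "client_max_buffered_ops", 1024 },
        { "client_max_writeback_iodepth", 256 },
    };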
@@ -374,6 +409,8 @@ void cluster_client_t::on_change_hook(std::map<std::string, etcd_kv_t> & changes)
 
 bool cluster_client_t::get_immediate_commit(uint64_t inode)
 {
+    if (enable_writeback)
+        return false;
     pool_id_t pool_id = INODE_POOL(inode);
     if (!pool_id)
         return true;
@@ -408,6 +445,41 @@ void cluster_client_t::on_ready(std::function<void(void)> fn)
     }
 }
 
+bool cluster_client_t::flush()
+{
+    if (!ringloop)
+    {
+        if (wb->writeback_queue.size())
+        {
+            wb->start_writebacks(this, 0);
+            cluster_op_t *sync = new cluster_op_t;
+            sync->opcode = OSD_OP_SYNC;
+            sync->callback = [](cluster_op_t *sync)
+            {
+                delete sync;
+            };
+            execute(sync);
+        }
+        return op_queue_head == NULL;
+    }
+    bool sync_done = false;
+    cluster_op_t *sync = new cluster_op_t;
+    sync->opcode = OSD_OP_SYNC;
+    sync->callback = [&sync_done](cluster_op_t *sync)
+    {
+        delete sync;
+        sync_done = true;
+    };
+    execute(sync);
+    while (!sync_done)
+    {
+        ringloop->loop();
+        if (!sync_done)
+            ringloop->wait();
+    }
+    return true;
+}
+
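The blocking branch of flush() above is the usual "completion flag plus event loop" idiom: submit a SYNC whose callback sets a flag, then alternate loop()/wait() until it flips. A self-contained model with a trivial stand-in for ring_loop_t:

    #include <cstdio>
    #include <functional>
    #include <queue>

    struct fake_ringloop
    {
        std::queue<std::function<void()>> events;
        void loop() { while (!events.empty()) { events.front()(); events.pop(); } }
        void wait() { /* the real loop would block on io_uring completions here */ }
    };

    int main()
    {
        fake_ringloop ring;
        bool sync_done = false;
        // "submit" a SYNC whose completion callback sets the flag
        ring.events.push([&sync_done](){ sync_done = true; printf("sync completed\n"); });
        while (!sync_done)
        {
            ring.loop();     // process completions
            if (!sync_done)
                ring.wait(); // sleep until more completions arrive
        }
    }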
 /**
  * How writes are synced when immediate_commit is false
  *
@@ -428,6 +500,9 @@ void cluster_client_t::on_ready(std::function<void(void)> fn)
  * 3) if yes, send all SYNCs. otherwise, leave current SYNC as is.
  * 4) if any of them fail due to disconnected peers, repeat SYNC after repeating all writes
  * 5) if any of them fail due to other errors, fail the SYNC operation
+ *
+ * If writeback caching is turned on and writeback limit is not exhausted:
+ *   data is just copied and the write is confirmed to the client.
  */
 void cluster_client_t::execute(cluster_op_t *op)
 {
@@ -443,67 +518,73 @@ void cluster_client_t::execute(cluster_op_t *op)
         offline_ops.push_back(op);
         return;
     }
+    op->flags = op->flags & OSD_OP_IGNORE_READONLY; // the only allowed flag
+    execute_internal(op);
+}
+
+void cluster_client_t::execute_internal(cluster_op_t *op)
+{
     op->cur_inode = op->inode;
     op->retval = 0;
-    op->flags = op->flags & OSD_OP_IGNORE_READONLY; // single allowed flag
-    if (op->opcode != OSD_OP_SYNC)
+    // check alignment, readonly flag and so on
+    if (!check_rw(op))
     {
-        pool_id_t pool_id = INODE_POOL(op->cur_inode);
-        if (!pool_id)
-        {
-            op->retval = -EINVAL;
-            std::function<void(cluster_op_t*)>(op->callback)(op);
-            return;
-        }
-        auto pool_it = st_cli.pool_config.find(pool_id);
-        if (pool_it == st_cli.pool_config.end() || pool_it->second.real_pg_count == 0)
-        {
-            // Pools are loaded, but this one is unknown
-            op->retval = -EINVAL;
-            std::function<void(cluster_op_t*)>(op->callback)(op);
-            return;
-        }
-        // Check alignment
-        if (!op->len && (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP || op->opcode == OSD_OP_WRITE) ||
-            op->offset % pool_it->second.bitmap_granularity || op->len % pool_it->second.bitmap_granularity)
-        {
-            op->retval = -EINVAL;
-            std::function<void(cluster_op_t*)>(op->callback)(op);
-            return;
-        }
-        if (pool_it->second.immediate_commit == IMMEDIATE_ALL)
-        {
-            op->flags |= OP_IMMEDIATE_COMMIT;
-        }
+        return;
     }
+    if (op->opcode == OSD_OP_WRITE && enable_writeback && !(op->flags & OP_FLUSH_BUFFER) &&
+        !op->version /* FIXME no CAS writeback */)
+    {
+        if (wb->writebacks_active >= client_max_writeback_iodepth)
+        {
+            // Writeback queue is full, postpone the operation
+            wb->writeback_overflow.push_back(op);
+            return;
+        }
+        // Just copy and acknowledge the operation
+        wb->copy_write(op, CACHE_DIRTY);
+        while (wb->writeback_bytes + op->len > client_max_buffered_bytes || wb->writeback_queue_size > client_max_buffered_ops)
+        {
+            // Initiate some writeback (asynchronously)
+            wb->start_writebacks(this, 1);
+        }
+        op->retval = op->len;
+        std::function<void(cluster_op_t*)>(op->callback)(op);
+        return;
+    }
     if (op->opcode == OSD_OP_WRITE && !(op->flags & OP_IMMEDIATE_COMMIT))
     {
+        if (!(op->flags & OP_FLUSH_BUFFER))
+        {
+            wb->copy_write(op, CACHE_WRITTEN);
+        }
         if (dirty_bytes >= client_max_dirty_bytes || dirty_ops >= client_max_dirty_ops)
         {
             // Push an extra SYNC operation to flush previous writes
             cluster_op_t *sync_op = new cluster_op_t;
             sync_op->opcode = OSD_OP_SYNC;
+            sync_op->flags = OP_FLUSH_BUFFER;
             sync_op->callback = [](cluster_op_t* sync_op)
             {
                 delete sync_op;
             };
-            sync_op->prev = op_queue_tail;
-            if (op_queue_tail)
-            {
-                op_queue_tail->next = sync_op;
-                op_queue_tail = sync_op;
-            }
-            else
-                op_queue_tail = op_queue_head = sync_op;
-            dirty_bytes = 0;
-            dirty_ops = 0;
-            calc_wait(sync_op);
+            execute_internal(sync_op);
         }
         dirty_bytes += op->len;
         dirty_ops++;
     }
     else if (op->opcode == OSD_OP_SYNC)
     {
+        // Flush the whole write-back queue first
+        if (!(op->flags & OP_FLUSH_BUFFER) && wb->writeback_overflow.size() > 0)
+        {
+            // Writeback queue is full, postpone the operation
+            wb->writeback_overflow.push_back(op);
+            return;
+        }
+        if (wb->writeback_queue.size())
+        {
+            wb->start_writebacks(this, 0);
+        }
         dirty_bytes = 0;
         dirty_ops = 0;
     }
@@ -515,7 +596,7 @@ void cluster_client_t::execute(cluster_op_t *op)
     }
     else
        op_queue_tail = op_queue_head = op;
-    if (!(op->flags & OP_IMMEDIATE_COMMIT))
+    if (!(op->flags & OP_IMMEDIATE_COMMIT) || enable_writeback)
        calc_wait(op);
     else
     {
@@ -526,6 +607,52 @@ void cluster_client_t::execute(cluster_op_t *op)
     }
 }
 
+bool cluster_client_t::check_rw(cluster_op_t *op)
+{
+    if (op->opcode == OSD_OP_SYNC)
+    {
+        return true;
+    }
+    pool_id_t pool_id = INODE_POOL(op->cur_inode);
+    if (!pool_id)
+    {
+        op->retval = -EINVAL;
+        std::function<void(cluster_op_t*)>(op->callback)(op);
+        return false;
+    }
+    auto pool_it = st_cli.pool_config.find(pool_id);
+    if (pool_it == st_cli.pool_config.end() || pool_it->second.real_pg_count == 0)
+    {
+        // Pools are loaded, but this one is unknown
+        op->retval = -EINVAL;
+        std::function<void(cluster_op_t*)>(op->callback)(op);
+        return false;
+    }
+    // Check alignment
+    if (!op->len && (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP || op->opcode == OSD_OP_WRITE) ||
+        op->offset % pool_it->second.bitmap_granularity || op->len % pool_it->second.bitmap_granularity)
+    {
+        op->retval = -EINVAL;
+        std::function<void(cluster_op_t*)>(op->callback)(op);
+        return false;
+    }
+    if (pool_it->second.immediate_commit == IMMEDIATE_ALL)
+    {
+        op->flags |= OP_IMMEDIATE_COMMIT;
+    }
+    if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE) && !(op->flags & OSD_OP_IGNORE_READONLY))
+    {
+        auto ino_it = st_cli.inode_config.find(op->inode);
+        if (ino_it != st_cli.inode_config.end() && ino_it->second.readonly)
+        {
+            op->retval = -EROFS;
+            std::function<void(cluster_op_t*)>(op->callback)(op);
+            return false;
+        }
+    }
+    return true;
+}
+
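check_rw() above rejects zero-length and unaligned requests with -EINVAL before anything is queued. A worked standalone example of the granularity test (4096 is just a typical bitmap_granularity value, not a constant from the patch):

    #include <cstdint>
    #include <cstdio>

    static bool aligned_ok(uint64_t offset, uint64_t len, uint64_t granularity)
    {
        // mirrors the check in check_rw(): non-empty and aligned on both ends
        return len != 0 && offset % granularity == 0 && len % granularity == 0;
    }

    int main()
    {
        uint64_t g = 4096;
        printf("%d\n", aligned_ok(8192, 4096, g)); // 1: aligned
        printf("%d\n", aligned_ok(8192, 1000, g)); // 0: length not a multiple of granularity
        printf("%d\n", aligned_ok(100, 4096, g));  // 0: offset misaligned
    }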
 void cluster_client_t::execute_raw(osd_num_t osd_num, osd_op_t *op)
 {
     auto fd_it = msgr.osd_peer_fds.find(osd_num);
@@ -543,114 +670,6 @@ void cluster_client_t::execute_raw(osd_num_t osd_num, osd_op_t *op)
     }
 }
 
-void cluster_client_t::copy_write(cluster_op_t *op, std::map<object_id, cluster_buffer_t> & dirty_buffers)
-{
-    // Save operation for replay when one of PGs goes out of sync
-    // (primary OSD drops our connection in this case)
-    auto dirty_it = dirty_buffers.lower_bound((object_id){
-        .inode = op->inode,
-        .stripe = op->offset,
-    });
-    while (dirty_it != dirty_buffers.begin())
-    {
-        dirty_it--;
-        if (dirty_it->first.inode != op->inode ||
-            (dirty_it->first.stripe + dirty_it->second.len) <= op->offset)
-        {
-            dirty_it++;
-            break;
-        }
-    }
-    uint64_t pos = op->offset, len = op->len, iov_idx = 0, iov_pos = 0;
-    while (len > 0)
-    {
-        uint64_t new_len = 0;
-        if (dirty_it == dirty_buffers.end() || dirty_it->first.inode != op->inode)
-        {
-            new_len = len;
-        }
-        else if (dirty_it->first.stripe > pos)
-        {
-            new_len = dirty_it->first.stripe - pos;
-            if (new_len > len)
-            {
-                new_len = len;
-            }
-        }
-        if (new_len > 0)
-        {
-            dirty_it = dirty_buffers.emplace_hint(dirty_it, (object_id){
-                .inode = op->inode,
-                .stripe = pos,
-            }, (cluster_buffer_t){
-                .buf = malloc_or_die(new_len),
-                .len = new_len,
-            });
-        }
-        // FIXME: Split big buffers into smaller ones on overwrites. But this will require refcounting
-        dirty_it->second.state = CACHE_DIRTY;
-        uint64_t cur_len = (dirty_it->first.stripe + dirty_it->second.len - pos);
-        if (cur_len > len)
-        {
-            cur_len = len;
-        }
-        while (cur_len > 0 && iov_idx < op->iov.count)
-        {
-            unsigned iov_len = (op->iov.buf[iov_idx].iov_len - iov_pos);
-            if (iov_len <= cur_len)
-            {
-                memcpy((uint8_t*)dirty_it->second.buf + pos - dirty_it->first.stripe,
-                    (uint8_t*)op->iov.buf[iov_idx].iov_base + iov_pos, iov_len);
-                pos += iov_len;
-                len -= iov_len;
-                cur_len -= iov_len;
-                iov_pos = 0;
-                iov_idx++;
-            }
-            else
-            {
-                memcpy((uint8_t*)dirty_it->second.buf + pos - dirty_it->first.stripe,
-                    (uint8_t*)op->iov.buf[iov_idx].iov_base + iov_pos, cur_len);
-                pos += cur_len;
-                len -= cur_len;
-                iov_pos += cur_len;
-                cur_len = 0;
-            }
-        }
-        dirty_it++;
-    }
-}
-
-void cluster_client_t::flush_buffer(const object_id & oid, cluster_buffer_t *wr)
-{
-    wr->state = CACHE_REPEATING;
-    cluster_op_t *op = new cluster_op_t;
-    op->flags = OSD_OP_IGNORE_READONLY|OP_FLUSH_BUFFER;
-    op->opcode = OSD_OP_WRITE;
-    op->cur_inode = op->inode = oid.inode;
-    op->offset = oid.stripe;
-    op->len = wr->len;
-    op->iov.push_back(wr->buf, wr->len);
-    op->callback = [wr](cluster_op_t* op)
-    {
-        if (wr->state == CACHE_REPEATING)
-        {
-            wr->state = CACHE_DIRTY;
-        }
-        delete op;
-    };
-    op->next = op_queue_head;
-    if (op_queue_head)
-    {
-        op_queue_head->prev = op;
-        op_queue_head = op;
-    }
-    else
-        op_queue_tail = op_queue_head = op;
-    inc_wait(op->opcode, op->flags, op->next, 1);
-    continue_rw(op);
-}
-
 int cluster_client_t::continue_rw(cluster_op_t *op)
 {
     if (op->state == 0)
@@ -659,27 +678,7 @@ int cluster_client_t::continue_rw(cluster_op_t *op)
         goto resume_1;
     else if (op->state == 2)
         goto resume_2;
-    else if (op->state == 3)
-        goto resume_3;
 resume_0:
-    if (op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE)
-    {
-        if (!(op->flags & OSD_OP_IGNORE_READONLY))
-        {
-            auto ino_it = st_cli.inode_config.find(op->inode);
-            if (ino_it != st_cli.inode_config.end() && ino_it->second.readonly)
-            {
-                op->retval = -EINVAL;
-                erase_op(op);
-                return 1;
-            }
-        }
-        if (op->opcode == OSD_OP_WRITE && !(op->flags & OP_IMMEDIATE_COMMIT) && !(op->flags & OP_FLUSH_BUFFER))
-        {
-            copy_write(op, dirty_buffers);
-        }
-    }
-resume_1:
     // Slice the operation into parts
     slice_rw(op);
     op->needs_reslice = false;
@@ -690,9 +689,9 @@ resume_1:
         erase_op(op);
         return 1;
     }
-resume_2:
+resume_1:
     // Send unsent parts, if they're not subject to change
-    op->state = 3;
+    op->state = 2;
     if (op->needs_reslice)
     {
         for (int i = 0; i < op->parts.size(); i++)
@@ -702,7 +701,7 @@ resume_2:
                 op->retval = -EPIPE;
             }
         }
-        goto resume_3;
+        goto resume_2;
     }
     for (int i = 0; i < op->parts.size(); i++)
     {
@@ -723,18 +722,18 @@ resume_2:
                         });
                     }
                 }
-                op->state = 2;
+                op->state = 1;
            }
        }
    }
-    if (op->state == 2)
+    if (op->state == 1)
     {
         return 0;
     }
-resume_3:
+resume_2:
     if (op->inflight_count > 0)
     {
-        op->state = 3;
+        op->state = 2;
         return 0;
     }
     if (op->done_count >= op->parts.size())
@@ -762,7 +761,7 @@ resume_3:
             op->cur_inode = ino_it->second.parent_id;
             op->parts.clear();
             op->done_count = 0;
-            goto resume_1;
+            goto resume_0;
         }
     }
     op->retval = op->len;
@@ -774,7 +773,8 @@ resume_3:
         erase_op(op);
         return 1;
     }
-    else if (op->retval != 0 && op->retval != -EPIPE && op->retval != -EIO && op->retval != -ENOSPC)
+    else if (op->retval != 0 && !(op->flags & OP_FLUSH_BUFFER) &&
+        op->retval != -EPIPE && op->retval != -EIO && op->retval != -ENOSPC)
     {
         // Fatal error (neither -EPIPE, -EIO nor -ENOSPC)
         // FIXME: Add a parameter to allow to not wait for EIOs (incomplete or corrupted objects) to heal
@@ -789,7 +789,7 @@ resume_3:
         {
             op->parts.clear();
             op->done_count = 0;
-            goto resume_1;
+            goto resume_0;
         }
         else
         {
@@ -800,7 +800,7 @@ resume_3:
                    op->parts[i].flags = PART_RETRY;
                }
            }
-            goto resume_2;
+            goto resume_1;
        }
    }
    return 0;
@@ -874,6 +874,11 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
     int iov_idx = 0;
     size_t iov_pos = 0;
     int i = 0;
+    // We also have to return reads from CACHE_REPEATING buffers - they are not
+    // guaranteed to be present on target OSDs at the moment of repeating
+    // And we're also free to return data from other cached buffers just
+    // because it's faster
+    bool dirty_copied = wb->read_from_cache(op, pool_cfg.bitmap_granularity);
     for (uint64_t stripe = first_stripe; stripe <= last_stripe; stripe += pg_block_size)
     {
         pg_num_t pg_num = (stripe/pool_cfg.pg_stripe_size) % pool_cfg.real_pg_count + 1; // like map_to_pg()
@@ -882,7 +887,7 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
             ? (stripe + pg_block_size) : (op->offset + op->len);
         op->parts[i].iov.reset();
         op->parts[i].flags = 0;
-        if (op->cur_inode != op->inode)
+        if (op->cur_inode != op->inode || op->opcode == OSD_OP_READ && dirty_copied)
         {
             // Read remaining parts from upper layers
             uint64_t prev = begin, cur = begin;
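read_from_cache() marks every granularity unit it could serve from a cached buffer in op->bitmap_buf; slice_rw() then only fetches the unmarked units from OSDs. A minimal model of that bookkeeping (self-contained, with made-up numbers):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main()
    {
        const uint32_t granularity = 4096;
        const uint64_t read_offset = 0, read_len = 8*granularity;
        std::vector<uint8_t> bitmap((read_len/granularity + 7)/8, 0);
        // suppose a cached dirty buffer covers bytes [2*granularity, 5*granularity)
        for (uint64_t cur = 2*granularity; cur < 5*granularity; cur += granularity)
        {
            unsigned bit = (cur - read_offset)/granularity;
            bitmap[bit/8] |= 1 << (bit%8);
        }
        for (unsigned bit = 0; bit < read_len/granularity; bit++)
            printf("unit %u: %s\n", bit, ((bitmap[bit/8] >> (bit%8)) & 1) ? "from cache" : "from OSD");
    }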
@@ -1045,13 +1050,7 @@ int cluster_client_t::continue_sync(cluster_op_t *op)
         do_it++;
     }
     // Post sync to affected OSDs
-    for (auto & prev_op: dirty_buffers)
-    {
-        if (prev_op.second.state == CACHE_DIRTY)
-        {
-            prev_op.second.state = CACHE_FLUSHING;
-        }
-    }
+    wb->fsync_start();
     op->parts.resize(dirty_osds.size());
     op->retval = 0;
     {
@@ -1076,13 +1075,7 @@ resume_1:
     }
     if (op->retval != 0)
     {
-        for (auto uw_it = dirty_buffers.begin(); uw_it != dirty_buffers.end(); uw_it++)
-        {
-            if (uw_it->second.state == CACHE_FLUSHING)
-            {
-                uw_it->second.state = CACHE_DIRTY;
-            }
-        }
+        wb->fsync_error();
         if (op->retval == -EPIPE || op->retval == -EIO || op->retval == -ENOSPC)
         {
             // Retry later
@@ -1096,16 +1089,7 @@ resume_1:
         }
         else
         {
-            for (auto uw_it = dirty_buffers.begin(); uw_it != dirty_buffers.end(); )
-            {
-                if (uw_it->second.state == CACHE_FLUSHING)
-                {
-                    free(uw_it->second.buf);
-                    dirty_buffers.erase(uw_it++);
-                }
-                else
-                    uw_it++;
-            }
+            wb->fsync_ok();
         }
         erase_op(op);
         return 1;
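The fsync_start()/fsync_error()/fsync_ok() calls wired in above drive a small per-buffer state machine (implemented in cluster_client_wb.cpp further down). A descriptive, compilable sketch of the transitions, not code from the patch:

    #include <cstdio>

    enum buf_state { CACHE_DIRTY = 1, CACHE_WRITTEN = 2, CACHE_FLUSHING = 3, CACHE_REPEATING = 4 };

    // One buffer's passage through a SYNC, mirroring fsync_start()/fsync_error()/fsync_ok():
    static int after_sync(int state, bool sync_ok)
    {
        if (state != CACHE_FLUSHING)
            return state;          // only buffers marked by fsync_start() are affected
        return sync_ok ? 0 /* erased and freed */ : CACHE_WRITTEN /* kept for the next SYNC */;
    }

    int main()
    {
        int state = CACHE_WRITTEN; // written to OSDs, not yet synced
        state = CACHE_FLUSHING;    // fsync_start(): a SYNC covering this buffer is in flight
        printf("after failed SYNC: %d (CACHE_WRITTEN, retried later)\n", after_sync(state, false));
        printf("after successful SYNC: %d (buffer gone)\n", after_sync(state, true));
    }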
@@ -8,6 +8,9 @@
 
 #define DEFAULT_CLIENT_MAX_DIRTY_BYTES 32*1024*1024
 #define DEFAULT_CLIENT_MAX_DIRTY_OPS 1024
+#define DEFAULT_CLIENT_MAX_BUFFERED_BYTES 32*1024*1024
+#define DEFAULT_CLIENT_MAX_BUFFERED_OPS 1024
+#define DEFAULT_CLIENT_MAX_WRITEBACK_IODEPTH 256
 #define INODE_LIST_DONE 1
 #define INODE_LIST_HAS_UNSTABLE 2
 #define OSD_OP_READ_BITMAP OSD_OP_SEC_READ_BMP
@@ -64,17 +67,12 @@ protected:
     cluster_op_t *prev = NULL, *next = NULL;
     int prev_wait = 0;
     friend class cluster_client_t;
+    friend class writeback_cache_t;
 };
-
-struct cluster_buffer_t
-{
-    void *buf;
-    uint64_t len;
-    int state;
-};
 
 struct inode_list_t;
 struct inode_list_osd_t;
+class writeback_cache_t;
 
 // FIXME: Split into public and private interfaces
 class cluster_client_t
@@ -83,16 +81,23 @@ class cluster_client_t
     ring_loop_t *ringloop;
 
     std::map<pool_id_t, uint64_t> pg_counts;
-    // FIXME: Implement inmemory_commit mode. Note that it requires to return overlapping reads from memory.
+    // client_max_dirty_* is actually "max unsynced", for the case when immediate_commit is off
     uint64_t client_max_dirty_bytes = 0;
     uint64_t client_max_dirty_ops = 0;
+    // writeback improves (1) small consecutive writes and (2) Q1 writes without fsync
+    bool enable_writeback = false;
+    // client_max_buffered_* is the real "dirty limit" - maximum amount of writes buffered in memory
+    uint64_t client_max_buffered_bytes = 0;
+    uint64_t client_max_buffered_ops = 0;
+    uint64_t client_max_writeback_iodepth = 0;
 
     int log_level;
     int up_wait_retry_interval = 500; // ms
 
     int retry_timeout_id = 0;
     std::vector<cluster_op_t*> offline_ops;
     cluster_op_t *op_queue_head = NULL, *op_queue_tail = NULL;
-    std::map<object_id, cluster_buffer_t> dirty_buffers;
+    writeback_cache_t *wb = NULL;
     std::set<osd_num_t> dirty_osds;
     uint64_t dirty_bytes = 0, dirty_ops = 0;
 
@@ -122,10 +127,10 @@ public:
     void execute_raw(osd_num_t osd_num, osd_op_t *op);
     bool is_ready();
     void on_ready(std::function<void(void)> fn);
+    bool flush();
 
     bool get_immediate_commit(uint64_t inode);
 
-    static void copy_write(cluster_op_t *op, std::map<object_id, cluster_buffer_t> & dirty_buffers);
     void continue_ops(bool up_retry = false);
     inode_list_t *list_inode_start(inode_t inode,
         std::function<void(inode_list_t* lst, std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)> callback);
@@ -138,12 +143,14 @@ public:
 
 protected:
     bool affects_osd(uint64_t inode, uint64_t offset, uint64_t len, osd_num_t osd);
-    void flush_buffer(const object_id & oid, cluster_buffer_t *wr);
     void on_load_config_hook(json11::Json::object & config);
     void on_load_pgs_hook(bool success);
     void on_change_hook(std::map<std::string, etcd_kv_t> & changes);
     void on_change_osd_state_hook(uint64_t peer_osd);
+    void execute_internal(cluster_op_t *op);
+    void unshift_op(cluster_op_t *op);
     int continue_rw(cluster_op_t *op);
+    bool check_rw(cluster_op_t *op);
     void slice_rw(cluster_op_t *op);
     bool try_send(cluster_op_t *op, int i);
     int continue_sync(cluster_op_t *op);
@@ -157,4 +164,6 @@ protected:
     void continue_listing(inode_list_t *lst);
     void send_list(inode_list_osd_t *cur_list);
     void continue_raw_ops(osd_num_t peer_osd);
+
+    friend class writeback_cache_t;
 };
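unshift_op(), declared above, puts an operation at the head of the client's doubly-linked operation queue: repeated flushes must run before everything already queued (see "Insert repeated flushes into the beginning" in cluster_client_wb.cpp below). Its body is not shown in this diff; the following is only a sketch of head insertion on the same prev/next/head/tail shape, matching what the removed flush_buffer() used to do inline:

    #include <cstdio>

    struct op_t { op_t *prev = nullptr, *next = nullptr; int id = 0; };

    struct queue_t
    {
        op_t *head = nullptr, *tail = nullptr;

        // Head insertion as in the old flush_buffer(): the new op becomes the queue head.
        void unshift(op_t *op)
        {
            op->next = head;
            if (head)
            {
                head->prev = op;
                head = op;
            }
            else
                tail = head = op;
        }
    };

    int main()
    {
        queue_t q;
        op_t a, b;
        a.id = 1;
        b.id = 2;
        q.unshift(&a);
        q.unshift(&b); // b now runs before a
        for (op_t *op = q.head; op; op = op->next)
            printf("op %d\n", op->id);
    }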
src/cluster_client_impl.h (new file, 57 lines)
@@ -0,0 +1,57 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+
+#pragma once
+
+#include "cluster_client.h"
+
+#define SCRAP_BUFFER_SIZE 4*1024*1024
+#define PART_SENT 1
+#define PART_DONE 2
+#define PART_ERROR 4
+#define PART_RETRY 8
+#define CACHE_DIRTY 1
+#define CACHE_WRITTEN 2
+#define CACHE_FLUSHING 3
+#define CACHE_REPEATING 4
+#define OP_FLUSH_BUFFER 0x02
+#define OP_IMMEDIATE_COMMIT 0x04
+
+struct cluster_buffer_t
+{
+    uint8_t *buf;
+    uint64_t len;
+    int state;
+    uint64_t flush_id;
+    uint64_t *refcnt;
+};
+
+typedef std::map<object_id, cluster_buffer_t>::iterator dirty_buf_it_t;
+
+class writeback_cache_t
+{
+public:
+    uint64_t writeback_bytes = 0;
+    int writeback_queue_size = 0;
+    int writebacks_active = 0;
+    uint64_t last_flush_id = 0;
+
+    std::map<object_id, cluster_buffer_t> dirty_buffers;
+    std::vector<cluster_op_t*> writeback_overflow;
+    std::vector<object_id> writeback_queue;
+    std::multimap<uint64_t, uint64_t*> flushed_buffers; // flush_id => refcnt
+
+    ~writeback_cache_t();
+    dirty_buf_it_t find_dirty(uint64_t inode, uint64_t offset);
+    bool is_left_merged(dirty_buf_it_t dirty_it);
+    bool is_right_merged(dirty_buf_it_t dirty_it);
+    bool is_merged(const dirty_buf_it_t & dirty_it);
+    void copy_write(cluster_op_t *op, int state);
+    int repeat_ops_for(cluster_client_t *cli, osd_num_t peer_osd);
+    void start_writebacks(cluster_client_t *cli, int count);
+    bool read_from_cache(cluster_op_t *op, uint32_t bitmap_granularity);
+    void flush_buffers(cluster_client_t *cli, dirty_buf_it_t from_it, dirty_buf_it_t to_it);
+    void fsync_start();
+    void fsync_error();
+    void fsync_ok();
+};
src/cluster_client_wb.cpp (new file, 498 lines)
@@ -0,0 +1,498 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+
+#include <cassert>
+
+#include "cluster_client_impl.h"
+
+writeback_cache_t::~writeback_cache_t()
+{
+    for (auto & bp: dirty_buffers)
+    {
+        if (!--(*bp.second.refcnt))
+        {
+            free(bp.second.refcnt); // refcnt is allocated with the buffer
+        }
+    }
+    dirty_buffers.clear();
+}
+
+dirty_buf_it_t writeback_cache_t::find_dirty(uint64_t inode, uint64_t offset)
+{
+    auto dirty_it = dirty_buffers.lower_bound((object_id){
+        .inode = inode,
+        .stripe = offset,
+    });
+    while (dirty_it != dirty_buffers.begin())
+    {
+        dirty_it--;
+        if (dirty_it->first.inode != inode ||
+            (dirty_it->first.stripe + dirty_it->second.len) <= offset)
+        {
+            dirty_it++;
+            break;
+        }
+    }
+    return dirty_it;
+}
+
+bool writeback_cache_t::is_left_merged(dirty_buf_it_t dirty_it)
+{
+    if (dirty_it != dirty_buffers.begin())
+    {
+        auto prev_it = dirty_it;
+        prev_it--;
+        if (prev_it->first.inode == dirty_it->first.inode &&
+            prev_it->first.stripe+prev_it->second.len == dirty_it->first.stripe &&
+            prev_it->second.state == CACHE_DIRTY)
+        {
+            return true;
+        }
+    }
+    return false;
+}
+
+bool writeback_cache_t::is_right_merged(dirty_buf_it_t dirty_it)
+{
+    auto next_it = dirty_it;
+    next_it++;
+    if (next_it != dirty_buffers.end() &&
+        next_it->first.inode == dirty_it->first.inode &&
+        next_it->first.stripe == dirty_it->first.stripe+dirty_it->second.len &&
+        next_it->second.state == CACHE_DIRTY)
+    {
+        return true;
+    }
+    return false;
+}
+
+bool writeback_cache_t::is_merged(const dirty_buf_it_t & dirty_it)
+{
+    return is_left_merged(dirty_it) || is_right_merged(dirty_it);
+}
+
+void writeback_cache_t::copy_write(cluster_op_t *op, int state)
+{
+    // Save operation for replay when one of PGs goes out of sync
+    // (primary OSD drops our connection in this case)
+    // ...or just save it for writeback if write buffering is enabled
+    if (op->len == 0)
+    {
+        return;
+    }
+    auto dirty_it = find_dirty(op->inode, op->offset);
+    auto new_end = op->offset + op->len;
+    while (dirty_it != dirty_buffers.end() &&
+        dirty_it->first.inode == op->inode &&
+        dirty_it->first.stripe < op->offset+op->len)
+    {
+        assert(dirty_it->first.stripe + dirty_it->second.len > op->offset);
+        // Remove overlapping part(s) of buffers
+        auto old_end = dirty_it->first.stripe + dirty_it->second.len;
+        if (dirty_it->first.stripe < op->offset)
+        {
+            if (old_end > new_end)
+            {
+                // Split into end and start
+                dirty_it->second.len = op->offset - dirty_it->first.stripe;
+                dirty_it = dirty_buffers.emplace_hint(dirty_it, (object_id){
+                    .inode = op->inode,
+                    .stripe = new_end,
+                }, (cluster_buffer_t){
+                    .buf = dirty_it->second.buf + new_end - dirty_it->first.stripe,
+                    .len = old_end - new_end,
+                    .state = dirty_it->second.state,
+                    .flush_id = dirty_it->second.flush_id,
+                    .refcnt = dirty_it->second.refcnt,
+                });
+                (*dirty_it->second.refcnt)++;
+                if (dirty_it->second.state == CACHE_DIRTY)
+                {
+                    writeback_bytes -= op->len;
+                    writeback_queue_size++;
+                }
+                break;
+            }
+            else
+            {
+                // Only leave the beginning
+                if (dirty_it->second.state == CACHE_DIRTY)
+                {
+                    writeback_bytes -= old_end - op->offset;
+                    if (is_left_merged(dirty_it) && !is_right_merged(dirty_it))
+                    {
+                        writeback_queue_size++;
+                    }
+                }
+                dirty_it->second.len = op->offset - dirty_it->first.stripe;
+                dirty_it++;
+            }
+        }
+        else if (old_end > new_end)
+        {
+            // Only leave the end
+            if (dirty_it->second.state == CACHE_DIRTY)
+            {
+                writeback_bytes -= new_end - dirty_it->first.stripe;
+                if (!is_left_merged(dirty_it) && is_right_merged(dirty_it))
+                {
+                    writeback_queue_size++;
+                }
+            }
+            auto new_dirty_it = dirty_buffers.emplace_hint(dirty_it, (object_id){
+                .inode = op->inode,
+                .stripe = new_end,
+            }, (cluster_buffer_t){
+                .buf = dirty_it->second.buf + new_end - dirty_it->first.stripe,
+                .len = old_end - new_end,
+                .state = dirty_it->second.state,
+                .flush_id = dirty_it->second.flush_id,
+                .refcnt = dirty_it->second.refcnt,
+            });
+            dirty_buffers.erase(dirty_it);
+            dirty_it = new_dirty_it;
+            break;
+        }
+        else
+        {
+            // Remove the whole buffer
+            if (dirty_it->second.state == CACHE_DIRTY && !is_merged(dirty_it))
+            {
+                writeback_bytes -= dirty_it->second.len;
+                assert(writeback_queue_size > 0);
+                writeback_queue_size--;
+            }
+            if (!--(*dirty_it->second.refcnt))
+            {
+                free(dirty_it->second.refcnt);
+            }
+            dirty_buffers.erase(dirty_it++);
+        }
+    }
+    // Overlapping buffers are removed, just insert the new one
+    uint64_t *refcnt = (uint64_t*)malloc_or_die(sizeof(uint64_t) + op->len);
+    uint8_t *buf = (uint8_t*)refcnt + sizeof(uint64_t);
+    *refcnt = 1;
+    dirty_it = dirty_buffers.emplace_hint(dirty_it, (object_id){
+        .inode = op->inode,
+        .stripe = op->offset,
+    }, (cluster_buffer_t){
+        .buf = buf,
+        .len = op->len,
+        .state = state,
+        .refcnt = refcnt,
+    });
+    if (state == CACHE_DIRTY)
+    {
+        writeback_bytes += op->len;
+        // Track consecutive write-back operations
+        if (!is_merged(dirty_it))
+        {
+            // <writeback_queue> is OK to contain more than actual number of consecutive
+            // requests as long as it doesn't miss anything. But <writeback_queue_size>
+            // is always calculated correctly.
+            writeback_queue_size++;
+            writeback_queue.push_back((object_id){
+                .inode = op->inode,
+                .stripe = op->offset,
+            });
+        }
+    }
+    uint64_t pos = 0, len = op->len, iov_idx = 0;
+    while (len > 0 && iov_idx < op->iov.count)
+    {
+        auto & iov = op->iov.buf[iov_idx];
+        memcpy(buf + pos, iov.iov_base, iov.iov_len);
+        pos += iov.iov_len;
+        iov_idx++;
+    }
+}
+
+int writeback_cache_t::repeat_ops_for(cluster_client_t *cli, osd_num_t peer_osd)
+{
+    int repeated = 0;
+    if (dirty_buffers.size())
+    {
+        // peer_osd just dropped connection
+        // determine WHICH dirty_buffers are now obsolete and repeat them
+        for (auto wr_it = dirty_buffers.begin(), flush_it = wr_it, last_it = wr_it; ; )
+        {
+            bool end = wr_it == dirty_buffers.end();
+            bool flush_this = !end && wr_it->second.state != CACHE_REPEATING &&
+                cli->affects_osd(wr_it->first.inode, wr_it->first.stripe, wr_it->second.len, peer_osd);
+            if (flush_it != wr_it && (end || !flush_this ||
+                wr_it->first.inode != flush_it->first.inode ||
+                wr_it->first.stripe != last_it->first.stripe+last_it->second.len))
+            {
+                repeated++;
+                flush_buffers(cli, flush_it, wr_it);
+                flush_it = wr_it;
+            }
+            if (end)
+                break;
+            last_it = wr_it;
+            wr_it++;
+            if (!flush_this)
+                flush_it = wr_it;
+        }
+    }
+    return repeated;
+}
+
+void writeback_cache_t::flush_buffers(cluster_client_t *cli, dirty_buf_it_t from_it, dirty_buf_it_t to_it)
+{
+    auto prev_it = to_it;
+    prev_it--;
+    bool is_writeback = from_it->second.state == CACHE_DIRTY;
+    cluster_op_t *op = new cluster_op_t;
+    op->flags = OSD_OP_IGNORE_READONLY|OP_FLUSH_BUFFER;
+    op->opcode = OSD_OP_WRITE;
+    op->cur_inode = op->inode = from_it->first.inode;
+    op->offset = from_it->first.stripe;
+    op->len = prev_it->first.stripe + prev_it->second.len - from_it->first.stripe;
+    uint32_t calc_len = 0;
+    uint64_t flush_id = ++last_flush_id;
+    for (auto it = from_it; it != to_it; it++)
+    {
+        it->second.state = CACHE_REPEATING;
+        it->second.flush_id = flush_id;
+        (*it->second.refcnt)++;
+        flushed_buffers.emplace(flush_id, it->second.refcnt);
+        op->iov.push_back(it->second.buf, it->second.len);
+        calc_len += it->second.len;
+    }
+    assert(calc_len == op->len);
+    writebacks_active++;
+    op->callback = [this, flush_id](cluster_op_t* op)
+    {
+        // Buffer flushes should be always retried, regardless of the error,
+        // so they should never result in an error here
+        assert(op->retval == op->len);
+        for (auto fl_it = flushed_buffers.find(flush_id);
+            fl_it != flushed_buffers.end() && fl_it->first == flush_id; )
+        {
+            if (!--(*fl_it->second)) // refcnt
+            {
+                free(fl_it->second);
+            }
+            flushed_buffers.erase(fl_it++);
+        }
+        for (auto dirty_it = find_dirty(op->inode, op->offset);
+            dirty_it != dirty_buffers.end() && dirty_it->first.inode == op->inode &&
+            dirty_it->first.stripe < op->offset+op->len; dirty_it++)
+        {
+            if (dirty_it->second.flush_id == flush_id && dirty_it->second.state == CACHE_REPEATING)
+            {
+                dirty_it->second.flush_id = 0;
+                dirty_it->second.state = CACHE_WRITTEN;
+            }
+        }
+        delete op;
+        writebacks_active--;
+        // We can't call execute_internal because it affects an invalid copy of the list here
+        // (erase_op remembers `next` after writeback callback)
+    };
+    if (is_writeback)
+    {
+        cli->execute_internal(op);
+    }
+    else
+    {
+        // Insert repeated flushes into the beginning
+        cli->unshift_op(op);
+        cli->continue_rw(op);
+    }
+}
+
+void writeback_cache_t::start_writebacks(cluster_client_t *cli, int count)
+{
+    if (!writeback_queue.size())
+    {
+        return;
+    }
+    std::vector<object_id> queue_copy;
+    queue_copy.swap(writeback_queue);
+    int started = 0, i = 0;
+    for (i = 0; i < queue_copy.size() && (!count || started < count); i++)
+    {
+        object_id & req = queue_copy[i];
+        auto dirty_it = find_dirty(req.inode, req.stripe);
+        if (dirty_it == dirty_buffers.end() ||
+            dirty_it->first.inode != req.inode ||
+            dirty_it->second.state != CACHE_DIRTY)
+        {
+            continue;
+        }
+        auto from_it = dirty_it;
+        uint64_t off = dirty_it->first.stripe;
+        while (from_it != dirty_buffers.begin())
+        {
+            from_it--;
+            if (from_it->second.state != CACHE_DIRTY ||
+                from_it->first.inode != req.inode ||
+                from_it->first.stripe+from_it->second.len != off)
+            {
+                from_it++;
+                break;
+            }
+            off = from_it->first.stripe;
+        }
+        off = dirty_it->first.stripe + dirty_it->second.len;
+        auto to_it = dirty_it;
+        to_it++;
+        while (to_it != dirty_buffers.end())
+        {
+            if (to_it->second.state != CACHE_DIRTY ||
+                to_it->first.inode != req.inode ||
+                to_it->first.stripe != off)
+            {
+                break;
+            }
+            off = to_it->first.stripe + to_it->second.len;
+            to_it++;
+        }
+        started++;
+        assert(writeback_queue_size > 0);
+        writeback_queue_size--;
+        writeback_bytes -= off - from_it->first.stripe;
+        flush_buffers(cli, from_it, to_it);
+    }
+    queue_copy.erase(queue_copy.begin(), queue_copy.begin()+i);
+    if (writeback_queue.size())
+    {
+        queue_copy.insert(queue_copy.end(), writeback_queue.begin(), writeback_queue.end());
+    }
+    queue_copy.swap(writeback_queue);
+}
+
+static void copy_to_op(cluster_op_t *op, uint64_t offset, uint8_t *buf, uint64_t len, uint32_t bitmap_granularity)
+{
+    if (op->opcode == OSD_OP_READ)
+    {
+        // Not OSD_OP_READ_BITMAP or OSD_OP_READ_CHAIN_BITMAP
+        int iov_idx = 0;
+        uint64_t cur_offset = op->offset;
+        while (iov_idx < op->iov.count && cur_offset+op->iov.buf[iov_idx].iov_len <= offset)
+        {
+            cur_offset += op->iov.buf[iov_idx].iov_len;
+            iov_idx++;
+        }
+        while (iov_idx < op->iov.count && cur_offset < offset+len)
+        {
+            auto & v = op->iov.buf[iov_idx];
+            auto begin = (cur_offset < offset ? offset : cur_offset);
+            auto end = (cur_offset+v.iov_len > offset+len ? offset+len : cur_offset+v.iov_len);
+            memcpy(
+                (uint8_t*)v.iov_base + begin - cur_offset,
+                buf + (cur_offset <= offset ? 0 : cur_offset-offset),
+                end - begin
+            );
+            cur_offset += v.iov_len;
+            iov_idx++;
+        }
+    }
+    // Set bitmap bits
+    int start_bit = (offset-op->offset)/bitmap_granularity;
+    int end_bit = (offset-op->offset+len)/bitmap_granularity;
+    for (int bit = start_bit; bit < end_bit;)
+    {
+        if (!(bit%8) && bit <= end_bit-8)
+        {
+            ((uint8_t*)op->bitmap_buf)[bit/8] = 0xFF;
+            bit += 8;
+        }
+        else
+        {
+            ((uint8_t*)op->bitmap_buf)[bit/8] |= (1 << (bit%8));
+            bit++;
+        }
+    }
+}
+
+bool writeback_cache_t::read_from_cache(cluster_op_t *op, uint32_t bitmap_granularity)
+{
+    bool dirty_copied = false;
+    if (dirty_buffers.size() && (op->opcode == OSD_OP_READ ||
+        op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP))
+    {
+        // We also have to return reads from CACHE_REPEATING buffers - they are not
+        // guaranteed to be present on target OSDs at the moment of repeating
+        // And we're also free to return data from other cached buffers just
+        // because it's faster
+        auto dirty_it = find_dirty(op->cur_inode, op->offset);
+        while (dirty_it != dirty_buffers.end() && dirty_it->first.inode == op->cur_inode &&
+            dirty_it->first.stripe < op->offset+op->len)
+        {
+            uint64_t begin = dirty_it->first.stripe, end = dirty_it->first.stripe + dirty_it->second.len;
+            if (begin < op->offset)
+                begin = op->offset;
+            if (end > op->offset+op->len)
+                end = op->offset+op->len;
+            bool skip_prev = true;
+            uint64_t cur = begin, prev = begin;
+            while (cur < end)
+            {
+                unsigned bmp_loc = (cur - op->offset)/bitmap_granularity;
+                bool skip = (((*((uint8_t*)op->bitmap_buf + bmp_loc/8)) >> (bmp_loc%8)) & 0x1);
+                if (skip_prev != skip)
+                {
+                    if (cur > prev && !skip)
+                    {
+                        // Copy data
+                        dirty_copied = true;
+                        copy_to_op(op, prev, dirty_it->second.buf + prev - dirty_it->first.stripe, cur-prev, bitmap_granularity);
+                    }
+                    skip_prev = skip;
+                    prev = cur;
+                }
+                cur += bitmap_granularity;
+            }
+            assert(cur > prev);
+            if (!skip_prev)
+            {
+                // Copy data
+                dirty_copied = true;
+                copy_to_op(op, prev, dirty_it->second.buf + prev - dirty_it->first.stripe, cur-prev, bitmap_granularity);
+            }
+            dirty_it++;
+        }
+    }
+    return dirty_copied;
+}
+
+void writeback_cache_t::fsync_start()
+{
+    for (auto & prev_op: dirty_buffers)
+    {
+        if (prev_op.second.state == CACHE_WRITTEN)
+        {
+            prev_op.second.state = CACHE_FLUSHING;
+        }
+    }
+}
+
+void writeback_cache_t::fsync_error()
+{
+    for (auto & prev_op: dirty_buffers)
+    {
+        if (prev_op.second.state == CACHE_FLUSHING)
+        {
+            prev_op.second.state = CACHE_WRITTEN;
+        }
+    }
+}
+
+void writeback_cache_t::fsync_ok()
+{
+    for (auto uw_it = dirty_buffers.begin(); uw_it != dirty_buffers.end(); )
+    {
+        if (uw_it->second.state == CACHE_FLUSHING)
+        {
+            if (!--(*uw_it->second.refcnt))
+                free(uw_it->second.refcnt);
+            dirty_buffers.erase(uw_it++);
+        }
+        else
+            uw_it++;
+    }
+}
@@ -5,7 +5,7 @@
 #include "str_util.h"
 
 static const char *help_text =
-    "Vitastor disk management tool\n"
+    "Vitastor disk management tool " VERSION "\n"
     "(c) Vitaliy Filippov, 2022+ (VNPL-1.1)\n"
     "\n"
     "COMMANDS:\n"
@@ -74,7 +74,7 @@ static const char *help_text =
     "    If it doesn't succeed it issues a warning in the system log.\n"
     "    \n"
     "    You can also pass other OSD options here as arguments and they'll be persisted\n"
-    "    in the superblock: cached_io_data, cached_io_meta, cached_io_journal,\n"
+    "    in the superblock: data_io, meta_io, journal_io,\n"
     "    inmemory_metadata, inmemory_journal, max_write_iodepth,\n"
     "    min_flusher_count, max_flusher_count, journal_sector_buffer_count,\n"
     "    journal_no_same_sector_overwrites, throttle_small_writes, throttle_target_iops,\n"
@@ -127,6 +127,10 @@ static const char *help_text =
     "vitastor-disk write-sb <device>\n"
     "    Read JSON from STDIN and write it into Vitastor OSD superblock on <device>.\n"
     "\n"
+    "vitastor-disk update-sb <device> [--force] [--<parameter> <value>] [...]\n"
+    "    Read Vitastor OSD superblock from <device>, update parameters in it and write it back.\n"
+    "    --force allows to ignore validation errors.\n"
+    "\n"
     "vitastor-disk udev <device>\n"
     "    Try to read Vitastor OSD superblock from <device> and print variables for udev.\n"
     "\n"
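The new update-sb command edits an existing superblock in place instead of requiring a full read-sb / edit / write-sb round-trip. A hypothetical command-line sketch (device path and values are examples; the parameter names are taken from the persisted-option list above):

    vitastor-disk update-sb /dev/sda1 --inmemory_journal true
    vitastor-disk update-sb /dev/sda1 --force --max_write_iodepth 128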
@@ -229,7 +233,7 @@ int main(int argc, char *argv[])
         {
             self.options["allow_data_loss"] = "1";
         }
-        else if (argv[i][0] == '-' && argv[i][1] == '-')
+        else if (argv[i][0] == '-' && argv[i][1] == '-' && i < argc-1)
         {
             char *key = argv[i]+2;
             self.options[key] = argv[++i];
@@ -363,6 +367,15 @@ int main(int argc, char *argv[])
         }
         return self.write_sb(cmd[1]);
     }
+    else if (!strcmp(cmd[0], "update-sb"))
+    {
+        if (cmd.size() != 2)
+        {
+            fprintf(stderr, "Exactly 1 device path argument is required\n");
+            return 1;
+        }
+        return self.update_sb(cmd[1]);
+    }
     else if (!strcmp(cmd[0], "start") || !strcmp(cmd[0], "stop") ||
         !strcmp(cmd[0], "restart") || !strcmp(cmd[0], "enable") || !strcmp(cmd[0], "disable"))
     {
@@ -109,6 +109,7 @@ struct disk_tool_t
     int udev_import(std::string device);
     int read_sb(std::string device);
     int write_sb(std::string device);
+    int update_sb(std::string device);
     int exec_osd(std::string device);
     int systemd_start_stop_osds(const std::vector<std::string> & cmd, const std::vector<std::string> & devices);
     int pre_exec_osd(std::string device);
@@ -320,7 +320,7 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
     if (journal_calc_data_pos != sw.data_offset)
     {
         printf(json ? ",\"bad_loc\":true,\"calc_loc\":\"0x%lx\""
-            : " (mismatched, calculated = %lu)", journal_pos);
+            : " (mismatched, calculated = %08lx)", journal_pos);
     }
     uint32_t data_csum_size = (!je_start.csum_block_size
         ? 0
Some files were not shown because too many files have changed in this diff.