WIP Auto-tune recovery speed

Track recovery op latencies + refactor into a structure
Add vitastor-disk update-sb command
2023-12-14 01:11:57 +03:00 · 2023-12-14 01:11:57 +03:00 · 2023-12-14 01:11:42 +03:00 · 2023-12-14 01:01:00 +03:00 · 2023-12-14 01:00:32 +03:00 · 2023-12-10 00:34:13 +03:00
165 changed files with 7600 additions and 1814 deletions
--- a/.gitea/workflows/test.yml
+++ b/.gitea/workflows/test.yml
@@ -622,6 +622,114 @@ jobs:
          echo ""
        done

+  test_heal_csum_32k_dmj:
+    runs-on: ubuntu-latest
+    needs: build
+    container: ${{env.TEST_IMAGE}}:${{github.sha}}
+    steps:
+    - name: Run test
+      id: test
+      timeout-minutes: 10
+      run: TEST_NAME=csum_32k_dmj OSD_ARGS="--data_csum_type crc32c --csum_block_size 32k --inmemory_metadata false --inmemory_journal false" OFFSET_ARGS=$OSD_ARGS /root/vitastor/tests/test_heal.sh
+    - name: Print logs
+      if: always() && steps.test.outcome == 'failure'
+      run: |
+        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
+          echo "-------- $i --------"
+          cat $i
+          echo ""
+        done
+
+  test_heal_csum_32k_dj:
+    runs-on: ubuntu-latest
+    needs: build
+    container: ${{env.TEST_IMAGE}}:${{github.sha}}
+    steps:
+    - name: Run test
+      id: test
+      timeout-minutes: 10
+      run: TEST_NAME=csum_32k_dj  OSD_ARGS="--data_csum_type crc32c --csum_block_size 32k --inmemory_journal false" OFFSET_ARGS=$OSD_ARGS /root/vitastor/tests/test_heal.sh
+    - name: Print logs
+      if: always() && steps.test.outcome == 'failure'
+      run: |
+        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
+          echo "-------- $i --------"
+          cat $i
+          echo ""
+        done
+
+  test_heal_csum_32k:
+    runs-on: ubuntu-latest
+    needs: build
+    container: ${{env.TEST_IMAGE}}:${{github.sha}}
+    steps:
+    - name: Run test
+      id: test
+      timeout-minutes: 10
+      run: TEST_NAME=csum_32k     OSD_ARGS="--data_csum_type crc32c --csum_block_size 32k" OFFSET_ARGS=$OSD_ARGS /root/vitastor/tests/test_heal.sh
+    - name: Print logs
+      if: always() && steps.test.outcome == 'failure'
+      run: |
+        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
+          echo "-------- $i --------"
+          cat $i
+          echo ""
+        done
+
+  test_heal_csum_4k_dmj:
+    runs-on: ubuntu-latest
+    needs: build
+    container: ${{env.TEST_IMAGE}}:${{github.sha}}
+    steps:
+    - name: Run test
+      id: test
+      timeout-minutes: 10
+      run: TEST_NAME=csum_4k_dmj  OSD_ARGS="--data_csum_type crc32c --inmemory_metadata false --inmemory_journal false" OFFSET_ARGS=$OSD_ARGS /root/vitastor/tests/test_heal.sh
+    - name: Print logs
+      if: always() && steps.test.outcome == 'failure'
+      run: |
+        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
+          echo "-------- $i --------"
+          cat $i
+          echo ""
+        done
+
+  test_heal_csum_4k_dj:
+    runs-on: ubuntu-latest
+    needs: build
+    container: ${{env.TEST_IMAGE}}:${{github.sha}}
+    steps:
+    - name: Run test
+      id: test
+      timeout-minutes: 10
+      run: TEST_NAME=csum_4k_dj   OSD_ARGS="--data_csum_type crc32c --inmemory_journal false" OFFSET_ARGS=$OSD_ARGS /root/vitastor/tests/test_heal.sh
+    - name: Print logs
+      if: always() && steps.test.outcome == 'failure'
+      run: |
+        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
+          echo "-------- $i --------"
+          cat $i
+          echo ""
+        done
+
+  test_heal_csum_4k:
+    runs-on: ubuntu-latest
+    needs: build
+    container: ${{env.TEST_IMAGE}}:${{github.sha}}
+    steps:
+    - name: Run test
+      id: test
+      timeout-minutes: 10
+      run: TEST_NAME=csum_4k      OSD_ARGS="--data_csum_type crc32c" OFFSET_ARGS=$OSD_ARGS /root/vitastor/tests/test_heal.sh
+    - name: Print logs
+      if: always() && steps.test.outcome == 'failure'
+      run: |
+        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
+          echo "-------- $i --------"
+          cat $i
+          echo ""
+        done
+
  test_scrub:
    runs-on: ubuntu-latest
    needs: build
--- a/.gitea/workflows/tests-to-yaml.pl
+++ b/.gitea/workflows/tests-to-yaml.pl
@@ -7,7 +7,8 @@ for my $line (<>)
    if ($line =~ /\.\/(test_[^\.]+)/s)
    {
        chomp $line;
-        my $test_name = $1;
+        my $base_name = $1;
+        my $test_name = $base_name;
        my $timeout = 3;
        if ($test_name eq 'test_etcd_fail' || $test_name eq 'test_heal' || $test_name eq 'test_add_osd' ||
            $test_name eq 'test_interrupted_rebalance' || $test_name eq 'test_rebalance_verify')
@@ -16,7 +17,12 @@ for my $line (<>)
        }
        while ($line =~ /([^\s=]+)=(\S+)/gs)
        {
-            if ($1 eq 'SCHEME' && $2 eq 'ec')
+            if ($1 eq 'TEST_NAME')
+            {
+                $test_name = $base_name.'_'.$2;
+                last;
+            }
+            elsif ($1 eq 'SCHEME' && $2 eq 'ec')
            {
                $test_name .= '_ec';
            }
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)

 project(vitastor)

-set(VERSION "0.9.6")
+set(VERSION "1.3.1")

 add_subdirectory(src)
--- a/README-ru.md
+++ b/README-ru.md
@@ -50,6 +50,7 @@ Vitastor поддерживает QEMU-драйвер, протоколы NBD и
  - Параметры
    - [Общие](docs/config/common.ru.md)
    - [Сетевые](docs/config/network.ru.md)
+    - [Клиентский код](docs/config/client.ru.md)
    - [Глобальные дисковые параметры](docs/config/layout-cluster.ru.md)
    - [Дисковые параметры OSD](docs/config/layout-osd.ru.md)
    - [Прочие параметры OSD](docs/config/osd.ru.md)
--- a/README.md
+++ b/README.md
@@ -50,6 +50,7 @@ Read more details below in the documentation.
  - Parameter Reference
    - [Common](docs/config/common.en.md)
    - [Network](docs/config/network.en.md)
+    - [Client](docs/config/client.en.md)
    - [Global Disk Layout](docs/config/layout-cluster.en.md)
    - [OSD Disk Layout](docs/config/layout-osd.en.md)
    - [OSD Runtime Parameters](docs/config/osd.en.md)
--- a/csi/Dockerfile
+++ b/csi/Dockerfile
@@ -1,14 +1,15 @@
 # Compile stage
-FROM golang:buster AS build
+FROM golang:bookworm AS build

 ADD go.sum go.mod /app/
 RUN cd /app; CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go mod download -x
 ADD . /app
-RUN perl -i -e '$/ = undef; while(<>) { s/\n\s*(\{\s*\n)/$1\n/g; s/\}(\s*\n\s*)else\b/$1} else/g; print; }' `find /app -name '*.go'`
-RUN cd /app; CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o vitastor-csi
+RUN perl -i -e '$/ = undef; while(<>) { s/\n\s*(\{\s*\n)/$1\n/g; s/\}(\s*\n\s*)else\b/$1} else/g; print; }' `find /app -name '*.go'` && \
+    cd /app && \
+    CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o vitastor-csi

 # Final stage
-FROM debian:buster
+FROM debian:bookworm

 LABEL maintainers="Vitaliy Filippov <vitalif@yourcmc.ru>"
 LABEL description="Vitastor CSI Driver"
@@ -18,19 +19,30 @@ ENV CSI_ENDPOINT=""

 RUN apt-get update && \
    apt-get install -y wget && \
-    (echo deb http://deb.debian.org/debian buster-backports main > /etc/apt/sources.list.d/backports.list) && \
    (echo "APT::Install-Recommends false;" > /etc/apt/apt.conf) && \
    apt-get update && \
-    apt-get install -y e2fsprogs xfsprogs kmod && \
+    apt-get install -y e2fsprogs xfsprogs kmod iproute2 \
+        # dependencies of qemu-storage-daemon
+        libnuma1 liburing2 libglib2.0-0 libfuse3-3 libaio1 libzstd1 libnettle8 \
+        libgmp10 libhogweed6 libp11-kit0 libidn2-0 libunistring2 libtasn1-6 libpcre2-8-0 libffi8 && \
    apt-get clean && \
    (echo options nbd nbds_max=128 > /etc/modprobe.d/nbd.conf)

 COPY --from=build /app/vitastor-csi /bin/

-RUN (echo deb http://vitastor.io/debian buster main > /etc/apt/sources.list.d/vitastor.list) && \
+RUN (echo deb http://vitastor.io/debian bookworm main > /etc/apt/sources.list.d/vitastor.list) && \
+    ((echo 'Package: *'; echo 'Pin: origin "vitastor.io"'; echo 'Pin-Priority: 1000') > /etc/apt/preferences.d/vitastor.pref) && \
    wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg && \
    apt-get update && \
    apt-get install -y vitastor-client && \
+    wget https://vitastor.io/archive/qemu/qemu-bookworm-8.1.2%2Bds-1%2Bvitastor1/qemu-utils_8.1.2%2Bds-1%2Bvitastor1_amd64.deb && \
+    wget https://vitastor.io/archive/qemu/qemu-bookworm-8.1.2%2Bds-1%2Bvitastor1/qemu-block-extra_8.1.2%2Bds-1%2Bvitastor1_amd64.deb && \
+    dpkg -x qemu-utils*.deb tmp1 && \
+    dpkg -x qemu-block-extra*.deb tmp1 && \
+    cp -a tmp1/usr/bin/qemu-storage-daemon /usr/bin/ && \
+    mkdir -p /usr/lib/x86_64-linux-gnu/qemu && \
+    cp -a tmp1/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so /usr/lib/x86_64-linux-gnu/qemu/ && \
+    rm -rf tmp1 *.deb && \
    apt-get clean

 ENTRYPOINT ["/bin/vitastor-csi"]
--- a/csi/Makefile
+++ b/csi/Makefile
@@ -1,4 +1,4 @@
-VERSION ?= v0.9.6
+VERSION ?= v1.3.1

 all: build push

--- a/csi/deploy/001-csi-config-map.yaml
+++ b/csi/deploy/001-csi-config-map.yaml
@@ -2,6 +2,7 @@
 apiVersion: v1
 kind: ConfigMap
 data:
+  # You can add multiple configuration files here to use a multi-cluster setup
  vitastor.conf: |-
    {"etcd_address":"http://192.168.7.2:2379","etcd_prefix":"/vitastor"}
 metadata:
--- a/csi/deploy/004-csi-nodeplugin.yaml
+++ b/csi/deploy/004-csi-nodeplugin.yaml
@@ -49,7 +49,7 @@ spec:
            capabilities:
              add: ["SYS_ADMIN"]
            allowPrivilegeEscalation: true
-          image: vitalif/vitastor-csi:v0.9.6
+          image: vitalif/vitastor-csi:v1.3.1
          args:
            - "--node=$(NODE_ID)"
            - "--endpoint=$(CSI_ENDPOINT)"
@@ -82,6 +82,8 @@ spec:
              name: host-sys
            - mountPath: /run/mount
              name: host-mount
+            - mountPath: /run/vitastor-csi
+              name: run-vitastor-csi
            - mountPath: /lib/modules
              name: lib-modules
              readOnly: true
@@ -132,6 +134,9 @@ spec:
        - name: host-mount
          hostPath:
            path: /run/mount
+        - name: run-vitastor-csi
+          hostPath:
+            path: /run/vitastor-csi
        - name: lib-modules
          hostPath:
            path: /lib/modules
--- a/csi/deploy/005-csi-provisioner-rbac.yaml
+++ b/csi/deploy/005-csi-provisioner-rbac.yaml
@@ -35,10 +35,13 @@ rules:
    verbs: ["get", "list", "watch"]
  - apiGroups: ["snapshot.storage.k8s.io"]
    resources: ["volumesnapshots"]
-    verbs: ["get", "list"]
+    verbs: ["get", "list", "patch"]
+  - apiGroups: ["snapshot.storage.k8s.io"]
+    resources: ["volumesnapshots/status"]
+    verbs: ["get", "list", "patch"]
  - apiGroups: ["snapshot.storage.k8s.io"]
    resources: ["volumesnapshotcontents"]
-    verbs: ["create", "get", "list", "watch", "update", "delete"]
+    verbs: ["create", "get", "list", "watch", "update", "delete", "patch"]
  - apiGroups: ["snapshot.storage.k8s.io"]
    resources: ["volumesnapshotclasses"]
    verbs: ["get", "list", "watch"]
@@ -53,7 +56,7 @@ rules:
    verbs: ["get", "list", "watch"]
  - apiGroups: ["snapshot.storage.k8s.io"]
    resources: ["volumesnapshotcontents/status"]
-    verbs: ["update"]
+    verbs: ["update", "patch"]
  - apiGroups: [""]
    resources: ["configmaps"]
    verbs: ["get"]
--- a/csi/deploy/007-csi-provisioner.yaml
+++ b/csi/deploy/007-csi-provisioner.yaml
@@ -23,6 +23,11 @@ metadata:
  name: csi-vitastor-provisioner
 spec:
  replicas: 3
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxUnavailable: 1
+      maxSurge: 0
  selector:
    matchLabels:
      app: csi-vitastor-provisioner
@@ -46,7 +51,7 @@ spec:
      priorityClassName: system-cluster-critical
      containers:
        - name: csi-provisioner
-          image: k8s.gcr.io/sig-storage/csi-provisioner:v2.2.0
+          image: k8s.gcr.io/sig-storage/csi-provisioner:v3.0.0
          args:
            - "--csi-address=$(ADDRESS)"
            - "--v=5"
@@ -116,7 +121,7 @@ spec:
            privileged: true
            capabilities:
              add: ["SYS_ADMIN"]
-          image: vitalif/vitastor-csi:v0.9.6
+          image: vitalif/vitastor-csi:v1.3.1
          args:
            - "--node=$(NODE_ID)"
            - "--endpoint=$(CSI_ENDPOINT)"
--- a/csi/deploy/009-storage-class.yaml
+++ b/csi/deploy/009-storage-class.yaml
@@ -12,8 +12,6 @@ parameters:
  etcdVolumePrefix: ""
  poolId: "1"
  # you can choose other configuration file if you have it in the config map
+  # different etcd URLs and prefixes should also be put in the config
  #configPath: "/etc/vitastor/vitastor.conf"
-  # you can also specify etcdUrl here, maybe to connect to another Vitastor cluster
-  # multiple etcdUrls may be specified, delimited by comma
-  #etcdUrl: "http://192.168.7.2:2379"
-  #etcdPrefix: "/vitastor"
+allowVolumeExpansion: true
--- a/csi/deploy/example-snapshot-class.yaml
+++ b/csi/deploy/example-snapshot-class.yaml
@@ -0,0 +1,7 @@
+# Example VolumeSnapshotClass for the Vitastor CSI driver (csi.vitastor.io).
+# Snapshots created through this class are removed from the backend when the
+# VolumeSnapshot object is deleted (deletionPolicy: Delete).
+apiVersion: snapshot.storage.k8s.io/v1
+kind: VolumeSnapshotClass
+metadata:
+  name: vitastor-snapclass
+driver: csi.vitastor.io
+deletionPolicy: Delete
+# No class parameters are set in this example
+parameters:
--- a/csi/deploy/example-snapshot-clone.yaml
+++ b/csi/deploy/example-snapshot-clone.yaml
@@ -0,0 +1,16 @@
+---
+# Example PersistentVolumeClaim restored from the VolumeSnapshot "snap1":
+# dataSource points at the snapshot, storageClassName at the "vitastor" class.
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: test-vitastor-clone
+spec:
+  storageClassName: vitastor
+  dataSource:
+    name: snap1
+    kind: VolumeSnapshot
+    apiGroup: snapshot.storage.k8s.io
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 10Gi
--- a/csi/deploy/example-snapshot.yaml
+++ b/csi/deploy/example-snapshot.yaml
@@ -0,0 +1,8 @@
+# Example VolumeSnapshot "snap1" of the PVC "test-vitastor-pvc",
+# taken through the "vitastor-snapclass" VolumeSnapshotClass.
+apiVersion: snapshot.storage.k8s.io/v1
+kind: VolumeSnapshot
+metadata:
+  name: snap1
+spec:
+  volumeSnapshotClassName: vitastor-snapclass
+  source:
+    persistentVolumeClaimName: test-vitastor-pvc
--- a/csi/go.mod
+++ b/csi/go.mod
@@ -9,6 +9,7 @@ require (
 	golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb
 	golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
 	google.golang.org/grpc v1.33.1
+	google.golang.org/protobuf v1.24.0
 	k8s.io/klog v1.0.0
 	k8s.io/utils v0.0.0-20210305010621-2afb4311ab10
 )
--- a/csi/src/config.go
+++ b/csi/src/config.go
@@ -5,7 +5,7 @@ package vitastor

 const (
    vitastorCSIDriverName    = "csi.vitastor.io"
-    vitastorCSIDriverVersion = "0.9.6"
+    vitastorCSIDriverVersion = "1.3.1"
 )

 // Config struct fills the parameters of request or user input
--- a/csi/src/controllerserver.go
+++ b/csi/src/controllerserver.go
@@ -20,6 +20,7 @@ import (

    "google.golang.org/grpc/codes"
    "google.golang.org/grpc/status"
+    "google.golang.org/protobuf/types/known/timestamppb"

    "github.com/container-storage-interface/spec/lib/go/csi"
 )
@@ -45,6 +46,7 @@ type InodeConfig struct
    ParentPool uint64 `json:"parent_pool,omitempty"`
    ParentId uint64 `json:"parent_id,omitempty"`
    Readonly bool `json:"readonly,omitempty"`
+    CreateTs uint64 `json:"create_ts,omitempty"`
 }

 type ControllerServer struct
@@ -60,7 +62,7 @@ func NewControllerServer(driver *Driver) *ControllerServer
    }
 }

-func GetConnectionParams(params map[string]string) (map[string]string, []string, string)
+func GetConnectionParams(params map[string]string) (map[string]string, error)
 {
    ctxVars := make(map[string]string)
    configPath := params["configPath"]
@@ -73,71 +75,69 @@ func GetConnectionParams(params map[string]string) (map[string]string, []string,
        ctxVars["configPath"] = configPath
    }
    config := make(map[string]interface{})
-    if configFD, err := os.Open(configPath); err == nil
+    configFD, err := os.Open(configPath)
+    if (err != nil)
    {
-        defer configFD.Close()
-        data, _ := ioutil.ReadAll(configFD)
-        json.Unmarshal(data, &config)
+        return nil, err
    }
-    // Try to load prefix & etcd URL from the config
+    defer configFD.Close()
+    data, _ := ioutil.ReadAll(configFD)
+    json.Unmarshal(data, &config)
+    // Check etcd URL in the config, but do not use the explicit etcdUrl
+    // parameter for CLI calls, otherwise users won't be able to later
+    // change them - storage class parameters are saved in volume IDs
    var etcdUrl []string
-    if (params["etcdUrl"] != "")
+    switch config["etcd_address"].(type)
    {
-        ctxVars["etcdUrl"] = params["etcdUrl"]
-        etcdUrl = strings.Split(params["etcdUrl"], ",")
+    case string:
+        url := strings.TrimSpace(config["etcd_address"].(string))
+        if (url != "")
+        {
+            etcdUrl = strings.Split(url, ",")
+        }
+    case []string:
+        etcdUrl = config["etcd_address"].([]string)
+    case []interface{}:
+        for _, url := range config["etcd_address"].([]interface{})
+        {
+            s, ok := url.(string)
+            if (ok)
+            {
+                etcdUrl = append(etcdUrl, s)
+            }
+        }
    }
    if (len(etcdUrl) == 0)
    {
-        switch config["etcd_address"].(type)
-        {
-        case string:
-            etcdUrl = strings.Split(config["etcd_address"].(string), ",")
-        case []string:
-            etcdUrl = config["etcd_address"].([]string)
-        }
+        return nil, status.Error(codes.InvalidArgument, "etcd_address is missing in "+configPath)
    }
-    etcdPrefix := params["etcdPrefix"]
-    if (etcdPrefix == "")
+    return ctxVars, nil
+}
+
+// system runs an external program and captures its stdout and stderr separately.
+// On success it returns (stdout, stderr, nil). When Run() fails, the failure is
+// logged and a gRPC Internal error containing stdout+stderr and the Run() error
+// text is returned; both output slices are nil in that case.
+func system(program string, args ...string) ([]byte, []byte, error)
+{
+    // The command line is passed as format arguments, not as the format string:
+    // a '%' inside an image name or argument must not be parsed as a formatting verb
+    klog.Infof("Running %s %s", program, strings.Join(args, " "))
+    c := exec.Command(program, args...)
+    var stdout, stderr bytes.Buffer
+    c.Stdout, c.Stderr = &stdout, &stderr
+    err := c.Run()
+    if (err != nil)
    {
-        etcdPrefix, _ = config["etcd_prefix"].(string)
-        if (etcdPrefix == "")
-        {
-            etcdPrefix = "/vitastor"
-        }
+        stdoutStr, stderrStr := string(stdout.Bytes()), string(stderr.Bytes())
+        klog.Errorf("%s %s failed: %s, status %s\n", program, strings.Join(args, " "), stdoutStr+stderrStr, err)
+        return nil, nil, status.Error(codes.Internal, stdoutStr+stderrStr+" (status "+err.Error()+")")
    }
-    else
-    {
-        ctxVars["etcdPrefix"] = etcdPrefix
-    }
-    return ctxVars, etcdUrl, etcdPrefix
+    return stdout.Bytes(), stderr.Bytes(), nil
 }

+// invokeCLI runs /usr/bin/vitastor-cli with the given arguments and returns its stdout.
+// When ctxVars["configPath"] is non-empty, "--config_path <path>" is appended to args.
+// Execution, logging and error wrapping are delegated to system(); stderr is discarded
+// on success.
 func invokeCLI(ctxVars map[string]string, args []string) ([]byte, error)
 {
-    if (ctxVars["etcdUrl"] != "")
-    {
-        args = append(args, "--etcd_address", ctxVars["etcdUrl"])
-    }
-    if (ctxVars["etcdPrefix"] != "")
-    {
-        args = append(args, "--etcd_prefix", ctxVars["etcdPrefix"])
-    }
    if (ctxVars["configPath"] != "")
    {
        args = append(args, "--config_path", ctxVars["configPath"])
    }
-    c := exec.Command("/usr/bin/vitastor-cli", args...)
-    var stdout, stderr bytes.Buffer
-    c.Stdout = &stdout
-    c.Stderr = &stderr
-    err := c.Run()
-    stderrStr := string(stderr.Bytes())
-    if (err != nil)
-    {
-        klog.Errorf("vitastor-cli %s failed: %s, status %s\n", strings.Join(args, " "), stderrStr, err)
-        return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
-    }
-    return stdout.Bytes(), nil
+    stdout, _, err := system("/usr/bin/vitastor-cli", args...)
+    return stdout, err
 }

 // Create the volume
@@ -172,33 +172,49 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
        volSize = ((capRange.GetRequiredBytes() + MB - 1) / MB) * MB
    }

-    ctxVars, etcdUrl, _ := GetConnectionParams(req.Parameters)
-    if (len(etcdUrl) == 0)
+    ctxVars, err := GetConnectionParams(req.Parameters)
+    if (err != nil)
    {
-        return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
+        return nil, err
+    }
+
+    args := []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) }
+
+    // Support creation from snapshot
+    var src *csi.VolumeContentSource
+    if (req.VolumeContentSource.GetSnapshot() != nil)
+    {
+        snapId := req.VolumeContentSource.GetSnapshot().GetSnapshotId()
+        if (snapId != "")
+        {
+            snapVars := make(map[string]string)
+            err := json.Unmarshal([]byte(snapId), &snapVars)
+            if (err != nil)
+            {
+                return nil, status.Error(codes.Internal, "volume ID not in JSON format")
+            }
+            args = append(args, "--parent", snapVars["name"]+"@"+snapVars["snapshot"])
+            src = &csi.VolumeContentSource{
+                Type: &csi.VolumeContentSource_Snapshot{
+                    Snapshot: &csi.VolumeContentSource_SnapshotSource{
+                        SnapshotId: snapId,
+                    },
+                },
+            }
+        }
    }

    // Create image using vitastor-cli
-    _, err := invokeCLI(ctxVars, []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) })
+    _, err = invokeCLI(ctxVars, args)
    if (err != nil)
    {
        if (strings.Index(err.Error(), "already exists") > 0)
        {
-            stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", volName })
+            inodeCfg, err := invokeList(ctxVars, volName, true)
            if (err != nil)
            {
                return nil, err
            }
-            var inodeCfg []InodeConfig
-            err = json.Unmarshal(stat, &inodeCfg)
-            if (err != nil)
-            {
-                return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
-            }
-            if (len(inodeCfg) == 0)
-            {
-                return nil, status.Error(codes.Internal, "vitastor-cli create said that image already exists, but ls can't find it")
-            }
            if (inodeCfg[0].Size < uint64(volSize))
            {
                return nil, status.Error(codes.Internal, "image "+volName+" is already created, but size is less than expected")
@@ -217,6 +233,7 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
            // Ugly, but VolumeContext isn't passed to DeleteVolume :-(
            VolumeId: string(volumeIdJson),
            CapacityBytes: volSize,
+            ContentSource: src,
        },
    }, nil
 }
@@ -230,15 +247,19 @@ func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVol
        return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
    }

-    ctxVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
+    volVars := make(map[string]string)
+    err := json.Unmarshal([]byte(req.VolumeId), &volVars)
    if (err != nil)
    {
        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
    }
-    volName := ctxVars["name"]
+    volName := volVars["name"]

-    ctxVars, _, _ = GetConnectionParams(ctxVars)
+    ctxVars, err := GetConnectionParams(volVars)
+    if (err != nil)
+    {
+        return nil, err
+    }

    _, err = invokeCLI(ctxVars, []string{ "rm", volName })
    if (err != nil)
@@ -344,6 +365,8 @@ func (cs *ControllerServer) ControllerGetCapabilities(ctx context.Context, req *
        csi.ControllerServiceCapability_RPC_LIST_VOLUMES,
        csi.ControllerServiceCapability_RPC_EXPAND_VOLUME,
        csi.ControllerServiceCapability_RPC_CREATE_DELETE_SNAPSHOT,
+        csi.ControllerServiceCapability_RPC_LIST_SNAPSHOTS,
+        // TODO: csi.ControllerServiceCapability_RPC_CLONE_VOLUME,
    } {
        controllerServerCapabilities = append(controllerServerCapabilities, functionControllerServerCapabilities(capability))
    }
@@ -353,28 +376,226 @@ func (cs *ControllerServer) ControllerGetCapabilities(ctx context.Context, req *
    }, nil
 }

+// invokeList runs "vitastor-cli ls --json <pattern>" and decodes the output
+// into a slice of InodeConfig.
+// It returns the CLI error unchanged, or a gRPC Internal error when the output
+// is not valid JSON. When expectExist is true, an empty listing is also
+// reported as an Internal error, so callers may index element 0 safely.
+func invokeList(ctxVars map[string]string, pattern string, expectExist bool) ([]InodeConfig, error)
+{
+    stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", pattern })
+    if (err != nil)
+    {
+        return nil, err
+    }
+    var inodeCfg []InodeConfig
+    err = json.Unmarshal(stat, &inodeCfg)
+    if (err != nil)
+    {
+        return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
+    }
+    if (expectExist && len(inodeCfg) == 0)
+    {
+        return nil, status.Error(codes.Internal, "Can't find expected image "+pattern+" via vitastor-cli ls")
+    }
+    return inodeCfg, nil
+}
+
 // CreateSnapshot create snapshot of an existing PV
+// The snapshot is created as image "<volume>@<req.Name>" via vitastor-cli.
+// An "already exists" CLI error is tolerated and the snapshot is then looked
+// up with invokeList. The returned snapshot ID is the JSON of the source volume
+// ID map with an extra "snapshot" key.
+// NOTE(review): unlike DeleteSnapshot and ListSnapshots, the decoded volume ID map
+// is passed to invokeCLI directly, without going through GetConnectionParams.
 func (cs *ControllerServer) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequest) (*csi.CreateSnapshotResponse, error)
 {
-    return nil, status.Error(codes.Unimplemented, "")
+    klog.Infof("received controller create snapshot request %+v", protosanitizer.StripSecrets(req))
+    if (req == nil)
+    {
+        return nil, status.Errorf(codes.InvalidArgument, "request cannot be empty")
+    }
+    if (req.SourceVolumeId == "" || req.Name == "")
+    {
+        return nil, status.Error(codes.InvalidArgument, "source volume ID and snapshot name are required fields")
+    }
+
+    // snapshot name
+    snapName := req.Name
+
+    // req.SourceVolumeId is a JSON string in our case (it carries "name" and
+    // optionally "configPath")
+    ctxVars := make(map[string]string)
+    err := json.Unmarshal([]byte(req.SourceVolumeId), &ctxVars)
+    if (err != nil)
+    {
+        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
+    }
+    volName := ctxVars["name"]
+
+    // Create the snapshot using vitastor-cli; "already exists" is not an error here
+    _, err = invokeCLI(ctxVars, []string{ "create", "--snapshot", snapName, volName })
+    if (err != nil && strings.Index(err.Error(), "already exists") <= 0)
+    {
+        return nil, err
+    }
+
+    // Check created snapshot (expectExist=true guarantees inodeCfg[0] below)
+    inodeCfg, err := invokeList(ctxVars, volName+"@"+snapName, true)
+    if (err != nil)
+    {
+        return nil, err
+    }
+
+    // Use ugly JSON snapshot ID again, DeleteSnapshot doesn't have context :-(
+    ctxVars["snapshot"] = snapName
+    snapIdJson, _ := json.Marshal(ctxVars)
+    return &csi.CreateSnapshotResponse{
+        Snapshot: &csi.Snapshot{
+            SizeBytes: int64(inodeCfg[0].Size),
+            SnapshotId: string(snapIdJson),
+            SourceVolumeId: req.SourceVolumeId,
+            // presumably create_ts is a UNIX timestamp in seconds - TODO confirm
+            CreationTime: &timestamppb.Timestamp{ Seconds: int64(inodeCfg[0].CreateTs) },
+            ReadyToUse: true,
+        },
+    }, nil
 }

 // DeleteSnapshot delete provided snapshot of a PV
+// The snapshot ID is the JSON map produced by CreateSnapshot: "name" is the
+// volume, "snapshot" is the snapshot name, and the remaining keys are passed to
+// GetConnectionParams. The image "<name>@<snapshot>" is removed with
+// "vitastor-cli rm"; any CLI error is returned to the caller as is.
 func (cs *ControllerServer) DeleteSnapshot(ctx context.Context, req *csi.DeleteSnapshotRequest) (*csi.DeleteSnapshotResponse, error)
 {
-    return nil, status.Error(codes.Unimplemented, "")
+    klog.Infof("received controller delete snapshot request %+v", protosanitizer.StripSecrets(req))
+    if (req == nil)
+    {
+        return nil, status.Errorf(codes.InvalidArgument, "request cannot be empty")
+    }
+    if (req.SnapshotId == "")
+    {
+        return nil, status.Error(codes.InvalidArgument, "snapshot ID is a required field")
+    }
+
+    // Decode the JSON snapshot ID
+    volVars := make(map[string]string)
+    err := json.Unmarshal([]byte(req.SnapshotId), &volVars)
+    if (err != nil)
+    {
+        return nil, status.Error(codes.Internal, "snapshot ID not in JSON format")
+    }
+    volName := volVars["name"]
+    snapName := volVars["snapshot"]
+
+    ctxVars, err := GetConnectionParams(volVars)
+    if (err != nil)
+    {
+        return nil, err
+    }
+
+    // Snapshots are images named <volume>@<snapshot>
+    _, err = invokeCLI(ctxVars, []string{ "rm", volName+"@"+snapName })
+    if (err != nil)
+    {
+        return nil, err
+    }
+
+    return &csi.DeleteSnapshotResponse{}, nil
 }

 // ListSnapshots list the snapshots of a PV
 func (cs *ControllerServer) ListSnapshots(ctx context.Context, req *csi.ListSnapshotsRequest) (*csi.ListSnapshotsResponse, error)
 {
-    return nil, status.Error(codes.Unimplemented, "")
+    klog.Infof("received controller list snapshots request %+v", protosanitizer.StripSecrets(req))
+    if (req == nil)
+    {
+        return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
+    }
+
+    volVars := make(map[string]string)
+    err := json.Unmarshal([]byte(req.SourceVolumeId), &volVars)
+    if (err != nil)
+    {
+        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
+    }
+    volName := volVars["name"]
+    ctxVars, err := GetConnectionParams(volVars)
+    if (err != nil)
+    {
+        return nil, err
+    }
+
+    inodeCfg, err := invokeList(ctxVars, volName+"@*", false)
+    if (err != nil)
+    {
+        return nil, err
+    }
+
+    resp := &csi.ListSnapshotsResponse{}
+    for _, ino := range inodeCfg
+    {
+        snapName := ino.Name[len(volName)+1:]
+        if (len(req.StartingToken) > 0 && snapName < req.StartingToken)
+        {
+        }
+        else if (req.MaxEntries == 0 || len(resp.Entries) < int(req.MaxEntries))
+        {
+            volVars["snapshot"] = snapName
+            snapIdJson, _ := json.Marshal(volVars)
+            resp.Entries = append(resp.Entries, &csi.ListSnapshotsResponse_Entry{
+                Snapshot: &csi.Snapshot{
+                    SizeBytes: int64(ino.Size),
+                    SnapshotId: string(snapIdJson),
+                    SourceVolumeId: req.SourceVolumeId,
+                    CreationTime: &timestamppb.Timestamp{ Seconds: int64(ino.CreateTs) },
+                    ReadyToUse: true,
+                },
+            })
+        }
+        else
+        {
+            resp.NextToken = snapName
+            break
+        }
+    }
+
+    return resp, nil
 }

-// ControllerExpandVolume resizes a volume
+// ControllerExpandVolume increases the size of a volume
+// The image is only resized when its current size is smaller than
+// CapacityRange.RequiredBytes; the new size is RequiredBytes rounded up to a
+// multiple of 4096. The response carries the size reported by vitastor-cli ls
+// after the optional resize, and never requests node-side expansion.
 func (cs *ControllerServer) ControllerExpandVolume(ctx context.Context, req *csi.ControllerExpandVolumeRequest) (*csi.ControllerExpandVolumeResponse, error)
 {
-    return nil, status.Error(codes.Unimplemented, "")
+    klog.Infof("received controller expand volume request %+v", protosanitizer.StripSecrets(req))
+    if (req == nil)
+    {
+        return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
+    }
+    if (req.VolumeId == "" || req.CapacityRange == nil || req.CapacityRange.RequiredBytes == 0)
+    {
+        return nil, status.Error(codes.InvalidArgument, "VolumeId, CapacityRange and RequiredBytes are required fields")
+    }
+
+    // The volume ID is a JSON map with "name" and connection parameters
+    volVars := make(map[string]string)
+    err := json.Unmarshal([]byte(req.VolumeId), &volVars)
+    if (err != nil)
+    {
+        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
+    }
+    volName := volVars["name"]
+    ctxVars, err := GetConnectionParams(volVars)
+    if (err != nil)
+    {
+        return nil, err
+    }
+
+    // Current size; expectExist=true guarantees inodeCfg[0]
+    inodeCfg, err := invokeList(ctxVars, volName, true)
+    if (err != nil)
+    {
+        return nil, err
+    }
+
+    if (req.CapacityRange.RequiredBytes > 0 && inodeCfg[0].Size < uint64(req.CapacityRange.RequiredBytes))
+    {
+        // Round the requested size up to a multiple of 4096 bytes
+        sz := ((req.CapacityRange.RequiredBytes+4095)/4096)*4096
+        _, err := invokeCLI(ctxVars, []string{ "modify", "--inc_size", "1", "--resize", fmt.Sprintf("%d", sz), volName })
+        if (err != nil)
+        {
+            return nil, err
+        }
+        // Re-read the image to report the actual new size
+        inodeCfg, err = invokeList(ctxVars, volName, true)
+        if (err != nil)
+        {
+            return nil, err
+        }
+    }
+
+    return &csi.ControllerExpandVolumeResponse{
+        CapacityBytes: int64(inodeCfg[0].Size),
+        NodeExpansionRequired: false,
+    }, nil
 }

 // ControllerGetVolume get volume info
--- a/csi/src/identityserver.go
+++ b/csi/src/identityserver.go
@@ -49,6 +49,13 @@ func (is *IdentityServer) GetPluginCapabilities(ctx context.Context, req *csi.Ge
                    },
                },
            },
+            {
+                Type: &csi.PluginCapability_VolumeExpansion_{
+                    VolumeExpansion: &csi.PluginCapability_VolumeExpansion{
+                        Type: csi.PluginCapability_VolumeExpansion_OFFLINE,
+                    },
+                },
+            },
        },
    }, nil
 }
--- a/csi/src/nodeserver.go
+++ b/csi/src/nodeserver.go
@@ -5,11 +5,15 @@ package vitastor

 import (
    "context"
+    "errors"
+    "encoding/json"
+    "fmt"
    "os"
    "os/exec"
-    "encoding/json"
+    "path/filepath"
+    "strconv"
    "strings"
-    "bytes"
+    "syscall"

    "google.golang.org/grpc/codes"
    "google.golang.org/grpc/status"
@@ -25,16 +29,91 @@ import (
 type NodeServer struct
 {
    *Driver
+    // Result of checkVduseSupport() at construction time; when false, NBD is used to map devices
+    useVduse bool
+    // Directory holding VDUSE state (.json) and PID (.pid) files; NewNodeServer makes it end with "/"
+    stateDir string
    mounter mount.Interface
 }

+// DeviceState is the JSON content of a VDUSE state file. mapVduse writes it after a
+// successful mapping and restoreVduseDaemons reads it back to restart a storage daemon.
+type DeviceState struct
+{
+    // Value of ctxVars["configPath"] at mapping time (may be empty)
+    ConfigPath string `json:"configPath"`
+    // VDPA device ID, derived from the state file name without ".json"
+    VdpaId     string `json:"vdpaId"`
+    // Vitastor image (volume) name
+    Image      string `json:"image"`
+    // Block device path of the form /dev/<name>
+    Blockdev   string `json:"blockdev"`
+    // Whether the device was mapped read-only
+    Readonly   bool   `json:"readonly"`
+    // Path of the qemu-storage-daemon PID file
+    PidFile    string `json:"pidFile"`
+}
+
 // NewNodeServer create new instance node
 func NewNodeServer(driver *Driver) *NodeServer
 {
-    return &NodeServer{
+    // The state directory is taken from the STATE_DIR environment variable, with a fixed default
+    stateDir := os.Getenv("STATE_DIR")
+    if (stateDir == "")
+    {
+        stateDir = "/run/vitastor-csi"
+    }
+    // Other code builds file names by plain concatenation, so ensure a trailing slash
+    if (stateDir[len(stateDir)-1] != '/')
+    {
+        stateDir += "/"
+    }
+    ns := &NodeServer{
        Driver: driver,
+        useVduse: checkVduseSupport(),
+        stateDir: stateDir,
        mounter: mount.New(""),
    }
+    // With VDUSE available, restart or clean up devices recorded in the state directory
+    if (ns.useVduse)
+    {
+        ns.restoreVduseDaemons()
+    }
+    return ns
+}
+
+// checkVduseSupport reports whether VDUSE can be used on this host.
+// It requires /sys/module entries for vdpa, vduse and virtio-vdpa (running
+// /sbin/modprobe for any that cannot be stat'ed) and a successful `/sbin/vdpa -j dev`.
+func checkVduseSupport() bool
+{
+    supported := true
+    for _, module := range []string{"vdpa", "vduse", "virtio-vdpa"}
+    {
+        _, statErr := os.Stat("/sys/module/"+module)
+        if (statErr == nil)
+        {
+            // Module entry is present, nothing to load
+            continue
+        }
+        if (!errors.Is(statErr, os.ErrNotExist))
+        {
+            klog.Errorf("failed to check /sys/module/%s: %v", module, statErr)
+        }
+        // Try to load the module, sending all modprobe output to our stderr
+        modprobe := exec.Command("/sbin/modprobe", module)
+        modprobe.Stdout = os.Stderr
+        modprobe.Stderr = os.Stderr
+        loadErr := modprobe.Run()
+        if (loadErr != nil)
+        {
+            klog.Errorf("/sbin/modprobe %s failed: %v", module, loadErr)
+            supported = false
+            break
+        }
+    }
+    if (supported)
+    {
+        // The vdpa tool has to run successfully too
+        vdpaTool := exec.Command("/sbin/vdpa", "-j", "dev")
+        vdpaTool.Stderr = os.Stderr
+        toolErr := vdpaTool.Run()
+        if (toolErr != nil)
+        {
+            klog.Errorf("/sbin/vdpa -j dev failed: %v", toolErr)
+            supported = false
+        }
+    }
+    if (!supported)
+    {
+        klog.Errorf(
+            "Your host apparently has no VDUSE support. VDUSE support disabled, NBD will be used to map devices."+
+            " For VDUSE you need at least Linux 5.15 and the following kernel modules: vdpa, virtio-vdpa, vduse.",
+        )
+    }
+    return supported
 }

 // NodeStageVolume mounts the volume to a staging path on the node.
@@ -61,6 +140,303 @@ func Contains(list []string, s string) bool
    return false
 }

+// mapNbd maps the image to an NBD device by running `vitastor-nbd map` and returns
+// the trimmed standard output of the tool as the device name. An empty output is
+// reported as an error which includes the tool's stderr.
+func (ns *NodeServer) mapNbd(volName string, ctxVars map[string]string, readonly bool) (string, error)
+{
+    // FIXME: Check if already mapped
+    nbdArgs := []string{ "map", "--image", volName }
+    configPath := ctxVars["configPath"]
+    if (configPath != "")
+    {
+        nbdArgs = append(nbdArgs, "--config_path", configPath)
+    }
+    if (readonly)
+    {
+        nbdArgs = append(nbdArgs, "--readonly", "1")
+    }
+    stdout, stderr, err := system("/usr/bin/vitastor-nbd", nbdArgs...)
+    device := strings.TrimSpace(string(stdout))
+    if (device != "")
+    {
+        // The command error (if any) is passed through together with the device name
+        return device, err
+    }
+    return "", fmt.Errorf("vitastor-nbd did not return the name of NBD device. output: %s", stderr)
+}
+
+// unmapNbd detaches an NBD device with `vitastor-nbd unmap`.
+// A failure is logged together with the command output and is not returned.
+func (ns *NodeServer) unmapNbd(devicePath string)
+{
+    unmapCmd := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath)
+    output, err := unmapCmd.CombinedOutput()
+    if (err == nil)
+    {
+        return
+    }
+    klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, output, err)
+}
+
+// findByPidFile reads a decimal PID from pidFile and returns a handle to that process.
+// It only looks the process up: nothing is signalled and nothing is logged, so it can
+// also be used to probe whether a daemon is alive (by sending signal 0 to the result).
+// Returns an error when the file cannot be read or does not contain a positive PID.
+func findByPidFile(pidFile string) (*os.Process, error)
+{
+    pidBuf, err := os.ReadFile(pidFile)
+    if (err != nil)
+    {
+        return nil, err
+    }
+    // PID files are decimal; base autodetection would read a leading zero as octal
+    pid, err := strconv.Atoi(strings.TrimSpace(string(pidBuf)))
+    if (err != nil)
+    {
+        return nil, err
+    }
+    if (pid <= 0)
+    {
+        // Zero and negative values address process groups in kill(2), never accept them
+        return nil, fmt.Errorf("invalid PID %d in file %s", pid, pidFile)
+    }
+    return os.FindProcess(pid)
+}
+
+// killByPidFile sends SIGTERM to the process whose PID is stored in pidFile.
+// Returns the lookup error or the error of delivering the signal.
+func killByPidFile(pidFile string) error
+{
+    klog.Infof("killing process with PID from file %s", pidFile)
+    proc, err := findByPidFile(pidFile)
+    if (err != nil)
+    {
+        return err
+    }
+    return proc.Signal(syscall.SIGTERM)
+}
+
+// startStorageDaemon runs `qemu-storage-daemon --daemonize` with a vitastor block node
+// for image volName and a vduse-blk export named vdpaId. pidFile is passed as --pidfile,
+// configPath (when not empty) as the node's "config-path", and readonly makes the
+// export non-writable. Returns the error of running the command.
+func startStorageDaemon(vdpaId, volName, pidFile, configPath string, readonly bool) error
+{
+    // Block node description, passed to --blockdev as JSON
+    cacheOpts := map[string]bool{
+        "direct": true,
+        "no-flush": false,
+    }
+    blockdev := map[string]interface{}{
+        "node-name": "disk1",
+        "driver": "vitastor",
+        "image": volName,
+        "cache": cacheOpts,
+        "discard": "unmap",
+    }
+    if (configPath != "")
+    {
+        blockdev["config-path"] = configPath
+    }
+    blockdevJSON, _ := json.Marshal(blockdev)
+    // "true" unless a read-only mapping was requested
+    writable := strconv.FormatBool(!readonly)
+    exportSpec := "vduse-blk,id="+vdpaId+",node-name=disk1,name="+vdpaId+",num-queues=16,queue-size=128,writable="+writable
+    _, _, err := system(
+        "/usr/bin/qemu-storage-daemon", "--daemonize", "--pidfile", pidFile,
+        "--blockdev", string(blockdevJSON), "--export", exportSpec,
+    )
+    return err
+}
+
+// mapVduse maps the image volName as a local block device through VDUSE.
+//
+// Steps: create a uniquely named state file in ns.stateDir (its base name without
+// ".json" becomes the VDPA ID), start qemu-storage-daemon, add the device to the
+// VDPA bus, locate the resulting /dev/<name> block device and finally write the
+// DeviceState JSON into the state file.
+//
+// Returns (block device path, VDPA ID, nil) on success. On any failure everything
+// created so far is rolled back (bus device, daemon, state file, PID file) and the
+// first error is returned.
+func (ns *NodeServer) mapVduse(volName string, ctxVars map[string]string, readonly bool) (string, string, error)
+{
+    // Reserve a unique ID by creating an (initially empty) state file
+    stateFd, err := os.CreateTemp(ns.stateDir, "vitastor-vduse-*.json")
+    if (err != nil)
+    {
+        return "", "", err
+    }
+    stateFile := stateFd.Name()
+    stateFd.Close()
+    vdpaId := strings.TrimSuffix(filepath.Base(stateFile), ".json")
+    pidFile := ns.stateDir + vdpaId + ".pid"
+    // Map VDUSE device via qemu-storage-daemon
+    err = startStorageDaemon(vdpaId, volName, pidFile, ctxVars["configPath"], readonly)
+    if (err != nil)
+    {
+        // Do not leave the placeholder state file behind when the daemon fails to start
+        os.Remove(stateFile)
+        return "", "", err
+    }
+    // Add device to VDPA bus
+    busAdded := false
+    _, _, err = system("/sbin/vdpa", "-j", "dev", "add", "name", vdpaId, "mgmtdev", "vduse")
+    if (err == nil)
+    {
+        busAdded = true
+        // Find block device name
+        var matches []string
+        matches, err = filepath.Glob("/sys/bus/vdpa/devices/"+vdpaId+"/virtio*/block/*")
+        if (err == nil && len(matches) == 0)
+        {
+            err = errors.New("/sys/bus/vdpa/devices/"+vdpaId+"/virtio*/block/* is not found")
+        }
+        if (err == nil)
+        {
+            blockdev := "/dev/"+filepath.Base(matches[0])
+            _, err = os.Stat(blockdev)
+            if (err == nil)
+            {
+                // Persist the mapping so that it can be restored after a restart
+                var stateJSON []byte
+                stateJSON, err = json.Marshal(&DeviceState{
+                    ConfigPath: ctxVars["configPath"],
+                    VdpaId:     vdpaId,
+                    Image:      volName,
+                    Blockdev:   blockdev,
+                    Readonly:   readonly,
+                    PidFile:    pidFile,
+                })
+                if (err == nil)
+                {
+                    err = os.WriteFile(stateFile, stateJSON, 0600)
+                }
+                if (err == nil)
+                {
+                    return blockdev, vdpaId, nil
+                }
+            }
+        }
+    }
+    // Roll back: the bus device first, then the daemon serving it
+    if (busAdded)
+    {
+        _, _, delErr := system("/sbin/vdpa", "-j", "dev", "del", vdpaId)
+        if (delErr != nil)
+        {
+            klog.Errorf("Failed to remove VDPA device %s: %v", vdpaId, delErr)
+        }
+    }
+    killErr := killByPidFile(pidFile)
+    if (killErr != nil)
+    {
+        klog.Errorf("Failed to kill started qemu-storage-daemon: %v", killErr)
+    }
+    os.Remove(stateFile)
+    os.Remove(pidFile)
+    return "", "", err
+}
+
+// unmapVduse unmaps a VDUSE block device given its /dev/v... path.
+// The VDPA ID is the path component following "/vduse/" in the target of the
+// /sys/block/<name> symbolic link. Problems are logged and not returned.
+func (ns *NodeServer) unmapVduse(devicePath string)
+{
+    if (!strings.HasPrefix(devicePath, "/dev/v"))
+    {
+        klog.Errorf("%s does not start with /dev/v", devicePath)
+        return
+    }
+    linkTarget, err := os.Readlink("/sys/block/"+strings.TrimPrefix(devicePath, "/dev/"))
+    if (err != nil)
+    {
+        klog.Errorf("%s is not a symbolic link to VDUSE device (../devices/virtual/vduse/xxx): %v", devicePath, err)
+        return
+    }
+    // Cut everything up to and including "/vduse/", then take the text before the next slash
+    const marker = "/vduse/"
+    vdpaId := ""
+    start := strings.Index(linkTarget, marker)
+    if (start >= 0)
+    {
+        linkTarget = linkTarget[start+len(marker):]
+        end := strings.Index(linkTarget, "/")
+        if (end >= 0)
+        {
+            vdpaId = linkTarget[:end]
+        }
+    }
+    if (vdpaId != "")
+    {
+        ns.unmapVduseById(vdpaId)
+        return
+    }
+    klog.Errorf("%s is not a symbolic link to VDUSE device (../devices/virtual/vduse/xxx), but is %v", devicePath, linkTarget)
+}
+
+// unmapVduseById releases everything associated with a VDPA ID: the device on the
+// VDPA bus (if its sysfs entry can be stat'ed), the state file, and the storage
+// daemon referenced by the PID file. Problems are logged and not returned.
+func (ns *NodeServer) unmapVduseById(vdpaId string)
+{
+    _, statErr := os.Stat("/sys/bus/vdpa/devices/"+vdpaId)
+    if (statErr == nil)
+    {
+        _, _, _ = system("/sbin/vdpa", "-j", "dev", "del", vdpaId)
+    }
+    else
+    {
+        klog.Errorf("failed to stat /sys/bus/vdpa/devices/"+vdpaId+": %v", statErr)
+    }
+    os.Remove(ns.stateDir + vdpaId + ".json")
+    pidFile := ns.stateDir + vdpaId + ".pid"
+    _, statErr = os.Stat(pidFile)
+    if (os.IsNotExist(statErr))
+    {
+        // ok, already killed
+        return
+    }
+    if (statErr != nil)
+    {
+        klog.Errorf("Failed to stat %v: %v", pidFile, statErr)
+        return
+    }
+    killErr := killByPidFile(pidFile)
+    if (killErr != nil)
+    {
+        klog.Errorf("Failed to kill started qemu-storage-daemon: %v", killErr)
+    }
+    os.Remove(pidFile)
+}
+
+// restoreVduseDaemons reconciles the state files in ns.stateDir with the devices
+// currently listed by `vdpa -j dev list`:
+//   - state file whose device is still on the bus but whose daemon does not answer
+//     signal 0: the storage daemon is restarted from the saved DeviceState;
+//   - state file whose device is no longer on the bus: everything is cleaned up.
+// All problems are logged; nothing is returned.
+func (ns *NodeServer) restoreVduseDaemons()
+{
+    pattern := ns.stateDir+"vitastor-vduse-*.json"
+    matches, err := filepath.Glob(pattern)
+    if (err != nil)
+    {
+        klog.Errorf("failed to list %s: %v", pattern, err)
+        return
+    }
+    if (len(matches) == 0)
+    {
+        return
+    }
+    // example output: {"dev":{"test1":{"type":"block","mgmtdev":"vduse","vendor_id":0,"max_vqs":16,"max_vq_size":128}}}
+    devListJSON, _, err := system("/sbin/vdpa", "-j", "dev", "list")
+    if (err != nil)
+    {
+        klog.Errorf("/sbin/vdpa -j dev list failed: %v", err)
+        return
+    }
+    devList := make(map[string]interface{})
+    err = json.Unmarshal(devListJSON, &devList)
+    devs, ok := devList["dev"].(map[string]interface{})
+    if (err != nil || !ok)
+    {
+        klog.Errorf("/sbin/vdpa -j dev list returned bad JSON (error %v): %v", err, string(devListJSON))
+        return
+    }
+    for _, stateFile := range matches
+    {
+        vdpaId := strings.TrimSuffix(filepath.Base(stateFile), ".json")
+        // Check if VDPA device is still added to the bus
+        if (devs[vdpaId] == nil)
+        {
+            // Unused, clean it up
+            ns.unmapVduseById(vdpaId)
+            continue
+        }
+        // Check if the storage daemon is still active (signal 0 only probes the process)
+        pidFile := ns.stateDir + vdpaId + ".pid"
+        proc, err := findByPidFile(pidFile)
+        if (err == nil && proc.Signal(syscall.Signal(0)) == nil)
+        {
+            continue
+        }
+        // Restart daemon
+        stateJSON, err := os.ReadFile(stateFile)
+        if (err != nil)
+        {
+            klog.Warningf("error reading state file %v: %v", stateFile, err)
+            continue
+        }
+        var state DeviceState
+        err = json.Unmarshal(stateJSON, &state)
+        if (err != nil)
+        {
+            klog.Warningf("state file %v contains invalid JSON (error %v): %v", stateFile, err, string(stateJSON))
+            continue
+        }
+        klog.Warningf("restarting storage daemon for volume %v (VDPA ID %v)", state.Image, vdpaId)
+        err = startStorageDaemon(vdpaId, state.Image, pidFile, state.ConfigPath, state.Readonly)
+        if (err != nil)
+        {
+            // Keep going with other devices, but make the failure visible
+            klog.Errorf("failed to restart storage daemon for volume %v (VDPA ID %v): %v", state.Image, vdpaId, err)
+        }
+    }
+}
+
 // NodePublishVolume mounts the volume mounted to the staging path to the target path
 func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolumeRequest) (*csi.NodePublishVolumeResponse, error)
 {
@@ -70,10 +446,10 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
    isBlock := req.GetVolumeCapability().GetBlock() != nil

    // Check that it's not already mounted
-    _, error := mount.IsNotMountPoint(ns.mounter, targetPath)
-    if (error != nil)
+    _, err := mount.IsNotMountPoint(ns.mounter, targetPath)
+    if (err != nil)
    {
-        if (os.IsNotExist(error))
+        if (os.IsNotExist(err))
        {
            if (isBlock)
            {
@@ -81,13 +457,13 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
                if (err != nil)
                {
                    klog.Errorf("failed to create block device mount target %s with error: %v", targetPath, err)
-                    return nil, status.Error(codes.Internal, err.Error())
+                    return nil, err
                }
                err = pathFile.Close()
                if (err != nil)
                {
                    klog.Errorf("failed to close %s with error: %v", targetPath, err)
-                    return nil, status.Error(codes.Internal, err.Error())
+                    return nil, err
                }
            }
            else
@@ -96,121 +472,112 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
                if (err != nil)
                {
                    klog.Errorf("failed to create fs mount target %s with error: %v", targetPath, err)
-                    return nil, status.Error(codes.Internal, err.Error())
+                    return nil, err
                }
            }
        }
        else
        {
-            return nil, status.Error(codes.Internal, error.Error())
+            return nil, err
        }
    }

    ctxVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
+    err = json.Unmarshal([]byte(req.VolumeId), &ctxVars)
    if (err != nil)
    {
        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
    }
    volName := ctxVars["name"]

-    _, etcdUrl, etcdPrefix := GetConnectionParams(ctxVars)
-    if (len(etcdUrl) == 0)
-    {
-        return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
-    }
-
-    // Map NBD device
-    // FIXME: Check if already mapped
-    args := []string{
-        "map", "--etcd_address", strings.Join(etcdUrl, ","),
-        "--etcd_prefix", etcdPrefix,
-        "--image", volName,
-    };
-    if (ctxVars["configPath"] != "")
-    {
-        args = append(args, "--config_path", ctxVars["configPath"])
-    }
-    if (req.GetReadonly())
-    {
-        args = append(args, "--readonly", "1")
-    }
-    c := exec.Command("/usr/bin/vitastor-nbd", args...)
-    var stdout, stderr bytes.Buffer
-    c.Stdout, c.Stderr = &stdout, &stderr
-    err = c.Run()
-    stdoutStr, stderrStr := string(stdout.Bytes()), string(stderr.Bytes())
+    _, err = GetConnectionParams(ctxVars)
    if (err != nil)
    {
-        klog.Errorf("vitastor-nbd map failed: %s, status %s\n", stdoutStr+stderrStr, err)
-        return nil, status.Error(codes.Internal, stdoutStr+stderrStr+" (status "+err.Error()+")")
-    }
-    devicePath := strings.TrimSpace(stdoutStr)
-
-    // Check existing format
-    diskMounter := &mount.SafeFormatAndMount{Interface: ns.mounter, Exec: utilexec.New()}
-    existingFormat, err := diskMounter.GetDiskFormat(devicePath)
-    if (err != nil)
-    {
-        klog.Errorf("failed to get disk format for path %s, error: %v", err)
-        // unmap NBD device
-        unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
-        if (unmapErr != nil)
-        {
-            klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
-        }
        return nil, err
    }

-    // Format the device (ext4 or xfs)
-    fsType := req.GetVolumeCapability().GetMount().GetFsType()
-    opt := req.GetVolumeCapability().GetMount().GetMountFlags()
-    opt = append(opt, "_netdev")
-    if ((req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY ||
-        req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_SINGLE_NODE_READER_ONLY) &&
-        !Contains(opt, "ro"))
+    var devicePath, vdpaId string
+    if (!ns.useVduse)
    {
-        opt = append(opt, "ro")
-    }
-    if (fsType == "xfs")
-    {
-        opt = append(opt, "nouuid")
-    }
-    readOnly := Contains(opt, "ro")
-    if (existingFormat == "" && !readOnly)
-    {
-        args := []string{}
-        switch fsType
-        {
-            case "ext4":
-                args = []string{"-m0", "-Enodiscard,lazy_itable_init=1,lazy_journal_init=1", devicePath}
-            case "xfs":
-                args = []string{"-K", devicePath}
-        }
-        if (len(args) > 0)
-        {
-            cmdOut, cmdErr := diskMounter.Exec.Command("mkfs."+fsType, args...).CombinedOutput()
-            if (cmdErr != nil)
-            {
-                klog.Errorf("failed to run mkfs error: %v, output: %v", cmdErr, string(cmdOut))
-                // unmap NBD device
-                unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
-                if (unmapErr != nil)
-                {
-                    klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
-                }
-                return nil, status.Error(codes.Internal, cmdErr.Error())
-            }
-        }
-    }
-    if (isBlock)
-    {
-        opt = append(opt, "bind")
-        err = diskMounter.Mount(devicePath, targetPath, fsType, opt)
+        devicePath, err = ns.mapNbd(volName, ctxVars, req.GetReadonly())
    }
    else
    {
+        devicePath, vdpaId, err = ns.mapVduse(volName, ctxVars, req.GetReadonly())
+    }
+    if (err != nil)
+    {
+        return nil, err
+    }
+
+    diskMounter := &mount.SafeFormatAndMount{Interface: ns.mounter, Exec: utilexec.New()}
+    if (isBlock)
+    {
+        err = diskMounter.Mount(devicePath, targetPath, "", []string{"bind"})
+    }
+    else
+    {
+        // Check existing format
+        existingFormat, err := diskMounter.GetDiskFormat(devicePath)
+        if (err != nil)
+        {
+            klog.Errorf("failed to get disk format for path %s, error: %v", err)
+            goto unmap
+        }
+
+        // Format the device (ext4 or xfs)
+        fsType := req.GetVolumeCapability().GetMount().GetFsType()
+        opt := req.GetVolumeCapability().GetMount().GetMountFlags()
+        opt = append(opt, "_netdev")
+        if ((req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY ||
+            req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_SINGLE_NODE_READER_ONLY) &&
+            !Contains(opt, "ro"))
+        {
+            opt = append(opt, "ro")
+        }
+        if (fsType == "xfs")
+        {
+            opt = append(opt, "nouuid")
+        }
+        readOnly := Contains(opt, "ro")
+        if (existingFormat == "" && !readOnly)
+        {
+            var cmdOut []byte
+            switch fsType
+            {
+                case "ext4":
+                    args := []string{"-m0", "-Enodiscard,lazy_itable_init=1,lazy_journal_init=1", devicePath}
+                    cmdOut, err = diskMounter.Exec.Command("mkfs.ext4", args...).CombinedOutput()
+                case "xfs":
+                    cmdOut, err = diskMounter.Exec.Command("mkfs.xfs", "-K", devicePath).CombinedOutput()
+            }
+            if (err != nil)
+            {
+                klog.Errorf("failed to run mkfs error: %v, output: %v", err, string(cmdOut))
+                goto unmap
+            }
+        }
+
        err = diskMounter.FormatAndMount(devicePath, targetPath, fsType, opt)
+
+        // Try to run online resize on mount.
+        // FIXME: Implement online resize. It requires online resize support in vitastor-nbd.
+        if (err == nil && existingFormat != "" && !readOnly)
+        {
+            var cmdOut []byte
+            switch (fsType)
+            {
+                case "ext4":
+                    cmdOut, err = diskMounter.Exec.Command("resize2fs", devicePath).CombinedOutput()
+                case "xfs":
+                    cmdOut, err = diskMounter.Exec.Command("xfs_growfs", devicePath).CombinedOutput()
+            }
+            if (err != nil)
+            {
+                klog.Errorf("failed to run resizefs error: %v, output: %v", err, string(cmdOut))
+                goto unmap
+            }
+        }
    }
    if (err != nil)
    {
@@ -218,15 +585,20 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
            "failed to mount device path (%s) to path (%s) for volume (%s) error: %s",
            devicePath, targetPath, volName, err,
        )
-        // unmap NBD device
-        unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
-        if (unmapErr != nil)
-        {
-            klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
-        }
-        return nil, status.Error(codes.Internal, err.Error())
+        goto unmap
    }
    return &csi.NodePublishVolumeResponse{}, nil
+
+unmap:
+    if (!ns.useVduse || len(devicePath) >= 8 && devicePath[0:8] == "/dev/nbd")
+    {
+        ns.unmapNbd(devicePath)
+    }
+    else
+    {
+        ns.unmapVduseById(vdpaId)
+    }
+    return nil, err
 }

 // NodeUnpublishVolume unmounts the volume from the target path
@@ -241,25 +613,31 @@ func (ns *NodeServer) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpu
        {
            return nil, status.Error(codes.NotFound, "Target path not found")
        }
-        return nil, status.Error(codes.Internal, err.Error())
+        return nil, err
    }
    if (devicePath == "")
    {
-        return nil, status.Error(codes.NotFound, "Volume not mounted")
+        // volume not mounted
+        klog.Warningf("%s is not a mountpoint, deleting", targetPath)
+        os.Remove(targetPath)
+        return &csi.NodeUnpublishVolumeResponse{}, nil
    }
    // unmount
    err = mount.CleanupMountPoint(targetPath, ns.mounter, false)
    if (err != nil)
    {
-        return nil, status.Error(codes.Internal, err.Error())
+        return nil, err
    }
    // unmap NBD device
    if (refCount == 1)
    {
-        unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
-        if (unmapErr != nil)
+        if (!ns.useVduse)
        {
-            klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
+            ns.unmapNbd(devicePath)
+        }
+        else
+        {
+            ns.unmapVduse(devicePath)
        }
    }
    return &csi.NodeUnpublishVolumeResponse{}, nil
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,10 +1,10 @@
-vitastor (0.9.6-1) unstable; urgency=medium
+vitastor (1.3.1-1) unstable; urgency=medium

  * Bugfixes

 -- Vitaliy Filippov <vitalif@yourcmc.ru>  Fri, 03 Jun 2022 02:09:44 +0300

-vitastor (0.9.6-1) unstable; urgency=medium
+vitastor (0.7.0-1) unstable; urgency=medium

  * Implement NFS proxy
  * Add documentation
--- a/debian/control
+++ b/debian/control
@@ -2,7 +2,7 @@ Source: vitastor
 Section: admin
 Priority: optional
 Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
-Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev, libisal-dev
+Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev, libisal-dev, cmake, pkg-config
 Standards-Version: 4.5.0
 Homepage: https://vitastor.io/
 Rules-Requires-Root: no
--- a/debian/patched-qemu.Dockerfile
+++ b/debian/patched-qemu.Dockerfile
@@ -7,7 +7,7 @@ ARG REL=

 WORKDIR /root

-RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" ]; then \
+RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" -o "$REL" = "bookworm" ]; then \
        echo "deb http://deb.debian.org/debian $REL-backports main" >> /etc/apt/sources.list; \
        echo >> /etc/apt/preferences; \
        echo 'Package: *' >> /etc/apt/preferences; \
@@ -45,7 +45,7 @@ RUN set -e; \
    rm -rf /root/packages/qemu-$REL/*; \
    cd /root/packages/qemu-$REL; \
    dpkg-source -x /root/qemu*.dsc; \
-    QEMU_VER=$(ls -d qemu*/ | perl -pe 's!^.*(\d+\.\d+).*!$1!'); \
+    QEMU_VER=$(ls -d qemu*/ | perl -pe 's!^.*?(\d+\.\d+).*!$1!'); \
    D=$(ls -d qemu*/); \
    cp /root/vitastor/patches/qemu-$QEMU_VER-vitastor.patch ./qemu-*/debian/patches; \
    echo qemu-$QEMU_VER-vitastor.patch >> $D/debian/patches/series; \
@@ -54,7 +54,8 @@ RUN set -e; \
    quilt add block/vitastor.c; \
    cp /root/vitastor/src/qemu_driver.c block/vitastor.c; \
    quilt refresh; \
-    V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor3; \
+    V=$(head -n1 debian/changelog | perl -pe 's/5\.2\+dfsg-9/5.2+dfsg-11/; s/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor4; \
+    if [ "$REL" = bullseye ]; then V=${V}bullseye; fi; \
    DEBEMAIL="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v $V 'Plug Vitastor block driver'; \
    DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
    rm -rf /root/packages/qemu-$REL/qemu-*/
--- a/debian/vitastor.Dockerfile
+++ b/debian/vitastor.Dockerfile
@@ -35,8 +35,8 @@ RUN set -e -x; \
    mkdir -p /root/packages/vitastor-$REL; \
    rm -rf /root/packages/vitastor-$REL/*; \
    cd /root/packages/vitastor-$REL; \
-    cp -r /root/vitastor vitastor-0.9.6; \
-    cd vitastor-0.9.6; \
+    cp -r /root/vitastor vitastor-1.3.1; \
+    cd vitastor-1.3.1; \
    ln -s /root/fio-build/fio-*/ ./fio; \
    FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
    ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -49,8 +49,8 @@ RUN set -e -x; \
    rm -rf a b; \
    echo "dep:fio=$FIO" > debian/fio_version; \
    cd /root/packages/vitastor-$REL; \
-    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.9.6.orig.tar.xz vitastor-0.9.6; \
-    cd vitastor-0.9.6; \
+    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.3.1.orig.tar.xz vitastor-1.3.1; \
+    cd vitastor-1.3.1; \
    V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
    DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
    DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
--- a/docs/config.en.md
+++ b/docs/config.en.md
@@ -33,6 +33,7 @@ In the future, additional configuration methods may be added:

 - [Common](config/common.en.md)
 - [Network](config/network.en.md)
+- [Client](config/client.en.md)
 - [Global Disk Layout](config/layout-cluster.en.md)
 - [OSD Disk Layout](config/layout-osd.en.md)
 - [OSD Runtime Parameters](config/osd.en.md)
--- a/docs/config.ru.md
+++ b/docs/config.ru.md
@@ -36,6 +36,7 @@

 - [Общие](config/common.ru.md)
 - [Сеть](config/network.ru.md)
+- [Клиентский код](config/client.ru.md)
 - [Глобальные дисковые параметры](config/layout-cluster.ru.md)
 - [Дисковые параметры OSD](config/layout-osd.ru.md)
 - [Прочие параметры OSD](config/osd.ru.md)
--- a/docs/config/client.en.md
+++ b/docs/config/client.en.md
@@ -0,0 +1,137 @@
+[Documentation](../../README.md#documentation) → [Configuration](../config.en.md) → Client Parameters
+
+-----
+
+[Читать на русском](client.ru.md)
+
+# Client Parameters
+
+These parameters apply only to clients and affect their interaction with
+the cluster.
+
+- [client_max_dirty_bytes](#client_max_dirty_bytes)
+- [client_max_dirty_ops](#client_max_dirty_ops)
+- [client_enable_writeback](#client_enable_writeback)
+- [client_max_buffered_bytes](#client_max_buffered_bytes)
+- [client_max_buffered_ops](#client_max_buffered_ops)
+- [client_max_writeback_iodepth](#client_max_writeback_iodepth)
+- [nbd_timeout](#nbd_timeout)
+- [nbd_max_devices](#nbd_max_devices)
+- [nbd_max_part](#nbd_max_part)
+
+## client_max_dirty_bytes
+
+- Type: integer
+- Default: 33554432
+- Can be changed online: yes
+
+Without [immediate_commit](layout-cluster.en.md#immediate_commit)=all this parameter sets the limit of "dirty"
+(not committed by fsync) data allowed by the client before forcing an
+additional fsync and committing the data. Also note that the client always
+holds a copy of uncommitted data in memory so this setting also affects
+RAM usage of clients.
+
+## client_max_dirty_ops
+
+- Type: integer
+- Default: 1024
+- Can be changed online: yes
+
+Same as client_max_dirty_bytes, but instead of total size, limits the number
+of uncommitted write operations.
+
+## client_enable_writeback
+
+- Type: boolean
+- Default: false
+- Can be changed online: yes
+
+This parameter enables client-side write buffering. This means that write
+requests are accumulated in memory for a short time before being sent to
+a Vitastor cluster, which allows sending them in parallel and increases
+performance of some applications. Writes are buffered until the client forces
+a flush with fsync() or until the amount of buffered writes exceeds the
+limit.
+
+Write buffering significantly increases performance of some applications,
+for example, CrystalDiskMark under Windows (LOL :-D), but also any other
+applications if they do writes in one of two non-optimal ways: either if
+they do a lot of small (4 KB or so) sequential writes, or if they do a lot
+of small random writes, but without any parallelism or asynchrony, and also
+without calling fsync().
+
+With write buffering enabled, you can expect around 22000 T1Q1 random write
+iops in QEMU more or less regardless of the quality of your SSDs, and this
+number is in fact bound by QEMU itself rather than Vitastor (check it
+yourself by adding a "driver=null-co" disk in QEMU). Without write
+buffering, the current record is 9900 iops, but the number is usually
+even lower with non-ideal hardware, for example, it may be 5000 iops.
+
+Even when this parameter is enabled, write buffering isn't enabled until
+the client explicitly allows it, because enabling it without the client
+being aware of the fact that its writes may be buffered may lead to data
+loss. Because of this, older versions of clients don't support write
+buffering at all, newer versions of the QEMU driver allow write buffering
+only if it's enabled in disk settings with `-blockdev cache.direct=false`,
+and newer versions of FIO only allow write buffering if you don't specify
+`-direct=1`. NBD and NFS drivers allow write buffering by default.
+
+You can also override this restriction with the `client_writeback_allowed`
+parameter, but you shouldn't do that unless you **really** know what you
+are doing.
+
+## client_max_buffered_bytes
+
+- Type: integer
+- Default: 33554432
+- Can be changed online: yes
+
+Maximum total size of buffered writes which triggers write-back when reached.
+
+## client_max_buffered_ops
+
+- Type: integer
+- Default: 1024
+- Can be changed online: yes
+
+Maximum number of buffered writes which triggers write-back when reached.
+Multiple consecutive modified data regions are counted as 1 write here.
+
+## client_max_writeback_iodepth
+
+- Type: integer
+- Default: 256
+- Can be changed online: yes
+
+Maximum number of parallel writes when flushing buffered data to the server.
+
+## nbd_timeout
+
+- Type: seconds
+- Default: 300
+
+Timeout for I/O operations for [NBD](../usage/nbd.en.md). If an operation
+executes for longer than this timeout, including when your cluster is just
+temporarily down for longer than the timeout, the NBD device will detach by itself
+(and possibly break the mounted file system).
+
+You can set timeout to 0 to never detach, but in that case you won't be
+able to remove the kernel device at all if the NBD process dies - you'll have
+to reboot the host.
+
+## nbd_max_devices
+
+- Type: integer
+- Default: 64
+
+Maximum number of NBD devices in the system. This value is passed as
+`nbds_max` parameter for the nbd kernel module when vitastor-nbd autoloads it.
+
+## nbd_max_part
+
+- Type: integer
+- Default: 3
+
+Maximum number of partitions per NBD device. This value is passed as
+`max_part` parameter for the nbd kernel module when vitastor-nbd autoloads it.
+Note that (nbds_max)*(1+max_part) usually can't exceed 256.
--- a/docs/config/client.ru.md
+++ b/docs/config/client.ru.md
@@ -0,0 +1,137 @@
+[Документация](../../README-ru.md#документация) → [Конфигурация](../config.ru.md) → Параметры клиентского кода
+
+-----
+
+[Read in English](client.en.md)
+
+# Параметры клиентского кода
+
+Данные параметры применяются только к клиентам Vitastor (QEMU, fio, NBD) и
+затрагивают логику их работы с кластером.
+
+- [client_max_dirty_bytes](#client_max_dirty_bytes)
+- [client_max_dirty_ops](#client_max_dirty_ops)
+- [client_enable_writeback](#client_enable_writeback)
+- [client_max_buffered_bytes](#client_max_buffered_bytes)
+- [client_max_buffered_ops](#client_max_buffered_ops)
+- [client_max_writeback_iodepth](#client_max_writeback_iodepth)
+- [nbd_timeout](#nbd_timeout)
+- [nbd_max_devices](#nbd_max_devices)
+- [nbd_max_part](#nbd_max_part)
+
+## client_max_dirty_bytes
+
+- Тип: целое число
+- Значение по умолчанию: 33554432
+- Можно менять на лету: да
+
+При работе без [immediate_commit](layout-cluster.ru.md#immediate_commit)=all - это лимит объёма "грязных" (не
+зафиксированных fsync-ом) данных, при достижении которого клиент будет
+принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
+что в этом случае до момента fsync клиент хранит копию незафиксированных
+данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
+
+## client_max_dirty_ops
+
+- Тип: целое число
+- Значение по умолчанию: 1024
+- Можно менять на лету: да
+
+Аналогично client_max_dirty_bytes, но ограничивает количество
+незафиксированных операций записи вместо их общего объёма.
+
+## client_enable_writeback
+
+- Тип: булево (да/нет)
+- Значение по умолчанию: false
+- Можно менять на лету: да
+
+Данный параметр разрешает включать буферизацию записи в памяти. Буферизация
+означает, что операции записи отправляются на кластер Vitastor не сразу, а
+могут небольшое время накапливаться в памяти и сбрасываться сразу пакетами,
+до тех пор, пока либо не будет превышен лимит неотправленных записей, либо
+пока клиент не вызовет fsync.
+
+Буферизация значительно повышает производительность некоторых приложений,
+например, CrystalDiskMark в Windows (ха-ха :-D), но также и любых других,
+которые пишут на диск неоптимально: либо последовательно, но мелкими блоками
+(например, по 4 кб), либо случайно, но без параллелизма и без fsync - то
+есть, например, отправляя 128 операций записи в разные места диска, но не
+все сразу с помощью асинхронного I/O, а по одной.
+
+В QEMU с буферизацией записи можно ожидать показателя примерно 22000
+операций случайной записи в секунду в 1 поток и с глубиной очереди 1 (T1Q1)
+без fsync, почти вне зависимости от того, насколько хороши ваши диски - эта
+цифра упирается в сам QEMU. Без буферизации рекорд пока что - 9900 операций
+в секунду, но на железе похуже может быть и поменьше, например, 5000 операций
+в секунду.
+
+При этом, даже если данный параметр включён, буферизация не включается, если
+явно не разрешена клиентом, т.к. если клиент не знает, что запросы записи
+буферизуются, это может приводить к потере данных. Поэтому в старых версиях
+клиентских драйверов буферизация записи не включается вообще, в новых
+версиях QEMU-драйвера включается, только если разрешена опцией диска
+`-blockdev cache.direct=false`, а в fio - только если нет опции `-direct=1`.
+В NBD и NFS драйверах буферизация записи разрешена по умолчанию.
+
+Можно обойти и это ограничение с помощью параметра `client_writeback_allowed`,
+но делать так не надо, если только вы не уверены в том, что делаете, на все
+100%. :-)
+
+## client_max_buffered_bytes
+
+- Тип: целое число
+- Значение по умолчанию: 33554432
+- Можно менять на лету: да
+
+Максимальный общий размер буферизованных записей, при достижении которого
+начинается процесс сброса данных на сервер.
+
+## client_max_buffered_ops
+
+- Тип: целое число
+- Значение по умолчанию: 1024
+- Можно менять на лету: да
+
+Максимальное количество буферизованных записей, при достижении которого
+начинается процесс сброса данных на сервер. При этом несколько
+последовательных изменённых областей здесь считаются 1 записью.
+
+## client_max_writeback_iodepth
+
+- Тип: целое число
+- Значение по умолчанию: 256
+- Можно менять на лету: да
+
+Максимальное число параллельных операций записи при сбросе буферов на сервер.
+
+## nbd_timeout
+
+- Тип: секунды
+- Значение по умолчанию: 300
+
+Таймаут для операций чтения/записи через [NBD](../usage/nbd.ru.md). Если
+операция выполняется дольше таймаута, включая временную недоступность
+кластера на время, большее таймаута, NBD-устройство отключится само собой
+(и, возможно, сломает примонтированную ФС).
+
+Вы можете установить таймаут в 0, чтобы никогда не отключать устройство по
+таймауту, но в этом случае вы вообще не сможете удалить устройство, если
+процесс NBD умрёт - вам придётся перезагружать сервер.
+
+## nbd_max_devices
+
+- Тип: целое число
+- Значение по умолчанию: 64
+
+Максимальное число NBD-устройств в системе. Данное значение передаётся
+модулю ядра nbd как параметр `nbds_max`, когда его загружает vitastor-nbd.
+
+## nbd_max_part
+
+- Тип: целое число
+- Значение по умолчанию: 3
+
+Максимальное число разделов на одном NBD-устройстве. Данное значение передаётся
+модулю ядра nbd как параметр `max_part`, когда его загружает vitastor-nbd.
+Имейте в виду, что (nbds_max)*(1+max_part) обычно не может превышать 256.
--- a/docs/config/layout-cluster.en.md
+++ b/docs/config/layout-cluster.en.md
@@ -96,8 +96,9 @@ SSD cache or "media-cache" - for example, a lot of Seagate EXOS drives have
 it (they have internal SSD cache even though it's not stated in datasheets).

 Setting this parameter to "all" or "small" in OSD parameters requires enabling
-disable_journal_fsync and disable_meta_fsync, setting it to "all" also requires
-enabling disable_data_fsync.
+[disable_journal_fsync](layout-osd.en.md#disable_journal_fsync) and
+[disable_meta_fsync](layout-osd.en.md#disable_meta_fsync), setting it to
+"all" also requires enabling [disable_data_fsync](layout-osd.en.md#disable_data_fsync).

 TLDR: For optimal performance, set immediate_commit to "all" if you only use
 SSDs with supercapacitor-based power loss protection (nonvolatile
--- a/docs/config/layout-cluster.ru.md
+++ b/docs/config/layout-cluster.ru.md
@@ -103,8 +103,9 @@ HDD-дисках с внутренним SSD или "медиа" кэшем - н
 указано в спецификациях).

 Указание "all" или "small" в настройках / командной строке OSD требует
-включения disable_journal_fsync и disable_meta_fsync, значение "all" также
-требует включения disable_data_fsync.
+включения [disable_journal_fsync](layout-osd.ru.md#disable_journal_fsync) и
+[disable_meta_fsync](layout-osd.ru.md#disable_meta_fsync), значение "all"
+также требует включения [disable_data_fsync](layout-osd.ru.md#disable_data_fsync).

 Итого, вкратце: для оптимальной производительности установите
 immediate_commit в значение "all", если вы используете в кластере только SSD
--- a/docs/config/layout-osd.en.md
+++ b/docs/config/layout-osd.en.md
@@ -24,6 +24,8 @@ initialization and can't be changed after it without losing data.
 - [disable_journal_fsync](#disable_journal_fsync)
 - [disable_device_lock](#disable_device_lock)
 - [disk_alignment](#disk_alignment)
+- [data_csum_type](#data_csum_type)
+- [csum_block_size](#csum_block_size)

 ## data_device

@@ -174,3 +176,43 @@ Intel Optane (probably, not tested yet).

 Clients don't need to be aware of disk_alignment, so it's not required to
 put a modified value into etcd key /vitastor/config/global.
+
+## data_csum_type
+
+- Type: string
+- Default: none
+
+Data checksum type to use. May be "crc32c" or "none". Set to "crc32c" to
+enable data checksums.
+
+## csum_block_size
+
+- Type: integer
+- Default: 4096
+
+Checksum calculation block size.
+
+Must be equal to or a multiple of [bitmap_granularity](layout-cluster.en.md#bitmap_granularity)
+(which is usually 4 KB).
+
+Checksums increase metadata size by 4 bytes per each csum_block_size of data.
+
+Checksums are always a tradeoff:
+1. You either sacrifice +1 GB RAM per 1 TB of data
+2. Or you raise csum_block_size, for example, to 32k and sacrifice
+   50% random write iops due to checksum read-modify-write
+3. Or you turn off [inmemory_metadata](osd.en.md#inmemory_metadata) and
+   sacrifice 50% random read iops due to checksum reads
+
+All-flash clusters usually have enough RAM to use default csum_block_size,
+which uses 1 GB RAM per 1 TB of data. HDD clusters usually don't.
+
+Thus, recommended setups are:
+1. All-flash, 1 GB RAM per 1 TB data: default (csum_block_size=4k)
+2. All-flash, less RAM: csum_block_size=4k + inmemory_metadata=false
+3. Hybrid HDD+SSD: csum_block_size=4k + inmemory_metadata=false
+4. HDD-only, faster random read: csum_block_size=32k
+5. HDD-only, faster random write: csum_block_size=4k +
+   inmemory_metadata=false + meta_io=cached
+
+See also [meta_io](osd.en.md#meta_io).
--- a/docs/config/layout-osd.ru.md
+++ b/docs/config/layout-osd.ru.md
@@ -25,6 +25,8 @@
 - [disable_journal_fsync](#disable_journal_fsync)
 - [disable_device_lock](#disable_device_lock)
 - [disk_alignment](#disk_alignment)
+- [data_csum_type](#data_csum_type)
+- [csum_block_size](#csum_block_size)

 ## data_device

@@ -183,3 +185,47 @@ journal_block_size и meta_block_size. Однако единственные SSD

 Клиентам не обязательно знать про disk_alignment, так что помещать значение
 этого параметра в etcd в /vitastor/config/global не нужно.
+
+## data_csum_type
+
+- Тип: строка
+- Значение по умолчанию: none
+
+Тип используемых OSD контрольных сумм данных. Может быть "crc32c" или "none".
+Установите в "crc32c", чтобы включить расчёт и проверку контрольных сумм данных.
+
+Следует понимать, что контрольные суммы в зависимости от размера блока их
+расчёта либо увеличивают потребление памяти, либо снижают производительность.
+Подробнее смотрите в описании параметра [csum_block_size](#csum_block_size).
+
+## csum_block_size
+
+- Тип: целое число
+- Значение по умолчанию: 4096
+
+Размер блока расчёта контрольных сумм.
+
+Должен быть равен или кратен [bitmap_granularity](layout-cluster.ru.md#bitmap_granularity)
+(который обычно равен 4 КБ).
+
+Контрольные суммы увеличивают размер метаданных на 4 байта на каждые
+csum_block_size данных.
+
+Контрольные суммы - это всегда компромисс:
+1. Вы либо жертвуете потреблением +1 ГБ памяти на 1 ТБ дискового пространства
+2. Либо вы повышаете csum_block_size до, скажем, 32k и жертвуете 50%
+   скорости случайной записи из-за цикла чтения-изменения-записи для расчёта
+   новых контрольных сумм
+3. Либо вы отключаете [inmemory_metadata](osd.ru.md#inmemory_metadata) и
+   жертвуете 50% скорости случайного чтения из-за чтения контрольных сумм
+   с диска
+
+Таким образом, рекомендуются следующие варианты настроек:
+1. All-flash, 1 ГБ памяти на 1 ТБ данных: по умолчанию (csum_block_size=4k)
+2. All-flash, меньше памяти: csum_block_size=4k + inmemory_metadata=false
+3. Гибридные HDD+SSD: csum_block_size=4k + inmemory_metadata=false
+4. Только HDD, быстрее случайное чтение: csum_block_size=32k
+5. Только HDD, быстрее случайная запись: csum_block_size=4k +
+   inmemory_metadata=false + meta_io=cached
+
+Смотрите также [meta_io](osd.ru.md#meta_io).
--- a/docs/config/network.en.md
+++ b/docs/config/network.en.md
@@ -20,6 +20,7 @@ between clients, OSDs and etcd.
 - [rdma_max_msg](#rdma_max_msg)
 - [rdma_max_recv](#rdma_max_recv)
 - [rdma_max_send](#rdma_max_send)
+- [rdma_odp](#rdma_odp)
 - [peer_connect_interval](#peer_connect_interval)
 - [peer_connect_timeout](#peer_connect_timeout)
 - [osd_idle_timeout](#osd_idle_timeout)
@@ -30,7 +31,6 @@ between clients, OSDs and etcd.
 - [etcd_slow_timeout](#etcd_slow_timeout)
 - [etcd_keepalive_timeout](#etcd_keepalive_timeout)
 - [etcd_ws_keepalive_timeout](#etcd_ws_keepalive_timeout)
- [client_dirty_limit](#client_dirty_limit)

 ## tcp_header_buffer_size

@@ -69,11 +69,14 @@ but they are not connected to the cluster.
 - Type: string

 RDMA device name to use for Vitastor OSD communications (for example,
-"rocep5s0f0"). Please note that Vitastor RDMA requires Implicit On-Demand
-Paging (Implicit ODP) and Scatter/Gather (SG) support from the RDMA device
-to work. For example, Mellanox ConnectX-3 and older adapters don't have
-Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
-root to list available RDMA devices and their features.
+"rocep5s0f0"). Now Vitastor supports all adapters, even ones without
+ODP support, like Mellanox ConnectX-3 and non-Mellanox cards.
+
+Versions up to Vitastor 1.2.0 required ODP which is only present in
+Mellanox ConnectX >= 4. See also [rdma_odp](#rdma_odp).
+
+Run `ibv_devinfo -v` as root to list available RDMA devices and their
+features.

 Remember that you also have to configure your network switches if you use
 RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
@@ -148,6 +151,28 @@ less than `rdma_max_recv` so the receiving side doesn't run out of buffers.
 Doesn't affect memory usage - additional memory isn't allocated for send
 operations.

+## rdma_odp
+
+- Type: boolean
+- Default: false
+
+Use RDMA with On-Demand Paging. ODP is currently only available on Mellanox
+ConnectX-4 and newer adapters. ODP allows to not register memory explicitly
+for RDMA adapter to be able to use it. This, in turn, allows to skip memory
+copying during sending. One would think this should improve performance, but
+**in reality** RDMA performance with ODP is **drastically** worse. Example
+3-node cluster with 8 NVMe in each node and 2*25 GBit/s ConnectX-6 RDMA network
+without ODP pushes 3950000 read iops, but only 239000 iops with ODP...
+
+This happens because Mellanox ODP implementation seems to be based on
+message retransmissions when the adapter doesn't know about the buffer yet -
+it likely uses standard "RNR retransmissions" (RNR = receiver not ready)
+which is generally slow in RDMA/RoCE networks. Here's a presentation about
+it from ISPASS-2021 conference: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
+
+ODP support is retained in the code just in case a good ODP implementation
+appears one day.
+
 ## peer_connect_interval

 - Type: seconds
@@ -240,17 +265,3 @@ etcd_report_interval to guarantee that keepalive actually works.

 etcd websocket ping interval required to keep the connection alive and
 detect disconnections quickly.
-
-## client_dirty_limit
-
- Type: integer
- Default: 33554432
- Can be changed online: yes
-
-Without immediate_commit=all this parameter sets the limit of "dirty"
-(not committed by fsync) data allowed by the client before forcing an
-additional fsync and committing the data. Also note that the client always
-holds a copy of uncommitted data in memory so this setting also affects
-RAM usage of clients.
-
-This parameter doesn't affect OSDs themselves.
--- a/docs/config/network.ru.md
+++ b/docs/config/network.ru.md
@@ -20,6 +20,7 @@
 - [rdma_max_msg](#rdma_max_msg)
 - [rdma_max_recv](#rdma_max_recv)
 - [rdma_max_send](#rdma_max_send)
+- [rdma_odp](#rdma_odp)
 - [peer_connect_interval](#peer_connect_interval)
 - [peer_connect_timeout](#peer_connect_timeout)
 - [osd_idle_timeout](#osd_idle_timeout)
@@ -30,7 +31,6 @@
 - [etcd_slow_timeout](#etcd_slow_timeout)
 - [etcd_keepalive_timeout](#etcd_keepalive_timeout)
 - [etcd_ws_keepalive_timeout](#etcd_ws_keepalive_timeout)
- [client_dirty_limit](#client_dirty_limit)

 ## tcp_header_buffer_size

@@ -72,12 +72,15 @@ RDMA может быть нужно только если у клиентов е
 - Тип: строка

 Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
-Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
-Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Например,
-адаптеры Mellanox ConnectX-3 и более старые не поддерживают Implicit ODP и
-потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
-суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
-параметры и возможности.
+Сейчас Vitastor поддерживает все модели адаптеров, включая те, у которых
+нет поддержки ODP, то есть вы можете использовать RDMA с ConnectX-3 и
+картами производства не Mellanox.
+
+Версии Vitastor до 1.2.0 включительно требовали ODP, который есть только
+на Mellanox ConnectX 4 и более новых. См. также [rdma_odp](#rdma_odp).
+
+Запустите `ibv_devinfo -v` от имени суперпользователя, чтобы посмотреть
+список доступных RDMA-устройств, их параметры и возможности.

 Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
 правильно настроить для него коммутаторы, иначе вы можете столкнуться с
@@ -156,6 +159,29 @@ OSD в любом случае согласовывают реальное зн
 Не влияет на потребление памяти - дополнительная память на операции отправки
 не выделяется.

+## rdma_odp
+
+- Тип: булево (да/нет)
+- Значение по умолчанию: false
+
+Использовать RDMA с On-Demand Paging. ODP - функция, доступная пока что
+исключительно на адаптерах Mellanox ConnectX-4 и более новых. ODP позволяет
+не регистрировать память для её использования RDMA-картой. Благодаря этому
+можно не копировать данные при отправке их в сеть и, казалось бы, это должно
+улучшать производительность - но **по факту** получается так, что
+производительность только ухудшается, причём сильно. Пример - на 3-узловом
+кластере с 8 NVMe в каждом узле и сетью 2*25 Гбит/с на чтение с RDMA без ODP
+удаётся снять 3950000 iops, а с ODP - всего 239000 iops...
+
+Это происходит из-за того, что реализация ODP у Mellanox неоптимальная и
+основана на повторной передаче сообщений, когда карте не известен буфер -
+вероятно, на стандартных "RNR retransmission" (RNR = receiver not ready).
+А данные повторные передачи в RDMA/RoCE - всегда очень медленная штука.
+Презентация на эту тему с конференции ISPASS-2021: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
+
+Возможность использования ODP сохранена в коде на случай, если вдруг в один
+прекрасный день появится хорошая реализация ODP.
+
 ## peer_connect_interval

 - Тип: секунды
@@ -251,17 +277,3 @@ etcd_report_interval, чтобы keepalive гарантированно рабо
 - Можно менять на лету: да

 Интервал проверки живости вебсокет-подключений к etcd.
-
-## client_dirty_limit
-
- Тип: целое число
- Значение по умолчанию: 33554432
- Можно менять на лету: да
-
-При работе без immediate_commit=all - это лимит объёма "грязных" (не
-зафиксированных fsync-ом) данных, при достижении которого клиент будет
-принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
-что в этом случае до момента fsync клиент хранит копию незафиксированных
-данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
-
-Параметр не влияет на сами OSD.
--- a/docs/config/osd.en.md
+++ b/docs/config/osd.en.md
@@ -11,6 +11,7 @@ initialization and can be changed - either with an OSD restart or, for some of
 them, even without restarting by updating configuration in etcd.

 - [etcd_report_interval](#etcd_report_interval)
+- [etcd_stats_interval](#etcd_stats_interval)
 - [run_primary](#run_primary)
 - [osd_network](#osd_network)
 - [bind_address](#bind_address)
@@ -31,6 +32,9 @@ them, even without restarting by updating configuration in etcd.
 - [max_flusher_count](#max_flusher_count)
 - [inmemory_metadata](#inmemory_metadata)
 - [inmemory_journal](#inmemory_journal)
+- [data_io](#data_io)
+- [meta_io](#meta_io)
+- [journal_io](#journal_io)
 - [journal_sector_buffer_count](#journal_sector_buffer_count)
 - [journal_no_same_sector_overwrites](#journal_no_same_sector_overwrites)
 - [throttle_small_writes](#throttle_small_writes)
@@ -53,11 +57,21 @@ them, even without restarting by updating configuration in etcd.
 - Type: seconds
 - Default: 5

-Interval at which OSDs report their state to etcd. Affects OSD lease time
+Interval at which OSDs report their liveness to etcd. Affects OSD lease time
 and thus the failover speed. Lease time is equal to this parameter value
 plus max_etcd_attempts * etcd_quick_timeout because it should be guaranteed
 that every OSD always refreshes its lease in time.

+## etcd_stats_interval
+
+- Type: seconds
+- Default: 30
+
+Interval at which OSDs report their statistics to etcd. Highly affects the
+imposed load on etcd, because statistics include a key for every OSD and
+for every PG. At the same time, low statistic intervals make `vitastor-cli`
+statistics more responsive.
+
 ## run_primary

 - Type: boolean
@@ -255,6 +269,60 @@ is typically very small because it's sufficient to have 16-32 MB journal
 for SSD OSDs. However, in theory it's possible that you'll want to turn it
 off for hybrid (HDD+SSD) OSDs with large journals on quick devices.

+## data_io
+
+- Type: string
+- Default: direct
+
+I/O mode for *data*. One of "direct", "cached" or "directsync". Corresponds
+to O_DIRECT, O_SYNC and O_DIRECT|O_SYNC, respectively.
+
+Choose "cached" to use Linux page cache. This may improve read performance
+for hot data and slower disks - HDDs and maybe SATA SSDs - but will slightly
+decrease write performance for fast disks because page cache is an overhead
+itself.
+
+Choose "directsync" to use [immediate_commit](layout-cluster.en.md#immediate_commit)
+(which requires disable_data_fsync) with drives having write-back cache
+which can't be turned off, for example, Intel Optane. Also note that *some*
+desktop SSDs (for example, HP EX950) may ignore O_SYNC thus making
+disable_data_fsync unsafe even with "directsync".
+
+## meta_io
+
+- Type: string
+- Default: direct
+
+I/O mode for *metadata*. One of "direct", "cached" or "directsync".
+
+"cached" may improve read performance, but only under the following conditions:
+1. your drives are relatively slow (HDD, SATA SSD), and
+2. checksums are enabled, and
+3. [inmemory_metadata](#inmemory_metadata) is disabled.
+Under all these conditions, metadata blocks are read from disk on every
+read request to verify checksums and caching them may reduce this extra
+read load. Without (3) metadata is never read from the disk after starting,
+and without (2) metadata blocks are read from disk only during journal
+flushing.
+
+"directsync" has the same meaning as described for [data_io](#data_io).
+
+If the same device is used for data and metadata, meta_io by default is set
+to the same value as [data_io](#data_io).
+
+## journal_io
+
+- Type: string
+- Default: direct
+
+I/O mode for *journal*. One of "direct", "cached" or "directsync".
+
+Here, "cached" may only improve read performance for recent writes and
+only if [inmemory_journal](#inmemory_journal) is turned off.
+
+If the same device is used for metadata and journal, journal_io by default
+is set to the same value as [meta_io](#meta_io).
+
 ## journal_sector_buffer_count

 - Type: integer
--- a/docs/config/osd.ru.md
+++ b/docs/config/osd.ru.md
@@ -12,6 +12,7 @@
 изменения конфигурации в etcd.

 - [etcd_report_interval](#etcd_report_interval)
+- [etcd_stats_interval](#etcd_stats_interval)
 - [run_primary](#run_primary)
 - [osd_network](#osd_network)
 - [bind_address](#bind_address)
@@ -32,6 +33,9 @@
 - [max_flusher_count](#max_flusher_count)
 - [inmemory_metadata](#inmemory_metadata)
 - [inmemory_journal](#inmemory_journal)
+- [data_io](#data_io)
+- [meta_io](#meta_io)
+- [journal_io](#journal_io)
 - [journal_sector_buffer_count](#journal_sector_buffer_count)
 - [journal_no_same_sector_overwrites](#journal_no_same_sector_overwrites)
 - [throttle_small_writes](#throttle_small_writes)
@@ -54,11 +58,21 @@
 - Тип: секунды
 - Значение по умолчанию: 5

-Интервал, с которым OSD обновляет своё состояние в etcd. Значение параметра
-влияет на время резервации (lease) OSD и поэтому на скорость переключения
+Интервал, с которым OSD сообщает о том, что жив, в etcd. Значение параметра
+влияет на время резервации (lease) OSD и поэтому - на скорость переключения
 при падении OSD. Время lease равняется значению этого параметра плюс
 max_etcd_attempts * etcd_quick_timeout.

+## etcd_stats_interval
+
+- Тип: секунды
+- Значение по умолчанию: 30
+
+Интервал, с которым OSD обновляет свою статистику в etcd. Сильно влияет на
+создаваемую нагрузку на etcd, потому что статистика содержит по ключу на
+каждый OSD и на каждую PG. В то же время низкий интервал делает
+статистику, печатаемую `vitastor-cli`, отзывчивей.
+
 ## run_primary

 - Тип: булево (да/нет)
@@ -263,6 +277,63 @@ Flusher - это микро-поток (корутина), которая коп
 параметра может оказаться полезным для гибридных OSD (HDD+SSD) с большими
 журналами, расположенными на быстром по сравнению с HDD устройстве.

+## data_io
+
+- Тип: строка
+- Значение по умолчанию: direct
+
+Режим ввода-вывода для *данных*. Одно из значений "direct", "cached" или
+"directsync", означающих O_DIRECT, O_SYNC и O_DIRECT|O_SYNC, соответственно.
+
+Выберите "cached", чтобы использовать системный кэш Linux (page cache) при
+чтении и записи. Это может улучшить скорость чтения горячих данных с
+относительно медленных дисков - HDD и, возможно, SATA SSD - но немного
+снижает производительность записи для быстрых дисков, так как кэш сам по
+себе тоже добавляет накладные расходы.
+
+Выберите "directsync", если хотите задействовать
+[immediate_commit](layout-cluster.ru.md#immediate_commit) (требующий
+включения disable_data_fsync) на дисках с неотключаемым кэшем. Пример таких
+дисков - Intel Optane. При этом также стоит иметь в виду, что *некоторые*
+настольные SSD (например, HP EX950) игнорируют флаг O_SYNC, делая отключение
+fsync небезопасным даже с режимом "directsync".
+
+## meta_io
+
+- Тип: строка
+- Значение по умолчанию: direct
+
+Режим ввода-вывода для *метаданных*. Одно из значений "direct", "cached" или
+"directsync".
+
+"cached" может улучшить скорость чтения, если:
+1. у вас медленные диски (HDD, SATA SSD)
+2. контрольные суммы включены
+3. параметр [inmemory_metadata](#inmemory_metadata) отключён.
+При этих условиях блоки метаданных читаются с диска при каждом запросе чтения
+для проверки контрольных сумм и их кэширование может снизить дополнительную
+нагрузку на диск. Без (3) метаданные никогда не читаются с диска после
+запуска OSD, а без (2) блоки метаданных читаются только при сбросе журнала.
+
+Если одно и то же устройство используется для данных и метаданных, режим
+ввода-вывода метаданных по умолчанию устанавливается равным [data_io](#data_io).
+
+## journal_io
+
+- Тип: строка
+- Значение по умолчанию: direct
+
+Режим ввода-вывода для *журнала*. Одно из значений "direct", "cached" или
+"directsync".
+
+Здесь "cached" может улучшить скорость чтения только недавно записанных
+данных и только если параметр [inmemory_journal](#inmemory_journal)
+отключён.
+
+Если одно и то же устройство используется для метаданных и журнала,
+режим ввода-вывода журнала по умолчанию устанавливается равным
+[meta_io](#meta_io).
+
 ## journal_sector_buffer_count

 - Тип: целое число
--- a/docs/config/pool.en.md
+++ b/docs/config/pool.en.md
@@ -205,9 +205,8 @@ This parameter usually doesn't require to be changed.
 - Default: 131072

 Block size for this pool. The value from /vitastor/config/global is used when
-unspecified. If your cluster has OSDs with different block sizes then pool must
-be restricted by [osd_tags](#osd_tags) to only include OSDs with matching block
-size.
+unspecified. Only OSDs with matching block_size are used for each pool. If you
+want to further restrict OSDs for the pool, use [osd_tags](#osd_tags).

 Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-cluster.en.md#block_size).

@@ -216,10 +215,9 @@ Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-c
 - Type: integer
 - Default: 4096

-"Sector" size of virtual disks in this pool. The value from
-/vitastor/config/global is used when unspecified. Similar to block_size, the
-pool must be restricted by [osd_tags](#osd_tags) to only include OSDs with
-matching bitmap_granularity.
+"Sector" size of virtual disks in this pool. The value from /vitastor/config/global
+is used when unspecified. Similarly to block_size, only OSDs with matching
+bitmap_granularity are used for each pool.

 Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-cluster.en.md#bitmap_granularity).

@@ -229,10 +227,11 @@ Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-c
 - Default: none

 Immediate commit setting for this pool. The value from /vitastor/config/global
-is used when unspecified. Similar to block_size, the pool must be restricted by
-[osd_tags](#osd_tags) to only include OSDs with compatible immediate_commit.
-Compatible means that a pool with non-immediate commit will work with OSDs with
-immediate commit enabled, but not vice versa.
+is used when unspecified. Similarly to block_size, only OSDs with compatible
+immediate_commit are used for each pool. "Compatible" means that a pool with
+non-immediate commit will use OSDs with immediate commit enabled, but not vice
+versa. I.e., pools with "none" use all OSDs, pools with "small" only use OSDs
+with "all" or "small", and pools with "all" only use OSDs with "all".

 Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-cluster.en.md#immediate_commit).

--- a/docs/config/pool.ru.md
+++ b/docs/config/pool.ru.md
@@ -208,8 +208,9 @@ PG в Vitastor эферемерны, то есть вы можете менят

 Размер блока для данного пула. Если не задан, используется значение из
 /vitastor/config/global. Если в вашем кластере есть OSD с разными размерами
-блока, пул должен быть ограничен только OSD, блок которых равен блоку пула,
-с помощью [osd_tags](#osd_tags).
+блока, пул будет использовать только OSD с размером блока, равным размеру блока
+пула. Если вы хотите сильнее ограничить набор используемых для пула OSD -
+используйте [osd_tags](#osd_tags).

 О самом параметре читайте в разделе [Дисковые параметры уровня кластера](layout-cluster.ru.md#block_size).

@@ -219,9 +220,8 @@ PG в Vitastor эферемерны, то есть вы можете менят
 - По умолчанию: 4096

 Размер "сектора" виртуальных дисков в данном пуле. Если не задан, используется
-значение из /vitastor/config/global. Аналогично block_size, пул должен быть
-ограничен OSD со значением bitmap_granularity, равным значению пула, с помощью
-[osd_tags](#osd_tags).
+значение из /vitastor/config/global. Аналогично block_size, каждый пул будет
+использовать только OSD с совпадающей с пулом настройкой bitmap_granularity.

 О самом параметре читайте в разделе [Дисковые параметры уровня кластера](layout-cluster.ru.md#bitmap_granularity).

@@ -231,11 +231,13 @@ PG в Vitastor эферемерны, то есть вы можете менят
 - По умолчанию: none

 Настройка мгновенного коммита для данного пула. Если не задана, используется
-значение из /vitastor/config/global. Аналогично block_size, пул должен быть
-ограничен OSD со значением bitmap_granularity, совместимым со значением пула, с
-помощью [osd_tags](#osd_tags). Совместимость означает, что пул с отключенным
-мгновенным коммитом может работать на OSD с включённым мгновенным коммитом, но
-не наоборот.
+значение из /vitastor/config/global. Аналогично block_size, каждый пул будет
+использовать только OSD с *совместимыми* настройками immediate_commit.
+"Совместимыми" означает, что пул с отключенным мгновенным коммитом будет
+использовать OSD с включённым мгновенным коммитом, но не наоборот. То есть,
+пул со значением "none" будет использовать все OSD, пул со "small" будет
+использовать OSD с "all" или "small", а пул с "all" будет использовать только
+OSD с "all".

 О самом параметре читайте в разделе [Дисковые параметры уровня кластера](layout-cluster.ru.md#immediate_commit).

--- a/docs/config/src/client.en.md
+++ b/docs/config/src/client.en.md
@@ -0,0 +1,4 @@
+# Client Parameters
+
+These parameters apply only to Vitastor clients (QEMU, fio, NBD and so on) and
+affect their interaction with the cluster.
--- a/docs/config/src/client.ru.md
+++ b/docs/config/src/client.ru.md
@@ -0,0 +1,4 @@
+# Параметры клиентского кода
+
+Данные параметры применяются только к клиентам Vitastor (QEMU, fio, NBD и т.п.) и
+затрагивают логику их работы с кластером.
--- a/docs/config/src/client.yml
+++ b/docs/config/src/client.yml
@@ -0,0 +1,168 @@
+- name: client_max_dirty_bytes
+  type: int
+  default: 33554432
+  online: true
+  info: |
+    Without [immediate_commit](layout-cluster.en.md#immediate_commit)=all this parameter sets the limit of "dirty"
+    (not committed by fsync) data allowed by the client before forcing an
+    additional fsync and committing the data. Also note that the client always
+    holds a copy of uncommitted data in memory so this setting also affects
+    RAM usage of clients.
+  info_ru: |
+    При работе без [immediate_commit](layout-cluster.ru.md#immediate_commit)=all - это лимит объёма "грязных" (не
+    зафиксированных fsync-ом) данных, при достижении которого клиент будет
+    принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
+    что в этом случае до момента fsync клиент хранит копию незафиксированных
+    данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
+- name: client_max_dirty_ops
+  type: int
+  default: 1024
+  online: true
+  info: |
+    Same as client_max_dirty_bytes, but instead of total size, limits the number
+    of uncommitted write operations.
+  info_ru: |
+    Аналогично client_max_dirty_bytes, но ограничивает количество
+    незафиксированных операций записи вместо их общего объёма.
+- name: client_enable_writeback
+  type: bool
+  default: false
+  online: true
+  info: |
+    This parameter enables client-side write buffering. This means that write
+    requests are accumulated in memory for a short time before being sent to
+    a Vitastor cluster, which allows sending them in parallel and increases
+    performance of some applications. Writes are buffered until client forces
+    a flush with fsync() or until the amount of buffered writes exceeds the
+    limit.
+
+    Write buffering significantly increases performance of some applications,
+    for example, CrystalDiskMark under Windows (LOL :-D), but also any other
+    applications if they do writes in one of two non-optimal ways: either if
+    they do a lot of small (4 kb or so) sequential writes, or if they do a lot
+    of small random writes, but without any parallelism or asynchrony, and also
+    without calling fsync().
+
+    With write buffering enabled, you can expect around 22000 T1Q1 random write
+    iops in QEMU more or less regardless of the quality of your SSDs, and this
+    number is in fact bound by QEMU itself rather than Vitastor (check it
+    yourself by adding a "driver=null-co" disk in QEMU). Without write
+    buffering, the current record is 9900 iops, but the number is usually
+    even lower with non-ideal hardware, for example, it may be 5000 iops.
+
+    Even when this parameter is enabled, write buffering isn't enabled until
+    the client explicitly allows it, because enabling it without the client
+    being aware of the fact that its writes may be buffered may lead to data
+    loss. Because of this, older versions of clients don't support write
+    buffering at all, newer versions of the QEMU driver allow write buffering
+    only if it's enabled in disk settings with `-blockdev cache.direct=false`,
+    and newer versions of FIO only allow write buffering if you don't specify
+    `-direct=1`. NBD and NFS drivers allow write buffering by default.
+
+    You can overcome this restriction too with the `client_writeback_allowed`
+    parameter, but you shouldn't do that unless you **really** know what you
+    are doing.
+  info_ru: |
+    Данный параметр разрешает включать буферизацию записи в памяти. Буферизация
+    означает, что операции записи отправляются на кластер Vitastor не сразу, а
+    могут небольшое время накапливаться в памяти и сбрасываться сразу пакетами,
+    до тех пор, пока либо не будет превышен лимит неотправленных записей, либо
+    пока клиент не вызовет fsync.
+
+    Буферизация значительно повышает производительность некоторых приложений,
+    например, CrystalDiskMark в Windows (ха-ха :-D), но также и любых других,
+    которые пишут на диск неоптимально: либо последовательно, но мелкими блоками
+    (например, по 4 кб), либо случайно, но без параллелизма и без fsync - то
+    есть, например, отправляя 128 операций записи в разные места диска, но не
+    все сразу с помощью асинхронного I/O, а по одной.
+
+    В QEMU с буферизацией записи можно ожидать показателя примерно 22000
+    операций случайной записи в секунду в 1 поток и с глубиной очереди 1 (T1Q1)
+    без fsync, почти вне зависимости от того, насколько хороши ваши диски - эта
+    цифра упирается в сам QEMU. Без буферизации рекорд пока что - 9900 операций
+    в секунду, но на железе похуже может быть и поменьше, например, 5000 операций
+    в секунду.
+
+    При этом, даже если данный параметр включён, буферизация не включается, если
+    явно не разрешена клиентом, т.к. если клиент не знает, что запросы записи
+    буферизуются, это может приводить к потере данных. Поэтому в старых версиях
+    клиентских драйверов буферизация записи не включается вообще, в новых
+    версиях QEMU-драйвера включается, только если разрешена опцией диска
+    `-blockdev cache.direct=false`, а в fio - только если нет опции `-direct=1`.
+    В NBD и NFS драйверах буферизация записи разрешена по умолчанию.
+
+    Можно обойти и это ограничение с помощью параметра `client_writeback_allowed`,
+    но делать так не надо, если только вы не уверены в том, что делаете, на все
+    100%. :-)
+- name: client_max_buffered_bytes
+  type: int
+  default: 33554432
+  online: true
+  info: |
+    Maximum total size of buffered writes which triggers write-back when reached.
+  info_ru: |
+    Максимальный общий размер буферизованных записей, при достижении которого
+    начинается процесс сброса данных на сервер.
+- name: client_max_buffered_ops
+  type: int
+  default: 1024
+  online: true
+  info: |
+    Maximum number of buffered writes which triggers write-back when reached.
+    Multiple consecutive modified data regions are counted as 1 write here.
+  info_ru: |
+    Максимальное количество буферизованных записей, при достижении которого
+    начинается процесс сброса данных на сервер. При этом несколько
+    последовательных изменённых областей здесь считаются 1 записью.
+- name: client_max_writeback_iodepth
+  type: int
+  default: 256
+  online: true
+  info: |
+    Maximum number of parallel writes when flushing buffered data to the server.
+  info_ru: |
+    Максимальное число параллельных операций записи при сбросе буферов на сервер.
+- name: nbd_timeout
+  type: sec
+  default: 300
+  online: false
+  info: |
+    Timeout for I/O operations for [NBD](../usage/nbd.en.md). If an operation
+    executes for longer than this timeout, including when your cluster is just
+    temporarily down for longer than the timeout, the NBD device will detach by itself
+    (and possibly break the mounted file system).
+
+    You can set timeout to 0 to never detach, but in that case you won't be
+    able to remove the kernel device at all if the NBD process dies - you'll have
+    to reboot the host.
+  info_ru: |
+    Таймаут для операций чтения/записи через [NBD](../usage/nbd.ru.md). Если
+    операция выполняется дольше таймаута, включая временную недоступность
+    кластера на время, большее таймаута, NBD-устройство отключится само собой
+    (и, возможно, сломает примонтированную ФС).
+
+    Вы можете установить таймаут в 0, чтобы никогда не отключать устройство по
+    таймауту, но в этом случае вы вообще не сможете удалить устройство, если
+    процесс NBD умрёт - вам придётся перезагружать сервер.
+- name: nbd_max_devices
+  type: int
+  default: 64
+  online: false
+  info: |
+    Maximum number of NBD devices in the system. This value is passed as
+    `nbds_max` parameter for the nbd kernel module when vitastor-nbd autoloads it.
+  info_ru: |
+    Максимальное число NBD-устройств в системе. Данное значение передаётся
+    модулю ядра nbd как параметр `nbds_max`, когда его загружает vitastor-nbd.
+- name: nbd_max_part
+  type: int
+  default: 3
+  online: false
+  info: |
+    Maximum number of partitions per NBD device. This value is passed as
+    `max_part` parameter for the nbd kernel module when vitastor-nbd autoloads it.
+    Note that (nbds_max)*(1+max_part) usually can't exceed 256.
+  info_ru: |
+    Максимальное число разделов на одном NBD-устройстве. Данное значение передаётся
+    модулю ядра nbd как параметр `max_part`, когда его загружает vitastor-nbd.
+    Имейте в виду, что (nbds_max)*(1+max_part) обычно не может превышать 256.
--- a/docs/config/src/included.en.md
+++ b/docs/config/src/included.en.md
@@ -28,6 +28,8 @@

 {{../../config/network.en.md|indent=2}}

+{{../../config/client.en.md|indent=2}}
+
 {{../../config/layout-cluster.en.md|indent=2}}

 {{../../config/layout-osd.en.md|indent=2}}
--- a/docs/config/src/included.ru.md
+++ b/docs/config/src/included.ru.md
@@ -28,6 +28,8 @@

 {{../../config/network.ru.md|indent=2}}

+{{../../config/client.ru.md|indent=2}}
+
 {{../../config/layout-cluster.ru.md|indent=2}}

 {{../../config/layout-osd.ru.md|indent=2}}
--- a/docs/config/src/layout-cluster.yml
+++ b/docs/config/src/layout-cluster.yml
@@ -87,8 +87,9 @@
    it (they have internal SSD cache even though it's not stated in datasheets).

    Setting this parameter to "all" or "small" in OSD parameters requires enabling
-    disable_journal_fsync and disable_meta_fsync, setting it to "all" also requires
-    enabling disable_data_fsync.
+    [disable_journal_fsync](layout-osd.en.md#disable_journal_fsync) and
+    [disable_meta_fsync](layout-osd.en.md#disable_meta_fsync), setting it to
+    "all" also requires enabling [disable_data_fsync](layout-osd.en.md#disable_data_fsync).

    TLDR: For optimal performance, set immediate_commit to "all" if you only use
    SSDs with supercapacitor-based power loss protection (nonvolatile
@@ -140,8 +141,9 @@
    указано в спецификациях).

    Указание "all" или "small" в настройках / командной строке OSD требует
-    включения disable_journal_fsync и disable_meta_fsync, значение "all" также
-    требует включения disable_data_fsync.
+    включения [disable_journal_fsync](layout-osd.ru.md#disable_journal_fsync) и
+    [disable_meta_fsync](layout-osd.ru.md#disable_meta_fsync), значение "all"
+    также требует включения [disable_data_fsync](layout-osd.ru.md#disable_data_fsync).

    Итого, вкратце: для оптимальной производительности установите
    immediate_commit в значение "all", если вы используете в кластере только SSD
--- a/docs/config/src/layout-osd.yml
+++ b/docs/config/src/layout-osd.yml
@@ -204,3 +204,73 @@

    Клиентам не обязательно знать про disk_alignment, так что помещать значение
    этого параметра в etcd в /vitastor/config/global не нужно.
+- name: data_csum_type
+  type: string
+  default: none
+  info: |
+    Data checksum type to use. May be "crc32c" or "none". Set to "crc32c" to
+    enable data checksums.
+  info_ru: |
+    Тип используемых OSD контрольных сумм данных. Может быть "crc32c" или "none".
+    Установите в "crc32c", чтобы включить расчёт и проверку контрольных сумм данных.
+
+    Следует понимать, что контрольные суммы в зависимости от размера блока их
+    расчёта либо увеличивают потребление памяти, либо снижают производительность.
+    Подробнее смотрите в описании параметра [csum_block_size](#csum_block_size).
+- name: csum_block_size
+  type: int
+  default: 4096
+  info: |
+    Checksum calculation block size.
+
+    Must be equal to or a multiple of [bitmap_granularity](layout-cluster.en.md#bitmap_granularity)
+    (which is usually 4 KB).
+
+    Checksums increase metadata size by 4 bytes per each csum_block_size of data.
+
+    Checksums are always a tradeoff:
+    1. You either sacrifice +1 GB RAM per 1 TB of data
+    2. Or you raise csum_block_size, for example, to 32k and sacrifice
+       50% random write iops due to checksum read-modify-write
+    3. Or you turn off [inmemory_metadata](osd.en.md#inmemory_metadata) and
+       sacrifice 50% random read iops due to checksum reads
+
+    All-flash clusters usually have enough RAM to use default csum_block_size,
+    which uses 1 GB RAM per 1 TB of data. HDD clusters usually don't.
+
+    Thus, recommended setups are:
+    1. All-flash, 1 GB RAM per 1 TB data: default (csum_block_size=4k)
+    2. All-flash, less RAM: csum_block_size=4k + inmemory_metadata=false
+    3. Hybrid HDD+SSD: csum_block_size=4k + inmemory_metadata=false
+    4. HDD-only, faster random read: csum_block_size=32k
+    5. HDD-only, faster random write: csum_block_size=4k +
+       inmemory_metadata=false + meta_io=cached
+
+    See also [meta_io](osd.en.md#meta_io).
+  info_ru: |
+    Размер блока расчёта контрольных сумм.
+
+    Должен быть равен или кратен [bitmap_granularity](layout-cluster.ru.md#bitmap_granularity)
+    (который обычно равен 4 КБ).
+
+    Контрольные суммы увеличивают размер метаданных на 4 байта на каждые
+    csum_block_size данных.
+
+    Контрольные суммы - это всегда компромисс:
+    1. Вы либо жертвуете потреблением +1 ГБ памяти на 1 ТБ дискового пространства
+    2. Либо вы повышаете csum_block_size до, скажем, 32k и жертвуете 50%
+       скорости случайной записи из-за цикла чтения-изменения-записи для расчёта
+       новых контрольных сумм
+    3. Либо вы отключаете [inmemory_metadata](osd.ru.md#inmemory_metadata) и
+       жертвуете 50% скорости случайного чтения из-за чтения контрольных сумм
+       с диска
+
+    Таким образом, рекомендуются следующие варианты настроек:
+    1. All-flash, 1 ГБ памяти на 1 ТБ данных: по умолчанию (csum_block_size=4k)
+    2. All-flash, меньше памяти: csum_block_size=4k + inmemory_metadata=false
+    3. Гибридные HDD+SSD: csum_block_size=4k + inmemory_metadata=false
+    4. Только HDD, быстрее случайное чтение: csum_block_size=32k
+    5. Только HDD, быстрее случайная запись: csum_block_size=4k +
+       inmemory_metadata=false + meta_io=cached
+
+    Смотрите также [meta_io](osd.ru.md#meta_io).
--- a/docs/config/src/network.yml
+++ b/docs/config/src/network.yml
@@ -48,11 +48,14 @@
  type: string
  info: |
    RDMA device name to use for Vitastor OSD communications (for example,
-    "rocep5s0f0"). Please note that Vitastor RDMA requires Implicit On-Demand
-    Paging (Implicit ODP) and Scatter/Gather (SG) support from the RDMA device
-    to work. For example, Mellanox ConnectX-3 and older adapters don't have
-    Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
-    root to list available RDMA devices and their features.
+    "rocep5s0f0"). Now Vitastor supports all adapters, even ones without
+    ODP support, like Mellanox ConnectX-3 and non-Mellanox cards.
+
+    Versions up to Vitastor 1.2.0 required ODP which is only present in
+    Mellanox ConnectX >= 4. See also [rdma_odp](#rdma_odp).
+
+    Run `ibv_devinfo -v` as root to list available RDMA devices and their
+    features.

    Remember that you also have to configure your network switches if you use
    RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
@@ -61,12 +64,15 @@
    PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
  info_ru: |
    Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
-    Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
-    Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Например,
-    адаптеры Mellanox ConnectX-3 и более старые не поддерживают Implicit ODP и
-    потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
-    суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
-    параметры и возможности.
+    Сейчас Vitastor поддерживает все модели адаптеров, включая те, у которых
+    нет поддержки ODP, то есть вы можете использовать RDMA с ConnectX-3 и
+    картами производства не Mellanox.
+
+    Версии Vitastor до 1.2.0 включительно требовали ODP, который есть только
+    на Mellanox ConnectX 4 и более новых. См. также [rdma_odp](#rdma_odp).
+
+    Запустите `ibv_devinfo -v` от имени суперпользователя, чтобы посмотреть
+    список доступных RDMA-устройств, их параметры и возможности.

    Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
    правильно настроить для него коммутаторы, иначе вы можете столкнуться с
@@ -160,6 +166,45 @@
    у принимающей стороны в процессе работы не заканчивались буферы на приём.
    Не влияет на потребление памяти - дополнительная память на операции отправки
    не выделяется.
+- name: rdma_odp
+  type: bool
+  default: false
+  online: false
+  info: |
+    Use RDMA with On-Demand Paging. ODP is currently only available on Mellanox
+    ConnectX-4 and newer adapters. With ODP, memory doesn't need to be explicitly
+    registered for the RDMA adapter to use it. This, in turn, allows skipping memory
+    copying during sending. One would think this should improve performance, but
+    **in reality** RDMA performance with ODP is **drastically** worse. Example
+    3-node cluster with 8 NVMe in each node and 2*25 GBit/s ConnectX-6 RDMA network
+    without ODP pushes 3950000 read iops, but only 239000 iops with ODP...
+
+    This happens because Mellanox ODP implementation seems to be based on
+    message retransmissions when the adapter doesn't know about the buffer yet -
+    it likely uses standard "RNR retransmissions" (RNR = receiver not ready)
+    which is generally slow in RDMA/RoCE networks. Here's a presentation about
+    it from ISPASS-2021 conference: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
+
+    ODP support is retained in the code just in case a good ODP implementation
+    appears one day.
+  info_ru: |
+    Использовать RDMA с On-Demand Paging. ODP - функция, доступная пока что
+    исключительно на адаптерах Mellanox ConnectX-4 и более новых. ODP позволяет
+    не регистрировать память для её использования RDMA-картой. Благодаря этому
+    можно не копировать данные при отправке их в сеть и, казалось бы, это должно
+    улучшать производительность - но **по факту** получается так, что
+    производительность только ухудшается, причём сильно. Пример - на 3-узловом
+    кластере с 8 NVMe в каждом узле и сетью 2*25 Гбит/с на чтение с RDMA без ODP
+    удаётся снять 3950000 iops, а с ODP - всего 239000 iops...
+
+    Это происходит из-за того, что реализация ODP у Mellanox неоптимальная и
+    основана на повторной передаче сообщений, когда карте не известен буфер -
+    вероятно, на стандартных "RNR retransmission" (RNR = receiver not ready).
+    А данные повторные передачи в RDMA/RoCE - всегда очень медленная штука.
+    Презентация на эту тему с конференции ISPASS-2021: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
+
+    Возможность использования ODP сохранена в коде на случай, если вдруг в один
+    прекрасный день появится хорошая реализация ODP.
 - name: peer_connect_interval
  type: sec
  min: 1
@@ -259,23 +304,3 @@
    detect disconnections quickly.
  info_ru: |
    Интервал проверки живости вебсокет-подключений к etcd.
- name: client_dirty_limit
-  type: int
-  default: 33554432
-  online: true
-  info: |
-    Without immediate_commit=all this parameter sets the limit of "dirty"
-    (not committed by fsync) data allowed by the client before forcing an
-    additional fsync and committing the data. Also note that the client always
-    holds a copy of uncommitted data in memory so this setting also affects
-    RAM usage of clients.
-
-    This parameter doesn't affect OSDs themselves.
-  info_ru: |
-    При работе без immediate_commit=all - это лимит объёма "грязных" (не
-    зафиксированных fsync-ом) данных, при достижении которого клиент будет
-    принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
-    что в этом случае до момента fsync клиент хранит копию незафиксированных
-    данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
-
-    Параметр не влияет на сами OSD.
--- a/docs/config/src/osd.yml
+++ b/docs/config/src/osd.yml
@@ -2,15 +2,28 @@
  type: sec
  default: 5
  info: |
-    Interval at which OSDs report their state to etcd. Affects OSD lease time
+    Interval at which OSDs report their liveness to etcd. Affects OSD lease time
    and thus the failover speed. Lease time is equal to this parameter value
    plus max_etcd_attempts * etcd_quick_timeout because it should be guaranteed
    that every OSD always refreshes its lease in time.
  info_ru: |
-    Интервал, с которым OSD обновляет своё состояние в etcd. Значение параметра
-    влияет на время резервации (lease) OSD и поэтому на скорость переключения
+    Интервал, с которым OSD сообщает о том, что жив, в etcd. Значение параметра
+    влияет на время резервации (lease) OSD и поэтому - на скорость переключения
    при падении OSD. Время lease равняется значению этого параметра плюс
    max_etcd_attempts * etcd_quick_timeout.
+- name: etcd_stats_interval
+  type: sec
+  default: 30
+  info: |
+    Interval at which OSDs report their statistics to etcd. Highly affects the
+    imposed load on etcd, because statistics include a key for every OSD and
+    for every PG. At the same time, low statistic intervals make `vitastor-cli`
+    statistics more responsive.
+  info_ru: |
+    Интервал, с которым OSD обновляет свою статистику в etcd. Сильно влияет на
+    создаваемую нагрузку на etcd, потому что статистика содержит по ключу на
+    каждый OSD и на каждую PG. В то же время низкий интервал делает
+    статистику, печатаемую `vitastor-cli`, отзывчивей.
 - name: run_primary
  type: bool
  default: true
@@ -260,6 +273,96 @@
    достаточно 16- или 32-мегабайтного журнала. Однако в теории отключение
    параметра может оказаться полезным для гибридных OSD (HDD+SSD) с большими
    журналами, расположенными на быстром по сравнению с HDD устройстве.
+- name: data_io
+  type: string
+  default: direct
+  info: |
+    I/O mode for *data*. One of "direct", "cached" or "directsync". Corresponds
+    to O_DIRECT, O_SYNC and O_DIRECT|O_SYNC, respectively.
+
+    Choose "cached" to use Linux page cache. This may improve read performance
+    for hot data and slower disks - HDDs and maybe SATA SSDs - but will slightly
+    decrease write performance for fast disks because page cache is an overhead
+    itself.
+
+    Choose "directsync" to use [immediate_commit](layout-cluster.en.md#immediate_commit)
+    (which requires disable_data_fsync) with drives having write-back cache
+    which can't be turned off, for example, Intel Optane. Also note that *some*
+    desktop SSDs (for example, HP EX950) may ignore O_SYNC thus making
+    disable_data_fsync unsafe even with "directsync".
+  info_ru: |
+    Режим ввода-вывода для *данных*. Одно из значений "direct", "cached" или
+    "directsync", означающих O_DIRECT, O_SYNC и O_DIRECT|O_SYNC, соответственно.
+
+    Выберите "cached", чтобы использовать системный кэш Linux (page cache) при
+    чтении и записи. Это может улучшить скорость чтения горячих данных с
+    относительно медленных дисков - HDD и, возможно, SATA SSD - но немного
+    снижает производительность записи для быстрых дисков, так как кэш сам по
+    себе тоже добавляет накладные расходы.
+
+    Выберите "directsync", если хотите задействовать
+    [immediate_commit](layout-cluster.ru.md#immediate_commit) (требующий
+    включения disable_data_fsync) на дисках с неотключаемым кэшем. Пример таких
+    дисков - Intel Optane. При этом также стоит иметь в виду, что *некоторые*
+    настольные SSD (например, HP EX950) игнорируют флаг O_SYNC, делая отключение
+    fsync небезопасным даже с режимом "directsync".
+- name: meta_io
+  type: string
+  default: direct
+  info: |
+    I/O mode for *metadata*. One of "direct", "cached" or "directsync".
+
+    "cached" may improve read performance, but only under the following conditions:
+    1. your drives are relatively slow (HDD, SATA SSD), and
+    2. checksums are enabled, and
+    3. [inmemory_metadata](#inmemory_metadata) is disabled.
+    Under all these conditions, metadata blocks are read from disk on every
+    read request to verify checksums and caching them may reduce this extra
+    read load. Without (3) metadata is never read from the disk after starting,
+    and without (2) metadata blocks are read from disk only during journal
+    flushing.
+
+    "directsync" has the same meaning as for [data_io](#data_io).
+
+    If the same device is used for data and metadata, meta_io by default is set
+    to the same value as [data_io](#data_io).
+  info_ru: |
+    Режим ввода-вывода для *метаданных*. Одно из значений "direct", "cached" или
+    "directsync".
+
+    "cached" может улучшить скорость чтения, если:
+    1. у вас медленные диски (HDD, SATA SSD)
+    2. контрольные суммы включены
+    3. параметр [inmemory_metadata](#inmemory_metadata) отключён.
+    При этих условиях блоки метаданных читаются с диска при каждом запросе чтения
+    для проверки контрольных сумм и их кэширование может снизить дополнительную
+    нагрузку на диск. Без (3) метаданные никогда не читаются с диска после
+    запуска OSD, а без (2) блоки метаданных читаются только при сбросе журнала.
+
+    Если одно и то же устройство используется для данных и метаданных, режим
+    ввода-вывода метаданных по умолчанию устанавливается равным [data_io](#data_io).
+- name: journal_io
+  type: string
+  default: direct
+  info: |
+    I/O mode for *journal*. One of "direct", "cached" or "directsync".
+
+    Here, "cached" may only improve read performance for recent writes and
+    only if [inmemory_journal](#inmemory_journal) is turned off.
+
+    If the same device is used for metadata and journal, journal_io by default
+    is set to the same value as [meta_io](#meta_io).
+  info_ru: |
+    Режим ввода-вывода для *журнала*. Одно из значений "direct", "cached" или
+    "directsync".
+
+    Здесь "cached" может улучшить скорость чтения только недавно записанных
+    данных и только если параметр [inmemory_journal](#inmemory_journal)
+    отключён.
+
+    Если одно и то же устройство используется для метаданных и журнала,
+    режим ввода-вывода журнала по умолчанию устанавливается равным
+    [meta_io](#meta_io).
 - name: journal_sector_buffer_count
  type: int
  default: 32
--- a/docs/installation/kubernetes.en.md
+++ b/docs/installation/kubernetes.en.md
@@ -17,4 +17,26 @@ and apply all `NNN-*.yaml` manifests to your Kubernetes installation:
 for i in ./???-*.yaml; do kubectl apply -f $i; done
 ```

-After that you'll be able to create PersistentVolumes. See example in [csi/deploy/example-pvc.yaml](../../csi/deploy/example-pvc.yaml).
+After that you'll be able to create PersistentVolumes.
+
+**Important:** For best experience, use Linux kernel at least 5.15 with [VDUSE](../usage/qemu.en.md#vduse)
+kernel modules enabled (vdpa, vduse, virtio-vdpa). If your distribution doesn't
+have them pre-built - build them yourself ([instructions](../usage/qemu.en.md#vduse)),
+I promise it's worth it :-). When VDUSE is unavailable, CSI driver uses [NBD](../usage/nbd.en.md)
+to map Vitastor devices. NBD is slower and prone to timeout issues: if Vitastor
+cluster becomes unresponsive for more than [nbd_timeout](../config/client.en.md#nbd_timeout),
+the NBD device detaches and breaks pods using it.
+
+## Features
+
+Vitastor CSI supports:
+- Kubernetes starting with 1.20 (or 1.17 for older vitastor-csi <= 1.1.0)
+- Filesystem RWO (ReadWriteOnce) volumes. Example: [PVC](../../csi/deploy/example-pvc.yaml), [pod](../../csi/deploy/example-test-pod.yaml)
+- Raw block RWX (ReadWriteMany) volumes. Example: [PVC](../../csi/deploy/example-pvc-block.yaml), [pod](../../csi/deploy/example-test-pod-block.yaml)
+- Volume expansion
+- Volume snapshots. Example: [snapshot class](../../csi/deploy/example-snapshot-class.yaml), [snapshot](../../csi/deploy/example-snapshot.yaml), [clone](../../csi/deploy/example-snapshot-clone.yaml)
+- [VDUSE](../usage/qemu.en.md#vduse) (preferred) and [NBD](../usage/nbd.en.md) device mapping methods
+- Upgrades with VDUSE - new handler processes are restarted when CSI pods are restarted themselves
+- Multiple clusters by using multiple configuration files in ConfigMap.
+
+Remember that to use snapshots with CSI you also have to install [Snapshot Controller and CRDs](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).
--- a/docs/installation/kubernetes.ru.md
+++ b/docs/installation/kubernetes.ru.md
@@ -17,4 +17,26 @@
 for i in ./???-*.yaml; do kubectl apply -f $i; done
 ```

-После этого вы сможете создавать PersistentVolume. Пример смотрите в файле [csi/deploy/example-pvc.yaml](../../csi/deploy/example-pvc.yaml).
+После этого вы сможете создавать PersistentVolume.
+
+**Важно:** Лучше всего использовать ядро Linux версии не менее 5.15 с включёнными модулями
+[VDUSE](../usage/qemu.ru.md#vduse) (vdpa, vduse, virtio-vdpa). Если в вашем дистрибутиве
+они не собраны из коробки - соберите их сами, обещаю, что это стоит того ([инструкция](../usage/qemu.ru.md#vduse)) :-).
+Когда VDUSE недоступно, CSI-плагин использует [NBD](../usage/nbd.ru.md) для подключения
+дисков, а NBD медленнее и имеет проблему таймаута - если кластер остаётся недоступным
+дольше, чем [nbd_timeout](../config/client.ru.md#nbd_timeout), NBD-устройство отключается
+и ломает поды, использующие его.
+
+## Возможности
+
+CSI-плагин Vitastor поддерживает:
+- Версии Kubernetes, начиная с 1.20 (или с 1.17 для более старых vitastor-csi <= 1.1.0)
+- Файловые RWO (ReadWriteOnce) тома. Пример: [PVC](../../csi/deploy/example-pvc.yaml), [под](../../csi/deploy/example-test-pod.yaml)
+- Сырые блочные RWX (ReadWriteMany) тома. Пример: [PVC](../../csi/deploy/example-pvc-block.yaml), [под](../../csi/deploy/example-test-pod-block.yaml)
+- Расширение размера томов
+- Снимки томов. Пример: [класс снимков](../../csi/deploy/example-snapshot-class.yaml), [снимок](../../csi/deploy/example-snapshot.yaml), [клон снимка](../../csi/deploy/example-snapshot-clone.yaml)
+- Способы подключения устройств [VDUSE](../usage/qemu.ru.md#vduse) (предпочитаемый) и [NBD](../usage/nbd.ru.md)
+- Обновление при использовании VDUSE - новые процессы-обработчики устройств успешно перезапускаются вместе с самими подами CSI
+- Несколько кластеров через задание нескольких файлов конфигурации в ConfigMap.
+
+Не забывайте, что для использования снимков нужно сначала установить [контроллер снимков и CRD](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).
--- a/docs/installation/packages.en.md
+++ b/docs/installation/packages.en.md
@@ -14,9 +14,11 @@
  - Debian 12 (Bookworm/Sid): `deb https://vitastor.io/debian bookworm main`
  - Debian 11 (Bullseye): `deb https://vitastor.io/debian bullseye main`
  - Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
+  - Add `-oldstable` to bookworm/bullseye/buster in this line to install the last
+    stable version from 0.9.x branch instead of 1.x
 - For Debian 10 (Buster) also enable backports repository:
  `deb http://deb.debian.org/debian buster-backports main`
- Install packages: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu`
+- Install packages: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu-system-x86`

 ## CentOS

--- a/docs/installation/packages.ru.md
+++ b/docs/installation/packages.ru.md
@@ -14,9 +14,11 @@
  - Debian 12 (Bookworm/Sid): `deb https://vitastor.io/debian bookworm main`
  - Debian 11 (Bullseye): `deb https://vitastor.io/debian bullseye main`
  - Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
+  - Добавьте `-oldstable` к слову bookworm/bullseye/buster в этой строке, чтобы
+    установить последнюю стабильную версию из ветки 0.9.x вместо 1.x
 - Для Debian 10 (Buster) также включите репозиторий backports:
  `deb http://deb.debian.org/debian buster-backports main`
- Установите пакеты: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu`
+- Установите пакеты: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu-system-x86`

 ## CentOS

--- a/docs/installation/proxmox.en.md
+++ b/docs/installation/proxmox.en.md
@@ -6,10 +6,10 @@

 # Proxmox VE

-To enable Vitastor support in Proxmox Virtual Environment (6.4-8.0 are supported):
+To enable Vitastor support in Proxmox Virtual Environment (6.4-8.1 are supported):

 - Add the corresponding Vitastor Debian repository into sources.list on Proxmox hosts:
-  bookworm for 8.0, bullseye for 7.4, pve7.3 for 7.3, pve7.2 for 7.2, pve7.1 for 7.1, buster for 6.4
+  bookworm for 8.1, pve8.0 for 8.0, bullseye for 7.4, pve7.3 for 7.3, pve7.2 for 7.2, pve7.1 for 7.1, buster for 6.4
 - Install vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* or see note) packages from Vitastor repository
 - Define storage in `/etc/pve/storage.cfg` (see below)
 - Block network access from VMs to Vitastor network (to OSDs and etcd),
--- a/docs/installation/proxmox.ru.md
+++ b/docs/installation/proxmox.ru.md
@@ -6,10 +6,10 @@

 # Proxmox VE

-Чтобы подключить Vitastor к Proxmox Virtual Environment (поддерживаются версии 6.4-8.0):
+Чтобы подключить Vitastor к Proxmox Virtual Environment (поддерживаются версии 6.4-8.1):

 - Добавьте соответствующий Debian-репозиторий Vitastor в sources.list на хостах Proxmox:
-  bookworm для 8.0, bullseye для 7.4, pve7.3 для 7.3, pve7.2 для 7.2, pve7.1 для 7.1, buster для 6.4
+  bookworm для 8.1, pve8.0 для 8.0, bullseye для 7.4, pve7.3 для 7.3, pve7.2 для 7.2, pve7.1 для 7.1, buster для 6.4
 - Установите пакеты vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* или см. сноску) из репозитория Vitastor
 - Определите тип хранилища в `/etc/pve/storage.cfg` (см. ниже)
 - Обязательно заблокируйте доступ от виртуальных машин к сети Vitastor (OSD и etcd), т.к. Vitastor (пока) не поддерживает аутентификацию
--- a/docs/intro/architecture.ru.md
+++ b/docs/intro/architecture.ru.md
@@ -54,7 +54,8 @@
  виртуальные диски, их снимки и клоны.
 - **Драйвер QEMU** — подключаемый модуль QEMU, позволяющий QEMU/KVM виртуальным машинам работать
  с виртуальными дисками Vitastor напрямую из пространства пользователя с помощью клиентской
-  библиотеки, без необходимости отображения дисков в виде блочных устройств.
+  библиотеки, без необходимости отображения дисков в виде блочных устройств. Тот же драйвер
+  позволяет подключать диски в систему через [VDUSE](../usage/qemu.ru.md#vduse).
 - **vitastor-nbd** — утилита, позволяющая монтировать образы Vitastor в виде блочных устройств
  с помощью NBD (Network Block Device), на самом деле скорее работающего как "BUSE"
  (Block Device In Userspace). Модуля ядра Linux для выполнения той же задачи в Vitastor нет
--- a/docs/intro/features.en.md
+++ b/docs/intro/features.en.md
@@ -29,7 +29,9 @@
 - Snapshots and copy-on-write image clones
 - [Write throttling to smooth random write workloads in SSD+HDD configurations](../config/osd.en.md#throttle_small_writes)
 - [RDMA/RoCEv2 support via libibverbs](../config/network.en.md#rdma_device)
- [Scrubbing without checksums](../config/osd.en.md#auto_scrub) (verification of copies)
+- [Scrubbing](../config/osd.en.md#auto_scrub) (verification of copies)
+- [Checksums](../config/layout-osd.en.md#data_csum_type)
+- [Client write-back cache](../config/client.en.md#client_enable_writeback)

 ## Plugins and tools

@@ -49,14 +51,15 @@

 The following features are planned for the future:

+- File system
+- Control plane optimisation
 - Other administrative tools
 - Web GUI
 - OpenNebula plugin
- iSCSI proxy
+- iSCSI and NVMeoF gateways
 - Multi-threaded client
 - Faster failover
- Checksums
+- S3
 - Tiered storage (SSD caching)
 - NVDIMM support
 - Compression (possibly)
- Read caching using system page cache (possibly)
--- a/docs/intro/features.ru.md
+++ b/docs/intro/features.ru.md
@@ -31,7 +31,9 @@
 - Снапшоты и copy-on-write клоны
 - [Сглаживание производительности случайной записи в SSD+HDD конфигурациях](../config/osd.ru.md#throttle_small_writes)
 - [Поддержка RDMA/RoCEv2 через libibverbs](../config/network.ru.md#rdma_device)
- [Фоновая проверка целостности без контрольных сумм](../config/osd.ru.md#auto_scrub) (сверка копий)
+- [Фоновая проверка целостности](../config/osd.ru.md#auto_scrub) (сверка копий)
+- [Контрольные суммы](../config/layout-osd.ru.md#data_csum_type)
+- [Буферизация записи на стороне клиента](../config/client.ru.md#client_enable_writeback)

 ## Драйверы и инструменты

@@ -49,13 +51,15 @@

 ## Планы развития

+- Файловая система
+- Оптимизация слоя управления
 - Другие инструменты администрирования
 - Web-интерфейс
 - Плагин для OpenNebula
- iSCSI-прокси
+- iSCSI и NVMeoF прокси
 - Многопоточный клиент
 - Более быстрое переключение при отказах
- Контрольные суммы
+- S3
 - Поддержка SSD-кэширования (tiered storage)
 - Поддержка NVDIMM
 - Возможно, сжатие
--- a/docs/usage/cli.en.md
+++ b/docs/usage/cli.en.md
@@ -28,7 +28,8 @@ It supports the following commands:
 Global options:

 ```
--etcd_address ADDR  Etcd connection address
+--config_file FILE   Path to Vitastor configuration file
+--etcd_address URL   Etcd connection address
 --iodepth N          Send N operations in parallel to each OSD when possible (default 32)
 --parallel_osds M    Work with M osds in parallel when possible (default 4)
 --progress 1|0       Report progress (default 1)
--- a/docs/usage/cli.ru.md
+++ b/docs/usage/cli.ru.md
@@ -27,7 +27,8 @@ vitastor-cli - интерфейс командной строки для адм
 Глобальные опции:

 ```
--etcd_address ADDR  Адрес соединения с etcd
+--config_file FILE   Путь к файлу конфигурации Vitastor
+--etcd_address URL   Адрес соединения с etcd
 --iodepth N          Отправлять параллельно N операций на каждый OSD (по умолчанию 32)
 --parallel_osds M    Работать параллельно с M OSD (по умолчанию 4)
 --progress 1|0       Печатать прогресс выполнения (по умолчанию 1)
--- a/docs/usage/disk.en.md
+++ b/docs/usage/disk.en.md
@@ -17,6 +17,7 @@ It supports the following commands:
 - [purge](#purge)
 - [read-sb](#read-sb)
 - [write-sb](#write-sb)
+- [update-sb](#update-sb)
 - [udev](#udev)
 - [exec-osd](#exec-osd)
 - [pre-exec](#pre-exec)
@@ -86,6 +87,8 @@ Options (both modes):
 --journal_size 1G/32M      Set journal size (area or partition size)
 --block_size 1M/128k       Set blockstore object size
 --bitmap_granularity 4k    Set bitmap granularity
+--data_csum_type none      Set data checksum type (crc32c or none)
+--csum_block_size 4k       Set data checksum block size
 --data_device_block 4k     Override data device block size
 --meta_device_block 4k     Override metadata device block size
 --journal_device_block 4k  Override journal device block size
@@ -100,8 +103,9 @@ checks the device cache status on start and tries to disable cache for SATA/SAS
 If it doesn't succeed it issues a warning in the system log.

 You can also pass other OSD options here as arguments and they'll be persisted
-to the superblock: max_write_iodepth, max_write_iodepth, min_flusher_count,
-max_flusher_count, inmemory_metadata, inmemory_journal, journal_sector_buffer_count,
+in the superblock: cached_io_data, cached_io_meta, cached_io_journal,
+inmemory_metadata, inmemory_journal, max_write_iodepth,
+min_flusher_count, max_flusher_count, journal_sector_buffer_count,
 journal_no_same_sector_overwrites, throttle_small_writes, throttle_target_iops,
 throttle_target_mbs, throttle_target_parallelism, throttle_threshold_us.
 See [Runtime OSD Parameters](../config/osd.en.md) for details.
@@ -179,6 +183,14 @@ Try to read Vitastor OSD superblock from `<device>` and print it in JSON format.

 Read JSON from STDIN and write it into Vitastor OSD superblock on `<device>`.

+## update-sb
+
+`vitastor-disk update-sb <device> [--force] [--<parameter> <value>] [...]`
+
+Read Vitastor OSD superblock from `<device>`, update parameters in it and write it back.
+
+`--force` allows ignoring validation errors.
+
 ## udev

 `vitastor-disk udev <device>`
@@ -249,7 +261,9 @@ Options (see also [Cluster-Wide Disk Layout Parameters](../config/layout-cluster
 ```
 --object_size 128k       Set blockstore block size
 --bitmap_granularity 4k  Set bitmap granularity
--journal_size 32M       Set journal size
+--journal_size 16M       Set journal size
+--data_csum_type none    Set data checksum type (crc32c or none)
+--csum_block_size 4k     Set data checksum block size
 --device_block_size 4k   Set device block size
 --journal_offset 0       Set journal offset
 --device_size 0          Set device size
--- a/docs/usage/disk.ru.md
+++ b/docs/usage/disk.ru.md
@@ -17,6 +17,7 @@ vitastor-disk - инструмент командной строки для уп
 - [purge](#purge)
 - [read-sb](#read-sb)
 - [write-sb](#write-sb)
+- [update-sb](#update-sb)
 - [udev](#udev)
 - [exec-osd](#exec-osd)
 - [pre-exec](#pre-exec)
@@ -87,6 +88,8 @@ vitastor-disk - инструмент командной строки для уп
 --journal_size 1G/32M      Задать размер журнала (области или раздела журнала)
 --block_size 1M/128k       Задать размер объекта хранилища
 --bitmap_granularity 4k    Задать гранулярность битовых карт
+--data_csum_type none      Задать тип контрольных сумм (crc32c или none)
+--csum_block_size 4k       Задать размер блока расчёта контрольных сумм
 --data_device_block 4k     Задать размер блока устройства данных
 --meta_device_block 4k     Задать размер блока метаданных
 --journal_device_block 4k  Задать размер блока журнала
@@ -101,8 +104,9 @@ vitastor-disk - инструмент командной строки для уп
 это не удаётся, в системный журнал выводится предупреждение.

 Вы можете передать данной команде и некоторые другие опции OSD в качестве аргументов
-и они тоже будут сохранены в суперблок: max_write_iodepth, max_write_iodepth, min_flusher_count,
-max_flusher_count, inmemory_metadata, inmemory_journal, journal_sector_buffer_count,
+и они тоже будут сохранены в суперблок: cached_io_data, cached_io_meta,
+cached_io_journal, inmemory_metadata, inmemory_journal, max_write_iodepth,
+min_flusher_count, max_flusher_count, journal_sector_buffer_count,
 journal_no_same_sector_overwrites, throttle_small_writes, throttle_target_iops,
 throttle_target_mbs, throttle_target_parallelism, throttle_threshold_us.
 Читайте об этих параметрах подробнее в разделе [Изменяемые параметры OSD](../config/osd.ru.md).
@@ -184,6 +188,15 @@ throttle_target_mbs, throttle_target_parallelism, throttle_threshold_us.

 Прочитать JSON со стандартного ввода и записать его в суперблок OSD на диск `<device>`.

+## update-sb
+
+`vitastor-disk update-sb <device> [--force] [--<параметр> <значение>] [...]`
+
+Прочитать суперблок OSD с диска `<device>`, изменить в нём заданные параметры и записать обратно.
+
+Опция `--force` позволяет читать суперблок, даже если он считается некорректным
+из-за ошибок валидации.
+
 ## udev

 `vitastor-disk udev <device>`
@@ -254,7 +267,9 @@ OSD отключены fsync-и.
 ```
 --object_size 128k       Размер блока хранилища
 --bitmap_granularity 4k  Гранулярность битовых карт
--journal_size 32M       Размер журнала
+--journal_size 16M       Размер журнала
+--data_csum_type none    Задать тип контрольных сумм (crc32c или none)
+--csum_block_size 4k     Задать размер блока расчёта контрольных сумм
 --device_block_size 4k   Размер блока устройства
 --journal_offset 0       Смещение журнала
 --device_size 0          Размер устройства
--- a/docs/usage/nbd.en.md
+++ b/docs/usage/nbd.en.md
@@ -11,25 +11,25 @@ NBD stands for "Network Block Device", but in fact it also functions as "BUSE"
 NBD slighly lowers the performance due to additional overhead, but performance still
 remains decent (see an example [here](../performance/comparison1.en.md#vitastor-0-4-0-nbd)).

-Vitastor Kubernetes CSI driver is based on NBD.
+See also [VDUSE](qemu.en.md#vduse) as a better alternative to NBD.

-See also [VDUSE](qemu.en.md#vduse).
+Vitastor Kubernetes CSI driver uses NBD when VDUSE is unavailable.

 ## Map image

 To create a local block device for a Vitastor image run:

 ```
-vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
+vitastor-nbd map --image testimg
 ```

 It will output a block device name like /dev/nbd0 which you can then use as a normal disk.

 You can also use `--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--image <IMAGE>` if you want.

-Additional options for map command:
+vitastor-nbd supports all usual Vitastor configuration options like `--config_file <path_to_config>` plus NBD-specific:

-* `--nbd_timeout 30` \
+* `--nbd_timeout 300` \
  Timeout for I/O operations in seconds after exceeding which the kernel stops
  the device. You can set it to 0 to disable the timeout, but beware that you
  won't be able to stop the device at all if vitastor-nbd process dies.
@@ -44,6 +44,9 @@ Additional options for map command:
 * `--foreground 1` \
  Stay in foreground, do not daemonize.

+Note that `nbd_timeout`, `nbd_max_devices` and `nbd_max_part` options may also be specified
+in `/etc/vitastor/vitastor.conf` or in another configuration file specified with `--config_file`.
+
 ## Unmap image

 To unmap the device run:
--- a/docs/usage/nbd.ru.md
+++ b/docs/usage/nbd.ru.md
@@ -14,16 +14,16 @@ NBD на данный момент необходимо, чтобы монтир
 NBD немного снижает производительность из-за дополнительных копирований памяти,
 но она всё равно остаётся на неплохом уровне (см. для примера [тест](../performance/comparison1.ru.md#vitastor-0-4-0-nbd)).

-CSI-драйвер Kubernetes Vitastor основан на NBD.
+Смотрите также [VDUSE](qemu.ru.md#vduse) как лучшую альтернативу NBD.

-Смотрите также [VDUSE](qemu.ru.md#vduse).
+CSI-драйвер Kubernetes Vitastor использует NBD, когда VDUSE недоступен.

 ## Подключить устройство

 Чтобы создать локальное блочное устройство для образа, выполните команду:

 ```
-vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
+vitastor-nbd map --image testimg
 ```

 Команда напечатает название блочного устройства вида /dev/nbd0, которое потом можно
@@ -32,7 +32,8 @@ vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
 Для обращения по номеру инода, аналогично другим командам, можно использовать опции
 `--pool <POOL> --inode <INODE> --size <SIZE>` вместо `--image testimg`.

-Дополнительные опции для команды подключения NBD-устройства:
+vitastor-nbd поддерживает все обычные опции Vitastor, например, `--config_file <path_to_config>`,
+плюс специфичные для NBD:

 * `--nbd_timeout 30` \
  Максимальное время выполнения любой операции чтения/записи в секундах, при
@@ -53,6 +54,10 @@ vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
 * `--foreground 1` \
  Не уводить процесс в фоновый режим.

+Обратите внимание, что опции `nbd_timeout`, `nbd_max_devices` и `nbd_max_part` можно
+также задавать в `/etc/vitastor/vitastor.conf` или в другом файле конфигурации,
+заданном опцией `--config_file`.
+
 ## Отключить устройство

 Для отключения устройства выполните:
--- a/docs/usage/nfs.en.md
+++ b/docs/usage/nfs.en.md
@@ -23,7 +23,7 @@ balancer or any failover method you want to in that case.
 vitastor-nfs usage:

 ```
-vitastor-nfs [--etcd_address ADDR] [OTHER OPTIONS]
+vitastor-nfs [STANDARD OPTIONS] [OTHER OPTIONS]

 --subdir <DIR>    export images prefixed <DIR>/ (default empty - export all images)
 --portmap 0       do not listen on port 111 (portmap/rpcbind, requires root)
--- a/docs/usage/nfs.ru.md
+++ b/docs/usage/nfs.ru.md
@@ -22,7 +22,7 @@
 Использование vitastor-nfs:

 ```
-vitastor-nfs [--etcd_address ADDR] [ДРУГИЕ ОПЦИИ]
+vitastor-nfs [СТАНДАРТНЫЕ ОПЦИИ] [ДРУГИЕ ОПЦИИ]

 --subdir <DIR>    экспортировать "поддиректорию" - образы с префиксом имени <DIR>/ (по умолчанию пусто - экспортировать все образы)
 --portmap 0       отключить сервис portmap/rpcbind на порту 111 (по умолчанию включён и требует root привилегий)
--- a/docs/usage/qemu.en.md
+++ b/docs/usage/qemu.en.md
@@ -34,6 +34,20 @@ qemu-system-x86_64 -enable-kvm -m 1024 \
    -vnc 0.0.0.0:0
 ```

+With a separate I/O thread:
+
+```
+qemu-system-x86_64 -enable-kvm -m 1024 \
+    -object iothread,id=vitastor1 \
+    -blockdev '{"node-name":"drive-virtio-disk0","driver":"vitastor","image":"debian9",
+        "cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
+    -device 'virtio-blk-pci,iothread=vitastor1,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,
+        id=virtio-disk0,bootindex=1,write-cache=off' \
+    -vnc 0.0.0.0:0
+```
+
+You can also specify inode ID, pool and size manually instead of `:image=<IMAGE>` option: `:pool=<POOL>:inode=<INODE>:size=<SIZE>`.
+
 ## qemu-img

 For qemu-img, you should use `vitastor:etcd_host=<HOST>:image=<IMAGE>` as filename.
@@ -84,25 +98,75 @@ This can be used for backups. Just note that exporting an image that is currentl
 is of course unsafe and doesn't produce a consistent result, so only export snapshots if you do this
 on a live VM.

+## vhost-user-blk
+
+QEMU, starting with 6.0, includes support for attaching disks via a separate
+userspace worker process, called `vhost-user-blk`. It usually has slightly (20-30 us)
+lower latency.
+
+Example commands to use it with Vitastor:
+
+```
+qemu-storage-daemon \
+    --daemonize \
+    --blockdev '{"node-name":"drive-virtio-disk1","driver":"vitastor","image":"testosd1","cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
+    --export type=vhost-user-blk,id=vitastor1,node-name=drive-virtio-disk1,addr.type=unix,addr.path=/run/vitastor1-user-blk.sock,writable=on,num-queues=1
+
+qemu-system-x86_64 -enable-kvm -m 2048 -M accel=kvm,memory-backend=mem \
+    -object memory-backend-memfd,id=mem,size=2G,share=on \
+    -chardev socket,id=vitastor1,reconnect=1,path=/run/vitastor1-user-blk.sock \
+    -device vhost-user-blk-pci,chardev=vitastor1,num-queues=1,config-wce=off \
+    -vnc 0.0.0.0:0
+```
+
+The memfd memory backend is crucial: vhost-user-blk does not work without it.
+
 ## VDUSE

 Linux kernel, starting with version 5.15, supports a new interface for attaching virtual disks
 to the host - VDUSE (vDPA Device in Userspace). QEMU, starting with 7.2, has support for
 exporting QEMU block devices over this protocol using qemu-storage-daemon.

-VDUSE has the same problem as other FUSE-like interfaces in Linux: if a userspace process hangs,
-for example, if it loses connectivity with Vitastor cluster - active processes doing I/O may
-hang in the D state (uninterruptible sleep) and you won't be able to kill them even with kill -9.
-In this case reboot will be the only way to remove VDUSE devices from system.
+VDUSE is currently the best interface to attach Vitastor disks as kernel devices because:
+- It avoids data copies and thus achieves much better performance than [NBD](nbd.en.md)
+- It doesn't have the NBD timeout problem - the device doesn't die if an operation executes for too long
+- It doesn't have the hung device problem - if the userspace process dies it can be restarted (!)
+  and the block device will continue operating
+- It doesn't seem to have a device number limit

-On the other hand, VDUSE is faster than [NBD](nbd.en.md), so you may prefer to use it if
-performance is important for you. Approximate performance numbers:
-direct fio benchmark - 115000 iops, NBD - 60000 iops, VDUSE - 90000 iops.
+Example performance comparison:
+
+|                      | direct fio  | NBD         | VDUSE       |
+|----------------------|-------------|-------------|-------------|
+| linear write         | 3.85 GB/s   | 1.12 GB/s   | 3.85 GB/s   |
+| 4k random write Q128 | 240000 iops | 120000 iops | 178000 iops |
+| 4k random write Q1   | 9500 iops   | 7620 iops   | 7640 iops   |
+| linear read          | 4.3 GB/s    | 1.8 GB/s    | 2.85 GB/s   |
+| 4k random read Q128  | 287000 iops | 140000 iops | 189000 iops |
+| 4k random read Q1    | 9600 iops   | 7640 iops   | 7780 iops   |

 To try VDUSE you need at least Linux 5.15, built with VDUSE support
-(CONFIG_VIRTIO_VDPA=m and CONFIG_VDPA_USER=m). Debian Linux kernels have these options
-disabled by now, so if you want to try it on Debian, use a kernel from Ubuntu
-[kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/) or Proxmox.
+(CONFIG_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).
+
+Debian Linux kernels currently have these options disabled, so if you want to try it on Debian,
+use a kernel from Ubuntu [kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/), Proxmox,
+or build the modules for the Debian kernel manually:
+
+```
+mkdir build
+cd build
+apt-get install linux-headers-`uname -r`
+apt-get build-dep linux-image-`uname -r`-unsigned
+apt-get source linux-image-`uname -r`-unsigned
+cd linux*/drivers/vdpa
+make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
+cat Module.symvers >> /lib/modules/`uname -r`/build/Module.symvers
+cd ../virtio
+make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
+depmod -a
+```
+
+You also need the `vdpa` tool from the `iproute2` package.

 Commands to attach Vitastor image as a VDUSE device:

@@ -115,7 +179,7 @@ qemu-storage-daemon --daemonize --blockdev '{"node-name":"test1","driver":"vitas
 vdpa dev add name test1 mgmtdev vduse
 ```

-After running these commands /dev/vda device will appear in the system and you'll be able to
+After running these commands, `/dev/vda` device will appear in the system and you'll be able to
 use it as a normal disk.

 To remove the device:
--- a/docs/usage/qemu.ru.md
+++ b/docs/usage/qemu.ru.md
@@ -36,6 +36,18 @@ qemu-system-x86_64 -enable-kvm -m 1024 \
    -vnc 0.0.0.0:0
 ```

+С отдельным потоком ввода-вывода:
+
+```
+qemu-system-x86_64 -enable-kvm -m 1024 \
+    -object iothread,id=vitastor1 \
+    -blockdev '{"node-name":"drive-virtio-disk0","driver":"vitastor","image":"debian9",
+        "cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
+    -device 'virtio-blk-pci,iothread=vitastor1,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,
+        id=virtio-disk0,bootindex=1,write-cache=off' \
+    -vnc 0.0.0.0:0
+```
+
 Вместо `:image=<IMAGE>` также можно указывать номер инода, пул и размер: `:pool=<POOL>:inode=<INODE>:size=<SIZE>`.

 ## qemu-img
@@ -88,25 +100,76 @@ qemu-img rebase -u -b '' testimg.qcow2
 в то же время идёт запись, небезопасно - результат чтения не будет целостным. Так что если вы работаете
 с активными виртуальными машинами, экспортируйте только их снимки, но не сам образ.

+## vhost-user-blk
+
+QEMU, начиная с 6.0, позволяет подключать диски через отдельный рабочий процесс.
+Этот метод подключения называется `vhost-user-blk` и обычно имеет чуть меньшую
+задержку (ниже на 20-30 микросекунд, чем при обычном методе).
+
+Пример команд для использования vhost-user-blk с Vitastor:
+
+```
+qemu-storage-daemon \
+    --daemonize \
+    --blockdev '{"node-name":"drive-virtio-disk1","driver":"vitastor","image":"testosd1","cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
+    --export type=vhost-user-blk,id=vitastor1,node-name=drive-virtio-disk1,addr.type=unix,addr.path=/run/vitastor1-user-blk.sock,writable=on,num-queues=1
+
+qemu-system-x86_64 -enable-kvm -m 2048 -M accel=kvm,memory-backend=mem \
+    -object memory-backend-memfd,id=mem,size=2G,share=on \
+    -chardev socket,id=vitastor1,reconnect=1,path=/run/vitastor1-user-blk.sock \
+    -device vhost-user-blk-pci,chardev=vitastor1,num-queues=1,config-wce=off \
+    -vnc 0.0.0.0:0
+```
+
+Здесь критична опция memory-backend-memfd, vhost-user-blk без неё не работает.
+
 ## VDUSE

 В Linux, начиная с версии ядра 5.15, доступен новый интерфейс для подключения виртуальных дисков
 к системе - VDUSE (vDPA Device in Userspace), а в QEMU, начиная с версии 7.2, есть поддержка
 экспорта блочных устройств QEMU по этому протоколу через qemu-storage-daemon.

-VDUSE страдает общей проблемой FUSE-подобных интерфейсов в Linux: если пользовательский процесс
-подвиснет, например, если будет потеряна связь с кластером Vitastor - читающие/пишущие в кластер
-процессы могут "залипнуть" в состоянии D (непрерываемый сон) и их будет невозможно убить даже
-через kill -9. В этом случае удалить из системы устройство можно только перезагрузившись.
+VDUSE - на данный момент лучший интерфейс для подключения дисков Vitastor в виде блочных
+устройств на уровне ядра, ибо:
+- VDUSE не копирует данные и поэтому достигает значительно лучшей производительности, чем [NBD](nbd.ru.md)
+- Также оно не имеет проблемы NBD-таймаута - устройство не умирает, если операция выполняется слишком долго
+- Также оно не имеет проблемы подвисающих устройств - если процесс-обработчик умирает, его можно
+  перезапустить (!) и блочное устройство продолжит работать
+- По-видимому, у него нет предела числа подключаемых в систему устройств

-С другой стороны, VDUSE быстрее по сравнению с [NBD](nbd.ru.md), поэтому его может
-быть предпочтительно использовать там, где производительность важнее. Порядок показателей:
-прямое тестирование через fio - 115000 iops, NBD - 60000 iops, VDUSE - 90000 iops.
+Пример сравнения производительности:

-Чтобы использовать VDUSE, вам нужно ядро Linux версии хотя бы 5.15, собранное с поддержкой
-VDUSE (CONFIG_VIRTIO_VDPA=m и CONFIG_VDPA_USER=m). В ядрах в Debian Linux поддержка пока
-отключена - если хотите попробовать эту функцию на Debian, поставьте ядро из Ubuntu
-[kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/) или из Proxmox.
+|                          | Прямой fio  | NBD         | VDUSE       |
+|--------------------------|-------------|-------------|-------------|
+| линейная запись          | 3.85 GB/s   | 1.12 GB/s   | 3.85 GB/s   |
+| 4k случайная запись Q128 | 240000 iops | 120000 iops | 178000 iops |
+| 4k случайная запись Q1   | 9500 iops   | 7620 iops   | 7640 iops   |
+| линейное чтение          | 4.3 GB/s    | 1.8 GB/s    | 2.85 GB/s   |
+| 4k случайное чтение Q128 | 287000 iops | 140000 iops | 189000 iops |
+| 4k случайное чтение Q1   | 9600 iops   | 7640 iops   | 7780 iops   |
+
+Чтобы попробовать VDUSE, вам нужно ядро Linux как минимум версии 5.15, собранное с поддержкой
+VDUSE (CONFIG_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).
+
+В ядрах в Debian Linux поддержка пока отключена по умолчанию, так что чтобы попробовать VDUSE
+на Debian, поставьте ядро из Ubuntu [kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/),
+из Proxmox или соберите модули для ядра Debian вручную:
+
+```
+mkdir build
+cd build
+apt-get install linux-headers-`uname -r`
+apt-get build-dep linux-image-`uname -r`-unsigned
+apt-get source linux-image-`uname -r`-unsigned
+cd linux*/drivers/vdpa
+make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
+cat Module.symvers >> /lib/modules/`uname -r`/build/Module.symvers
+cd ../virtio
+make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
+depmod -a
+```
+
+Также вам понадобится консольная утилита `vdpa` из пакета `iproute2`.

 Команды для подключения виртуального диска через VDUSE:

@@ -119,7 +182,7 @@ qemu-storage-daemon --daemonize --blockdev '{"node-name":"test1","driver":"vitas
 vdpa dev add name test1 mgmtdev vduse
 ```

-После этого в системе появится устройство /dev/vda, которое можно будет использовать как
+После этого в системе появится устройство `/dev/vda`, которое можно будет использовать как
 обычный диск.

 Для удаления устройства из системы:
--- a/mon/90-vitastor.rules
+++ b/mon/90-vitastor.rules
@@ -3,5 +3,5 @@ SUBSYSTEM=="block", ENV{ID_PART_ENTRY_TYPE}=="e7009fac-a5a1-4d72-af72-53de130599
    IMPORT{program}="/usr/bin/vitastor-disk udev $devnode", \
    SYMLINK+="vitastor/$env{VITASTOR_ALIAS}"

-ENV{VITASTOR_OSD_NUM}!="", ACTION=="add", RUN{program}+="/usr/bin/systemctl enable --now vitastor-osd@$env{VITASTOR_OSD_NUM}"
-ENV{VITASTOR_OSD_NUM}!="", ACTION=="remove", RUN{program}+="/usr/bin/systemctl disable --now vitastor-osd@$env{VITASTOR_OSD_NUM}"
+ENV{VITASTOR_OSD_NUM}!="", ACTION=="add", RUN{program}+="/usr/bin/systemctl enable --now --no-block vitastor-osd@$env{VITASTOR_OSD_NUM}"
+ENV{VITASTOR_OSD_NUM}!="", ACTION=="remove", RUN{program}+="/usr/bin/systemctl disable --now --no-block vitastor-osd@$env{VITASTOR_OSD_NUM}"
--- a/mon/mon.js
+++ b/mon/mon.js
@@ -78,9 +78,15 @@ const etcd_tree = {
            disk_alignment: 4096,
            bitmap_granularity: 4096,
            immediate_commit: false, // 'all' or 'small'
+            // client - configurable online
+            client_max_dirty_bytes: 33554432,
+            client_max_dirty_ops: 1024,
+            client_enable_writeback: false,
+            client_max_buffered_bytes: 33554432,
+            client_max_buffered_ops: 1024,
+            client_max_writeback_iodepth: 256,
            // client and osd - configurable online
            log_level: 0,
-            client_dirty_limit: 33554432,
            peer_connect_interval: 5, // seconds. min: 1
            peer_connect_timeout: 5, // seconds. min: 1
            osd_idle_timeout: 5, // seconds. min: 1
@@ -93,6 +99,7 @@ const etcd_tree = {
            etcd_ws_keepalive_interval: 30, // seconds
            // osd
            etcd_report_interval: 5, // seconds
+            etcd_stats_interval: 30, // seconds
            run_primary: true,
            osd_network: null, // "192.168.7.0/24" or an array of masks
            bind_address: "0.0.0.0",
@@ -103,7 +110,15 @@ const etcd_tree = {
            autosync_interval: 5,
            autosync_writes: 128,
            client_queue_depth: 128, // unused
-            recovery_queue_depth: 4,
+            recovery_queue_depth: 1,
+            recovery_sleep_us: 0,
+            recovery_tune_min_util: 0.1,
+            recovery_tune_min_client_util: 0,
+            recovery_tune_max_util: 1.0,
+            recovery_tune_max_client_util: 0.5,
+            recovery_tune_interval: 1,
+            recovery_tune_ewma_rate: 0.5,
+            recovery_tune_sleep_min_us: 10, // 10 microseconds
            recovery_pg_switch: 128,
            recovery_sync_batch: 16,
            no_recovery: false,
@@ -390,12 +405,13 @@ class Mon
        this.etcd_prefix = this.etcd_prefix.replace(/\/\/+/g, '/').replace(/^\/?(.*[^\/])\/?$/, '/$1');
        this.etcd_start_timeout = (config.etcd_start_timeout || 5) * 1000;
        this.state = JSON.parse(JSON.stringify(this.constructor.etcd_tree));
+        this.prev_stats = { osd_stats: {}, osd_diff: {} };
        this.signals_set = false;
-        this.stat_time = Date.now();
        this.ws = null;
        this.ws_alive = false;
        this.ws_keepalive_timer = null;
        this.on_stop_cb = () => this.on_stop(0).catch(console.error);
+        this.recheck_pgs_active = false;
    }

    parse_etcd_addresses(addrs)
@@ -539,10 +555,18 @@ class Mon
        {
            retries = 1;
        }
+        const tried = {};
        while (retries < 0 || retry < retries)
        {
            const cur_addr = this.pick_next_etcd();
            const base = 'ws'+cur_addr.substr(4);
+            let now = Date.now();
+            if (tried[base] && now-tried[base] < this.etcd_start_timeout)
+            {
+                await new Promise(ok => setTimeout(ok, this.etcd_start_timeout-(now-tried[base])));
+                now = Date.now();
+            }
+            tried[base] = now;
            const ok = await new Promise((ok, no) =>
            {
                const timer_id = setTimeout(() =>
@@ -677,8 +701,27 @@ class Mon
        });
    }

+    // Schedule save_last_clean() to run after a small timeout (1s) (to not spam etcd)
+    schedule_save_last_clean()
+    {
+        if (!this.save_last_clean_timer)
+        {
+            this.save_last_clean_timer = setTimeout(() =>
+            {
+                this.save_last_clean_timer = null;
+                this.save_last_clean().catch(this.die);
+            }, this.config.mon_change_timeout || 1000);
+        }
+    }
+
    async save_last_clean()
    {
+        if (this.save_last_clean_running)
+        {
+            this.schedule_save_last_clean();
+            return;
+        }
+        this.save_last_clean_running = true;
        // last_clean_pgs is used to avoid extra data move when observing a series of changes in the cluster
        const new_clean_pgs = { items: {} };
    next_pool:
@@ -715,6 +758,7 @@ class Mon
                value: b64(JSON.stringify(this.state.history.last_clean_pgs))
            } } ],
        }, this.etcd_start_timeout, 0);
+        this.save_last_clean_running = false;
    }

    get_mon_state()
@@ -1148,6 +1192,33 @@ class Mon
        }
    }

+    filter_osds_by_block_layout(flat_tree, block_size, bitmap_granularity, immediate_commit)
+    { // Prunes flat_tree in place: drops OSDs whose reported stats conflict with the requested layout; returns nothing
+        for (const host in flat_tree)
+        {
+            let found = 0; // number of OSDs kept under this top-level entry
+            for (const osd in flat_tree[host])
+            {
+                const osd_stat = this.state.osd.stats[osd]; // OSDs with no stats entry are kept (condition below needs osd_stat)
+                if (osd_stat && (osd_stat.bs_block_size && osd_stat.bs_block_size != block_size || // falsy reported values are not treated as a mismatch
+                    osd_stat.bitmap_granularity && osd_stat.bitmap_granularity != bitmap_granularity ||
+                    osd_stat.immediate_commit == 'small' && immediate_commit == 'all' || // OSD reports 'small' while 'all' is requested
+                    osd_stat.immediate_commit == 'none' && immediate_commit != 'none')) // OSD reports 'none' while anything else is requested
+                {
+                    delete flat_tree[host][osd]; // mutates the caller's tree
+                }
+                else
+                {
+                    found++;
+                }
+            }
+            if (!found)
+            {
+                delete flat_tree[host]; // no OSDs left under this entry - remove the entry itself
+            }
+        }
+    }
+
    get_affinity_osds(pool_cfg, up_osds, osd_tree)
    {
        let aff_osds = up_osds;
@@ -1161,6 +1232,12 @@ class Mon

    async recheck_pgs()
    {
+        if (this.recheck_pgs_active)
+        {
+            this.schedule_recheck();
+            return;
+        }
+        this.recheck_pgs_active = true;
        // Take configuration and state, check it against the stored configuration hash
        // Recalculate PGs and save them to etcd if the configuration is changed
        // FIXME: Do not change anything if the distribution is good and random enough and no PGs are degraded
@@ -1182,6 +1259,7 @@ class Mon
                    // Pool deleted. Delete all PGs, but first stop them.
                    if (!await this.stop_all_pgs(pool_id))
                    {
+                        this.recheck_pgs_active = false;
                        this.schedule_recheck();
                        return;
                    }
@@ -1208,6 +1286,12 @@ class Mon
                pool_tree = pool_tree ? pool_tree.children : [];
                pool_tree = LPOptimizer.flatten_tree(pool_tree, levels, pool_cfg.failure_domain, 'osd');
                this.filter_osds_by_tags(osd_tree, pool_tree, pool_cfg.osd_tags);
+                this.filter_osds_by_block_layout(
+                    pool_tree,
+                    pool_cfg.block_size || this.config.block_size || 131072,
+                    pool_cfg.bitmap_granularity || this.config.bitmap_granularity || 4096,
+                    pool_cfg.immediate_commit || this.config.immediate_commit || 'none'
+                );
                // These are for the purpose of building history.osd_sets
                const real_prev_pgs = [];
                let pg_history = [];
@@ -1244,9 +1328,16 @@ class Mon
                        // PG count changed. Need to bring all PGs down.
                        if (!await this.stop_all_pgs(pool_id))
                        {
+                            this.recheck_pgs_active = false;
                            this.schedule_recheck();
                            return;
                        }
+                    }
+                    if (prev_pgs.length != pool_cfg.pg_count)
+                    {
+                        // Scale PG count
+                        // Do it even if old_pg_count is already equal to pool_cfg.pg_count,
+                        // because last_clean_pgs may still contain the old number of PGs
                        const new_pg_history = [];
                        PGUtil.scale_pg_count(prev_pgs, real_prev_pgs, pg_history, new_pg_history, pool_cfg.pg_count);
                        pg_history = new_pg_history;
@@ -1348,6 +1439,7 @@ class Mon
                await this.save_pg_config(new_config_pgs);
            }
        }
+        this.recheck_pgs_active = false;
    }

    async save_pg_config(new_config_pgs, etcd_request = { compare: [], success: [] })
@@ -1397,7 +1489,6 @@ class Mon
    }

    // Schedule a recheck to run after a small timeout (1s)
-    // If already scheduled, cancel previous timer and schedule it again
    // This is required for multiple change events to trigger at most 1 recheck in 1s
    schedule_recheck()
    {
@@ -1411,15 +1502,15 @@ class Mon
        }
    }

-    derive_osd_stats(st, prev)
+    derive_osd_stats(st, prev, prev_diff)
    {
        const zero_stats = { op: { bps: 0n, iops: 0n, lat: 0n }, subop: { iops: 0n, lat: 0n }, recovery: { bps: 0n, iops: 0n } };
-        const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
-        if (!st || !st.time || prev && (prev.time || this.stat_time/1000) >= st.time)
+        const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {}, inode_stats: {} };
+        if (!st || !st.time || !prev || !prev.time || prev.time >= st.time)
        {
-            return diff;
+            return prev_diff || diff;
        }
-        const timediff = BigInt(st.time*1000 - (prev && prev.time*1000 || this.stat_time));
+        const timediff = BigInt(st.time*1000 - prev.time*1000);
        for (const op in st.op_stats||{})
        {
            const pr = prev && prev.op_stats && prev.op_stats[op];
@@ -1451,25 +1542,47 @@ class Mon
            if (n > 0)
                diff.recovery_stats[op] = { ...c, bps: b*1000n/timediff, iops: n*1000n/timediff };
        }
+        for (const pool_id in st.inode_stats||{})
+        {
+            const pool_diff = diff.inode_stats[pool_id] = {};
+            for (const inode_num in st.inode_stats[pool_id])
+            {
+                const inode_diff = diff.inode_stats[pool_id][inode_num] = {};
+                for (const op of [ 'read', 'write', 'delete' ])
+                {
+                    const c = st.inode_stats[pool_id][inode_num][op];
+                    const pr = prev && prev.inode_stats && prev.inode_stats[pool_id] &&
+                        prev.inode_stats[pool_id][inode_num] && prev.inode_stats[pool_id][inode_num][op];
+                    const n = BigInt(c.count||0) - BigInt(pr && pr.count||0);
+                    inode_diff[op] = {
+                        bps: (BigInt(c.bytes||0) - BigInt(pr && pr.bytes||0))*1000n/timediff,
+                        iops: n*1000n/timediff,
+                        lat: (BigInt(c.usec||0) - BigInt(pr && pr.usec||0))/(n || 1n),
+                    };
+                }
+            }
+        }
        return diff;
    }

-    sum_op_stats(timestamp, prev_stats)
+    sum_op_stats()
    {
-        const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
-        if (!prev_stats || prev_stats.timestamp >= timestamp)
+        for (const osd in this.state.osd.stats)
        {
-            return sum_diff;
+            const cur = { ...this.state.osd.stats[osd], inode_stats: this.state.osd.inodestats[osd]||{} };
+            this.prev_stats.osd_diff[osd] = this.derive_osd_stats(
+                cur, this.prev_stats.osd_stats[osd], this.prev_stats.osd_diff[osd]
+            );
+            this.prev_stats.osd_stats[osd] = cur;
        }
-        const tm = BigInt(timestamp - (prev_stats.timestamp || 0));
+        const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
        // Sum derived values instead of deriving summed
        for (const osd in this.state.osd.stats)
        {
-            const derived = this.derive_osd_stats(this.state.osd.stats[osd],
-                this.prev_stats && this.prev_stats.osd_stats && this.prev_stats.osd_stats[osd]);
-            for (const type in derived)
+            const derived = this.prev_stats.osd_diff[osd];
+            for (const type in sum_diff)
            {
-                for (const op in derived[type])
+                for (const op in derived[type]||{})
                {
                    for (const k in derived[type][op])
                    {
@@ -1526,14 +1639,14 @@ class Mon
        return { object_counts, object_bytes };
    }

-    sum_inode_stats(prev_stats, timestamp, prev_timestamp)
+    sum_inode_stats()
    {
        const inode_stats = {};
        const inode_stub = () => ({
            raw_used: 0n,
-            read: { count: 0n, usec: 0n, bytes: 0n },
-            write: { count: 0n, usec: 0n, bytes: 0n },
-            delete: { count: 0n, usec: 0n, bytes: 0n },
+            read: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
+            write: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
+            delete: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
        });
        const seen_pools = {};
        for (const pool_id in this.state.config.pools)
@@ -1585,11 +1698,25 @@ class Mon
                }
            }
        }
-        if (prev_stats && prev_timestamp >= timestamp)
+        for (const osd in this.prev_stats.osd_diff)
        {
-            prev_stats = null;
+            for (const pool_id in this.prev_stats.osd_diff[osd].inode_stats)
+            {
+                for (const inode_num in this.prev_stats.osd_diff[osd].inode_stats[pool_id])
+                {
+                    inode_stats[pool_id][inode_num] = inode_stats[pool_id][inode_num] || inode_stub();
+                    for (const op of [ 'read', 'write', 'delete' ])
+                    {
+                        const op_diff = this.prev_stats.osd_diff[osd].inode_stats[pool_id][inode_num][op] || {};
+                        const op_st = inode_stats[pool_id][inode_num][op];
+                        op_st.bps += op_diff.bps;
+                        op_st.iops += op_diff.iops;
+                        op_st.lat += op_diff.lat;
+                        op_st.n_osd = (op_st.n_osd || 0) + 1;
+                    }
+                }
+            }
        }
-        const tm = prev_stats ? BigInt(timestamp - prev_timestamp) : 0;
        for (const pool_id in inode_stats)
        {
            for (const inode_num in inode_stats[pool_id])
@@ -1598,11 +1725,12 @@ class Mon
                for (const op of [ 'read', 'write', 'delete' ])
                {
                    const op_st = inode_stats[pool_id][inode_num][op];
-                    const prev_st = prev_stats && prev_stats[pool_id] && prev_stats[pool_id][inode_num] && prev_stats[pool_id][inode_num][op];
-                    op_st.bps = prev_st ? (op_st.bytes - prev_st.bytes) * 1000n / tm : 0;
-                    op_st.iops = prev_st ? (op_st.count - prev_st.count) * 1000n / tm : 0;
-                    op_st.lat = prev_st ? (op_st.usec - prev_st.usec) / ((op_st.count - prev_st.count) || 1n) : 0;
-                    if (op_st.bps > 0 || op_st.iops > 0 || op_st.lat > 0)
+                    if (op_st.n_osd)
+                    {
+                        op_st.lat /= BigInt(op_st.n_osd);
+                        delete op_st.n_osd;
+                    }
+                    if (op_st.bps > 0 || op_st.iops > 0)
                        nonzero = true;
                }
                if (!nonzero && (!this.state.config.inode[pool_id] || !this.state.config.inode[pool_id][inode_num]))
@@ -1635,15 +1763,9 @@ class Mon
    async update_total_stats()
    {
        const txn = [];
-        const timestamp = Date.now();
        const { object_counts, object_bytes } = this.sum_object_counts();
-        let stats = this.sum_op_stats(timestamp, this.prev_stats);
-        let { inode_stats, seen_pools } = this.sum_inode_stats(
-            this.prev_stats ? this.prev_stats.inode_stats : null,
-            timestamp, this.prev_stats ? this.prev_stats.timestamp : null
-        );
-        this.prev_stats = { timestamp, inode_stats, osd_stats: { ...this.state.osd.stats } };
-        this.stat_time = Date.now();
+        let stats = this.sum_op_stats();
+        let { inode_stats, seen_pools } = this.sum_inode_stats();
        stats.object_counts = object_counts;
        stats.object_bytes = object_bytes;
        stats = this.serialize_bigints(stats);
@@ -1788,10 +1910,18 @@ class Mon
        {
            retries = 1;
        }
+        const tried = {};
        while (retries < 0 || retry < retries)
        {
            retry++;
            const base = this.pick_next_etcd();
+            let now = Date.now();
+            if (tried[base] && now-tried[base] < timeout)
+            {
+                await new Promise(ok => setTimeout(ok, timeout-(now-tried[base])));
+                now = Date.now();
+            }
+            tried[base] = now;
            const res = await POST(base+path, body, timeout);
            if (res.error)
            {
--- a/mon/package.json
+++ b/mon/package.json
@@ -1,6 +1,6 @@
 {
  "name": "vitastor-mon",
-  "version": "1.0.0",
+  "version": "1.3.1",
  "description": "Vitastor SDS monitor service",
  "main": "mon-main.js",
  "scripts": {
--- a/patches/cinder-vitastor.py
+++ b/patches/cinder-vitastor.py
@@ -50,7 +50,7 @@ from cinder.volume import configuration
 from cinder.volume import driver
 from cinder.volume import volume_utils

-VERSION = '0.9.6'
+VERSION = '1.3.1'

 LOG = logging.getLogger(__name__)

--- a/patches/pve-qemu-8.1-vitastor.patch
+++ b/patches/pve-qemu-8.1-vitastor.patch
@@ -0,0 +1,190 @@
+Index: pve-qemu-kvm-8.1.2/block/meson.build
+===================================================================
+--- pve-qemu-kvm-8.1.2.orig/block/meson.build
+++ pve-qemu-kvm-8.1.2/block/meson.build
+@@ -123,6 +123,7 @@ foreach m : [
+   [libnfs, 'nfs', files('nfs.c')],
+   [libssh, 'ssh', files('ssh.c')],
+   [rbd, 'rbd', files('rbd.c')],
+  [vitastor, 'vitastor', files('vitastor.c')],
+ ]
+   if m[0].found()
+     module_ss = ss.source_set()
+Index: pve-qemu-kvm-8.1.2/meson.build
+===================================================================
+--- pve-qemu-kvm-8.1.2.orig/meson.build
+++ pve-qemu-kvm-8.1.2/meson.build
+@@ -1303,6 +1303,26 @@ if not get_option('rbd').auto() or have_
+   endif
+ endif
+ 
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+  libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+    required: get_option('vitastor'))
+  if libvitastor_client.found()
+    if cc.links('''
+      #include <vitastor_c.h>
+      int main(void) {
+        vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+        return 0;
+      }''', dependencies: libvitastor_client)
+      vitastor = declare_dependency(dependencies: libvitastor_client)
+    elif get_option('vitastor').enabled()
+      error('could not link libvitastor_client')
+    else
+      warning('could not link libvitastor_client, disabling')
+    endif
+  endif
+endif
+
+ glusterfs = not_found
+ glusterfs_ftruncate_has_stat = false
+ glusterfs_iocb_has_stat = false
+@@ -2123,6 +2143,7 @@ if numa.found()
+ endif
+ config_host_data.set('CONFIG_OPENGL', opengl.found())
+ config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
+ config_host_data.set('CONFIG_RDMA', rdma.found())
+ config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
+ config_host_data.set('CONFIG_SDL', sdl.found())
+@@ -4298,6 +4319,7 @@ summary_info += {'fdt support':       fd
+ summary_info += {'libcap-ng support': libcap_ng}
+ summary_info += {'bpf support':       libbpf}
+ summary_info += {'rbd support':       rbd}
+summary_info += {'vitastor support':  vitastor}
+ summary_info += {'smartcard support': cacard}
+ summary_info += {'U2F support':       u2f}
+ summary_info += {'libusb':            libusb}
+Index: pve-qemu-kvm-8.1.2/meson_options.txt
+===================================================================
+--- pve-qemu-kvm-8.1.2.orig/meson_options.txt
+++ pve-qemu-kvm-8.1.2/meson_options.txt
+@@ -186,6 +186,8 @@ option('lzo', type : 'feature', value :
+        description: 'lzo compression support')
+ option('rbd', type : 'feature', value : 'auto',
+        description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+       description: 'Vitastor block device driver')
+ option('opengl', type : 'feature', value : 'auto',
+        description: 'OpenGL support')
+ option('rdma', type : 'feature', value : 'auto',
+Index: pve-qemu-kvm-8.1.2/qapi/block-core.json
+===================================================================
+--- pve-qemu-kvm-8.1.2.orig/qapi/block-core.json
+++ pve-qemu-kvm-8.1.2/qapi/block-core.json
+@@ -3403,7 +3403,7 @@
+             'raw', 'rbd',
+             { 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
+             'pbs',
+-            'ssh', 'throttle', 'vdi', 'vhdx',
+            'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
+             { 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
+             { 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
+             { 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
+@@ -4465,6 +4465,28 @@
+             '*server': ['InetSocketAddressBase'] } }
+ 
+ ##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image:       Image name
+# @inode:       Inode number
+# @pool:        Pool ID
+# @size:        Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host:   etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+  'data': { '*inode': 'uint64',
+            '*pool': 'uint64',
+            '*size': 'uint64',
+            '*image': 'str',
+            '*config-path': 'str',
+            '*etcd-host': 'str',
+            '*etcd-prefix': 'str' } }
+
+##
+ # @ReplicationMode:
+ #
+ # An enumeration of replication modes.
+@@ -4923,6 +4945,7 @@
+       'throttle':   'BlockdevOptionsThrottle',
+       'vdi':        'BlockdevOptionsGenericFormat',
+       'vhdx':       'BlockdevOptionsGenericFormat',
+      'vitastor':   'BlockdevOptionsVitastor',
+       'virtio-blk-vfio-pci':
+                     { 'type': 'BlockdevOptionsVirtioBlkVfioPci',
+                       'if': 'CONFIG_BLKIO' },
+@@ -5360,6 +5383,17 @@
+             '*encrypt' :        'RbdEncryptionCreateOptions' } }
+ 
+ ##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+  'data': { 'location':         'BlockdevOptionsVitastor',
+            'size':             'size' } }
+
+##
+ # @BlockdevVmdkSubformat:
+ #
+ # Subformat options for VMDK images
+@@ -5581,6 +5615,7 @@
+       'ssh':            'BlockdevCreateOptionsSsh',
+       'vdi':            'BlockdevCreateOptionsVdi',
+       'vhdx':           'BlockdevCreateOptionsVhdx',
+      'vitastor':       'BlockdevCreateOptionsVitastor',
+       'vmdk':           'BlockdevCreateOptionsVmdk',
+       'vpc':            'BlockdevCreateOptionsVpc'
+   } }
+Index: pve-qemu-kvm-8.1.2/scripts/ci/org.centos/stream/8/x86_64/configure
+===================================================================
+--- pve-qemu-kvm-8.1.2.orig/scripts/ci/org.centos/stream/8/x86_64/configure
+++ pve-qemu-kvm-8.1.2/scripts/ci/org.centos/stream/8/x86_64/configure
+@@ -30,7 +30,7 @@
+ --with-suffix="qemu-kvm" \
+ --firmwarepath=/usr/share/qemu-firmware \
+ --target-list="x86_64-softmmu" \
+---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
+ --audio-drv-list="" \
+ --block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
+ --with-coroutine=ucontext \
+@@ -176,6 +176,7 @@
+ --enable-opengl \
+ --enable-pie \
+ --enable-rbd \
+--enable-vitastor \
+ --enable-rdma \
+ --enable-seccomp \
+ --enable-snappy \
+Index: pve-qemu-kvm-8.1.2/scripts/meson-buildoptions.sh
+===================================================================
+--- pve-qemu-kvm-8.1.2.orig/scripts/meson-buildoptions.sh
+++ pve-qemu-kvm-8.1.2/scripts/meson-buildoptions.sh
+@@ -153,6 +153,7 @@ meson_options_help() {
+   printf "%s\n" '  qed             qed image format support'
+   printf "%s\n" '  qga-vss         build QGA VSS support (broken with MinGW)'
+   printf "%s\n" '  rbd             Ceph block device driver'
+  printf "%s\n" '  vitastor        Vitastor block device driver'
+   printf "%s\n" '  rdma            Enable RDMA-based migration'
+   printf "%s\n" '  replication     replication support'
+   printf "%s\n" '  sdl             SDL user interface'
+@@ -416,6 +417,8 @@ _meson_option_parse() {
+     --disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
+     --enable-rbd) printf "%s" -Drbd=enabled ;;
+     --disable-rbd) printf "%s" -Drbd=disabled ;;
+    --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
+    --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
+     --enable-rdma) printf "%s" -Drdma=enabled ;;
+     --disable-rdma) printf "%s" -Drdma=disabled ;;
+     --enable-replication) printf "%s" -Dreplication=enabled ;;
--- a/patches/qemu-8.1-vitastor.patch
+++ b/patches/qemu-8.1-vitastor.patch
@@ -0,0 +1,190 @@
+diff --git a/block/meson.build b/block/meson.build
+index 529fc172c6..d542dc0609 100644
+--- a/block/meson.build
+++ b/block/meson.build
+@@ -110,6 +110,7 @@ foreach m : [
+   [libnfs, 'nfs', files('nfs.c')],
+   [libssh, 'ssh', files('ssh.c')],
+   [rbd, 'rbd', files('rbd.c')],
+  [vitastor, 'vitastor', files('vitastor.c')],
+ ]
+   if m[0].found()
+     module_ss = ss.source_set()
+diff --git a/meson.build b/meson.build
+index a9c4f28247..8496cf13f1 100644
+--- a/meson.build
+++ b/meson.build
+@@ -1303,6 +1303,26 @@ if not get_option('rbd').auto() or have_block
+   endif
+ endif
+ 
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+  libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+    required: get_option('vitastor'))
+  if libvitastor_client.found()
+    if cc.links('''
+      #include <vitastor_c.h>
+      int main(void) {
+        vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+        return 0;
+      }''', dependencies: libvitastor_client)
+      vitastor = declare_dependency(dependencies: libvitastor_client)
+    elif get_option('vitastor').enabled()
+      error('could not link libvitastor_client')
+    else
+      warning('could not link libvitastor_client, disabling')
+    endif
+  endif
+endif
+
+ glusterfs = not_found
+ glusterfs_ftruncate_has_stat = false
+ glusterfs_iocb_has_stat = false
+@@ -2119,6 +2139,7 @@ if numa.found()
+ endif
+ config_host_data.set('CONFIG_OPENGL', opengl.found())
+ config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
+ config_host_data.set('CONFIG_RDMA', rdma.found())
+ config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
+ config_host_data.set('CONFIG_SDL', sdl.found())
+@@ -4286,6 +4307,7 @@ summary_info += {'fdt support':       fdt_opt == 'disabled' ? false : fdt_opt}
+ summary_info += {'libcap-ng support': libcap_ng}
+ summary_info += {'bpf support':       libbpf}
+ summary_info += {'rbd support':       rbd}
+summary_info += {'vitastor support':  vitastor}
+ summary_info += {'smartcard support': cacard}
+ summary_info += {'U2F support':       u2f}
+ summary_info += {'libusb':            libusb}
+diff --git a/meson_options.txt b/meson_options.txt
+index ae6d8f469d..e3d9f8404d 100644
+--- a/meson_options.txt
+++ b/meson_options.txt
+@@ -186,6 +186,8 @@ option('lzo', type : 'feature', value : 'auto',
+        description: 'lzo compression support')
+ option('rbd', type : 'feature', value : 'auto',
+        description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+       description: 'Vitastor block device driver')
+ option('opengl', type : 'feature', value : 'auto',
+        description: 'OpenGL support')
+ option('rdma', type : 'feature', value : 'auto',
+diff --git a/qapi/block-core.json b/qapi/block-core.json
+index 2b1d493d6e..90673fdbdc 100644
+--- a/qapi/block-core.json
+++ b/qapi/block-core.json
+@@ -3146,7 +3146,7 @@
+             'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum',
+             'raw', 'rbd',
+             { 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
+-            'ssh', 'throttle', 'vdi', 'vhdx',
+            'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
+             { 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
+             { 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
+             { 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
+@@ -4196,6 +4196,28 @@
+             '*key-secret': 'str',
+             '*server': ['InetSocketAddressBase'] } }
+ 
+##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image:       Image name
+# @inode:       Inode number
+# @pool:        Pool ID
+# @size:        Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host:   etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+  'data': { '*inode': 'uint64',
+            '*pool': 'uint64',
+            '*size': 'uint64',
+            '*image': 'str',
+            '*config-path': 'str',
+            '*etcd-host': 'str',
+            '*etcd-prefix': 'str' } }
+
+ ##
+ # @ReplicationMode:
+ #
+@@ -4654,6 +4676,7 @@
+       'throttle':   'BlockdevOptionsThrottle',
+       'vdi':        'BlockdevOptionsGenericFormat',
+       'vhdx':       'BlockdevOptionsGenericFormat',
+      'vitastor':   'BlockdevOptionsVitastor',
+       'virtio-blk-vfio-pci':
+                     { 'type': 'BlockdevOptionsVirtioBlkVfioPci',
+                       'if': 'CONFIG_BLKIO' },
+@@ -5089,6 +5112,17 @@
+             '*cluster-size' :   'size',
+             '*encrypt' :        'RbdEncryptionCreateOptions' } }
+ 
+##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+  'data': { 'location':         'BlockdevOptionsVitastor',
+            'size':             'size' } }
+
+ ##
+ # @BlockdevVmdkSubformat:
+ #
+@@ -5311,6 +5345,7 @@
+       'ssh':            'BlockdevCreateOptionsSsh',
+       'vdi':            'BlockdevCreateOptionsVdi',
+       'vhdx':           'BlockdevCreateOptionsVhdx',
+      'vitastor':       'BlockdevCreateOptionsVitastor',
+       'vmdk':           'BlockdevCreateOptionsVmdk',
+       'vpc':            'BlockdevCreateOptionsVpc'
+   } }
+diff --git a/scripts/ci/org.centos/stream/8/x86_64/configure b/scripts/ci/org.centos/stream/8/x86_64/configure
+index d02b09a4b9..f0b5fbfef3 100755
+--- a/scripts/ci/org.centos/stream/8/x86_64/configure
+++ b/scripts/ci/org.centos/stream/8/x86_64/configure
+@@ -30,7 +30,7 @@
+ --with-suffix="qemu-kvm" \
+ --firmwarepath=/usr/share/qemu-firmware \
+ --target-list="x86_64-softmmu" \
+---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
+ --audio-drv-list="" \
+ --block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
+ --with-coroutine=ucontext \
+@@ -176,6 +176,7 @@
+ --enable-opengl \
+ --enable-pie \
+ --enable-rbd \
+--enable-vitastor \
+ --enable-rdma \
+ --enable-seccomp \
+ --enable-snappy \
+diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
+index d7020af175..94958eb6fa 100644
+--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
+@@ -153,6 +153,7 @@ meson_options_help() {
+   printf "%s\n" '  qed             qed image format support'
+   printf "%s\n" '  qga-vss         build QGA VSS support (broken with MinGW)'
+   printf "%s\n" '  rbd             Ceph block device driver'
+  printf "%s\n" '  vitastor        Vitastor block device driver'
+   printf "%s\n" '  rdma            Enable RDMA-based migration'
+   printf "%s\n" '  replication     replication support'
+   printf "%s\n" '  sdl             SDL user interface'
+@@ -416,6 +417,8 @@ _meson_option_parse() {
+     --disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
+     --enable-rbd) printf "%s" -Drbd=enabled ;;
+     --disable-rbd) printf "%s" -Drbd=disabled ;;
+    --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
+    --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
+     --enable-rdma) printf "%s" -Drdma=enabled ;;
+     --disable-rdma) printf "%s" -Drdma=disabled ;;
+     --enable-replication) printf "%s" -Dreplication=enabled ;;
--- a/rpm/build-tarball.sh
+++ b/rpm/build-tarball.sh
@@ -24,4 +24,4 @@ rm fio
 mv fio-copy fio
 FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
 perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
-tar --transform 's#^#vitastor-0.9.6/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.9.6$(rpm --eval '%dist').tar.gz *
+tar --transform 's#^#vitastor-1.3.1/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.3.1$(rpm --eval '%dist').tar.gz *
--- a/rpm/vitastor-el7.Dockerfile
+++ b/rpm/vitastor-el7.Dockerfile
@@ -15,6 +15,7 @@ RUN yumdownloader --disablerepo=centos-sclo-rh --source fio
 RUN rpm --nomd5 -i fio*.src.rpm
 RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
 RUN cd ~/rpmbuild/SPECS && yum-builddep -y fio.spec
+RUN yum -y install cmake3

 ADD https://vitastor.io/rpms/liburing-el7/liburing-0.7-2.el7.src.rpm /root

@@ -35,7 +36,7 @@ ADD . /root/vitastor
 RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
-    cp /root/vitastor-0.9.6.el7.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-1.3.1.el7.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
--- a/rpm/vitastor-el7.spec
+++ b/rpm/vitastor-el7.spec
@@ -1,11 +1,11 @@
 Name:           vitastor
-Version:        0.9.6
+Version:        1.3.1
 Release:        1%{?dist}
 Summary:        Vitastor, a fast software-defined clustered block storage

 License:        Vitastor Network Public License 1.1
 URL:            https://vitastor.io/
-Source0:        vitastor-0.9.6.el7.tar.gz
+Source0:        vitastor-1.3.1.el7.tar.gz

 BuildRequires:  liburing-devel >= 0.6
 BuildRequires:  gperftools-devel
@@ -16,7 +16,7 @@ BuildRequires:  jerasure-devel
 BuildRequires:  libisa-l-devel
 BuildRequires:  gf-complete-devel
 BuildRequires:  libibverbs-devel
-BuildRequires:  cmake
+BuildRequires:  cmake3
 Requires:       vitastor-osd = %{version}-%{release}
 Requires:       vitastor-mon = %{version}-%{release}
 Requires:       vitastor-client = %{version}-%{release}
@@ -94,7 +94,7 @@ Vitastor fio drivers for benchmarking.

 %build
 . /opt/rh/devtoolset-9/enable
-%cmake .
+%cmake3 .
 %make_build


--- a/rpm/vitastor-el8.Dockerfile
+++ b/rpm/vitastor-el8.Dockerfile
@@ -35,7 +35,7 @@ ADD . /root/vitastor
 RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
-    cp /root/vitastor-0.9.6.el8.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-1.3.1.el8.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
--- a/rpm/vitastor-el8.spec
+++ b/rpm/vitastor-el8.spec
@@ -1,11 +1,11 @@
 Name:           vitastor
-Version:        0.9.6
+Version:        1.3.1
 Release:        1%{?dist}
 Summary:        Vitastor, a fast software-defined clustered block storage

 License:        Vitastor Network Public License 1.1
 URL:            https://vitastor.io/
-Source0:        vitastor-0.9.6.el8.tar.gz
+Source0:        vitastor-1.3.1.el8.tar.gz

 BuildRequires:  liburing-devel >= 0.6
 BuildRequires:  gperftools-devel
--- a/rpm/vitastor-el9.Dockerfile
+++ b/rpm/vitastor-el9.Dockerfile
@@ -18,7 +18,7 @@ ADD . /root/vitastor
 RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
-    cp /root/vitastor-0.9.6.el9.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-1.3.1.el9.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
--- a/rpm/vitastor-el9.spec
+++ b/rpm/vitastor-el9.spec
@@ -1,11 +1,11 @@
 Name:           vitastor
-Version:        0.9.6
+Version:        1.3.1
 Release:        1%{?dist}
 Summary:        Vitastor, a fast software-defined clustered block storage

 License:        Vitastor Network Public License 1.1
 URL:            https://vitastor.io/
-Source0:        vitastor-0.9.6.el9.tar.gz
+Source0:        vitastor-1.3.1.el9.tar.gz

 BuildRequires:  liburing-devel >= 0.6
 BuildRequires:  gperftools-devel
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -16,10 +16,11 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
 	set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
 endif()

-add_definitions(-DVERSION="0.9.6")
-add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
+add_definitions(-DVERSION="1.3.1")
+add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
+add_link_options(-fno-omit-frame-pointer)
 if (${WITH_ASAN})
-	add_definitions(-fsanitize=address -fno-omit-frame-pointer)
+	add_definitions(-fsanitize=address)
 	add_link_options(-fsanitize=address -fno-omit-frame-pointer)
 endif (${WITH_ASAN})

@@ -137,6 +138,7 @@ endif (${WITH_FIO})
 add_library(vitastor_client SHARED
 	cluster_client.cpp
 	cluster_client_list.cpp
+	cluster_client_wb.cpp
 	vitastor_c.cpp
 	cli_common.cpp
 	cli_alloc_osd.cpp
@@ -300,7 +302,7 @@ target_link_libraries(test_crc32
 add_executable(test_cluster_client
 	EXCLUDE_FROM_ALL
 	test_cluster_client.cpp
-	pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
+	pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp cluster_client_wb.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
 	etcd_state_client.cpp timerfd_manager.cpp str_util.cpp ../json11/json11.cpp
 )
 target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)
--- a/src/addr_util.cpp
+++ b/src/addr_util.cpp
@@ -19,8 +19,8 @@ bool string_to_addr(std::string str, bool parse_port, int default_port, struct s
        if (p != std::string::npos && !(str.length() > 0 && str[p-1] == ']')) // "[ipv6]" which contains ':'
        {
            char null_byte = 0;
-            int n = sscanf(str.c_str()+p+1, "%d%c", &default_port, &null_byte);
-            if (n != 1 || default_port >= 0x10000)
+            int scanned = sscanf(str.c_str()+p+1, "%d%c", &default_port, &null_byte);
+            if (scanned != 1 || default_port >= 0x10000)
                return false;
            str = str.substr(0, p);
        }
--- a/src/allocator.cpp
+++ b/src/allocator.cpp
@@ -143,34 +143,83 @@ uint64_t allocator::get_free_count()
    return free;
 }

+// FIXME: Move to utils?
 void bitmap_set(void *bitmap, uint64_t start, uint64_t len, uint64_t bitmap_granularity)
 {
-    if (start == 0)
+    if (start == 0 && len == 32*bitmap_granularity)
+        *((uint32_t*)bitmap) = UINT32_MAX;
+    else if (start == 0 && len == 64*bitmap_granularity)
+        *((uint64_t*)bitmap) = UINT64_MAX;
+    else
    {
-        if (len == 32*bitmap_granularity)
+        unsigned bit_start = start / bitmap_granularity;
+        unsigned bit_end = ((start + len) + bitmap_granularity - 1) / bitmap_granularity;
+        while (bit_start < bit_end)
        {
-            *((uint32_t*)bitmap) = UINT32_MAX;
-            return;
-        }
-        else if (len == 64*bitmap_granularity)
-        {
-            *((uint64_t*)bitmap) = UINT64_MAX;
-            return;
-        }
-    }
-    unsigned bit_start = start / bitmap_granularity;
-    unsigned bit_end = ((start + len) + bitmap_granularity - 1) / bitmap_granularity;
-    while (bit_start < bit_end)
-    {
-        if (!(bit_start & 7) && bit_end >= bit_start+8)
-        {
-            ((uint8_t*)bitmap)[bit_start / 8] = UINT8_MAX;
-            bit_start += 8;
-        }
-        else
-        {
-            ((uint8_t*)bitmap)[bit_start / 8] |= 1 << (bit_start % 8);
-            bit_start++;
+            if (!(bit_start & 7) && bit_end >= bit_start+8)
+            {
+                ((uint8_t*)bitmap)[bit_start / 8] = UINT8_MAX;
+                bit_start += 8;
+            }
+            else
+            {
+                ((uint8_t*)bitmap)[bit_start / 8] |= 1 << (bit_start % 8);
+                bit_start++;
+            }
        }
    }
 }
+
+void bitmap_clear(void *bitmap, uint64_t start, uint64_t len, uint64_t bitmap_granularity) // Clears the bits covering [start, start+len); one bit stands for bitmap_granularity units (presumably bytes - confirm with callers)
+{
+    if (start == 0 && len == 32*bitmap_granularity) // fast path: the range is exactly the first 32 bits
+        *((uint32_t*)bitmap) = 0;
+    else if (start == 0 && len == 64*bitmap_granularity) // fast path: the range is exactly the first 64 bits
+        *((uint64_t*)bitmap) = 0;
+    else
+    {
+        unsigned bit_start = start / bitmap_granularity; // first bit of the range (rounded down)
+        unsigned bit_end = ((start + len) + bitmap_granularity - 1) / bitmap_granularity; // one past the last bit (rounded up)
+        while (bit_start < bit_end)
+        {
+            if (!(bit_start & 7) && bit_end >= bit_start+8) // byte-aligned with at least 8 bits left: zero the whole byte
+            {
+                ((uint8_t*)bitmap)[bit_start / 8] = 0;
+                bit_start += 8;
+            }
+            else
+            {
+                ((uint8_t*)bitmap)[bit_start / 8] &= (0xFF ^ (1 << (bit_start % 8))); // drop a single bit, keep the other seven
+                bit_start++;
+            }
+        }
+    }
+}
+
+bool bitmap_check(void *bitmap, uint64_t start, uint64_t len, uint64_t bitmap_granularity) // Returns true if at least one bit covering [start, start+len) is set; one bit stands for bitmap_granularity units
+{
+    bool r = false; // accumulates "any bit set"; the loop below does not exit early once it becomes true
+    if (start == 0 && len == 32*bitmap_granularity) // fast path: the range is exactly the first 32 bits
+        r = !!*((uint32_t*)bitmap);
+    else if (start == 0 && len == 64*bitmap_granularity) // fast path: the range is exactly the first 64 bits
+        r = !!*((uint64_t*)bitmap);
+    else
+    {
+        unsigned bit_start = start / bitmap_granularity; // first bit of the range (rounded down)
+        unsigned bit_end = ((start + len) + bitmap_granularity - 1) / bitmap_granularity; // one past the last bit (rounded up)
+        while (bit_start < bit_end)
+        {
+            if (!(bit_start & 7) && bit_end >= bit_start+8) // byte-aligned with at least 8 bits left: test the whole byte at once
+            {
+                r = r || !!((uint8_t*)bitmap)[bit_start / 8];
+                bit_start += 8;
+            }
+            else
+            {
+                r = r || (((uint8_t*)bitmap)[bit_start / 8] & (1 << (bit_start % 8))); // test a single bit
+                bit_start++;
+            }
+        }
+    }
+    return r;
+}
--- a/src/allocator.h
+++ b/src/allocator.h
@@ -23,3 +23,5 @@ public:
 };

 void bitmap_set(void *bitmap, uint64_t start, uint64_t len, uint64_t bitmap_granularity);
+void bitmap_clear(void *bitmap, uint64_t start, uint64_t len, uint64_t bitmap_granularity);
+bool bitmap_check(void *bitmap, uint64_t start, uint64_t len, uint64_t bitmap_granularity);
--- a/src/blockstore.h
+++ b/src/blockstore.h
@@ -77,6 +77,7 @@ Output:
  -EINVAL = invalid input parameters
  -ENOENT = requested object/version does not exist for reads
  -ENOSPC = no space left in the store for writes
+  -EDOM = checksum error
 - version = the version actually read or written

 ## BS_OP_DELETE
--- a/src/blockstore_disk.cpp
+++ b/src/blockstore_disk.cpp
@@ -40,10 +40,49 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
    data_block_size = parse_size(config["block_size"]);
    journal_device = config["journal_device"];
    journal_offset = parse_size(config["journal_offset"]);
-    disk_alignment = strtoull(config["disk_alignment"].c_str(), NULL, 10);
-    journal_block_size = strtoull(config["journal_block_size"].c_str(), NULL, 10);
-    meta_block_size = strtoull(config["meta_block_size"].c_str(), NULL, 10);
-    bitmap_granularity = strtoull(config["bitmap_granularity"].c_str(), NULL, 10);
+    disk_alignment = parse_size(config["disk_alignment"]);
+    journal_block_size = parse_size(config["journal_block_size"]);
+    meta_block_size = parse_size(config["meta_block_size"]);
+    bitmap_granularity = parse_size(config["bitmap_granularity"]);
+    meta_format = stoull_full(config["meta_format"]);
+    if (config.find("data_io") == config.end() &&
+        config.find("meta_io") == config.end() &&
+        config.find("journal_io") == config.end())
+    {
+        bool cached_io_data = config["cached_io_data"] == "true" || config["cached_io_data"] == "yes" || config["cached_io_data"] == "1";
+        bool cached_io_meta = cached_io_data && (meta_device == data_device || meta_device == "") &&
+            config.find("cached_io_meta") == config.end() ||
+            config["cached_io_meta"] == "true" || config["cached_io_meta"] == "yes" || config["cached_io_meta"] == "1";
+        bool cached_io_journal = cached_io_meta && (journal_device == meta_device || journal_device == "") &&
+            config.find("cached_io_journal") == config.end() ||
+            config["cached_io_journal"] == "true" || config["cached_io_journal"] == "yes" || config["cached_io_journal"] == "1";
+        data_io = cached_io_data ? "cached" : "direct";
+        meta_io = cached_io_meta ? "cached" : "direct";
+        journal_io = cached_io_journal ? "cached" : "direct";
+    }
+    else
+    {
+        data_io = config.find("data_io") != config.end() ? config["data_io"] : "direct";
+        meta_io = config.find("meta_io") != config.end()
+            ? config["meta_io"]
+            : (meta_device == data_device || meta_device == "" ? data_io : "direct");
+        journal_io = config.find("journal_io") != config.end()
+            ? config["journal_io"]
+            : (journal_device == meta_device || journal_device == "" ? meta_io : "direct");
+    }
+    if (config["data_csum_type"] == "crc32c")
+    {
+        data_csum_type = BLOCKSTORE_CSUM_CRC32C;
+    }
+    else if (config["data_csum_type"] == "" || config["data_csum_type"] == "none")
+    {
+        data_csum_type = BLOCKSTORE_CSUM_NONE;
+    }
+    else
+    {
+        throw std::runtime_error("data_csum_type="+config["data_csum_type"]+" is unsupported, only \"crc32c\" and \"none\" are supported");
+    }
+    csum_block_size = parse_size(config["csum_block_size"]);
    // Validate
    if (!data_block_size)
    {
@@ -91,7 +130,23 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
    }
    if (data_block_size % bitmap_granularity)
    {
-        throw std::runtime_error("Block size must be a multiple of sparse write tracking granularity");
+        throw std::runtime_error("Data block size must be a multiple of sparse write tracking granularity");
+    }
+    if (!data_csum_type)
+    {
+        csum_block_size = 0;
+    }
+    else if (!csum_block_size)
+    {
+        csum_block_size = bitmap_granularity;
+    }
+    if (csum_block_size && (csum_block_size % bitmap_granularity))
+    {
+        throw std::runtime_error("Checksum block size must be a multiple of sparse write tracking granularity");
+    }
+    if (csum_block_size && (data_block_size % csum_block_size))
+    {
+        throw std::runtime_error("Checksum block size must be a divisor of data block size");
    }
    if (meta_device == "")
    {
@@ -110,7 +165,9 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
        throw std::runtime_error("journal_offset must be a multiple of journal_block_size = "+std::to_string(journal_block_size));
    }
    clean_entry_bitmap_size = data_block_size / bitmap_granularity / 8;
-    clean_entry_size = sizeof(clean_disk_entry) + 2*clean_entry_bitmap_size;
+    clean_dyn_size = clean_entry_bitmap_size*2 + (csum_block_size
+        ? data_block_size/csum_block_size*(data_csum_type & 0xFF) : 0);
+    clean_entry_size = sizeof(clean_disk_entry) + clean_dyn_size + 4 /*entry_csum*/;
 }

 void blockstore_disk_t::calc_lengths(bool skip_meta_check)
@@ -160,6 +217,25 @@ void blockstore_disk_t::calc_lengths(bool skip_meta_check)
    // required metadata size
    block_count = data_len / data_block_size;
    meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size;
+    if (meta_format == BLOCKSTORE_META_FORMAT_V1 ||
+        !meta_format && !skip_meta_check && meta_area_size < meta_len && !data_csum_type)
+    {
+        uint64_t clean_entry_v0_size = sizeof(clean_disk_entry) + 2*clean_entry_bitmap_size;
+        uint64_t meta_v0_len = (1 + (block_count - 1 + meta_block_size / clean_entry_v0_size)
+            / (meta_block_size / clean_entry_v0_size)) * meta_block_size;
+        if (meta_format == BLOCKSTORE_META_FORMAT_V1 || meta_area_size >= meta_v0_len)
+        {
+            // Old metadata fits.
+            printf("Warning: Using old metadata format without checksums because the new format doesn't fit into provided area\n");
+            clean_entry_size = clean_entry_v0_size;
+            meta_len = meta_v0_len;
+            meta_format = BLOCKSTORE_META_FORMAT_V1;
+        }
+        else
+            meta_format = BLOCKSTORE_META_FORMAT_V2;
+    }
+    else
+        meta_format = BLOCKSTORE_META_FORMAT_V2;
    if (!skip_meta_check && meta_area_size < meta_len)
    {
        throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes");
@@ -214,9 +290,19 @@ static void check_size(int fd, uint64_t *size, uint64_t *sectsize, std::string n
    }
 }

+static int bs_openmode(const std::string & mode) // Maps an I/O mode name to open() flags; the access mode (O_RDWR) is OR-ed in by the callers
+{
+    if (mode == "directsync") // bypass the page cache and make every write synchronous
+        return O_DIRECT|O_SYNC;
+    else if (mode == "cached") // go through the page cache, but still write synchronously
+        return O_SYNC;
+    else // "direct", "" and any unrecognized value fall back to plain O_DIRECT
+        return O_DIRECT;
+}
+
 void blockstore_disk_t::open_data()
 {
-    data_fd = open(data_device.c_str(), O_DIRECT|O_RDWR);
+    data_fd = open(data_device.c_str(), bs_openmode(data_io) | O_RDWR);
    if (data_fd == -1)
    {
        throw std::runtime_error("Failed to open data device "+data_device+": "+std::string(strerror(errno)));
@@ -241,9 +327,9 @@ void blockstore_disk_t::open_data()

 void blockstore_disk_t::open_meta()
 {
-    if (meta_device != data_device)
+    if (meta_device != data_device || meta_io != data_io)
    {
-        meta_fd = open(meta_device.c_str(), O_DIRECT|O_RDWR);
+        meta_fd = open(meta_device.c_str(), bs_openmode(meta_io) | O_RDWR);
        if (meta_fd == -1)
        {
            throw std::runtime_error("Failed to open metadata device "+meta_device+": "+std::string(strerror(errno)));
@@ -253,7 +339,7 @@ void blockstore_disk_t::open_meta()
        {
            throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(meta_device_size));
        }
-        if (!disable_flock && flock(meta_fd, LOCK_EX|LOCK_NB) != 0)
+        if (!disable_flock && meta_device != data_device && flock(meta_fd, LOCK_EX|LOCK_NB) != 0)
        {
            throw std::runtime_error(std::string("Failed to lock metadata device: ") + strerror(errno));
        }
@@ -279,15 +365,15 @@ void blockstore_disk_t::open_meta()

 void blockstore_disk_t::open_journal()
 {
-    if (journal_device != meta_device)
+    if (journal_device != meta_device || journal_io != meta_io)
    {
-        journal_fd = open(journal_device.c_str(), O_DIRECT|O_RDWR);
+        journal_fd = open(journal_device.c_str(), bs_openmode(journal_io) | O_RDWR);
        if (journal_fd == -1)
        {
            throw std::runtime_error("Failed to open journal device "+journal_device+": "+std::string(strerror(errno)));
        }
        check_size(journal_fd, &journal_device_size, &journal_device_sect, "journal device");
-        if (!disable_flock && flock(journal_fd, LOCK_EX|LOCK_NB) != 0)
+        if (!disable_flock && journal_device != meta_device && flock(journal_fd, LOCK_EX|LOCK_NB) != 0)
        {
            throw std::runtime_error(std::string("Failed to lock journal device: ") + strerror(errno));
        }
--- a/src/blockstore_disk.h
+++ b/src/blockstore_disk.h
@@ -8,6 +8,10 @@
 #include <string>
 #include <map>

+#define BLOCKSTORE_CSUM_NONE 0
+// Lower byte of checksum type is its length
+#define BLOCKSTORE_CSUM_CRC32C 0x104
+
 struct blockstore_disk_t
 {
    std::string data_device, meta_device, journal_device;
@@ -21,17 +25,24 @@ struct blockstore_disk_t
    uint64_t meta_block_size = 4096;
    // Sparse write tracking granularity. 4 KB is a good choice. Must be a multiple of disk_alignment
    uint64_t bitmap_granularity = 4096;
+    // Data checksum type, BLOCKSTORE_CSUM_NONE or BLOCKSTORE_CSUM_CRC32C
+    uint32_t data_csum_type = BLOCKSTORE_CSUM_NONE;
+    // Checksum block size, must be a multiple of bitmap_granularity and a divisor of data_block_size
+    uint32_t csum_block_size = 4096;
    // By default, Blockstore locks all opened devices exclusively. This option can be used to disable locking
    bool disable_flock = false;
+    // I/O modes for data, metadata and journal: direct or "" = O_DIRECT, cached = O_SYNC, directsync = O_DIRECT|O_SYNC
+    // O_SYNC without O_DIRECT = use Linux page cache for reads and writes
+    std::string data_io, meta_io, journal_io;

    int meta_fd = -1, data_fd = -1, journal_fd = -1;
-    uint64_t meta_offset, meta_device_sect, meta_device_size, meta_len;
+    uint64_t meta_offset, meta_device_sect, meta_device_size, meta_len, meta_format = 0;
    uint64_t data_offset, data_device_sect, data_device_size, data_len;
    uint64_t journal_offset, journal_device_sect, journal_device_size, journal_len;

    uint32_t block_order;
    uint64_t block_count;
-    uint32_t clean_entry_bitmap_size = 0, clean_entry_size = 0;
+    uint32_t clean_entry_bitmap_size = 0, clean_entry_size = 0, clean_dyn_size = 0;

    void parse_config(std::map<std::string, std::string> & config);
    void open_data();
@@ -39,4 +50,13 @@ struct blockstore_disk_t
    void open_journal();
    void calc_lengths(bool skip_meta_check = false);
    void close_all();
+
+    inline uint64_t dirty_dyn_size(uint64_t offset, uint64_t len) // Returns clean_entry_bitmap_size plus the checksum bytes needed for a write of [offset, offset+len)
+    {
+        // Checksums may be partial if write is not aligned with csum_block_size
+        return clean_entry_bitmap_size + (csum_block_size && len > 0 // no checksum bytes when csum_block_size is 0 or the write is empty
+            ? ((offset+len+csum_block_size-1)/csum_block_size - offset/csum_block_size) // number of checksum blocks the write touches, partial ones included
+                * (data_csum_type & 0xFF) // lower byte of the checksum type is the per-block checksum length
+            : 0);
+    }
 };
--- a/src/blockstore_flush.cpp
+++ b/src/blockstore_flush.cpp
--- a/src/blockstore_flush.h
+++ b/src/blockstore_flush.h
@@ -1,10 +1,22 @@
 // Copyright (c) Vitaliy Filippov, 2019+
 // License: VNPL-1.1 (see README.md for details)

+#define COPY_BUF_JOURNAL 1
+#define COPY_BUF_DATA 2
+#define COPY_BUF_ZERO 4
+#define COPY_BUF_CSUM_FILL 8
+#define COPY_BUF_COALESCED 16
+#define COPY_BUF_META_BLOCK 32
+#define COPY_BUF_JOURNALED_BIG 64
+
 struct copy_buffer_t
 {
-    uint64_t offset, len;
+    int copy_flags;
+    uint64_t offset, len, disk_offset;
+    uint64_t journal_sector; // only for reads: sector+1 if used and !journal.inmemory, otherwise 0
    void *buf;
+    uint8_t *csum_buf;
+    int *dyn_data;
 };

 struct meta_sector_t
@@ -37,7 +49,7 @@ class journal_flusher_co
 {
    blockstore_impl_t *bs;
    journal_flusher_t *flusher;
-    int wait_state, wait_count;
+    int wait_state, wait_count, wait_journal_count;
    struct io_uring_sqe *sqe;
    struct ring_data_t *data;

@@ -46,28 +58,39 @@ class journal_flusher_co
    obj_ver_id cur;
    std::map<obj_ver_id, dirty_entry>::iterator dirty_it, dirty_start, dirty_end;
    std::map<object_id, uint64_t>::iterator repeat_it;
-    std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;
+    std::function<void(ring_data_t*)> simple_callback_r, simple_callback_rj, simple_callback_w;

    bool skip_copy, has_delete, has_writes;
    std::vector<copy_buffer_t> v;
    std::vector<copy_buffer_t>::iterator it;
+    int i;
+    bool fill_incomplete, cleared_incomplete;
+    int read_to_fill_incomplete;
    int copy_count;
-    uint64_t clean_loc, old_clean_loc;
+    uint64_t clean_loc, clean_ver, old_clean_loc, old_clean_ver;
    flusher_meta_write_t meta_old, meta_new;
    bool clean_init_bitmap;
    uint64_t clean_bitmap_offset, clean_bitmap_len;
-    void *new_clean_bitmap;
+    uint8_t *clean_init_dyn_ptr;
+    uint8_t *new_clean_bitmap;

    uint64_t new_trim_pos;

-    // local: scan_dirty()
-    uint64_t offset, end_offset, submit_offset, submit_len;
-
    friend class journal_flusher_t;
-    bool scan_dirty(int wait_base);
+    void scan_dirty();
+    bool read_dirty(int wait_base);
+    bool modify_meta_do_reads(int wait_base);
+    bool wait_meta_reads(int wait_base);
    bool modify_meta_read(uint64_t meta_loc, flusher_meta_write_t &wr, int wait_base);
+    bool clear_incomplete_csum_block_bits(int wait_base);
+    void calc_block_checksums(uint32_t *new_data_csums, bool skip_overwrites);
+    void update_metadata_entry();
+    bool write_meta_block(flusher_meta_write_t & meta_block, int wait_base);
    void update_clean_db();
+    void free_data_blocks();
    bool fsync_batch(bool fsync_meta, int wait_base);
+    bool trim_journal(int wait_base);
+    void free_buffers();
 public:
    journal_flusher_co();
    bool loop();
@@ -95,9 +118,10 @@ class journal_flusher_t

    std::map<uint64_t, meta_sector_t> meta_sectors;
    std::deque<object_id> flush_queue;
-    std::map<object_id, uint64_t> flush_versions;
+    std::map<object_id, uint64_t> flush_versions; // FIXME: consider unordered_map?

    bool try_find_older(std::map<obj_ver_id, dirty_entry>::iterator & dirty_end, obj_ver_id & cur);
+    bool try_find_other(std::map<obj_ver_id, dirty_entry>::iterator & dirty_end, obj_ver_id & cur);

 public:
    journal_flusher_t(blockstore_impl_t *bs);
@@ -112,4 +136,5 @@ public:
    void unshift_flush(obj_ver_id oid, bool force);
    void remove_flush(object_id oid);
    void dump_diagnostics();
+    bool is_mutated(uint64_t clean_loc);
 };
--- a/src/blockstore_impl.cpp
+++ b/src/blockstore_impl.cpp
@@ -13,6 +13,7 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
    initialized = 0;
    parse_config(config, true);
    zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.data_block_size);
+    alloc_dyn_data = dsk.clean_dyn_size > sizeof(void*) || dsk.csum_block_size > 0;
    try
    {
        dsk.open_data();
@@ -38,8 +39,8 @@ blockstore_impl_t::~blockstore_impl_t()
    dsk.close_all();
    if (metadata_buffer)
        free(metadata_buffer);
-    if (clean_bitmap)
-        free(clean_bitmap);
+    if (clean_bitmaps)
+        free(clean_bitmaps);
 }

 bool blockstore_impl_t::is_started()
@@ -383,6 +384,10 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
        ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
        return;
    }
+    if (op->opcode == BS_OP_SYNC)
+    {
+        unsynced_queued_ops = 0;
+    }
    init_op(op);
    submit_queue.push_back(op);
    ringloop->wakeup();
@@ -392,6 +397,7 @@ void blockstore_impl_t::init_op(blockstore_op_t *op)
 {
    // Call constructor without allocating memory. We'll call destructor before returning op back
    new ((void*)op->private_data) blockstore_op_private_t;
+    PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
    PRIV(op)->wait_for = 0;
    PRIV(op)->op_state = 0;
    PRIV(op)->pending_ops = 0;
--- a/src/blockstore_impl.h
+++ b/src/blockstore_impl.h
@@ -93,11 +93,10 @@

 // "VITAstor"
 #define BLOCKSTORE_META_MAGIC_V1 0x726F747341544956l
-#define BLOCKSTORE_META_VERSION_V1 1
+#define BLOCKSTORE_META_FORMAT_V1 1
+#define BLOCKSTORE_META_FORMAT_V2 2

 // metadata header (superblock)
-// FIXME: After adding the OSD superblock, add a key to metadata
-// and journal headers to check if they belong to the same OSD
 struct __attribute__((__packed__)) blockstore_meta_header_v1_t
 {
    uint64_t zero;
@@ -108,14 +107,29 @@ struct __attribute__((__packed__)) blockstore_meta_header_v1_t
    uint32_t bitmap_granularity;
 };

+struct __attribute__((__packed__)) blockstore_meta_header_v2_t // Metadata header (superblock), format version 2: the v1 fields followed by checksum settings and a header checksum
+{
+    uint64_t zero; // written as 0; a non-zero value is rejected when the header is read
+    uint64_t magic; // BLOCKSTORE_META_MAGIC_V1 (the magic value is shared with the v1 header)
+    uint64_t version; // metadata format: BLOCKSTORE_META_FORMAT_V1 or BLOCKSTORE_META_FORMAT_V2
+    uint32_t meta_block_size;
+    uint32_t data_block_size;
+    uint32_t bitmap_granularity;
+    uint32_t data_csum_type; // BLOCKSTORE_CSUM_NONE or BLOCKSTORE_CSUM_CRC32C; only filled in for format >= V2
+    uint32_t csum_block_size; // only filled in for format >= V2
+    uint32_t header_csum; // crc32c over this whole struct, computed with header_csum itself set to 0
+};
+
 // 32 bytes = 24 bytes + block bitmap (4 bytes by default) + external attributes (also bitmap, 4 bytes by default)
 // per "clean" entry on disk with fixed metadata tables
-// FIXME: maybe add crc32's to metadata
 struct __attribute__((__packed__)) clean_disk_entry
 {
    object_id oid;
    uint64_t version;
    uint8_t bitmap[];
+    // Two more fields come after bitmap in metadata version 2:
+    // uint32_t data_csum[];
+    // uint32_t entry_csum;
 };

 // 32 = 16 + 16 bytes per "clean" entry in memory (object_id => clean_entry)
@@ -125,7 +139,7 @@ struct __attribute__((__packed__)) clean_entry
    uint64_t location;
 };

-// 64 = 24 + 40 bytes per dirty entry in memory (obj_ver_id => dirty_entry)
+// 64 = 24 + 40 bytes per dirty entry in memory (obj_ver_id => dirty_entry). Plus checksums
 struct __attribute__((__packed__)) dirty_entry
 {
    uint32_t state;
@@ -134,7 +148,7 @@ struct __attribute__((__packed__)) dirty_entry
    uint32_t offset;   // data offset within object (stripe)
    uint32_t len;      // data length
    uint64_t journal_sector; // journal sector used for this entry
-    void* bitmap;   // either external bitmap itself when it fits, or a pointer to it when it doesn't
+    void* dyn_data;    // dynamic data: external bitmap and data block checksums. may be a pointer to the in-memory journal
 };

 // - Sync must be submitted after previous writes/deletes (not before!)
@@ -163,12 +177,23 @@ struct __attribute__((__packed__)) dirty_entry
 // Suspend operation until there is some free space on the data device
 #define WAIT_FREE 5

-struct fulfill_read_t
+struct used_clean_obj_t
 {
-    uint64_t offset, len;
-    uint64_t journal_sector; // sector+1 if used and !journal.inmemory, otherwise 0
+    int refs;
+    bool was_freed; // was freed by a parallel flush?
+    bool was_changed; // was changed by a parallel flush?
 };

+// https://github.com/algorithm-ninja/cpp-btree
+// https://github.com/greg7mdp/sparsepp/ was used previously, but it was TERRIBLY slow after resizing
+// with sparsepp, random reads dropped to ~700 iops very fast with just as much as ~32k objects in the DB
+typedef btree::btree_map<object_id, clean_entry> blockstore_clean_db_t;
+typedef std::map<obj_ver_id, dirty_entry> blockstore_dirty_db_t;
+
+#include "blockstore_init.h"
+
+#include "blockstore_flush.h"
+
 #define PRIV(op) ((blockstore_op_private_t*)(op)->private_data)
 #define FINISH_OP(op) PRIV(op)->~blockstore_op_private_t(); std::function<void (blockstore_op_t*)>(op->callback)(op)

@@ -181,10 +206,11 @@ struct blockstore_op_private_t
    int op_state;

    // Read
-    std::vector<fulfill_read_t> read_vec;
+    uint64_t clean_block_used;
+    std::vector<copy_buffer_t> read_vec;

    // Sync, write
-    int min_flushed_journal_sector, max_flushed_journal_sector;
+    uint64_t min_flushed_journal_sector, max_flushed_journal_sector;

    // Write
    struct iovec iov_zerofill[3];
@@ -194,19 +220,8 @@ struct blockstore_op_private_t

    // Sync
    std::vector<obj_ver_id> sync_big_writes, sync_small_writes;
-    int sync_small_checked, sync_big_checked;
 };

-// https://github.com/algorithm-ninja/cpp-btree
-// https://github.com/greg7mdp/sparsepp/ was used previously, but it was TERRIBLY slow after resizing
-// with sparsepp, random reads dropped to ~700 iops very fast with just as much as ~32k objects in the DB
-typedef btree::btree_map<object_id, clean_entry> blockstore_clean_db_t;
-typedef std::map<obj_ver_id, dirty_entry> blockstore_dirty_db_t;
-
-#include "blockstore_init.h"
-
-#include "blockstore_flush.h"
-
 typedef uint32_t pool_id_t;
 typedef uint64_t pool_pg_id_t;

@@ -247,17 +262,20 @@ class blockstore_impl_t
    int throttle_target_parallelism = 1;
    // Minimum difference in microseconds between target and real execution times to throttle the response
    int throttle_threshold_us = 50;
+    // Maximum writes between automatically added fsync operations
+    uint64_t autosync_writes = 128;
    /******* END OF OPTIONS *******/

    struct ring_consumer_t ring_consumer;

    std::map<pool_id_t, pool_shard_settings_t> clean_db_settings;
    std::map<pool_pg_id_t, blockstore_clean_db_t> clean_db_shards;
-    uint8_t *clean_bitmap = NULL;
+    uint8_t *clean_bitmaps = NULL;
    blockstore_dirty_db_t dirty_db;
    std::vector<blockstore_op_t*> submit_queue;
    std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
-    int unsynced_big_write_count = 0;
+    int unsynced_big_write_count = 0, unstable_unsynced = 0;
+    int unsynced_queued_ops = 0;
    allocator *data_alloc = NULL;
    uint8_t *zero_object;

@@ -267,6 +285,10 @@ class blockstore_impl_t
    journal_flusher_t *flusher;
    int big_to_flush = 0;
    int write_iodepth = 0;
+    bool alloc_dyn_data = false;
+
+    // clean data blocks referenced by read operations
+    std::map<uint64_t, used_clean_obj_t> used_clean_objects;

    bool live = false, queue_stall = false;
    ring_loop_t *ringloop;
@@ -310,8 +332,30 @@ class blockstore_impl_t

    // Read
    int dequeue_read(blockstore_op_t *read_op);
-    int fulfill_read(blockstore_op_t *read_op, uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
-        uint32_t item_state, uint64_t item_version, uint64_t item_location, uint64_t journal_sector);
+    void find_holes(std::vector<copy_buffer_t> & read_vec, uint32_t item_start, uint32_t item_end,
+        std::function<int(int, bool, uint32_t, uint32_t)> callback);
+    int fulfill_read(blockstore_op_t *read_op,
+        uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
+        uint32_t item_state, uint64_t item_version, uint64_t item_location,
+        uint64_t journal_sector, uint8_t *csum, int *dyn_data);
+    bool fulfill_clean_read(blockstore_op_t *read_op, uint64_t & fulfilled,
+        uint8_t *clean_entry_bitmap, int *dyn_data,
+        uint32_t item_start, uint32_t item_end, uint64_t clean_loc, uint64_t clean_ver);
+    int fill_partial_checksum_blocks(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled,
+        uint8_t *clean_entry_bitmap, int *dyn_data, bool from_journal, uint8_t *read_buf, uint64_t read_offset, uint64_t read_end);
+    int pad_journal_read(std::vector<copy_buffer_t> & rv, copy_buffer_t & cp,
+        uint64_t dirty_offset, uint64_t dirty_end, uint64_t dirty_loc, uint8_t *csum_ptr, int *dyn_data,
+        uint64_t offset, uint64_t submit_len, uint64_t & blk_begin, uint64_t & blk_end, uint8_t* & blk_buf);
+    bool read_range_fulfilled(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled, uint8_t *read_buf,
+        uint8_t *clean_entry_bitmap, uint32_t item_start, uint32_t item_end);
+    bool read_checksum_block(blockstore_op_t *op, int rv_pos, uint64_t &fulfilled, uint64_t clean_loc);
+    uint8_t* read_clean_meta_block(blockstore_op_t *read_op, uint64_t clean_loc, int rv_pos);
+    bool verify_padded_checksums(uint8_t *clean_entry_bitmap, uint8_t *csum_buf, uint32_t offset,
+        iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
+    bool verify_journal_checksums(uint8_t *csums, uint32_t offset,
+        iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
+    bool verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, uint8_t *dyn_data, bool from_journal,
+        iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
    int fulfill_read_push(blockstore_op_t *op, void *buf, uint64_t offset, uint64_t len,
        uint32_t item_state, uint64_t item_version);
    void handle_read_event(ring_data_t *data, blockstore_op_t *op);
@@ -342,6 +386,7 @@ class blockstore_impl_t
    int continue_rollback(blockstore_op_t *op);
    void mark_rolled_back(const obj_ver_id & ov);
    void erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc);
+    void free_dirty_dyn_data(dirty_entry & e);

    // List
    void process_list(blockstore_op_t *op);
--- a/src/blockstore_init.cpp
+++ b/src/blockstore_init.cpp
@@ -77,13 +77,20 @@ resume_1:
    if (iszero((uint64_t*)metadata_buffer, bs->dsk.meta_block_size / sizeof(uint64_t)))
    {
        {
-            blockstore_meta_header_v1_t *hdr = (blockstore_meta_header_v1_t *)metadata_buffer;
+            blockstore_meta_header_v2_t *hdr = (blockstore_meta_header_v2_t *)metadata_buffer;
            hdr->zero = 0;
            hdr->magic = BLOCKSTORE_META_MAGIC_V1;
-            hdr->version = BLOCKSTORE_META_VERSION_V1;
+            hdr->version = bs->dsk.meta_format;
            hdr->meta_block_size = bs->dsk.meta_block_size;
            hdr->data_block_size = bs->dsk.data_block_size;
            hdr->bitmap_granularity = bs->dsk.bitmap_granularity;
+            if (bs->dsk.meta_format >= BLOCKSTORE_META_FORMAT_V2)
+            {
+                hdr->data_csum_type = bs->dsk.data_csum_type;
+                hdr->csum_block_size = bs->dsk.csum_block_size;
+                hdr->header_csum = 0;
+                hdr->header_csum = crc32c(0, hdr, sizeof(*hdr));
+            }
        }
        if (bs->readonly)
        {
@@ -109,28 +116,62 @@ resume_1:
    }
    else
    {
-        blockstore_meta_header_v1_t *hdr = (blockstore_meta_header_v1_t *)metadata_buffer;
-        if (hdr->zero != 0 ||
-            hdr->magic != BLOCKSTORE_META_MAGIC_V1 ||
-            hdr->version != BLOCKSTORE_META_VERSION_V1)
+        blockstore_meta_header_v2_t *hdr = (blockstore_meta_header_v2_t *)metadata_buffer;
+        if (hdr->zero != 0 || hdr->magic != BLOCKSTORE_META_MAGIC_V1 || hdr->version < BLOCKSTORE_META_FORMAT_V1)
        {
            printf(
-                "Metadata is corrupt or old version.\n"
-                " If this is a new OSD please zero out the metadata area before starting it.\n"
-                " If you need to upgrade from 0.5.x please request it via the issue tracker.\n"
+                "Metadata is corrupt or too old (pre-0.6.x).\n"
+                " If this is a new OSD, please zero out the metadata area before starting it.\n"
+                " If you need to upgrade from 0.5.x, convert metadata with vitastor-disk.\n"
+            );
+            exit(1);
+        }
+        if (hdr->version == BLOCKSTORE_META_FORMAT_V2)
+        {
+            uint32_t csum = hdr->header_csum;
+            hdr->header_csum = 0;
+            if (crc32c(0, hdr, sizeof(*hdr)) != csum)
+            {
+                printf("Metadata header is corrupt (checksum mismatch).\n");
+                exit(1);
+            }
+            hdr->header_csum = csum;
+            bs->dsk.meta_format = BLOCKSTORE_META_FORMAT_V2;
+        }
+        else if (hdr->version == BLOCKSTORE_META_FORMAT_V1)
+        {
+            hdr->data_csum_type = 0;
+            hdr->csum_block_size = 0;
+            hdr->header_csum = 0;
+            // Enable compatibility mode - entries without checksums
+            bs->dsk.clean_entry_size = sizeof(clean_disk_entry) + bs->dsk.clean_entry_bitmap_size*2;
+            bs->dsk.meta_len = (1 + (bs->dsk.block_count - 1 + bs->dsk.meta_block_size / bs->dsk.clean_entry_size)
+                / (bs->dsk.meta_block_size / bs->dsk.clean_entry_size)) * bs->dsk.meta_block_size;
+            bs->dsk.meta_format = BLOCKSTORE_META_FORMAT_V1;
+            printf("Warning: Starting with metadata in the old format without checksums, as stored on disk\n");
+        }
+        else if (hdr->version > BLOCKSTORE_META_FORMAT_V2)
+        {
+            printf(
+                "Metadata format is too new for me (stored version is %lu, max supported %u).\n",
+                hdr->version, BLOCKSTORE_META_FORMAT_V2
            );
            exit(1);
        }
        if (hdr->meta_block_size != bs->dsk.meta_block_size ||
            hdr->data_block_size != bs->dsk.data_block_size ||
-            hdr->bitmap_granularity != bs->dsk.bitmap_granularity)
+            hdr->bitmap_granularity != bs->dsk.bitmap_granularity ||
+            hdr->data_csum_type != bs->dsk.data_csum_type ||
+            hdr->csum_block_size != bs->dsk.csum_block_size)
        {
            printf(
                "Configuration stored in metadata superblock"
-                " (meta_block_size=%u, data_block_size=%u, bitmap_granularity=%u)"
-                " differs from OSD configuration (%lu/%u/%lu).\n",
+                " (meta_block_size=%u, data_block_size=%u, bitmap_granularity=%u, data_csum_type=%u, csum_block_size=%u)"
+                " differs from OSD configuration (%lu/%u/%lu, %u/%u).\n",
                hdr->meta_block_size, hdr->data_block_size, hdr->bitmap_granularity,
-                bs->dsk.meta_block_size, bs->dsk.data_block_size, bs->dsk.bitmap_granularity
+                hdr->data_csum_type, hdr->csum_block_size,
+                bs->dsk.meta_block_size, bs->dsk.data_block_size, bs->dsk.bitmap_granularity,
+                bs->dsk.data_csum_type, bs->dsk.csum_block_size
            );
            exit(1);
        }
@@ -279,12 +320,22 @@ bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_
    for (uint64_t i = 0; i < max_i; i++)
    {
        clean_disk_entry *entry = (clean_disk_entry*)(buf + i*bs->dsk.clean_entry_size);
-        if (!bs->inmemory_meta && bs->dsk.clean_entry_bitmap_size)
-        {
-            memcpy(bs->clean_bitmap + (done_cnt+i)*2*bs->dsk.clean_entry_bitmap_size, &entry->bitmap, 2*bs->dsk.clean_entry_bitmap_size);
-        }
        if (entry->oid.inode > 0)
        {
+            if (bs->dsk.meta_format >= BLOCKSTORE_META_FORMAT_V2)
+            {
+                // Check entry crc32
+                uint32_t *entry_csum = (uint32_t*)((uint8_t*)entry + bs->dsk.clean_entry_size - 4);
+                if (*entry_csum != crc32c(0, entry, bs->dsk.clean_entry_size - 4))
+                {
+                    printf("Metadata entry %lu is corrupt (checksum mismatch), skipping\n", done_cnt+i);
+                    continue;
+                }
+            }
+            if (!bs->inmemory_meta && bs->dsk.clean_entry_bitmap_size)
+            {
+                memcpy(bs->clean_bitmaps + (done_cnt+i) * 2 * bs->dsk.clean_entry_bitmap_size, &entry->bitmap, 2 * bs->dsk.clean_entry_bitmap_size);
+            }
            auto & clean_db = bs->clean_db_shard(entry->oid);
            auto clean_it = clean_db.find(entry->oid);
            if (clean_it == clean_db.end() || clean_it->second.version < entry->version)
@@ -440,7 +491,9 @@ resume_1:
            .size = sizeof(journal_entry_start),
            .reserved = 0,
            .journal_start = bs->journal.block_size,
-            .version = JOURNAL_VERSION,
+            .version = JOURNAL_VERSION_V2,
+            .data_csum_type = bs->dsk.data_csum_type,
+            .csum_block_size = bs->dsk.csum_block_size,
        };
        ((journal_entry_start*)submitted_buf)->crc32 = je_crc32((journal_entry*)submitted_buf);
        if (bs->readonly)
@@ -492,18 +545,37 @@ resume_1:
        if (je_start->magic != JOURNAL_MAGIC ||
            je_start->type != JE_START ||
            je_crc32((journal_entry*)je_start) != je_start->crc32 ||
-            je_start->size != sizeof(journal_entry_start) && je_start->size != JE_START_LEGACY_SIZE)
+            je_start->size != JE_START_V0_SIZE && je_start->size != JE_START_V1_SIZE && je_start->size != JE_START_V2_SIZE)
        {
            // Entry is corrupt
-            fprintf(stderr, "First entry of the journal is corrupt\n");
+            fprintf(stderr, "First entry of the journal is corrupt or unsupported\n");
            exit(1);
        }
-        if (je_start->size == JE_START_LEGACY_SIZE || je_start->version != JOURNAL_VERSION)
+        if (je_start->size == JE_START_V0_SIZE ||
+            (je_start->version != JOURNAL_VERSION_V1 || je_start->size != JE_START_V1_SIZE) &&
+            (je_start->version != JOURNAL_VERSION_V2 || je_start->size != JE_START_V2_SIZE && je_start->size != JE_START_V1_SIZE))
        {
            fprintf(
-                stderr, "The code only supports journal version %d, but it is %lu on disk."
-                    " Please use the previous version to flush the journal before upgrading OSD\n",
-                JOURNAL_VERSION, je_start->size == JE_START_LEGACY_SIZE ? 0 : je_start->version
+                stderr, "The code only supports journal versions 2 and 1, but it is %lu on disk."
+                    " Please use vitastor-disk to rewrite the journal\n",
+                je_start->size == JE_START_V0_SIZE ? 0 : je_start->version
+            );
+            exit(1);
+        }
+        if (je_start->version == JOURNAL_VERSION_V1 ||
+            je_start->version == JOURNAL_VERSION_V2 && je_start->size == JE_START_V1_SIZE)
+        {
+            je_start->data_csum_type = 0;
+            je_start->csum_block_size = 0;
+        }
+        if (je_start->data_csum_type != bs->dsk.data_csum_type ||
+            je_start->csum_block_size != bs->dsk.csum_block_size)
+        {
+            printf(
+                "Configuration stored in journal superblock (data_csum_type=%u, csum_block_size=%u)"
+                " differs from OSD configuration (%u/%u).\n",
+                je_start->data_csum_type, je_start->csum_block_size,
+                bs->dsk.data_csum_type, bs->dsk.csum_block_size
            );
            exit(1);
        }
@@ -660,8 +732,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
    resume:
        while (pos < bs->journal.block_size)
        {
-            journal_entry *je = (journal_entry*)((uint8_t*)buf + proc_pos - done_pos + pos);
-            if (je->magic != JOURNAL_MAGIC || je_crc32(je) != je->crc32 ||
+            auto buf_pos = proc_pos - done_pos + pos;
+            journal_entry *je = (journal_entry*)((uint8_t*)buf + buf_pos);
+            if (je->magic != JOURNAL_MAGIC || buf_pos+je->size > len || je_crc32(je) != je->crc32 ||
                je->type < JE_MIN || je->type > JE_MAX || started && je->crc32_prev != crc32_last)
            {
                if (pos == 0)
@@ -705,11 +778,14 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                    snprintf(err, 1024, "BUG: calculated journal data offset (%08lx) != stored journal data offset (%08lx)", location, je->small_write.data_offset);
                    throw std::runtime_error(err);
                }
-                uint32_t data_crc32 = 0;
+                small_write_data.clear();
                if (location >= done_pos && location+je->small_write.len <= done_pos+len)
                {
                    // data is within this buffer
-                    data_crc32 = crc32c(0, (uint8_t*)buf + location - done_pos, je->small_write.len);
+                    small_write_data.push_back((iovec){
+                        .iov_base = (uint8_t*)buf + location - done_pos,
+                        .iov_len = je->small_write.len,
+                    });
                }
                else
                {
@@ -724,7 +800,10 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                                ? location+je->small_write.len : done[i].pos+done[i].len);
                            uint64_t part_begin = (location < done[i].pos ? done[i].pos : location);
                            covered += part_end - part_begin;
-                            data_crc32 = crc32c(data_crc32, (uint8_t*)done[i].buf + part_begin - done[i].pos, part_end - part_begin);
+                            small_write_data.push_back((iovec){
+                                .iov_base = (uint8_t*)done[i].buf + part_begin - done[i].pos,
+                                .iov_len = part_end - part_begin,
+                            });
                        }
                    }
                    if (covered < je->small_write.len)
@@ -734,12 +813,102 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                        return 2;
                    }
                }
-                if (data_crc32 != je->small_write.crc32_data)
+                bool data_csum_valid = true;
+                if (!bs->dsk.csum_block_size)
+                {
+                    uint32_t data_crc32 = 0;
+                    for (auto & sd: small_write_data)
+                    {
+                        data_crc32 = crc32c(data_crc32, sd.iov_base, sd.iov_len);
+                    }
+                    data_csum_valid = data_crc32 == je->small_write.crc32_data;
+                    if (!data_csum_valid)
+                    {
+                        printf(
+                            "Journal entry data is corrupt for small_write%s oid=%lx:%lx ver=%lu offset=%u len=%u - data crc32 %x != %x\n",
+                            je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
+                            je->small_write.oid.inode, je->small_write.oid.stripe, je->small_write.version,
+                            je->small_write.offset, je->small_write.len,
+                            data_crc32, je->small_write.crc32_data
+                        );
+                    }
+                }
+                else if (je->small_write.len > 0)
+                {
+                    // FIXME: deduplicate with disk_tool_journal.cpp
+                    // like in enqueue_write()
+                    uint32_t start = je->small_write.offset / bs->dsk.csum_block_size;
+                    uint32_t end = (je->small_write.offset+je->small_write.len-1) / bs->dsk.csum_block_size;
+                    uint32_t data_csum_size = (end-start+1) * (bs->dsk.data_csum_type & 0xFF);
+                    uint32_t required_size = sizeof(journal_entry_small_write) + bs->dsk.clean_entry_bitmap_size + data_csum_size;
+                    if (je->size != required_size)
+                    {
+                        printf(
+                            "Journal entry data has invalid size for small_write%s oid=%lx:%lx ver=%lu offset=%u len=%u - should be %u bytes but is %u bytes\n",
+                            je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
+                            je->small_write.oid.inode, je->small_write.oid.stripe, je->small_write.version,
+                            je->small_write.offset, je->small_write.len,
+                            required_size, je->size
+                        );
+                        data_csum_valid = false;
+                    }
+                    else
+                    {
+                        int sd_num = 0;
+                        size_t sd_pos = 0;
+                        uint32_t *block_csums = (uint32_t*)((uint8_t*)je + sizeof(journal_entry_small_write) + bs->dsk.clean_entry_bitmap_size);
+                        for (uint32_t pos = start; pos <= end; pos++, block_csums++)
+                        {
+                            size_t block_left = (pos == start
+                                ? (start == end
+                                    ? je->small_write.len
+                                    : bs->dsk.csum_block_size - je->small_write.offset%bs->dsk.csum_block_size)
+                                : (pos < end
+                                    ? bs->dsk.csum_block_size
+                                    : (je->small_write.offset + je->small_write.len)%bs->dsk.csum_block_size));
+                            if (pos > start && pos == end && block_left == 0)
+                            {
+                                // full last block
+                                block_left = bs->dsk.csum_block_size;
+                            }
+                            uint32_t block_crc32 = 0;
+                            while (block_left > 0)
+                            {
+                                assert(sd_num < small_write_data.size());
+                                if (small_write_data[sd_num].iov_len >= sd_pos+block_left)
+                                {
+                                    block_crc32 = crc32c(block_crc32, (uint8_t*)small_write_data[sd_num].iov_base+sd_pos, block_left);
+                                    sd_pos += block_left;
+                                    break;
+                                }
+                                else
+                                {
+                                    block_crc32 = crc32c(block_crc32, (uint8_t*)small_write_data[sd_num].iov_base+sd_pos, small_write_data[sd_num].iov_len-sd_pos);
+                                    block_left -= (small_write_data[sd_num].iov_len-sd_pos);
+                                    sd_pos = 0;
+                                    sd_num++;
+                                }
+                            }
+                            if (block_crc32 != *block_csums)
+                            {
+                                printf(
+                                    "Journal entry data is corrupt for small_write%s oid=%lx:%lx ver=%lu offset=%u len=%u - block %u crc32 %x != %x\n",
+                                    je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
+                                    je->small_write.oid.inode, je->small_write.oid.stripe, je->small_write.version,
+                                    je->small_write.offset, je->small_write.len,
+                                    pos, block_crc32, *block_csums
+                                );
+                                data_csum_valid = false;
+                                break;
+                            }
+                        }
+                    }
+                }
+                if (!data_csum_valid)
                {
                    // journal entry is corrupt, stop here
                    // interesting thing is that we must clear the corrupt entry if we're not readonly,
                    // because we don't write next entries in the same journal block
-                    printf("Journal entry data is corrupt (data crc32 %x != %x)\n", data_crc32, je->small_write.crc32_data);
                    memset((uint8_t*)buf + proc_pos - done_pos + pos, 0, bs->journal.block_size - pos);
                    bs->journal.next_free = prev_free;
                    init_write_buf = (uint8_t*)buf + proc_pos - done_pos;
@@ -755,11 +924,14 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                        .oid = je->small_write.oid,
                        .version = je->small_write.version,
                    };
-                    void *bmp = NULL;
-                    void *bmp_from = (uint8_t*)je + sizeof(journal_entry_small_write);
-                    if (bs->dsk.clean_entry_bitmap_size <= sizeof(void*))
+                    uint64_t dyn_size = bs->dsk.dirty_dyn_size(je->small_write.offset, je->small_write.len);
+                    void *dyn = NULL;
+                    void *dyn_from = (uint8_t*)je + sizeof(journal_entry_small_write);
+                    if (!bs->alloc_dyn_data)
                    {
-                        memcpy(&bmp, bmp_from, bs->dsk.clean_entry_bitmap_size);
+                        // Bitmap without checksum is only 4 bytes for 128k objects, save it inline
+                        // It can even contain 4 byte bitmap + 4 byte CRC32 for 4 kb writes :)
+                        memcpy(&dyn, dyn_from, dyn_size);
                    }
                    else
                    {
@@ -767,8 +939,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                        // allocations for entry bitmaps. This can only be fixed by using
                        // a patched map with dynamic entry size, but not the btree_map,
                        // because it doesn't keep iterators valid all the time.
-                        bmp = malloc_or_die(bs->dsk.clean_entry_bitmap_size);
-                        memcpy(bmp, bmp_from, bs->dsk.clean_entry_bitmap_size);
+                        dyn = malloc_or_die(dyn_size+sizeof(int));
+                        *((int*)dyn) = 1;
+                        memcpy((uint8_t*)dyn+sizeof(int), dyn_from, dyn_size);
                    }
                    bs->dirty_db.emplace(ov, (dirty_entry){
                        .state = (BS_ST_SMALL_WRITE | BS_ST_SYNCED),
@@ -777,7 +950,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                        .offset = je->small_write.offset,
                        .len = je->small_write.len,
                        .journal_sector = proc_pos,
-                        .bitmap = bmp,
+                        .dyn_data = dyn,
                    });
                    bs->journal.used_sectors[proc_pos]++;
 #ifdef BLOCKSTORE_DEBUG
@@ -836,11 +1009,13 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                        .oid = je->big_write.oid,
                        .version = je->big_write.version,
                    };
-                    void *bmp = NULL;
-                    void *bmp_from = (uint8_t*)je + sizeof(journal_entry_big_write);
-                    if (bs->dsk.clean_entry_bitmap_size <= sizeof(void*))
+                    uint64_t dyn_size = bs->dsk.dirty_dyn_size(je->big_write.offset, je->big_write.len);
+                    void *dyn = NULL;
+                    void *dyn_from = (uint8_t*)je + sizeof(journal_entry_big_write);
+                    if (!bs->alloc_dyn_data)
                    {
-                        memcpy(&bmp, bmp_from, bs->dsk.clean_entry_bitmap_size);
+                        // Bitmap without checksum is only 4 bytes for 128k objects, save it inline
+                        memcpy(&dyn, dyn_from, dyn_size);
                    }
                    else
                    {
@@ -848,8 +1023,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                        // allocations for entry bitmaps. This can only be fixed by using
                        // a patched map with dynamic entry size, but not the btree_map,
                        // because it doesn't keep iterators valid all the time.
-                        bmp = malloc_or_die(bs->dsk.clean_entry_bitmap_size);
-                        memcpy(bmp, bmp_from, bs->dsk.clean_entry_bitmap_size);
+                        dyn = malloc_or_die(dyn_size+sizeof(int));
+                        *((int*)dyn) = 1;
+                        memcpy((uint8_t*)dyn+sizeof(int), dyn_from, dyn_size);
                    }
                    auto dirty_it = bs->dirty_db.emplace(ov, (dirty_entry){
                        .state = (BS_ST_BIG_WRITE | BS_ST_SYNCED),
@@ -858,7 +1034,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                        .offset = je->big_write.offset,
                        .len = je->big_write.len,
                        .journal_sector = proc_pos,
-                        .bitmap = bmp,
+                        .dyn_data = dyn,
                    }).first;
                    if (bs->data_alloc->get(je->big_write.location >> bs->dsk.block_order))
                    {
--- a/src/blockstore_init.h
+++ b/src/blockstore_init.h
@@ -50,6 +50,7 @@ class blockstore_init_journal
    uint64_t next_free;
    std::vector<bs_init_journal_done> done;
    std::vector<obj_ver_id> double_allocs;
+    std::vector<iovec> small_write_data;
    uint64_t journal_pos = 0;
    uint64_t continue_pos = 0;
    void *init_write_buf = NULL;
--- a/src/blockstore_journal.cpp
+++ b/src/blockstore_journal.cpp
@@ -17,6 +17,7 @@ blockstore_journal_check_t::blockstore_journal_check_t(blockstore_impl_t *bs)
 // Check if we can write <required> entries of <size> bytes and <data_after> data bytes after them to the journal
 int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries_required, int size, int data_after)
 {
+    uint64_t prev_next = next_sector;
    int required = entries_required;
    while (1)
    {
@@ -35,11 +36,19 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
            }
            required -= fits;
            next_in_pos += fits * size;
-            sectors_to_write++;
+            if (next_sector != prev_next || !sectors_to_write)
+            {
+                // Except the previous call to this function
+                sectors_to_write++;
+            }
        }
        else if (bs->journal.sector_info[next_sector].dirty)
        {
-            sectors_to_write++;
+            if (next_sector != prev_next || !sectors_to_write)
+            {
+                // Except the previous call to this function
+                sectors_to_write++;
+            }
        }
        if (required <= 0)
        {
@@ -135,7 +144,10 @@ journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type,
        journal.sector_info[journal.cur_sector].written = false;
        journal.sector_info[journal.cur_sector].offset = journal.next_free;
        journal.in_sector_pos = 0;
-        journal.next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
+        auto next_next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
+        // double check that next_free doesn't cross used_start from the left
+        assert(journal.next_free >= journal.used_start || next_next_free < journal.used_start);
+        journal.next_free = next_next_free;
        memset(journal.inmemory
            ? (uint8_t*)journal.buffer + journal.sector_info[journal.cur_sector].offset
            : (uint8_t*)journal.sector_buf + journal.block_size*journal.cur_sector, 0, journal.block_size);
@@ -189,6 +201,7 @@ void blockstore_impl_t::prepare_journal_sector_write(int cur_sector, blockstore_
    priv->pending_ops++;
    if (!priv->min_flushed_journal_sector)
        priv->min_flushed_journal_sector = 1+cur_sector;
+    assert(priv->min_flushed_journal_sector <= journal.sector_count);
    priv->max_flushed_journal_sector = 1+cur_sector;
 }

@@ -289,3 +302,31 @@ void journal_t::dump_diagnostics()
        journal_used_it == used_sectors.end() ? 0 : journal_used_it->second
    );
 }
+
+static uint64_t zero_page[4096];
+
+// CRC32C continued from <prev_crc> over <left_pad> zero bytes, <len> bytes of <buf> and <right_pad> zero bytes
+uint32_t crc32c_pad(uint32_t prev_crc, const void *buf, size_t len, size_t left_pad, size_t right_pad)
+{
+    uint32_t r = prev_crc; // running checksum, returned at the end
+    for (; left_pad >= 4096; left_pad -= 4096) // zeroes are fed from zero_page in chunks of at most 4096 bytes
+    {
+        r = crc32c(r, zero_page, 4096);
+    }
+    if (left_pad > 0)
+        r = crc32c(r, zero_page, left_pad);
+    r = crc32c(r, buf, len);
+    for (; right_pad >= 4096; right_pad -= 4096)
+    {
+        r = crc32c(r, zero_page, 4096);
+    }
+    // The tail must be guarded by right_pad: testing left_pad here dropped it whenever left_pad % 4096 == 0
+    if (right_pad > 0)
+        r = crc32c(r, zero_page, right_pad);
+    return r;
+}
+
+uint32_t crc32c_nopad(uint32_t prev_crc, const void *buf, size_t len, size_t left_pad, size_t right_pad)
+{
+    return crc32c(0, buf, len); // checksums <buf> only: both paddings and prev_crc are unused (NOTE(review): confirm that restarting from 0 is intended)
+}
--- a/src/blockstore_journal.h
+++ b/src/blockstore_journal.h
@@ -8,16 +8,11 @@

 #define MIN_JOURNAL_SIZE 4*1024*1024
 #define JOURNAL_MAGIC 0x4A33
-#define JOURNAL_VERSION 1
+#define JOURNAL_VERSION_V1 1
+#define JOURNAL_VERSION_V2 2
 #define JOURNAL_BUFFER_SIZE 4*1024*1024
 #define JOURNAL_ENTRY_HEADER_SIZE 16

-// We reserve some extra space for future stabilize requests during writes
-// FIXME: This value should be dynamic i.e. Blockstore ideally shouldn't allow
-// writing more than can be stabilized afterwards
-#define JOURNAL_STABILIZE_RESERVATION 65536
-#define JOURNAL_INSTANT_RESERVATION 131072
-
 // Journal entries
 // Journal entries are linked to each other by their crc32 value
 // The journal is almost a blockchain, because object versions constantly increase
@@ -32,7 +27,7 @@
 #define JE_BIG_WRITE_INSTANT   0x08
 #define JE_MAX         0x08

-// crc32c comes first to ease calculation and is equal to crc32()
+// crc32c comes first to ease calculation
 struct __attribute__((__packed__)) journal_entry_start
 {
    uint32_t crc32;
@@ -42,8 +37,12 @@ struct __attribute__((__packed__)) journal_entry_start
    uint32_t reserved;
    uint64_t journal_start;
    uint64_t version;
+    uint32_t data_csum_type;
+    uint32_t csum_block_size;
 };
-#define JE_START_LEGACY_SIZE 24
+#define JE_START_V0_SIZE 24
+#define JE_START_V1_SIZE 32
+#define JE_START_V2_SIZE 40

 struct __attribute__((__packed__)) journal_entry_small_write
 {
@@ -59,10 +58,12 @@ struct __attribute__((__packed__)) journal_entry_small_write
    // small_write entries contain <len> bytes of data which is stored in next sectors
    // data_offset is its offset within journal
    uint64_t data_offset;
-    uint32_t crc32_data;
+    uint32_t crc32_data; // zero when data_csum_type != 0
    // small_write and big_write entries are followed by the "external" bitmap
    // its size is dynamic and included in journal entry's <size> field
    uint8_t bitmap[];
+    // and then data checksums if data_csum_type != 0
+    // uint32_t data_crc32c[];
 };

 struct __attribute__((__packed__)) journal_entry_big_write
@@ -80,6 +81,8 @@ struct __attribute__((__packed__)) journal_entry_big_write
    // small_write and big_write entries are followed by the "external" bitmap
    // its size is dynamic and included in journal entry's <size> field
    uint8_t bitmap[];
+    // and then data checksums if data_csum_type != 0
+    // uint32_t data_crc32c[];
 };

 struct __attribute__((__packed__)) journal_entry_stable
@@ -218,3 +221,6 @@ struct blockstore_journal_check_t
 };

 journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type, uint32_t size);
+
+uint32_t crc32c_pad(uint32_t prev_crc, const void *buf, size_t len, size_t left_pad, size_t right_pad);
+uint32_t crc32c_nopad(uint32_t prev_crc, const void *buf, size_t len, size_t left_pad, size_t right_pad);
--- a/src/blockstore_open.cpp
+++ b/src/blockstore_open.cpp
@@ -19,6 +19,10 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
    throttle_target_mbs = strtoull(config["throttle_target_mbs"].c_str(), NULL, 10);
    throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10);
    throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10);
+    if (config.find("autosync_writes") != config.end())
+    {
+        autosync_writes = strtoull(config["autosync_writes"].c_str(), NULL, 10);
+    }
    if (!max_flusher_count)
    {
        max_flusher_count = 256;
@@ -135,19 +139,24 @@ void blockstore_impl_t::calc_lengths()
    {
        metadata_buffer = memalign(MEM_ALIGNMENT, dsk.meta_len);
        if (!metadata_buffer)
-            throw std::runtime_error("Failed to allocate memory for the metadata");
+            throw std::runtime_error("Failed to allocate memory for the metadata ("+std::to_string(dsk.meta_len/1024/1024)+" MB)");
    }
-    else if (dsk.clean_entry_bitmap_size)
+    else if (dsk.clean_entry_bitmap_size || dsk.data_csum_type)
    {
-        clean_bitmap = (uint8_t*)malloc(dsk.block_count * 2*dsk.clean_entry_bitmap_size);
-        if (!clean_bitmap)
-            throw std::runtime_error("Failed to allocate memory for the metadata sparse write bitmap");
+        clean_bitmaps = (uint8_t*)malloc(dsk.block_count * 2 * dsk.clean_entry_bitmap_size);
+        if (!clean_bitmaps)
+        {
+            throw std::runtime_error(
+                "Failed to allocate memory for the metadata sparse write bitmap ("+
+                std::to_string(dsk.block_count * 2 * dsk.clean_entry_bitmap_size / 1024 / 1024)+" MB)"
+            );
+        }
    }
    if (journal.inmemory)
    {
        journal.buffer = memalign(MEM_ALIGNMENT, journal.len);
        if (!journal.buffer)
-            throw std::runtime_error("Failed to allocate memory for journal");
+            throw std::runtime_error("Failed to allocate memory for journal ("+std::to_string(journal.len/1024/1024)+" MB)");
    }
    else
    {
--- a/src/blockstore_read.cpp
+++ b/src/blockstore_read.cpp
@@ -1,6 +1,7 @@
 // Copyright (c) Vitaliy Filippov, 2019+
 // License: VNPL-1.1 (see README.md for details)

+#include <limits.h>
 #include "blockstore_impl.h"

 int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_t offset, uint64_t len,
@@ -8,12 +9,7 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_
 {
    if (!len)
    {
-        // Zero-length version - skip
-        return 1;
-    }
-    else if (IS_IN_FLIGHT(item_state))
-    {
-        // Write not finished yet - skip
+        // Zero-length read
        return 1;
    }
    else if (IS_DELETE(item_state))
@@ -22,6 +18,7 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_
        memset(buf, 0, len);
        return 1;
    }
+    assert(!IS_IN_FLIGHT(item_state));
    if (journal.inmemory && IS_JOURNAL(item_state))
    {
        memcpy(buf, (uint8_t*)journal.buffer + offset, len);
@@ -40,59 +37,115 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_
    return 1;
 }

-// FIXME I've seen a bug here so I want some tests
-int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op, uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
-    uint32_t item_state, uint64_t item_version, uint64_t item_location, uint64_t journal_sector)
+void blockstore_impl_t::find_holes(std::vector<copy_buffer_t> & read_vec, // Walks item_start..item_end over read_vec and reports every sub-range to callback
+    uint32_t item_start, uint32_t item_end,
+    std::function<int(int, bool, uint32_t, uint32_t)> callback) // callback(index, is_allocated, start, end); its return value is added to the index
 {
-    uint32_t cur_start = item_start;
-    if (cur_start < read_op->offset + read_op->len && item_end > read_op->offset)
+    auto cur_start = item_start;
+    int i = 0;
+    while (cur_start < item_end)
    {
-        cur_start = cur_start < read_op->offset ? read_op->offset : cur_start;
-        item_end = item_end > read_op->offset + read_op->len ? read_op->offset + read_op->len : item_end;
-        auto it = PRIV(read_op)->read_vec.begin();
-        while (1)
+        // COPY_BUF_CSUM_FILL items are fake items appended at the end of the vector, their offsets aren't in order
+        if (i >= read_vec.size() || read_vec[i].copy_flags & COPY_BUF_CSUM_FILL || read_vec[i].offset >= item_end)
        {
-            for (; it != PRIV(read_op)->read_vec.end(); it++)
-            {
-                if (it->offset >= cur_start)
-                {
-                    break;
-                }
-                else if (it->offset + it->len > cur_start)
-                {
-                    cur_start = it->offset + it->len;
-                    if (cur_start >= item_end)
-                    {
-                        goto endwhile;
-                    }
-                }
-            }
-            if (it == PRIV(read_op)->read_vec.end() || it->offset > cur_start)
-            {
-                fulfill_read_t el = {
-                    .offset = cur_start,
-                    .len = it == PRIV(read_op)->read_vec.end() || it->offset >= item_end ? item_end-cur_start : it->offset-cur_start,
-                    .journal_sector = journal_sector,
-                };
-                it = PRIV(read_op)->read_vec.insert(it, el);
-                if (!fulfill_read_push(read_op,
-                    (uint8_t*)read_op->buf + el.offset - read_op->offset,
-                    item_location + el.offset - item_start,
-                    el.len, item_state, item_version))
-                {
-                    return 0;
-                }
-                fulfilled += el.len;
-            }
-            cur_start = it->offset + it->len;
-            if (it == PRIV(read_op)->read_vec.end() || cur_start >= item_end)
-            {
-                break;
-            }
+            // Hole (at end): cur_start .. item_end - no more ordered items overlap the range
+            i += callback(i, false, cur_start, item_end);
+            break;
        }
+        else if (read_vec[i].offset > cur_start)
+        {
+            // Hole: cur_start .. min(read_vec[i].offset, item_end)
+            auto cur_end = read_vec[i].offset > item_end ? item_end : read_vec[i].offset;
+            i += callback(i, false, cur_start, cur_end);
+            cur_start = cur_end;
+        }
+        else if (read_vec[i].offset + read_vec[i].len > cur_start)
+        {
+            // Allocated: cur_start .. min(read_vec[i].offset + read_vec[i].len, item_end)
+            auto cur_end = read_vec[i].offset + read_vec[i].len;
+            cur_end = cur_end > item_end ? item_end : cur_end;
+            i += callback(i, true, cur_start, cur_end);
+            cur_start = cur_end;
+            i++; // step past the item that was just reported as allocated
+        }
+        else
+            i++; // item ends at or before cur_start - skip it
    }
-endwhile:
-    return 1;
+}
+
+int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op, // Fills the holes of read_op's read_vec that overlap item_start..item_end
+    uint64_t &fulfilled, uint32_t item_start, uint32_t item_end, // FIXME: Rename item_* to dirty_*
+    uint32_t item_state, uint64_t item_version, uint64_t item_location,
+    uint64_t journal_sector, uint8_t *csum, int *dyn_data) // Returns 0 if read_checksum_block() or fulfill_read_push() fails, 1 otherwise
+{
+    int r = 1;
+    if (item_start < read_op->offset + read_op->len && item_end > read_op->offset)
+    {
+        auto & rv = PRIV(read_op)->read_vec;
+        auto rd_start = item_start < read_op->offset ? read_op->offset : item_start; // clip the item to the requested range
+        auto rd_end = item_end > read_op->offset + read_op->len ? read_op->offset + read_op->len : item_end;
+        find_holes(rv, rd_start, rd_end, [&](int pos, bool alloc, uint32_t start, uint32_t end)
+        {
+            if (!r || alloc)
+                return 0; // nothing to do after a failure or for ranges that are already covered
+            if (!journal.inmemory && dsk.csum_block_size > dsk.bitmap_granularity && IS_JOURNAL(item_state) && !IS_DELETE(item_state))
+            { // on-disk journal data with checksum blocks larger than the bitmap granularity: read whole checksum blocks
+                uint32_t blk_begin = (start/dsk.csum_block_size) * dsk.csum_block_size;
+                blk_begin = blk_begin < item_start ? item_start : blk_begin;
+                uint32_t blk_end = ((end-1) / dsk.csum_block_size + 1) * dsk.csum_block_size;
+                blk_end = blk_end > item_end ? item_end : blk_end;
+                rv.push_back((copy_buffer_t){
+                    .copy_flags = COPY_BUF_JOURNAL|COPY_BUF_CSUM_FILL,
+                    .offset = blk_begin,
+                    .len = blk_end-blk_begin,
+                    .csum_buf = (csum + (blk_begin/dsk.csum_block_size -
+                        item_start/dsk.csum_block_size) * (dsk.data_csum_type & 0xFF)),
+                    .dyn_data = dyn_data,
+                });
+                if (dyn_data)
+                {
+                    (*dyn_data)++; // one more read_vec item references dyn_data
+                }
+                // Submit the journal checksum block read (rv_pos 1 = the item that was just pushed to the end)
+                if (!read_checksum_block(read_op, 1, fulfilled, item_location - item_start))
+                {
+                    r = 0;
+                }
+                return 0;
+            }
+            copy_buffer_t el = {
+                .copy_flags = (IS_JOURNAL(item_state) ? COPY_BUF_JOURNAL : COPY_BUF_DATA),
+                .offset = start,
+                .len = end-start,
+                .disk_offset = item_location + start - item_start,
+                .journal_sector = (IS_JOURNAL(item_state) ? journal_sector : 0),
+                .csum_buf = !csum ? NULL : (csum + (start - item_start) / dsk.csum_block_size * (dsk.data_csum_type & 0xFF)),
+                .dyn_data = dyn_data,
+            };
+            if (dyn_data)
+            {
+                (*dyn_data)++; // one more read_vec item references dyn_data
+            }
+            if (IS_BIG_WRITE(item_state))
+            {
+                // If we don't track it then we may IN THEORY read another object's data:
+                // submit read -> remove the object -> flush remove -> overwrite with another object -> finish read
+                // Very improbable, but possible
+                PRIV(read_op)->clean_block_used = 1;
+            }
+            rv.insert(rv.begin() + pos, el);
+            fulfilled += el.len;
+            if (!fulfill_read_push(read_op,
+                (uint8_t*)read_op->buf + el.offset - read_op->offset,
+                item_location + el.offset - item_start,
+                el.len, item_state, item_version))
+            {
+                r = 0;
+            }
+            return 1; // exactly one element was inserted at pos
+        });
+    }
+    return r;
 }

 uint8_t* blockstore_impl_t::get_clean_entry_bitmap(uint64_t block_loc, int offset)
@@ -106,10 +159,225 @@ uint8_t* blockstore_impl_t::get_clean_entry_bitmap(uint64_t block_loc, int offse
        clean_entry_bitmap = ((uint8_t*)metadata_buffer + sector + pos*dsk.clean_entry_size + sizeof(clean_disk_entry) + offset);
    }
    else
-        clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*2*dsk.clean_entry_bitmap_size + offset);
+        clean_entry_bitmap = (uint8_t*)(clean_bitmaps + meta_loc*2*dsk.clean_entry_bitmap_size + offset);
    return clean_entry_bitmap;
 }

+int blockstore_impl_t::fill_partial_checksum_blocks(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled, // Returns the number of COPY_BUF_CSUM_FILL ranges appended to rv
+    uint8_t *clean_entry_bitmap, int *dyn_data, bool from_journal, uint8_t *read_buf, uint64_t read_offset, uint64_t read_end)
+{
+    if (read_end == read_offset)
+        return 0;
+    int required = 0;
+    read_buf -= read_offset; // rebase: read_range_fulfilled() expects read_buf == op->buf - op->offset
+    uint32_t last_block = (read_end-1)/dsk.csum_block_size;
+    uint32_t start_block = read_offset/dsk.csum_block_size;
+    uint32_t end_block = 0;
+    while (start_block <= last_block)
+    {
+        if (read_range_fulfilled(rv, fulfilled, read_buf, clean_entry_bitmap,
+            start_block*dsk.csum_block_size < read_offset ? read_offset : start_block*dsk.csum_block_size,
+            (start_block+1)*dsk.csum_block_size > read_end ? read_end : (start_block+1)*dsk.csum_block_size))
+        {
+            // read_range_fulfilled() also adds zero-filled areas
+            start_block++;
+        }
+        else
+        {
+            // Find a sequence of consecutive checksum blocks that all have to be read
+            end_block = start_block;
+            while ((end_block+1)*dsk.csum_block_size < read_end &&
+                !read_range_fulfilled(rv, fulfilled, read_buf, clean_entry_bitmap,
+                    (end_block+1)*dsk.csum_block_size < read_offset ? read_offset : (end_block+1)*dsk.csum_block_size,
+                    (end_block+2)*dsk.csum_block_size > read_end ? read_end : (end_block+2)*dsk.csum_block_size))
+            {
+                end_block++;
+            }
+            end_block++; // make end_block exclusive
+            // OK, mark this range as required
+            rv.push_back((copy_buffer_t){
+                .copy_flags = COPY_BUF_CSUM_FILL | (from_journal ? COPY_BUF_JOURNALED_BIG : 0),
+                .offset = start_block*dsk.csum_block_size,
+                .len = (end_block-start_block)*dsk.csum_block_size,
+                // save clean_entry_bitmap if we're reading clean data from the journal
+                .csum_buf = from_journal ? clean_entry_bitmap : NULL,
+                .dyn_data = dyn_data,
+            });
+            if (dyn_data)
+            {
+                (*dyn_data)++; // one more read_vec item references dyn_data
+            }
+            start_block = end_block;
+            required++;
+        }
+    }
+    return required;
+}
+
+// read_buf should be == op->buf - op->offset. Zero-fills unallocated holes; returns false if some part still has to be read
+bool blockstore_impl_t::read_range_fulfilled(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled, uint8_t *read_buf,
+    uint8_t *clean_entry_bitmap, uint32_t item_start, uint32_t item_end)
+{
+    bool all_done = true;
+    find_holes(rv, item_start, item_end, [&](int pos, bool alloc, uint32_t cur_start, uint32_t cur_end)
+    {
+        if (alloc)
+            return 0;
+        int diff = 0; // number of items inserted into rv for this hole
+        uint32_t bmp_start = cur_start/dsk.bitmap_granularity;
+        uint32_t bmp_end = cur_end/dsk.bitmap_granularity;
+        uint32_t bmp_pos = bmp_start;
+        while (bmp_pos < bmp_end)
+        {
+            while (bmp_pos < bmp_end && !(clean_entry_bitmap[bmp_pos >> 3] & (1 << (bmp_pos & 0x7))))
+                bmp_pos++;
+            if (bmp_pos > bmp_start)
+            {
+                // zero fill
+                copy_buffer_t el = {
+                    .copy_flags = COPY_BUF_ZERO,
+                    .offset = bmp_start*dsk.bitmap_granularity,
+                    .len = (bmp_pos-bmp_start)*dsk.bitmap_granularity,
+                };
+                rv.insert(rv.begin() + pos + diff, el); // after the runs already inserted for this hole, to keep rv sorted by offset
+                if (read_buf)
+                    memset(read_buf + el.offset, 0, el.len);
+                fulfilled += el.len;
+                diff++;
+            }
+            bmp_start = bmp_pos;
+            while (bmp_pos < bmp_end && (clean_entry_bitmap[bmp_pos >> 3] & (1 << (bmp_pos & 0x7))))
+                bmp_pos++;
+            if (bmp_pos > bmp_start)
+            {
+                // something is to be read
+                all_done = false;
+            }
+            bmp_start = bmp_pos;
+        }
+        return diff;
+    });
+    return all_done;
+}
+
+bool blockstore_impl_t::read_checksum_block(blockstore_op_t *op, int rv_pos, uint64_t &fulfilled, uint64_t clean_loc) // Submits readv(s) for the whole range of the rv_pos-th read_vec item counted from the end
+{
+    auto & rv = PRIV(op)->read_vec;
+    auto *vi = &rv[rv.size()-rv_pos];
+    uint32_t item_start = vi->offset, item_end = vi->offset+vi->len;
+    uint32_t fill_size = 0;
+    int n_iov = 0;
+    find_holes(rv, item_start, item_end, [&](int pos, bool alloc, uint32_t cur_start, uint32_t cur_end) // pass 1: only count bytes and iovecs
+    {
+        if (alloc)
+        {
+            fill_size += cur_end-cur_start;
+            n_iov++;
+        }
+        else
+        {
+            if (cur_start < op->offset)
+            {
+                fill_size += op->offset-cur_start;
+                n_iov++;
+                cur_start = op->offset;
+            }
+            if (cur_end > op->offset+op->len)
+            {
+                fill_size += cur_end-(op->offset+op->len);
+                n_iov++;
+                cur_end = op->offset+op->len;
+            }
+            if (cur_end > cur_start)
+            {
+                n_iov++;
+            }
+        }
+        return 0;
+    });
+    void *buf = memalign_or_die(MEM_ALIGNMENT, fill_size + n_iov*sizeof(struct iovec)); // padding bytes first, then the iovec array
+    iovec *iov = (struct iovec*)((uint8_t*)buf+fill_size);
+    n_iov = 0;
+    fill_size = 0;
+    find_holes(rv, item_start, item_end, [&](int pos, bool alloc, uint32_t cur_start, uint32_t cur_end) // pass 2: build iovecs in the same order as pass 1
+    {
+        int res = 0;
+        if (alloc)
+        {
+            iov[n_iov++] = (struct iovec){ (uint8_t*)buf+fill_size, cur_end-cur_start }; // already fulfilled: read into the padding area
+            fill_size += cur_end-cur_start;
+        }
+        else
+        {
+            if (cur_start < op->offset)
+            {
+                iov[n_iov++] = (struct iovec){ (uint8_t*)buf+fill_size, op->offset-cur_start };
+                fill_size += op->offset-cur_start;
+                cur_start = op->offset;
+            }
+            auto lim_end = cur_end > op->offset+op->len ? op->offset+op->len : cur_end;
+            if (lim_end > cur_start)
+            {
+                iov[n_iov++] = (struct iovec){ (uint8_t*)op->buf+cur_start-op->offset, lim_end-cur_start }; // requested hole: read directly into the op buffer
+                rv.insert(rv.begin() + pos, (copy_buffer_t){
+                    .copy_flags = COPY_BUF_DATA,
+                    .offset = cur_start,
+                    .len = lim_end-cur_start,
+                });
+                fulfilled += lim_end-cur_start;
+                res++;
+            }
+            if (cur_end > op->offset+op->len)
+            {
+                iov[n_iov++] = (struct iovec){ (uint8_t*)buf+fill_size, cur_end - (op->offset+op->len) };
+                fill_size += cur_end - (op->offset+op->len);
+                cur_end = op->offset+op->len;
+            }
+        }
+        return res;
+    });
+    vi = &rv[rv.size()-rv_pos]; // re-fetch: the inserts above may have moved the vector's storage
+    // Stash buf in the read_vec item as well; len packs the iovec count (high 32 bits) and fill_size (low 32 bits)
+    // FIXME: this encoding is a hack, something cleaner should be invented
+    *vi = (copy_buffer_t){
+        .copy_flags = vi->copy_flags,
+        .offset = vi->offset,
+        .len = ((uint64_t)n_iov << 32) | fill_size,
+        .disk_offset = clean_loc + item_start,
+        .buf = (uint8_t*)buf,
+        .csum_buf = vi->csum_buf,
+        .dyn_data = vi->dyn_data,
+    };
+    int submit_fd = (vi->copy_flags & COPY_BUF_JOURNAL ? dsk.journal_fd : dsk.data_fd);
+    uint64_t submit_offset = (vi->copy_flags & COPY_BUF_JOURNAL ? journal.offset : dsk.data_offset);
+    uint32_t d_pos = 0;
+    for (int n_pos = 0; n_pos < n_iov; n_pos += IOV_MAX) // a single readv accepts at most IOV_MAX iovecs
+    {
+        int n_cur = n_iov-n_pos < IOV_MAX ? n_iov-n_pos : IOV_MAX;
+        BS_SUBMIT_GET_SQE(sqe, data);
+        PRIV(op)->pending_ops++;
+        my_uring_prep_readv(sqe, submit_fd, iov + n_pos, n_cur, submit_offset + clean_loc + item_start + d_pos);
+        data->callback = [this, op](ring_data_t *data) { handle_read_event(data, op); };
+        if (n_pos > 0 || n_pos + IOV_MAX < n_iov)
+        {
+            uint32_t d_len = 0;
+            for (int i = 0; i < n_cur; i++) // n_cur, not IOV_MAX: the last batch may be shorter than IOV_MAX
+                d_len += iov[n_pos+i].iov_len;
+            data->iov.iov_len = d_len;
+            d_pos += d_len;
+        }
+        else
+            data->iov.iov_len = item_end-item_start;
+    }
+    if (!(vi->copy_flags & COPY_BUF_JOURNAL))
+    {
+        // Reads running parallel to flushes of the same clean block may read
+        // a mixture of old and new data. So we don't verify checksums for such blocks.
+        PRIV(op)->clean_block_used = 1;
+    }
+    return true;
+}
+
 int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
 {
    auto & clean_db = clean_db_shard(read_op->oid);
@@ -131,6 +399,8 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
    }
    uint64_t fulfilled = 0;
    PRIV(read_op)->pending_ops = 0;
+    PRIV(read_op)->clean_block_used = 0;
+    auto & rv = PRIV(read_op)->read_vec;
    uint64_t result_version = 0;
    if (dirty_found)
    {
@@ -148,23 +418,36 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
                    FINISH_OP(read_op);
                    return 2;
                }
+                int *dyn_data = (int*)(dsk.csum_block_size > 0 && alloc_dyn_data ? dirty.dyn_data : NULL);
+                uint8_t *bmp_ptr = (alloc_dyn_data
+                    ? (uint8_t*)dirty.dyn_data + sizeof(int) : (uint8_t*)&dirty.dyn_data);
                if (!result_version)
                {
                    result_version = dirty_it->first.version;
                    if (read_op->bitmap)
                    {
-                        void *bmp_ptr = (dsk.clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap);
                        memcpy(read_op->bitmap, bmp_ptr, dsk.clean_entry_bitmap_size);
                    }
                }
                // If inmemory_journal is false, journal trim will have to wait until the read is completed
-                if (!fulfill_read(read_op, fulfilled, dirty.offset, dirty.offset + dirty.len,
-                    dirty.state, dirty_it->first.version, dirty.location + (IS_JOURNAL(dirty.state) ? 0 : dirty.offset),
-                    (IS_JOURNAL(dirty.state) ? dirty.journal_sector+1 : 0)))
+                if (!IS_JOURNAL(dirty.state))
                {
-                    // need to wait. undo added requests, don't dequeue op
-                    PRIV(read_op)->read_vec.clear();
-                    return 0;
+                    // Read from data disk, possibly checking checksums
+                    if (!fulfill_clean_read(read_op, fulfilled, bmp_ptr, dyn_data,
+                        dirty.offset, dirty.offset+dirty.len, dirty.location, dirty_it->first.version))
+                    {
+                        goto undo_read;
+                    }
+                }
+                else
+                {
+                    // Copy from memory or read from journal, possibly checking checksums
+                    if (!fulfill_read(read_op, fulfilled, dirty.offset, dirty.offset + dirty.len,
+                        dirty.state, dirty_it->first.version, dirty.location, dirty.journal_sector+1,
+                        journal.inmemory ? NULL : bmp_ptr+dsk.clean_entry_bitmap_size, dyn_data))
+                    {
+                        goto undo_read;
+                    }
                }
            }
            if (fulfilled == read_op->len || dirty_it == dirty_db.begin())
@@ -187,50 +470,10 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
        }
        if (fulfilled < read_op->len)
        {
-            if (!dsk.clean_entry_bitmap_size)
+            if (!fulfill_clean_read(read_op, fulfilled, NULL, NULL, 0, dsk.data_block_size,
+                clean_it->second.location, clean_it->second.version))
            {
-                if (!fulfill_read(read_op, fulfilled, 0, dsk.data_block_size,
-                    (BS_ST_BIG_WRITE | BS_ST_STABLE), 0, clean_it->second.location, 0))
-                {
-                    // need to wait. undo added requests, don't dequeue op
-                    PRIV(read_op)->read_vec.clear();
-                    return 0;
-                }
-            }
-            else
-            {
-                uint8_t *clean_entry_bitmap = get_clean_entry_bitmap(clean_it->second.location, 0);
-                uint64_t bmp_start = 0, bmp_end = 0, bmp_size = dsk.data_block_size/dsk.bitmap_granularity;
-                while (bmp_start < bmp_size)
-                {
-                    while (!(clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7))) && bmp_end < bmp_size)
-                    {
-                        bmp_end++;
-                    }
-                    if (bmp_end > bmp_start)
-                    {
-                        // fill with zeroes
-                        assert(fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
-                            bmp_end * dsk.bitmap_granularity, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0));
-                    }
-                    bmp_start = bmp_end;
-                    while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size)
-                    {
-                        bmp_end++;
-                    }
-                    if (bmp_end > bmp_start)
-                    {
-                        if (!fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
-                            bmp_end * dsk.bitmap_granularity, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0,
-                            clean_it->second.location + bmp_start * dsk.bitmap_granularity, 0))
-                        {
-                            // need to wait. undo added requests, don't dequeue op
-                            PRIV(read_op)->read_vec.clear();
-                            return 0;
-                        }
-                        bmp_start = bmp_end;
-                    }
-                }
+                goto undo_read;
            }
        }
    }
@@ -242,11 +485,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
        FINISH_OP(read_op);
        return 2;
    }
-    if (fulfilled < read_op->len)
-    {
-        assert(fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0));
-        assert(fulfilled == read_op->len);
-    }
+    assert(fulfilled == read_op->len);
    read_op->version = result_version;
    if (!PRIV(read_op)->pending_ops)
    {
@@ -271,6 +510,309 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
    }
    read_op->retval = 0;
    return 2;
+undo_read:
+    // need to wait. undo added requests, don't dequeue op
+    if (dsk.csum_block_size > dsk.bitmap_granularity)
+    {
+        for (auto & vec: rv)
+        {
+            if ((vec.copy_flags & COPY_BUF_CSUM_FILL) && vec.buf)
+            {
+                free(vec.buf);
+                vec.buf = NULL;
+            }
+            if (vec.dyn_data && --(*vec.dyn_data) == 0) // refcount
+            {
+                free(vec.dyn_data);
+                vec.dyn_data = NULL;
+            }
+        }
+    }
+    rv.clear();
+    return 0;
+}
+
+int blockstore_impl_t::pad_journal_read(std::vector<copy_buffer_t> & rv, copy_buffer_t & cp, // Returns 2 if cp reuses the previous padded buffer, 1 if a new padded read was added to rv, 0 otherwise
+    // FIXME Passing dirty_entry& would be nicer
+    uint64_t dirty_offset, uint64_t dirty_end, uint64_t dirty_loc, uint8_t *csum_ptr, int *dyn_data,
+    uint64_t offset, uint64_t submit_len, uint64_t & blk_begin, uint64_t & blk_end, uint8_t* & blk_buf)
+{
+    if (offset % dsk.csum_block_size || submit_len % dsk.csum_block_size)
+    {
+        if (offset < blk_end)
+        {
+            // Already being read as a part of the previous checksum block series
+            cp.buf = blk_buf + offset - blk_begin;
+            cp.copy_flags |= COPY_BUF_COALESCED;
+            if (offset+submit_len > blk_end)
+                cp.len = blk_end-offset; // only the part that lies inside the previous buffer is served by cp
+            return 2;
+        }
+        else
+        {
+            // We don't use fill_partial_checksum_blocks for journal because journal writes never have holes (internal bitmap)
+            blk_begin = (offset/dsk.csum_block_size) * dsk.csum_block_size;
+            blk_begin = blk_begin < dirty_offset ? dirty_offset : blk_begin;
+            blk_end = ((offset+submit_len-1)/dsk.csum_block_size + 1) * dsk.csum_block_size;
+            blk_end = blk_end > dirty_end ? dirty_end : blk_end;
+            if (blk_begin < offset || blk_end > offset+submit_len)
+            {
+                blk_buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, blk_end-blk_begin);
+                cp.buf = blk_buf + offset - blk_begin;
+                cp.copy_flags |= COPY_BUF_COALESCED;
+                rv.push_back((copy_buffer_t){
+                    .copy_flags = COPY_BUF_JOURNAL|COPY_BUF_CSUM_FILL,
+                    .offset = blk_begin,
+                    .len = blk_end-blk_begin,
+                    .disk_offset = dirty_loc + blk_begin - dirty_offset,
+                    .buf = blk_buf,
+                    .csum_buf = (csum_ptr + (blk_begin/dsk.csum_block_size -
+                        dirty_offset/dsk.csum_block_size) * (dsk.data_csum_type & 0xFF)),
+                    .dyn_data = dyn_data,
+                });
+                if (dyn_data)
+                {
+                    (*dyn_data)++; // one more read_vec item references dyn_data
+                }
+                return 1;
+            }
+        }
+    }
+    return 0;
+}
+
+bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t & fulfilled, // Fulfills the rest of read_op from a clean (big-write) block; returns false if a read could not be issued
+    uint8_t *clean_entry_bitmap, int *dyn_data, uint32_t item_start, uint32_t item_end, uint64_t clean_loc, uint64_t clean_ver)
+{
+    bool from_journal = clean_entry_bitmap != NULL;
+    if (!clean_entry_bitmap)
+    {
+        // NULL clean_entry_bitmap means we're reading from data, not from the journal,
+        // and the bitmap location is obvious
+        clean_entry_bitmap = get_clean_entry_bitmap(clean_loc, 0);
+    }
+    if (dsk.csum_block_size > dsk.bitmap_granularity)
+    {
+        auto & rv = PRIV(read_op)->read_vec;
+        int req = fill_partial_checksum_blocks(rv, fulfilled, clean_entry_bitmap, dyn_data, from_journal,
+            (uint8_t*)read_op->buf, read_op->offset, read_op->offset+read_op->len);
+        if (!inmemory_meta && !from_journal && req > 0)
+        {
+            // Read checksums from disk
+            uint8_t *csum_buf = read_clean_meta_block(read_op, clean_loc, rv.size()-req);
+            for (int i = req; i > 0; i--)
+            {
+                rv[rv.size()-i].csum_buf = csum_buf;
+            }
+        }
+        for (int i = req; i > 0; i--) // the last req items of rv are the ranges appended by fill_partial_checksum_blocks()
+        {
+            if (!read_checksum_block(read_op, i, fulfilled, clean_loc))
+            {
+                return false;
+            }
+        }
+        PRIV(read_op)->clean_block_used = req > 0;
+    }
+    else if (from_journal)
+    {
+        // Don't scan bitmap - journal writes don't have holes (internal bitmap)!
+        uint8_t *csum = !dsk.csum_block_size ? 0 : (clean_entry_bitmap + dsk.clean_entry_bitmap_size +
+            item_start/dsk.csum_block_size*(dsk.data_csum_type & 0xFF));
+        if (!fulfill_read(read_op, fulfilled, item_start, item_end,
+            (BS_ST_BIG_WRITE | BS_ST_STABLE), 0, clean_loc + item_start, 0, csum, dyn_data))
+        {
+            return false;
+        }
+        if (item_start > 0 && fulfilled < read_op->len)
+        {
+            // fill with zeroes
+            assert(fulfill_read(read_op, fulfilled, 0, item_start, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0, NULL, NULL));
+        }
+        if (item_end < dsk.data_block_size && fulfilled < read_op->len)
+        {
+            // fill with zeroes
+            assert(fulfill_read(read_op, fulfilled, item_end, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0, NULL, NULL));
+        }
+    }
+    else
+    {
+        bool csum_done = !dsk.csum_block_size || inmemory_meta;
+        uint8_t *csum_buf = clean_entry_bitmap;
+        uint64_t bmp_start = 0, bmp_end = 0, bmp_size = dsk.data_block_size/dsk.bitmap_granularity;
+        while (bmp_start < bmp_size)
+        {
+            while (bmp_end < bmp_size && !(clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)))) // bound check first: never index the bitmap at bmp_size
+            {
+                bmp_end++;
+            }
+            if (bmp_end > bmp_start)
+            {
+                // fill with zeroes
+                assert(fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
+                    bmp_end * dsk.bitmap_granularity, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0, NULL, NULL));
+            }
+            bmp_start = bmp_end;
+            while (bmp_end < bmp_size && (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)))) // bound check first: never index the bitmap at bmp_size
+            {
+                bmp_end++;
+            }
+            if (bmp_end > bmp_start)
+            {
+                if (!csum_done)
+                {
+                    // Read checksums from disk
+                    csum_buf = read_clean_meta_block(read_op, clean_loc, PRIV(read_op)->read_vec.size());
+                    csum_done = true;
+                }
+                uint8_t *csum = !dsk.csum_block_size ? 0 : (csum_buf + 2*dsk.clean_entry_bitmap_size + bmp_start*(dsk.data_csum_type & 0xFF));
+                if (!fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
+                    bmp_end * dsk.bitmap_granularity, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0,
+                    clean_loc + bmp_start * dsk.bitmap_granularity, 0, csum, dyn_data))
+                {
+                    return false;
+                }
+                bmp_start = bmp_end;
+            }
+        }
+    }
+    // Increment reference counter if clean data is being read from the disk
+    if (PRIV(read_op)->clean_block_used)
+    {
+        auto & uo = used_clean_objects[clean_loc];
+        uo.refs++;
+        if (dsk.csum_block_size && flusher->is_mutated(clean_loc))
+            uo.was_changed = true;
+        PRIV(read_op)->clean_block_used = clean_loc; // the flag is replaced by the location of the referenced block
+    }
+    return true;
+}
+
+uint8_t* blockstore_impl_t::read_clean_meta_block(blockstore_op_t *op, uint64_t clean_loc, int rv_pos) // Submits a read of the metadata block that holds the clean entry of clean_loc
+{
+    auto & rv = PRIV(op)->read_vec;
+    auto sector = ((clean_loc >> dsk.block_order) / (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.meta_block_size; // byte offset of the metadata block holding the entry
+    auto pos = ((clean_loc >> dsk.block_order) % (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.clean_entry_size; // byte offset of the entry inside that metadata block
+    uint8_t *buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.meta_block_size);
+    rv.insert(rv.begin()+rv_pos, (copy_buffer_t){
+        .copy_flags = COPY_BUF_META_BLOCK|COPY_BUF_CSUM_FILL,
+        .offset = pos,
+        .buf = buf,
+    });
+    BS_SUBMIT_GET_SQE(sqe, data); // NOTE(review): presumably returns early when no SQE is available - confirm against the macro
+    data->iov = (struct iovec){ buf, dsk.meta_block_size };
+    PRIV(op)->pending_ops++;
+    my_uring_prep_readv(sqe, dsk.meta_fd, &data->iov, 1, dsk.meta_offset + dsk.meta_block_size + sector);
+    data->callback = [this, op](ring_data_t *data) { handle_read_event(data, op); };
+    // return pointer to the data that follows the clean_disk_entry header (bitmaps first, then checksums)
+    return buf + pos + sizeof(clean_disk_entry);
+}
+
+bool blockstore_impl_t::verify_padded_checksums(uint8_t *clean_entry_bitmap, uint8_t *csum_buf, uint32_t offset, // Checks the checksums of whole checksum blocks whose data is scattered over iov[0..n_iov)
+    iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb) // bad_block_cb(block offset, calculated checksum, stored checksum)
+{
+    assert(!(offset % dsk.csum_block_size));
+    uint32_t *csums = (uint32_t*)csum_buf;
+    uint32_t block_csum = 0;
+    uint32_t block_done = 0; // bytes of the current checksum block processed so far
+    uint32_t block_num = clean_entry_bitmap ? offset/dsk.csum_block_size : 0; // NOTE(review): allows for a NULL bitmap here, but the loop below dereferences it unconditionally
+    uint32_t bmp_pos = offset/dsk.bitmap_granularity;
+    for (int i = 0; i < n_iov; i++)
+    {
+        uint32_t pos = 0;
+        while (pos < iov[i].iov_len)
+        {
+            uint32_t start = pos;
+            uint8_t bit = (clean_entry_bitmap[bmp_pos >> 3] >> (bmp_pos & 0x7)) & 1;
+            while (pos < iov[i].iov_len && ((clean_entry_bitmap[bmp_pos >> 3] >> (bmp_pos & 0x7)) & 1) == bit) // extend the run while the bitmap bit stays the same
+            {
+                pos += dsk.bitmap_granularity;
+                bmp_pos++;
+            }
+            uint32_t len = pos-start;
+            auto buf = (uint8_t*)iov[i].iov_base+start;
+            while (block_done+len >= dsk.csum_block_size) // the run completes at least one checksum block
+            {
+                auto cur_len = dsk.csum_block_size-block_done;
+                block_csum = crc32c_pad(block_csum, buf, bit ? cur_len : 0, bit ? 0 : cur_len, 0); // NOTE(review): presumably the 4th argument is a count of zero bytes - confirm
+                if (block_csum != csums[block_num])
+                {
+                    if (bad_block_cb)
+                        bad_block_cb(block_num*dsk.csum_block_size, block_csum, csums[block_num]);
+                    else
+                        return false; // without a callback, stop at the first mismatch
+                }
+                block_num++;
+                buf += cur_len;
+                len -= cur_len;
+                block_done = block_csum = 0;
+            }
+            if (len > 0)
+            {
+                block_csum = crc32c_pad(block_csum, buf, bit ? len : 0, bit ? 0 : len, 0);
+                block_done += len;
+            }
+        }
+    }
+    assert(!block_done); // the iovecs must cover whole checksum blocks
+    return true; // also reached when mismatches were only reported through bad_block_cb
+}
+
+// Verify plain (unpadded) crc32c checksums of the buffers in <iov> against the uint32 array at <csums>.
+bool blockstore_impl_t::verify_journal_checksums(uint8_t *csums, uint32_t offset,
+    iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb)
+{
+    const uint32_t *expected = (uint32_t*)csums;
+    uint32_t crc = 0, blk = 0;
+    // The first checksum block may start mid-block: only the part from <offset> onwards is hashed
+    uint32_t filled = offset % dsk.csum_block_size;
+    // Compare the current block; a mismatch is reported via the callback, or stops verification without one
+    auto check = [&]() -> bool
+    {
+        if (crc == expected[blk]) return true;
+        if (!bad_block_cb) return false;
+        bad_block_cb(blk*dsk.csum_block_size, crc, expected[blk]);
+        return true;
+    };
+    for (int v = 0; v < n_iov; v++)
+    {
+        uint8_t *ptr = (uint8_t*)iov[v].iov_base;
+        uint32_t left = iov[v].iov_len;
+        while (filled+left >= dsk.csum_block_size)
+        {
+            // Enough bytes to complete the current checksum block
+            auto part = dsk.csum_block_size-filled;
+            crc = crc32c(crc, ptr, part);
+            if (!check())
+                return false;
+            blk++;
+            ptr += part;
+            left -= part;
+            filled = crc = 0;
+        }
+        if (left > 0)
+        {
+            crc = crc32c(crc, ptr, left);
+            filled += left;
+        }
+    }
+    // A trailing partial block is compared as well
+    return !(filled > 0) || check();
+}
+
+// Run verify_padded_checksums with the layout chosen by <from_journal>: checksums follow one bitmap, else two.
+bool blockstore_impl_t::verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, uint8_t *dyn_data, bool from_journal,
+    iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb)
+{
+    const uint32_t in_block = clean_loc % dsk.data_block_size;
+    uint8_t *bmp = dyn_data;
+    if (!from_journal && !bmp)
+    {
+        // No dyn_data passed: look the bitmap up by the block-aligned location (requires in-memory metadata)
+        assert(inmemory_meta);
+        bmp = get_clean_entry_bitmap((clean_loc >> dsk.block_order) << dsk.block_order, 0);
+    }
+    return verify_padded_checksums(bmp, bmp + (from_journal ? 1 : 2)*dsk.clean_entry_bitmap_size, in_block, iov, n_iov, bad_block_cb);
 }

 void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op)
@@ -284,6 +826,139 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
    }
    if (PRIV(op)->pending_ops == 0)
    {
+        if (dsk.csum_block_size)
+        {
+            // verify checksums if required
+            auto & rv = PRIV(op)->read_vec;
+            void *meta_block = NULL;
+            if (dsk.csum_block_size > dsk.bitmap_granularity)
+            {
+                for (int i = rv.size()-1; i >= 0 && (rv[i].copy_flags & COPY_BUF_CSUM_FILL); i--)
+                {
+                    if (rv[i].copy_flags & COPY_BUF_META_BLOCK)
+                    {
+                        // Metadata read. Skip
+                        assert(!meta_block);
+                        meta_block = rv[i].buf;
+                        rv[i].buf = NULL;
+                        continue;
+                    }
+                    struct iovec *iov = (struct iovec*)((uint8_t*)rv[i].buf + (rv[i].len & 0xFFFFFFFF));
+                    int n_iov = rv[i].len >> 32;
+                    bool ok = true;
+                    if (rv[i].copy_flags & COPY_BUF_JOURNAL)
+                    {
+                        // SMALL_WRITE from journal
+                        verify_journal_checksums(
+                            rv[i].csum_buf, rv[i].offset, iov, n_iov,
+                            [&](uint32_t bad_block, uint32_t calc_csum, uint32_t stored_csum)
+                            {
+                                ok = false;
+                                printf(
+                                    "Checksum mismatch in object %lx:%lx v%lu in journal at 0x%lx, checksum block #%u: got %08x, expected %08x\n",
+                                    op->oid.inode, op->oid.stripe, op->version,
+                                    rv[i].disk_offset, bad_block / dsk.csum_block_size, calc_csum, stored_csum
+                                );
+                            }
+                        );
+                    }
+                    else
+                    {
+                        // BIG_WRITE from journal or clean data
+                        // Do not verify checksums if the data location is/was mutated by flushers
+                        auto & uo = used_clean_objects.at((rv[i].disk_offset >> dsk.block_order) << dsk.block_order);
+                        if (!uo.was_changed)
+                        {
+                            verify_clean_padded_checksums(
+                                op, rv[i].disk_offset, rv[i].csum_buf, (rv[i].copy_flags & COPY_BUF_JOURNALED_BIG), iov, n_iov,
+                                [&](uint32_t bad_block, uint32_t calc_csum, uint32_t stored_csum)
+                                {
+                                    ok = false;
+                                    printf(
+                                        "Checksum mismatch in object %lx:%lx v%lu in %s data at 0x%lx, checksum block #%u: got %08x, expected %08x\n",
+                                        op->oid.inode, op->oid.stripe, op->version,
+                                        (rv[i].copy_flags & COPY_BUF_JOURNALED_BIG ? "redirect-write" : "clean"),
+                                        rv[i].disk_offset, bad_block / dsk.csum_block_size, calc_csum, stored_csum
+                                    );
+                                }
+                            );
+                        }
+                    }
+                    if (!ok)
+                    {
+                        op->retval = -EDOM;
+                    }
+                    free(rv[i].buf);
+                    rv[i].buf = NULL;
+                    if (rv[i].dyn_data && --(*rv[i].dyn_data) == 0) // refcount
+                    {
+                        free(rv[i].dyn_data);
+                        rv[i].dyn_data = NULL;
+                    }
+                }
+            }
+            else
+            {
+                for (auto & vec: rv)
+                {
+                    if (vec.copy_flags & COPY_BUF_META_BLOCK)
+                    {
+                        // Metadata read. Skip
+                        assert(!meta_block);
+                        meta_block = vec.buf;
+                        vec.buf = NULL;
+                        continue;
+                    }
+                    if (vec.csum_buf)
+                    {
+                        uint32_t *csum = (uint32_t*)vec.csum_buf;
+                        for (size_t p = 0; p < vec.len; p += dsk.csum_block_size, csum++)
+                        {
+                            if (crc32c(0, (uint8_t*)op->buf + vec.offset - op->offset + p, dsk.csum_block_size) != *csum)
+                            {
+                                // checksum error
+                                printf(
+                                    "Checksum mismatch in object %lx:%lx v%lu in %s area at offset 0x%lx+0x%lx: %08x vs %08x\n",
+                                    op->oid.inode, op->oid.stripe, op->version,
+                                    (vec.copy_flags & COPY_BUF_JOURNAL) ? "journal" : "data", vec.disk_offset, p,
+                                    crc32c(0, (uint8_t*)op->buf + vec.offset - op->offset + p, dsk.csum_block_size), *csum
+                                );
+                                op->retval = -EDOM;
+                                break;
+                            }
+                        }
+                    }
+                    if (vec.dyn_data && --(*vec.dyn_data) == 0) // refcount
+                    {
+                        free(vec.dyn_data);
+                        vec.dyn_data = NULL;
+                    }
+                }
+            }
+            if (meta_block)
+            {
+                // Free after checking
+                free(meta_block);
+                meta_block = NULL;
+            }
+        }
+        if (PRIV(op)->clean_block_used)
+        {
+            // Release clean data block
+            auto uo_it = used_clean_objects.find(PRIV(op)->clean_block_used);
+            if (uo_it != used_clean_objects.end())
+            {
+                uo_it->second.refs--;
+                if (uo_it->second.refs <= 0)
+                {
+                    if (uo_it->second.was_freed)
+                    {
+                        data_alloc->set(PRIV(op)->clean_block_used, false);
+                    }
+                    used_clean_objects.erase(uo_it);
+                }
+            }
+        }
        if (!journal.inmemory)
        {
            // Release journal sector usage
@@ -324,8 +999,9 @@ int blockstore_impl_t::read_bitmap(object_id oid, uint64_t target_version, void
                    *result_version = dirty_it->first.version;
                if (bitmap)
                {
-                    void *bmp_ptr = (dsk.clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap);
-                    memcpy(bitmap, bmp_ptr, dsk.clean_entry_bitmap_size);
+                    void *dyn_ptr = (alloc_dyn_data
+                        ? (uint8_t*)dirty_it->second.dyn_data + sizeof(int) : (uint8_t*)&dirty_it->second.dyn_data);
+                    memcpy(bitmap, dyn_ptr, dsk.clean_entry_bitmap_size);
                }
                return 0;
            }
--- a/src/blockstore_rollback.cpp
+++ b/src/blockstore_rollback.cpp
@@ -227,11 +227,7 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
            journal.used_sectors.erase(dirty_it->second.journal_sector);
            flusher->mark_trim_possible();
        }
-        if (dsk.clean_entry_bitmap_size > sizeof(void*))
-        {
-            free(dirty_it->second.bitmap);
-            dirty_it->second.bitmap = NULL;
-        }
+        free_dirty_dyn_data(dirty_it->second);
        if (dirty_it == dirty_start)
        {
            break;
@@ -240,3 +236,18 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
    }
    dirty_db.erase(dirty_start, dirty_end);
 }
+
+// Drop this dirty entry's reference to its dyn_data (bitmap + checksums) and clear the field.
+void blockstore_impl_t::free_dirty_dyn_data(dirty_entry & e)
+{
+    if (!e.dyn_data)
+        return;
+    if (alloc_dyn_data)
+    {
+        // The allocation starts with an int refcount; whoever drops it to zero frees the buffer
+        int *refs = (int*)e.dyn_data;
+        if (--(*refs) == 0)
+            free(e.dyn_data);
+    }
+    e.dyn_data = NULL;
+}
--- a/src/blockstore_sync.cpp
+++ b/src/blockstore_sync.cpp
@@ -27,8 +27,6 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
        unsynced_big_write_count -= unsynced_big_writes.size();
        PRIV(op)->sync_big_writes.swap(unsynced_big_writes);
        PRIV(op)->sync_small_writes.swap(unsynced_small_writes);
-        PRIV(op)->sync_small_checked = 0;
-        PRIV(op)->sync_big_checked = 0;
        unsynced_big_writes.clear();
        unsynced_small_writes.clear();
        if (PRIV(op)->sync_big_writes.size() > 0)
@@ -78,8 +76,25 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
        // 2nd step: Data device is synced, prepare & write journal entries
        // Check space in the journal and journal memory buffers
        blockstore_journal_check_t space_check(this);
-        if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(),
-            sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, JOURNAL_STABILIZE_RESERVATION))
+        if (dsk.csum_block_size)
+        {
+            // More complex check because all journal entries have different lengths
+            int left = PRIV(op)->sync_big_writes.size();
+            for (auto & sbw: PRIV(op)->sync_big_writes)
+            {
+                left--;
+                auto & dirty_entry = dirty_db.at(sbw);
+                uint64_t dyn_size = dsk.dirty_dyn_size(dirty_entry.offset, dirty_entry.len);
+                if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
+                    (unstable_writes.size()+unstable_unsynced)*journal.block_size))
+                {
+                    return 0;
+                }
+            }
+        }
+        else if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(),
+            sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size,
+            (unstable_writes.size()+unstable_unsynced)*journal.block_size))
        {
            return 0;
        }
@@ -90,16 +105,17 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
        int s = 0;
        while (it != PRIV(op)->sync_big_writes.end())
        {
-            if (!journal.entry_fits(sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size) &&
+            auto & dirty_entry = dirty_db.at(*it);
+            uint64_t dyn_size = dsk.dirty_dyn_size(dirty_entry.offset, dirty_entry.len);
+            if (!journal.entry_fits(sizeof(journal_entry_big_write) + dyn_size) &&
                journal.sector_info[journal.cur_sector].dirty)
            {
                prepare_journal_sector_write(journal.cur_sector, op);
                s++;
            }
-            auto & dirty_entry = dirty_db.at(*it);
            journal_entry_big_write *je = (journal_entry_big_write*)prefill_single_journal_entry(
                journal, (dirty_entry.state & BS_ST_INSTANT) ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
-                sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size
+                sizeof(journal_entry_big_write) + dyn_size
            );
            dirty_entry.journal_sector = journal.sector_info[journal.cur_sector].offset;
            journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
@@ -115,8 +131,8 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
            je->offset = dirty_entry.offset;
            je->len = dirty_entry.len;
            je->location = dirty_entry.location;
-            memcpy((void*)(je+1), (dsk.clean_entry_bitmap_size > sizeof(void*)
-                ? dirty_entry.bitmap : &dirty_entry.bitmap), dsk.clean_entry_bitmap_size);
+            memcpy((void*)(je+1), (alloc_dyn_data
+                ? (uint8_t*)dirty_entry.dyn_data+sizeof(int) : (uint8_t*)&dirty_entry.dyn_data), dyn_size);
            je->crc32 = je_crc32((journal_entry*)je);
            journal.crc32_last = je->crc32;
            it++;
@@ -169,6 +185,11 @@ void blockstore_impl_t::ack_sync(blockstore_op_t *op)
        {
            mark_stable(dirty_it->first);
        }
+        else
+        {
+            unstable_unsynced--;
+            assert(unstable_unsynced >= 0);
+        }
        dirty_it++;
        while (dirty_it != dirty_db.end() && dirty_it->first.oid == it->oid)
        {
@@ -199,6 +220,11 @@ void blockstore_impl_t::ack_sync(blockstore_op_t *op)
            {
                mark_stable(*it);
            }
+            else
+            {
+                unstable_unsynced--;
+                assert(unstable_unsynced >= 0);
+            }
        }
    }
    op->retval = 0;
--- a/src/blockstore_write.cpp
+++ b/src/blockstore_write.cpp
@@ -8,12 +8,21 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
    // Check or assign version number
    bool found = false, deleted = false, unsynced = false, is_del = (op->opcode == BS_OP_DELETE);
    bool wait_big = false, wait_del = false;
-    void *bmp = NULL;
-    uint64_t version = 1;
-    if (!is_del && dsk.clean_entry_bitmap_size > sizeof(void*))
+    void *dyn = NULL;
+    if (is_del)
    {
-        bmp = calloc_or_die(1, dsk.clean_entry_bitmap_size);
+        op->len = 0;
    }
+    size_t dyn_size = dsk.dirty_dyn_size(op->offset, op->len);
+    if (!is_del && alloc_dyn_data)
+    {
+        // FIXME: Working with `dyn_data` has to be refactored somehow but I first have to decide how :)
+        // +sizeof(int) = refcount
+        dyn = calloc_or_die(1, dyn_size+sizeof(int));
+        *((int*)dyn) = 1;
+    }
+    uint8_t *dyn_ptr = (alloc_dyn_data ? (uint8_t*)dyn+sizeof(int) : (uint8_t*)&dyn);
+    uint64_t version = 1;
    if (dirty_db.size() > 0)
    {
        auto dirty_it = dirty_db.upper_bound((obj_ver_id){
@@ -33,10 +42,9 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
                : ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_BIG);
            if (!is_del && !deleted)
            {
-                if (dsk.clean_entry_bitmap_size > sizeof(void*))
-                    memcpy(bmp, dirty_it->second.bitmap, dsk.clean_entry_bitmap_size);
-                else
-                    bmp = dirty_it->second.bitmap;
+                void *dyn_from = alloc_dyn_data
+                    ? (uint8_t*)dirty_it->second.dyn_data + sizeof(int) : (uint8_t*)&dirty_it->second.dyn_data;
+                memcpy(dyn_ptr, dyn_from, dsk.clean_entry_bitmap_size);
            }
        }
    }
@@ -50,7 +58,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
            if (!is_del)
            {
                void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, dsk.clean_entry_bitmap_size);
-                memcpy((dsk.clean_entry_bitmap_size > sizeof(void*) ? bmp : &bmp), bmp_ptr, dsk.clean_entry_bitmap_size);
+                memcpy(dyn_ptr, bmp_ptr, dsk.clean_entry_bitmap_size);
            }
        }
        else
@@ -112,15 +120,16 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
            printf("Write %lx:%lx v%lu requested, but we already have v%lu\n", op->oid.inode, op->oid.stripe, op->version, version);
 #endif
            op->retval = -EEXIST;
-            if (!is_del && dsk.clean_entry_bitmap_size > sizeof(void*))
+            if (!is_del && alloc_dyn_data)
            {
-                free(bmp);
+                free(dyn);
            }
            return false;
        }
    }
-    if (wait_big && !is_del && !deleted && op->len < dsk.data_block_size &&
-        immediate_commit != IMMEDIATE_ALL)
+    bool imm = (op->len < dsk.data_block_size ? (immediate_commit != IMMEDIATE_NONE) : (immediate_commit == IMMEDIATE_ALL));
+    if (wait_big && !is_del && !deleted && op->len < dsk.data_block_size && !imm ||
+        !imm && unsynced_queued_ops >= autosync_writes)
    {
        // Issue an additional sync so that the previous big write can reach the journal
        blockstore_op_t *sync_op = new blockstore_op_t;
@@ -131,6 +140,8 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
        };
        enqueue_op(sync_op);
    }
+    else if (!imm)
+        unsynced_queued_ops++;
 #ifdef BLOCKSTORE_DEBUG
    if (is_del)
        printf("Delete %lx:%lx v%lu\n", op->oid.inode, op->oid.stripe, op->version);
@@ -158,26 +169,50 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
        if (op->bitmap)
        {
            // Only allow to overwrite part of the object bitmap respective to the write's offset/len
-            uint8_t *bmp_ptr = (uint8_t*)(dsk.clean_entry_bitmap_size > sizeof(void*) ? bmp : &bmp);
            uint32_t bit = op->offset/dsk.bitmap_granularity;
            uint32_t bits_left = op->len/dsk.bitmap_granularity;
            while (!(bit % 8) && bits_left >= 8)
            {
                // Copy bytes
-                bmp_ptr[bit/8] = ((uint8_t*)op->bitmap)[bit/8];
+                dyn_ptr[bit/8] = ((uint8_t*)op->bitmap)[bit/8];
                bit += 8;
                bits_left -= 8;
            }
            while (bits_left > 0)
            {
                // Copy bits
-                bmp_ptr[bit/8] = (bmp_ptr[bit/8] & ~(1 << (bit%8)))
+                dyn_ptr[bit/8] = (dyn_ptr[bit/8] & ~(1 << (bit%8)))
                    | (((uint8_t*)op->bitmap)[bit/8] & (1 << bit%8));
                bit++;
                bits_left--;
            }
        }
    }
+    // Calculate checksums: one crc32c per checksum block touched by [offset, offset+len), stored after the bitmap
+    // FIXME: Allow to receive checksums from outside?
+    if (!is_del && dsk.data_csum_type && op->len > 0)
+    {
+        uint32_t *data_csums = (uint32_t*)(dyn_ptr + dsk.clean_entry_bitmap_size);
+        uint32_t start = op->offset / dsk.csum_block_size;
+        uint32_t end = (op->offset+op->len-1) / dsk.csum_block_size;
+        auto fn = state & BS_ST_BIG_WRITE ? crc32c_pad : crc32c_nopad;
+        if (start == end) // single block: right padding runs to the END of that block, hence (end+1); end*size would underflow
+            data_csums[0] = fn(0, op->buf, op->len, op->offset - start*dsk.csum_block_size, (end+1)*dsk.csum_block_size - (op->offset+op->len));
+        else
+        {
+            // First block
+            data_csums[0] = fn(0, op->buf, dsk.csum_block_size*(start+1)-op->offset, op->offset - start*dsk.csum_block_size, 0);
+            // Intermediate blocks
+            for (uint32_t i = start+1; i < end; i++)
+                data_csums[i-start] = crc32c(0, (uint8_t*)op->buf + dsk.csum_block_size*i-op->offset, dsk.csum_block_size);
+            // Last block
+            data_csums[end-start] = fn(
+                0, (uint8_t*)op->buf + end*dsk.csum_block_size - op->offset,
+                op->offset+op->len - end*dsk.csum_block_size,
+                0, (end+1)*dsk.csum_block_size - (op->offset+op->len)
+            );
+        }
+    }
    dirty_db.emplace((obj_ver_id){
        .oid = op->oid,
        .version = op->version,
@@ -188,7 +223,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
        .offset = is_del ? 0 : op->offset,
        .len = is_del ? 0 : op->len,
        .journal_sector = 0,
-        .bitmap = bmp,
+        .dyn_data = dyn,
    });
    return true;
 }
@@ -197,8 +232,7 @@ void blockstore_impl_t::cancel_all_writes(blockstore_op_t *op, blockstore_dirty_
 {
    while (dirty_it != dirty_db.end() && dirty_it->first.oid == op->oid)
    {
-        if (dsk.clean_entry_bitmap_size > sizeof(void*))
-            free(dirty_it->second.bitmap);
+        free_dirty_dyn_data(dirty_it->second);
        dirty_db.erase(dirty_it++);
    }
    bool found = false;
@@ -255,13 +289,18 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        printf("Restoring %lx:%lx version: v%lu -> v%lu\n", op->oid.inode, op->oid.stripe, op->version, PRIV(op)->real_version);
 #endif
        auto prev_it = dirty_it;
-        prev_it--;
-        if (prev_it->first.oid == op->oid && prev_it->first.version >= PRIV(op)->real_version)
+        if (prev_it != dirty_db.begin())
        {
-            // Original version is still invalid
-            // All subsequent writes to the same object must be canceled too
-            cancel_all_writes(op, dirty_it, -EEXIST);
-            return 2;
+            prev_it--;
+            if (prev_it->first.oid == op->oid && prev_it->first.version >= PRIV(op)->real_version)
+            {
+                // Original version is still invalid
+                // All subsequent writes to the same object must be canceled too
+                printf("Tried to write %lx:%lx v%lu after delete (old version v%lu), but already have v%lu\n",
+                    op->oid.inode, op->oid.stripe, PRIV(op)->real_version, op->version, prev_it->first.version);
+                cancel_all_writes(op, dirty_it, -EEXIST);
+                return 2;
+            }
        }
        op->version = PRIV(op)->real_version;
        PRIV(op)->real_version = 0;
@@ -280,8 +319,8 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
    {
        blockstore_journal_check_t space_check(this);
        if (!space_check.check_available(op, unsynced_big_write_count + 1,
-            sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size,
-            (dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION))
+            sizeof(journal_entry_big_write) + dsk.clean_dyn_size,
+            (unstable_writes.size()+unstable_unsynced)*journal.block_size))
        {
            return 0;
        }
@@ -347,7 +386,10 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
            sqe, dsk.data_fd, PRIV(op)->iov_zerofill, vcnt, dsk.data_offset + (loc << dsk.block_order) + op->offset - stripe_offset
        );
        PRIV(op)->pending_ops = 1;
-        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
+        if (!(dirty_it->second.state & BS_ST_INSTANT))
+        {
+            unstable_unsynced++;
+        }
        if (immediate_commit != IMMEDIATE_ALL)
        {
            // Increase the counter, but don't save into unsynced_writes yet (can't sync until the write is finished)
@@ -363,13 +405,14 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
    {
        // Small (journaled) write
        // First check if the journal has sufficient space
+        uint64_t dyn_size = dsk.dirty_dyn_size(op->offset, op->len);
        blockstore_journal_check_t space_check(this);
        if (unsynced_big_write_count &&
            !space_check.check_available(op, unsynced_big_write_count,
-                sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, 0)
+                sizeof(journal_entry_big_write) + dsk.clean_dyn_size, 0)
            || !space_check.check_available(op, 1,
-                sizeof(journal_entry_small_write) + dsk.clean_entry_bitmap_size,
-                op->len + ((dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION)))
+                sizeof(journal_entry_small_write) + dyn_size,
+                op->len + (unstable_writes.size()+unstable_unsynced)*journal.block_size))
        {
            return 0;
        }
@@ -377,27 +420,21 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        BS_SUBMIT_CHECK_SQES(
            // Write current journal sector only if it's dirty and full, or in the immediate_commit mode
            (immediate_commit != IMMEDIATE_NONE ||
-                !journal.entry_fits(sizeof(journal_entry_small_write) + dsk.clean_entry_bitmap_size) ? 1 : 0) +
+                !journal.entry_fits(sizeof(journal_entry_small_write) + dyn_size) ? 1 : 0) +
            (op->len > 0 ? 1 : 0)
        );
        write_iodepth++;
        // Got SQEs. Prepare previous journal sector write if required
        auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
-        if (immediate_commit == IMMEDIATE_NONE)
+        if (immediate_commit == IMMEDIATE_NONE &&
+            !journal.entry_fits(sizeof(journal_entry_small_write) + dyn_size))
        {
-            if (!journal.entry_fits(sizeof(journal_entry_small_write) + dsk.clean_entry_bitmap_size))
-            {
-                prepare_journal_sector_write(journal.cur_sector, op);
-            }
-            else
-            {
-                PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
-            }
+            prepare_journal_sector_write(journal.cur_sector, op);
        }
        // Then pre-fill journal entry
        journal_entry_small_write *je = (journal_entry_small_write*)prefill_single_journal_entry(
            journal, op->opcode == BS_OP_WRITE_STABLE ? JE_SMALL_WRITE_INSTANT : JE_SMALL_WRITE,
-            sizeof(journal_entry_small_write) + dsk.clean_entry_bitmap_size
+            sizeof(journal_entry_small_write) + dyn_size
        );
        dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
        journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
@@ -425,14 +462,17 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
                exit(1);
            }
        }
+        // double check that next_free doesn't cross used_start from the left
+        assert(journal.next_free >= journal.used_start || next_next_free < journal.used_start);
        journal.next_free = next_next_free;
        je->oid = op->oid;
        je->version = op->version;
        je->offset = op->offset;
        je->len = op->len;
        je->data_offset = journal.next_free;
-        je->crc32_data = crc32c(0, op->buf, op->len);
-        memcpy((void*)(je+1), (dsk.clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap), dsk.clean_entry_bitmap_size);
+        je->crc32_data = dsk.csum_block_size ? 0 : crc32c(0, op->buf, op->len);
+        memcpy((void*)(je+1), (alloc_dyn_data
+            ? (uint8_t*)dirty_it->second.dyn_data+sizeof(int) : (uint8_t*)&dirty_it->second.dyn_data), dyn_size);
        je->crc32 = je_crc32((journal_entry*)je);
        journal.crc32_last = je->crc32;
        if (immediate_commit != IMMEDIATE_NONE)
@@ -461,10 +501,15 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        }
        dirty_it->second.location = journal.next_free;
        dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SUBMITTED;
-        journal.next_free += op->len;
-        if (journal.next_free >= journal.len)
+        next_next_free = journal.next_free + op->len;
+        if (next_next_free >= journal.len)
+            next_next_free = dsk.journal_block_size;
+        // double check that next_free doesn't cross used_start from the left
+        assert(journal.next_free >= journal.used_start || next_next_free < journal.used_start);
+        journal.next_free = next_next_free;
+        if (!(dirty_it->second.state & BS_ST_INSTANT))
        {
-            journal.next_free = dsk.journal_block_size;
+            unstable_unsynced++;
        }
        if (!PRIV(op)->pending_ops)
        {
@@ -501,17 +546,17 @@ resume_2:
            .version = op->version,
        });
        assert(dirty_it != dirty_db.end());
+        uint64_t dyn_size = dsk.dirty_dyn_size(op->offset, op->len);
        blockstore_journal_check_t space_check(this);
-        if (!space_check.check_available(op, 1,
-            sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size,
-            ((dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION)))
+        if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
+            (unstable_writes.size()+unstable_unsynced)*journal.block_size))
        {
            return 0;
        }
        BS_SUBMIT_CHECK_SQES(1);
        journal_entry_big_write *je = (journal_entry_big_write*)prefill_single_journal_entry(
            journal, op->opcode == BS_OP_WRITE_STABLE ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
-            sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size
+            sizeof(journal_entry_big_write) + dyn_size
        );
        dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
        journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
@@ -527,7 +572,8 @@ resume_2:
        je->offset = op->offset;
        je->len = op->len;
        je->location = dirty_it->second.location;
-        memcpy((void*)(je+1), (dsk.clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap), dsk.clean_entry_bitmap_size);
+        memcpy((void*)(je+1), (alloc_dyn_data
+            ? (uint8_t*)dirty_it->second.dyn_data+sizeof(int) : (uint8_t*)&dirty_it->second.dyn_data), dyn_size);
        je->crc32 = je_crc32((journal_entry*)je);
        journal.crc32_last = je->crc32;
        prepare_journal_sector_write(journal.cur_sector, op);
@@ -547,14 +593,20 @@ resume_4:
 #endif
        bool is_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE;
        bool imm = is_big ? (immediate_commit == IMMEDIATE_ALL) : (immediate_commit != IMMEDIATE_NONE);
+        bool is_instant = ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT));
        if (imm)
        {
            auto & unstab = unstable_writes[op->oid];
            unstab = unstab < op->version ? op->version : unstab;
+            if (!is_instant)
+            {
+                unstable_unsynced--;
+                assert(unstable_unsynced >= 0);
+            }
        }
        dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK)
            | (imm ? BS_ST_SYNCED : BS_ST_WRITTEN);
-        if (imm && ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT)))
+        if (imm && is_instant)
        {
            // Deletions and 'instant' operations are treated as immediately stable
            mark_stable(dirty_it->first);
@@ -700,7 +752,7 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
    });
    assert(dirty_it != dirty_db.end());
    blockstore_journal_check_t space_check(this);
-    if (!space_check.check_available(op, 1, sizeof(journal_entry_del), JOURNAL_INSTANT_RESERVATION))
+    if (!space_check.check_available(op, 1, sizeof(journal_entry_del), (unstable_writes.size()+unstable_unsynced)*journal.block_size))
    {
        return 0;
    }
@@ -716,17 +768,11 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
    }
    write_iodepth++;
    // Prepare journal sector write
-    if (immediate_commit == IMMEDIATE_NONE)
+    if (immediate_commit == IMMEDIATE_NONE &&
+        (dsk.journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del) &&
+        journal.sector_info[journal.cur_sector].dirty)
    {
-        if ((dsk.journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del) &&
-            journal.sector_info[journal.cur_sector].dirty)
-        {
-            prepare_journal_sector_write(journal.cur_sector, op);
-        }
-        else
-        {
-            PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
-        }
+        prepare_journal_sector_write(journal.cur_sector, op);
    }
    // Pre-fill journal entry
    journal_entry_del *je = (journal_entry_del*)prefill_single_journal_entry(
--- a/src/cli.cpp
+++ b/src/cli.cpp
@@ -17,7 +17,7 @@
 static const char *exe_name = NULL;

 static const char* help_text =
-    "Vitastor command-line tool\n"
+    "Vitastor command-line tool " VERSION "\n"
    "(c) Vitaliy Filippov, 2019+ (VNPL-1.1)\n"
    "\n"
    "COMMANDS:\n"
@@ -116,7 +116,8 @@ static const char* help_text =
    "Use vitastor-cli --help <command> for command details or vitastor-cli --help --all for all details.\n"
    "\n"
    "GLOBAL OPTIONS:\n"
-    "  --etcd_address <etcd_address>\n"
+    "  --config_file FILE  Path to Vitastor configuration file\n"
+    "  --etcd_address URL  Etcd connection address\n"
    "  --iodepth N         Send N operations in parallel to each OSD when possible (default 32)\n"
    "  --parallel_osds M   Work with M osds in parallel when possible (default 4)\n"
    "  --progress 1|0      Report progress (default 1)\n"
@@ -331,7 +332,7 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
    {
        // Create client
        json11::Json cfg_j = cfg;
-        p->ringloop = new ring_loop_t(512);
+        p->ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
        p->epmgr = new epoll_manager_t(p->ringloop);
        p->cli = new cluster_client_t(p->ringloop, p->epmgr->tfd, cfg_j);
        // Smaller timeout by default for more interactiveness
@@ -349,6 +350,7 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
                p->ringloop->wait();
        }
        // Destroy the client
+        p->cli->flush();
        delete p->cli;
        delete p->epmgr;
        delete p->ringloop;
@@ -357,6 +359,8 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
        p->ringloop = NULL;
    }
    // Print result
+    fflush(stderr);
+    fflush(stdout);
    if (p->json_output && !result.data.is_null())
    {
        printf("%s\n", result.data.dump().c_str());
--- a/Show More
+++ b/Show More