Use vitastor-cli instead of direct etcd interaction in the CSI driver

Implement PG scrub runner
Implement scrubbing "data path" - OSD_OP_SCRUB
2023-02-28 02:40:19 +03:00 · 2023-02-28 02:40:19 +03:00 · 2023-02-28 02:40:19 +03:00 · 2023-02-28 02:40:19 +03:00 · 2023-02-28 02:40:19 +03:00 · 2023-02-28 02:40:19 +03:00
87 changed files with 2749 additions and 869 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8)

 project(vitastor)

-set(VERSION "0.8.3")
+set(VERSION "0.8.5")

 add_subdirectory(src)
--- a/VNPL-1.1-RU.txt
+++ b/VNPL-1.1-RU.txt
@@ -48,9 +48,9 @@ Vitastor, составлены для того, чтобы убедиться,
 интерфейс (прокси), опять же, без открытия в свободный публичный доступ как
 самой программы, так и прокси.

-  Сетевая Публичная Лицензия Vitastor разработана специально чтобы
+  Сетевая Публичная Лицензия Vitastor разработана специально, чтобы
 гарантировать, что в таких случаях и модифицированная версия программы, и
-прокси оставались доступными сообществу. Для этого лицензия требует от
+прокси останутся доступными сообществу. Для этого лицензия требует от
 операторов сетевых серверов предоставлять исходный код оригинальной программы,
 а также всех других программ, взаимодействующих с ней на их серверах,
 пользователям этих серверов, на условиях свободных лицензий. Таким образом,
--- a/csi/Makefile
+++ b/csi/Makefile
@@ -1,4 +1,4 @@
-VERSION ?= v0.8.3
+VERSION ?= v0.8.5

 all: build push

--- a/csi/deploy/004-csi-nodeplugin.yaml
+++ b/csi/deploy/004-csi-nodeplugin.yaml
@@ -49,7 +49,7 @@ spec:
            capabilities:
              add: ["SYS_ADMIN"]
            allowPrivilegeEscalation: true
-          image: vitalif/vitastor-csi:v0.8.3
+          image: vitalif/vitastor-csi:v0.8.5
          args:
            - "--node=$(NODE_ID)"
            - "--endpoint=$(CSI_ENDPOINT)"
--- a/csi/deploy/007-csi-provisioner.yaml
+++ b/csi/deploy/007-csi-provisioner.yaml
@@ -116,7 +116,7 @@ spec:
            privileged: true
            capabilities:
              add: ["SYS_ADMIN"]
-          image: vitalif/vitastor-csi:v0.8.3
+          image: vitalif/vitastor-csi:v0.8.5
          args:
            - "--node=$(NODE_ID)"
            - "--endpoint=$(CSI_ENDPOINT)"
--- a/csi/src/config.go
+++ b/csi/src/config.go
@@ -5,7 +5,7 @@ package vitastor

 const (
    vitastorCSIDriverName    = "csi.vitastor.io"
-    vitastorCSIDriverVersion = "0.8.3"
+    vitastorCSIDriverVersion = "0.8.5"
 )

 // Config struct fills the parameters of request or user input
--- a/csi/src/controllerserver.go
+++ b/csi/src/controllerserver.go
@@ -10,7 +10,6 @@ import (
    "bytes"
    "strconv"
    "time"
-    "fmt"
    "os"
    "os/exec"
    "io/ioutil"
@@ -21,8 +20,6 @@ import (
    "google.golang.org/grpc/codes"
    "google.golang.org/grpc/status"

-    "go.etcd.io/etcd/clientv3"
-
    "github.com/container-storage-interface/spec/lib/go/csi"
 )

@@ -114,6 +111,34 @@ func GetConnectionParams(params map[string]string) (map[string]string, []string,
    return ctxVars, etcdUrl, etcdPrefix
 }

+func invokeCLI(ctxVars map[string]string, args []string) ([]byte, error) // Runs /usr/bin/vitastor-cli with args and returns its captured stdout
+{
+    if (ctxVars["etcdUrl"] != "") // Connection options are appended after args, and only when the ctxVars value is non-empty
+    {
+        args = append(args, "--etcd_address", ctxVars["etcdUrl"])
+    }
+    if (ctxVars["etcdPrefix"] != "")
+    {
+        args = append(args, "--etcd_prefix", ctxVars["etcdPrefix"])
+    }
+    if (ctxVars["configPath"] != "")
+    {
+        args = append(args, "--config_path", ctxVars["configPath"])
+    }
+    c := exec.Command("/usr/bin/vitastor-cli", args...)
+    var stdout, stderr bytes.Buffer // Both output streams are captured in memory
+    c.Stdout = &stdout
+    c.Stderr = &stderr
+    err := c.Run()
+    stderrStr := string(stderr.Bytes())
+    if (err != nil) // Any Run() error is logged and returned as a gRPC Internal status carrying the captured stderr text
+    {
+        klog.Errorf("vitastor-cli %s failed: %s, status %s\n", strings.Join(args, " "), stderrStr, err)
+        return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
+    }
+    return stdout.Bytes(), nil
+}
+
 // Create the volume
 func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) (*csi.CreateVolumeResponse, error)
 {
@@ -146,128 +171,41 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
        volSize = ((capRange.GetRequiredBytes() + MB - 1) / MB) * MB
    }

-    // FIXME: The following should PROBABLY be implemented externally in a management tool
-
-    ctxVars, etcdUrl, etcdPrefix := GetConnectionParams(req.Parameters)
+    ctxVars, etcdUrl, _ := GetConnectionParams(req.Parameters)
    if (len(etcdUrl) == 0)
    {
        return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
    }

-    // Connect to etcd
-    cli, err := clientv3.New(clientv3.Config{
-        DialTimeout: ETCD_TIMEOUT,
-        Endpoints: etcdUrl,
-    })
+    // Create image using vitastor-cli; numbers are passed as decimal text (string(n) on an integer yields a single rune, not digits)
+    _, err := invokeCLI(ctxVars, []string{ "create", volName, "-s", strconv.FormatInt(int64(volSize), 10), "--pool", strconv.FormatUint(uint64(poolId), 10) })
    if (err != nil)
    {
-        return nil, status.Error(codes.Internal, "failed to connect to etcd at "+strings.Join(etcdUrl, ",")+": "+err.Error())
-    }
-    defer cli.Close()
-
-    var imageId uint64 = 0
-    for
-    {
-        // Check if the image exists
-        ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
-        resp, err := cli.Get(ctx, etcdPrefix+"/index/image/"+volName)
-        cancel()
-        if (err != nil)
+        if (strings.Index(err.Error(), "already exists") > 0)
        {
-            return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
-        }
-        if (len(resp.Kvs) > 0)
-        {
-            kv := resp.Kvs[0]
-            var v InodeIndex
-            err := json.Unmarshal(kv.Value, &v)
+            stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", volName })
            if (err != nil)
            {
-                return nil, status.Error(codes.Internal, "invalid /index/image/"+volName+" key in etcd: "+err.Error())
+                return nil, err
            }
-            poolId = v.PoolId
-            imageId = v.Id
-            inodeCfgKey := fmt.Sprintf("/config/inode/%d/%d", poolId, imageId)
-            ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
-            resp, err := cli.Get(ctx, etcdPrefix+inodeCfgKey)
-            cancel()
+            var inodeCfg []InodeConfig
+            err = json.Unmarshal(stat, &inodeCfg)
            if (err != nil)
            {
-                return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
+                return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
            }
-            if (len(resp.Kvs) == 0)
+            if (len(inodeCfg) == 0)
            {
-                return nil, status.Error(codes.Internal, "missing "+inodeCfgKey+" key in etcd")
+                return nil, status.Error(codes.Internal, "vitastor-cli create said that image already exists, but ls can't find it")
            }
-            var inodeCfg InodeConfig
-            err = json.Unmarshal(resp.Kvs[0].Value, &inodeCfg)
-            if (err != nil)
-            {
-                return nil, status.Error(codes.Internal, "invalid "+inodeCfgKey+" key in etcd: "+err.Error())
-            }
-            if (inodeCfg.Size < uint64(volSize))
+            if (inodeCfg[0].Size < uint64(volSize))
            {
                return nil, status.Error(codes.Internal, "image "+volName+" is already created, but size is less than expected")
            }
        }
        else
        {
-            // Find a free ID
-            // Create image metadata in a transaction verifying that the image doesn't exist yet AND ID is still free
-            maxIdKey := fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId)
-            ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
-            resp, err := cli.Get(ctx, maxIdKey)
-            cancel()
-            if (err != nil)
-            {
-                return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
-            }
-            var modRev int64
-            var nextId uint64
-            if (len(resp.Kvs) > 0)
-            {
-                var err error
-                nextId, err = strconv.ParseUint(string(resp.Kvs[0].Value), 10, 64)
-                if (err != nil)
-                {
-                    return nil, status.Error(codes.Internal, maxIdKey+" contains invalid ID")
-                }
-                modRev = resp.Kvs[0].ModRevision
-                nextId++
-            }
-            else
-            {
-                nextId = 1
-            }
-            inodeIdxJson, _ := json.Marshal(InodeIndex{
-                Id: nextId,
-                PoolId: poolId,
-            })
-            inodeCfgJson, _ := json.Marshal(InodeConfig{
-                Name: volName,
-                Size: uint64(volSize),
-            })
-            ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
-            txnResp, err := cli.Txn(ctx).If(
-                clientv3.Compare(clientv3.ModRevision(fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId)), "=", modRev),
-                clientv3.Compare(clientv3.CreateRevision(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName)), "=", 0),
-                clientv3.Compare(clientv3.CreateRevision(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, poolId, nextId)), "=", 0),
-            ).Then(
-                clientv3.OpPut(fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId), fmt.Sprintf("%d", nextId)),
-                clientv3.OpPut(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName), string(inodeIdxJson)),
-                clientv3.OpPut(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, poolId, nextId), string(inodeCfgJson)),
-            ).Commit()
-            cancel()
-            if (err != nil)
-            {
-                return nil, status.Error(codes.Internal, "failed to commit transaction in etcd: "+err.Error())
-            }
-            if (txnResp.Succeeded)
-            {
-                imageId = nextId
-                break
-            }
-            // Start over if the transaction fails
+            return nil, err
        }
    }

@@ -299,97 +237,12 @@ func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVol
    }
    volName := ctxVars["name"]

-    _, etcdUrl, etcdPrefix := GetConnectionParams(ctxVars)
-    if (len(etcdUrl) == 0)
-    {
-        return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
-    }
+    ctxVars, _, _ = GetConnectionParams(ctxVars)

-    cli, err := clientv3.New(clientv3.Config{
-        DialTimeout: ETCD_TIMEOUT,
-        Endpoints: etcdUrl,
-    })
+    _, err = invokeCLI(ctxVars, []string{ "rm", volName })
    if (err != nil)
    {
-        return nil, status.Error(codes.Internal, "failed to connect to etcd at "+strings.Join(etcdUrl, ",")+": "+err.Error())
-    }
-    defer cli.Close()
-
-    // Find inode by name
-    ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
-    resp, err := cli.Get(ctx, etcdPrefix+"/index/image/"+volName)
-    cancel()
-    if (err != nil)
-    {
-        return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
-    }
-    if (len(resp.Kvs) == 0)
-    {
-        return nil, status.Error(codes.NotFound, "volume "+volName+" does not exist")
-    }
-    var idx InodeIndex
-    err = json.Unmarshal(resp.Kvs[0].Value, &idx)
-    if (err != nil)
-    {
-        return nil, status.Error(codes.Internal, "invalid /index/image/"+volName+" key in etcd: "+err.Error())
-    }
-
-    // Get inode config
-    inodeCfgKey := fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, idx.PoolId, idx.Id)
-    ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
-    resp, err = cli.Get(ctx, inodeCfgKey)
-    cancel()
-    if (err != nil)
-    {
-        return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
-    }
-    if (len(resp.Kvs) == 0)
-    {
-        return nil, status.Error(codes.NotFound, "volume "+volName+" does not exist")
-    }
-    var inodeCfg InodeConfig
-    err = json.Unmarshal(resp.Kvs[0].Value, &inodeCfg)
-    if (err != nil)
-    {
-        return nil, status.Error(codes.Internal, "invalid "+inodeCfgKey+" key in etcd: "+err.Error())
-    }
-
-    // Delete inode data by invoking vitastor-cli
-    args := []string{
-        "rm-data", "--etcd_address", strings.Join(etcdUrl, ","),
-        "--pool", fmt.Sprintf("%d", idx.PoolId),
-        "--inode", fmt.Sprintf("%d", idx.Id),
-    }
-    if (ctxVars["configPath"] != "")
-    {
-        args = append(args, "--config_path", ctxVars["configPath"])
-    }
-    c := exec.Command("/usr/bin/vitastor-cli", args...)
-    var stderr bytes.Buffer
-    c.Stdout = nil
-    c.Stderr = &stderr
-    err = c.Run()
-    stderrStr := string(stderr.Bytes())
-    if (err != nil)
-    {
-        klog.Errorf("vitastor-cli rm-data failed: %s, status %s\n", stderrStr, err)
-        return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
-    }
-
-    // Delete inode config in etcd
-    ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
-    txnResp, err := cli.Txn(ctx).Then(
-        clientv3.OpDelete(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName)),
-        clientv3.OpDelete(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, idx.PoolId, idx.Id)),
-    ).Commit()
-    cancel()
-    if (err != nil)
-    {
-        return nil, status.Error(codes.Internal, "failed to delete keys in etcd: "+err.Error())
-    }
-    if (!txnResp.Succeeded)
-    {
-        return nil, status.Error(codes.Internal, "failed to delete keys in etcd: transaction failed")
+        return nil, err
    }

    return &csi.DeleteVolumeResponse{}, nil
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,10 +1,10 @@
-vitastor (0.8.3-1) unstable; urgency=medium
+vitastor (0.8.5-1) unstable; urgency=medium

  * Bugfixes

 -- Vitaliy Filippov <vitalif@yourcmc.ru>  Fri, 03 Jun 2022 02:09:44 +0300

-vitastor (0.8.3-1) unstable; urgency=medium
+vitastor (0.8.5-1) unstable; urgency=medium

  * Implement NFS proxy
  * Add documentation
--- a/debian/vitastor.Dockerfile
+++ b/debian/vitastor.Dockerfile
@@ -34,8 +34,8 @@ RUN set -e -x; \
    mkdir -p /root/packages/vitastor-$REL; \
    rm -rf /root/packages/vitastor-$REL/*; \
    cd /root/packages/vitastor-$REL; \
-    cp -r /root/vitastor vitastor-0.8.3; \
-    cd vitastor-0.8.3; \
+    cp -r /root/vitastor vitastor-0.8.5; \
+    cd vitastor-0.8.5; \
    ln -s /root/fio-build/fio-*/ ./fio; \
    FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
    ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -48,8 +48,8 @@ RUN set -e -x; \
    rm -rf a b; \
    echo "dep:fio=$FIO" > debian/fio_version; \
    cd /root/packages/vitastor-$REL; \
-    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.3.orig.tar.xz vitastor-0.8.3; \
-    cd vitastor-0.8.3; \
+    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.5.orig.tar.xz vitastor-0.8.5; \
+    cd vitastor-0.8.5; \
    V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
    DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
    DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
--- a/docs/config/osd.en.md
+++ b/docs/config/osd.en.md
@@ -17,6 +17,7 @@ initialization and can be changed with an OSD restart.
 - [autosync_interval](#autosync_interval)
 - [autosync_writes](#autosync_writes)
 - [recovery_queue_depth](#recovery_queue_depth)
+- [recovery_pg_switch](#recovery_pg_switch)
 - [recovery_sync_batch](#recovery_sync_batch)
 - [readonly](#readonly)
 - [no_recovery](#no_recovery)
@@ -115,6 +116,16 @@ Maximum recovery operations per one primary OSD at any given moment of time.
 Currently it's the only parameter available to tune the speed or recovery
 and rebalancing, but it's planned to implement more.

+## recovery_pg_switch
+
+- Type: integer
+- Default: 128
+
+Number of recovery operations before switching to recovery of the next PG.
+The idea is to mix all PGs during recovery for more even space and load
+distribution but still benefit from recovery queue depth greater than 1.
+Degraded PGs are scanned first anyway.
+
 ## recovery_sync_batch

 - Type: integer
--- a/docs/config/osd.ru.md
+++ b/docs/config/osd.ru.md
@@ -18,6 +18,7 @@
 - [autosync_interval](#autosync_interval)
 - [autosync_writes](#autosync_writes)
 - [recovery_queue_depth](#recovery_queue_depth)
+- [recovery_pg_switch](#recovery_pg_switch)
 - [recovery_sync_batch](#recovery_sync_batch)
 - [readonly](#readonly)
 - [no_recovery](#no_recovery)
@@ -119,6 +120,17 @@ OSD, чтобы успевать очищать журнал - без них OSD
 для ускорения или замедления восстановления и перебалансировки данных, но
 в планах реализация других параметров.

+## recovery_pg_switch
+
+- Тип: целое число
+- Значение по умолчанию: 128
+
+Число операций восстановления перед переключением на восстановление другой PG.
+Идея заключается в том, чтобы восстанавливать все PG одновременно для более
+равномерного распределения места и нагрузки, но при этом всё равно выигрывать
+от глубины очереди восстановления, большей, чем 1. Деградированные PG в любом
+случае сканируются первыми.
+
 ## recovery_sync_batch

 - Тип: целое число
--- a/docs/config/src/osd.yml
+++ b/docs/config/src/osd.yml
@@ -102,6 +102,20 @@
    момент времени. На данный момент единственный параметр, который можно менять
    для ускорения или замедления восстановления и перебалансировки данных, но
    в планах реализация других параметров.
+- name: recovery_pg_switch
+  type: int
+  default: 128
+  info: |
+    Number of recovery operations before switching to recovery of the next PG.
+    The idea is to mix all PGs during recovery for more even space and load
+    distribution but still benefit from recovery queue depth greater than 1.
+    Degraded PGs are scanned first anyway.
+  info_ru: |
+    Число операций восстановления перед переключением на восстановление другой PG.
+    Идея заключается в том, чтобы восстанавливать все PG одновременно для более
+    равномерного распределения места и нагрузки, но при этом всё равно выигрывать
+    от глубины очереди восстановления, большей, чем 1. Деградированные PG в любом
+    случае сканируются первыми.
 - name: recovery_sync_batch
  type: int
  default: 16
--- a/docs/installation/packages.en.md
+++ b/docs/installation/packages.en.md
@@ -9,7 +9,7 @@
 ## Debian

 - Trust Vitastor package signing key:
-  `wget -q -O - https://vitastor.io/debian/pubkey | sudo apt-key add -`
+  `wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg`
 - Add Vitastor package repository to your /etc/apt/sources.list:
  - Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
  - Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
@@ -20,8 +20,8 @@
 ## CentOS

 - Add Vitastor package repository:
-  - CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm`
-  - CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm`
+  - CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release.rpm`
+  - CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release.rpm`
 - Enable EPEL: `yum/dnf install epel-release`
 - Enable additional CentOS repositories:
  - CentOS 7: `yum install centos-release-scl`
--- a/docs/installation/packages.ru.md
+++ b/docs/installation/packages.ru.md
@@ -9,7 +9,7 @@
 ## Debian

 - Добавьте ключ репозитория Vitastor:
-  `wget -q -O - https://vitastor.io/debian/pubkey | sudo apt-key add -`
+  `wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg`
 - Добавьте репозиторий Vitastor в /etc/apt/sources.list:
  - Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
  - Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
@@ -20,8 +20,8 @@
 ## CentOS

 - Добавьте в систему репозиторий Vitastor:
-  - CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm`
-  - CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm`
+  - CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release.rpm`
+  - CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release.rpm`
 - Включите EPEL: `yum/dnf install epel-release`
 - Включите дополнительные репозитории CentOS:
  - CentOS 7: `yum install centos-release-scl`
--- a/docs/intro/quickstart.en.md
+++ b/docs/intro/quickstart.en.md
@@ -70,7 +70,7 @@ For EC pools the configuration should look like the following:

 ```
 etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
-  "scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}`
+  "scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}'
 ```

 After you do this, one of the monitors will configure PGs and OSDs will start them.
--- a/docs/intro/quickstart.ru.md
+++ b/docs/intro/quickstart.ru.md
@@ -71,7 +71,7 @@ etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool",

 ```
 etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
-  "scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}`
+  "scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}'
 ```

 После этого один из мониторов должен сконфигурировать PG, а OSD должны запустить их.
--- a/docs/usage/cli.en.md
+++ b/docs/usage/cli.en.md
@@ -14,6 +14,7 @@ It supports the following commands:
 - [df](#df)
 - [ls](#ls)
 - [create](#create)
+- [snap-create](#create)
 - [modify](#modify)
 - [rm](#rm)
 - [flatten](#flatten)
@@ -123,6 +124,8 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>

 Create a snapshot of image `<name>` (either form can be used). May be used live if only a single writer is active.

+See also [how to export snapshots](qemu.en.md#exporting-snapshots).
+
 ## modify

 `vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`
--- a/docs/usage/cli.ru.md
+++ b/docs/usage/cli.ru.md
@@ -15,6 +15,7 @@ vitastor-cli - интерфейс командной строки для адм
 - [df](#df)
 - [ls](#ls)
 - [create](#create)
+- [snap-create](#create)
 - [modify](#modify)
 - [rm](#rm)
 - [flatten](#flatten)
@@ -126,6 +127,8 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
 Создать снимок образа `<name>` (можно использовать любую форму команды). Снимок можно создавать без остановки
 клиентов, если пишущий клиент максимум 1.

+Смотрите также информацию о том, [как экспортировать снимки](qemu.ru.md#экспорт-снимков).
+
 ## modify

 `vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`
--- a/docs/usage/qemu.en.md
+++ b/docs/usage/qemu.en.md
@@ -46,3 +46,40 @@ qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=192.168.7

 You can also specify `:pool=<POOL>:inode=<INODE>:size=<SIZE>` instead of `:image=<IMAGE>`
 if you don't want to use inode metadata.
+
+### Exporting snapshots
+
+Starting with 0.8.4, you can also export individual layers (snapshot diffs) using `qemu-img`.
+
+Suppose you have an image `testimg` and a snapshot `testimg@0` created with `vitastor-cli snap-create testimg@0`.
+
+Then you can export the `testimg@0` snapshot and the data written to `testimg` after creating
+the snapshot separately using the following commands (key points are using `skip-parents=1` and
+`-B backing_file` option):
+
+```
+qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg@0' \
+    -O qcow2 testimg_0.qcow2
+
+qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg:skip-parents=1' \
+    -O qcow2 -o 'cluster_size=4k' -B testimg_0.qcow2 testimg.qcow2
+```
+
+In fact, with `cluster_size=4k` any QCOW2 file can be used instead of `-B testimg_0.qcow2`, even an empty one.
+
+The QCOW2 `cluster_size=4k` option is required if you want `testimg.qcow2` to contain only the data
+overwritten **exactly** in the child layer. With the default 64 KB QCOW2 cluster size you'll
+get a bit of extra data from parent layers, i.e. a 4 KB overwrite will result in `testimg.qcow2`
+containing 64 KB of data. And this extra data will be taken by `qemu-img` from the file passed
+in the `-B` option, so you really need a 4 KB cluster size if you use an empty image in `-B`.
+
+After this procedure you'll get two chained QCOW2 images. To detach `testimg.qcow2` from
+its parent, run:
+
+```
+qemu-img rebase -u -b '' testimg.qcow2
+```
+
+This can be used for backups. Just note that exporting an image that is currently being written to
+is of course unsafe and doesn't produce a consistent result, so only export snapshots if you do this
+on a live VM.
--- a/docs/usage/qemu.ru.md
+++ b/docs/usage/qemu.ru.md
@@ -50,3 +50,40 @@ qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=10.115.0.

 Если вы не хотите обращаться к образу по имени, вместо `:image=<IMAGE>` можно указать номер пула, номер инода и размер:
 `:pool=<POOL>:inode=<INODE>:size=<SIZE>`.
+
+### Экспорт снимков
+
+Начиная с 0.8.4 вы можете экспортировать отдельные слои (изменения в снимках) с помощью `qemu-img`.
+
+Допустим, что у вас есть образ `testimg` и его снимок `testimg@0`, созданный с помощью `vitastor-cli snap-create testimg@0`.
+
+Тогда вы можете выгрузить снимок `testimg@0` и данные, изменённые в `testimg` после создания снимка, отдельно,
+с помощью следующих команд (ключевые моменты - использование `skip-parents=1` и опции `-B backing_file.qcow2`):
+
+```
+qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg@0' \
+    -O qcow2 testimg_0.qcow2
+
+qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg:skip-parents=1' \
+    -O qcow2 -o 'cluster_size=4k' -B testimg_0.qcow2 testimg.qcow2
+```
+
+На самом деле, с `cluster_size=4k` вместо `-B testimg_0.qcow2` можно использовать любой qcow2-файл,
+даже пустой.
+
+Опция QCOW2 `cluster_size=4k` нужна, если вы хотите, чтобы `testimg.qcow2` содержал **в точности**
+данные, перезаписанные в дочернем слое. С размером кластера QCOW2 по умолчанию, составляющим 64 КБ,
+вы получите немного "лишних" данных из родительских слоёв - перезапись 4 КБ будет приводить к тому,
+что в `testimg.qcow2` будет появляться 64 КБ данных. Причём "лишние" данные qemu-img будет брать
+как раз из файла, указанного в опции `-B`, так что если там указан пустой образ, кластер обязан быть 4 КБ.
+
+После данной процедуры вы получите два QCOW2-образа, связанных в цепочку. Чтобы "отцепить" образ
+`testimg.qcow2` от базового, выполните:
+
+```
+qemu-img rebase -u -b '' testimg.qcow2
+```
+
+Это можно использовать для резервного копирования. Только помните, что экспортировать образ, в который
+в то же время идёт запись, небезопасно - результат чтения не будет целостным. Так что если вы работаете
+с активными виртуальными машинами, экспортируйте только их снимки, но не сам образ.
--- a/2
+++ b/2
--- a/mon/PGUtil.js
+++ b/mon/PGUtil.js
@@ -21,7 +21,7 @@ function add_pg_history(new_pg_history, new_pg, prev_pgs, prev_pg_history, old_p
    {
        for (const pg of oh.osd_sets)
        {
-            nh.osd_sets[pg.join(' ')] = pg;
+            nh.osd_sets[pg.join(' ')] = pg.map(osd_num => Number(osd_num));
        }
    }
    if (oh && oh.all_peers && oh.all_peers.length)
--- a/mon/lp-optimizer.js
+++ b/mon/lp-optimizer.js
@@ -550,8 +550,8 @@ function random_combinations(osd_tree, pg_size, count, ordered)
        seed ^= seed << 5;
        return seed + 2147483648;
    };
-    const hosts = Object.keys(osd_tree).sort();
    const osds = Object.keys(osd_tree).reduce((a, c) => { a[c] = Object.keys(osd_tree[c]).sort(); return a; }, {});
+    const hosts = Object.keys(osd_tree).sort().filter(h => osds[h].length > 0);
    const r = {};
    // Generate random combinations including each OSD at least once
    for (let h = 0; h < hosts.length; h++)
--- a/mon/make-etcd
+++ b/mon/make-etcd
@@ -79,7 +79,7 @@ StartLimitInterval=0
 RestartSec=10

 [Install]
-WantedBy=local.target
+WantedBy=multi-user.target
 `);
    await system(`useradd etcd`);
    await system(`systemctl daemon-reload`);
--- a/mon/mon.js
+++ b/mon/mon.js
@@ -70,9 +70,9 @@ const etcd_tree = {
            rdma_gid_index: 0,
            rdma_mtu: 4096,
            rdma_max_sge: 128,
-            rdma_max_send: 32,
-            rdma_max_recv: 8,
-            rdma_max_msg: 1048576,
+            rdma_max_send: 64,
+            rdma_max_recv: 128,
+            rdma_max_msg: 132096,
            log_level: 0,
            block_size: 131072,
            disk_alignment: 4096,
@@ -107,6 +107,10 @@ const etcd_tree = {
            slow_log_interval: 10,
            inode_vanish_time: 60,
            osd_memlock: false,
+            scrub_interval: '30d', // 1s/1m/1h/1d
+            scrub_queue_depth: 1,
+            scrub_sleep: 0, // milliseconds
+            scrub_list_limit: 1000, // objects to list on one scrub iteration
            // blockstore - fixed in superblock
            block_size,
            disk_alignment,
@@ -168,6 +172,8 @@ const etcd_tree = {
                osd_tags?: 'nvme' | [ 'nvme', ... ],
                // prefer to put primary on OSD with these tags
                primary_affinity_tags?: 'nvme' | [ 'nvme', ... ],
+                // scrub interval
+                scrub_interval?: '30d',
            },
            ...
        }, */
@@ -261,9 +267,9 @@ const etcd_tree = {
            /* <pool_id>: {
                <pg_id>: {
                    primary: osd_num_t,
-                    state: ("starting"|"peering"|"peered"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
-                        "degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
-                        "has_invalid"|"left_on_dead")[],
+                    state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
+                        "degraded"|"has_corrupted"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
+                        "has_invalid"|"left_on_dead"|"scrubbing")[],
                }
            }, */
        },
@@ -285,6 +291,7 @@ const etcd_tree = {
                    osd_sets: osd_num_t[][],
                    all_peers: osd_num_t[],
                    epoch: uint64_t,
+                    scrub_ts: uint64_t,
                },
            }, */
        },
@@ -663,12 +670,15 @@ class Mon
    async save_last_clean()
    {
        // last_clean_pgs is used to avoid extra data move when observing a series of changes in the cluster
+        const new_clean_pgs = { items: {} };
+    next_pool:
        for (const pool_id in this.state.config.pools)
        {
+            new_clean_pgs.items[pool_id] = (this.state.history.last_clean_pgs.items||{})[pool_id];
            const pool_cfg = this.state.config.pools[pool_id];
            if (!this.validate_pool_cfg(pool_id, pool_cfg, false))
            {
-                continue;
+                continue next_pool;
            }
            for (let pg_num = 1; pg_num <= pool_cfg.pg_count; pg_num++)
            {
@@ -677,17 +687,18 @@ class Mon
                    !(this.state.pg.state[pool_id][pg_num].state instanceof Array))
                {
                    // Unclean
-                    return;
+                    continue next_pool;
                }
                let st = this.state.pg.state[pool_id][pg_num].state.join(',');
                if (st != 'active' && st != 'active,left_on_dead' && st != 'left_on_dead,active')
                {
                    // Unclean
-                    return;
+                    continue next_pool;
                }
            }
+            new_clean_pgs.items[pool_id] = this.state.config.pgs.items[pool_id];
        }
-        this.state.history.last_clean_pgs = JSON.parse(JSON.stringify(this.state.config.pgs));
+        this.state.history.last_clean_pgs = new_clean_pgs;
        await this.etcd_call('/kv/txn', {
            success: [ { requestPut: {
                key: b64(this.etcd_prefix+'/history/last_clean_pgs'),
@@ -1374,16 +1385,14 @@ class Mon
    // This is required for multiple change events to trigger at most 1 recheck in 1s
    schedule_recheck()
    {
-        if (this.recheck_timer)
+        if (!this.recheck_timer)
        {
-            clearTimeout(this.recheck_timer);
-            this.recheck_timer = null;
+            this.recheck_timer = setTimeout(() =>
+            {
+                this.recheck_timer = null;
+                this.recheck_pgs().catch(this.die);
+            }, this.config.mon_change_timeout || 1000);
        }
-        this.recheck_timer = setTimeout(() =>
-        {
-            this.recheck_timer = null;
-            this.recheck_pgs().catch(this.die);
-        }, this.config.mon_change_timeout || 1000);
    }

    sum_op_stats(timestamp, prev_stats)
@@ -1719,11 +1728,11 @@ class Mon
        else if (key_parts[0] === 'osd' && key_parts[1] === 'stats')
        {
            // Recheck OSD tree on OSD addition/deletion
-            if ((!old) != (!kv.value) || old && kv.value && (old.size != kv.value.size || old.time != kv.value.time))
+            if ((!old) != (!kv.value) || old && kv.value && old.size != kv.value.size)
            {
                this.schedule_recheck();
            }
-            // Recheck PGs <osd_out_time> later
+            // Recheck PGs <osd_out_time> after last OSD statistics report
            this.schedule_next_recheck_at(
                !this.state.osd.stats[key[2]] ? 0 : this.state.osd.stats[key[2]].time+this.config.osd_out_time
            );
--- a/patches/VitastorPlugin.pm
+++ b/patches/VitastorPlugin.pm
@@ -16,6 +16,11 @@ use PVE::Tools qw(run_command);

 use base qw(PVE::Storage::Plugin);

+if (@PVE::Storage::Plugin::SHARED_STORAGE)
+{
+    push @PVE::Storage::Plugin::SHARED_STORAGE, 'vitastor';
+}
+
 sub api
 {
    # Trick it :)
@@ -133,9 +138,11 @@ sub properties
 sub options
 {
    return {
+        shared => { optional => 1 },
+        content => { optional => 1 },
        nodes => { optional => 1 },
        disable => { optional => 1 },
-        vitastor_etcd_address => { optional => 1},
+        vitastor_etcd_address => { optional => 1 },
        vitastor_etcd_prefix => { optional => 1 },
        vitastor_config_path => { optional => 1 },
        vitastor_prefix => { optional => 1 },
--- a/patches/cinder-vitastor.py
+++ b/patches/cinder-vitastor.py
@@ -50,7 +50,7 @@ from cinder.volume import configuration
 from cinder.volume import driver
 from cinder.volume import volume_utils

-VERSION = '0.8.3'
+VERSION = '0.8.5'

 LOG = logging.getLogger(__name__)

--- a/rpm/build-tarball.sh
+++ b/rpm/build-tarball.sh
@@ -25,4 +25,4 @@ rm fio
 mv fio-copy fio
 FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
 perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
-tar --transform 's#^#vitastor-0.8.3/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.3$(rpm --eval '%dist').tar.gz *
+tar --transform 's#^#vitastor-0.8.5/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.5$(rpm --eval '%dist').tar.gz *
--- a/rpm/vitastor-el7.Dockerfile
+++ b/rpm/vitastor-el7.Dockerfile
@@ -35,7 +35,7 @@ ADD . /root/vitastor
 RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
-    cp /root/vitastor-0.8.3.el7.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-0.8.5.el7.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
--- a/rpm/vitastor-el7.spec
+++ b/rpm/vitastor-el7.spec
@@ -1,11 +1,11 @@
 Name:           vitastor
-Version:        0.8.3
+Version:        0.8.5
 Release:        1%{?dist}
 Summary:        Vitastor, a fast software-defined clustered block storage

 License:        Vitastor Network Public License 1.1
 URL:            https://vitastor.io/
-Source0:        vitastor-0.8.3.el7.tar.gz
+Source0:        vitastor-0.8.5.el7.tar.gz

 BuildRequires:  liburing-devel >= 0.6
 BuildRequires:  gperftools-devel
@@ -35,6 +35,7 @@ Summary:        Vitastor - OSD
 Requires:       libJerasure2
 Requires:       libisa-l
 Requires:       liburing >= 0.6
+Requires:       liburing < 2
 Requires:       vitastor-client = %{version}-%{release}
 Requires:       util-linux
 Requires:       parted
@@ -59,6 +60,7 @@ scheduling cluster-level operations.
 %package -n vitastor-client
 Summary:        Vitastor - client
 Requires:       liburing >= 0.6
+Requires:       liburing < 2


 %description -n vitastor-client
--- a/rpm/vitastor-el8.Dockerfile
+++ b/rpm/vitastor-el8.Dockerfile
@@ -35,7 +35,7 @@ ADD . /root/vitastor
 RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
-    cp /root/vitastor-0.8.3.el8.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-0.8.5.el8.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
--- a/rpm/vitastor-el8.spec
+++ b/rpm/vitastor-el8.spec
@@ -1,11 +1,11 @@
 Name:           vitastor
-Version:        0.8.3
+Version:        0.8.5
 Release:        1%{?dist}
 Summary:        Vitastor, a fast software-defined clustered block storage

 License:        Vitastor Network Public License 1.1
 URL:            https://vitastor.io/
-Source0:        vitastor-0.8.3.el8.tar.gz
+Source0:        vitastor-0.8.5.el8.tar.gz

 BuildRequires:  liburing-devel >= 0.6
 BuildRequires:  gperftools-devel
@@ -34,6 +34,7 @@ Summary:        Vitastor - OSD
 Requires:       libJerasure2
 Requires:       libisa-l
 Requires:       liburing >= 0.6
+Requires:       liburing < 2
 Requires:       vitastor-client = %{version}-%{release}
 Requires:       util-linux
 Requires:       parted
@@ -57,6 +58,7 @@ scheduling cluster-level operations.
 %package -n vitastor-client
 Summary:        Vitastor - client
 Requires:       liburing >= 0.6
+Requires:       liburing < 2


 %description -n vitastor-client
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -3,6 +3,7 @@ cmake_minimum_required(VERSION 2.8)
 project(vitastor)

 include(GNUInstallDirs)
+include(CTest)

 set(WITH_QEMU false CACHE BOOL "Build QEMU driver inside Vitastor source tree")
 set(WITH_FIO true CACHE BOOL "Build FIO driver")
@@ -15,7 +16,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
 	set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
 endif()

-add_definitions(-DVERSION="0.8.3")
+add_definitions(-DVERSION="0.8.5")
 add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
 if (${WITH_ASAN})
 	add_definitions(-fsanitize=address -fno-omit-frame-pointer)
@@ -55,6 +56,14 @@ if (ISAL_LIBRARIES)
 	add_definitions(-DWITH_ISAL)
 endif (ISAL_LIBRARIES)

+add_custom_target(build_tests)
+add_custom_target(test
+	COMMAND
+	echo leak:tcmalloc > ${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt &&
+	env LSAN_OPTIONS=suppressions=${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt ${CMAKE_CTEST_COMMAND}
+)
+add_dependencies(test build_tests)
+
 include_directories(
 	../
 	/usr/include/jerasure
@@ -102,7 +111,7 @@ target_compile_options(vitastor_common PUBLIC -fPIC)
 add_executable(vitastor-osd
 	osd_main.cpp osd.cpp osd_secondary.cpp osd_peering.cpp osd_flush.cpp osd_peering_pg.cpp
 	osd_primary.cpp osd_primary_chain.cpp osd_primary_sync.cpp osd_primary_write.cpp osd_primary_subops.cpp
-	osd_cluster.cpp osd_rmw.cpp
+	osd_cluster.cpp osd_rmw.cpp osd_scrub.cpp
 )
 target_link_libraries(vitastor-osd
 	vitastor_common
@@ -145,7 +154,6 @@ add_library(vitastor_client SHARED
 set_target_properties(vitastor_client PROPERTIES PUBLIC_HEADER "vitastor_c.h")
 target_link_libraries(vitastor_client
 	vitastor_common
-	tcmalloc_minimal
 	${LIBURING_LIBRARIES}
 	${IBVERBS_LIBRARIES}
 )
@@ -235,8 +243,18 @@ add_executable(osd_test osd_test.cpp rw_blocking.cpp addr_util.cpp)
 target_link_libraries(osd_test tcmalloc_minimal)

 # osd_rmw_test
-add_executable(osd_rmw_test osd_rmw_test.cpp allocator.cpp)
+add_executable(osd_rmw_test EXCLUDE_FROM_ALL osd_rmw_test.cpp allocator.cpp)
 target_link_libraries(osd_rmw_test Jerasure ${ISAL_LIBRARIES} tcmalloc_minimal)
+add_dependencies(build_tests osd_rmw_test)
+add_test(NAME osd_rmw_test COMMAND osd_rmw_test)
+
+if (ISAL_LIBRARIES)
+	add_executable(osd_rmw_test_je EXCLUDE_FROM_ALL osd_rmw_test.cpp allocator.cpp)
+	target_compile_definitions(osd_rmw_test_je PUBLIC -DNO_ISAL)
+	target_link_libraries(osd_rmw_test_je Jerasure tcmalloc_minimal)
+	add_dependencies(build_tests osd_rmw_test_je)
+	add_test(NAME osd_rmw_test_jerasure COMMAND osd_rmw_test_je)
+endif (ISAL_LIBRARIES)

 # stub_uring_osd
 add_executable(stub_uring_osd
@@ -250,11 +268,15 @@ target_link_libraries(stub_uring_osd
 )

 # osd_peering_pg_test
-add_executable(osd_peering_pg_test osd_peering_pg_test.cpp osd_peering_pg.cpp)
+add_executable(osd_peering_pg_test EXCLUDE_FROM_ALL osd_peering_pg_test.cpp osd_peering_pg.cpp)
 target_link_libraries(osd_peering_pg_test tcmalloc_minimal)
+add_dependencies(build_tests osd_peering_pg_test)
+add_test(NAME osd_peering_pg_test COMMAND osd_peering_pg_test)

 # test_allocator
-add_executable(test_allocator test_allocator.cpp allocator.cpp)
+add_executable(test_allocator EXCLUDE_FROM_ALL test_allocator.cpp allocator.cpp)
+add_dependencies(build_tests test_allocator)
+add_test(NAME test_allocator COMMAND test_allocator)

 # test_cas
 add_executable(test_cas
@@ -274,12 +296,15 @@ target_link_libraries(test_crc32

 # test_cluster_client
 add_executable(test_cluster_client
+	EXCLUDE_FROM_ALL
 	test_cluster_client.cpp
 	pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
-	etcd_state_client.cpp timerfd_manager.cpp ../json11/json11.cpp
+	etcd_state_client.cpp timerfd_manager.cpp str_util.cpp ../json11/json11.cpp
 )
 target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)
 target_include_directories(test_cluster_client PUBLIC ${CMAKE_SOURCE_DIR}/src/mock)
+add_dependencies(build_tests test_cluster_client)
+add_test(NAME test_cluster_client COMMAND test_cluster_client)

 ## test_blockstore, test_shit
 #add_executable(test_blockstore test_blockstore.cpp)
--- a/src/blockstore.h
+++ b/src/blockstore.h
@@ -122,11 +122,14 @@ Output:
 Get a list of all objects in this Blockstore.

 Input:
- oid.stripe = PG alignment
- len = PG count or 0 to list all objects
- offset = PG number
- oid.inode = min inode number or 0 to list all inodes
- version = max inode number or 0 to list all inodes
+- pg_alignment = PG alignment
+- pg_count = PG count or 0 to list all objects
+- pg_number = PG number
+- list_stable_limit = max number of clean objects in the reply (0 = no limit).
+  Dirty objects are guaranteed to be returned from the same interval,
+  i.e. from (min_oid .. min(max_oid, max(returned stable OIDs)))
+- min_oid = min inode/stripe or 0 to list all objects
+- max_oid = max inode/stripe or 0 to list all objects

 Output:
 - retval = total obj_ver_id count
@@ -143,10 +146,27 @@ struct blockstore_op_t
    uint64_t opcode;
    // finish callback
    std::function<void (blockstore_op_t*)> callback;
-    object_id oid;
-    uint64_t version;
-    uint32_t offset;
-    uint32_t len;
+    union
+    {
+        // R/W
+        struct
+        {
+            object_id oid;
+            uint64_t version;
+            uint32_t offset;
+            uint32_t len;
+        };
+        // List
+        struct __attribute__((__packed__))
+        {
+            object_id min_oid;
+            object_id max_oid;
+            uint32_t pg_alignment;
+            uint32_t pg_count;
+            uint32_t pg_number;
+            uint32_t list_stable_limit;
+        };
+    };
    void *buf;
    void *bitmap;
    int retval;
--- a/src/blockstore_flush.cpp
+++ b/src/blockstore_flush.cpp
@@ -162,7 +162,8 @@ void journal_flusher_t::mark_trim_possible()
    if (trim_wanted > 0)
    {
        dequeuing = true;
-        journal_trim_counter++;
+        if (!journal_trim_counter)
+            journal_trim_counter = journal_trim_interval;
        bs->ringloop->wakeup();
    }
 }
--- a/src/blockstore_impl.cpp
+++ b/src/blockstore_impl.cpp
@@ -193,6 +193,7 @@ void blockstore_impl_t::loop()
            }
            if (wr_st == 2)
            {
+                submit_queue[op_idx] = NULL;
                new_idx--;
            }
            if (wr_st == 0)
@@ -324,7 +325,7 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
    {
        // Basic verification not passed
        op->retval = -EINVAL;
-        std::function<void (blockstore_op_t*)>(op->callback)(op);
+        ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
        return;
    }
    if (op->opcode == BS_OP_SYNC_STAB_ALL)
@@ -367,7 +368,7 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
    }
    if ((op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE) && !enqueue_write(op))
    {
-        std::function<void (blockstore_op_t*)>(op->callback)(op);
+        ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
        return;
    }
    // Call constructor without allocating memory. We'll call destructor before returning op back
@@ -444,11 +445,11 @@ void blockstore_impl_t::reshard_clean_db(pool_id_t pool, uint32_t pg_count, uint

 void blockstore_impl_t::process_list(blockstore_op_t *op)
 {
-    uint32_t list_pg = op->offset+1;
-    uint32_t pg_count = op->len;
-    uint64_t pg_stripe_size = op->oid.stripe;
-    uint64_t min_inode = op->oid.inode;
-    uint64_t max_inode = op->version;
+    uint32_t list_pg = op->pg_number+1;
+    uint32_t pg_count = op->pg_count;
+    uint64_t pg_stripe_size = op->pg_alignment;
+    uint64_t min_inode = op->min_oid.inode;
+    uint64_t max_inode = op->max_oid.inode;
    // Check PG
    if (pg_count != 0 && (pg_stripe_size < MIN_DATA_BLOCK_SIZE || list_pg > pg_count))
    {
@@ -495,7 +496,13 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
            stable_alloc += clean_db.size();
        }
    }
-    else
+    if (op->list_stable_limit > 0)
+    {
+        stable_alloc = op->list_stable_limit;
+        if (stable_alloc > 1024*1024)
+            stable_alloc = 1024*1024;
+    }
+    if (stable_alloc < 32768)
    {
        stable_alloc = 32768;
    }
@@ -506,22 +513,21 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
        FINISH_OP(op);
        return;
    }
+    auto max_oid = op->max_oid;
+    bool limited = false;
    for (auto shard_it = clean_db_shards.lower_bound(first_shard);
        shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
        shard_it++)
    {
        auto & clean_db = shard_it->second;
        auto clean_it = clean_db.begin(), clean_end = clean_db.end();
-        if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode)
+        if (op->min_oid.inode != 0 || op->min_oid.stripe != 0)
        {
-            clean_it = clean_db.lower_bound({
-                .inode = min_inode,
-                .stripe = 0,
-            });
-            clean_end = clean_db.upper_bound({
-                .inode = max_inode,
-                .stripe = UINT64_MAX,
-            });
+            clean_it = clean_db.lower_bound(op->min_oid);
+        }
+        if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid))
+        {
+            clean_end = clean_db.upper_bound(max_oid);
        }
        for (; clean_it != clean_end; clean_it++)
        {
@@ -540,11 +546,24 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
                .oid = clean_it->first,
                .version = clean_it->second.version,
            };
+            if (op->list_stable_limit > 0 && !limited && stable_count >= op->list_stable_limit)
+            {
+                limited = true;
+                break;
+            }
+        }
+        if (op->list_stable_limit > 0 && first_shard != last_shard && stable_count > 0)
+        {
+            // To maintain the order, we have to include objects in the same range from other shards.
+            // Skipped while nothing is listed yet: stable[stable_count-1] would be out of bounds
+            std::sort(stable, stable+stable_count);
+            if (stable_count > op->list_stable_limit) stable_count = op->list_stable_limit;
+            max_oid = stable[stable_count-1].oid;
+        }
    }
-    if (first_shard != last_shard)
+    if (op->list_stable_limit == 0 && first_shard != last_shard)
    {
-        // If that's not a per-PG listing, sort clean entries
+        // If that's not a per-PG listing, sort clean entries (already sorted if list_stable_limit != 0)
        std::sort(stable, stable+stable_count);
    }
    int clean_stable_count = stable_count;
@@ -553,20 +572,17 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
    obj_ver_id *unstable = NULL;
    {
        auto dirty_it = dirty_db.begin(), dirty_end = dirty_db.end();
-        if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode)
+        if (op->min_oid.inode != 0 || op->min_oid.stripe != 0)
        {
            dirty_it = dirty_db.lower_bound({
-                .oid = {
-                    .inode = min_inode,
-                    .stripe = 0,
-                },
+                .oid = op->min_oid,
                .version = 0,
            });
+        }
+        if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid))
+        {
            dirty_end = dirty_db.upper_bound({
-                .oid = {
-                    .inode = max_inode,
-                    .stripe = UINT64_MAX,
-                },
+                .oid = max_oid,
                .version = UINT64_MAX,
            });
        }
@@ -582,7 +598,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
                        replace_stable(dirty_it->first.oid, 0, clean_stable_count, stable_count, stable);
                    }
                }
-                else if (IS_STABLE(dirty_it->second.state))
+                else if (IS_STABLE(dirty_it->second.state) || (dirty_it->second.state & BS_ST_INSTANT))
                {
                    // First try to replace a clean stable version in the first part of the list
                    if (!replace_stable(dirty_it->first.oid, dirty_it->first.version, 0, clean_stable_count, stable))
--- a/src/blockstore_journal.h
+++ b/src/blockstore_journal.h
@@ -16,6 +16,7 @@
 // FIXME: This value should be dynamic i.e. Blockstore ideally shouldn't allow
 // writing more than can be stabilized afterwards
 #define JOURNAL_STABILIZE_RESERVATION 65536
+#define JOURNAL_INSTANT_RESERVATION 131072

 // Journal entries
 // Journal entries are linked to each other by their crc32 value
--- a/src/blockstore_read.cpp
+++ b/src/blockstore_read.cpp
@@ -286,7 +286,10 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
                {
                    auto used = --journal.used_sectors[rv.journal_sector-1];
                    if (used == 0)
+                    {
                        journal.used_sectors.erase(rv.journal_sector-1);
+                        flusher->mark_trim_possible();
+                    }
                }
            }
        }
--- a/src/blockstore_rollback.cpp
+++ b/src/blockstore_rollback.cpp
@@ -127,7 +127,6 @@ resume_4:
    {
        mark_rolled_back(*v);
    }
-    flusher->mark_trim_possible();
    // Acknowledge op
    op->retval = 0;
    FINISH_OP(op);
@@ -232,6 +231,7 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
        if (used == 0)
        {
            journal.used_sectors.erase(dirty_it->second.journal_sector);
+            flusher->mark_trim_possible();
        }
        if (dsk.clean_entry_bitmap_size > sizeof(void*))
        {
--- a/src/blockstore_write.cpp
+++ b/src/blockstore_write.cpp
@@ -89,6 +89,9 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
        else
        {
            // Invalid version requested
+#ifdef BLOCKSTORE_DEBUG
+            printf("Write %lx:%lx v%lu requested, but we already have v%lu\n", op->oid.inode, op->oid.stripe, op->version, version);
+#endif
            op->retval = -EEXIST;
            if (!is_del && dsk.clean_entry_bitmap_size > sizeof(void*))
            {
@@ -115,8 +118,8 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
    else if (!wait_del)
        printf("Write %lx:%lx v%lu offset=%u len=%u\n", op->oid.inode, op->oid.stripe, op->version, op->offset, op->len);
 #endif
-    // FIXME No strict need to add it into dirty_db here, it's just left
-    // from the previous implementation where reads waited for writes
+    // No strict need to add it into dirty_db here except maybe for listings to return
+    // correct data when there are inflight operations in the queue
    uint32_t state;
    if (is_del)
        state = BS_ST_DELETE | BS_ST_IN_FLIGHT;
@@ -182,9 +185,15 @@ void blockstore_impl_t::cancel_all_writes(blockstore_op_t *op, blockstore_dirty_
    bool found = false;
    for (auto other_op: submit_queue)
    {
-        // <op> may be present in queue multiple times due to moving operations in submit_queue
-        if (other_op == op)
+        if (!other_op)
+        {
+            // operations freed during submission are replaced with NULL in the queue
+        }
+        else if (other_op == op)
+        {
+            // <op> may be present in queue multiple times due to moving operations in submit_queue
            found = true;
+        }
        else if (found && other_op->oid == op->oid &&
            (other_op->opcode == BS_OP_WRITE || other_op->opcode == BS_OP_WRITE_STABLE))
        {
@@ -252,7 +261,8 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
    {
        blockstore_journal_check_t space_check(this);
        if (!space_check.check_available(op, unsynced_big_write_count + 1,
-            sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, JOURNAL_STABILIZE_RESERVATION))
+            sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size,
+            (dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION))
        {
            return 0;
        }
@@ -332,7 +342,8 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
            !space_check.check_available(op, unsynced_big_write_count,
                sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, 0)
            || !space_check.check_available(op, 1,
-                sizeof(journal_entry_small_write) + dsk.clean_entry_bitmap_size, op->len + JOURNAL_STABILIZE_RESERVATION))
+                sizeof(journal_entry_small_write) + dsk.clean_entry_bitmap_size,
+                op->len + ((dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION)))
        {
            return 0;
        }
@@ -443,18 +454,19 @@ int blockstore_impl_t::continue_write(blockstore_op_t *op)
 resume_2:
    // Only for the immediate_commit mode: prepare and submit big_write journal entry
    {
-        blockstore_journal_check_t space_check(this);
-        if (!space_check.check_available(op, 1,
-            sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, JOURNAL_STABILIZE_RESERVATION))
-        {
-            return 0;
-        }
-        BS_SUBMIT_CHECK_SQES(1);
        auto dirty_it = dirty_db.find((obj_ver_id){
            .oid = op->oid,
            .version = op->version,
        });
        assert(dirty_it != dirty_db.end());
+        blockstore_journal_check_t space_check(this);
+        if (!space_check.check_available(op, 1,
+            sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size,
+            ((dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION)))
+        {
+            return 0;
+        }
+        BS_SUBMIT_CHECK_SQES(1);
        journal_entry_big_write *je = (journal_entry_big_write*)prefill_single_journal_entry(
            journal, op->opcode == BS_OP_WRITE_STABLE ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
            sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size
@@ -641,7 +653,7 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
    });
    assert(dirty_it != dirty_db.end());
    blockstore_journal_check_t space_check(this);
-    if (!space_check.check_available(op, 1, sizeof(journal_entry_del), JOURNAL_STABILIZE_RESERVATION))
+    if (!space_check.check_available(op, 1, sizeof(journal_entry_del), JOURNAL_INSTANT_RESERVATION))
    {
        return 0;
    }
--- a/src/cli_df.cpp
+++ b/src/cli_df.cpp
@@ -121,8 +121,7 @@ resume_1:
            }
            if (pool_cfg.scheme != POOL_SCHEME_REPLICATED)
            {
-                uint64_t pg_real_size = pool_stats[pool_cfg.id]["pg_real_size"].uint64_value();
-                pool_avail = pg_real_size > 0 ? pool_avail * (pool_cfg.pg_size - pool_cfg.parity_chunks) / pg_real_size : 0;
+                pool_avail *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
            }
            pool_stats[pool_cfg.id] = json11::Json::object {
                { "name", pool_cfg.name },
--- a/src/cli_merge.cpp
+++ b/src/cli_merge.cpp
@@ -403,7 +403,7 @@ struct snap_merger_t
        op->opcode = OSD_OP_READ_BITMAP;
        op->inode = target;
        op->offset = offset;
-        op->len = 0;
+        op->len = target_block_size;
        op->callback = [this](cluster_op_t *op)
        {
            if (op->retval < 0)
--- a/src/cli_rm_data.cpp
+++ b/src/cli_rm_data.cpp
@@ -92,6 +92,7 @@ struct rm_inode_t

    void send_ops(rm_pg_t *cur_list)
    {
+        parent->cli->init_msgr();
        if (parent->cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) ==
            parent->cli->msgr.osd_peer_fds.end())
        {
--- a/src/cli_rm_osd.cpp
+++ b/src/cli_rm_osd.cpp
@@ -5,6 +5,7 @@
 #include "cli.h"
 #include "cluster_client.h"
 #include "str_util.h"
+#include "epoll_manager.h"

 #include <algorithm>

@@ -14,13 +15,21 @@ struct rm_osd_t
    cli_tool_t *parent;

    bool dry_run, force_warning, force_dataloss;
+    uint64_t etcd_tx_retry_ms = 500;
+    uint64_t etcd_tx_retries = 10000;
    std::vector<uint64_t> osd_ids;

    int state = 0;
    cli_result_t result;

    std::set<uint64_t> to_remove;
+    std::set<uint64_t> to_restart;
    json11::Json::array pool_effects;
+    json11::Json::array history_updates, history_checks;
+    json11::Json new_pgs, new_clean_pgs;
+    uint64_t new_pgs_mod_rev, new_clean_pgs_mod_rev;
+    uint64_t cur_retry = 0;
+    uint64_t retry_wait = 0;
    bool is_warning, is_dataloss;

    bool is_done()
@@ -32,6 +41,12 @@ struct rm_osd_t
    {
        if (state == 1)
            goto resume_1;
+        else if (state == 2)
+            goto resume_2;
+        else if (state == 3)
+            goto resume_3;
+        else if (state == 4)
+            goto resume_4;
        if (!osd_ids.size())
        {
            result = (cli_result_t){ .err = EINVAL, .text = "OSD numbers are not specified" };
@@ -152,14 +167,48 @@ struct rm_osd_t
            result.text = error;
            if (dry_run || is_dataloss && !force_dataloss || is_warning && !force_warning)
            {
-                result.err = is_dataloss || is_warning ? EBUSY : 0;
+                result.err = is_dataloss && !force_dataloss || is_warning && !force_warning ? EBUSY : 0;
                state = 100;
                return;
            }
        }
+        parent->etcd_txn(json11::Json::object { { "success", json11::Json::array {
+            json11::Json::object {
+                { "request_range", json11::Json::object {
+                    { "key", base64_encode(
+                        parent->cli->st_cli.etcd_prefix+"/config/pgs"
+                    ) },
+                } },
+            },
+            json11::Json::object {
+                { "request_range", json11::Json::object {
+                    { "key", base64_encode(
+                        parent->cli->st_cli.etcd_prefix+"/history/last_clean_pgs"
+                    ) },
+                } },
+            },
+        } } });
+    resume_4:
+        state = 4;
+        if (parent->waiting > 0)
+            return;
+        if (parent->etcd_err.err)
+        {
+            result = parent->etcd_err;
+            state = 100;
+            return;
+        }
+        {
+            auto kv = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][0]["response_range"]["kvs"][0]);
+            new_pgs = remove_osds_from_pgs(kv);
+            new_pgs_mod_rev = kv.mod_revision;
+            kv = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][1]["response_range"]["kvs"][0]);
+            new_clean_pgs = remove_osds_from_pgs(kv);
+            new_clean_pgs_mod_rev = kv.mod_revision;
+        }
        // Remove keys from etcd
        {
-            json11::Json::array rm_items;
+            json11::Json::array rm_items, rm_checks;
            for (auto osd_id: osd_ids)
            {
                rm_items.push_back("/config/osd/"+std::to_string(osd_id));
@@ -178,7 +227,39 @@ struct rm_osd_t
                    } },
                };
            }
-            parent->etcd_txn(json11::Json::object { { "success", rm_items } });
+            if (!new_pgs.is_null())
+            {
+                auto pgs_key = base64_encode(parent->cli->st_cli.etcd_prefix+"/config/pgs");
+                rm_items.push_back(json11::Json::object {
+                    { "request_put", json11::Json::object {
+                        { "key", pgs_key },
+                        { "value", base64_encode(new_pgs.dump()) },
+                    } },
+                });
+                rm_checks.push_back(json11::Json::object {
+                    { "target", "MOD" },
+                    { "key", pgs_key },
+                    { "result", "LESS" },
+                    { "mod_revision", new_pgs_mod_rev+1 },
+                });
+            }
+            if (!new_clean_pgs.is_null())
+            {
+                auto pgs_key = base64_encode(parent->cli->st_cli.etcd_prefix+"/history/last_clean_pgs");
+                rm_items.push_back(json11::Json::object {
+                    { "request_put", json11::Json::object {
+                        { "key", pgs_key },
+                        { "value", base64_encode(new_clean_pgs.dump()) },
+                    } },
+                });
+                rm_checks.push_back(json11::Json::object {
+                    { "target", "MOD" },
+                    { "key", pgs_key },
+                    { "result", "LESS" },
+                    { "mod_revision", new_clean_pgs_mod_rev+1 },
+                });
+            }
+            parent->etcd_txn(json11::Json::object { { "success", rm_items }, { "checks", rm_checks } });
        }
    resume_1:
        state = 1;
@@ -190,6 +271,46 @@ struct rm_osd_t
            state = 100;
            return;
        }
+        // Remove old OSD from PG all_peers to prevent left_on_dead and from
+        // target_history to prevent INCOMPLETE if --allow-data-loss is specified
+        for (auto & rsp: parent->etcd_result["responses"].array_items())
+        {
+            if (rsp["response_delete_range"]["deleted"].uint64_value() > 0)
+            {
+                // Wait for mon_change_timeout before updating PG history, or the monitor's change will likely interfere with ours
+                retry_wait = parent->cli->merged_config["mon_change_timeout"].uint64_value();
+                if (!retry_wait)
+                    retry_wait = 1000;
+                retry_wait += etcd_tx_retry_ms;
+            }
+        }
+        while (1)
+        {
+    resume_2:
+            if (!remove_osds_from_history(2))
+                return;
+    resume_3:
+            state = 3;
+            if (parent->waiting > 0)
+                return;
+            if (parent->etcd_err.err)
+            {
+                result = parent->etcd_err;
+                state = 100;
+                return;
+            }
+            if (parent->etcd_result["succeeded"].bool_value())
+                break;
+            if ((++cur_retry) >= etcd_tx_retries)
+            {
+                result.err = EAGAIN;
+                result.text += "Failed to remove OSDs from PG history due to update conflicts."
+                    " Some PGs may remain left_on_dead or incomplete. Please retry later\n";
+                state = 100;
+                return;
+            }
+            retry_wait = etcd_tx_retry_ms;
+        }
        std::string ids = "";
        for (auto osd_id: osd_ids)
        {
@@ -200,6 +321,141 @@ struct rm_osd_t
        result.text = (result.text != "" ? ids+"\n"+result.text : ids);
        result.err = 0;
    }
+
+    // Return the PG configuration from <kv> with every OSD found in to_remove replaced by 0
+    // in each PG's "osd_set". A null value is returned as is; non-object pools are copied untouched.
+    json11::Json remove_osds_from_pgs(const etcd_kv_t & kv)
+    {
+        if (kv.value.is_null())
+            return kv.value;
+        json11::Json::object pools_out;
+        for (auto & pool_kv: kv.value["items"].object_items())
+        {
+            if (!pool_kv.second.is_object())
+            {
+                pools_out[pool_kv.first] = pool_kv.second;
+                continue;
+            }
+            json11::Json::object pool_out;
+            for (auto & pg_kv: pool_kv.second.object_items())
+            {
+                json11::Json::array filtered_set;
+                for (auto & osd_json: pg_kv.second["osd_set"].array_items())
+                {
+                    uint64_t osd_num = osd_json.uint64_value();
+                    filtered_set.push_back(osd_num != 0 && to_remove.find(osd_num) != to_remove.end() ? 0 : osd_num);
+                }
+                json11::Json::object pg_out = pg_kv.second.object_items();
+                pg_out["osd_set"] = filtered_set;
+                pool_out[pg_kv.first] = pg_out;
+            }
+            pools_out[pool_kv.first] = pool_out;
+        }
+        auto patched = kv.value.object_items();
+        patched["items"] = pools_out;
+        return patched;
+    }
+
+    // Remove the OSDs listed in to_remove from all_peers and target_history of every PG
+    // and put the changed histories to etcd in one transaction guarded by mod_revision checks.
+    // Returns false while parent->waiting is non-zero after arming the retry_wait timer, else true.
+    bool remove_osds_from_history(int base_state)
+    {
+        if (state == base_state+0)
+            goto resume_0;
+        history_updates.clear();
+        history_checks.clear();
+        for (auto & pp: parent->cli->st_cli.pool_config)
+        {
+            auto & pool_cfg = pp.second;
+            for (auto & pgp: pool_cfg.pg_config)
+            {
+                bool update_pg_history = false; // reset for each PG so that untouched PGs are not rewritten
+                auto & pg_cfg = pgp.second;
+                for (int i = 0; i < pg_cfg.all_peers.size(); i++)
+                {
+                    if (to_remove.find(pg_cfg.all_peers[i]) != to_remove.end())
+                    {
+                        update_pg_history = true;
+                        pg_cfg.all_peers.erase(pg_cfg.all_peers.begin()+i, pg_cfg.all_peers.begin()+i+1);
+                        i--; // re-examine the element shifted into position i
+                    }
+                }
+                for (int i = 0; i < pg_cfg.target_history.size(); i++)
+                {
+                    int hist_size = 0, hist_rm = 0;
+                    for (auto & old_osd: pg_cfg.target_history[i])
+                    {
+                        if (old_osd != 0)
+                        {
+                            hist_size++;
+                            if (to_remove.find(old_osd) != to_remove.end())
+                            {
+                                hist_rm++;
+                                old_osd = 0;
+                            }
+                        }
+                    }
+                    if (hist_rm > 0)
+                    {
+                        if (hist_size-hist_rm == 0)
+                        {
+                            // Every non-zero OSD of this history item was removed, drop the whole item
+                            pg_cfg.target_history.erase(pg_cfg.target_history.begin()+i, pg_cfg.target_history.begin()+i+1);
+                            i--;
+                        }
+                        update_pg_history = true;
+                    }
+                }
+                if (update_pg_history)
+                {
+                    std::string history_key = base64_encode(parent->cli->st_cli.etcd_prefix+"/pg/history/"+
+                        std::to_string(pool_cfg.id)+"/"+std::to_string(pgp.first));
+                    auto hist = json11::Json::object {
+                        { "epoch", pg_cfg.epoch },
+                        { "all_peers", pg_cfg.all_peers },
+                        { "osd_sets", pg_cfg.target_history },
+                    };
+                    if (pg_cfg.scrub_ts)
+                        hist["scrub_ts"] = pg_cfg.scrub_ts;
+                    history_updates.push_back(json11::Json::object {
+                        { "request_put", json11::Json::object {
+                            { "key", history_key },
+                            { "value", base64_encode(json11::Json(hist).dump()) },
+                        } },
+                    });
+                    history_checks.push_back(json11::Json::object {
+                        { "target", "MOD" },
+                        { "key", history_key },
+                        { "result", "LESS" },
+                        { "mod_revision", parent->cli->st_cli.etcd_watch_revision+1 },
+                    });
+                }
+            }
+        }
+        if (history_updates.size())
+        {
+            if (retry_wait)
+            {
+                parent->waiting++;
+                parent->epmgr->tfd->set_timer(retry_wait, false, [this](int timer_id)
+                {
+                    parent->waiting--;
+                    parent->ringloop->wakeup();
+                });
+    resume_0:
+                state = base_state+0;
+                if (parent->waiting > 0)
+                    return false;
+            }
+            parent->etcd_txn(json11::Json::object {
+                { "success", history_updates },
+                { "compare", history_checks },
+            });
+        }
+        else
+            parent->etcd_result = json11::Json::object{ { "succeeded", true } };
+        return true;
+    }
 };

 std::function<bool(cli_result_t &)> cli_tool_t::start_rm_osd(json11::Json cfg)
@@ -209,6 +465,14 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_rm_osd(json11::Json cfg)
    rm_osd->dry_run = cfg["dry_run"].bool_value();
    rm_osd->force_dataloss = cfg["allow_data_loss"].bool_value();
    rm_osd->force_warning = rm_osd->force_dataloss || cfg["force"].bool_value();
+    if (!cfg["etcd_tx_retries"].is_null())
+        rm_osd->etcd_tx_retries = cfg["etcd_tx_retries"].uint64_value();
+    if (!cfg["etcd_tx_retry_ms"].is_null())
+    {
+        rm_osd->etcd_tx_retry_ms = cfg["etcd_tx_retry_ms"].uint64_value();
+        if (rm_osd->etcd_tx_retry_ms < 100)
+            rm_osd->etcd_tx_retry_ms = 100;
+    }
    if (cfg["osd_id"].is_number() || cfg["osd_id"].is_string())
        rm_osd->osd_ids.push_back(cfg["osd_id"].uint64_value());
    else
--- a/src/cluster_client.cpp
+++ b/src/cluster_client.cpp
@@ -59,7 +59,6 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
        delete op;
    };
    msgr.parse_config(this->config);
-    msgr.init();

    st_cli.tfd = tfd;
    st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
@@ -73,17 +72,6 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd

    scrap_buffer_size = SCRAP_BUFFER_SIZE;
    scrap_buffer = malloc_or_die(scrap_buffer_size);
-
-    if (ringloop)
-    {
-        consumer.loop = [this]()
-        {
-            msgr.read_requests();
-            msgr.send_replies();
-            this->ringloop->submit();
-        };
-        ringloop->register_consumer(&consumer);
-    }
 }

 cluster_client_t::~cluster_client_t()
@@ -115,6 +103,24 @@ cluster_op_t::~cluster_op_t()
    }
 }

+// Initialise the messenger on the first call; later calls are no-ops.
+// When a ring loop is set, also register the messenger consumer with it.
+void cluster_client_t::init_msgr()
+{
+    if (!msgr_initialized)
+    {
+        msgr.init();
+        msgr_initialized = true;
+        if (ringloop)
+        {
+            // Consumer callback: read incoming requests, send pending replies,
+            // then submit the ring loop
+            consumer.loop = [this]() { msgr.read_requests(); msgr.send_replies(); this->ringloop->submit(); };
+            ringloop->register_consumer(&consumer);
+        }
+    }
+}
+
 void cluster_client_t::calc_wait(cluster_op_t *op)
 {
    op->prev_wait = 0;
@@ -143,7 +149,7 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
        if (!op->prev_wait)
            continue_sync(op);
    }
-    else /* if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP) */
+    else /* if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP) */
    {
        for (auto prev = op_queue_head; prev && prev != op; prev = prev->next)
        {
@@ -151,7 +157,8 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
            {
                op->prev_wait++;
            }
-            else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ || prev->opcode == OSD_OP_READ_BITMAP)
+            else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ ||
+                prev->opcode == OSD_OP_READ_BITMAP || prev->opcode == OSD_OP_READ_CHAIN_BITMAP)
            {
                // Flushes are always in the beginning (we're scanning from the beginning of the queue)
                break;
@@ -171,7 +178,8 @@ void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *n
            auto n2 = next->next;
            if (next->opcode == OSD_OP_SYNC && !(flags & OP_IMMEDIATE_COMMIT) ||
                next->opcode == OSD_OP_WRITE && (flags & OP_FLUSH_BUFFER) && !(next->flags & OP_FLUSH_BUFFER) ||
-                (next->opcode == OSD_OP_READ || next->opcode == OSD_OP_READ_BITMAP) && (flags & OP_FLUSH_BUFFER))
+                (next->opcode == OSD_OP_READ || next->opcode == OSD_OP_READ_BITMAP ||
+                    next->opcode == OSD_OP_READ_CHAIN_BITMAP) && (flags & OP_FLUSH_BUFFER))
            {
                next->prev_wait += inc;
                assert(next->prev_wait >= 0);
@@ -221,11 +229,14 @@ void cluster_client_t::erase_op(cluster_op_t *op)
    if (op_queue_tail == op)
        op_queue_tail = op->prev;
    op->next = op->prev = NULL;
+    if (flags & OP_FLUSH_BUFFER)
+        std::function<void(cluster_op_t*)>(op->callback)(op);
    if (!(flags & OP_IMMEDIATE_COMMIT))
        inc_wait(opcode, flags, next, -1);
    // Call callback at the end to avoid inconsistencies in prev_wait
    // if the callback adds more operations itself
-    std::function<void(cluster_op_t*)>(op->callback)(op);
+    if (!(flags & OP_FLUSH_BUFFER))
+        std::function<void(cluster_op_t*)>(op->callback)(op);
 }

 void cluster_client_t::continue_ops(bool up_retry)
@@ -337,7 +348,8 @@ void cluster_client_t::on_change_hook(std::map<std::string, etcd_kv_t> & changes
            // And now they have to be resliced!
            for (auto op = op_queue_head; op; op = op->next)
            {
-                if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP) &&
+                if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ ||
+                    op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP) &&
                    INODE_POOL(op->cur_inode) == pool_item.first)
                {
                    op->needs_reslice = true;
@@ -409,7 +421,7 @@ void cluster_client_t::on_ready(std::function<void(void)> fn)
 void cluster_client_t::execute(cluster_op_t *op)
 {
    if (op->opcode != OSD_OP_SYNC && op->opcode != OSD_OP_READ &&
-        op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_WRITE)
+        op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_WRITE)
    {
        op->retval = -EINVAL;
        std::function<void(cluster_op_t*)>(op->callback)(op);
@@ -441,7 +453,7 @@ void cluster_client_t::execute(cluster_op_t *op)
            return;
        }
        // Check alignment
-        if ((op->opcode == OSD_OP_READ || op->opcode == OSD_OP_WRITE) && !op->len ||
+        if (!op->len && (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP || op->opcode == OSD_OP_WRITE) ||
            op->offset % pool_it->second.bitmap_granularity || op->len % pool_it->second.bitmap_granularity)
        {
            op->retval = -EINVAL;
@@ -702,8 +714,7 @@ resume_3:
        // Finished successfully
        // Even if the PG count has changed in meanwhile we treat it as success
        // because if some operations were invalid for the new PG count we'd get errors
-        bool is_read = op->opcode == OSD_OP_READ;
-        if (is_read)
+        if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
        {
            // Check parent inode
            auto ino_it = st_cli.inode_config.find(op->cur_inode);
@@ -727,6 +738,11 @@ resume_3:
            }
        }
        op->retval = op->len;
+        if (op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
+        {
+            auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->inode));
+            op->retval = op->len / pool_cfg.bitmap_granularity;
+        }
        erase_op(op);
        return 1;
    }
@@ -750,7 +766,10 @@ resume_3:
        {
            for (int i = 0; i < op->parts.size(); i++)
            {
-                op->parts[i].flags = PART_RETRY;
+                if (!(op->parts[i].flags & PART_DONE))
+                {
+                    op->parts[i].flags = PART_RETRY;
+                }
            }
            goto resume_2;
        }
@@ -809,23 +828,19 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
    uint64_t last_stripe = op->len > 0 ? ((op->offset + op->len - 1) / pg_block_size) * pg_block_size : first_stripe;
    op->retval = 0;
    op->parts.resize((last_stripe - first_stripe) / pg_block_size + 1);
-    if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP)
+    if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
    {
        // Allocate memory for the bitmap
-        unsigned object_bitmap_size = (((op->opcode == OSD_OP_READ_BITMAP ? pg_block_size : op->len) / pool_cfg.bitmap_granularity + 7) / 8);
+        unsigned object_bitmap_size = ((op->len / pool_cfg.bitmap_granularity + 7) / 8);
        object_bitmap_size = (object_bitmap_size < 8 ? 8 : object_bitmap_size);
        unsigned bitmap_mem = object_bitmap_size + (pool_cfg.data_block_size / pool_cfg.bitmap_granularity / 8 * pg_data_size) * op->parts.size();
-        if (op->bitmap_buf_size < bitmap_mem)
+        if (!op->bitmap_buf || op->bitmap_buf_size < bitmap_mem)
        {
            op->bitmap_buf = realloc_or_die(op->bitmap_buf, bitmap_mem);
-            if (!op->bitmap_buf_size)
-            {
-                // First allocation
-                memset(op->bitmap_buf, 0, object_bitmap_size);
-            }
            op->part_bitmaps = (uint8_t*)op->bitmap_buf + object_bitmap_size;
            op->bitmap_buf_size = bitmap_mem;
        }
+        memset(op->bitmap_buf, 0, bitmap_mem);
    }
    int iov_idx = 0;
    size_t iov_pos = 0;
@@ -876,13 +891,14 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
            if (end == begin)
                op->done_count++;
        }
-        else if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_DELETE)
+        else if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_DELETE)
        {
            add_iov(end-begin, false, op, iov_idx, iov_pos, op->parts[i].iov, NULL, 0);
        }
        op->parts[i].parent = op;
        op->parts[i].offset = begin;
-        op->parts[i].len = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_DELETE ? 0 : (uint32_t)(end - begin);
+        op->parts[i].len = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP ||
+            op->opcode == OSD_OP_DELETE ? 0 : (uint32_t)(end - begin);
        op->parts[i].pg_num = pg_num;
        op->parts[i].osd_num = 0;
        op->parts[i].flags = 0;
@@ -911,6 +927,10 @@ bool cluster_client_t::affects_osd(uint64_t inode, uint64_t offset, uint64_t len

 bool cluster_client_t::try_send(cluster_op_t *op, int i)
 {
+    if (!msgr_initialized)
+    {
+        init_msgr();
+    }
    auto part = &op->parts[i];
    auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->cur_inode));
    auto pg_it = pool_cfg.pg_config.find(part->pg_num);
@@ -929,7 +949,7 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
                pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks
            );
            uint64_t meta_rev = 0;
-            if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_DELETE)
+            if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_DELETE)
            {
                auto ino_it = st_cli.inode_config.find(op->inode);
                if (ino_it != st_cli.inode_config.end())
@@ -942,7 +962,7 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
                    .header = {
                        .magic = SECONDARY_OSD_OP_MAGIC,
                        .id = next_op_id(),
-                        .opcode = op->opcode == OSD_OP_READ_BITMAP ? OSD_OP_READ : op->opcode,
+                        .opcode = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP ? OSD_OP_READ : op->opcode,
                    },
                    .inode = op->cur_inode,
                    .offset = part->offset,
@@ -950,8 +970,10 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
                    .meta_revision = meta_rev,
                    .version = op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE ? op->version : 0,
                } },
-                .bitmap = (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP ? (uint8_t*)op->part_bitmaps + pg_bitmap_size*i : NULL),
-                .bitmap_len = (unsigned)(op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP ? pg_bitmap_size : 0),
+                .bitmap = (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP
+                    ? (uint8_t*)op->part_bitmaps + pg_bitmap_size*i : NULL),
+                .bitmap_len = (unsigned)(op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP
+                    ? pg_bitmap_size : 0),
                .callback = [this, part](osd_op_t *op_part)
                {
                    handle_op_part(part);
@@ -1130,11 +1152,11 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
    else
    {
        // OK
-        if (!(op->flags & OP_IMMEDIATE_COMMIT))
+        if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE) && !(op->flags & OP_IMMEDIATE_COMMIT))
            dirty_osds.insert(part->osd_num);
        part->flags |= PART_DONE;
        op->done_count++;
-        if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP)
+        if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
        {
            copy_part_bitmap(op, part);
            op->version = op->parts.size() == 1 ? part->op.reply.rw.version : 0;
@@ -1158,7 +1180,12 @@ void cluster_client_t::copy_part_bitmap(cluster_op_t *op, cluster_op_part_t *par
    );
    uint32_t object_offset = (part->op.req.rw.offset - op->offset) / pool_cfg.bitmap_granularity;
    uint32_t part_offset = (part->op.req.rw.offset % pg_block_size) / pool_cfg.bitmap_granularity;
-    uint32_t part_len = (op->opcode == OSD_OP_READ_BITMAP ? pg_block_size : part->op.req.rw.len) / pool_cfg.bitmap_granularity;
+    uint32_t op_len = op->len / pool_cfg.bitmap_granularity;
+    uint32_t part_len = pg_block_size/pool_cfg.bitmap_granularity - part_offset;
+    if (part_len > op_len-object_offset)
+    {
+        part_len = op_len-object_offset;
+    }
    if (!(object_offset & 0x7) && !(part_offset & 0x7) && (part_len >= 8))
    {
        // Copy bytes
--- a/src/cluster_client.h
+++ b/src/cluster_client.h
@@ -11,6 +11,7 @@
 #define INODE_LIST_DONE 1
 #define INODE_LIST_HAS_UNSTABLE 2
 #define OSD_OP_READ_BITMAP OSD_OP_SEC_READ_BMP
+#define OSD_OP_READ_CHAIN_BITMAP 0x102

 #define OSD_OP_IGNORE_READONLY 0x08

@@ -30,7 +31,7 @@ struct cluster_op_part_t

 struct cluster_op_t
 {
-    uint64_t opcode; // OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC, OSD_OP_DELETE, OSD_OP_READ_BITMAP
+    uint64_t opcode; // OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC, OSD_OP_DELETE, OSD_OP_READ_BITMAP, OSD_OP_READ_CHAIN_BITMAP
    uint64_t inode;
    uint64_t offset;
    uint64_t len;
@@ -39,9 +40,13 @@ struct cluster_op_t
    uint64_t version = 0;
    // now only OSD_OP_IGNORE_READONLY is supported
    uint64_t flags = 0;
+    // negative retval is an error number
+    // write and read return len on success
+    // sync and delete return 0 on success
+    // read_bitmap and read_chain_bitmap return the length of bitmap in bits(!)
    int retval;
    osd_op_buf_list_t iov;
-    // READ and READ_BITMAP return the bitmap here
+    // READ, READ_BITMAP, READ_CHAIN_BITMAP return the bitmap here
    void *bitmap_buf = NULL;
    std::function<void(cluster_op_t*)> callback;
    ~cluster_op_t();
@@ -99,10 +104,14 @@ class cluster_client_t
    std::vector<std::function<void(void)>> on_ready_hooks;
    std::vector<inode_list_t*> lists;
    int continuing_ops = 0;
+    bool msgr_initialized = false;

 public:
    etcd_state_client_t st_cli;
+
    osd_messenger_t msgr;
+    void init_msgr();
+
    json11::Json config;
    json11::Json::object merged_config;

--- a/src/disk_tool_udev.cpp
+++ b/src/disk_tool_udev.cpp
@@ -387,6 +387,14 @@ int disk_tool_t::purge_devices(const std::vector<std::string> & devices)
        rm_osd_cli.push_back(std::to_string(osd_num));
    }
    // Check for data loss
+    if (options["force"] != "")
+    {
+        rm_osd_cli.push_back("--force");
+    }
+    else if (options["allow_data_loss"] != "")
+    {
+        rm_osd_cli.push_back("--allow-data-loss");
+    }
    rm_osd_cli.push_back("--dry-run");
    std::string dry_run_ignore_stdout;
    if (shell_exec(rm_osd_cli, "", &dry_run_ignore_stdout, NULL) != 0)
@@ -405,14 +413,6 @@ int disk_tool_t::purge_devices(const std::vector<std::string> & devices)
    }
    // Remove OSD metadata
    rm_osd_cli.pop_back();
-    if (options["force"] != "")
-    {
-        rm_osd_cli.push_back("--force");
-    }
-    else if (options["allow_data_loss"] != "")
-    {
-        rm_osd_cli.push_back("--allow-data-loss");
-    }
    if (shell_exec(rm_osd_cli, "", NULL, NULL) != 0)
    {
        return 1;
--- a/src/disk_tool_utils.cpp
+++ b/src/disk_tool_utils.cpp
@@ -305,10 +305,10 @@ int write_zero(int fd, uint64_t offset, uint64_t size)
 json11::Json read_parttable(std::string dev)
 {
    std::string part_dump;
-    int r = shell_exec({ "sfdisk", "--dump", dev, "--json" }, "", &part_dump, NULL);
+    int r = shell_exec({ "sfdisk", "--json", dev }, "", &part_dump, NULL);
    if (r == 255)
    {
-        fprintf(stderr, "Error running sfdisk --dump %s --json\n", dev.c_str());
+        fprintf(stderr, "Error running sfdisk --json %s\n", dev.c_str());
        return json11::Json(false);
    }
    // Decode partition table
@@ -319,7 +319,7 @@ json11::Json read_parttable(std::string dev)
        pt = json11::Json::parse(part_dump, err);
        if (err != "")
        {
-            fprintf(stderr, "sfdisk --dump %s --json returned bad JSON: %s\n", dev.c_str(), part_dump.c_str());
+            fprintf(stderr, "sfdisk --json %s returned bad JSON: %s\n", dev.c_str(), part_dump.c_str());
            return json11::Json(false);
        }
        pt = pt["partitiontable"];
--- a/src/etcd_state_client.cpp
+++ b/src/etcd_state_client.cpp
@@ -7,8 +7,8 @@
 #ifndef __MOCK__
 #include "addr_util.h"
 #include "http_client.h"
-#include "str_util.h"
 #endif
+#include "str_util.h"

 etcd_state_client_t::~etcd_state_client_t()
 {
@@ -759,6 +759,10 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
                fprintf(stderr, "Pool %u has invalid bitmap_granularity (must divide block_size), skipping pool\n", pool_id);
                continue;
            }
+            // Scrub Interval
+            pc.scrub_interval = parse_time(pool_item.second["scrub_interval"].string_value());
+            if (!pc.scrub_interval)
+                pc.scrub_interval = 0;
            // Immediate Commit Mode
            pc.immediate_commit = pool_item.second["immediate_commit"].is_string()
                ? (pool_item.second["immediate_commit"].string_value() == "all"
@@ -871,22 +875,38 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
            pg_cfg.target_history.clear();
            pg_cfg.all_peers.clear();
            // Refuse to start PG if any set of the <osd_sets> has no live OSDs
-            for (auto hist_item: value["osd_sets"].array_items())
+            for (auto & hist_item: value["osd_sets"].array_items())
            {
                std::vector<osd_num_t> history_set;
-                for (auto pg_osd: hist_item.array_items())
+                for (auto & pg_osd: hist_item.array_items())
                {
-                    history_set.push_back(pg_osd.uint64_value());
+                    osd_num_t pg_osd_num = pg_osd.uint64_value();
+                    if (pg_osd_num != 0)
+                    {
+                        auto it = std::lower_bound(history_set.begin(), history_set.end(), pg_osd_num);
+                        if (it == history_set.end() || *it != pg_osd_num)
+                            history_set.insert(it, pg_osd_num);
+                    }
                }
-                pg_cfg.target_history.push_back(history_set);
+                auto it = std::lower_bound(pg_cfg.target_history.begin(), pg_cfg.target_history.end(), history_set);
+                if (it == pg_cfg.target_history.end() || *it != history_set)
+                    pg_cfg.target_history.insert(it, history_set);
            }
            // Include these additional OSDs when peering the PG
            for (auto pg_osd: value["all_peers"].array_items())
            {
-                pg_cfg.all_peers.push_back(pg_osd.uint64_value());
+                osd_num_t pg_osd_num = pg_osd.uint64_value();
+                if (pg_osd_num != 0)
+                {
+                    auto it = std::lower_bound(pg_cfg.all_peers.begin(), pg_cfg.all_peers.end(), pg_osd_num);
+                    if (it == pg_cfg.all_peers.end() || *it != pg_osd_num)
+                        pg_cfg.all_peers.insert(it, pg_osd_num);
+                }
            }
            // Read epoch
            pg_cfg.epoch = value["epoch"].uint64_value();
+            // Scrub timestamp
+            pg_cfg.scrub_ts = parse_time(value["scrub_ts"].string_value());
            if (on_change_pg_history_hook != NULL)
            {
                on_change_pg_history_hook(pool_id, pg_num);
--- a/src/etcd_state_client.h
+++ b/src/etcd_state_client.h
@@ -39,6 +39,7 @@ struct pg_config_t
    osd_num_t cur_primary;
    int cur_state;
    uint64_t epoch;
+    uint64_t scrub_ts;
 };

 struct pool_config_t
@@ -55,6 +56,7 @@ struct pool_config_t
    uint64_t max_osd_combinations;
    uint64_t pg_stripe_size;
    std::map<pg_num_t, pg_config_t> pg_config;
+    uint64_t scrub_interval;
 };

 struct inode_config_t
--- a/src/messenger.cpp
+++ b/src/messenger.cpp
@@ -157,7 +157,7 @@ void osd_messenger_t::parse_config(const json11::Json & config)
        this->rdma_max_sge = 128;
    this->rdma_max_send = config["rdma_max_send"].uint64_value();
    if (!this->rdma_max_send)
-        this->rdma_max_send = 1;
+        this->rdma_max_send = 64;
    this->rdma_max_recv = config["rdma_max_recv"].uint64_value();
    if (!this->rdma_max_recv)
        this->rdma_max_recv = 128;
--- a/src/messenger.h
+++ b/src/messenger.h
@@ -138,6 +138,7 @@ protected:

    std::vector<int> read_ready_clients;
    std::vector<int> write_ready_clients;
+    // We don't use ringloop->set_immediate here because we may have no ringloop in client :)
    std::vector<std::function<void()>> set_immediate;

 public:
--- a/src/msgr_rdma.cpp
+++ b/src/msgr_rdma.cpp
@@ -368,9 +368,8 @@ static void try_send_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
 bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
 {
    auto rc = cl->rdma_conn;
-    if (!cl->send_list.size() || rc->cur_send > 0)
+    if (!cl->send_list.size() || rc->cur_send >= rc->max_send)
    {
-        // Only send one batch at a time
        return true;
    }
    uint64_t op_size = 0, op_sge = 0;
@@ -380,6 +379,7 @@ bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
        iovec & iov = cl->send_list[rc->send_pos];
        if (op_size >= rc->max_msg || op_sge >= rc->max_sge)
        {
+            rc->send_sizes.push_back(op_size);
            try_send_rdma_wr(cl, sge, op_sge);
            op_sge = 0;
            op_size = 0;
@@ -405,18 +405,24 @@ bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
    }
    if (op_sge > 0)
    {
+        rc->send_sizes.push_back(op_size);
        try_send_rdma_wr(cl, sge, op_sge);
    }
    return true;
 }

-static void try_recv_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
+static void try_recv_rdma_wr(osd_client_t *cl, void *buf)
 {
+    ibv_sge sge = {
+        .addr = (uintptr_t)buf,
+        .length = (uint32_t)cl->rdma_conn->max_msg,
+        .lkey = cl->rdma_conn->ctx->mr->lkey,
+    };
    ibv_recv_wr *bad_wr = NULL;
    ibv_recv_wr wr = {
        .wr_id = (uint64_t)(cl->peer_fd*2),
-        .sg_list = sge,
-        .num_sge = op_sge,
+        .sg_list = &sge,
+        .num_sge = 1,
    };
    int err = ibv_post_recv(cl->rdma_conn->qp, &wr, &bad_wr);
    if (err || bad_wr)
@@ -434,12 +440,7 @@ bool osd_messenger_t::try_recv_rdma(osd_client_t *cl)
    {
        void *buf = malloc_or_die(rc->max_msg);
        rc->recv_buffers.push_back(buf);
-        ibv_sge sge = {
-            .addr = (uintptr_t)buf,
-            .length = (uint32_t)rc->max_msg,
-            .lkey = rc->ctx->mr->lkey,
-        };
-        try_recv_rdma_wr(cl, &sge, 1);
+        try_recv_rdma_wr(cl, buf);
    }
    return true;
 }
@@ -476,6 +477,7 @@ void osd_messenger_t::handle_rdma_events()
                continue;
            }
            osd_client_t *cl = cl_it->second;
+            auto rc = cl->rdma_conn;
            if (wc[i].status != IBV_WC_SUCCESS)
            {
                fprintf(stderr, "RDMA work request failed for client %d", client_id);
@@ -489,44 +491,59 @@ void osd_messenger_t::handle_rdma_events()
            }
            if (!is_send)
            {
-                cl->rdma_conn->cur_recv--;
-                if (!handle_read_buffer(cl, cl->rdma_conn->recv_buffers[0], wc[i].byte_len))
+                rc->cur_recv--;
+                if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf], wc[i].byte_len))
                {
                    // handle_read_buffer may stop the client
                    continue;
                }
-                free(cl->rdma_conn->recv_buffers[0]);
-                cl->rdma_conn->recv_buffers.erase(cl->rdma_conn->recv_buffers.begin(), cl->rdma_conn->recv_buffers.begin()+1);
-                try_recv_rdma(cl);
+                try_recv_rdma_wr(cl, rc->recv_buffers[rc->next_recv_buf]);
+                rc->next_recv_buf = (rc->next_recv_buf+1) % rc->recv_buffers.size();
            }
            else
            {
-                cl->rdma_conn->cur_send--;
-                if (!cl->rdma_conn->cur_send)
+                rc->cur_send--;
+                uint64_t sent_size = rc->send_sizes.at(0);
+                rc->send_sizes.erase(rc->send_sizes.begin(), rc->send_sizes.begin()+1);
+                int send_pos = 0, send_buf_pos = 0;
+                while (sent_size > 0)
                {
-                    // Wait for the whole batch
-                    for (int i = 0; i < cl->rdma_conn->send_pos; i++)
+                    if (sent_size >= cl->send_list.at(send_pos).iov_len)
                    {
-                        if (cl->outbox[i].flags & MSGR_SENDP_FREE)
-                        {
-                            // Reply fully sent
-                            delete cl->outbox[i].op;
-                        }
+                        sent_size -= cl->send_list[send_pos].iov_len;
+                        send_pos++;
                    }
-                    if (cl->rdma_conn->send_pos > 0)
+                    else
                    {
-                        cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+cl->rdma_conn->send_pos);
-                        cl->outbox.erase(cl->outbox.begin(), cl->outbox.begin()+cl->rdma_conn->send_pos);
-                        cl->rdma_conn->send_pos = 0;
+                        send_buf_pos = sent_size;
+                        sent_size = 0;
                    }
-                    if (cl->rdma_conn->send_buf_pos > 0)
-                    {
-                        cl->send_list[0].iov_base = (uint8_t*)cl->send_list[0].iov_base + cl->rdma_conn->send_buf_pos;
-                        cl->send_list[0].iov_len -= cl->rdma_conn->send_buf_pos;
-                        cl->rdma_conn->send_buf_pos = 0;
-                    }
-                    try_send_rdma(cl);
                }
+                assert(rc->send_pos >= send_pos);
+                if (rc->send_pos == send_pos)
+                {
+                    rc->send_buf_pos -= send_buf_pos;
+                }
+                rc->send_pos -= send_pos;
+                for (int i = 0; i < send_pos; i++)
+                {
+                    if (cl->outbox[i].flags & MSGR_SENDP_FREE)
+                    {
+                        // Reply fully sent
+                        delete cl->outbox[i].op;
+                    }
+                }
+                if (send_pos > 0)
+                {
+                    cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+send_pos);
+                    cl->outbox.erase(cl->outbox.begin(), cl->outbox.begin()+send_pos);
+                }
+                if (send_buf_pos > 0)
+                {
+                    cl->send_list[0].iov_base = (uint8_t*)cl->send_list[0].iov_base + send_buf_pos;
+                    cl->send_list[0].iov_len -= send_buf_pos;
+                }
+                try_send_rdma(cl);
            }
        }
    } while (event_count > 0);
--- a/src/msgr_rdma.h
+++ b/src/msgr_rdma.h
@@ -49,8 +49,9 @@ struct msgr_rdma_connection_t
    uint64_t max_msg = 0;

    int send_pos = 0, send_buf_pos = 0;
-    int recv_pos = 0, recv_buf_pos = 0;
+    int next_recv_buf = 0;
    std::vector<void*> recv_buffers;
+    std::vector<uint64_t> send_sizes;

    ~msgr_rdma_connection_t();
    static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge, uint32_t max_msg);
--- a/src/osd.cpp
+++ b/src/osd.cpp
@@ -163,6 +163,9 @@ void osd_t::parse_config(const json11::Json & config, bool allow_disk_params)
    recovery_queue_depth = config["recovery_queue_depth"].uint64_value();
    if (recovery_queue_depth < 1 || recovery_queue_depth > MAX_RECOVERY_QUEUE)
        recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
+    recovery_pg_switch = config["recovery_pg_switch"].uint64_value();
+    if (recovery_pg_switch < 1)
+        recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;
    recovery_sync_batch = config["recovery_sync_batch"].uint64_value();
    if (recovery_sync_batch < 1 || recovery_sync_batch > MAX_RECOVERY_QUEUE)
        recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
@@ -175,6 +178,16 @@ void osd_t::parse_config(const json11::Json & config, bool allow_disk_params)
    inode_vanish_time = config["inode_vanish_time"].uint64_value();
    if (!inode_vanish_time)
        inode_vanish_time = 60;
+    global_scrub_interval = config["scrub_interval"].uint64_value();
+    if (!global_scrub_interval)
+        global_scrub_interval = 30*86400;
+    scrub_queue_depth = config["scrub_queue_depth"].uint64_value();
+    if (scrub_queue_depth < 1 || scrub_queue_depth > MAX_RECOVERY_QUEUE)
+        scrub_queue_depth = 1;
+    scrub_sleep_ms = config["scrub_sleep"].uint64_value();
+    scrub_list_limit = config["scrub_list_limit"].uint64_value();
+    if (!scrub_list_limit)
+        scrub_list_limit = 1000;
 }

 void osd_t::bind_socket()
@@ -259,7 +272,8 @@ void osd_t::exec_op(osd_op_t *cur_op)
            cur_op->req.hdr.opcode == OSD_OP_DELETE) &&
            (cur_op->req.rw.len > OSD_RW_MAX ||
            cur_op->req.rw.len % bs_bitmap_granularity ||
-            cur_op->req.rw.offset % bs_bitmap_granularity)))
+            cur_op->req.rw.offset % bs_bitmap_granularity)) ||
+        cur_op->req.hdr.opcode == OSD_OP_SCRUB && cur_op->peer_fd != -1)
    {
        // Bad command
        finish_op(cur_op, -EINVAL);
@@ -276,6 +290,7 @@ void osd_t::exec_op(osd_op_t *cur_op)
        cur_op->req.hdr.opcode != OSD_OP_SEC_LIST &&
        cur_op->req.hdr.opcode != OSD_OP_READ &&
        cur_op->req.hdr.opcode != OSD_OP_SEC_READ_BMP &&
+        cur_op->req.hdr.opcode != OSD_OP_SCRUB &&
        cur_op->req.hdr.opcode != OSD_OP_SHOW_CONFIG)
    {
        // Readonly mode
@@ -306,6 +321,10 @@ void osd_t::exec_op(osd_op_t *cur_op)
    {
        continue_primary_del(cur_op);
    }
+    else if (cur_op->req.hdr.opcode == OSD_OP_SCRUB)
+    {
+        continue_primary_scrub(cur_op);
+    }
    else
    {
        exec_secondary(cur_op);
@@ -370,6 +389,10 @@ void osd_t::print_stats()
            recovery_stat_bytes[1][i] = recovery_stat_bytes[0][i];
        }
    }
+    if (corrupted_objects > 0)
+    {
+        printf("[OSD %lu] %lu object(s) corrupted\n", osd_num, corrupted_objects);
+    }
    if (incomplete_objects > 0)
    {
        printf("[OSD %lu] %lu object(s) incomplete\n", osd_num, incomplete_objects);
@@ -437,10 +460,11 @@ void osd_t::print_slow()
                else if (op->req.hdr.opcode == OSD_OP_SEC_LIST)
                {
                    bufprintf(
-                        " inode=%lx-%lx pg=%u/%u, stripe=%lu",
-                        op->req.sec_list.min_inode, op->req.sec_list.max_inode,
+                        " oid=%lx/%lx-%lx/%lx pg=%u/%u, stripe=%lu, limit=%u",
+                        op->req.sec_list.min_inode, op->req.sec_list.min_stripe,
+                        op->req.sec_list.max_inode, op->req.sec_list.max_stripe,
                        op->req.sec_list.list_pg, op->req.sec_list.pg_count,
-                        op->req.sec_list.pg_stripe_size
+                        op->req.sec_list.pg_stripe_size, op->req.sec_list.stable_limit
                    );
                }
                else if (op->req.hdr.opcode == OSD_OP_READ || op->req.hdr.opcode == OSD_OP_WRITE ||
--- a/src/osd.h
+++ b/src/osd.h
@@ -28,12 +28,14 @@
 #define OSD_PEERING_PGS 0x04
 #define OSD_FLUSHING_PGS 0x08
 #define OSD_RECOVERING 0x10
+#define OSD_SCRUBBING 0x20

 #define MAX_AUTOSYNC_INTERVAL 3600
 #define DEFAULT_AUTOSYNC_INTERVAL 5
 #define DEFAULT_AUTOSYNC_WRITES 128
 #define MAX_RECOVERY_QUEUE 2048
 #define DEFAULT_RECOVERY_QUEUE 4
+#define DEFAULT_RECOVERY_PG_SWITCH 128
 #define DEFAULT_RECOVERY_BATCH 16

 //#define OSD_STUB
@@ -108,9 +110,14 @@ class osd_t
    int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // "emergency" sync every 5 seconds
    int autosync_writes = DEFAULT_AUTOSYNC_WRITES;
    int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
+    int recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;
    int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
    int inode_vanish_time = 60;
    int log_level = 0;
+    uint64_t global_scrub_interval = 30*86400;
+    uint64_t scrub_queue_depth = 1;
+    uint64_t scrub_sleep_ms = 0;
+    uint32_t scrub_list_limit = 1000;

    // cluster state

@@ -132,12 +139,24 @@ class osd_t
    std::set<pool_pg_num_t> dirty_pgs;
    std::set<osd_num_t> dirty_osds;
    int copies_to_delete_after_sync_count = 0;
-    uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0;
+    uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0, corrupted_objects = 0;
    int peering_state = 0;
    std::map<object_id, osd_recovery_op_t> recovery_ops;
-    int recovery_done = 0;
+    std::map<object_id, osd_op_t*> scrub_ops;
+    bool recovery_last_degraded = true;
+    pool_pg_num_t recovery_last_pg;
+    object_id recovery_last_oid;
+    int recovery_pg_done = 0, recovery_done = 0;
    osd_op_t *autosync_op = NULL;

+    // Scrubbing state
+    uint64_t scrub_nearest_ts = 0;
+    int scrub_timer_id = -1;
+    pool_pg_num_t scrub_last_pg;
+    osd_op_t *scrub_list_op = NULL; // initialized like autosync_op so it is never read as an indeterminate pointer
+    pg_list_result_t scrub_cur_list = {};
+    uint64_t scrub_list_pos = 0;
+
    // Unstable writes
    uint64_t unstable_write_count = 0;
    std::map<osd_object_id_t, uint64_t> unstable_writes;
@@ -200,7 +219,6 @@ class osd_t
    bool check_peer_config(osd_client_t *cl, json11::Json conf);
    void repeer_pgs(osd_num_t osd_num);
    void start_pg_peering(pg_t & pg);
-    void submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *ps);
    void submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps);
    void discard_list_subop(osd_op_t *list_op);
    bool stop_pg(pg_t & pg);
@@ -216,6 +234,13 @@ class osd_t
    bool continue_recovery();
    pg_osd_set_state_t* change_osd_set(pg_osd_set_state_t *st, pg_t *pg);

+    // scrub
+    void scrub_list(pool_pg_num_t pg_id, osd_num_t role_osd, object_id min_oid);
+    bool pick_next_scrub(object_id & next_oid);
+    void submit_scrub_op(object_id oid);
+    bool continue_scrub();
+    void schedule_scrub(pg_t & pg);
+
    // op execution
    void exec_op(osd_op_t *cur_op);
    void finish_op(osd_op_t *cur_op, int retval);
@@ -230,13 +255,15 @@ class osd_t
    void autosync();
    bool prepare_primary_rw(osd_op_t *cur_op);
    void continue_primary_read(osd_op_t *cur_op);
+    void continue_primary_scrub(osd_op_t *cur_op);
    void continue_primary_write(osd_op_t *cur_op);
    void cancel_primary_write(osd_op_t *cur_op);
    void continue_primary_sync(osd_op_t *cur_op);
    void continue_primary_del(osd_op_t *cur_op);
    bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
-    void remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg);
-    void free_object_state(pg_t & pg, pg_osd_set_state_t **object_state);
+    void remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t &pg, bool report = true);
+    pg_osd_set_state_t *mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state, osd_rmw_stripe_t *stripes, bool ref);
+    void deref_object_state(pg_t & pg, pg_osd_set_state_t **object_state, bool deref);
    bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
    void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);
    void handle_primary_bs_subop(osd_op_t *subop);
@@ -251,10 +278,11 @@ class osd_t
    int submit_primary_sync_subops(osd_op_t *cur_op);
    void submit_primary_stab_subops(osd_op_t *cur_op);

-    uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state);
+    uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, pg_osd_set_state_t **object_state);

    void continue_chained_read(osd_op_t *cur_op);
    int submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op);
+    void check_corrupted_chained(pg_t & pg, osd_op_t *cur_op);
    void send_chained_read_results(pg_t & pg, osd_op_t *cur_op);
    std::vector<osd_chain_read_t> collect_chained_read_requests(osd_op_t *cur_op);
    int collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vector<bitmap_request_t> & bitmap_requests);
--- a/src/osd_cluster.cpp
+++ b/src/osd_cluster.cpp
@@ -132,7 +132,7 @@ bool osd_t::check_peer_config(osd_client_t *cl, json11::Json conf)
                this->osd_num, immediate_commit == IMMEDIATE_ALL ? "all" : "small",
                cl->osd_num, conf["immediate_commit"].string_value().c_str()
            );
-            return true;
+            return false;
        }
        else if (conf["block_size"].uint64_value() != (uint64_t)this->bs_block_size)
        {
@@ -140,7 +140,7 @@ bool osd_t::check_peer_config(osd_client_t *cl, json11::Json conf)
                "[OSD %lu] My block_size is %u, but peer OSD %lu has %lu. We can't work together\n",
                this->osd_num, this->bs_block_size, cl->osd_num, conf["block_size"].uint64_value()
            );
-            return true;
+            return false;
        }
        else if (conf["bitmap_granularity"].uint64_value() != (uint64_t)this->bs_bitmap_granularity)
        {
@@ -148,7 +148,7 @@ bool osd_t::check_peer_config(osd_client_t *cl, json11::Json conf)
                "[OSD %lu] My bitmap_granularity is %u, but peer OSD %lu has %lu. We can't work together\n",
                this->osd_num, this->bs_bitmap_granularity, cl->osd_num, conf["bitmap_granularity"].uint64_value()
            );
-            return true;
+            return false;
        }
    }
    return true;
@@ -336,6 +336,8 @@ void osd_t::report_statistics()
        pg_stats["misplaced_count"] = pg.misplaced_objects.size();
        pg_stats["degraded_count"] = pg.degraded_objects.size();
        pg_stats["incomplete_count"] = pg.incomplete_objects.size();
+        if (pg.corrupted_count)
+            pg_stats["corrupted_count"] = pg.corrupted_count;
        pg_stats["write_osd_set"] = pg.cur_set;
        txn.push_back(json11::Json::object {
            { "request_put", json11::Json::object {
@@ -382,30 +384,6 @@ void osd_t::on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes
    }
 }

-void osd_t::on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num)
-{
-    auto pg_it = pgs.find({
-        .pool_id = pool_id,
-        .pg_num = pg_num,
-    });
-    if (pg_it != pgs.end() && pg_it->second.epoch > pg_it->second.reported_epoch &&
-        st_cli.pool_config[pool_id].pg_config[pg_num].epoch >= pg_it->second.epoch)
-    {
-        pg_it->second.reported_epoch = st_cli.pool_config[pool_id].pg_config[pg_num].epoch;
-        object_id oid = { 0 };
-        bool first = true;
-        for (auto op: pg_it->second.write_queue)
-        {
-            if (first || oid != op.first)
-            {
-                oid = op.first;
-                first = false;
-                continue_primary_write(op.second);
-            }
-        }
-    }
-}
-
 void osd_t::on_load_config_hook(json11::Json::object & global_config)
 {
    json11::Json::object osd_config = this->config;
@@ -704,13 +682,22 @@ void osd_t::apply_pg_config()
                        }
                    }
                }
+                auto vec_all_peers = std::vector<osd_num_t>(all_peers.begin(), all_peers.end());
                if (currently_taken)
                {
-                    if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING | PG_REPEERING | PG_PEERED))
+                    if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING | PG_REPEERING))
                    {
-                        if (pg_it->second.target_set == pg_cfg.target_set)
+                        if (pg_it->second.target_set == pg_cfg.target_set &&
+                            pg_it->second.target_history == pg_cfg.target_history &&
+                            pg_it->second.all_peers == vec_all_peers)
                        {
-                            // No change in osd_set; history changes are ignored
+                            // No change in osd_set and history
+                            if (pg_it->second.scrub_ts != pg_cfg.scrub_ts)
+                            {
+                                pg_it->second.scrub_ts = pg_cfg.scrub_ts;
+                                peering_state = peering_state | OSD_SCRUBBING;
+                                ringloop->wakeup();
+                            }
                            continue;
                        }
                        else
@@ -761,7 +748,8 @@ void osd_t::apply_pg_config()
                    .pg_num = pg_num,
                    .reported_epoch = pg_cfg.epoch,
                    .target_history = pg_cfg.target_history,
-                    .all_peers = std::vector<osd_num_t>(all_peers.begin(), all_peers.end()),
+                    .all_peers = vec_all_peers,
+                    .scrub_ts = pg_cfg.scrub_ts,
                    .target_set = pg_cfg.target_set,
                };
                if (pg.scheme == POOL_SCHEME_EC)
@@ -892,6 +880,8 @@ void osd_t::report_pg_states()
                    { "all_peers", pg.all_peers },
                    { "osd_sets", pg.target_history },
                };
+                if (pg.scrub_ts)
+                    history_value["scrub_ts"] = pg.scrub_ts;
                checks.push_back(json11::Json::object {
                    { "target", "MOD" },
                    { "key", history_key },
@@ -984,13 +974,6 @@ void osd_t::report_pg_states()
                        }
                        this->pgs.erase(pg_it);
                    }
-                    else if (pg_it->second.state & PG_PEERED)
-                    {
-                        // Activate PG after PG PEERED state is reported along with history
-                        // (if the state wasn't changed again)
-                        pg_it->second.state = pg_it->second.state & ~PG_PEERED | PG_ACTIVE;
-                        report_pg_state(pg_it->second);
-                    }
                }
            }
            // Push other PG state updates, if any
--- a/src/osd_flush.cpp
+++ b/src/osd_flush.cpp
@@ -182,7 +182,9 @@ bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
                op->bs_op = NULL;
                delete op;
            },
-            .len = (uint32_t)count,
+            {
+                .len = (uint32_t)count,
+            },
            .buf = op->buf,
        });
        bs->enqueue_op(op->bs_op);
@@ -226,42 +228,51 @@ bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t

 bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
 {
-    if (!no_recovery)
+    if (!pgs.size())
    {
-        for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
-        {
-            if ((pg_it->second.state & (PG_ACTIVE | PG_HAS_DEGRADED)) == (PG_ACTIVE | PG_HAS_DEGRADED))
-            {
-                for (auto obj_it = pg_it->second.degraded_objects.begin(); obj_it != pg_it->second.degraded_objects.end(); obj_it++)
-                {
-                    if (recovery_ops.find(obj_it->first) == recovery_ops.end())
-                    {
-                        op.degraded = true;
-                        op.oid = obj_it->first;
-                        return true;
-                    }
-                }
-            }
-        }
+        return false;
    }
-    if (!no_rebalance)
+    // Restart scanning from the same degraded/misplaced status as the last time; each of the two statuses is tried at most once per call and the status is flipped after an unsuccessful pass
+    for (int tried_degraded = 0; tried_degraded < 2; tried_degraded++)
    {
-        for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
+        if (recovery_last_degraded ? !no_recovery : !no_rebalance)
        {
            // Don't try to "recover" misplaced objects if "recovery" would make them degraded
-            if ((pg_it->second.state & (PG_ACTIVE | PG_DEGRADED | PG_HAS_MISPLACED)) == (PG_ACTIVE | PG_HAS_MISPLACED))
+            auto mask = recovery_last_degraded ? (PG_ACTIVE | PG_HAS_DEGRADED) : (PG_ACTIVE | PG_DEGRADED | PG_HAS_MISPLACED);
+            auto check = recovery_last_degraded ? (PG_ACTIVE | PG_HAS_DEGRADED) : (PG_ACTIVE | PG_HAS_MISPLACED);
+            // Restart scanning from the same PG as the last time. NOTE(review): within this function recovery_last_pg is only reset to {} or has its pg_num incremented, it is never assigned pg_it->first - confirm it really tracks the PG being scanned
+            for (auto pg_it = pgs.lower_bound(recovery_last_pg); pg_it != pgs.end(); pg_it++)
            {
-                for (auto obj_it = pg_it->second.misplaced_objects.begin(); obj_it != pg_it->second.misplaced_objects.end(); obj_it++)
+                if ((pg_it->second.state & mask) == check)
                {
-                    if (recovery_ops.find(obj_it->first) == recovery_ops.end())
+                    auto & src = recovery_last_degraded ? pg_it->second.degraded_objects : pg_it->second.misplaced_objects;
+                    assert(src.size() > 0);
+                    // Restart scanning from the next object. NOTE(review): recovery_last_oid is not cleared when pg_it advances to another PG, so objects ordered before it in the following PGs are skipped during this pass - confirm this is intended
+                    for (auto obj_it = src.upper_bound(recovery_last_oid); obj_it != src.end(); obj_it++)
                    {
-                        op.degraded = false;
-                        op.oid = obj_it->first;
-                        return true;
+                        if (recovery_ops.find(obj_it->first) == recovery_ops.end())
+                        {
+                            op.degraded = recovery_last_degraded;
+                            recovery_last_oid = op.oid = obj_it->first;
+                            recovery_pg_done++;
+                            // Switch to another PG after recovery_pg_switch operations
+                            // to always mix all PGs during recovery but still benefit
+                            // from recovery queue depth greater than 1
+                            if (recovery_pg_done >= recovery_pg_switch)
+                            {
+                                recovery_pg_done = 0;
+                                recovery_last_pg.pg_num++;
+                                recovery_last_oid = {};
+                            }
+                            return true;
+                        }
                    }
                }
            }
        }
+        recovery_last_degraded = !recovery_last_degraded;
+        recovery_last_pg = {};
+        recovery_last_oid = {};
    }
    return false;
 }
@@ -291,19 +302,17 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
        if (osd_op->reply.hdr.retval < 0)
        {
            // Error recovering object
-            if (osd_op->reply.hdr.retval == -EPIPE)
-            {
-                // PG is stopped or one of the OSDs is gone, error is harmless
-                printf(
-                    "Recovery operation failed with object %lx:%lx (PG %u/%u)\n",
-                    op->oid.inode, op->oid.stripe, INODE_POOL(op->oid.inode),
-                    map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size)
-                );
-            }
-            else
-            {
-                throw std::runtime_error("Failed to recover an object");
-            }
+            // EPIPE is totally harmless (peer is gone), but others like EIO/EDOM may not be
+            printf(
+                "Recovery operation failed with object %lx:%lx (PG %u/%u): error %ld\n",
+                op->oid.inode, op->oid.stripe, INODE_POOL(op->oid.inode),
+                map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size),
+                osd_op->reply.hdr.retval
+            );
+        }
+        else if (log_level > 2)
+        {
+            printf("Recovery operation done for %lx:%lx\n", op->oid.inode, op->oid.stripe);
        }
        // CAREFUL! op = &recovery_ops[op->oid]. Don't access op->* after recovery_ops.erase()
        op->osd_op = NULL;
--- a/src/osd_id.h
+++ b/src/osd_id.h
@@ -28,3 +28,13 @@ inline bool operator < (const pool_pg_num_t & a, const pool_pg_num_t & b)
 {
    return a.pool_id < b.pool_id || a.pool_id == b.pool_id && a.pg_num < b.pg_num;
 }
+
+inline bool operator == (const pool_pg_num_t & a, const pool_pg_num_t & b)
+{
+    return a.pg_num == b.pg_num && a.pool_id == b.pool_id; // equal only when both the PG number and the pool ID match
+}
+
+inline bool operator != (const pool_pg_num_t & a, const pool_pg_num_t & b)
+{
+    return !(a == b); // exact negation of operator ==
+}
--- a/src/osd_ops.h
+++ b/src/osd_ops.h
@@ -29,7 +29,8 @@
 #define OSD_OP_DELETE               14
 #define OSD_OP_PING                 15
 #define OSD_OP_SEC_READ_BMP         16
-#define OSD_OP_MAX                  16
+#define OSD_OP_SCRUB                17
+#define OSD_OP_MAX                  17
 #define OSD_RW_MAX                  64*1024*1024
 #define OSD_PROTOCOL_VERSION        1

@@ -173,6 +174,11 @@ struct __attribute__((__packed__)) osd_op_sec_list_t
    uint64_t pg_stripe_size;
    // inode range (used to select pools)
    uint64_t min_inode, max_inode;
+    // min/max oid stripe, added after inodes for backwards compatibility
+    // also for backwards compatibility, the max_stripe encoding is inverted: UINT64_MAX means 0 and 0 means UINT64_MAX
+    uint64_t min_stripe, max_stripe;
+    // max stable object count
+    uint32_t stable_limit;
 };

 struct __attribute__((__packed__)) osd_reply_sec_list_t
--- a/src/osd_peering.cpp
+++ b/src/osd_peering.cpp
@@ -24,6 +24,7 @@ void osd_t::handle_peers()
                if (!p.second.peering_state->list_ops.size())
                {
                    p.second.calc_object_states(log_level);
+                    schedule_scrub(p.second);
                    report_pg_state(p.second);
                    incomplete_objects += p.second.incomplete_objects.size();
                    misplaced_objects += p.second.misplaced_objects.size();
@@ -32,7 +33,16 @@ void osd_t::handle_peers()
                    if (p.second.state & PG_HAS_UNCLEAN)
                        peering_state = peering_state | OSD_FLUSHING_PGS;
                    else if (p.second.state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED))
+                    {
                        peering_state = peering_state | OSD_RECOVERING;
+                        if (p.second.state & PG_HAS_DEGRADED)
+                        {
+                            // Restart recovery from degraded objects
+                            recovery_last_degraded = true;
+                            recovery_last_pg = {};
+                            recovery_last_oid = {};
+                        }
+                    }
                    ringloop->wakeup();
                    return;
                }
@@ -41,10 +51,6 @@ void osd_t::handle_peers()
                    still = true;
                }
            }
-            else if (p.second.state & PG_PEERED)
-            {
-                still = true;
-            }
        }
        if (!still)
        {
@@ -65,10 +71,6 @@ void osd_t::handle_peers()
                }
                still = true;
            }
-            else if (p.second.state & PG_PEERED)
-            {
-                still = true;
-            }
        }
        if (!still)
        {
@@ -82,6 +84,13 @@ void osd_t::handle_peers()
            peering_state = peering_state & ~OSD_RECOVERING;
        }
    }
+    if (peering_state & OSD_SCRUBBING)
+    {
+        if (!continue_scrub())
+        {
+            peering_state = peering_state & ~OSD_SCRUBBING;
+        }
+    }
 }

 void osd_t::repeer_pgs(osd_num_t peer_osd)
@@ -91,7 +100,7 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
    {
        auto & pg = p.second;
        bool repeer = false;
-        if (pg.state & (PG_PEERING | PG_PEERED | PG_ACTIVE | PG_INCOMPLETE))
+        if (pg.state & (PG_PEERING | PG_ACTIVE | PG_INCOMPLETE))
        {
            for (osd_num_t pg_osd: pg.all_peers)
            {
@@ -127,9 +136,11 @@ void osd_t::reset_pg(pg_t & pg)
    pg.state_dict.clear();
    copies_to_delete_after_sync_count -= pg.copies_to_delete_after_sync.size();
    pg.copies_to_delete_after_sync.clear();
+    corrupted_objects -= pg.corrupted_count;
    incomplete_objects -= pg.incomplete_objects.size();
    misplaced_objects -= pg.misplaced_objects.size();
    degraded_objects -= pg.degraded_objects.size();
+    pg.corrupted_count = 0;
    pg.incomplete_objects.clear();
    pg.misplaced_objects.clear();
    pg.degraded_objects.clear();
@@ -205,7 +216,7 @@ void osd_t::start_pg_peering(pg_t & pg)
            pg.cur_loc_set.push_back({
                .role = (uint64_t)role,
                .osd_num = pg.cur_set[role],
-                .outdated = false,
+                .loc_bad = 0,
            });
        }
    }
@@ -302,82 +313,11 @@ void osd_t::start_pg_peering(pg_t & pg)
        {
            continue;
        }
-        submit_sync_and_list_subop(peer_osd, pg.peering_state);
+        submit_list_subop(peer_osd, pg.peering_state);
    }
    ringloop->wakeup();
 }

-void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
-{
-    // Sync before listing, if not readonly
-    if (readonly)
-    {
-        submit_list_subop(role_osd, ps);
-    }
-    else if (role_osd == this->osd_num)
-    {
-        // Self
-        osd_op_t *op = new osd_op_t();
-        op->op_type = 0;
-        op->peer_fd = SELF_FD;
-        clock_gettime(CLOCK_REALTIME, &op->tv_begin);
-        op->bs_op = new blockstore_op_t();
-        op->bs_op->opcode = BS_OP_SYNC;
-        op->bs_op->callback = [this, ps, op, role_osd](blockstore_op_t *bs_op)
-        {
-            if (bs_op->retval < 0)
-            {
-                printf("Local OP_SYNC failed: %d (%s)\n", bs_op->retval, strerror(-bs_op->retval));
-                force_stop(1);
-                return;
-            }
-            add_bs_subop_stats(op);
-            delete op->bs_op;
-            op->bs_op = NULL;
-            delete op;
-            ps->list_ops.erase(role_osd);
-            submit_list_subop(role_osd, ps);
-        };
-        ps->list_ops[role_osd] = op;
-        bs->enqueue_op(op->bs_op);
-    }
-    else
-    {
-        // Peer
-        auto & cl = msgr.clients.at(msgr.osd_peer_fds.at(role_osd));
-        osd_op_t *op = new osd_op_t();
-        op->op_type = OSD_OP_OUT;
-        op->peer_fd = cl->peer_fd;
-        op->req = (osd_any_op_t){
-            .sec_sync = {
-                .header = {
-                    .magic = SECONDARY_OSD_OP_MAGIC,
-                    .id = msgr.next_subop_id++,
-                    .opcode = OSD_OP_SEC_SYNC,
-                },
-            },
-        };
-        op->callback = [this, ps, role_osd](osd_op_t *op)
-        {
-            if (op->reply.hdr.retval < 0)
-            {
-                // FIXME: Mark peer as failed and don't reconnect immediately after dropping the connection
-                printf("Failed to sync OSD %lu: %ld (%s), disconnecting peer\n", role_osd, op->reply.hdr.retval, strerror(-op->reply.hdr.retval));
-                int fail_fd = op->peer_fd;
-                ps->list_ops.erase(role_osd);
-                delete op;
-                msgr.stop_client(fail_fd);
-                return;
-            }
-            delete op;
-            ps->list_ops.erase(role_osd);
-            submit_list_subop(role_osd, ps);
-        };
-        ps->list_ops[role_osd] = op;
-        msgr.outbox_push(op);
-    }
-}
-
 void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
 {
    if (role_osd == this->osd_num)
@@ -389,11 +329,12 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
        clock_gettime(CLOCK_REALTIME, &op->tv_begin);
        op->bs_op = new blockstore_op_t();
        op->bs_op->opcode = BS_OP_LIST;
-        op->bs_op->oid.stripe = st_cli.pool_config[ps->pool_id].pg_stripe_size;
-        op->bs_op->oid.inode = ((uint64_t)ps->pool_id << (64 - POOL_ID_BITS));
-        op->bs_op->version = ((uint64_t)(ps->pool_id+1) << (64 - POOL_ID_BITS)) - 1;
-        op->bs_op->len = pg_counts[ps->pool_id];
-        op->bs_op->offset = ps->pg_num-1;
+        op->bs_op->pg_alignment = st_cli.pool_config[ps->pool_id].pg_stripe_size;
+        op->bs_op->min_oid.inode = ((uint64_t)ps->pool_id << (64 - POOL_ID_BITS));
+        op->bs_op->max_oid.inode = ((uint64_t)(ps->pool_id+1) << (64 - POOL_ID_BITS)) - 1;
+        op->bs_op->max_oid.stripe = UINT64_MAX;
+        op->bs_op->pg_count = pg_counts[ps->pool_id];
+        op->bs_op->pg_number = ps->pg_num-1;
        op->bs_op->callback = [this, ps, op, role_osd](blockstore_op_t *bs_op)
        {
            if (op->bs_op->retval < 0)
@@ -551,13 +492,17 @@ void osd_t::report_pg_state(pg_t & pg)
        pg.history_changed = true;
        pg.target_history.clear();
        pg.all_peers = pg.target_set;
+        std::sort(pg.all_peers.begin(), pg.all_peers.end());
        pg.cur_peers = pg.target_set;
    }
    else if (pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD))
    {
        // Clear history of active+left_on_dead PGs, but leave dead OSDs in all_peers
-        pg.history_changed = true;
-        pg.target_history.clear();
+        if (pg.target_history.size())
+        {
+            pg.history_changed = true;
+            pg.target_history.clear();
+        }
        std::set<osd_num_t> dead_peers;
        for (auto pg_osd: pg.all_peers)
        {
@@ -574,8 +519,12 @@ void osd_t::report_pg_state(pg_t & pg)
                dead_peers.insert(pg_osd);
            }
        }
-        pg.all_peers.clear();
-        pg.all_peers.insert(pg.all_peers.begin(), dead_peers.begin(), dead_peers.end());
+        auto new_all_peers = std::vector<osd_num_t>(dead_peers.begin(), dead_peers.end());
+        if (pg.all_peers != new_all_peers)
+        {
+            pg.history_changed = true;
+            pg.all_peers = new_all_peers;
+        }
        pg.cur_peers.clear();
        for (auto pg_osd: pg.target_set)
        {
--- a/src/osd_peering_pg.cpp
+++ b/src/osd_peering_pg.cpp
@@ -86,24 +86,11 @@ void pg_obj_state_check_t::walk()
    }
    if (pg->pg_cursize < pg->pg_size)
    {
-        // Report PG history and activate
-        pg->state |= PG_DEGRADED | PG_PEERED;
-        std::vector<osd_num_t> history_set;
-        for (auto peer_osd: pg->cur_set)
-        {
-            if (peer_osd != 0)
-            {
-                history_set.push_back(peer_osd);
-            }
-        }
-        pg->target_history.push_back(history_set);
-        pg->history_changed = true;
-    }
-    else
-    {
-        // Just activate
-        pg->state |= PG_ACTIVE;
+        // Activate as degraded
+        // Current OSD set will be added into target_history on first write
+        pg->state |= PG_DEGRADED;
    }
+    pg->state |= PG_ACTIVE;
    if (pg->state == PG_ACTIVE && pg->cur_peers.size() < pg->all_peers.size())
    {
        pg->state |= PG_LEFT_ON_DEAD;
@@ -293,7 +280,7 @@ void pg_obj_state_check_t::finish_object()
            osd_set.push_back((pg_obj_loc_t){
                .role = (list[i].oid.stripe & STRIPE_MASK),
                .osd_num = list[i].osd_num,
-                .outdated = false,
+                .loc_bad = 0,
            });
        }
    }
@@ -315,7 +302,7 @@ void pg_obj_state_check_t::finish_object()
                osd_set.push_back((pg_obj_loc_t){
                    .role = (list[i].oid.stripe & STRIPE_MASK),
                    .osd_num = list[i].osd_num,
-                    .outdated = true,
+                    .loc_bad = LOC_OUTDATED,
                });
                if (!(state & (OBJ_INCOMPLETE | OBJ_DEGRADED)))
                {
@@ -335,67 +322,73 @@ void pg_obj_state_check_t::finish_object()
    }
    else
    {
-        auto it = pg->state_dict.find(osd_set);
-        if (it == pg->state_dict.end())
-        {
-            std::vector<uint64_t> read_target;
-            if (replicated)
-            {
-                for (auto & o: osd_set)
-                {
-                    if (!o.outdated)
-                    {
-                        read_target.push_back(o.osd_num);
-                    }
-                }
-                while (read_target.size() < pg->pg_size)
-                {
-                    // FIXME: This is because we then use .data() and assume it's at least <pg_size> long
-                    read_target.push_back(0);
-                }
-            }
-            else
-            {
-                read_target.resize(pg->pg_size);
-                for (int i = 0; i < pg->pg_size; i++)
-                {
-                    read_target[i] = 0;
-                }
-                for (auto & o: osd_set)
-                {
-                    if (!o.outdated)
-                    {
-                        read_target[o.role] = o.osd_num;
-                    }
-                }
-            }
-            pg->state_dict[osd_set] = {
-                .read_target = read_target,
-                .osd_set = osd_set,
-                .state = state,
-                .object_count = 1,
-            };
-            it = pg->state_dict.find(osd_set);
-        }
-        else
-        {
-            it->second.object_count++;
-        }
-        if (state & OBJ_INCOMPLETE)
-        {
-            pg->incomplete_objects[oid] = &it->second;
-        }
-        else if (state & OBJ_DEGRADED)
-        {
-            pg->degraded_objects[oid] = &it->second;
-        }
-        else
-        {
-            pg->misplaced_objects[oid] = &it->second;
-        }
+        pg->add_object_to_state(oid, state, osd_set);
    }
 }

+pg_osd_set_state_t* pg_t::add_object_to_state(const object_id oid, const uint64_t state, const pg_osd_set_t & osd_set)
+{
+    auto it = state_dict.find(osd_set);
+    if (it == state_dict.end())
+    {
+        std::vector<osd_num_t> read_target;
+        if (scheme == POOL_SCHEME_REPLICATED)
+        {
+            for (auto & o: osd_set)
+            {
+                if (!o.loc_bad)
+                {
+                    read_target.push_back(o.osd_num);
+                }
+            }
+            while (read_target.size() < pg_size)
+            {
+                // FIXME: This is because we then use .data() and assume it's at least <pg_size> long
+                read_target.push_back(0);
+            }
+        }
+        else
+        {
+            read_target.resize(pg_size);
+            for (int i = 0; i < pg_size; i++)
+            {
+                read_target[i] = 0;
+            }
+            for (auto & o: osd_set)
+            {
+                if (!o.loc_bad)
+                {
+                    read_target[o.role] = o.osd_num;
+                }
+            }
+        }
+        state_dict[osd_set] = {
+            .read_target = read_target,
+            .osd_set = osd_set,
+            .state = state,
+            .object_count = 1,
+        };
+        it = state_dict.find(osd_set);
+    }
+    else
+    {
+        it->second.object_count++;
+    }
+    if (state & OBJ_INCOMPLETE)
+    {
+        incomplete_objects[oid] = &it->second;
+    }
+    else if (state & OBJ_DEGRADED)
+    {
+        degraded_objects[oid] = &it->second;
+    }
+    else
+    {
+        misplaced_objects[oid] = &it->second;
+    }
+    return &it->second;
+}
+
 // FIXME: Write at least some tests for this function
 void pg_t::calc_object_states(int log_level)
 {
@@ -435,32 +428,58 @@ void pg_t::calc_object_states(int log_level)
    std::sort(st.list.begin(), st.list.end());
    // Walk over it and check object states
    st.walk();
-    if (this->state & (PG_DEGRADED|PG_LEFT_ON_DEAD))
+    if (this->state != PG_ACTIVE)
    {
        assert(epoch != (((uint64_t)1 << PG_EPOCH_BITS)-1));
        epoch++;
    }
+    if (log_level > 0)
+    {
+        std::string osd_set_desc;
+        for (auto & osd_num: target_set)
+        {
+            osd_set_desc += (osd_set_desc == "" ? "" : ", ")+std::to_string(osd_num);
+        }
+        printf(
+            "[PG %u/%u] %lu clean objects on target OSD set %s\n",
+            pool_id, pg_num, clean_count, osd_set_desc.c_str()
+        );
+        for (auto & stp: state_dict)
+        {
+            osd_set_desc = "";
+            for (auto & loc: stp.first)
+            {
+                osd_set_desc += (osd_set_desc == "" ? "" : ", ")+
+                    std::to_string(loc.osd_num)+
+                    (st.replicated ? "" : "("+std::to_string(loc.role)+")")+
+                    (loc.loc_bad & LOC_OUTDATED ? "(old)" : "")+
+                    (loc.loc_bad & LOC_CORRUPTED ? "(corrupted)" : "");
+            }
+            printf("[PG %u/%u] %lu objects on OSD set %s\n", pool_id, pg_num, stp.second.object_count, osd_set_desc.c_str());
+        }
+    }
 }

 void pg_t::print_state()
 {
    printf(
-        "[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
+        "[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
        (state & PG_STARTING) ? "starting" : "",
        (state & PG_OFFLINE) ? "offline" : "",
        (state & PG_PEERING) ? "peering" : "",
-        (state & PG_PEERED) ? "peered" : "",
        (state & PG_INCOMPLETE) ? "incomplete" : "",
        (state & PG_ACTIVE) ? "active" : "",
        (state & PG_REPEERING) ? "repeering" : "",
        (state & PG_STOPPING) ? "stopping" : "",
        (state & PG_DEGRADED) ? " + degraded" : "",
+        (state & PG_HAS_CORRUPTED) ? " + has_corrupted" : "",
        (state & PG_HAS_INCOMPLETE) ? " + has_incomplete" : "",
        (state & PG_HAS_DEGRADED) ? " + has_degraded" : "",
        (state & PG_HAS_MISPLACED) ? " + has_misplaced" : "",
        (state & PG_HAS_UNCLEAN) ? " + has_unclean" : "",
        (state & PG_HAS_INVALID) ? " + has_invalid" : "",
        (state & PG_LEFT_ON_DEAD) ? " + left_on_dead" : "",
+        (state & PG_SCRUBBING) ? " + scrubbing" : "",
        total_count
    );
 }
--- a/src/osd_peering_pg.h
+++ b/src/osd_peering_pg.h
@@ -13,11 +13,14 @@

 #define PG_EPOCH_BITS 48

+#define LOC_OUTDATED 1
+#define LOC_CORRUPTED 2
+
 struct pg_obj_loc_t
 {
    uint64_t role;
    osd_num_t osd_num;
-    bool outdated;
+    uint32_t loc_bad; // LOC_OUTDATED / LOC_CORRUPTED
 };

 typedef std::vector<pg_obj_loc_t> pg_osd_set_t;
@@ -30,6 +33,7 @@ struct pg_osd_set_state_t
    pg_osd_set_t osd_set;
    uint64_t state = 0;
    uint64_t object_count = 0;
+    uint64_t ref_count = 0;
 };

 struct pg_list_result_t
@@ -91,6 +95,8 @@ struct pg_t
    // target history and all potential peers
    std::vector<std::vector<osd_num_t>> target_history;
    std::vector<osd_num_t> all_peers;
+    // last scrub time
+    uint64_t scrub_ts = 0;
    bool history_changed = false;
    // peer list from the last peering event
    std::vector<osd_num_t> cur_peers;
@@ -106,6 +112,7 @@ struct pg_t
    // it may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario
    // which is up to ~192 MB per 1 TB in the worst case scenario
    std::map<pg_osd_set_t, pg_osd_set_state_t> state_dict;
+    uint64_t corrupted_count = 0; // objects of this PG currently in a state with OBJ_CORRUPTED set
    btree::btree_map<object_id, pg_osd_set_state_t*> incomplete_objects, misplaced_objects, degraded_objects;
    std::map<obj_piece_id_t, flush_action_t> flush_actions;
    std::vector<obj_ver_osd_t> copies_to_delete_after_sync;
@@ -116,15 +123,16 @@ struct pg_t
    int inflight = 0; // including write_queue
    std::multimap<object_id, osd_op_t*> write_queue;

+    pg_osd_set_state_t* add_object_to_state(const object_id oid, const uint64_t state, const pg_osd_set_t & osd_set);
    void calc_object_states(int log_level);
    void print_state();
 };

 inline bool operator < (const pg_obj_loc_t &a, const pg_obj_loc_t &b)
 {
-    return a.outdated < b.outdated ||
-        a.outdated == b.outdated && a.role < b.role ||
-        a.outdated == b.outdated && a.role == b.role && a.osd_num < b.osd_num;
+    return a.loc_bad < b.loc_bad ||
+        a.loc_bad == b.loc_bad && a.role < b.role ||
+        a.loc_bad == b.loc_bad && a.role == b.role && a.osd_num < b.osd_num;
 }

 inline bool operator == (const obj_piece_id_t & a, const obj_piece_id_t & b)
--- a/src/osd_peering_pg_test.cpp
+++ b/src/osd_peering_pg_test.cpp
@@ -54,5 +54,6 @@ int main(int argc, char *argv[])
    {
        printf("dev: state=%lx\n", it.second.state);
    }
+    delete pg.peering_state;
    return 0;
 }
--- a/src/osd_primary.cpp
+++ b/src/osd_primary.cpp
@@ -52,7 +52,9 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
        finish_op(cur_op, -EINVAL);
        return false;
    }
-    int stripe_count = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg_it->second.pg_size);
+    // Scrub is similar to r/w, so it's also handled here
+    int stripe_count = (pool_cfg.scheme == POOL_SCHEME_REPLICATED
+        && cur_op->req.hdr.opcode != OSD_OP_SCRUB ? 1 : pg_it->second.pg_size);
    int chain_size = 0;
    if (cur_op->req.hdr.opcode == OSD_OP_READ && cur_op->req.rw.meta_revision > 0)
    {
@@ -90,6 +92,8 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
        chain_size * (
            // - copy of the chain
            sizeof(inode_t) +
+            // - object states for every chain item
+            sizeof(void*) +
            // - bitmap buffers for chained read
            stripe_count * clean_entry_bitmap_size +
            // - 'missing' flags for chained reads
@@ -117,6 +121,8 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
    {
        op_data->read_chain = (inode_t*)data_buf;
        data_buf = (uint8_t*)data_buf + sizeof(inode_t) * chain_size;
+        op_data->chain_states = (pg_osd_set_state_t**)data_buf;
+        data_buf = (uint8_t*)data_buf + sizeof(pg_osd_set_state_t*) * chain_size;
        op_data->snapshot_bitmaps = data_buf;
        data_buf = (uint8_t*)data_buf + chain_size * stripe_count * clean_entry_bitmap_size;
        op_data->missing_flags = (uint8_t*)data_buf;
@@ -131,6 +137,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
            inode_it->second.parent_id != cur_op->req.rw.inode)
        {
            op_data->read_chain[chain_num++] = inode_it->second.parent_id;
+            op_data->chain_states[chain_num-1] = NULL; // same slot as the read_chain entry above; chain_num was already advanced there
            inode_it = st_cli.inode_config.find(inode_it->second.parent_id);
        }
    }
@@ -138,12 +145,12 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
    return true;
 }

-uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state)
+uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, pg_osd_set_state_t **object_state)
 {
    if (!(pg.state & (PG_HAS_INCOMPLETE | PG_HAS_DEGRADED | PG_HAS_MISPLACED)))
    {
        *object_state = NULL;
-        return def;
+        return pg.cur_set.data();
    }
    auto st_it = pg.incomplete_objects.find(oid);
    if (st_it != pg.incomplete_objects.end())
@@ -164,7 +171,7 @@ uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_
        return st_it->second->read_target.data();
    }
    *object_state = NULL;
-    return def;
+    return pg.cur_set.data();
 }

 void osd_t::continue_primary_read(osd_op_t *cur_op)
@@ -183,6 +190,7 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
        goto resume_1;
    else if (op_data->st == 2)
        goto resume_2;
+resume_0:
    cur_op->reply.rw.bitmap_len = 0;
    {
        auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
@@ -194,15 +202,17 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
        // Determine version
        auto vo_it = pg.ver_override.find(op_data->oid);
        op_data->target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
-        op_data->prev_set = pg.cur_set.data();
-        if (pg.state != PG_ACTIVE)
-        {
-            // PG may be degraded or have misplaced objects
-            op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
-        }
+        // PG may have degraded or misplaced objects
+        op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
        if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED)
        {
            // Fast happy-path
+            if (op_data->scheme == POOL_SCHEME_REPLICATED &&
+                op_data->object_state && (op_data->object_state->state & OBJ_INCOMPLETE))
+            {
+                finish_op(cur_op, -EIO);
+                return;
+            }
            cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_data_size, 0);
            submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, op_data->prev_set, cur_op);
            op_data->st = 1;
@@ -228,7 +238,15 @@ resume_1:
 resume_2:
    if (op_data->errors > 0)
    {
-        finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
+        if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
+        {
+            // I/O or checksum error
+            auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
+            // FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
+            op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false);
+            goto resume_0;
+        }
+        finish_op(cur_op, op_data->errcode);
        return;
    }
    cur_op->reply.rw.version = op_data->fact_ver;
@@ -266,10 +284,144 @@ resume_2:
    finish_op(cur_op, cur_op->req.rw.len);
 }

-// Decrement pg_osd_set_state_t's object_count and change PG state accordingly
-void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t & pg)
+pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state, osd_rmw_stripe_t *stripes, bool ref)
 {
-    if (object_state->state & OBJ_INCOMPLETE)
+    pg_osd_set_state_t *object_state = NULL;
+    get_object_osd_set(pg, oid, &object_state);
+    if (prev_object_state != object_state)
+    {
+        // Object state changed in between by a parallel I/O operation, skip marking as failed
+        if (ref)
+        {
+            deref_object_state(pg, &prev_object_state, ref);
+            if (object_state)
+                object_state->ref_count++;
+        }
+        return object_state;
+    }
+    pg_osd_set_t corrupted_set;
+    if (object_state)
+    {
+        corrupted_set = object_state->osd_set;
+    }
+    else
+    {
+        for (int i = 0; i < pg.cur_set.size(); i++)
+        {
+            corrupted_set.push_back((pg_obj_loc_t){
+                .role = (pg.scheme == POOL_SCHEME_REPLICATED ? 0 : (uint64_t)i),
+                .osd_num = pg.cur_set[i],
+            });
+        }
+    }
+    // Flag chunks whose read failed as corrupted and count the roles/copies that are still good
+    uint64_t has_roles = 0, n_roles = 0, n_copies = 0, n_corrupted = 0;
+    for (auto & chunk: corrupted_set)
+    {
+        bool corrupted = stripes[chunk.role].osd_num == chunk.osd_num && stripes[chunk.role].read_error;
+        if (corrupted && !(chunk.loc_bad & LOC_CORRUPTED))
+            n_corrupted++; // count only chunks that were not already flagged
+        chunk.loc_bad = chunk.loc_bad | (corrupted ? LOC_CORRUPTED : 0);
+        if (!chunk.loc_bad)
+        {
+            if (pg.scheme == POOL_SCHEME_REPLICATED)
+                n_roles = 1;
+            else if (!(has_roles & ((uint64_t)1 << chunk.role))) // 64-bit shift: role is uint64_t, has_roles is a 64-bit mask
+            {
+                n_roles++;
+                has_roles |= ((uint64_t)1 << chunk.role);
+            }
+            n_copies++;
+        }
+    }
+    if (!n_corrupted)
+    {
+        // No chunks newly marked as corrupted - object is already marked or moved
+        return object_state;
+    }
+    int old_pg_state = pg.state;
+    if (object_state)
+    {
+        remove_object_from_state(oid, &object_state, pg, false);
+        deref_object_state(pg, &object_state, ref);
+    }
+    // Calculate the new object state and the PG state bits it implies
+    uint64_t obj_state = OBJ_CORRUPTED;
+    int pg_state_bits = PG_HAS_CORRUPTED; // branches below OR into this so the corrupted bit is kept
+    this->corrupted_objects++;
+    pg.corrupted_count++; // NOTE(review): nothing visible in this hunk clears PG_HAS_CORRUPTED when this returns to 0 - confirm
+    if (log_level > 1)
+    {
+        printf("Marking object %lx:%lx corrupted: %lu chunks / %lu copies available, %lu corrupted\n",
+            oid.inode, oid.stripe, n_roles, n_copies, n_corrupted);
+    }
+    if (n_roles < pg.pg_data_size)
+    {
+        this->incomplete_objects++;
+        obj_state |= OBJ_INCOMPLETE;
+        pg_state_bits |= PG_HAS_INCOMPLETE;
+    }
+    else if (n_roles < pg.pg_cursize)
+    {
+        this->degraded_objects++;
+        obj_state |= OBJ_DEGRADED;
+        pg_state_bits |= PG_HAS_DEGRADED;
+    }
+    else
+    {
+        this->misplaced_objects++;
+        obj_state |= OBJ_MISPLACED;
+        pg_state_bits |= PG_HAS_MISPLACED;
+    }
+    pg.state |= pg_state_bits;
+    if (pg.state != old_pg_state)
+    {
+        report_pg_state(pg);
+        if ((pg.state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED)) !=
+            (old_pg_state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED)))
+        {
+            peering_state = peering_state | OSD_RECOVERING;
+            if ((pg.state & PG_HAS_DEGRADED) != (old_pg_state & PG_HAS_DEGRADED))
+            {
+                // Restart recovery from degraded objects
+                recovery_last_degraded = true;
+                recovery_last_pg = {};
+                recovery_last_oid = {};
+            }
+            ringloop->wakeup();
+        }
+    }
+    // Insert object into the new state and retry
+    object_state = pg.add_object_to_state(oid, obj_state, corrupted_set);
+    if (ref)
+        object_state->ref_count++;
+    return object_state;
+}
+
+// Decrement pg_osd_set_state_t's object_count and change PG state accordingly
+void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t & pg, bool report)
+{
+    if (!*object_state)
+    {
+        return;
+    }
+    pg_osd_set_state_t *recheck_state = NULL;
+    get_object_osd_set(pg, oid, &recheck_state);
+    if (recheck_state != *object_state)
+    {
+        recheck_state->ref_count++;
+        (*object_state)->ref_count--;
+        *object_state = recheck_state;
+        return;
+    }
+    (*object_state)->object_count--;
+    if ((*object_state)->state & OBJ_CORRUPTED)
+    {
+        this->corrupted_objects--;
+        pg.corrupted_count--;
+    }
+    bool changed = false;
+    if ((*object_state)->state & OBJ_INCOMPLETE)
    {
        // Successful write means that object is not incomplete anymore
        this->incomplete_objects--;
@@ -277,41 +429,52 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object
        if (!pg.incomplete_objects.size())
        {
            pg.state = pg.state & ~PG_HAS_INCOMPLETE;
-            report_pg_state(pg);
+            changed = true;
        }
    }
-    else if (object_state->state & OBJ_DEGRADED)
+    else if ((*object_state)->state & OBJ_DEGRADED)
    {
        this->degraded_objects--;
        pg.degraded_objects.erase(oid);
        if (!pg.degraded_objects.size())
        {
            pg.state = pg.state & ~PG_HAS_DEGRADED;
-            report_pg_state(pg);
+            changed = true;
        }
    }
-    else if (object_state->state & OBJ_MISPLACED)
+    else if ((*object_state)->state & OBJ_MISPLACED)
    {
        this->misplaced_objects--;
        pg.misplaced_objects.erase(oid);
        if (!pg.misplaced_objects.size())
        {
            pg.state = pg.state & ~PG_HAS_MISPLACED;
-            report_pg_state(pg);
+            changed = true;
        }
    }
    else
    {
-        throw std::runtime_error("BUG: Invalid object state: "+std::to_string(object_state->state));
+        throw std::runtime_error("BUG: Invalid object state: "+std::to_string((*object_state)->state));
+    }
+    if (changed && report)
+    {
+        report_pg_state(pg);
    }
 }

-void osd_t::free_object_state(pg_t & pg, pg_osd_set_state_t **object_state)
+void osd_t::deref_object_state(pg_t & pg, pg_osd_set_state_t **object_state, bool deref)
 {
-    if (*object_state && !(--(*object_state)->object_count))
+    if (*object_state)
    {
-        pg.state_dict.erase((*object_state)->osd_set);
-        *object_state = NULL;
+        if (deref)
+        {
+            (*object_state)->ref_count--;
+        }
+        if (!(*object_state)->object_count && !(*object_state)->ref_count)
+        {
+            pg.state_dict.erase((*object_state)->osd_set);
+            *object_state = NULL;
+        }
    }
 }

@@ -341,21 +504,28 @@ void osd_t::continue_primary_del(osd_op_t *cur_op)
    }
 resume_1:
    // Determine which OSDs contain this object and delete it
-    op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
+    op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
+    if (op_data->object_state)
+    {
+        op_data->object_state->ref_count++;
+    }
    // Submit 1 read to determine the actual version number
    submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
+    op_data->prev_set = NULL;
 resume_2:
    op_data->st = 2;
    return;
 resume_3:
    if (op_data->errors > 0)
    {
-        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
+        deref_object_state(pg, &op_data->object_state, true);
+        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
        return;
    }
    // Check CAS version
    if (cur_op->req.rw.version && op_data->fact_ver != (cur_op->req.rw.version-1))
    {
+        deref_object_state(pg, &op_data->object_state, true);
        cur_op->reply.hdr.retval = -EINTR;
        cur_op->reply.rw.version = op_data->fact_ver;
        goto continue_others;
@@ -371,7 +541,8 @@ resume_4:
 resume_5:
    if (op_data->errors > 0)
    {
-        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
+        deref_object_state(pg, &op_data->object_state, true);
+        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
        return;
    }
    // Remove version override
@@ -383,8 +554,8 @@ resume_5:
    }
    else
    {
-        remove_object_from_state(op_data->oid, op_data->object_state, pg);
-        free_object_state(pg, &op_data->object_state);
+        remove_object_from_state(op_data->oid, &op_data->object_state, pg);
+        deref_object_state(pg, &op_data->object_state, true);
    }
    pg.total_count--;
    cur_op->reply.hdr.retval = 0;
--- a/src/osd_primary.h
+++ b/src/osd_primary.h
@@ -9,6 +9,7 @@
 #define SUBMIT_READ 0
 #define SUBMIT_RMW_READ 1
 #define SUBMIT_WRITE 2
+#define SUBMIT_SCRUB_READ 3

 struct unstable_osd_num_t
 {
@@ -24,7 +25,7 @@ struct osd_primary_op_data_t
    uint64_t target_ver;
    uint64_t orig_ver = 0, fact_ver = 0;
    uint64_t scheme = 0;
-    int n_subops = 0, done = 0, errors = 0, epipe = 0;
+    int n_subops = 0, done = 0, errors = 0, errcode = 0;
    int degraded = 0, pg_size, pg_data_size;
    osd_rmw_stripe_t *stripes;
    osd_op_t *subops = NULL;
@@ -50,6 +51,7 @@ struct osd_primary_op_data_t
            // for read_bitmaps
            void *snapshot_bitmaps;
            inode_t *read_chain;
+            pg_osd_set_state_t **chain_states;
            uint8_t *missing_flags;
            int chain_size;
            osd_chain_read_t *chain_reads;
--- a/src/osd_primary_chain.cpp
+++ b/src/osd_primary_chain.cpp
@@ -40,10 +40,24 @@ resume_3:
 resume_4:
    if (op_data->errors > 0)
    {
-        free(op_data->chain_reads);
-        op_data->chain_reads = NULL;
-        finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
-        return;
+        if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
+        {
+            // Handle corrupted reads and retry...
+            check_corrupted_chained(pg, cur_op);
+            free(cur_op->buf);
+            cur_op->buf = NULL;
+            free(op_data->chain_reads);
+            op_data->chain_reads = NULL;
+            // FIXME: We can in theory retry only specific parts instead of the whole operation
+            goto resume_1;
+        }
+        else
+        {
+            free(op_data->chain_reads);
+            op_data->chain_reads = NULL;
+            finish_op(cur_op, op_data->errcode);
+            return;
+        }
    }
    send_chained_read_results(pg, cur_op);
    finish_op(cur_op, cur_op->req.rw.len);
@@ -131,8 +145,7 @@ int osd_t::collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vector<bitm
        object_id cur_oid = { .inode = op_data->read_chain[chain_num], .stripe = op_data->oid.stripe };
        auto vo_it = pg.ver_override.find(cur_oid);
        uint64_t target_version = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
-        pg_osd_set_state_t *object_state;
-        uint64_t* cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
+        uint64_t* cur_set = get_object_osd_set(pg, cur_oid, &op_data->chain_states[chain_num]);
        if (pg.scheme == POOL_SCHEME_REPLICATED)
        {
            osd_num_t read_target = 0;
@@ -247,6 +260,7 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
                osd_op_t *subop = op_data->subops+subop_idx;
                subop->op_type = OSD_OP_OUT;
                // FIXME: Use the pre-allocated buffer
+                assert(!subop->buf);
                subop->buf = malloc_or_die(sizeof(obj_ver_id)*(i+1-prev));
                subop->req = (osd_any_op_t){
                    .sec_read_bmp = {
@@ -297,7 +311,7 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
                    // Fail it immediately
                    subop->peer_fd = -1;
                    subop->reply.hdr.retval = -EPIPE;
-                    subop->callback(subop);
+                    ringloop->set_immediate([subop]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
                }
                subop_idx++;
            }
@@ -375,6 +389,8 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
    op_data->chain_read_count = chain_reads.size();
    op_data->chain_reads = (osd_chain_read_t*)calloc_or_die(
        1, sizeof(osd_chain_read_t) * chain_reads.size()
+        // FIXME: Allocate only <chain_reads.size()> instead of <chain_size> stripes
+        // (but it's slightly harder to handle in send_chained_read_results())
        + sizeof(osd_rmw_stripe_t) * stripe_count * op_data->chain_size
    );
    osd_rmw_stripe_t *chain_stripes = (osd_rmw_stripe_t*)(
@@ -403,8 +419,7 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
        uint64_t *cur_set = pg.cur_set.data();
        if (pg.state != PG_ACTIVE)
        {
-            pg_osd_set_state_t *object_state;
-            cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
+            cur_set = get_object_osd_set(pg, cur_oid, &op_data->chain_states[chain_reads[cri].chain_pos]);
            if (op_data->scheme != POOL_SCHEME_REPLICATED)
            {
                if (extend_missing_stripes(stripes, cur_set, pg.pg_data_size, pg.pg_size) < 0)
@@ -416,6 +431,17 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
                }
                op_data->degraded = 1;
            }
+            else
+            {
+                auto cur_state = op_data->chain_states[chain_reads[cri].chain_pos];
+                if (cur_state && (cur_state->state & OBJ_INCOMPLETE))
+                {
+                    free(op_data->chain_reads);
+                    op_data->chain_reads = NULL;
+                    finish_op(cur_op, -EIO);
+                    return -1;
+                }
+            }
        }
        if (op_data->scheme == POOL_SCHEME_REPLICATED)
        {
@@ -433,6 +459,7 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
            }
        }
    }
+    assert(!cur_op->buf);
    cur_op->buf = memalign_or_die(MEM_ALIGNMENT, read_buffer_size);
    void *cur_buf = cur_op->buf;
    for (int cri = 0; cri < chain_reads.size(); cri++)
@@ -468,12 +495,8 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
        object_id cur_oid = { .inode = chain_reads[cri].inode, .stripe = op_data->oid.stripe };
        auto vo_it = pg.ver_override.find(cur_oid);
        uint64_t target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
-        uint64_t *cur_set = pg.cur_set.data();
-        if (pg.state != PG_ACTIVE)
-        {
-            pg_osd_set_state_t *object_state;
-            cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
-        }
+        auto cur_state = op_data->chain_states[chain_reads[cri].chain_pos];
+        uint64_t *cur_set = (pg.state != PG_ACTIVE && cur_state ? cur_state->read_target.data() : pg.cur_set.data());
        int zero_read = -1;
        if (op_data->scheme == POOL_SCHEME_REPLICATED)
        {
@@ -487,6 +510,33 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
    return 0;
 }

+void osd_t::check_corrupted_chained(pg_t & pg, osd_op_t *cur_op) // For every chained read of cur_op, calls mark_object_corrupted() if any of its stripes has read_error set
+{
+    osd_primary_op_data_t *op_data = cur_op->op_data;
+    int stripe_count = (pg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size); // 1 stripe per object in replicated pools, pg_size otherwise
+    osd_rmw_stripe_t *chain_stripes = (osd_rmw_stripe_t*)( // stripe array starts right after the chain_read_count osd_chain_read_t entries
+        (uint8_t*)op_data->chain_reads + sizeof(osd_chain_read_t) * op_data->chain_read_count
+    );
+    for (int cri = 0; cri < op_data->chain_read_count; cri++)
+    {
+        object_id cur_oid = { .inode = op_data->chain_reads[cri].inode, .stripe = op_data->oid.stripe }; // same stripe offset, inode of this chain read
+        osd_rmw_stripe_t *stripes = chain_stripes + op_data->chain_reads[cri].chain_pos*stripe_count; // stripes are indexed by chain_pos, not by cri
+        bool corrupted = false;
+        for (int i = 0; i < stripe_count; i++)
+        {
+            if (stripes[i].read_error) // flag set by handle_primary_subop() when a read subop returns EIO/EDOM
+            {
+                corrupted = true;
+                break;
+            }
+        }
+        if (corrupted)
+        {
+            mark_object_corrupted(pg, cur_oid, op_data->chain_states[op_data->chain_reads[cri].chain_pos], stripes, false); // passes the object state stored for this chain position
+        }
+    }
+}
+
 void osd_t::send_chained_read_results(pg_t & pg, osd_op_t *cur_op)
 {
    osd_primary_op_data_t *op_data = cur_op->op_data;
--- a/src/osd_primary_subops.cpp
+++ b/src/osd_primary_subops.cpp
@@ -9,6 +9,7 @@ void osd_t::autosync()
    {
        autosync_op = new osd_op_t();
        autosync_op->op_type = OSD_OP_IN;
+        autosync_op->peer_fd = -1;
        autosync_op->req = (osd_any_op_t){
            .sync = {
                .header = {
@@ -122,7 +123,7 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, const ui
        zero_read = -1;
    osd_op_t *subops = new osd_op_t[n_subops];
    op_data->fact_ver = 0;
-    op_data->done = op_data->errors = 0;
+    op_data->done = op_data->errors = op_data->errcode = 0;
    op_data->n_subops = n_subops;
    op_data->subops = subops;
    int sent = submit_primary_subop_batch(submit_type, op_data->oid.inode, op_version, op_data->stripes, osd_set, cur_op, 0, zero_read);
@@ -139,34 +140,40 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
    for (int role = 0; role < op_data->pg_size; role++)
    {
        // We always submit zero-length writes to all replicas, even if the stripe is not modified
-        if (!(wr || !rep && stripes[role].read_end != 0 || zero_read == role))
+        if (!(wr || !rep && stripes[role].read_end != 0 || zero_read == role || submit_type == SUBMIT_SCRUB_READ))
        {
            continue;
        }
        osd_num_t role_osd_num = osd_set[role];
+        int stripe_num = rep ? 0 : role;
        if (role_osd_num != 0)
        {
-            int stripe_num = rep ? 0 : role;
            osd_op_t *subop = op_data->subops + i;
+            stripes[stripe_num].osd_num = role_osd_num;
+            stripes[stripe_num].read_error = false;
+            subop->bitmap = stripes[stripe_num].bmp_buf;
+            subop->bitmap_len = clean_entry_bitmap_size;
+            // Using rmw_buf to pass pointer to stripes. Dirty but should work
+            subop->rmw_buf = stripes+stripe_num;
            if (role_osd_num == this->osd_num)
            {
                clock_gettime(CLOCK_REALTIME, &subop->tv_begin);
                subop->op_type = (uint64_t)cur_op;
-                subop->bitmap = stripes[stripe_num].bmp_buf;
-                subop->bitmap_len = clean_entry_bitmap_size;
-                subop->bs_op = new blockstore_op_t({
+                subop->bs_op = new blockstore_op_t((blockstore_op_t){
                    .opcode = (uint64_t)(wr ? (rep ? BS_OP_WRITE_STABLE : BS_OP_WRITE) : BS_OP_READ),
                    .callback = [subop, this](blockstore_op_t *bs_subop)
                    {
                        handle_primary_bs_subop(subop);
                    },
-                    .oid = {
-                        .inode = inode,
-                        .stripe = op_data->oid.stripe | stripe_num,
+                    {
+                        .oid = (object_id){
+                            .inode = inode,
+                            .stripe = op_data->oid.stripe | stripe_num,
+                        },
+                        .version = op_version,
+                        .offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start,
+                        .len = wr ? stripes[stripe_num].write_end - stripes[stripe_num].write_start : stripes[stripe_num].read_end - stripes[stripe_num].read_start,
                    },
-                    .version = op_version,
-                    .offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start,
-                    .len = wr ? stripes[stripe_num].write_end - stripes[stripe_num].write_start : stripes[stripe_num].read_end - stripes[stripe_num].read_start,
                    .buf = wr ? stripes[stripe_num].write_buf : stripes[stripe_num].read_buf,
                    .bitmap = stripes[stripe_num].bmp_buf,
                });
@@ -182,8 +189,6 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
            else
            {
                subop->op_type = OSD_OP_OUT;
-                subop->bitmap = stripes[stripe_num].bmp_buf;
-                subop->bitmap_len = clean_entry_bitmap_size;
                subop->req.sec_rw = {
                    .header = {
                        .magic = SECONDARY_OSD_OP_MAGIC,
@@ -235,11 +240,15 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
                    // Fail it immediately
                    subop->peer_fd = -1;
                    subop->reply.hdr.retval = -EPIPE;
-                    subop->callback(subop);
+                    ringloop->set_immediate([subop]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
                }
            }
            i++;
        }
+        else
+        {
+            stripes[stripe_num].osd_num = 0;
+        }
    }
    return i-subop_idx;
 }
@@ -263,9 +272,11 @@ void osd_t::handle_primary_bs_subop(osd_op_t *subop)
    blockstore_op_t *bs_op = subop->bs_op;
    int expected = bs_op->opcode == BS_OP_READ || bs_op->opcode == BS_OP_WRITE
        || bs_op->opcode == BS_OP_WRITE_STABLE ? bs_op->len : 0;
-    if (bs_op->retval != expected && bs_op->opcode != BS_OP_READ)
+    if (bs_op->retval != expected && bs_op->opcode != BS_OP_READ &&
+        (bs_op->opcode != BS_OP_WRITE && bs_op->opcode != BS_OP_WRITE_STABLE ||
+        bs_op->retval != -ENOSPC))
    {
-        // die
+        // die on any error except ENOSPC
        throw std::runtime_error(
            "local blockstore modification failed (opcode = "+std::to_string(bs_op->opcode)+
            " retval = "+std::to_string(bs_op->retval)+")"
@@ -276,6 +287,8 @@ void osd_t::handle_primary_bs_subop(osd_op_t *subop)
    subop->reply.hdr.retval = bs_op->retval;
    if (bs_op->opcode == BS_OP_READ || bs_op->opcode == BS_OP_WRITE || bs_op->opcode == BS_OP_WRITE_STABLE)
    {
+        subop->req.sec_rw.oid = bs_op->oid;
+        subop->req.sec_rw.version = bs_op->version;
        subop->req.sec_rw.len = bs_op->len;
        subop->reply.sec_rw.version = bs_op->version;
    }
@@ -325,9 +338,11 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
        if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE)
        {
            printf(
-                "%s subop to %lx:%lx v%lu failed on peer %d: retval = %d (expected %d)\n",
+                subop->peer_fd >= 0
+                    ? "%1$s subop to %2$lx:%3$lx v%4$lu failed on peer %7$d: retval = %5$d (expected %6$d)\n"
+                    : "%1$s subop to %2$lx:%3$lx v%4$lu failed locally: retval = %5$d (expected %6$d)\n",
                osd_op_names[opcode], subop->req.sec_rw.oid.inode, subop->req.sec_rw.oid.stripe, subop->req.sec_rw.version,
-                subop->peer_fd, retval, expected
+                retval, expected, subop->peer_fd
            );
        }
        else
@@ -337,19 +352,32 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
                osd_op_names[opcode], subop->peer_fd, retval, expected
            );
        }
-        if (retval == -EPIPE)
+        if (opcode == OSD_OP_SEC_READ && (retval == -EIO || retval == -EDOM))
        {
-            op_data->epipe++;
+            // We'll retry reads from other replica(s) on EIO/EDOM and mark object as corrupted
+            ((osd_rmw_stripe_t*)subop->rmw_buf)->read_error = true;
+        }
+        subop->rmw_buf = NULL;
+        // Error priority: EIO > EDOM > ENOSPC > EPIPE
+        if (op_data->errcode == 0 ||
+            retval == -EIO ||
+            retval == -EDOM && (op_data->errcode == -ENOSPC || op_data->errcode == -EPIPE) ||
+            retval == -ENOSPC && op_data->errcode == -EPIPE)
+        {
+            op_data->errcode = retval;
        }
        op_data->errors++;
-        if (subop->peer_fd >= 0)
+        if (subop->peer_fd >= 0 && retval != -EDOM &&
+            (retval != -ENOSPC || opcode != OSD_OP_SEC_WRITE && opcode != OSD_OP_SEC_WRITE_STABLE) &&
+            (retval != -EIO || opcode != OSD_OP_SEC_READ))
        {
-            // Drop connection on any error
+            // Drop connection on unexpected errors
            msgr.stop_client(subop->peer_fd);
        }
    }
    else
    {
+        subop->rmw_buf = NULL;
        op_data->done++;
        if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE)
        {
@@ -393,6 +421,10 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
        {
            continue_primary_del(cur_op);
        }
+        else if (cur_op->req.hdr.opcode == OSD_OP_SCRUB)
+        {
+            continue_primary_scrub(cur_op);
+        }
        else
        {
            throw std::runtime_error("BUG: unknown opcode");
@@ -408,7 +440,8 @@ void osd_t::cancel_primary_write(osd_op_t *cur_op)
        // are sent to peer OSDs, so we can't just throw them away.
        // Mark them with an extra EPIPE.
        cur_op->op_data->errors++;
-        cur_op->op_data->epipe++;
+        if (cur_op->op_data->errcode == 0)
+            cur_op->op_data->errcode = -EPIPE;
        cur_op->op_data->done--; // Caution: `done` must be signed because may become -1 here
    }
    else
@@ -460,7 +493,7 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
 {
    osd_primary_op_data_t *op_data = cur_op->op_data;
    op_data->n_subops = chunks_to_delete_count;
-    op_data->done = op_data->errors = 0;
+    op_data->done = op_data->errors = op_data->errcode = 0;
    if (!op_data->n_subops)
    {
        return;
@@ -512,7 +545,7 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
                // Fail it immediately
                subops[i].peer_fd = -1;
                subops[i].reply.hdr.retval = -EPIPE;
-                subops[i].callback(&subops[i]);
+                ringloop->set_immediate([subop = &subops[i]]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
            }
        }
    }
@@ -523,7 +556,7 @@ int osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
    osd_primary_op_data_t *op_data = cur_op->op_data;
    int n_osds = op_data->dirty_osd_count;
    osd_op_t *subops = new osd_op_t[n_osds];
-    op_data->done = op_data->errors = 0;
+    op_data->done = op_data->errors = op_data->errcode = 0;
    op_data->n_subops = n_osds;
    op_data->subops = subops;
    std::map<uint64_t, int>::iterator peer_it;
@@ -579,7 +612,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
    osd_primary_op_data_t *op_data = cur_op->op_data;
    int n_osds = op_data->unstable_write_osds->size();
    osd_op_t *subops = new osd_op_t[n_osds];
-    op_data->done = op_data->errors = 0;
+    op_data->done = op_data->errors = op_data->errcode = 0;
    op_data->n_subops = n_osds;
    op_data->subops = subops;
    for (int i = 0; i < n_osds; i++)
@@ -595,7 +628,9 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
                {
                    handle_primary_bs_subop(subop);
                },
-                .len = (uint32_t)stab_osd.len,
+                {
+                    .len = (uint32_t)stab_osd.len,
+                },
                .buf = (void*)(op_data->unstable_writes + stab_osd.start),
            });
            bs->enqueue_op(subops[i].bs_op);
@@ -627,7 +662,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
                // Fail it immediately
                subops[i].peer_fd = -1;
                subops[i].reply.hdr.retval = -EPIPE;
-                subops[i].callback(&subops[i]);
+                ringloop->set_immediate([subop = &subops[i]]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
            }
        }
    }
--- a/src/osd_primary_sync.cpp
+++ b/src/osd_primary_sync.cpp
@@ -240,7 +240,7 @@ resume_8:
    }
    if (op_data->errors > 0)
    {
-        finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
+        finish_op(cur_op, op_data->errcode);
    }
    else
    {
--- a/src/osd_primary_write.cpp
+++ b/src/osd_primary_write.cpp
@@ -58,7 +58,13 @@ resume_1:
    // Determine blocks to read and write
    // Missing chunks are allowed to be overwritten even in incomplete objects
    // FIXME: Allow to do small writes to the old (degraded/misplaced) OSD set for lower performance impact
-    op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
+    op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
+    if (op_data->object_state)
+    {
+        // Protect object_state from being freed by a parallel read operation changing it
+        op_data->object_state->ref_count++;
+    }
+retry_1:
    if (op_data->scheme == POOL_SCHEME_REPLICATED)
    {
        // Simplified algorithm
@@ -68,6 +74,12 @@ resume_1:
        if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
            op_data->stripes[0].write_end != bs_block_size))
        {
+            if (op_data->object_state->state & OBJ_INCOMPLETE)
+            {
+                // Refuse partial overwrite of an incomplete (corrupted) object
+                cur_op->reply.hdr.retval = -EIO;
+                goto continue_others;
+            }
            // Object is degraded/misplaced and will be moved to <write_osd_set>
            op_data->stripes[0].read_start = 0;
            op_data->stripes[0].read_end = bs_block_size;
@@ -81,24 +93,66 @@ resume_1:
        if (!cur_op->rmw_buf)
        {
            // Refuse partial overwrite of an incomplete object
-            cur_op->reply.hdr.retval = -EINVAL;
+            cur_op->reply.hdr.retval = -EIO;
            goto continue_others;
        }
    }
    // Read required blocks
-    submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
+    {
+        if (op_data->object_state && (op_data->object_state->state & OBJ_INCOMPLETE))
+        {
+            // Allow to read version number (just version number!) from corrupted chunks
+            // to allow full overwrite of a corrupted object
+            bool found = false;
+            for (int role = 0; role < op_data->pg_size; role++)
+            {
+                if (op_data->prev_set[role] != 0 || op_data->stripes[role].read_end > op_data->stripes[role].read_start)
+                {
+                    found = true;
+                    break;
+                }
+            }
+            if (!found)
+            {
+                osd_num_t corrupted_target[op_data->pg_size];
+                for (int role = 0; role < op_data->pg_size; role++)
+                {
+                    corrupted_target[role] = 0;
+                }
+                for (auto & loc: op_data->object_state->osd_set)
+                {
+                    if (!(loc.loc_bad & LOC_OUTDATED) && !corrupted_target[loc.role])
+                    {
+                        corrupted_target[loc.role] = loc.osd_num;
+                    }
+                }
+                submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, corrupted_target, cur_op);
+                goto resume_2;
+            }
+        }
+        submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
+    }
 resume_2:
    op_data->st = 2;
    return;
 resume_3:
    if (op_data->errors > 0)
    {
-        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
+        if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
+        {
+            // Mark object corrupted and retry
+            op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, true);
+            op_data->prev_set = op_data->object_state ? op_data->object_state->read_target.data() : pg.cur_set.data();
+            goto retry_1;
+        }
+        deref_object_state(pg, &op_data->object_state, true);
+        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
        return;
    }
    // Check CAS version
    if (cur_op->req.rw.version && op_data->fact_ver != (cur_op->req.rw.version-1))
    {
+        deref_object_state(pg, &op_data->object_state, true);
        cur_op->reply.hdr.retval = -EINTR;
        cur_op->reply.rw.version = op_data->fact_ver;
        goto continue_others;
@@ -155,17 +209,37 @@ resume_3:
    if (pg.epoch > pg.reported_epoch)
    {
        // Report newer epoch before writing
-        // FIXME: We may report only one PG state here...
+        // FIXME: We don't have to report all changed PG states here
        this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
+        if (pg.state != PG_ACTIVE)
+        {
+            // Check that current OSD set is in history and/or add it there
+            std::vector<osd_num_t> history_set;
+            for (auto peer_osd: pg.cur_set)
+                if (peer_osd != 0)
+                    history_set.push_back(peer_osd);
+            std::sort(history_set.begin(), history_set.end());
+            auto it = std::lower_bound(pg.target_history.begin(), pg.target_history.end(), history_set);
+            if (it == pg.target_history.end() || *it != history_set)
+                pg.target_history.insert(it, history_set);
+        }
        pg.history_changed = true;
        report_pg_states();
 resume_10:
        if (pg.epoch > pg.reported_epoch)
        {
-            op_data->st = 10;
+#define PG_EPOCH_WAIT_STATE 10
+            op_data->st = PG_EPOCH_WAIT_STATE;
            return;
        }
    }
+    // Recheck PG state after reporting history - maybe it's already stopping/restarting
+    if (pg.state & (PG_STOPPING|PG_REPEERING))
+    {
+        deref_object_state(pg, &op_data->object_state, true);
+        pg_cancel_write_queue(pg, cur_op, op_data->oid, -EPIPE);
+        return;
+    }
    submit_primary_subops(SUBMIT_WRITE, op_data->target_ver, pg.cur_set.data(), cur_op);
 resume_4:
    op_data->st = 4;
@@ -178,7 +252,8 @@ resume_5:
    }
    if (op_data->errors > 0)
    {
-        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
+        deref_object_state(pg, &op_data->object_state, true);
+        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
        return;
    }
    if (op_data->object_state)
@@ -186,7 +261,7 @@ resume_5:
        // We must forget the unclean state of the object before deleting it
        // so the next reads don't accidentally read a deleted version
        // And it should be done at the same time as the removal of the version override
-        remove_object_from_state(op_data->oid, op_data->object_state, pg);
+        remove_object_from_state(op_data->oid, &op_data->object_state, pg);
        pg.clean_count++;
    }
 resume_6:
@@ -241,12 +316,12 @@ resume_7:
                    copies_to_delete_after_sync_count++;
                }
            }
-            free_object_state(pg, &op_data->object_state);
+            deref_object_state(pg, &op_data->object_state, true);
        }
        else
        {
            submit_primary_del_subops(cur_op, pg.cur_set.data(), pg.pg_size, op_data->object_state->osd_set);
-            free_object_state(pg, &op_data->object_state);
+            deref_object_state(pg, &op_data->object_state, true);
            if (op_data->n_subops > 0)
            {
 resume_8:
@@ -255,7 +330,7 @@ resume_8:
 resume_9:
                if (op_data->errors > 0)
                {
-                    pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
+                    pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
                    return;
                }
            }
@@ -287,6 +362,50 @@ continue_others:
    }
 }

+void osd_t::on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num) // Updates pg.reported_epoch from st_cli's PG config and resumes writes waiting in PG_EPOCH_WAIT_STATE
+{
+    auto pg_it = pgs.find({
+        .pool_id = pool_id,
+        .pg_num = pg_num,
+    });
+    if (pg_it == pgs.end()) // PG is not present in pgs - nothing to do
+    {
+        return;
+    }
+    auto & pg = pg_it->second;
+    if (pg.epoch > pg.reported_epoch && // an epoch report is still pending...
+        st_cli.pool_config[pool_id].pg_config[pg_num].epoch >= pg.epoch) // ...and the epoch in the PG config has reached it
+    {
+        pg.reported_epoch = st_cli.pool_config[pool_id].pg_config[pg_num].epoch;
+        std::vector<object_id> resume_oids; // object IDs of queued writes to resume, collected before any of them is continued
+        for (auto & op: pg.write_queue)
+        {
+            if (op.second->op_data->st == PG_EPOCH_WAIT_STATE) // op returned from continue_primary_write() while waiting for the epoch report
+            {
+                // Run separately to prevent side effects
+                resume_oids.push_back(op.first);
+            }
+        }
+        for (auto & oid: resume_oids)
+        {
+            auto pg_it = pgs.find({ // NOTE(review): PG is looked up again for every oid, presumably because continue_primary_write() may modify pgs - confirm
+                .pool_id = pool_id,
+                .pg_num = pg_num,
+            });
+            if (pg_it != pgs.end())
+            {
+                auto & pg = pg_it->second;
+                auto op_it = pg.write_queue.find(oid);
+                if (op_it != pg.write_queue.end() && // skip oids that are no longer queued...
+                    op_it->second->op_data->st == PG_EPOCH_WAIT_STATE) // ...or no longer waiting for the epoch
+                {
+                    continue_primary_write(op_it->second);
+                }
+            }
+        }
+    }
+}
+
 bool osd_t::remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state)
 {
    osd_primary_op_data_t *op_data = cur_op->op_data;
@@ -337,7 +456,7 @@ resume_7:
            op_data->unstable_write_osds = NULL;
            if (op_data->errors > 0)
            {
-                pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
+                pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
                return false;
            }
        }
--- a/src/osd_rmw.cpp
+++ b/src/osd_rmw.cpp
@@ -154,6 +154,8 @@ struct reed_sol_matrix_t
    int refs = 0;
    int *je_data;
    uint8_t *isal_data;
+    // 32 bytes = 256/8 = max pg_size/8
+    std::map<std::array<uint8_t, 32>, void*> subdata;
    std::map<reed_sol_erased_t, void*> decodings;
 };

@@ -194,6 +196,12 @@ void use_ec(int pg_size, int pg_minsize, bool use)
        free(rs_it->second.je_data);
        if (rs_it->second.isal_data)
            free(rs_it->second.isal_data);
+        for (auto sub_it = rs_it->second.subdata.begin(); sub_it != rs_it->second.subdata.end();)
+        {
+            void *data = sub_it->second;
+            rs_it->second.subdata.erase(sub_it++);
+            free(data);
+        }
        for (auto dec_it = rs_it->second.decodings.begin(); dec_it != rs_it->second.decodings.end();)
        {
            void *data = dec_it->second;
@@ -294,6 +302,47 @@ static void* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size
    return dec_it->second;
 }

+#ifndef WITH_ISAL
+#define JERASURE_ALIGNMENT 16
+
+// Wrapper around jerasure_matrix_encode(): jerasure requires 16-byte alignment for SSE, so unaligned buffers are encoded through aligned temporary copies
+// FIXME: jerasure/gf-complete should probably be patched to automatically choose non-sse version for unaligned buffers
+static void jerasure_matrix_encode_unaligned(int k, int m, int w, int *matrix, char **data_ptrs, char **coding_ptrs, int size)
+{
+    bool unaligned = false; // becomes true if any of the k data or m coding pointers is not a multiple of JERASURE_ALIGNMENT
+    for (int i = 0; i < k; i++)
+        if (((unsigned long)data_ptrs[i]) % JERASURE_ALIGNMENT)
+            unaligned = true;
+    for (int i = 0; i < m; i++)
+        if (((unsigned long)coding_ptrs[i]) % JERASURE_ALIGNMENT)
+            unaligned = true;
+    if (!unaligned)
+    {
+        jerasure_matrix_encode(k, m, w, matrix, data_ptrs, coding_ptrs, size); // everything is aligned: encode in place, no copies
+        return;
+    }
+    int aligned_size = ((size+JERASURE_ALIGNMENT-1)/JERASURE_ALIGNMENT)*JERASURE_ALIGNMENT; // size rounded up to a multiple of JERASURE_ALIGNMENT
+    int copy_size = aligned_size*(k+m); // one aligned slot per data and coding chunk
+    char local_data[copy_size > 4096 ? 0 : copy_size]; // on-stack scratch, used only for copies of up to 4096 bytes
+    char *data_copy = copy_size > 4096 || (unsigned long)local_data % JERASURE_ALIGNMENT // heap fallback when the copy is large or the stack buffer itself is unaligned
+        ? (char*)memalign_or_die(JERASURE_ALIGNMENT, aligned_size*(k+m))
+        : local_data;
+    char *aligned_ptrs[k+m]; // first k entries point to data slots, the next m to coding slots
+    for (int i = 0; i < k; i++)
+    {
+        memcpy(data_copy + i*aligned_size, data_ptrs[i], size); // copy each data chunk into its aligned slot
+        aligned_ptrs[i] = data_copy + i*aligned_size;
+    }
+    for (int i = 0; i < m; i++)
+        aligned_ptrs[k+i] = data_copy + (k+i)*aligned_size; // coding chunks are computed into the scratch buffer
+    jerasure_matrix_encode(k, m, w, matrix, aligned_ptrs, aligned_ptrs+k, size);
+    for (int i = 0; i < m; i++)
+        memcpy(coding_ptrs[i], aligned_ptrs[k+i], size); // copy the computed coding chunks back to the caller's buffers
+    if (copy_size > 4096 || (unsigned long)local_data % JERASURE_ALIGNMENT) // same condition as the allocation above
+        free(data_copy);
+}
+#endif
+
 #ifdef WITH_ISAL
 void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, uint32_t bitmap_size)
 {
@@ -357,10 +406,12 @@ void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsi
    {
        data_ptrs[role] = NULL;
    }
+    bool recovered = false;
    for (int role = 0; role < pg_minsize; role++)
    {
        if (stripes[role].read_end != 0 && stripes[role].missing)
        {
+            recovered = true;
            if (stripes[role].read_end > stripes[role].read_start)
            {
                for (int other = 0; other < pg_size; other++)
@@ -378,18 +429,64 @@ void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsi
                    data_ptrs, data_ptrs+pg_minsize, stripes[role].read_end - stripes[role].read_start
                );
            }
-            for (int other = 0; other < pg_size; other++)
+        }
+    }
+    if (recovered && bitmap_size > 0)
+    {
+        bool unaligned = false;
+        for (int role = 0; role < pg_size; role++)
+        {
+            if (stripes[role].read_end != 0)
            {
-                if (stripes[other].read_end != 0 && !stripes[other].missing)
+                data_ptrs[role] = (char*)stripes[role].bmp_buf;
+                if (((unsigned long)stripes[role].bmp_buf) % JERASURE_ALIGNMENT)
+                    unaligned = true;
+            }
+        }
+        if (!unaligned)
+        {
+            for (int role = 0; role < pg_minsize; role++)
+            {
+                if (stripes[role].read_end != 0 && stripes[role].missing)
                {
-                    data_ptrs[other] = (char*)(stripes[other].bmp_buf);
+                    jerasure_matrix_dotprod(
+                        pg_minsize, OSD_JERASURE_W, decoding_matrix+(role*pg_minsize), dm_ids, role,
+                        data_ptrs, data_ptrs+pg_minsize, bitmap_size
+                    );
                }
            }
-            data_ptrs[role] = (char*)stripes[role].bmp_buf;
-            jerasure_matrix_dotprod(
-                pg_minsize, OSD_JERASURE_W, decoding_matrix+(role*pg_minsize), dm_ids, role,
-                data_ptrs, data_ptrs+pg_minsize, bitmap_size
-            );
+        }
+        else
+        {
+            // jerasure_matrix_dotprod requires 16-byte alignment for SSE...
+            int aligned_size = ((bitmap_size+JERASURE_ALIGNMENT-1)/JERASURE_ALIGNMENT)*JERASURE_ALIGNMENT;
+            int copy_size = aligned_size*pg_size;
+            char local_data[copy_size > 4096 ? 0 : copy_size];
+            bool alloc_copy = copy_size > 4096 || (unsigned long)local_data % JERASURE_ALIGNMENT;
+            char *data_copy = alloc_copy
+                ? (char*)memalign_or_die(JERASURE_ALIGNMENT, copy_size)
+                : local_data;
+            for (int role = 0; role < pg_size; role++)
+            {
+                if (stripes[role].read_end != 0)
+                {
+                    data_ptrs[role] = data_copy + role*aligned_size;
+                    memcpy(data_ptrs[role], stripes[role].bmp_buf, bitmap_size);
+                }
+            }
+            for (int role = 0; role < pg_size; role++)
+            {
+                if (stripes[role].read_end != 0 && stripes[role].missing)
+                {
+                    jerasure_matrix_dotprod(
+                        pg_minsize, OSD_JERASURE_W, decoding_matrix+(role*pg_minsize), dm_ids, role,
+                        data_ptrs, data_ptrs+pg_minsize, bitmap_size
+                    );
+                    memcpy(stripes[role].bmp_buf, data_ptrs[role], bitmap_size);
+                }
+            }
+            if (alloc_copy)
+                free(data_copy);
        }
    }
 }
@@ -662,7 +759,18 @@ static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int
    uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t bitmap_granularity,
    uint32_t &start, uint32_t &end)
 {
-    if (write_osd_set[pg_minsize] != 0 || write_osd_set != read_osd_set)
+    bool required = false;
+    for (int role = pg_minsize; role < pg_size; role++)
+    {
+        if (write_osd_set[role] != 0)
+        {
+            // Whole parity chunk is needed when we move the object
+            if (write_osd_set[role] != read_osd_set[role])
+                end = chunk_size;
+            required = true;
+        }
+    }
+    if (required && end != chunk_size)
    {
        // start & end are required for calc_rmw_parity
        for (int role = 0; role < pg_minsize; role++)
@@ -673,14 +781,6 @@ static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int
                end = std::max(stripes[role].req_end, end);
            }
        }
-        for (int role = pg_minsize; role < pg_size; role++)
-        {
-            if (write_osd_set[role] != 0 && write_osd_set[role] != read_osd_set[role])
-            {
-                start = 0;
-                end = chunk_size;
-            }
-        }
    }
    // Set bitmap bits accordingly
    if (bitmap_granularity > 0)
@@ -808,11 +908,56 @@ void calc_rmw_parity_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
    if (end != 0)
    {
        int write_parity = 0;
-        for (int i = pg_minsize; i < pg_size; i++)
+        bool is_seq = true;
+        for (int i = pg_size-1; i >= pg_minsize; i--)
+        {
            if (write_osd_set[i] != 0)
                write_parity++;
+            else if (write_parity != 0)
+                is_seq = false;
+        }
        if (write_parity > 0)
        {
+            // First get the coding matrix or sub-matrix
+            void *matrix_data =
+#ifdef WITH_ISAL
+                matrix->isal_data;
+#else
+                matrix->je_data;
+#endif
+            if (!is_seq)
+            {
+                // We need a coding sub-matrix
+                std::array<uint8_t, 32> missing_parity = {};
+                for (int i = pg_minsize; i < pg_size; i++)
+                {
+                    if (!write_osd_set[i])
+                        missing_parity[(i-pg_minsize) >> 3] |= (1 << ((i-pg_minsize) & 0x7));
+                }
+                auto sub_it = matrix->subdata.find(missing_parity);
+                if (sub_it == matrix->subdata.end())
+                {
+                    int item_size =
+#ifdef WITH_ISAL
+                        32;
+#else
+                        sizeof(int);
+#endif
+                    void *subm = malloc_or_die(item_size * write_parity * pg_minsize);
+                    for (int i = pg_minsize, j = 0; i < pg_size; i++)
+                    {
+                        if (write_osd_set[i])
+                        {
+                            memcpy((uint8_t*)subm + item_size*pg_minsize*j, (uint8_t*)matrix_data + item_size*pg_minsize*(i-pg_minsize), item_size*pg_minsize);
+                            j++;
+                        }
+                    }
+                    matrix->subdata[missing_parity] = subm;
+                    matrix_data = subm;
+                }
+                else
+                    matrix_data = sub_it->second;
+            }
            // Calculate new coding chunks
            buf_len_t bufs[pg_size][3];
            int nbuf[pg_size], curbuf[pg_size];
@@ -841,13 +986,13 @@ void calc_rmw_parity_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
            while (pos < end)
            {
                uint32_t next_end = end;
-                for (int i = 0; i < pg_size; i++)
+                for (int i = 0, j = 0; i < pg_size; i++)
                {
                    if (i < pg_minsize || write_osd_set[i] != 0)
                    {
                        assert(curbuf[i] < nbuf[i]);
                        assert(bufs[i][curbuf[i]].buf);
-                        data_ptrs[i] = (uint8_t*)bufs[i][curbuf[i]].buf + pos-positions[i];
+                        data_ptrs[j++] = (uint8_t*)bufs[i][curbuf[i]].buf + pos-positions[i];
                        uint32_t this_end = bufs[i][curbuf[i]].len + positions[i];
                        if (next_end > this_end)
                            next_end = this_end;
@@ -868,32 +1013,30 @@ void calc_rmw_parity_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
                }
 #ifdef WITH_ISAL
                ec_encode_data(
-                    next_end-pos, pg_minsize, write_parity, matrix->isal_data,
+                    next_end-pos, pg_minsize, write_parity, (uint8_t*)matrix_data,
                    (uint8_t**)data_ptrs, (uint8_t**)data_ptrs+pg_minsize
                );
 #else
                jerasure_matrix_encode(
-                    pg_minsize, write_parity, OSD_JERASURE_W, matrix->je_data,
+                    pg_minsize, write_parity, OSD_JERASURE_W, (int*)matrix_data,
                    (char**)data_ptrs, (char**)data_ptrs+pg_minsize, next_end-pos
                );
 #endif
                pos = next_end;
            }
-            for (int i = 0; i < pg_size; i++)
+            for (int i = 0, j = 0; i < pg_size; i++)
            {
                if (i < pg_minsize || write_osd_set[i] != 0)
-                {
-                    data_ptrs[i] = stripes[i].bmp_buf;
-                }
+                    data_ptrs[j++] = stripes[i].bmp_buf;
            }
 #ifdef WITH_ISAL
            ec_encode_data(
-                bitmap_size, pg_minsize, write_parity, matrix->isal_data,
+                bitmap_size, pg_minsize, write_parity, (uint8_t*)matrix_data,
                (uint8_t**)data_ptrs, (uint8_t**)data_ptrs+pg_minsize
            );
 #else
-            jerasure_matrix_encode(
-                pg_minsize, write_parity, OSD_JERASURE_W, matrix->je_data,
+            jerasure_matrix_encode_unaligned(
+                pg_minsize, write_parity, OSD_JERASURE_W, (int*)matrix_data,
                (char**)data_ptrs, (char**)data_ptrs+pg_minsize, bitmap_size
            );
 #endif
--- a/src/osd_rmw.h
+++ b/src/osd_rmw.h
@@ -25,7 +25,9 @@ struct osd_rmw_stripe_t
    uint32_t req_start, req_end;
    uint32_t read_start, read_end;
    uint32_t write_start, write_end;
-    bool missing;
+    osd_num_t osd_num;
+    bool missing: 1;
+    bool read_error: 1;
 };

 // Here pg_minsize is the number of data chunks, not the minimum number of alive OSDs for the PG to operate
--- a/src/osd_rmw_test.cpp
+++ b/src/osd_rmw_test.cpp
@@ -3,6 +3,10 @@

 #define RMW_DEBUG

+#ifdef NO_ISAL
+#undef WITH_ISAL
+#endif
+
 #include <string.h>
 #include "osd_rmw.cpp"
 #include "test_pattern.h"
@@ -20,7 +24,8 @@ void test11();
 void test12();
 void test13();
 void test14();
-void test15();
+void test15(bool second);
+void test16();

 int main(int narg, char *args[])
 {
@@ -49,7 +54,10 @@ int main(int narg, char *args[])
    // Test 14
    test14();
    // Test 15
-    test15();
+    test15(false);
+    test15(true);
+    // Test 16
+    test16();
    // End
    printf("all ok\n");
    return 0;
@@ -819,12 +827,11 @@ void test14()

 ***/

-void test15()
+void test15(bool second)
 {
    const int bmp = 64*1024 / 4096 / 8;
    use_ec(4, 2, true);
-    osd_num_t osd_set[4] = { 1, 2, 3, 0 };
-    osd_num_t write_osd_set[4] = { 1, 2, 3, 0 };
+    osd_num_t osd_set[4] = { 1, 2, (osd_num_t)(second ? 0 : 3), (osd_num_t)(second ? 4 : 0) };
    osd_rmw_stripe_t stripes[4] = {};
    unsigned bitmaps[4] = { 0 };
    // Test 15.0
@@ -835,7 +842,7 @@ void test15()
    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
    assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
    // Test 15.1
-    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 3, write_osd_set, 64*1024, bmp);
+    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 3, osd_set, 64*1024, bmp);
    for (int i = 0; i < 4; i++)
        stripes[i].bmp_buf = bitmaps+i;
    assert(rmw_buf);
@@ -845,34 +852,139 @@ void test15()
    assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
    assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
    assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
-    assert(stripes[2].write_start == 28*1024 && stripes[2].write_end == 32*1024);
-    assert(stripes[3].write_start == 0 && stripes[3].write_end == 0);
+    assert(stripes[2+second].write_start == 28*1024 && stripes[2+second].write_end == 32*1024);
+    assert(stripes[3-second].write_start == 0 && stripes[3-second].write_end == 0);
    assert(stripes[0].read_buf == (uint8_t*)rmw_buf+4*1024);
    assert(stripes[1].read_buf == NULL);
    assert(stripes[2].read_buf == NULL);
    assert(stripes[3].read_buf == NULL);
    assert(stripes[0].write_buf == NULL);
    assert(stripes[1].write_buf == (uint8_t*)write_buf);
-    assert(stripes[2].write_buf == rmw_buf);
-    assert(stripes[3].write_buf == NULL);
+    assert(stripes[2+second].write_buf == rmw_buf);
+    assert(stripes[3-second].write_buf == NULL);
    // Test 15.2 - encode
    set_pattern(write_buf, 4*1024, PATTERN1);
    set_pattern(stripes[0].read_buf, 4*1024, PATTERN2);
    memset(stripes[0].bmp_buf, 0, bmp);
    memset(stripes[1].bmp_buf, 0, bmp);
-    calc_rmw_parity_ec(stripes, 4, 2, osd_set, write_osd_set, 64*1024, bmp);
-    assert(*(uint32_t*)stripes[2].bmp_buf == 0x80);
+    memset(stripes[2+second].write_buf, 0, 4096);
+    calc_rmw_parity_ec(stripes, 4, 2, osd_set, osd_set, 64*1024, bmp);
+    assert(second || *(uint32_t*)stripes[2].bmp_buf == 0x80);
    assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
    assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
-    assert(stripes[2].write_start == 28*1024 && stripes[2].write_end == 32*1024);
-    assert(stripes[3].write_start == 0 && stripes[3].write_end == 0);
+    assert(stripes[2+second].write_start == 28*1024 && stripes[2+second].write_end == 32*1024);
+    assert(stripes[3-second].write_start == 0 && stripes[3-second].write_end == 0);
    assert(stripes[0].write_buf == NULL);
    assert(stripes[1].write_buf == (uint8_t*)write_buf);
-    assert(stripes[2].write_buf == rmw_buf);
-    assert(stripes[3].write_buf == NULL);
-    check_pattern(stripes[2].write_buf, 4*1024, PATTERN1^PATTERN2); // first parity is always xor :)
+    assert(stripes[2+second].write_buf == rmw_buf);
+    assert(stripes[3-second].write_buf == NULL);
+    // first parity is always xor :), second isn't...
+    check_pattern(stripes[2+second].write_buf, 4*1024, second ? 0xb79a59a0ce8b9b81 : PATTERN1^PATTERN2);
    // Done
    free(rmw_buf);
    free(write_buf);
-    use_ec(3, 2, false);
+    use_ec(4, 2, false);
+}
+
+/***
+
+16. EC 2+2: write only the second parity block while the first one is missing, then decode data back using it
+   calc_rmw(offset=0, len=0, osd_set=[1,2,0,0], write_set=[1,2,0,3])
+   = {
+     read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 0 ], [ 0, 0 ] ],
+     write: [ [ 0, 0 ], [ 0, 0 ], [ 0, 0 ], [ 0, 128K ] ],
+     input buffer: [],
+     rmw buffer: [ write3, read0, read1 ],
+   }
+
+***/
+
+void test16()
+{
+    const int bmp = 128*1024 / 4096 / 8; // bitmap size in bytes: one bit per 4096 bytes of a 128K chunk
+    use_ec(4, 2, true);
+    osd_num_t osd_set[4] = { 1, 2, 0, 0 }; // both parity chunks are absent before the write
+    osd_num_t write_osd_set[4] = { 1, 2, 0, 3 }; // only the second parity chunk (role 3) gets written
+    osd_rmw_stripe_t stripes[4] = {};
+    unsigned bitmaps[4] = { 0 };
+    // Test 16.0 - zero-length request: no stripe gets a requested range
+    void *write_buf = NULL;
+    split_stripes(2, 128*1024, 0, 0, stripes);
+    assert(stripes[0].req_start == 0 && stripes[0].req_end == 0);
+    assert(stripes[1].req_start == 0 && stripes[1].req_end == 0);
+    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
+    assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
+    // Test 16.1 - plan the RMW: both data chunks are read in full, only parity chunk 3 is written
+    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 3, write_osd_set, 128*1024, bmp);
+    for (int i = 0; i < 4; i++)
+        stripes[i].bmp_buf = bitmaps+i;
+    assert(rmw_buf);
+    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
+    assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
+    assert(stripes[2].read_start == 0 && stripes[2].read_end == 0);
+    assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
+    assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 0);
+    assert(stripes[3].write_start == 0 && stripes[3].write_end == 128*1024);
+    assert(stripes[0].read_buf == (uint8_t*)rmw_buf+128*1024);
+    assert(stripes[1].read_buf == (uint8_t*)rmw_buf+256*1024);
+    assert(stripes[2].read_buf == NULL);
+    assert(stripes[3].read_buf == NULL);
+    assert(stripes[0].write_buf == NULL);
+    assert(stripes[1].write_buf == NULL);
+    assert(stripes[2].write_buf == NULL);
+    assert(stripes[3].write_buf == rmw_buf);
+    // Test 16.2 - encode: only parity chunk 3 must be calculated, chunk 2 must stay untouched
+    set_pattern(stripes[0].read_buf, 128*1024, PATTERN1);
+    set_pattern(stripes[1].read_buf, 128*1024, PATTERN2);
+    memset(stripes[0].bmp_buf, 0xff, bmp);
+    memset(stripes[1].bmp_buf, 0xff, bmp);
+    calc_rmw_parity_ec(stripes, 4, 2, osd_set, write_osd_set, 128*1024, bmp);
+    assert(*(uint32_t*)stripes[2].bmp_buf == 0);
+    assert(*(uint32_t*)stripes[3].bmp_buf == 0xF1F1F1F1);
+    assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 0);
+    assert(stripes[3].write_start == 0 && stripes[3].write_end == 128*1024);
+    assert(stripes[0].write_buf == NULL);
+    assert(stripes[1].write_buf == NULL);
+    assert(stripes[2].write_buf == NULL);
+    assert(stripes[3].write_buf == rmw_buf);
+    check_pattern(stripes[3].write_buf, 128*1024, 0x7eb9ae9cd8e652c3); // 2nd EC chunk
+    // Test 16.3 - decode and verify: restore data chunk 0 from data chunk 1 and parity chunk 3
+    osd_num_t read_osd_set[4] = { 0, 2, 0, 3 };
+    memset(stripes, 0, sizeof(stripes));
+    split_stripes(2, 128*1024, 0, 256*1024, stripes);
+    assert(stripes[0].req_start == 0 && stripes[0].req_end == 128*1024);
+    assert(stripes[1].req_start == 0 && stripes[1].req_end == 128*1024);
+    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
+    assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
+    for (int role = 0; role < 4; role++)
+    {
+        stripes[role].read_start = stripes[role].req_start;
+        stripes[role].read_end = stripes[role].req_end;
+    }
+    assert(extend_missing_stripes(stripes, read_osd_set, 2, 4) == 0);
+    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
+    assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
+    assert(stripes[2].read_start == 0 && stripes[2].read_end == 0);
+    assert(stripes[3].read_start == 0 && stripes[3].read_end == 128*1024);
+    void *read_buf = alloc_read_buffer(stripes, 4, 0);
+    for (int i = 0; i < 4; i++)
+        stripes[i].bmp_buf = bitmaps+i;
+    assert(read_buf);
+    assert(stripes[0].read_buf == read_buf);
+    assert(stripes[1].read_buf == (uint8_t*)read_buf+128*1024);
+    assert(stripes[3].read_buf == (uint8_t*)read_buf+2*128*1024);
+    set_pattern(stripes[1].read_buf, 128*1024, PATTERN2);
+    memcpy(stripes[3].read_buf, rmw_buf, 128*1024); // feed the parity chunk encoded in 16.2 to the decoder
+    reconstruct_stripes_ec(stripes, 4, 2, bmp);
+    assert(bitmaps[0] == 0xFFFFFFFF);
+    check_pattern(stripes[0].read_buf, 128*1024, PATTERN1);
+    free(read_buf);
+    // Done - release buffers and switch EC off again
+    free(rmw_buf);
+    free(write_buf);
+    use_ec(4, 2, false);
 }
--- a/src/osd_scrub.cpp
+++ b/src/osd_scrub.cpp
@@ -0,0 +1,531 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+
+#include "osd_primary.h"
+
+#define SELF_FD -1
+// Request a chunk of the object listing of PG <pg_id> from OSD <role_osd>, starting at <min_oid> (zero = start of the pool)
+void osd_t::scrub_list(pool_pg_num_t pg_id, osd_num_t role_osd, object_id min_oid)
+{
+    pool_id_t pool_id = pg_id.pool_id;
+    pg_num_t pg_num = pg_id.pg_num;
+    assert(!scrub_list_op); // only one listing request may be in flight at a time
+    if (role_osd == this->osd_num)
+    {
+        // Self: list objects with a local BS_OP_LIST operation
+        osd_op_t *op = new osd_op_t();
+        op->op_type = 0;
+        op->peer_fd = SELF_FD;
+        clock_gettime(CLOCK_REALTIME, &op->tv_begin);
+        op->bs_op = new blockstore_op_t();
+        op->bs_op->opcode = BS_OP_LIST;
+        op->bs_op->pg_alignment = st_cli.pool_config[pool_id].pg_stripe_size;
+        if (min_oid.inode != 0 || min_oid.stripe != 0)
+            op->bs_op->min_oid = min_oid;
+        else
+            op->bs_op->min_oid.inode = ((uint64_t)pool_id << (64 - POOL_ID_BITS)); // pool ID goes into the top POOL_ID_BITS bits
+        op->bs_op->max_oid.inode = ((uint64_t)(pool_id+1) << (64 - POOL_ID_BITS)) - 1; // last inode number with this pool ID
+        op->bs_op->max_oid.stripe = UINT64_MAX;
+        op->bs_op->list_stable_limit = scrub_list_limit;
+        op->bs_op->pg_count = pg_counts[pool_id];
+        op->bs_op->pg_number = pg_num-1; // presumably zero-based on the blockstore side - confirm
+        op->bs_op->callback = [this, op](blockstore_op_t *bs_op)
+        {
+            scrub_list_op = NULL;
+            if (op->bs_op->retval < 0)
+            {
+                printf("Local OP_LIST failed: retval=%d\n", op->bs_op->retval);
+                force_stop(1);
+                return;
+            }
+            add_bs_subop_stats(op);
+            scrub_cur_list = {
+                .buf = (obj_ver_id*)op->bs_op->buf,
+                .total_count = (uint64_t)op->bs_op->retval,
+                .stable_count = op->bs_op->version, // the stable entry count is taken from bs_op->version
+            };
+            delete op->bs_op;
+            op->bs_op = NULL;
+            delete op;
+            continue_scrub();
+        };
+        scrub_list_op = op;
+        bs->enqueue_op(op->bs_op);
+    }
+    else
+    {
+        // Peer: send an OSD_OP_SEC_LIST request to the other OSD
+        osd_op_t *op = new osd_op_t();
+        op->op_type = OSD_OP_OUT;
+        op->peer_fd = msgr.osd_peer_fds.at(role_osd); // .at() throws if there is no entry for this OSD
+        op->req = (osd_any_op_t){
+            .sec_list = {
+                .header = {
+                    .magic = SECONDARY_OSD_OP_MAGIC,
+                    .id = msgr.next_subop_id++,
+                    .opcode = OSD_OP_SEC_LIST,
+                },
+                .list_pg = pg_num,
+                .pg_count = pg_counts[pool_id],
+                .pg_stripe_size = st_cli.pool_config[pool_id].pg_stripe_size,
+                .min_inode = min_oid.inode ? min_oid.inode : ((uint64_t)(pool_id) << (64 - POOL_ID_BITS)),
+                .max_inode = ((uint64_t)(pool_id+1) << (64 - POOL_ID_BITS)) - 1,
+                .min_stripe = min_oid.stripe,
+                .stable_limit = scrub_list_limit,
+            },
+        };
+        op->callback = [this, role_osd](osd_op_t *op)
+        {
+            scrub_list_op = NULL;
+            if (op->reply.hdr.retval < 0)
+            {
+                printf("Failed to get object list from OSD %lu (retval=%ld), disconnecting peer\n", role_osd, op->reply.hdr.retval);
+                int fail_fd = op->peer_fd; // remember the FD, <op> is deleted before it's used
+                delete op;
+                msgr.stop_client(fail_fd);
+                return; // note: continue_scrub() isn't called on this path
+            }
+            scrub_cur_list = {
+                .buf = (obj_ver_id*)op->buf,
+                .total_count = (uint64_t)op->reply.hdr.retval,
+                .stable_count = op->reply.sec_list.stable_count,
+            };
+            // set op->buf to NULL so it doesn't get freed - the buffer is now referenced by scrub_cur_list
+            op->buf = NULL;
+            delete op;
+            continue_scrub();
+        };
+        scrub_list_op = op;
+        msgr.outbox_push(op);
+    }
+}
+// Pick the next object to scrub into <next_oid>. Returns true when an object is picked or when only a listing is started (<next_oid> isn't set then), false when there's nothing to scrub
+bool osd_t::pick_next_scrub(object_id & next_oid)
+{
+    if (!pgs.size())
+    {
+        if (scrub_cur_list.buf)
+        {
+            free(scrub_cur_list.buf);
+            scrub_cur_list = {};
+            scrub_last_pg = {};
+        }
+        return false;
+    }
+    timespec tv_now;
+    clock_gettime(CLOCK_REALTIME, &tv_now);
+    bool rescan = scrub_last_pg.pool_id != 0 || scrub_last_pg.pg_num != 0; // true if we don't start from the very first PG
+    // Restart scanning from the same PG as the last time
+    auto pg_it = pgs.lower_bound(scrub_last_pg);
+    while (pg_it != pgs.end())
+    {
+        if (pg_it->second.state & PG_ACTIVE)
+        {
+            auto & pool_cfg = st_cli.pool_config.at(pg_it->first.pool_id);
+            auto interval = pool_cfg.scrub_interval ? pool_cfg.scrub_interval : global_scrub_interval; // pool setting overrides the global one
+            if (pg_it->second.scrub_ts < tv_now.tv_sec-interval)
+            {
+                // Continue scrubbing from the next object
+                if (scrub_last_pg == pg_it->first)
+                {
+                    while (scrub_list_pos < scrub_cur_list.total_count)
+                    {
+                        auto oid = scrub_cur_list.buf[scrub_list_pos].oid;
+                        oid.stripe &= ~STRIPE_MASK;
+                        scrub_list_pos++;
+                        if (recovery_ops.find(oid) == recovery_ops.end() &&
+                            scrub_ops.find(oid) == scrub_ops.end()) // skip objects which are being recovered or scrubbed right now
+                        {
+                            next_oid = oid;
+                            if (!(pg_it->second.state & PG_SCRUBBING))
+                            {
+                                // Currently scrubbing this PG
+                                pg_it->second.state = pg_it->second.state | PG_SCRUBBING;
+                                report_pg_state(pg_it->second);
+                            }
+                            return true;
+                        }
+                    }
+                }
+                if (scrub_last_pg == pg_it->first &&
+                    scrub_cur_list.total_count && scrub_list_pos >= scrub_cur_list.total_count &&
+                    scrub_cur_list.stable_count < scrub_list_limit)
+                {
+                    // End of the list, mark this PG as scrubbed and go to the next PG
+                }
+                else
+                {
+                    // Continue listing. NOTE(review): an empty listing (total_count == 0) also lands here and is requested again - confirm
+                    // A zero OID lists the PG from its start. When continuing the same PG,
+                    // resume right after the last stable object of the previous chunk
+                    object_id scrub_last_oid = {};
+                    if (scrub_last_pg == pg_it->first && scrub_cur_list.stable_count > 0)
+                    {
+                        scrub_last_oid = scrub_cur_list.buf[scrub_cur_list.stable_count-1].oid;
+                        scrub_last_oid.stripe++;
+                    }
+                    osd_num_t scrub_osd = 0;
+                    for (osd_num_t pg_osd: pg_it->second.cur_set) // prefer listing from ourselves, else take the first OSD of the set
+                    {
+                        if (pg_osd == this->osd_num || scrub_osd == 0)
+                            scrub_osd = pg_osd;
+                    }
+                    if (!(pg_it->second.state & PG_SCRUBBING))
+                    {
+                        // Currently scrubbing this PG
+                        pg_it->second.state = pg_it->second.state | PG_SCRUBBING;
+                        report_pg_state(pg_it->second);
+                    }
+                    // Drop the previous chunk (free(NULL) is a no-op) and rewind the position:
+                    // the next chunk is consumed from its first entry. scrub_last_oid must be
+                    // left intact here - it is the continuation point passed to scrub_list()
+                    free(scrub_cur_list.buf);
+                    scrub_cur_list = {};
+                    scrub_list_pos = 0;
+                    scrub_last_pg = pg_it->first;
+                    scrub_list(pg_it->first, scrub_osd, scrub_last_oid);
+                    return true;
+                }
+            }
+            if (pg_it->second.state & PG_SCRUBBING)
+            {
+                pg_it->second.scrub_ts = tv_now.tv_sec; // the PG is done: remember when and plan its next scrub
+                pg_it->second.state = pg_it->second.state & ~PG_SCRUBBING;
+                pg_it->second.history_changed = true;
+                report_pg_state(pg_it->second);
+                schedule_scrub(pg_it->second);
+            }
+            // The list is definitely not needed anymore
+            if (scrub_cur_list.buf)
+            {
+                free(scrub_cur_list.buf);
+                scrub_cur_list = {};
+            }
+        }
+        pg_it++;
+        if (pg_it == pgs.end() && rescan)
+        {
+            // Scan one more time to guarantee that there are no PGs to scrub
+            pg_it = pgs.begin();
+            rescan = false;
+        }
+    }
+    // Scanned all PGs - no more scrubs to do
+    return false;
+}
+// Start scrubbing object <oid>: build an OSD_OP_SCRUB request, track it in scrub_ops and pass it to exec_op()
+void osd_t::submit_scrub_op(object_id oid)
+{
+    osd_op_t *scrub_op = new osd_op_t();
+    scrub_op->op_type = OSD_OP_OUT;
+    scrub_op->req = (osd_any_op_t){
+        .rw = {
+            .header = {
+                .magic = SECONDARY_OSD_OP_MAGIC,
+                .id = 1,
+                .opcode = OSD_OP_SCRUB,
+            },
+            .inode = oid.inode,
+            .offset = oid.stripe,
+            .len = 0,
+        },
+    };
+    if (log_level > 2)
+    {
+        printf("Submitting scrub for %lx:%lx\n", oid.inode, oid.stripe);
+    }
+    scrub_op->callback = [this](osd_op_t *done_op)
+    {
+        // The object ID is restored from the request itself
+        object_id done_oid = { .inode = done_op->req.rw.inode, .stripe = done_op->req.rw.offset };
+        auto retval = done_op->reply.hdr.retval;
+        bool failed = retval < 0 && retval != -ENOENT;
+        if (failed)
+        {
+            // Scrub error (-ENOENT isn't reported as one)
+            printf(
+                "Scrub failed with object %lx:%lx (PG %u/%u): error %ld\n",
+                done_oid.inode, done_oid.stripe, INODE_POOL(done_oid.inode),
+                map_to_pg(done_oid, st_cli.pool_config.at(INODE_POOL(done_oid.inode)).pg_stripe_size),
+                retval
+            );
+        }
+        if (!failed && log_level > 2)
+        {
+            printf("Scrubbed %lx:%lx OK\n", done_oid.inode, done_oid.stripe);
+        }
+        delete done_op;
+        // Untrack the object and go on with the next one, after a pause if scrub_sleep_ms is set
+        auto finish = [this, done_oid]()
+        {
+            scrub_ops.erase(done_oid);
+            continue_scrub();
+        };
+        if (scrub_sleep_ms)
+            this->tfd->set_timer(scrub_sleep_ms, false, [finish](int timer_id) { finish(); });
+        else
+            finish();
+    };
+    scrub_ops[oid] = scrub_op;
+    exec_op(scrub_op);
+}
+
+// Triggers scrub requests: keeps up to scrub_queue_depth object scrubs in flight
+// Scrub reads data from all replicas and compares it; to scrub we first need object listings
+// Returns false only when pick_next_scrub() reports that there is nothing (more) to scrub
+bool osd_t::continue_scrub()
+{
+    if (scrub_list_op) // a listing is in flight; its callback calls continue_scrub() again
+        return true;
+    while (scrub_ops.size() < scrub_queue_depth)
+    {
+        object_id oid = {};
+        if (!pick_next_scrub(oid))
+            return false;
+        // true is also returned when only a listing was started: <oid> isn't filled then, so don't submit it
+        if (scrub_list_op)
+            return true;
+        submit_scrub_op(oid);
+    }
+    return true;
+}
+// Schedule the wakeup for scrubbing <pg>: raise OSD_SCRUBBING at once if the PG is overdue, otherwise arm a timer
+void osd_t::schedule_scrub(pg_t & pg)
+{
+    auto & cfg = st_cli.pool_config.at(pg.pool_id);
+    auto period = cfg.scrub_interval ? cfg.scrub_interval : global_scrub_interval; // pool setting overrides the global one
+    // A wakeup which is not later than this PG's due time is already scheduled - keep it
+    if (scrub_nearest_ts && !(scrub_nearest_ts > pg.scrub_ts+period))
+        return;
+    scrub_nearest_ts = pg.scrub_ts+period;
+    timespec now;
+    clock_gettime(CLOCK_REALTIME, &now);
+    if (scrub_timer_id >= 0)
+    {
+        // Replace the previously armed timer
+        tfd->clear_timer(scrub_timer_id);
+        scrub_timer_id = -1;
+    }
+    if (!(now.tv_sec > scrub_nearest_ts))
+    {
+        // Not due yet: wake up when the remaining time (converted to milliseconds) passes
+        scrub_timer_id = tfd->set_timer((scrub_nearest_ts-now.tv_sec)*1000, false, [this](int timer_id)
+        {
+            scrub_timer_id = -1;
+            scrub_nearest_ts = 0;
+            peering_state = peering_state | OSD_SCRUBBING;
+            ringloop->wakeup();
+        });
+        return;
+    }
+    scrub_nearest_ts = 0; // already overdue: raise OSD_SCRUBBING and wake the ring loop up right now
+    peering_state = peering_state | OSD_SCRUBBING;
+    ringloop->wakeup();
+}
+// Scrub one object: read all its available copies/chunks, compare them (replicas by voting, EC/XOR by re-encoding parity) and mark bad ones as corrupted
+void osd_t::continue_primary_scrub(osd_op_t *cur_op)
+{
+    if (!cur_op->op_data && !prepare_primary_rw(cur_op))
+        return;
+    osd_primary_op_data_t *op_data = cur_op->op_data;
+    if (op_data->st == 1) // st is the resume point: 1 = reads are submitted and still running
+        goto resume_1;
+    else if (op_data->st == 2) // 2 = presumably set by the subop completion code when all reads finish - confirm
+        goto resume_2;
+    {
+        auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
+        // Determine version
+        auto vo_it = pg.ver_override.find(op_data->oid);
+        op_data->target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
+        // PG may have degraded or misplaced objects
+        op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
+        // Read all available chunks
+        int n_copies = 0;
+        op_data->degraded = false;
+        for (int role = 0; role < op_data->pg_size; role++)
+        {
+            op_data->stripes[role].read_start = 0;
+            op_data->stripes[role].read_end = bs_block_size;
+            if (op_data->prev_set[role] != 0)
+            {
+                n_copies++;
+            }
+            else if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size)
+            {
+                op_data->degraded = true; // a data chunk is missing, it has to be reconstructed before comparing
+            }
+        }
+        if (n_copies <= op_data->pg_data_size)
+        {
+            // Nothing to compare, even if we'd like to
+            finish_op(cur_op, 0);
+            return;
+        }
+        cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_size,
+            op_data->scheme != POOL_SCHEME_REPLICATED ? bs_block_size*(op_data->pg_size-op_data->pg_data_size) : 0); // extra room for recalculated parity
+        // Submit reads
+        osd_op_t *subops = new osd_op_t[n_copies];
+        op_data->fact_ver = 0;
+        op_data->done = op_data->errors = op_data->errcode = 0;
+        op_data->n_subops = n_copies;
+        op_data->subops = subops;
+        int sent = submit_primary_subop_batch(SUBMIT_SCRUB_READ, op_data->oid.inode, op_data->target_ver,
+            op_data->stripes, op_data->prev_set, cur_op, 0, -1);
+        assert(sent == n_copies);
+        op_data->st = 1;
+    }
+resume_1:
+    return;
+resume_2:
+    if (op_data->errors > 0)
+    {
+        if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
+        {
+            // I/O or checksum error
+            int n_copies = 0;
+            for (int role = 0; role < op_data->pg_size; role++)
+            {
+                if (op_data->stripes[role].read_end != 0 &&
+                    !op_data->stripes[role].read_error)
+                {
+                    n_copies++;
+                }
+            }
+            if (n_copies <= op_data->pg_data_size)
+            {
+                // Nothing to compare, just mark the object as corrupted
+                auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
+                // FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
+                op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false);
+                // Operation is treated as unsuccessful only if the object becomes unreadable
+                finish_op(cur_op, n_copies < op_data->pg_data_size ? op_data->errcode : 0);
+                return;
+            }
+            // Proceed, we can still compare chunks that were successfully read
+        }
+        else
+        {
+            finish_op(cur_op, op_data->errcode);
+            return;
+        }
+    }
+    if (op_data->scheme == POOL_SCHEME_REPLICATED)
+    {
+        // Check that all chunks have returned the same data
+        int total = 0;
+        int eq_to[op_data->pg_size]; // eq_to[role] = first role with identical data, or -1 if the copy wasn't read
+        for (int role = 0; role < op_data->pg_size; role++)
+        {
+            eq_to[role] = -1;
+            if (op_data->stripes[role].read_end != 0 && !op_data->stripes[role].read_error)
+            {
+                total++;
+                eq_to[role] = role;
+                for (int other = 0; other < role; other++)
+                {
+                    // Only compare with unique chunks (eq_to[other] == other)
+                    if (eq_to[other] == other && memcmp(op_data->stripes[role].read_buf, op_data->stripes[other].read_buf, bs_block_size) == 0)
+                    {
+                        eq_to[role] = eq_to[other];
+                        break;
+                    }
+                }
+            }
+        }
+        int votes[op_data->pg_size]; // votes[r] = number of copies identical to unique copy r
+        for (int role = 0; role < op_data->pg_size; role++)
+            votes[role] = 0;
+        for (int role = 0; role < op_data->pg_size; role++)
+        {
+            if (eq_to[role] != -1)
+                votes[eq_to[role]]++;
+        }
+        int best = -1;
+        for (int role = 0; role < op_data->pg_size; role++)
+        {
+            if (votes[role] > (best >= 0 ? votes[best] : 0)) // never index votes[] with best == -1
+                best = role;
+        }
+        if (best >= 0 && votes[best] < total) // role 0 may be the winner too, so it's >= 0
+        {
+            // FIXME Add a flag to allow to skip such objects and not recover them automatically
+            bool unknown = false;
+            for (int role = 0; role < op_data->pg_size; role++)
+            {
+                if (role != best && votes[role] == votes[best])
+                    unknown = true;
+                if (eq_to[role] >= 0 && votes[eq_to[role]] < votes[best]) // every copy of a minority version, not only its first one
+                {
+                    printf(
+                        "[PG %u/%u] Object %lx:%lx copy on OSD %lu doesn't match %d other copies, marking it as corrupted\n",
+                        INODE_POOL(op_data->oid.inode), op_data->pg_num,
+                        op_data->oid.inode, op_data->oid.stripe, op_data->stripes[role].osd_num, votes[best]
+                    );
+                    op_data->stripes[role].read_error = true;
+                }
+            }
+            if (unknown)
+            {
+                // It's unknown which replica is good. There are multiple versions with no majority
+                best = -1;
+            }
+        }
+    }
+    else
+    {
+        assert(op_data->scheme == POOL_SCHEME_EC || op_data->scheme == POOL_SCHEME_XOR);
+        if (op_data->degraded)
+        {
+            // Reconstruct missing stripes
+            // XOR shouldn't come here as it only has 1 parity chunk
+            assert(op_data->scheme == POOL_SCHEME_EC);
+            reconstruct_stripes_ec(op_data->stripes, op_data->pg_size, op_data->pg_data_size, clean_entry_bitmap_size);
+        }
+        // Generate parity chunks and compare them with actual data
+        osd_num_t fake_osd_set[op_data->pg_size];
+        for (int i = 0; i < op_data->pg_size; i++)
+        {
+            fake_osd_set[i] = 1; // any non-zero OSD number: all chunks are treated as present
+            op_data->stripes[i].write_buf = i >= op_data->pg_data_size
+                ? ((uint8_t*)cur_op->buf + (i-op_data->pg_data_size)*bs_block_size)
+                : op_data->stripes[i].read_buf;
+        }
+        if (op_data->scheme == POOL_SCHEME_XOR)
+        {
+            calc_rmw_parity_xor(op_data->stripes, op_data->pg_size, fake_osd_set, fake_osd_set, bs_block_size, clean_entry_bitmap_size);
+        }
+        else if (op_data->scheme == POOL_SCHEME_EC)
+        {
+            calc_rmw_parity_ec(op_data->stripes, op_data->pg_size, op_data->pg_data_size, fake_osd_set, fake_osd_set, bs_block_size, clean_entry_bitmap_size);
+        }
+        // Now compare that write_buf == read_buf
+        for (int role = op_data->pg_data_size; role < op_data->pg_size; role++)
+        {
+            if (op_data->stripes[role].osd_num != 0 && !op_data->stripes[role].read_error &&
+                memcmp(op_data->stripes[role].read_buf, op_data->stripes[role].write_buf, bs_block_size) != 0)
+            {
+                // Chunks don't match - something's wrong... but we don't know what :D
+                // FIXME: Try to locate errors (may be possible with >= 2 parity chunks)
+                printf(
+                    "[PG %u/%u] Object %lx:%lx parity chunk %d on OSD %lu doesn't match data, marking it as corrupted\n",
+                    INODE_POOL(op_data->oid.inode), op_data->pg_num,
+                    op_data->oid.inode, op_data->oid.stripe,
+                    role-op_data->pg_data_size, op_data->stripes[role].osd_num
+                );
+                op_data->stripes[role].read_error = true;
+            }
+        }
+    }
+    for (int role = 0; role < op_data->pg_size; role++)
+    {
+        if (op_data->stripes[role].osd_num != 0 && op_data->stripes[role].read_error)
+        {
+            // Got at least 1 read error or mismatch, mark the object as corrupted
+            auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
+            // FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
+            op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false);
+            break;
+        }
+    }
+    finish_op(cur_op, 0);
+}
--- a/src/osd_secondary.cpp
+++ b/src/osd_secondary.cpp
@@ -125,11 +125,18 @@ void osd_t::exec_secondary(osd_op_t *cur_op)
            secondary_op_callback(cur_op);
            return;
        }
-        cur_op->bs_op->oid.stripe = cur_op->req.sec_list.pg_stripe_size;
-        cur_op->bs_op->len = cur_op->req.sec_list.pg_count;
-        cur_op->bs_op->offset = cur_op->req.sec_list.list_pg - 1;
-        cur_op->bs_op->oid.inode = cur_op->req.sec_list.min_inode;
-        cur_op->bs_op->version = cur_op->req.sec_list.max_inode;
+        cur_op->bs_op->pg_alignment = cur_op->req.sec_list.pg_stripe_size;
+        cur_op->bs_op->pg_count = cur_op->req.sec_list.pg_count;
+        cur_op->bs_op->pg_number = cur_op->req.sec_list.list_pg - 1;
+        cur_op->bs_op->min_oid.inode = cur_op->req.sec_list.min_inode;
+        cur_op->bs_op->min_oid.stripe = cur_op->req.sec_list.min_stripe;
+        cur_op->bs_op->max_oid.inode = cur_op->req.sec_list.max_inode;
+        if (cur_op->req.sec_list.max_inode && cur_op->req.sec_list.max_stripe != UINT64_MAX)
+        {
+            cur_op->bs_op->max_oid.stripe = cur_op->req.sec_list.max_stripe
+                ? cur_op->req.sec_list.max_stripe : UINT64_MAX;
+        }
+        cur_op->bs_op->list_stable_limit = cur_op->req.sec_list.stable_limit;
 #ifdef OSD_STUB
        cur_op->bs_op->retval = 0;
        cur_op->bs_op->buf = NULL;
--- a/src/pg_states.cpp
+++ b/src/pg_states.cpp
@@ -8,35 +8,37 @@ const int pg_state_bit_count = 16;
 const int pg_state_bits[16] = {
    PG_STARTING,
    PG_PEERING,
-    PG_PEERED,
    PG_INCOMPLETE,
    PG_ACTIVE,
    PG_REPEERING,
    PG_STOPPING,
    PG_OFFLINE,
    PG_DEGRADED,
+    PG_HAS_CORRUPTED,
    PG_HAS_INCOMPLETE,
    PG_HAS_DEGRADED,
    PG_HAS_MISPLACED,
    PG_HAS_UNCLEAN,
    PG_HAS_INVALID,
    PG_LEFT_ON_DEAD,
+    PG_SCRUBBING,
 };

 const char *pg_state_names[16] = {
    "starting",
    "peering",
-    "peered",
    "incomplete",
    "active",
    "repeering",
    "stopping",
    "offline",
    "degraded",
+    "has_corrupted",
    "has_incomplete",
    "has_degraded",
    "has_misplaced",
    "has_unclean",
    "has_invalid",
    "left_on_dead",
+    "scrubbing",
 };
--- a/src/pg_states.h
+++ b/src/pg_states.h
@@ -4,27 +4,27 @@
 #pragma once

 // Placement group states
-// STARTING -> [acquire lock] -> PEERING -> PEERED
-// PEERED -> [report history if required!] -> INCOMPLETE|ACTIVE
+// STARTING -> [acquire lock] -> PEERING -> INCOMPLETE|ACTIVE
 // ACTIVE -> REPEERING -> PEERING
 // ACTIVE -> STOPPING -> OFFLINE -> [release lock]
 // Exactly one of these:
 #define PG_STARTING (1<<0)
 #define PG_PEERING (1<<1)
-#define PG_PEERED (1<<2)
-#define PG_INCOMPLETE (1<<3)
-#define PG_ACTIVE (1<<4)
-#define PG_REPEERING (1<<5)
-#define PG_STOPPING (1<<6)
-#define PG_OFFLINE (1<<7)
+#define PG_INCOMPLETE (1<<2)
+#define PG_ACTIVE (1<<3)
+#define PG_REPEERING (1<<4)
+#define PG_STOPPING (1<<5)
+#define PG_OFFLINE (1<<6)
 // Plus any of these:
-#define PG_DEGRADED (1<<8)
-#define PG_HAS_INCOMPLETE (1<<9)
-#define PG_HAS_DEGRADED (1<<10)
-#define PG_HAS_MISPLACED (1<<11)
-#define PG_HAS_UNCLEAN (1<<12)
-#define PG_HAS_INVALID (1<<13)
+#define PG_DEGRADED (1<<7)
+#define PG_HAS_INCOMPLETE (1<<8)
+#define PG_HAS_DEGRADED (1<<9)
+#define PG_HAS_MISPLACED (1<<10)
+#define PG_HAS_UNCLEAN (1<<11)
+#define PG_HAS_INVALID (1<<12)
+#define PG_HAS_CORRUPTED (1<<13)
 #define PG_LEFT_ON_DEAD (1<<14)
+#define PG_SCRUBBING (1<<15)

 // Lower bits that represent object role (EC 0/1/2... or always 0 with replication)
 // 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size
@@ -34,6 +34,8 @@
 #define OBJ_DEGRADED 0x02
 #define OBJ_INCOMPLETE 0x04
 #define OBJ_MISPLACED 0x08
+// OBJ_CORRUPTED is always set with one of OBJ_INCOMPLETE/OBJ_DEGRADED/OBJ_MISPLACED
+#define OBJ_CORRUPTED 0x10
 #define OBJ_NEEDS_STABLE 0x10000
 #define OBJ_NEEDS_ROLLBACK 0x20000

--- a/src/qemu_driver.c
+++ b/src/qemu_driver.c
@@ -53,6 +53,7 @@ typedef struct VitastorClient
    char *etcd_host;
    char *etcd_prefix;
    char *image;
+    int skip_parents;
    uint64_t inode;
    uint64_t pool;
    uint64_t size;
@@ -63,6 +64,10 @@ typedef struct VitastorClient
    int rdma_gid_index;
    int rdma_mtu;
    QemuMutex mutex;
+
+    uint64_t last_bitmap_inode, last_bitmap_offset, last_bitmap_len;
+    uint32_t last_bitmap_granularity;
+    uint8_t *last_bitmap;
 } VitastorClient;

 typedef struct VitastorRPC
@@ -72,6 +77,9 @@ typedef struct VitastorRPC
    QEMUIOVector *iov;
    long ret;
    int complete;
+    uint64_t inode, offset, len;
+    uint32_t bitmap_granularity;
+    uint8_t *bitmap;
 } VitastorRPC;

 static void vitastor_co_init_task(BlockDriverState *bs, VitastorRPC *task);
@@ -147,6 +155,7 @@ static void vitastor_parse_filename(const char *filename, QDict *options, Error
        if (!strcmp(name, "inode") ||
            !strcmp(name, "pool") ||
            !strcmp(name, "size") ||
+            !strcmp(name, "skip-parents") ||
            !strcmp(name, "use-rdma") ||
            !strcmp(name, "rdma-port_num") ||
            !strcmp(name, "rdma-gid-index") ||
@@ -227,13 +236,16 @@ static void vitastor_aio_set_fd_handler(void *ctx, int fd, int unused1, IOHandle

 static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, Error **errp)
 {
+    VitastorRPC task;
    VitastorClient *client = bs->opaque;
+    void *image = NULL;
    int64_t ret = 0;
    qemu_mutex_init(&client->mutex);
    client->config_path = g_strdup(qdict_get_try_str(options, "config-path"));
    // FIXME: Rename to etcd_address
    client->etcd_host = g_strdup(qdict_get_try_str(options, "etcd-host"));
    client->etcd_prefix = g_strdup(qdict_get_try_str(options, "etcd-prefix"));
+    client->skip_parents = qdict_get_try_int(options, "skip-parents", 0);
    client->use_rdma = qdict_get_try_int(options, "use-rdma", -1);
    client->rdma_device = g_strdup(qdict_get_try_str(options, "rdma-device"));
    client->rdma_port_num = qdict_get_try_int(options, "rdma-port-num", 0);
@@ -243,23 +255,25 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
        vitastor_aio_set_fd_handler, bdrv_get_aio_context(bs), client->config_path, client->etcd_host, client->etcd_prefix,
        client->use_rdma, client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu, 0
    );
-    client->image = g_strdup(qdict_get_try_str(options, "image"));
+    image = client->image = g_strdup(qdict_get_try_str(options, "image"));
    client->readonly = (flags & BDRV_O_RDWR) ? 1 : 0;
+    // Get image metadata (size and readonly flag) or just wait until the client is ready
+    if (!image)
+        client->image = (char*)"x";
+    task.complete = 0;
+    task.bs = bs;
+    if (qemu_in_coroutine())
+    {
+        vitastor_co_get_metadata(&task);
+    }
+    else
+    {
+        bdrv_coroutine_enter(bs, qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
+        BDRV_POLL_WHILE(bs, !task.complete);
+    }
+    client->image = image;
    if (client->image)
    {
-        // Get image metadata (size and readonly flag)
-        VitastorRPC task;
-        task.complete = 0;
-        task.bs = bs;
-        if (qemu_in_coroutine())
-        {
-            vitastor_co_get_metadata(&task);
-        }
-        else
-        {
-            bdrv_coroutine_enter(bs, qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
-            BDRV_POLL_WHILE(bs, !task.complete);
-        }
        client->watch = (void*)task.ret;
        client->readonly = client->readonly || vitastor_c_inode_get_readonly(client->watch);
        client->size = vitastor_c_inode_get_size(client->watch);
@@ -284,6 +298,7 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
            client->inode = (client->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (client->pool << (64-POOL_ID_BITS));
        }
        client->size = qdict_get_try_int(options, "size", 0);
+        vitastor_c_close_watch(client->proxy, (void*)task.ret);
    }
    if (!client->size)
    {
@@ -305,6 +320,7 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
    qdict_del(options, "inode");
    qdict_del(options, "pool");
    qdict_del(options, "size");
+    qdict_del(options, "skip-parents");
    return ret;
 }

@@ -321,6 +337,8 @@ static void vitastor_close(BlockDriverState *bs)
        g_free(client->etcd_prefix);
    if (client->image)
        g_free(client->image);
+    free(client->last_bitmap);
+    client->last_bitmap = NULL;
 }

 #if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 2
@@ -486,6 +504,13 @@ static int coroutine_fn vitastor_co_pwritev(BlockDriverState *bs,
    vitastor_co_init_task(bs, &task);
    task.iov = iov;

+    if (client->last_bitmap)
+    {
+        // Invalidate last bitmap on write
+        free(client->last_bitmap);
+        client->last_bitmap = NULL;
+    }
+
    uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
    qemu_mutex_lock(&client->mutex);
    vitastor_c_write(client->proxy, inode, offset, bytes, 0, iov->iov, iov->niov, vitastor_co_generic_bh_cb, &task);
@@ -499,6 +524,140 @@ static int coroutine_fn vitastor_co_pwritev(BlockDriverState *bs,
    return task.ret;
 }

+#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 1
+#if QEMU_VERSION_MAJOR >= 2 || QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR >= 7
+// Completion callback passed to vitastor_c_read_bitmap().
+// <opaque> is the VitastorRPC task of the waiting coroutine, <retval> is the
+// operation result (negative on error) and <bitmap> is the returned bitmap buffer.
+static void vitastor_co_read_bitmap_cb(void *opaque, long retval, uint8_t *bitmap)
+{
+    VitastorRPC *task = opaque;
+    VitastorClient *client = task->bs->opaque;
+    task->ret = retval;
+    task->complete = 1;
+    if (retval >= 0)
+    {
+        task->bitmap = bitmap;
+        // Cache the bitmap in the client only if the cache descriptor still
+        // describes exactly the range requested by this task
+        if (client->last_bitmap_inode == task->inode &&
+            client->last_bitmap_offset == task->offset &&
+            client->last_bitmap_len == task->len)
+        {
+            free(client->last_bitmap);
+            client->last_bitmap = bitmap;
+        }
+    }
+    // Resume the waiting coroutine unless the callback already runs inside it
+    if (qemu_coroutine_self() != task->co)
+    {
+#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 8
+        aio_co_wake(task->co);
+#else
+        qemu_coroutine_enter(task->co, NULL);
+        qemu_aio_release(task);
+#endif
+    }
+}
+
+// Report the allocation status of the image range starting at <offset>.
+// Uses the bitmap cached in the client when it covers the request, otherwise
+// reads a new bitmap (rounded to full inode blocks) and caches it.
+// With want_zero the extent is tracked precisely per bitmap granule; without it
+// the whole <bytes> range is reported as data if any of its granules is set.
+static int coroutine_fn vitastor_co_block_status(
+    BlockDriverState *bs, bool want_zero, int64_t offset, int64_t bytes,
+    int64_t *pnum, int64_t *map, BlockDriverState **file)
+{
+    // Allocated => return BDRV_BLOCK_DATA|BDRV_BLOCK_OFFSET_VALID
+    // Not allocated => return 0
+    // Error => return -errno
+    // Set pnum to length of the extent, `*map` = `offset`, `*file` = `bs`
+    VitastorRPC task;
+    VitastorClient *client = bs->opaque;
+    uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
+    uint8_t bit = 0;
+    if (client->last_bitmap && client->last_bitmap_inode == inode &&
+        client->last_bitmap_offset <= offset &&
+        client->last_bitmap_offset+client->last_bitmap_len >= (want_zero ? offset+1 : offset+bytes))
+    {
+        // Use the previously read bitmap
+        task.bitmap_granularity = client->last_bitmap_granularity;
+        task.offset = client->last_bitmap_offset;
+        task.len = client->last_bitmap_len;
+        task.bitmap = client->last_bitmap;
+    }
+    else
+    {
+        // Read bitmap from this position, rounding to full inode PG blocks
+        uint32_t block_size = vitastor_c_inode_get_block_size(client->proxy, inode);
+        if (!block_size)
+            return -EAGAIN;
+        // Init coroutine
+        vitastor_co_init_task(bs, &task);
+        free(client->last_bitmap);
+        task.inode = client->last_bitmap_inode = inode;
+        task.bitmap_granularity = client->last_bitmap_granularity = vitastor_c_inode_get_bitmap_granularity(client->proxy, inode);
+        task.offset = client->last_bitmap_offset = offset / block_size * block_size;
+        task.len = client->last_bitmap_len = (offset+bytes+block_size-1) / block_size * block_size - task.offset;
+        task.bitmap = client->last_bitmap = NULL;
+        qemu_mutex_lock(&client->mutex);
+        vitastor_c_read_bitmap(client->proxy, task.inode, task.offset, task.len, !client->skip_parents, vitastor_co_read_bitmap_cb, &task);
+        qemu_mutex_unlock(&client->mutex);
+        while (!task.complete)
+        {
+            qemu_coroutine_yield();
+        }
+        if (task.ret < 0)
+        {
+            // Error
+            return task.ret;
+        }
+    }
+    if (want_zero)
+    {
+        // Get precise mapping with all holes
+        uint64_t bmp_pos = (offset-task.offset) / task.bitmap_granularity;
+        uint64_t bmp_len = task.len / task.bitmap_granularity;
+        uint64_t bmp_end = bmp_pos+1;
+        bit = (task.bitmap[bmp_pos >> 3] >> (bmp_pos & 0x7)) & 1;
+        // Extend the extent while the following granules have the same state
+        while (bmp_end < bmp_len && ((task.bitmap[bmp_end >> 3] >> (bmp_end & 0x7)) & 1) == bit)
+        {
+            bmp_end++;
+        }
+        *pnum = (bmp_end-bmp_pos) * task.bitmap_granularity;
+    }
+    else
+    {
+        // Get larger allocated extents, possibly with false positives
+        uint64_t bmp_pos = (offset-task.offset) / task.bitmap_granularity;
+        // bmp_end is an END INDEX (exclusive), not a length: the loop below compares
+        // it with bmp_pos directly, so bmp_pos must not be subtracted from it
+        uint64_t bmp_end = (offset+bytes-task.offset) / task.bitmap_granularity;
+        while (bmp_pos < bmp_end)
+        {
+            if (!(bmp_pos & 7) && bmp_end >= bmp_pos+8)
+            {
+                // Byte-aligned position with at least 8 granules left: test a whole byte
+                bit = bit || task.bitmap[bmp_pos >> 3];
+                bmp_pos += 8;
+            }
+            else
+            {
+                bit = bit || ((task.bitmap[bmp_pos >> 3] >> (bmp_pos & 0x7)) & 1);
+                bmp_pos++;
+            }
+        }
+        *pnum = bytes;
+    }
+    if (bit)
+    {
+        *map = offset;
+        *file = bs;
+    }
+    return (bit ? (BDRV_BLOCK_DATA|BDRV_BLOCK_OFFSET_VALID) : 0);
+}
+#endif
+#if QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR >= 7 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 12
+// QEMU 1.7-2.11
+// Sector-based wrapper around vitastor_co_block_status(): converts the sector
+// range to bytes, requests a precise mapping (want_zero = 1) and converts the
+// resulting extent length back to sectors in *pnum.
+static int64_t coroutine_fn vitastor_co_get_block_status(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
+{
+    int64_t map = 0;
+    int64_t pnumbytes = 0;
+    // Pass <file> itself (not its address) so that the caller's *file gets set
+    int r = vitastor_co_block_status(bs, 1, sector_num*BDRV_SECTOR_SIZE, nb_sectors*BDRV_SECTOR_SIZE, &pnumbytes, &map, file);
+    *pnum = pnumbytes/BDRV_SECTOR_SIZE;
+    return r;
+}
+#endif
+#endif
+
 #if !( QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 7 )
 static int coroutine_fn vitastor_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov)
 {
@@ -606,6 +765,15 @@ static BlockDriver bdrv_vitastor = {
    .bdrv_co_truncate               = vitastor_co_truncate,
 #endif

+#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 1
+#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 12
+    // For snapshot export
+    .bdrv_co_block_status           = vitastor_co_block_status,
+#elif QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR >= 7 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 12
+    .bdrv_co_get_block_status       = vitastor_co_get_block_status,
+#endif
+#endif
+
 #if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 7
    .bdrv_co_preadv                 = vitastor_co_preadv,
    .bdrv_co_pwritev                = vitastor_co_pwritev,
--- a/src/ringloop.cpp
+++ b/src/ringloop.cpp
@@ -25,7 +25,6 @@ ring_loop_t::ring_loop_t(int qd)
    {
        free_ring_data[i] = i;
    }
-    wait_sqe_id = 1;
 }

 ring_loop_t::~ring_loop_t()
@@ -83,17 +82,19 @@ void ring_loop_t::loop()
        }
        io_uring_cqe_seen(&ring, cqe);
    }
-    while (get_sqe_queue.size() > 0)
-    {
-        (get_sqe_queue[0].second)();
-        get_sqe_queue.erase(get_sqe_queue.begin());
-    }
    do
    {
        loop_again = false;
        for (int i = 0; i < consumers.size(); i++)
        {
            consumers[i]->loop();
+            if (immediate_queue.size())
+            {
+                immediate_queue2.swap(immediate_queue);
+                for (auto & cb: immediate_queue2)
+                    cb();
+                immediate_queue2.clear();
+            }
        }
    } while (loop_again);
 }
--- a/src/ringloop.h
+++ b/src/ringloop.h
@@ -119,11 +119,10 @@ struct ring_consumer_t

 class ring_loop_t
 {
-    std::vector<std::pair<int,std::function<void()>>> get_sqe_queue;
+    std::vector<std::function<void()>> immediate_queue, immediate_queue2;
    std::vector<ring_consumer_t*> consumers;
    struct ring_data_t *ring_datas;
    int *free_ring_data;
-    int wait_sqe_id;
    unsigned free_ring_data_ptr;
    bool loop_again;
    struct io_uring ring;
@@ -145,20 +144,9 @@ public:
        }
        return sqe;
    }
-    inline int wait_sqe(std::function<void()> cb)
+    inline void set_immediate(const std::function<void()> cb)
    {
-        get_sqe_queue.push_back({ wait_sqe_id, cb });
-        return wait_sqe_id++;
-    }
-    inline void cancel_wait_sqe(int wait_id)
-    {
-        for (int i = 0; i < get_sqe_queue.size(); i++)
-        {
-            if (get_sqe_queue[i].first == wait_id)
-            {
-                get_sqe_queue.erase(get_sqe_queue.begin()+i, get_sqe_queue.begin()+i+1);
-            }
-        }
+        immediate_queue.push_back(cb);
    }
    inline int submit()
    {
--- a/src/str_util.cpp
+++ b/src/str_util.cpp
@@ -249,3 +249,35 @@ void print_help(const char *help_text, std::string exe_name, std::string cmd, bo
    fwrite(filtered_text.data(), filtered_text.size(), 1, stdout);
    exit(0);
 }
+
+// Parse a time span into seconds.
+// Accepts a number optionally followed by one unit suffix:
+// s = seconds, m = minutes, h = hours, d = days, M = months (30 days), y = years (365 days).
+// Suffixes are case-insensitive, except that 'M' means months and 'm' means minutes.
+// If <ok> is not NULL, *ok is set to false for empty or unparsable input
+// (including a bare suffix without a number) and to true otherwise.
+// Returns the number of seconds, 0 on invalid input.
+uint64_t parse_time(std::string time_str, bool *ok)
+{
+    if (!time_str.length())
+    {
+        if (ok)
+            *ok = false;
+        return 0;
+    }
+    uint64_t mul = 1;
+    const char last_char = time_str[time_str.length()-1];
+    // tolower() requires a value representable as unsigned char (or EOF);
+    // passing a negative plain char is undefined behaviour
+    const char type_char = tolower((unsigned char)last_char);
+    if (type_char == 's' || type_char == 'm' || type_char == 'h' || type_char == 'd' || type_char == 'y')
+    {
+        if (type_char == 's')
+            mul = 1;
+        else if (last_char == 'M')
+            mul = 30*86400;
+        else if (type_char == 'm')
+            mul = 60;
+        else if (type_char == 'h')
+            mul = 3600;
+        else if (type_char == 'd')
+            mul = 86400;
+        else /*if (type_char == 'y')*/
+            mul = 86400*365;
+        time_str = time_str.substr(0, time_str.length()-1);
+    }
+    uint64_t ts = stoull_full(time_str, 0) * mul;
+    // A zero result is only valid when the numeric part is literally "0".
+    // This also rejects a bare suffix such as "s", like every other bare suffix.
+    if (ok)
+        *ok = !(ts == 0 && time_str != "0");
+    return ts;
+}
--- a/src/str_util.h
+++ b/src/str_util.h
@@ -15,3 +15,4 @@ std::string str_replace(const std::string & in, const std::string & needle, cons
 uint64_t stoull_full(const std::string & str, int base = 0);
 std::string format_size(uint64_t size, bool nobytes = false);
 void print_help(const char *help_text, std::string exe_name, std::string cmd, bool all);
+uint64_t parse_time(std::string time_str, bool *ok = NULL);
--- a/src/test_cluster_client.cpp
+++ b/src/test_cluster_client.cpp
@@ -8,7 +8,6 @@

 void configure_single_pg_pool(cluster_client_t *cli)
 {
-    cli->st_cli.on_load_pgs_hook(true);
    cli->st_cli.parse_state((etcd_kv_t){
        .key = "/config/pools",
        .value = json11::Json::object {
@@ -43,6 +42,7 @@ void configure_single_pg_pool(cluster_client_t *cli)
            { "state", json11::Json::array { "active" } },
        },
    });
+    cli->st_cli.on_load_pgs_hook(true);
    std::map<std::string, etcd_kv_t> changes;
    cli->st_cli.on_change_hook(changes);
 }
@@ -188,7 +188,6 @@ void test1()
    int *r1 = test_write(cli, 0, 4096, 0x55);
    configure_single_pg_pool(cli);
    pretend_connected(cli, 1);
-    cli->continue_ops(true);
    can_complete(r1);
    check_op_count(cli, 1, 1);
    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 4096), 0);
@@ -196,8 +195,6 @@ void test1()
    pretend_disconnected(cli, 1);
    int *r2 = test_sync(cli);
    pretend_connected(cli, 1);
-    check_op_count(cli, 1, 0);
-    cli->continue_ops(true);
    check_op_count(cli, 1, 1);
    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 4096), 0);
    check_op_count(cli, 1, 1);
@@ -303,8 +300,6 @@ void test1()
    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), -EPIPE);
    check_disconnected(cli, 1);
    pretend_connected(cli, 1);
-    check_op_count(cli, 1, 0);
-    cli->continue_ops(true);
    check_op_count(cli, 1, 1);
    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), 0);
    check_op_count(cli, 1, 1);
--- a/src/vitastor.pc.in
+++ b/src/vitastor.pc.in
@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@

 Name: Vitastor
 Description: Vitastor client library
-Version: 0.8.3
+Version: 0.8.5
 Libs: -L${libdir} -lvitastor_client
 Cflags: -I${includedir}

--- a/src/vitastor_c.cpp
+++ b/src/vitastor_c.cpp
@@ -207,6 +207,28 @@ void vitastor_c_write(vitastor_c *client, uint64_t inode, uint64_t offset, uint6
    client->cli->execute(op);
 }

+// Start an asynchronous bitmap read for <len> bytes of <inode> at <offset>.
+// A non-zero <with_parents> selects OSD_OP_READ_CHAIN_BITMAP, zero selects OSD_OP_READ_BITMAP.
+// <cb> is called with <opaque>, the operation result and the bitmap buffer
+// (NULL when the result is negative).
+void vitastor_c_read_bitmap(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len,
+    int with_parents, VitastorReadBitmapHandler cb, void *opaque)
+{
+    cluster_op_t *bitmap_op = new cluster_op_t;
+    if (with_parents)
+        bitmap_op->opcode = OSD_OP_READ_CHAIN_BITMAP;
+    else
+        bitmap_op->opcode = OSD_OP_READ_BITMAP;
+    bitmap_op->inode = inode;
+    bitmap_op->offset = offset;
+    bitmap_op->len = len;
+    bitmap_op->callback = [cb, opaque](cluster_op_t *done_op)
+    {
+        // On success the buffer is detached from the operation and handed to the callback
+        // NOTE(review): presumably so that deleting the op does not free it - confirm against cluster_op_t
+        uint8_t *result = NULL;
+        if (done_op->retval >= 0)
+        {
+            result = (uint8_t*)done_op->bitmap_buf;
+            done_op->bitmap_buf = NULL;
+        }
+        cb(opaque, done_op->retval, result);
+        delete done_op;
+    };
+    client->cli->execute(bitmap_op);
+}
+
 void vitastor_c_sync(vitastor_c *client, VitastorIOHandler cb, void *opaque)
 {
    cluster_op_t *op = new cluster_op_t;
@@ -245,6 +267,25 @@ uint64_t vitastor_c_inode_get_num(void *handle)
    return watch->cfg.num;
 }

+// Return data_block_size multiplied by the number of data chunks of the pool
+// that <inode_num> belongs to (1 for replicated pools, pg_size minus
+// parity_chunks otherwise), or 0 if the pool is not present in the client's pool configuration.
+uint32_t vitastor_c_inode_get_block_size(vitastor_c *client, uint64_t inode_num)
+{
+    auto & pools = client->cli->st_cli.pool_config;
+    auto found = pools.find(INODE_POOL(inode_num));
+    if (found == pools.end())
+    {
+        return 0;
+    }
+    auto & cfg = found->second;
+    uint32_t data_chunks = 1;
+    if (cfg.scheme != POOL_SCHEME_REPLICATED)
+    {
+        data_chunks = cfg.pg_size - cfg.parity_chunks;
+    }
+    return cfg.data_block_size * data_chunks;
+}
+
+// Return the bitmap granularity configured for the pool that <inode_num>
+// belongs to, or 0 if the pool is not present in the client's pool configuration.
+uint32_t vitastor_c_inode_get_bitmap_granularity(vitastor_c *client, uint64_t inode_num)
+{
+    auto & pools = client->cli->st_cli.pool_config;
+    auto found = pools.find(INODE_POOL(inode_num));
+    // FIXME: READ_BITMAP may fail if parent bitmap granularity differs from inode bitmap granularity
+    if (found != pools.end())
+    {
+        return found->second.bitmap_granularity;
+    }
+    return 0;
+}
+
 int vitastor_c_inode_get_readonly(void *handle)
 {
    inode_watch_t *watch = (inode_watch_t*)handle;
--- a/src/vitastor_c.h
+++ b/src/vitastor_c.h
@@ -6,6 +6,9 @@
 #ifndef VITASTOR_QEMU_PROXY_H
 #define VITASTOR_QEMU_PROXY_H

+// C API wrapper version
+#define VITASTOR_C_API_VERSION 1
+
 #ifndef POOL_ID_BITS
 #define POOL_ID_BITS 16
 #endif
@@ -21,6 +24,7 @@ typedef struct vitastor_c vitastor_c;

 typedef void VitastorReadHandler(void *opaque, long retval, uint64_t version);
 typedef void VitastorIOHandler(void *opaque, long retval);
+typedef void VitastorReadBitmapHandler(void *opaque, long retval, uint8_t *bitmap);

 // QEMU
 typedef void IOHandler(void *opaque);
@@ -42,11 +46,15 @@ void vitastor_c_read(vitastor_c *client, uint64_t inode, uint64_t offset, uint64
    struct iovec *iov, int iovcnt, VitastorReadHandler cb, void *opaque);
 void vitastor_c_write(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len, uint64_t check_version,
    struct iovec *iov, int iovcnt, VitastorIOHandler cb, void *opaque);
+void vitastor_c_read_bitmap(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len,
+    int with_parents, VitastorReadBitmapHandler cb, void *opaque);
 void vitastor_c_sync(vitastor_c *client, VitastorIOHandler cb, void *opaque);
 void vitastor_c_watch_inode(vitastor_c *client, char *image, VitastorIOHandler cb, void *opaque);
 void vitastor_c_close_watch(vitastor_c *client, void *handle);
 uint64_t vitastor_c_inode_get_size(void *handle);
 uint64_t vitastor_c_inode_get_num(void *handle);
+uint32_t vitastor_c_inode_get_block_size(vitastor_c *client, uint64_t inode_num);
+uint32_t vitastor_c_inode_get_bitmap_granularity(vitastor_c *client, uint64_t inode_num);
 int vitastor_c_inode_get_readonly(void *handle);

 #ifdef __cplusplus
--- a/tests/test_snapshot.sh
+++ b/tests/test_snapshot.sh
@@ -22,6 +22,16 @@ LD_PRELOAD="build/src/libfio_vitastor.so" \
 LD_PRELOAD="build/src/libfio_vitastor.so" \
    fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -rw=read -etcd=$ETCD_URL -pool=1 -inode=3 -size=32M

+qemu-img convert -p \
+    -f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=2:size=$((32*1024*1024)):skip-parents=1" \
+    -O qcow2 ./testdata/layer0.qcow2
+
+qemu-img create -f qcow2 ./testdata/empty.qcow2 32M
+
+qemu-img convert -p \
+    -f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=3:size=$((32*1024*1024)):skip-parents=1" \
+    -O qcow2 -o 'cluster_size=4k' -B empty.qcow2 ./testdata/layer1.qcow2
+
 qemu-img convert -S 4096 -p \
    -f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=3:size=$((32*1024*1024))" \
    -O raw ./testdata/merged.bin
@@ -52,4 +62,18 @@ qemu-img convert -S 4096 -p \

 cmp ./testdata/merged.bin ./testdata/merged-by-tool.bin

+# Test merge by qemu-img
+
+qemu-img rebase -u -b layer0.qcow2 ./testdata/layer1.qcow2
+
+qemu-img convert -S 4096 -f qcow2 ./testdata/layer1.qcow2 -O raw ./testdata/rebased.bin
+
+cmp ./testdata/merged.bin ./testdata/rebased.bin
+
+qemu-img rebase -u -b '' ./testdata/layer1.qcow2
+
+qemu-img convert -S 4096 -f qcow2 ./testdata/layer1.qcow2 -O raw ./testdata/rebased.bin
+
+cmp ./testdata/layer1.bin ./testdata/rebased.bin
+
 format_green OK
Author	SHA1	Message	Date
Vitaliy Filippov	2b4e0de397	Use vitastor-cli instead of direct etcd interaction in the CSI driver	2023-02-28 02:40:19 +03:00
Vitaliy Filippov	726c6d3470	Implement PG scrub runner	2023-02-28 02:40:19 +03:00
Vitaliy Filippov	2389b49a16	Implement scrubbing "data path" - OSD_OP_SCRUB	2023-02-28 02:40:19 +03:00
Vitaliy Filippov	fe1ee67b05	Add min/max stripe and limit to OP_LIST	2023-02-28 02:40:19 +03:00
Vitaliy Filippov	c775a52a7d	Retry failed reads (including chained and RMW) from other replicas	2023-02-28 02:40:19 +03:00
Vitaliy Filippov	e307dd13ed	Refcount object_states	2023-02-28 02:40:19 +03:00
Vitaliy Filippov	a7f63f7c29	Add corrupted object state	2023-02-28 02:40:19 +03:00
Vitaliy Filippov	1e307069bc	Fix missing parity chunk calculation for EC n+k, k > 1 and first parity chunk missing	2023-02-28 02:40:19 +03:00
Vitaliy Filippov	c3e80abad7	Allow to send more than 1 operation at a time	2023-02-26 02:01:04 +03:00
Vitaliy Filippov	138ffe4032	Reuse incoming RDMA buffers	2023-02-26 00:55:01 +03:00
Vitaliy Filippov	8139a34e97	Fix json11: allow trailing comma	2023-02-23 01:16:01 +03:00
Vitaliy Filippov	4ab630b44d	Use just sfdisk --json, --dump is not needed	2023-02-23 00:55:47 +03:00
Vitaliy Filippov	2c8241b7db	Remove PG "peered" state	2023-02-21 01:30:42 +03:00
Vitaliy Filippov	36a7dd3671	Move tests to "make test"	2023-02-21 01:30:42 +03:00
Vitaliy Filippov	936122bbcf	Initialize msgr lazily in client to speedup vitastor-cli with RDMA enabled	2023-02-19 18:59:07 +03:00
Vitaliy Filippov	1a1ba0d1e7	Add set_immediate to ringloop and use it for bs/osd ops to prevent reenterability issues	2023-02-09 17:37:26 +03:00
Vitaliy Filippov	3d09c9cec7	Remove unused wait_sqe() from ringloop	2023-02-09 17:37:26 +03:00
Vitaliy Filippov	3d08a1ad6c	Fix cluster_client test after last reenterability fixes	2023-02-05 01:47:32 +03:00
Vitaliy Filippov	499881d81c	Fix typo	2023-01-27 01:52:02 +03:00
Vitaliy Filippov	aba93b951b	Fix incorrect EC free space statistics in vitastor-cli df output	2023-01-26 02:04:29 +03:00
Vitaliy Filippov	d125fb1f30	Release 0.8.5 - Fix a possible "double free" bug in the client library happening on OSD restart - Fix a possible write hang on PG history update when only epoch is changed - Fix incorrect systemd target "local.target" in mon/make-etcd - Allow "content" option in PVE storage plugin to allow to enable containers - Build client library without tcmalloc which fixes "attempt to free invalid pointer" errors when, for example, trying to run QEMU with both Vitastor and Ceph RBD disks	2023-01-25 01:43:49 +03:00
Vitaliy Filippov	9d3fd72298	Require liburing < 2 in rpm specs	2023-01-25 01:43:49 +03:00
Vitaliy Filippov	8b552a01f9	Do not retry successful operation parts in client (could lead to "double free" bugs)	2023-01-25 01:30:36 +03:00
Vitaliy Filippov	0385b2f9e8	Fix write hangs on PG epoch update - always set pg.history_changed to true	2023-01-25 01:30:15 +03:00
Vitaliy Filippov	749c837045	Replace non-existing local.target with multi-user.target	2023-01-25 01:29:31 +03:00
Vitaliy Filippov	98001d845b	Remove version from vitastor-release.rpm links	2023-01-23 14:03:33 +03:00
Vitaliy Filippov	c96bcae74b	Allow "content" option in PVE storage plugin to allow to enable containers	2023-01-16 18:14:45 +03:00
Vitaliy Filippov	9f4e34a8cc	Build client library without tcmalloc. Fixes "[src/tcmalloc.cc:332] Attempt to free invalid pointer ..." when trying to run QEMU with both Vitastor and Ceph RBD disks and other possible allocator collisions.	2023-01-15 00:01:11 +03:00
Vitaliy Filippov	81fc8bb94c	Release 0.8.4 New features: - Implement QCOW2 image/snapshot export via qemu-img (bdrv_co_block_status in the driver) - Remove OSDs from PG history during `vitastor-cli rm-osd` to prevent `left_on_dead` PG states after deletion - Add a new recovery_pg_switch setting to mix all PGs during recovery, to almost fully reduce the probability of ENOSPC during rebalance - Introduce partial ENOSPC ("OSD is full") handling - now ENOSPC doesn't turn into cascades of crashes - Add migration support to Proxmox VE Vitastor driver - Track last_clean_pgs on a per-pool basis thus reducing data movement in a cluster with pools remaining unclean/degraded for a long time Bug fixes: - Fix a bug where monitor could generate degraded PGs if one of the hosts had no OSDs - Fix a bug where monitor could skip PG redistribution with a lot of OSDs in cluster - Report PG history synchronously on the first write, which improves PG consistency and availability at the same time, because history now gets reported correctly and doesn't get reported without the need for it - Fix possible write and recovery stalls which could happen in a cluster with both EC and replicated pools - Make OSD and monitors sanitize & deduplicate PG history items in etcd - Fix non-working OSD peer config safety check - Fix a rare journal flush stall where flushing wasn't activated with full journal, but with empty flush queue - Fix builds without ISA-L (jerasure-only) crashing with EC N+K, K>=2 due to the lack of 16-byte buffer alignment - Fix a possible crash for EC N+K, K>=2 when calculating a parity chunk with previous parity chunk missing - Fix a bug where vitastor-disk purge with suppressed warnings didn't work	2023-01-13 23:59:54 +03:00
Vitaliy Filippov	bc465c16de	Fix arithmetic on void* for clang	2023-01-13 23:58:42 +03:00
Vitaliy Filippov	8763e9211c	Fix qemu driver compilation warning/error	2023-01-13 23:44:39 +03:00
Vitaliy Filippov	9e1a80bd17	Replace apt-key with trusted.gpg.d	2023-01-13 19:51:47 +03:00
Vitaliy Filippov	3e280f2f08	Mark vitastor as shared storage in PVE driver	2023-01-13 01:36:30 +03:00
Vitaliy Filippov	fe87b4076b	Fix backwards compatibility in cluster_client	2023-01-12 02:37:31 +03:00
Vitaliy Filippov	a38957c1a7	Skip empty hosts in lp-optimizer	2023-01-09 16:26:16 +03:00
Vitaliy Filippov	137309cf29	Implement bdrv_co_block_status for snapshot export support	2023-01-07 17:06:58 +03:00
Vitaliy Filippov	373f9d0387	Try to re-peer PGs on history change	2023-01-06 12:46:44 +03:00
Vitaliy Filippov	c4516ea971	Also remove deleted OSD from PG configuration and last_clean_pgs	2023-01-06 12:46:44 +03:00
Vitaliy Filippov	91065c80fc	Try to prevent left_on_dead when deleting OSDs by removing them from PG history	2023-01-06 12:46:43 +03:00
Vitaliy Filippov	0f6b946add	Time changes with every stat change, do not schedule checks based on it	2023-01-05 13:54:16 +03:00
Vitaliy Filippov	465cbf0b2f	Do not re-schedule recheck indefinitely, run it after mon_change_timeout in any case	2023-01-05 13:48:06 +03:00
Vitaliy Filippov	41add50e4e	Track last_clean_pgs on a per-pool basis	2023-01-03 02:20:50 +03:00
Vitaliy Filippov	02e7be7dc9	Prevent reenterability side effects during PG history operation resume	2023-01-03 02:20:50 +03:00
Vitaliy Filippov	73940adf07	Prioritize EC (non-instantly-stable) operations under journal pressure. This reduces the probability of hitting OSD stalls with EC due to "deadlocks" where two parallel write operations wait for each other to complete.	2023-01-03 00:05:45 +03:00
Vitaliy Filippov	e950c024d3	Do not sync peer OSDs before listing. Sync before listing was added to wait for all PG writes possibly left in queue from the previous master to finish before listing it. But in fact it may block the cluster when EC is used and some unstable writes are left in the queue - they block journal flushing, rollback/stabilize is required to unblock them, but rollback/stabilize may only happen after PG is peered. But peering needs listings, listings are requested only after sync, and sync itself waits for currently blocked writes waiting in the queue.	2023-01-03 00:05:45 +03:00
Vitaliy Filippov	71d6d9f868	Fix possible crash on ENOSPC during operation cancel in blockstore	2023-01-03 00:05:45 +03:00
Vitaliy Filippov	a4dfa519af	Report PG history synchronously during write. This has 2 effects: 1) OSD sets aren't added into PG history until actual write attempts anymore, which removes unneeded extra osd_sets in PG history; 2) New OSD sets are reported synchronously and can't be lost on PG restarts happening at the same time with reconfiguration.	2023-01-01 23:41:05 +03:00
Vitaliy Filippov	37a6aff2fa	Write OSD numbers always as numbers in mon	2023-01-01 23:17:42 +03:00
Vitaliy Filippov	67019f5b02	Make OSD sort & sanitize PG history items	2023-01-01 23:17:42 +03:00
Vitaliy Filippov	0593e5c21c	Fix OSD peer config safety check	2022-12-31 02:24:42 +03:00
Vitaliy Filippov	998e24adf8	Add a new recovery_pg_switch setting to mix all PGs during recovery	2022-12-30 02:03:33 +03:00
Vitaliy Filippov	d7bd36dc32	Fix another rare journal flush stall	2022-12-30 02:03:33 +03:00
Vitaliy Filippov	cf5c562800	Log all object locations when peering PGs	2022-12-30 02:03:33 +03:00
Vitaliy Filippov	629200b0cc	Return ENOSPC as the primary OSD	2022-12-30 02:03:33 +03:00
Vitaliy Filippov	3589ccec22	Do not disconnect peer on ENOSPC during write	2022-12-30 01:54:25 +03:00
Vitaliy Filippov	8d55a1e780	Build osd_rmw_test both with and without ISA-L	2022-12-29 19:13:57 +03:00
Vitaliy Filippov	65f6b3a4eb	Fix jerasure crashing on bitmap calculation/restoration due to the lack of 16-byte alignment	2022-12-29 19:13:57 +03:00
Vitaliy Filippov	fd216eac77	Add a test for missing parity chunk calculation	2022-12-29 19:13:57 +03:00
Vitaliy Filippov	61fca7c426	Fix crash when calculating a parity chunk with previous parity chunk missing (test coming shortly)	2022-12-29 19:13:57 +03:00
Vitaliy Filippov	1c29ed80b9	Fix quote in docs :)	2022-12-28 18:08:53 +03:00
Vitaliy Filippov	68f3fb795e	Suppress warnings in vitastor-disk purge correctly	2022-12-27 11:09:19 +03:00