Add debug

2023-01-06 12:46:44 +03:00
73 changed files with 651 additions and 1889 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8)

 project(vitastor)

-set(VERSION "0.8.5")
+set(VERSION "0.8.3")

 add_subdirectory(src)
--- a/VNPL-1.1-RU.txt
+++ b/VNPL-1.1-RU.txt
@@ -48,9 +48,9 @@ Vitastor, составлены для того, чтобы убедиться,
 интерфейс (прокси), опять же, без открытия в свободный публичный доступ как
 самой программы, так и прокси.

-  Сетевая Публичная Лицензия Vitastor разработана специально, чтобы
+  Сетевая Публичная Лицензия Vitastor разработана специально, чтобы
 гарантировать, что в таких случаях и модифицированная версия программы, и
-прокси останутся доступными сообществу. Для этого лицензия требует от
+прокси останутся доступными сообществу. Для этого лицензия требует от
 операторов сетевых серверов предоставлять исходный код оригинальной программы,
 а также всех других программ, взаимодействующих с ней на их серверах,
 пользователям этих серверов, на условиях свободных лицензий. Таким образом,
--- a/csi/Makefile
+++ b/csi/Makefile
@@ -1,4 +1,4 @@
-VERSION ?= v0.8.5
+VERSION ?= v0.8.3

 all: build push

--- a/csi/deploy/004-csi-nodeplugin.yaml
+++ b/csi/deploy/004-csi-nodeplugin.yaml
@@ -49,7 +49,7 @@ spec:
            capabilities:
              add: ["SYS_ADMIN"]
            allowPrivilegeEscalation: true
-          image: vitalif/vitastor-csi:v0.8.5
+          image: vitalif/vitastor-csi:v0.8.3
          args:
            - "--node=$(NODE_ID)"
            - "--endpoint=$(CSI_ENDPOINT)"
--- a/csi/deploy/007-csi-provisioner.yaml
+++ b/csi/deploy/007-csi-provisioner.yaml
@@ -116,7 +116,7 @@ spec:
            privileged: true
            capabilities:
              add: ["SYS_ADMIN"]
-          image: vitalif/vitastor-csi:v0.8.5
+          image: vitalif/vitastor-csi:v0.8.3
          args:
            - "--node=$(NODE_ID)"
            - "--endpoint=$(CSI_ENDPOINT)"
--- a/csi/src/config.go
+++ b/csi/src/config.go
@@ -5,7 +5,7 @@ package vitastor

 const (
    vitastorCSIDriverName    = "csi.vitastor.io"
-    vitastorCSIDriverVersion = "0.8.5"
+    vitastorCSIDriverVersion = "0.8.3"
 )

 // Config struct fills the parameters of request or user input
--- a/csi/src/controllerserver.go
+++ b/csi/src/controllerserver.go
@@ -10,6 +10,7 @@ import (
    "bytes"
    "strconv"
    "time"
+    "fmt"
    "os"
    "os/exec"
    "io/ioutil"
@@ -20,6 +21,8 @@ import (
    "google.golang.org/grpc/codes"
    "google.golang.org/grpc/status"

+    "go.etcd.io/etcd/clientv3"
+
    "github.com/container-storage-interface/spec/lib/go/csi"
 )

@@ -111,34 +114,6 @@ func GetConnectionParams(params map[string]string) (map[string]string, []string,
    return ctxVars, etcdUrl, etcdPrefix
 }

-func invokeCLI(ctxVars map[string]string, args []string) ([]byte, error)
-{
-    if (ctxVars["etcdUrl"] != "")
-    {
-        args = append(args, "--etcd_address", ctxVars["etcdUrl"])
-    }
-    if (ctxVars["etcdPrefix"] != "")
-    {
-        args = append(args, "--etcd_prefix", ctxVars["etcdPrefix"])
-    }
-    if (ctxVars["configPath"] != "")
-    {
-        args = append(args, "--config_path", ctxVars["configPath"])
-    }
-    c := exec.Command("/usr/bin/vitastor-cli", args...)
-    var stdout, stderr bytes.Buffer
-    c.Stdout = &stdout
-    c.Stderr = &stderr
-    err := c.Run()
-    stderrStr := string(stderr.Bytes())
-    if (err != nil)
-    {
-        klog.Errorf("vitastor-cli %s failed: %s, status %s\n", strings.Join(args, " "), stderrStr, err)
-        return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
-    }
-    return stdout.Bytes(), nil
-}
-
 // Create the volume
 func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) (*csi.CreateVolumeResponse, error)
 {
@@ -171,41 +146,128 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
        volSize = ((capRange.GetRequiredBytes() + MB - 1) / MB) * MB
    }

-    ctxVars, etcdUrl, _ := GetConnectionParams(req.Parameters)
+    // FIXME: The following should PROBABLY be implemented externally in a management tool
+
+    ctxVars, etcdUrl, etcdPrefix := GetConnectionParams(req.Parameters)
    if (len(etcdUrl) == 0)
    {
        return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
    }

-    // Create image using vitastor-cli
-    _, err := invokeCLI(ctxVars, []string{ "create", volName, "-s", string(volSize), "--pool", string(poolId) })
+    // Connect to etcd
+    cli, err := clientv3.New(clientv3.Config{
+        DialTimeout: ETCD_TIMEOUT,
+        Endpoints: etcdUrl,
+    })
    if (err != nil)
    {
-        if (strings.Index(err.Error(), "already exists") > 0)
+        return nil, status.Error(codes.Internal, "failed to connect to etcd at "+strings.Join(etcdUrl, ",")+": "+err.Error())
+    }
+    defer cli.Close()
+
+    var imageId uint64 = 0
+    for
+    {
+        // Check if the image exists
+        ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
+        resp, err := cli.Get(ctx, etcdPrefix+"/index/image/"+volName)
+        cancel()
+        if (err != nil)
        {
-            stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", volName })
+            return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
+        }
+        if (len(resp.Kvs) > 0)
+        {
+            kv := resp.Kvs[0]
+            var v InodeIndex
+            err := json.Unmarshal(kv.Value, &v)
            if (err != nil)
            {
-                return nil, err
+                return nil, status.Error(codes.Internal, "invalid /index/image/"+volName+" key in etcd: "+err.Error())
            }
-            var inodeCfg []InodeConfig
-            err = json.Unmarshal(stat, &inodeCfg)
+            poolId = v.PoolId
+            imageId = v.Id
+            inodeCfgKey := fmt.Sprintf("/config/inode/%d/%d", poolId, imageId)
+            ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
+            resp, err := cli.Get(ctx, etcdPrefix+inodeCfgKey)
+            cancel()
            if (err != nil)
            {
-                return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
+                return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
            }
-            if (len(inodeCfg) == 0)
+            if (len(resp.Kvs) == 0)
            {
-                return nil, status.Error(codes.Internal, "vitastor-cli create said that image already exists, but ls can't find it")
+                return nil, status.Error(codes.Internal, "missing "+inodeCfgKey+" key in etcd")
            }
-            if (inodeCfg[0].Size < uint64(volSize))
+            var inodeCfg InodeConfig
+            err = json.Unmarshal(resp.Kvs[0].Value, &inodeCfg)
+            if (err != nil)
+            {
+                return nil, status.Error(codes.Internal, "invalid "+inodeCfgKey+" key in etcd: "+err.Error())
+            }
+            if (inodeCfg.Size < uint64(volSize))
            {
                return nil, status.Error(codes.Internal, "image "+volName+" is already created, but size is less than expected")
            }
        }
        else
        {
-            return nil, err
+            // Find a free ID
+            // Create image metadata in a transaction verifying that the image doesn't exist yet AND ID is still free
+            maxIdKey := fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId)
+            ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
+            resp, err := cli.Get(ctx, maxIdKey)
+            cancel()
+            if (err != nil)
+            {
+                return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
+            }
+            var modRev int64
+            var nextId uint64
+            if (len(resp.Kvs) > 0)
+            {
+                var err error
+                nextId, err = strconv.ParseUint(string(resp.Kvs[0].Value), 10, 64)
+                if (err != nil)
+                {
+                    return nil, status.Error(codes.Internal, maxIdKey+" contains invalid ID")
+                }
+                modRev = resp.Kvs[0].ModRevision
+                nextId++
+            }
+            else
+            {
+                nextId = 1
+            }
+            inodeIdxJson, _ := json.Marshal(InodeIndex{
+                Id: nextId,
+                PoolId: poolId,
+            })
+            inodeCfgJson, _ := json.Marshal(InodeConfig{
+                Name: volName,
+                Size: uint64(volSize),
+            })
+            ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
+            txnResp, err := cli.Txn(ctx).If(
+                clientv3.Compare(clientv3.ModRevision(fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId)), "=", modRev),
+                clientv3.Compare(clientv3.CreateRevision(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName)), "=", 0),
+                clientv3.Compare(clientv3.CreateRevision(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, poolId, nextId)), "=", 0),
+            ).Then(
+                clientv3.OpPut(fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId), fmt.Sprintf("%d", nextId)),
+                clientv3.OpPut(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName), string(inodeIdxJson)),
+                clientv3.OpPut(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, poolId, nextId), string(inodeCfgJson)),
+            ).Commit()
+            cancel()
+            if (err != nil)
+            {
+                return nil, status.Error(codes.Internal, "failed to commit transaction in etcd: "+err.Error())
+            }
+            if (txnResp.Succeeded)
+            {
+                imageId = nextId
+                break
+            }
+            // Start over if the transaction fails
        }
    }

@@ -237,12 +299,97 @@ func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVol
    }
    volName := ctxVars["name"]

-    ctxVars, _, _ = GetConnectionParams(ctxVars)
+    _, etcdUrl, etcdPrefix := GetConnectionParams(ctxVars)
+    if (len(etcdUrl) == 0)
+    {
+        return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
+    }

-    _, err = invokeCLI(ctxVars, []string{ "rm", volName })
+    cli, err := clientv3.New(clientv3.Config{
+        DialTimeout: ETCD_TIMEOUT,
+        Endpoints: etcdUrl,
+    })
    if (err != nil)
    {
-        return nil, err
+        return nil, status.Error(codes.Internal, "failed to connect to etcd at "+strings.Join(etcdUrl, ",")+": "+err.Error())
+    }
+    defer cli.Close()
+
+    // Find inode by name
+    ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
+    resp, err := cli.Get(ctx, etcdPrefix+"/index/image/"+volName)
+    cancel()
+    if (err != nil)
+    {
+        return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
+    }
+    if (len(resp.Kvs) == 0)
+    {
+        return nil, status.Error(codes.NotFound, "volume "+volName+" does not exist")
+    }
+    var idx InodeIndex
+    err = json.Unmarshal(resp.Kvs[0].Value, &idx)
+    if (err != nil)
+    {
+        return nil, status.Error(codes.Internal, "invalid /index/image/"+volName+" key in etcd: "+err.Error())
+    }
+
+    // Get inode config
+    inodeCfgKey := fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, idx.PoolId, idx.Id)
+    ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
+    resp, err = cli.Get(ctx, inodeCfgKey)
+    cancel()
+    if (err != nil)
+    {
+        return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
+    }
+    if (len(resp.Kvs) == 0)
+    {
+        return nil, status.Error(codes.NotFound, "volume "+volName+" does not exist")
+    }
+    var inodeCfg InodeConfig
+    err = json.Unmarshal(resp.Kvs[0].Value, &inodeCfg)
+    if (err != nil)
+    {
+        return nil, status.Error(codes.Internal, "invalid "+inodeCfgKey+" key in etcd: "+err.Error())
+    }
+
+    // Delete inode data by invoking vitastor-cli
+    args := []string{
+        "rm-data", "--etcd_address", strings.Join(etcdUrl, ","),
+        "--pool", fmt.Sprintf("%d", idx.PoolId),
+        "--inode", fmt.Sprintf("%d", idx.Id),
+    }
+    if (ctxVars["configPath"] != "")
+    {
+        args = append(args, "--config_path", ctxVars["configPath"])
+    }
+    c := exec.Command("/usr/bin/vitastor-cli", args...)
+    var stderr bytes.Buffer
+    c.Stdout = nil
+    c.Stderr = &stderr
+    err = c.Run()
+    stderrStr := string(stderr.Bytes())
+    if (err != nil)
+    {
+        klog.Errorf("vitastor-cli rm-data failed: %s, status %s\n", stderrStr, err)
+        return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
+    }
+
+    // Delete inode config in etcd
+    ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
+    txnResp, err := cli.Txn(ctx).Then(
+        clientv3.OpDelete(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName)),
+        clientv3.OpDelete(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, idx.PoolId, idx.Id)),
+    ).Commit()
+    cancel()
+    if (err != nil)
+    {
+        return nil, status.Error(codes.Internal, "failed to delete keys in etcd: "+err.Error())
+    }
+    if (!txnResp.Succeeded)
+    {
+        return nil, status.Error(codes.Internal, "failed to delete keys in etcd: transaction failed")
    }

    return &csi.DeleteVolumeResponse{}, nil
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,10 +1,10 @@
-vitastor (0.8.5-1) unstable; urgency=medium
+vitastor (0.8.3-1) unstable; urgency=medium

  * Bugfixes

 -- Vitaliy Filippov <vitalif@yourcmc.ru>  Fri, 03 Jun 2022 02:09:44 +0300

-vitastor (0.8.5-1) unstable; urgency=medium
+vitastor (0.8.3-1) unstable; urgency=medium

  * Implement NFS proxy
  * Add documentation
--- a/debian/vitastor.Dockerfile
+++ b/debian/vitastor.Dockerfile
@@ -34,8 +34,8 @@ RUN set -e -x; \
    mkdir -p /root/packages/vitastor-$REL; \
    rm -rf /root/packages/vitastor-$REL/*; \
    cd /root/packages/vitastor-$REL; \
-    cp -r /root/vitastor vitastor-0.8.5; \
-    cd vitastor-0.8.5; \
+    cp -r /root/vitastor vitastor-0.8.3; \
+    cd vitastor-0.8.3; \
    ln -s /root/fio-build/fio-*/ ./fio; \
    FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
    ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -48,8 +48,8 @@ RUN set -e -x; \
    rm -rf a b; \
    echo "dep:fio=$FIO" > debian/fio_version; \
    cd /root/packages/vitastor-$REL; \
-    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.5.orig.tar.xz vitastor-0.8.5; \
-    cd vitastor-0.8.5; \
+    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.3.orig.tar.xz vitastor-0.8.3; \
+    cd vitastor-0.8.3; \
    V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
    DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
    DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
--- a/docs/installation/packages.en.md
+++ b/docs/installation/packages.en.md
@@ -9,7 +9,7 @@
 ## Debian

 - Trust Vitastor package signing key:
-  `wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg`
+  `wget -q -O - https://vitastor.io/debian/pubkey | sudo apt-key add -`
 - Add Vitastor package repository to your /etc/apt/sources.list:
  - Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
  - Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
@@ -20,8 +20,8 @@
 ## CentOS

 - Add Vitastor package repository:
-  - CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release.rpm`
-  - CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release.rpm`
+  - CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm`
+  - CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm`
 - Enable EPEL: `yum/dnf install epel-release`
 - Enable additional CentOS repositories:
  - CentOS 7: `yum install centos-release-scl`
--- a/docs/installation/packages.ru.md
+++ b/docs/installation/packages.ru.md
@@ -9,7 +9,7 @@
 ## Debian

 - Добавьте ключ репозитория Vitastor:
-  `wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg`
+  `wget -q -O - https://vitastor.io/debian/pubkey | sudo apt-key add -`
 - Добавьте репозиторий Vitastor в /etc/apt/sources.list:
  - Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
  - Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
@@ -20,8 +20,8 @@
 ## CentOS

 - Добавьте в систему репозиторий Vitastor:
-  - CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release.rpm`
-  - CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release.rpm`
+  - CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm`
+  - CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm`
 - Включите EPEL: `yum/dnf install epel-release`
 - Включите дополнительные репозитории CentOS:
  - CentOS 7: `yum install centos-release-scl`
--- a/docs/usage/cli.en.md
+++ b/docs/usage/cli.en.md
@@ -14,7 +14,6 @@ It supports the following commands:
 - [df](#df)
 - [ls](#ls)
 - [create](#create)
- [snap-create](#create)
 - [modify](#modify)
 - [rm](#rm)
 - [flatten](#flatten)
@@ -124,8 +123,6 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>

 Create a snapshot of image `<name>` (either form can be used). May be used live if only a single writer is active.

-See also about [how to export snapshots](qemu.en.md#exporting-snapshots).
-
 ## modify

 `vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`
--- a/docs/usage/cli.ru.md
+++ b/docs/usage/cli.ru.md
@@ -15,7 +15,6 @@ vitastor-cli - интерфейс командной строки для адм
 - [df](#df)
 - [ls](#ls)
 - [create](#create)
- [snap-create](#create)
 - [modify](#modify)
 - [rm](#rm)
 - [flatten](#flatten)
@@ -127,8 +126,6 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
 Создать снимок образа `<name>` (можно использовать любую форму команды). Снимок можно создавать без остановки
 клиентов, если пишущий клиент максимум 1.

-Смотрите также информацию о том, [как экспортировать снимки](qemu.ru.md#экспорт-снимков).
-
 ## modify

 `vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`
--- a/docs/usage/qemu.en.md
+++ b/docs/usage/qemu.en.md
@@ -46,40 +46,3 @@ qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=192.168.7

 You can also specify `:pool=<POOL>:inode=<INODE>:size=<SIZE>` instead of `:image=<IMAGE>`
 if you don't want to use inode metadata.
-
-### Exporting snapshots
-
-Starting with 0.8.4, you can also export individual layers (snapshot diffs) using `qemu-img`.
-
-Suppose you have an image `testimg` and a snapshot `testimg@0` created with `vitastor-cli snap-create testimg@0`.
-
-Then you can export the `testimg@0` snapshot and the data written to `testimg` after creating
-the snapshot separately using the following commands (key points are using `skip-parents=1` and
-`-B backing_file` option):
-
-```
-qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg@0' \
-    -O qcow2 testimg_0.qcow2
-
-qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg:skip-parents=1' \
-    -O qcow2 -o 'cluster_size=4k' -B testimg_0.qcow2 testimg.qcow2
-```
-
-In fact, with `cluster_size=4k` any QCOW2 file can be used instead `-B testimg_0.qcow2`, even an empty one.
-
-QCOW2 `cluster_size=4k` option is required if you want `testimg.qcow2` to contain only the data
-overwritten  **exactly** in the child layer. With the default 64 KB QCOW2 cluster size you'll
-get a bit of extra data from parent layers, i.e. a 4 KB overwrite will result in `testimg.qcow2`
-containing 64 KB of data. And this extra data will be taken by `qemu-img` from the file passed
-in `-B` option, so you really need 4 KB cluster if you use an empty image in `-B`.
-
-After this procedure you'll get two chained QCOW2 images. To detach `testimg.qcow2` from
-its parent, run:
-
-```
-qemu-img rebase -u -b '' testimg.qcow2
-```
-
-This can be used for backups. Just note that exporting an image that is currently being written to
-is of course unsafe and doesn't produce a consistent result, so only export snapshots if you do this
-on a live VM.
--- a/docs/usage/qemu.ru.md
+++ b/docs/usage/qemu.ru.md
@@ -50,40 +50,3 @@ qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=10.115.0.

 Если вы не хотите обращаться к образу по имени, вместо `:image=<IMAGE>` можно указать номер пула, номер инода и размер:
 `:pool=<POOL>:inode=<INODE>:size=<SIZE>`.
-
-### Экспорт снимков
-
-Начиная с 0.8.4 вы можете экспортировать отдельные слои (изменения в снимках) с помощью `qemu-img`.
-
-Допустим, что у вас есть образ `testimg` и его снимок `testimg@0`, созданный с помощью `vitastor-cli snap-create testimg@0`.
-
-Тогда вы можете выгрузить снимок `testimg@0` и данные, изменённые в `testimg` после создания снимка, отдельно,
-с помощью следующих команд (ключевые моменты - использование `skip-parents=1` и опции `-B backing_file.qcow2`):
-
-```
-qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg@0' \
-    -O qcow2 testimg_0.qcow2
-
-qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg:skip-parents=1' \
-    -O qcow2 -o 'cluster_size=4k' -B testimg_0.qcow2 testimg.qcow2
-```
-
-На самом деле, с `cluster_size=4k` вместо `-B testimg_0.qcow2` можно использовать любой qcow2-файл,
-даже пустой.
-
-Опция QCOW2 `cluster_size=4k` нужна, если вы хотите, чтобы `testimg.qcow2` содержал **в точности**
-данные, перезаписанные в дочернем слое. С размером кластера QCOW2 по умолчанию, составляющим 64 КБ,
-вы получите немного "лишних" данных из родительских слоёв - перезапись 4 КБ будет приводить к тому,
-что в `testimg.qcow2` будет появляться 64 КБ данных. Причём "лишние" данные qemu-img будет брать
-как раз из файла, указанного в опции `-B`, так что если там указан пустой образ, кластер обязан быть 4 КБ.
-
-После данной процедуры вы получите два QCOW2-образа, связанных в цепочку. Чтобы "отцепить" образ
-`testimg.qcow2` от базового, выполните:
-
-```
-qemu-img rebase -u -b '' testimg.qcow2
-```
-
-Это можно использовать для резервного копирования. Только помните, что экспортировать образ, в который
-в то же время идёт запись, небезопасно - результат чтения не будет целостным. Так что если вы работаете
-с активными виртуальными машинами, экспортируйте только их снимки, но не сам образ.
--- a/2
+++ b/2
--- a/mon/lp-optimizer.js
+++ b/mon/lp-optimizer.js
@@ -550,8 +550,8 @@ function random_combinations(osd_tree, pg_size, count, ordered)
        seed ^= seed << 5;
        return seed + 2147483648;
    };
+    const hosts = Object.keys(osd_tree).sort();
    const osds = Object.keys(osd_tree).reduce((a, c) => { a[c] = Object.keys(osd_tree[c]).sort(); return a; }, {});
-    const hosts = Object.keys(osd_tree).sort().filter(h => osds[h].length > 0);
    const r = {};
    // Generate random combinations including each OSD at least once
    for (let h = 0; h < hosts.length; h++)
--- a/mon/make-etcd
+++ b/mon/make-etcd
@@ -79,7 +79,7 @@ StartLimitInterval=0
 RestartSec=10

 [Install]
-WantedBy=multi-user.target
+WantedBy=local.target
 `);
    await system(`useradd etcd`);
    await system(`systemctl daemon-reload`);
--- a/mon/mon.js
+++ b/mon/mon.js
@@ -70,9 +70,9 @@ const etcd_tree = {
            rdma_gid_index: 0,
            rdma_mtu: 4096,
            rdma_max_sge: 128,
-            rdma_max_send: 64,
-            rdma_max_recv: 128,
-            rdma_max_msg: 132096,
+            rdma_max_send: 32,
+            rdma_max_recv: 8,
+            rdma_max_msg: 1048576,
            log_level: 0,
            block_size: 131072,
            disk_alignment: 4096,
@@ -107,10 +107,6 @@ const etcd_tree = {
            slow_log_interval: 10,
            inode_vanish_time: 60,
            osd_memlock: false,
-            scrub_interval: '30d', // 1s/1m/1h/1d
-            scrub_queue_depth: 1,
-            scrub_sleep: 0, // milliseconds
-            scrub_list_limit: 1000, // objects to list on one scrub iteration
            // blockstore - fixed in superblock
            block_size,
            disk_alignment,
@@ -172,8 +168,6 @@ const etcd_tree = {
                osd_tags?: 'nvme' | [ 'nvme', ... ],
                // prefer to put primary on OSD with these tags
                primary_affinity_tags?: 'nvme' | [ 'nvme', ... ],
-                // scrub interval
-                scrub_interval?: '30d',
            },
            ...
        }, */
@@ -267,9 +261,9 @@ const etcd_tree = {
            /* <pool_id>: {
                <pg_id>: {
                    primary: osd_num_t,
-                    state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
-                        "degraded"|"has_corrupted"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
-                        "has_invalid"|"left_on_dead"|"scrubbing")[],
+                    state: ("starting"|"peering"|"peered"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
+                        "degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
+                        "has_invalid"|"left_on_dead")[],
                }
            }, */
        },
@@ -291,7 +285,6 @@ const etcd_tree = {
                    osd_sets: osd_num_t[][],
                    all_peers: osd_num_t[],
                    epoch: uint64_t,
-                    scrub_ts: uint64_t,
                },
            }, */
        },
--- a/patches/VitastorPlugin.pm
+++ b/patches/VitastorPlugin.pm
@@ -16,11 +16,6 @@ use PVE::Tools qw(run_command);

 use base qw(PVE::Storage::Plugin);

-if (@PVE::Storage::Plugin::SHARED_STORAGE)
-{
-    push @PVE::Storage::Plugin::SHARED_STORAGE, 'vitastor';
-}
-
 sub api
 {
    # Trick it :)
@@ -138,11 +133,9 @@ sub properties
 sub options
 {
    return {
-        shared => { optional => 1 },
-        content => { optional => 1 },
        nodes => { optional => 1 },
        disable => { optional => 1 },
-        vitastor_etcd_address => { optional => 1 },
+        vitastor_etcd_address => { optional => 1},
        vitastor_etcd_prefix => { optional => 1 },
        vitastor_config_path => { optional => 1 },
        vitastor_prefix => { optional => 1 },
--- a/patches/cinder-vitastor.py
+++ b/patches/cinder-vitastor.py
@@ -50,7 +50,7 @@ from cinder.volume import configuration
 from cinder.volume import driver
 from cinder.volume import volume_utils

-VERSION = '0.8.5'
+VERSION = '0.8.3'

 LOG = logging.getLogger(__name__)

--- a/rpm/build-tarball.sh
+++ b/rpm/build-tarball.sh
@@ -25,4 +25,4 @@ rm fio
 mv fio-copy fio
 FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
 perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
-tar --transform 's#^#vitastor-0.8.5/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.5$(rpm --eval '%dist').tar.gz *
+tar --transform 's#^#vitastor-0.8.3/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.3$(rpm --eval '%dist').tar.gz *
--- a/rpm/vitastor-el7.Dockerfile
+++ b/rpm/vitastor-el7.Dockerfile
@@ -35,7 +35,7 @@ ADD . /root/vitastor
 RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
-    cp /root/vitastor-0.8.5.el7.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-0.8.3.el7.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
--- a/rpm/vitastor-el7.spec
+++ b/rpm/vitastor-el7.spec
@@ -1,11 +1,11 @@
 Name:           vitastor
-Version:        0.8.5
+Version:        0.8.3
 Release:        1%{?dist}
 Summary:        Vitastor, a fast software-defined clustered block storage

 License:        Vitastor Network Public License 1.1
 URL:            https://vitastor.io/
-Source0:        vitastor-0.8.5.el7.tar.gz
+Source0:        vitastor-0.8.3.el7.tar.gz

 BuildRequires:  liburing-devel >= 0.6
 BuildRequires:  gperftools-devel
@@ -35,7 +35,6 @@ Summary:        Vitastor - OSD
 Requires:       libJerasure2
 Requires:       libisa-l
 Requires:       liburing >= 0.6
-Requires:       liburing < 2
 Requires:       vitastor-client = %{version}-%{release}
 Requires:       util-linux
 Requires:       parted
@@ -60,7 +59,6 @@ scheduling cluster-level operations.
 %package -n vitastor-client
 Summary:        Vitastor - client
 Requires:       liburing >= 0.6
-Requires:       liburing < 2


 %description -n vitastor-client
--- a/rpm/vitastor-el8.Dockerfile
+++ b/rpm/vitastor-el8.Dockerfile
@@ -35,7 +35,7 @@ ADD . /root/vitastor
 RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
-    cp /root/vitastor-0.8.5.el8.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-0.8.3.el8.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
--- a/rpm/vitastor-el8.spec
+++ b/rpm/vitastor-el8.spec
@@ -1,11 +1,11 @@
 Name:           vitastor
-Version:        0.8.5
+Version:        0.8.3
 Release:        1%{?dist}
 Summary:        Vitastor, a fast software-defined clustered block storage

 License:        Vitastor Network Public License 1.1
 URL:            https://vitastor.io/
-Source0:        vitastor-0.8.5.el8.tar.gz
+Source0:        vitastor-0.8.3.el8.tar.gz

 BuildRequires:  liburing-devel >= 0.6
 BuildRequires:  gperftools-devel
@@ -34,7 +34,6 @@ Summary:        Vitastor - OSD
 Requires:       libJerasure2
 Requires:       libisa-l
 Requires:       liburing >= 0.6
-Requires:       liburing < 2
 Requires:       vitastor-client = %{version}-%{release}
 Requires:       util-linux
 Requires:       parted
@@ -58,7 +57,6 @@ scheduling cluster-level operations.
 %package -n vitastor-client
 Summary:        Vitastor - client
 Requires:       liburing >= 0.6
-Requires:       liburing < 2


 %description -n vitastor-client
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -3,7 +3,6 @@ cmake_minimum_required(VERSION 2.8)
 project(vitastor)

 include(GNUInstallDirs)
-include(CTest)

 set(WITH_QEMU false CACHE BOOL "Build QEMU driver inside Vitastor source tree")
 set(WITH_FIO true CACHE BOOL "Build FIO driver")
@@ -16,7 +15,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
 	set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
 endif()

-add_definitions(-DVERSION="0.8.5")
+add_definitions(-DVERSION="0.8.3")
 add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
 if (${WITH_ASAN})
 	add_definitions(-fsanitize=address -fno-omit-frame-pointer)
@@ -56,14 +55,6 @@ if (ISAL_LIBRARIES)
 	add_definitions(-DWITH_ISAL)
 endif (ISAL_LIBRARIES)

-add_custom_target(build_tests)
-add_custom_target(test
-	COMMAND
-	echo leak:tcmalloc > ${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt &&
-	env LSAN_OPTIONS=suppressions=${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt ${CMAKE_CTEST_COMMAND}
-)
-add_dependencies(test build_tests)
-
 include_directories(
 	../
 	/usr/include/jerasure
@@ -111,7 +102,7 @@ target_compile_options(vitastor_common PUBLIC -fPIC)
 add_executable(vitastor-osd
 	osd_main.cpp osd.cpp osd_secondary.cpp osd_peering.cpp osd_flush.cpp osd_peering_pg.cpp
 	osd_primary.cpp osd_primary_chain.cpp osd_primary_sync.cpp osd_primary_write.cpp osd_primary_subops.cpp
-	osd_cluster.cpp osd_rmw.cpp osd_scrub.cpp
+	osd_cluster.cpp osd_rmw.cpp
 )
 target_link_libraries(vitastor-osd
 	vitastor_common
@@ -154,6 +145,7 @@ add_library(vitastor_client SHARED
 set_target_properties(vitastor_client PROPERTIES PUBLIC_HEADER "vitastor_c.h")
 target_link_libraries(vitastor_client
 	vitastor_common
+	tcmalloc_minimal
 	${LIBURING_LIBRARIES}
 	${IBVERBS_LIBRARIES}
 )
@@ -243,17 +235,14 @@ add_executable(osd_test osd_test.cpp rw_blocking.cpp addr_util.cpp)
 target_link_libraries(osd_test tcmalloc_minimal)

 # osd_rmw_test
-add_executable(osd_rmw_test EXCLUDE_FROM_ALL osd_rmw_test.cpp allocator.cpp)
+# FIXME: Move to tests
+add_executable(osd_rmw_test osd_rmw_test.cpp allocator.cpp)
 target_link_libraries(osd_rmw_test Jerasure ${ISAL_LIBRARIES} tcmalloc_minimal)
-add_dependencies(build_tests osd_rmw_test)
-add_test(NAME osd_rmw_test COMMAND osd_rmw_test)

 if (ISAL_LIBRARIES)
-	add_executable(osd_rmw_test_je EXCLUDE_FROM_ALL osd_rmw_test.cpp allocator.cpp)
+	add_executable(osd_rmw_test_je osd_rmw_test.cpp allocator.cpp)
 	target_compile_definitions(osd_rmw_test_je PUBLIC -DNO_ISAL)
 	target_link_libraries(osd_rmw_test_je Jerasure tcmalloc_minimal)
-	add_dependencies(build_tests osd_rmw_test_je)
-	add_test(NAME osd_rmw_test_jerasure COMMAND osd_rmw_test_je)
 endif (ISAL_LIBRARIES)

 # stub_uring_osd
@@ -268,15 +257,11 @@ target_link_libraries(stub_uring_osd
 )

 # osd_peering_pg_test
-add_executable(osd_peering_pg_test EXCLUDE_FROM_ALL osd_peering_pg_test.cpp osd_peering_pg.cpp)
+add_executable(osd_peering_pg_test osd_peering_pg_test.cpp osd_peering_pg.cpp)
 target_link_libraries(osd_peering_pg_test tcmalloc_minimal)
-add_dependencies(build_tests osd_peering_pg_test)
-add_test(NAME osd_peering_pg_test COMMAND osd_peering_pg_test)

 # test_allocator
-add_executable(test_allocator EXCLUDE_FROM_ALL test_allocator.cpp allocator.cpp)
-add_dependencies(build_tests test_allocator)
-add_test(NAME test_allocator COMMAND test_allocator)
+add_executable(test_allocator test_allocator.cpp allocator.cpp)

 # test_cas
 add_executable(test_cas
@@ -296,15 +281,12 @@ target_link_libraries(test_crc32

 # test_cluster_client
 add_executable(test_cluster_client
-	EXCLUDE_FROM_ALL
 	test_cluster_client.cpp
 	pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
-	etcd_state_client.cpp timerfd_manager.cpp str_util.cpp ../json11/json11.cpp
+	etcd_state_client.cpp timerfd_manager.cpp ../json11/json11.cpp
 )
 target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)
 target_include_directories(test_cluster_client PUBLIC ${CMAKE_SOURCE_DIR}/src/mock)
-add_dependencies(build_tests test_cluster_client)
-add_test(NAME test_cluster_client COMMAND test_cluster_client)

 ## test_blockstore, test_shit
 #add_executable(test_blockstore test_blockstore.cpp)
--- a/src/blockstore.h
+++ b/src/blockstore.h
@@ -122,14 +122,11 @@ Output:
 Get a list of all objects in this Blockstore.

 Input:
- pg_alignment = PG alignment
- pg_count = PG count or 0 to list all objects
- pg_number = PG number
- list_stable_limit = max number of clean objects in the reply
-  it's guaranteed that dirty objects are returned from the same interval,
-  i.e. from (min_oid .. min(max_oid, max(returned stable OIDs)))
- min_oid = min inode/stripe or 0 to list all objects
- max_oid = max inode/stripe or 0 to list all objects
+- oid.stripe = PG alignment
+- len = PG count or 0 to list all objects
+- offset = PG number
+- oid.inode = min inode number or 0 to list all inodes
+- version = max inode number or 0 to list all inodes

 Output:
 - retval = total obj_ver_id count
@@ -146,27 +143,10 @@ struct blockstore_op_t
    uint64_t opcode;
    // finish callback
    std::function<void (blockstore_op_t*)> callback;
-    union
-    {
-        // R/W
-        struct
-        {
-            object_id oid;
-            uint64_t version;
-            uint32_t offset;
-            uint32_t len;
-        };
-        // List
-        struct __attribute__((__packed__))
-        {
-            object_id min_oid;
-            object_id max_oid;
-            uint32_t pg_alignment;
-            uint32_t pg_count;
-            uint32_t pg_number;
-            uint32_t list_stable_limit;
-        };
-    };
+    object_id oid;
+    uint64_t version;
+    uint32_t offset;
+    uint32_t len;
    void *buf;
    void *bitmap;
    int retval;
--- a/src/blockstore_impl.cpp
+++ b/src/blockstore_impl.cpp
@@ -325,7 +325,7 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
    {
        // Basic verification not passed
        op->retval = -EINVAL;
-        ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
+        std::function<void (blockstore_op_t*)>(op->callback)(op);
        return;
    }
    if (op->opcode == BS_OP_SYNC_STAB_ALL)
@@ -368,7 +368,7 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
    }
    if ((op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE) && !enqueue_write(op))
    {
-        ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
+        std::function<void (blockstore_op_t*)>(op->callback)(op);
        return;
    }
    // Call constructor without allocating memory. We'll call destructor before returning op back
@@ -445,11 +445,11 @@ void blockstore_impl_t::reshard_clean_db(pool_id_t pool, uint32_t pg_count, uint

 void blockstore_impl_t::process_list(blockstore_op_t *op)
 {
-    uint32_t list_pg = op->pg_number+1;
-    uint32_t pg_count = op->pg_count;
-    uint64_t pg_stripe_size = op->pg_alignment;
-    uint64_t min_inode = op->min_oid.inode;
-    uint64_t max_inode = op->max_oid.inode;
+    uint32_t list_pg = op->offset+1;
+    uint32_t pg_count = op->len;
+    uint64_t pg_stripe_size = op->oid.stripe;
+    uint64_t min_inode = op->oid.inode;
+    uint64_t max_inode = op->version;
    // Check PG
    if (pg_count != 0 && (pg_stripe_size < MIN_DATA_BLOCK_SIZE || list_pg > pg_count))
    {
@@ -496,13 +496,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
            stable_alloc += clean_db.size();
        }
    }
-    if (op->list_stable_limit > 0)
-    {
-        stable_alloc = op->list_stable_limit;
-        if (stable_alloc > 1024*1024)
-            stable_alloc = 1024*1024;
-    }
-    if (stable_alloc < 32768)
+    else
    {
        stable_alloc = 32768;
    }
@@ -513,21 +507,22 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
        FINISH_OP(op);
        return;
    }
-    auto max_oid = op->max_oid;
-    bool limited = false;
    for (auto shard_it = clean_db_shards.lower_bound(first_shard);
        shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
        shard_it++)
    {
        auto & clean_db = shard_it->second;
        auto clean_it = clean_db.begin(), clean_end = clean_db.end();
-        if (op->min_oid.inode != 0 || op->min_oid.stripe != 0)
+        if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode)
        {
-            clean_it = clean_db.lower_bound(op->min_oid);
-        }
-        if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid))
-        {
-            clean_end = clean_db.upper_bound(max_oid);
+            clean_it = clean_db.lower_bound({
+                .inode = min_inode,
+                .stripe = 0,
+            });
+            clean_end = clean_db.upper_bound({
+                .inode = max_inode,
+                .stripe = UINT64_MAX,
+            });
        }
        for (; clean_it != clean_end; clean_it++)
        {
@@ -546,24 +541,11 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
                .oid = clean_it->first,
                .version = clean_it->second.version,
            };
-            if (op->list_stable_limit > 0 && !limited && stable_count >= op->list_stable_limit)
-            {
-                limited = true;
-                break;
-            }
-        }
-        if (op->list_stable_limit > 0 && first_shard != last_shard)
-        {
-            // To maintain the order, we have to include objects in the same range from other shards
-            std::sort(stable, stable+stable_count);
-            if (stable_count > op->list_stable_limit)
-                stable_count = op->list_stable_limit;
-            max_oid = stable[stable_count-1].oid;
        }
    }
-    if (op->list_stable_limit == 0 && first_shard != last_shard)
+    if (first_shard != last_shard)
    {
-        // If that's not a per-PG listing, sort clean entries (already sorted if list_stable_limit != 0)
+        // If that's not a per-PG listing, sort clean entries
        std::sort(stable, stable+stable_count);
    }
    int clean_stable_count = stable_count;
@@ -572,17 +554,20 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
    obj_ver_id *unstable = NULL;
    {
        auto dirty_it = dirty_db.begin(), dirty_end = dirty_db.end();
-        if (op->min_oid.inode != 0 || op->min_oid.stripe != 0)
+        if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode)
        {
            dirty_it = dirty_db.lower_bound({
-                .oid = op->min_oid,
+                .oid = {
+                    .inode = min_inode,
+                    .stripe = 0,
+                },
                .version = 0,
            });
-        }
-        if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid))
-        {
            dirty_end = dirty_db.upper_bound({
-                .oid = max_oid,
+                .oid = {
+                    .inode = max_inode,
+                    .stripe = UINT64_MAX,
+                },
                .version = UINT64_MAX,
            });
        }
--- a/src/cli_df.cpp
+++ b/src/cli_df.cpp
@@ -121,7 +121,8 @@ resume_1:
            }
            if (pool_cfg.scheme != POOL_SCHEME_REPLICATED)
            {
-                pool_avail *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
+                uint64_t pg_real_size = pool_stats[pool_cfg.id]["pg_real_size"].uint64_value();
+                pool_avail = pg_real_size > 0 ? pool_avail * (pool_cfg.pg_size - pool_cfg.parity_chunks) / pg_real_size : 0;
            }
            pool_stats[pool_cfg.id] = json11::Json::object {
                { "name", pool_cfg.name },
--- a/src/cli_merge.cpp
+++ b/src/cli_merge.cpp
@@ -403,7 +403,7 @@ struct snap_merger_t
        op->opcode = OSD_OP_READ_BITMAP;
        op->inode = target;
        op->offset = offset;
-        op->len = target_block_size;
+        op->len = 0;
        op->callback = [this](cluster_op_t *op)
        {
            if (op->retval < 0)
--- a/src/cli_rm_data.cpp
+++ b/src/cli_rm_data.cpp
@@ -92,7 +92,6 @@ struct rm_inode_t

    void send_ops(rm_pg_t *cur_list)
    {
-        parent->cli->init_msgr();
        if (parent->cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) ==
            parent->cli->msgr.osd_peer_fds.end())
        {
--- a/src/cli_rm_osd.cpp
+++ b/src/cli_rm_osd.cpp
@@ -410,17 +410,14 @@ struct rm_osd_t
                        parent->cli->st_cli.etcd_prefix+"/pg/history/"+
                        std::to_string(pool_cfg.id)+"/"+std::to_string(pg_num)
                    );
-                    auto hist = json11::Json::object {
-                        { "epoch", pg_cfg.epoch },
-                        { "all_peers", pg_cfg.all_peers },
-                        { "osd_sets", pg_cfg.target_history },
-                    };
-                    if (pg_cfg.scrub_ts)
-                        hist["scrub_ts"] = pg_cfg.scrub_ts;
                    history_updates.push_back(json11::Json::object {
                        { "request_put", json11::Json::object {
                            { "key", history_key },
-                            { "value", base64_encode(json11::Json(hist).dump()) },
+                            { "value", base64_encode(json11::Json(json11::Json::object {
+                                { "epoch", pg_cfg.epoch },
+                                { "all_peers", pg_cfg.all_peers },
+                                { "osd_sets", pg_cfg.target_history },
+                            }).dump()) },
                        } },
                    });
                    history_checks.push_back(json11::Json::object {
--- a/src/cluster_client.cpp
+++ b/src/cluster_client.cpp
@@ -59,6 +59,7 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
        delete op;
    };
    msgr.parse_config(this->config);
+    msgr.init();

    st_cli.tfd = tfd;
    st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
@@ -72,6 +73,17 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd

    scrap_buffer_size = SCRAP_BUFFER_SIZE;
    scrap_buffer = malloc_or_die(scrap_buffer_size);
+
+    if (ringloop)
+    {
+        consumer.loop = [this]()
+        {
+            msgr.read_requests();
+            msgr.send_replies();
+            this->ringloop->submit();
+        };
+        ringloop->register_consumer(&consumer);
+    }
 }

 cluster_client_t::~cluster_client_t()
@@ -103,24 +115,6 @@ cluster_op_t::~cluster_op_t()
    }
 }

-void cluster_client_t::init_msgr()
-{
-    if (msgr_initialized)
-        return;
-    msgr.init();
-    msgr_initialized = true;
-    if (ringloop)
-    {
-        consumer.loop = [this]()
-        {
-            msgr.read_requests();
-            msgr.send_replies();
-            this->ringloop->submit();
-        };
-        ringloop->register_consumer(&consumer);
-    }
-}
-
 void cluster_client_t::calc_wait(cluster_op_t *op)
 {
    op->prev_wait = 0;
@@ -149,7 +143,7 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
        if (!op->prev_wait)
            continue_sync(op);
    }
-    else /* if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP) */
+    else /* if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP) */
    {
        for (auto prev = op_queue_head; prev && prev != op; prev = prev->next)
        {
@@ -157,8 +151,7 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
            {
                op->prev_wait++;
            }
-            else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ ||
-                prev->opcode == OSD_OP_READ_BITMAP || prev->opcode == OSD_OP_READ_CHAIN_BITMAP)
+            else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ || prev->opcode == OSD_OP_READ_BITMAP)
            {
                // Flushes are always in the beginning (we're scanning from the beginning of the queue)
                break;
@@ -178,8 +171,7 @@ void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *n
            auto n2 = next->next;
            if (next->opcode == OSD_OP_SYNC && !(flags & OP_IMMEDIATE_COMMIT) ||
                next->opcode == OSD_OP_WRITE && (flags & OP_FLUSH_BUFFER) && !(next->flags & OP_FLUSH_BUFFER) ||
-                (next->opcode == OSD_OP_READ || next->opcode == OSD_OP_READ_BITMAP ||
-                    next->opcode == OSD_OP_READ_CHAIN_BITMAP) && (flags & OP_FLUSH_BUFFER))
+                (next->opcode == OSD_OP_READ || next->opcode == OSD_OP_READ_BITMAP) && (flags & OP_FLUSH_BUFFER))
            {
                next->prev_wait += inc;
                assert(next->prev_wait >= 0);
@@ -229,14 +221,11 @@ void cluster_client_t::erase_op(cluster_op_t *op)
    if (op_queue_tail == op)
        op_queue_tail = op->prev;
    op->next = op->prev = NULL;
-    if (flags & OP_FLUSH_BUFFER)
-        std::function<void(cluster_op_t*)>(op->callback)(op);
    if (!(flags & OP_IMMEDIATE_COMMIT))
        inc_wait(opcode, flags, next, -1);
    // Call callback at the end to avoid inconsistencies in prev_wait
    // if the callback adds more operations itself
-    if (!(flags & OP_FLUSH_BUFFER))
-        std::function<void(cluster_op_t*)>(op->callback)(op);
+    std::function<void(cluster_op_t*)>(op->callback)(op);
 }

 void cluster_client_t::continue_ops(bool up_retry)
@@ -348,8 +337,7 @@ void cluster_client_t::on_change_hook(std::map<std::string, etcd_kv_t> & changes
            // And now they have to be resliced!
            for (auto op = op_queue_head; op; op = op->next)
            {
-                if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ ||
-                    op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP) &&
+                if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP) &&
                    INODE_POOL(op->cur_inode) == pool_item.first)
                {
                    op->needs_reslice = true;
@@ -421,7 +409,7 @@ void cluster_client_t::on_ready(std::function<void(void)> fn)
 void cluster_client_t::execute(cluster_op_t *op)
 {
    if (op->opcode != OSD_OP_SYNC && op->opcode != OSD_OP_READ &&
-        op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_WRITE)
+        op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_WRITE)
    {
        op->retval = -EINVAL;
        std::function<void(cluster_op_t*)>(op->callback)(op);
@@ -453,7 +441,7 @@ void cluster_client_t::execute(cluster_op_t *op)
            return;
        }
        // Check alignment
-        if (!op->len && (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP || op->opcode == OSD_OP_WRITE) ||
+        if ((op->opcode == OSD_OP_READ || op->opcode == OSD_OP_WRITE) && !op->len ||
            op->offset % pool_it->second.bitmap_granularity || op->len % pool_it->second.bitmap_granularity)
        {
            op->retval = -EINVAL;
@@ -714,7 +702,8 @@ resume_3:
        // Finished successfully
        // Even if the PG count has changed in meanwhile we treat it as success
        // because if some operations were invalid for the new PG count we'd get errors
-        if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
+        bool is_read = op->opcode == OSD_OP_READ;
+        if (is_read)
        {
            // Check parent inode
            auto ino_it = st_cli.inode_config.find(op->cur_inode);
@@ -738,11 +727,6 @@ resume_3:
            }
        }
        op->retval = op->len;
-        if (op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
-        {
-            auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->inode));
-            op->retval = op->len / pool_cfg.bitmap_granularity;
-        }
        erase_op(op);
        return 1;
    }
@@ -766,10 +750,7 @@ resume_3:
        {
            for (int i = 0; i < op->parts.size(); i++)
            {
-                if (!(op->parts[i].flags & PART_DONE))
-                {
-                    op->parts[i].flags = PART_RETRY;
-                }
+                op->parts[i].flags = PART_RETRY;
            }
            goto resume_2;
        }
@@ -828,19 +809,23 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
    uint64_t last_stripe = op->len > 0 ? ((op->offset + op->len - 1) / pg_block_size) * pg_block_size : first_stripe;
    op->retval = 0;
    op->parts.resize((last_stripe - first_stripe) / pg_block_size + 1);
-    if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
+    if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP)
    {
        // Allocate memory for the bitmap
-        unsigned object_bitmap_size = ((op->len / pool_cfg.bitmap_granularity + 7) / 8);
+        unsigned object_bitmap_size = (((op->opcode == OSD_OP_READ_BITMAP ? pg_block_size : op->len) / pool_cfg.bitmap_granularity + 7) / 8);
        object_bitmap_size = (object_bitmap_size < 8 ? 8 : object_bitmap_size);
        unsigned bitmap_mem = object_bitmap_size + (pool_cfg.data_block_size / pool_cfg.bitmap_granularity / 8 * pg_data_size) * op->parts.size();
-        if (!op->bitmap_buf || op->bitmap_buf_size < bitmap_mem)
+        if (op->bitmap_buf_size < bitmap_mem)
        {
            op->bitmap_buf = realloc_or_die(op->bitmap_buf, bitmap_mem);
+            if (!op->bitmap_buf_size)
+            {
+                // First allocation
+                memset(op->bitmap_buf, 0, object_bitmap_size);
+            }
            op->part_bitmaps = (uint8_t*)op->bitmap_buf + object_bitmap_size;
            op->bitmap_buf_size = bitmap_mem;
        }
-        memset(op->bitmap_buf, 0, bitmap_mem);
    }
    int iov_idx = 0;
    size_t iov_pos = 0;
@@ -891,14 +876,13 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
            if (end == begin)
                op->done_count++;
        }
-        else if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_DELETE)
+        else if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_DELETE)
        {
            add_iov(end-begin, false, op, iov_idx, iov_pos, op->parts[i].iov, NULL, 0);
        }
        op->parts[i].parent = op;
        op->parts[i].offset = begin;
-        op->parts[i].len = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP ||
-            op->opcode == OSD_OP_DELETE ? 0 : (uint32_t)(end - begin);
+        op->parts[i].len = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_DELETE ? 0 : (uint32_t)(end - begin);
        op->parts[i].pg_num = pg_num;
        op->parts[i].osd_num = 0;
        op->parts[i].flags = 0;
@@ -927,10 +911,6 @@ bool cluster_client_t::affects_osd(uint64_t inode, uint64_t offset, uint64_t len

 bool cluster_client_t::try_send(cluster_op_t *op, int i)
 {
-    if (!msgr_initialized)
-    {
-        init_msgr();
-    }
    auto part = &op->parts[i];
    auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->cur_inode));
    auto pg_it = pool_cfg.pg_config.find(part->pg_num);
@@ -949,7 +929,7 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
                pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks
            );
            uint64_t meta_rev = 0;
-            if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_DELETE)
+            if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_DELETE)
            {
                auto ino_it = st_cli.inode_config.find(op->inode);
                if (ino_it != st_cli.inode_config.end())
@@ -962,7 +942,7 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
                    .header = {
                        .magic = SECONDARY_OSD_OP_MAGIC,
                        .id = next_op_id(),
-                        .opcode = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP ? OSD_OP_READ : op->opcode,
+                        .opcode = op->opcode == OSD_OP_READ_BITMAP ? OSD_OP_READ : op->opcode,
                    },
                    .inode = op->cur_inode,
                    .offset = part->offset,
@@ -970,10 +950,8 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
                    .meta_revision = meta_rev,
                    .version = op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE ? op->version : 0,
                } },
-                .bitmap = (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP
-                    ? (uint8_t*)op->part_bitmaps + pg_bitmap_size*i : NULL),
-                .bitmap_len = (unsigned)(op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP
-                    ? pg_bitmap_size : 0),
+                .bitmap = (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP ? (uint8_t*)op->part_bitmaps + pg_bitmap_size*i : NULL),
+                .bitmap_len = (unsigned)(op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP ? pg_bitmap_size : 0),
                .callback = [this, part](osd_op_t *op_part)
                {
                    handle_op_part(part);
@@ -1152,11 +1130,11 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
    else
    {
        // OK
-        if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE) && !(op->flags & OP_IMMEDIATE_COMMIT))
+        if (!(op->flags & OP_IMMEDIATE_COMMIT))
            dirty_osds.insert(part->osd_num);
        part->flags |= PART_DONE;
        op->done_count++;
-        if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
+        if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP)
        {
            copy_part_bitmap(op, part);
            op->version = op->parts.size() == 1 ? part->op.reply.rw.version : 0;
@@ -1180,12 +1158,7 @@ void cluster_client_t::copy_part_bitmap(cluster_op_t *op, cluster_op_part_t *par
    );
    uint32_t object_offset = (part->op.req.rw.offset - op->offset) / pool_cfg.bitmap_granularity;
    uint32_t part_offset = (part->op.req.rw.offset % pg_block_size) / pool_cfg.bitmap_granularity;
-    uint32_t op_len = op->len / pool_cfg.bitmap_granularity;
-    uint32_t part_len = pg_block_size/pool_cfg.bitmap_granularity - part_offset;
-    if (part_len > op_len-object_offset)
-    {
-        part_len = op_len-object_offset;
-    }
+    uint32_t part_len = (op->opcode == OSD_OP_READ_BITMAP ? pg_block_size : part->op.req.rw.len) / pool_cfg.bitmap_granularity;
    if (!(object_offset & 0x7) && !(part_offset & 0x7) && (part_len >= 8))
    {
        // Copy bytes
--- a/src/cluster_client.h
+++ b/src/cluster_client.h
@@ -11,7 +11,6 @@
 #define INODE_LIST_DONE 1
 #define INODE_LIST_HAS_UNSTABLE 2
 #define OSD_OP_READ_BITMAP OSD_OP_SEC_READ_BMP
-#define OSD_OP_READ_CHAIN_BITMAP 0x102

 #define OSD_OP_IGNORE_READONLY 0x08

@@ -31,7 +30,7 @@ struct cluster_op_part_t

 struct cluster_op_t
 {
-    uint64_t opcode; // OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC, OSD_OP_DELETE, OSD_OP_READ_BITMAP, OSD_OP_READ_CHAIN_BITMAP
+    uint64_t opcode; // OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC, OSD_OP_DELETE, OSD_OP_READ_BITMAP
    uint64_t inode;
    uint64_t offset;
    uint64_t len;
@@ -40,13 +39,9 @@ struct cluster_op_t
    uint64_t version = 0;
    // now only OSD_OP_IGNORE_READONLY is supported
    uint64_t flags = 0;
-    // negative retval is an error number
-    // write and read return len on success
-    // sync and delete return 0 on success
-    // read_bitmap and read_chain_bitmap return the length of bitmap in bits(!)
    int retval;
    osd_op_buf_list_t iov;
-    // READ, READ_BITMAP, READ_CHAIN_BITMAP return the bitmap here
+    // READ and READ_BITMAP return the bitmap here
    void *bitmap_buf = NULL;
    std::function<void(cluster_op_t*)> callback;
    ~cluster_op_t();
@@ -104,14 +99,10 @@ class cluster_client_t
    std::vector<std::function<void(void)>> on_ready_hooks;
    std::vector<inode_list_t*> lists;
    int continuing_ops = 0;
-    bool msgr_initialized = false;

 public:
    etcd_state_client_t st_cli;
-
    osd_messenger_t msgr;
-    void init_msgr();
-
    json11::Json config;
    json11::Json::object merged_config;

--- a/src/disk_tool_utils.cpp
+++ b/src/disk_tool_utils.cpp
@@ -305,10 +305,10 @@ int write_zero(int fd, uint64_t offset, uint64_t size)
 json11::Json read_parttable(std::string dev)
 {
    std::string part_dump;
-    int r = shell_exec({ "sfdisk", "--json", dev }, "", &part_dump, NULL);
+    int r = shell_exec({ "sfdisk", "--dump", dev, "--json" }, "", &part_dump, NULL);
    if (r == 255)
    {
-        fprintf(stderr, "Error running sfdisk --json %s\n", dev.c_str());
+        fprintf(stderr, "Error running sfdisk --dump %s --json\n", dev.c_str());
        return json11::Json(false);
    }
    // Decode partition table
@@ -319,7 +319,7 @@ json11::Json read_parttable(std::string dev)
        pt = json11::Json::parse(part_dump, err);
        if (err != "")
        {
-            fprintf(stderr, "sfdisk --json %s returned bad JSON: %s\n", dev.c_str(), part_dump.c_str());
+            fprintf(stderr, "sfdisk --dump %s --json returned bad JSON: %s\n", dev.c_str(), part_dump.c_str());
            return json11::Json(false);
        }
        pt = pt["partitiontable"];
--- a/src/etcd_state_client.cpp
+++ b/src/etcd_state_client.cpp
@@ -7,8 +7,8 @@
 #ifndef __MOCK__
 #include "addr_util.h"
 #include "http_client.h"
-#endif
 #include "str_util.h"
+#endif

 etcd_state_client_t::~etcd_state_client_t()
 {
@@ -759,10 +759,6 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
                fprintf(stderr, "Pool %u has invalid bitmap_granularity (must divide block_size), skipping pool\n", pool_id);
                continue;
            }
-            // Scrub Interval
-            pc.scrub_interval = parse_time(pool_item.second["scrub_interval"].string_value());
-            if (!pc.scrub_interval)
-                pc.scrub_interval = 0;
            // Immediate Commit Mode
            pc.immediate_commit = pool_item.second["immediate_commit"].is_string()
                ? (pool_item.second["immediate_commit"].string_value() == "all"
@@ -871,6 +867,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
        }
        else
        {
+            fprintf(stderr, "RECEIVED PG %u/%u HISTORY: %s\n", pool_id, pg_num, value.dump().c_str());
            auto & pg_cfg = this->pool_config[pool_id].pg_config[pg_num];
            pg_cfg.target_history.clear();
            pg_cfg.all_peers.clear();
@@ -905,8 +902,6 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
            }
            // Read epoch
            pg_cfg.epoch = value["epoch"].uint64_value();
-            // Scrub timestamp
-            pg_cfg.scrub_ts = parse_time(value["scrub_ts"].string_value());
            if (on_change_pg_history_hook != NULL)
            {
                on_change_pg_history_hook(pool_id, pg_num);
--- a/src/etcd_state_client.h
+++ b/src/etcd_state_client.h
@@ -39,7 +39,6 @@ struct pg_config_t
    osd_num_t cur_primary;
    int cur_state;
    uint64_t epoch;
-    uint64_t scrub_ts;
 };

 struct pool_config_t
@@ -56,7 +55,6 @@ struct pool_config_t
    uint64_t max_osd_combinations;
    uint64_t pg_stripe_size;
    std::map<pg_num_t, pg_config_t> pg_config;
-    uint64_t scrub_interval;
 };

 struct inode_config_t
--- a/src/messenger.cpp
+++ b/src/messenger.cpp
@@ -157,7 +157,7 @@ void osd_messenger_t::parse_config(const json11::Json & config)
        this->rdma_max_sge = 128;
    this->rdma_max_send = config["rdma_max_send"].uint64_value();
    if (!this->rdma_max_send)
-        this->rdma_max_send = 64;
+        this->rdma_max_send = 1;
    this->rdma_max_recv = config["rdma_max_recv"].uint64_value();
    if (!this->rdma_max_recv)
        this->rdma_max_recv = 128;
--- a/src/messenger.h
+++ b/src/messenger.h
@@ -138,7 +138,6 @@ protected:

    std::vector<int> read_ready_clients;
    std::vector<int> write_ready_clients;
-    // We don't use ringloop->set_immediate here because we may have no ringloop in client :)
    std::vector<std::function<void()>> set_immediate;

 public:
--- a/src/msgr_rdma.cpp
+++ b/src/msgr_rdma.cpp
@@ -368,8 +368,9 @@ static void try_send_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
 bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
 {
    auto rc = cl->rdma_conn;
-    if (!cl->send_list.size() || rc->cur_send >= rc->max_send)
+    if (!cl->send_list.size() || rc->cur_send > 0)
    {
+        // Only send one batch at a time
        return true;
    }
    uint64_t op_size = 0, op_sge = 0;
@@ -379,7 +380,6 @@ bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
        iovec & iov = cl->send_list[rc->send_pos];
        if (op_size >= rc->max_msg || op_sge >= rc->max_sge)
        {
-            rc->send_sizes.push_back(op_size);
            try_send_rdma_wr(cl, sge, op_sge);
            op_sge = 0;
            op_size = 0;
@@ -405,24 +405,18 @@ bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
    }
    if (op_sge > 0)
    {
-        rc->send_sizes.push_back(op_size);
        try_send_rdma_wr(cl, sge, op_sge);
    }
    return true;
 }

-static void try_recv_rdma_wr(osd_client_t *cl, void *buf)
+static void try_recv_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
 {
-    ibv_sge sge = {
-        .addr = (uintptr_t)buf,
-        .length = (uint32_t)cl->rdma_conn->max_msg,
-        .lkey = cl->rdma_conn->ctx->mr->lkey,
-    };
    ibv_recv_wr *bad_wr = NULL;
    ibv_recv_wr wr = {
        .wr_id = (uint64_t)(cl->peer_fd*2),
-        .sg_list = &sge,
-        .num_sge = 1,
+        .sg_list = sge,
+        .num_sge = op_sge,
    };
    int err = ibv_post_recv(cl->rdma_conn->qp, &wr, &bad_wr);
    if (err || bad_wr)
@@ -440,7 +434,12 @@ bool osd_messenger_t::try_recv_rdma(osd_client_t *cl)
    {
        void *buf = malloc_or_die(rc->max_msg);
        rc->recv_buffers.push_back(buf);
-        try_recv_rdma_wr(cl, buf);
+        ibv_sge sge = {
+            .addr = (uintptr_t)buf,
+            .length = (uint32_t)rc->max_msg,
+            .lkey = rc->ctx->mr->lkey,
+        };
+        try_recv_rdma_wr(cl, &sge, 1);
    }
    return true;
 }
@@ -477,7 +476,6 @@ void osd_messenger_t::handle_rdma_events()
                continue;
            }
            osd_client_t *cl = cl_it->second;
-            auto rc = cl->rdma_conn;
            if (wc[i].status != IBV_WC_SUCCESS)
            {
                fprintf(stderr, "RDMA work request failed for client %d", client_id);
@@ -491,59 +489,44 @@ void osd_messenger_t::handle_rdma_events()
            }
            if (!is_send)
            {
-                rc->cur_recv--;
-                if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf], wc[i].byte_len))
+                cl->rdma_conn->cur_recv--;
+                if (!handle_read_buffer(cl, cl->rdma_conn->recv_buffers[0], wc[i].byte_len))
                {
                    // handle_read_buffer may stop the client
                    continue;
                }
-                try_recv_rdma_wr(cl, rc->recv_buffers[rc->next_recv_buf]);
-                rc->next_recv_buf = (rc->next_recv_buf+1) % rc->recv_buffers.size();
+                free(cl->rdma_conn->recv_buffers[0]);
+                cl->rdma_conn->recv_buffers.erase(cl->rdma_conn->recv_buffers.begin(), cl->rdma_conn->recv_buffers.begin()+1);
+                try_recv_rdma(cl);
            }
            else
            {
-                rc->cur_send--;
-                uint64_t sent_size = rc->send_sizes.at(0);
-                rc->send_sizes.erase(rc->send_sizes.begin(), rc->send_sizes.begin()+1);
-                int send_pos = 0, send_buf_pos = 0;
-                while (sent_size > 0)
+                cl->rdma_conn->cur_send--;
+                if (!cl->rdma_conn->cur_send)
                {
-                    if (sent_size >= cl->send_list.at(send_pos).iov_len)
+                    // Wait for the whole batch
+                    for (int i = 0; i < cl->rdma_conn->send_pos; i++)
                    {
-                        sent_size -= cl->send_list[send_pos].iov_len;
-                        send_pos++;
+                        if (cl->outbox[i].flags & MSGR_SENDP_FREE)
+                        {
+                            // Reply fully sent
+                            delete cl->outbox[i].op;
+                        }
                    }
-                    else
+                    if (cl->rdma_conn->send_pos > 0)
                    {
-                        send_buf_pos = sent_size;
-                        sent_size = 0;
+                        cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+cl->rdma_conn->send_pos);
+                        cl->outbox.erase(cl->outbox.begin(), cl->outbox.begin()+cl->rdma_conn->send_pos);
+                        cl->rdma_conn->send_pos = 0;
                    }
-                }
-                assert(rc->send_pos >= send_pos);
-                if (rc->send_pos == send_pos)
-                {
-                    rc->send_buf_pos -= send_buf_pos;
-                }
-                rc->send_pos -= send_pos;
-                for (int i = 0; i < send_pos; i++)
-                {
-                    if (cl->outbox[i].flags & MSGR_SENDP_FREE)
+                    if (cl->rdma_conn->send_buf_pos > 0)
                    {
-                        // Reply fully sent
-                        delete cl->outbox[i].op;
+                        cl->send_list[0].iov_base = (uint8_t*)cl->send_list[0].iov_base + cl->rdma_conn->send_buf_pos;
+                        cl->send_list[0].iov_len -= cl->rdma_conn->send_buf_pos;
+                        cl->rdma_conn->send_buf_pos = 0;
                    }
+                    try_send_rdma(cl);
                }
-                if (send_pos > 0)
-                {
-                    cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+send_pos);
-                    cl->outbox.erase(cl->outbox.begin(), cl->outbox.begin()+send_pos);
-                }
-                if (send_buf_pos > 0)
-                {
-                    cl->send_list[0].iov_base = (uint8_t*)cl->send_list[0].iov_base + send_buf_pos;
-                    cl->send_list[0].iov_len -= send_buf_pos;
-                }
-                try_send_rdma(cl);
            }
        }
    } while (event_count > 0);
--- a/src/msgr_rdma.h
+++ b/src/msgr_rdma.h
@@ -49,9 +49,8 @@ struct msgr_rdma_connection_t
    uint64_t max_msg = 0;

    int send_pos = 0, send_buf_pos = 0;
-    int next_recv_buf = 0;
+    int recv_pos = 0, recv_buf_pos = 0;
    std::vector<void*> recv_buffers;
-    std::vector<uint64_t> send_sizes;

    ~msgr_rdma_connection_t();
    static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge, uint32_t max_msg);
--- a/src/osd.cpp
+++ b/src/osd.cpp
@@ -178,16 +178,6 @@ void osd_t::parse_config(const json11::Json & config, bool allow_disk_params)
    inode_vanish_time = config["inode_vanish_time"].uint64_value();
    if (!inode_vanish_time)
        inode_vanish_time = 60;
-    global_scrub_interval = config["scrub_interval"].uint64_value();
-    if (!global_scrub_interval)
-        global_scrub_interval = 30*86400;
-    scrub_queue_depth = config["scrub_queue_depth"].uint64_value();
-    if (scrub_queue_depth < 1 || scrub_queue_depth > MAX_RECOVERY_QUEUE)
-        scrub_queue_depth = 1;
-    scrub_sleep_ms = config["scrub_sleep"].uint64_value();
-    scrub_list_limit = config["scrub_list_limit"].uint64_value();
-    if (!scrub_list_limit)
-        scrub_list_limit = 1000;
 }

 void osd_t::bind_socket()
@@ -272,8 +262,7 @@ void osd_t::exec_op(osd_op_t *cur_op)
            cur_op->req.hdr.opcode == OSD_OP_DELETE) &&
            (cur_op->req.rw.len > OSD_RW_MAX ||
            cur_op->req.rw.len % bs_bitmap_granularity ||
-            cur_op->req.rw.offset % bs_bitmap_granularity)) ||
-        cur_op->req.hdr.opcode == OSD_OP_SCRUB && cur_op->peer_fd != -1)
+            cur_op->req.rw.offset % bs_bitmap_granularity)))
    {
        // Bad command
        finish_op(cur_op, -EINVAL);
@@ -290,7 +279,6 @@ void osd_t::exec_op(osd_op_t *cur_op)
        cur_op->req.hdr.opcode != OSD_OP_SEC_LIST &&
        cur_op->req.hdr.opcode != OSD_OP_READ &&
        cur_op->req.hdr.opcode != OSD_OP_SEC_READ_BMP &&
-        cur_op->req.hdr.opcode != OSD_OP_SCRUB &&
        cur_op->req.hdr.opcode != OSD_OP_SHOW_CONFIG)
    {
        // Readonly mode
@@ -321,10 +309,6 @@ void osd_t::exec_op(osd_op_t *cur_op)
    {
        continue_primary_del(cur_op);
    }
-    else if (cur_op->req.hdr.opcode == OSD_OP_SCRUB)
-    {
-        continue_primary_scrub(cur_op);
-    }
    else
    {
        exec_secondary(cur_op);
@@ -389,10 +373,6 @@ void osd_t::print_stats()
            recovery_stat_bytes[1][i] = recovery_stat_bytes[0][i];
        }
    }
-    if (corrupted_objects > 0)
-    {
-        printf("[OSD %lu] %lu object(s) corrupted\n", osd_num, corrupted_objects);
-    }
    if (incomplete_objects > 0)
    {
        printf("[OSD %lu] %lu object(s) incomplete\n", osd_num, incomplete_objects);
@@ -460,11 +440,10 @@ void osd_t::print_slow()
                else if (op->req.hdr.opcode == OSD_OP_SEC_LIST)
                {
                    bufprintf(
-                        " oid=%lx/%lx-%lx/%lx pg=%u/%u, stripe=%lu, limit=%u",
-                        op->req.sec_list.min_inode, op->req.sec_list.min_stripe,
-                        op->req.sec_list.max_inode, op->req.sec_list.max_stripe,
+                        " inode=%lx-%lx pg=%u/%u, stripe=%lu",
+                        op->req.sec_list.min_inode, op->req.sec_list.max_inode,
                        op->req.sec_list.list_pg, op->req.sec_list.pg_count,
-                        op->req.sec_list.pg_stripe_size, op->req.sec_list.stable_limit
+                        op->req.sec_list.pg_stripe_size
                    );
                }
                else if (op->req.hdr.opcode == OSD_OP_READ || op->req.hdr.opcode == OSD_OP_WRITE ||
--- a/src/osd.h
+++ b/src/osd.h
@@ -28,7 +28,6 @@
 #define OSD_PEERING_PGS 0x04
 #define OSD_FLUSHING_PGS 0x08
 #define OSD_RECOVERING 0x10
-#define OSD_SCRUBBING 0x20

 #define MAX_AUTOSYNC_INTERVAL 3600
 #define DEFAULT_AUTOSYNC_INTERVAL 5
@@ -114,10 +113,6 @@ class osd_t
    int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
    int inode_vanish_time = 60;
    int log_level = 0;
-    uint64_t global_scrub_interval = 30*86400;
-    uint64_t scrub_queue_depth = 1;
-    uint64_t scrub_sleep_ms = 0;
-    uint32_t scrub_list_limit = 1000;

    // cluster state

@@ -139,24 +134,15 @@ class osd_t
    std::set<pool_pg_num_t> dirty_pgs;
    std::set<osd_num_t> dirty_osds;
    int copies_to_delete_after_sync_count = 0;
-    uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0, corrupted_objects = 0;
+    uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0;
    int peering_state = 0;
    std::map<object_id, osd_recovery_op_t> recovery_ops;
-    std::map<object_id, osd_op_t*> scrub_ops;
    bool recovery_last_degraded = true;
    pool_pg_num_t recovery_last_pg;
    object_id recovery_last_oid;
    int recovery_pg_done = 0, recovery_done = 0;
    osd_op_t *autosync_op = NULL;

-    // Scrubbing
-    uint64_t scrub_nearest_ts = 0;
-    int scrub_timer_id = -1;
-    pool_pg_num_t scrub_last_pg;
-    osd_op_t *scrub_list_op;
-    pg_list_result_t scrub_cur_list = {};
-    uint64_t scrub_list_pos = 0;
-
    // Unstable writes
    uint64_t unstable_write_count = 0;
    std::map<osd_object_id_t, uint64_t> unstable_writes;
@@ -234,13 +220,6 @@ class osd_t
    bool continue_recovery();
    pg_osd_set_state_t* change_osd_set(pg_osd_set_state_t *st, pg_t *pg);

-    // scrub
-    void scrub_list(pool_pg_num_t pg_id, osd_num_t role_osd, object_id min_oid);
-    bool pick_next_scrub(object_id & next_oid);
-    void submit_scrub_op(object_id oid);
-    bool continue_scrub();
-    void schedule_scrub(pg_t & pg);
-
    // op execution
    void exec_op(osd_op_t *cur_op);
    void finish_op(osd_op_t *cur_op, int retval);
@@ -255,15 +234,13 @@ class osd_t
    void autosync();
    bool prepare_primary_rw(osd_op_t *cur_op);
    void continue_primary_read(osd_op_t *cur_op);
-    void continue_primary_scrub(osd_op_t *cur_op);
    void continue_primary_write(osd_op_t *cur_op);
    void cancel_primary_write(osd_op_t *cur_op);
    void continue_primary_sync(osd_op_t *cur_op);
    void continue_primary_del(osd_op_t *cur_op);
    bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
-    void remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t &pg, bool report = true);
-    pg_osd_set_state_t *mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state, osd_rmw_stripe_t *stripes, bool ref);
-    void deref_object_state(pg_t & pg, pg_osd_set_state_t **object_state, bool deref);
+    void remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg);
+    void free_object_state(pg_t & pg, pg_osd_set_state_t **object_state);
    bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
    void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);
    void handle_primary_bs_subop(osd_op_t *subop);
@@ -278,11 +255,10 @@ class osd_t
    int submit_primary_sync_subops(osd_op_t *cur_op);
    void submit_primary_stab_subops(osd_op_t *cur_op);

-    uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, pg_osd_set_state_t **object_state);
+    uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state);

    void continue_chained_read(osd_op_t *cur_op);
    int submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op);
-    void check_corrupted_chained(pg_t & pg, osd_op_t *cur_op);
    void send_chained_read_results(pg_t & pg, osd_op_t *cur_op);
    std::vector<osd_chain_read_t> collect_chained_read_requests(osd_op_t *cur_op);
    int collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vector<bitmap_request_t> & bitmap_requests);
--- a/src/osd_cluster.cpp
+++ b/src/osd_cluster.cpp
@@ -336,8 +336,6 @@ void osd_t::report_statistics()
        pg_stats["misplaced_count"] = pg.misplaced_objects.size();
        pg_stats["degraded_count"] = pg.degraded_objects.size();
        pg_stats["incomplete_count"] = pg.incomplete_objects.size();
-        if (pg.corrupted_count)
-            pg_stats["corrupted_count"] = pg.corrupted_count;
        pg_stats["write_osd_set"] = pg.cur_set;
        txn.push_back(json11::Json::object {
            { "request_put", json11::Json::object {
@@ -685,23 +683,31 @@ void osd_t::apply_pg_config()
                auto vec_all_peers = std::vector<osd_num_t>(all_peers.begin(), all_peers.end());
                if (currently_taken)
                {
-                    if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING | PG_REPEERING))
+                    if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING | PG_REPEERING | PG_PEERED))
                    {
                        if (pg_it->second.target_set == pg_cfg.target_set &&
                            pg_it->second.target_history == pg_cfg.target_history &&
                            pg_it->second.all_peers == vec_all_peers)
                        {
                            // No change in osd_set and history
-                            if (pg_it->second.scrub_ts != pg_cfg.scrub_ts)
-                            {
-                                pg_it->second.scrub_ts = pg_cfg.scrub_ts;
-                                peering_state = peering_state | OSD_SCRUBBING;
-                                ringloop->wakeup();
-                            }
                            continue;
                        }
                        else
                        {
+                            printf(
+                                "Repeer %u/%u because of history: %s vs %s\n",
+                                pool_id, pg_num,
+                                json11::Json(json11::Json::object {
+                                    { "target_set", pg_cfg.target_set },
+                                    { "osd_sets", pg_cfg.target_history },
+                                    { "all_peers", vec_all_peers },
+                                }).dump().c_str(),
+                                json11::Json(json11::Json::object {
+                                    { "target_set", pg_it->second.target_set },
+                                    { "osd_sets", pg_it->second.target_history },
+                                    { "all_peers", pg_it->second.all_peers },
+                                }).dump().c_str()
+                            );
                            // Stop PG, reapply change after stopping
                            stop_pg(pg_it->second);
                            all_applied = false;
@@ -749,7 +755,6 @@ void osd_t::apply_pg_config()
                    .reported_epoch = pg_cfg.epoch,
                    .target_history = pg_cfg.target_history,
                    .all_peers = vec_all_peers,
-                    .scrub_ts = pg_cfg.scrub_ts,
                    .target_set = pg_cfg.target_set,
                };
                if (pg.scheme == POOL_SCHEME_EC)
@@ -880,8 +885,7 @@ void osd_t::report_pg_states()
                    { "all_peers", pg.all_peers },
                    { "osd_sets", pg.target_history },
                };
-                if (pg.scrub_ts)
-                    history_value["scrub_ts"] = pg.scrub_ts;
+                printf("PG %u/%u HISTORY -> %s\n", pg.pool_id, pg.pg_num, json11::Json(history_value).dump().c_str());
                checks.push_back(json11::Json::object {
                    { "target", "MOD" },
                    { "key", history_key },
@@ -974,6 +978,13 @@ void osd_t::report_pg_states()
                        }
                        this->pgs.erase(pg_it);
                    }
+                    else if (pg_it->second.state & PG_PEERED)
+                    {
+                        // Activate PG after PG PEERED state is reported along with history
+                        // (if the state wasn't changed again)
+                        pg_it->second.state = pg_it->second.state & ~PG_PEERED | PG_ACTIVE;
+                        report_pg_state(pg_it->second);
+                    }
                }
            }
            // Push other PG state updates, if any
--- a/src/osd_flush.cpp
+++ b/src/osd_flush.cpp
@@ -182,9 +182,7 @@ bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
                op->bs_op = NULL;
                delete op;
            },
-            {
-                .len = (uint32_t)count,
-            },
+            .len = (uint32_t)count,
            .buf = op->buf,
        });
        bs->enqueue_op(op->bs_op);
@@ -302,17 +300,19 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
        if (osd_op->reply.hdr.retval < 0)
        {
            // Error recovering object
-            // EPIPE is totally harmless (peer is gone), others like EIO/EDOM may be not
-            printf(
-                "Recovery operation failed with object %lx:%lx (PG %u/%u): error %ld\n",
-                op->oid.inode, op->oid.stripe, INODE_POOL(op->oid.inode),
-                map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size),
-                osd_op->reply.hdr.retval
-            );
-        }
-        else if (log_level > 2)
-        {
-            printf("Recovery operation done for %lx:%lx\n", op->oid.inode, op->oid.stripe);
+            if (osd_op->reply.hdr.retval == -EPIPE)
+            {
+                // PG is stopped or one of the OSDs is gone, error is harmless
+                printf(
+                    "Recovery operation failed with object %lx:%lx (PG %u/%u)\n",
+                    op->oid.inode, op->oid.stripe, INODE_POOL(op->oid.inode),
+                    map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size)
+                );
+            }
+            else
+            {
+                throw std::runtime_error("Failed to recover an object");
+            }
        }
        // CAREFUL! op = &recovery_ops[op->oid]. Don't access op->* after recovery_ops.erase()
        op->osd_op = NULL;
--- a/src/osd_ops.h
+++ b/src/osd_ops.h
@@ -29,8 +29,7 @@
 #define OSD_OP_DELETE               14
 #define OSD_OP_PING                 15
 #define OSD_OP_SEC_READ_BMP         16
-#define OSD_OP_SCRUB                17
-#define OSD_OP_MAX                  17
+#define OSD_OP_MAX                  16
 #define OSD_RW_MAX                  64*1024*1024
 #define OSD_PROTOCOL_VERSION        1

@@ -174,11 +173,6 @@ struct __attribute__((__packed__)) osd_op_sec_list_t
    uint64_t pg_stripe_size;
    // inode range (used to select pools)
    uint64_t min_inode, max_inode;
-    // min/max oid stripe, added after inodes for backwards compatibility
-    // also for backwards compatibility, max_stripe=UINT64_MAX means 0 and 0 means UINT64_MAX O_o
-    uint64_t min_stripe, max_stripe;
-    // max stable object count
-    uint32_t stable_limit;
 };

 struct __attribute__((__packed__)) osd_reply_sec_list_t
--- a/src/osd_peering.cpp
+++ b/src/osd_peering.cpp
@@ -24,7 +24,6 @@ void osd_t::handle_peers()
                if (!p.second.peering_state->list_ops.size())
                {
                    p.second.calc_object_states(log_level);
-                    schedule_scrub(p.second);
                    report_pg_state(p.second);
                    incomplete_objects += p.second.incomplete_objects.size();
                    misplaced_objects += p.second.misplaced_objects.size();
@@ -51,6 +50,10 @@ void osd_t::handle_peers()
                    still = true;
                }
            }
+            else if (p.second.state & PG_PEERED)
+            {
+                still = true;
+            }
        }
        if (!still)
        {
@@ -71,6 +74,10 @@ void osd_t::handle_peers()
                }
                still = true;
            }
+            else if (p.second.state & PG_PEERED)
+            {
+                still = true;
+            }
        }
        if (!still)
        {
@@ -84,13 +91,6 @@ void osd_t::handle_peers()
            peering_state = peering_state & ~OSD_RECOVERING;
        }
    }
-    if (peering_state & OSD_SCRUBBING)
-    {
-        if (!continue_scrub())
-        {
-            peering_state = peering_state & ~OSD_SCRUBBING;
-        }
-    }
 }

 void osd_t::repeer_pgs(osd_num_t peer_osd)
@@ -100,7 +100,7 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
    {
        auto & pg = p.second;
        bool repeer = false;
-        if (pg.state & (PG_PEERING | PG_ACTIVE | PG_INCOMPLETE))
+        if (pg.state & (PG_PEERING | PG_PEERED | PG_ACTIVE | PG_INCOMPLETE))
        {
            for (osd_num_t pg_osd: pg.all_peers)
            {
@@ -136,11 +136,9 @@ void osd_t::reset_pg(pg_t & pg)
    pg.state_dict.clear();
    copies_to_delete_after_sync_count -= pg.copies_to_delete_after_sync.size();
    pg.copies_to_delete_after_sync.clear();
-    corrupted_objects -= pg.corrupted_count;
    incomplete_objects -= pg.incomplete_objects.size();
    misplaced_objects -= pg.misplaced_objects.size();
    degraded_objects -= pg.degraded_objects.size();
-    pg.corrupted_count = 0;
    pg.incomplete_objects.clear();
    pg.misplaced_objects.clear();
    pg.degraded_objects.clear();
@@ -216,7 +214,7 @@ void osd_t::start_pg_peering(pg_t & pg)
            pg.cur_loc_set.push_back({
                .role = (uint64_t)role,
                .osd_num = pg.cur_set[role],
-                .loc_bad = 0,
+                .outdated = false,
            });
        }
    }
@@ -329,12 +327,11 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
        clock_gettime(CLOCK_REALTIME, &op->tv_begin);
        op->bs_op = new blockstore_op_t();
        op->bs_op->opcode = BS_OP_LIST;
-        op->bs_op->pg_alignment = st_cli.pool_config[ps->pool_id].pg_stripe_size;
-        op->bs_op->min_oid.inode = ((uint64_t)ps->pool_id << (64 - POOL_ID_BITS));
-        op->bs_op->max_oid.inode = ((uint64_t)(ps->pool_id+1) << (64 - POOL_ID_BITS)) - 1;
-        op->bs_op->max_oid.stripe = UINT64_MAX;
-        op->bs_op->pg_count = pg_counts[ps->pool_id];
-        op->bs_op->pg_number = ps->pg_num-1;
+        op->bs_op->oid.stripe = st_cli.pool_config[ps->pool_id].pg_stripe_size;
+        op->bs_op->oid.inode = ((uint64_t)ps->pool_id << (64 - POOL_ID_BITS));
+        op->bs_op->version = ((uint64_t)(ps->pool_id+1) << (64 - POOL_ID_BITS)) - 1;
+        op->bs_op->len = pg_counts[ps->pool_id];
+        op->bs_op->offset = ps->pg_num-1;
        op->bs_op->callback = [this, ps, op, role_osd](blockstore_op_t *bs_op)
        {
            if (op->bs_op->retval < 0)
--- a/src/osd_peering_pg.cpp
+++ b/src/osd_peering_pg.cpp
@@ -88,9 +88,13 @@ void pg_obj_state_check_t::walk()
    {
        // Activate as degraded
        // Current OSD set will be added into target_history on first write
-        pg->state |= PG_DEGRADED;
+        pg->state |= PG_DEGRADED | PG_PEERED;
+    }
+    else
+    {
+        // Just activate
+        pg->state |= PG_ACTIVE;
    }
-    pg->state |= PG_ACTIVE;
    if (pg->state == PG_ACTIVE && pg->cur_peers.size() < pg->all_peers.size())
    {
        pg->state |= PG_LEFT_ON_DEAD;
@@ -280,7 +284,7 @@ void pg_obj_state_check_t::finish_object()
            osd_set.push_back((pg_obj_loc_t){
                .role = (list[i].oid.stripe & STRIPE_MASK),
                .osd_num = list[i].osd_num,
-                .loc_bad = 0,
+                .outdated = false,
            });
        }
    }
@@ -302,7 +306,7 @@ void pg_obj_state_check_t::finish_object()
                osd_set.push_back((pg_obj_loc_t){
                    .role = (list[i].oid.stripe & STRIPE_MASK),
                    .osd_num = list[i].osd_num,
-                    .loc_bad = LOC_OUTDATED,
+                    .outdated = true,
                });
                if (!(state & (OBJ_INCOMPLETE | OBJ_DEGRADED)))
                {
@@ -322,71 +326,65 @@ void pg_obj_state_check_t::finish_object()
    }
    else
    {
-        pg->add_object_to_state(oid, state, osd_set);
-    }
-}
-
-pg_osd_set_state_t* pg_t::add_object_to_state(const object_id oid, const uint64_t state, const pg_osd_set_t & osd_set)
-{
-    auto it = state_dict.find(osd_set);
-    if (it == state_dict.end())
-    {
-        std::vector<osd_num_t> read_target;
-        if (scheme == POOL_SCHEME_REPLICATED)
+        auto it = pg->state_dict.find(osd_set);
+        if (it == pg->state_dict.end())
        {
-            for (auto & o: osd_set)
+            std::vector<uint64_t> read_target;
+            if (replicated)
            {
-                if (!o.loc_bad)
+                for (auto & o: osd_set)
                {
-                    read_target.push_back(o.osd_num);
+                    if (!o.outdated)
+                    {
+                        read_target.push_back(o.osd_num);
+                    }
+                }
+                while (read_target.size() < pg->pg_size)
+                {
+                    // FIXME: This is because we then use .data() and assume it's at least <pg_size> long
+                    read_target.push_back(0);
                }
            }
-            while (read_target.size() < pg_size)
+            else
            {
-                // FIXME: This is because we then use .data() and assume it's at least <pg_size> long
-                read_target.push_back(0);
+                read_target.resize(pg->pg_size);
+                for (int i = 0; i < pg->pg_size; i++)
+                {
+                    read_target[i] = 0;
+                }
+                for (auto & o: osd_set)
+                {
+                    if (!o.outdated)
+                    {
+                        read_target[o.role] = o.osd_num;
+                    }
+                }
            }
+            pg->state_dict[osd_set] = {
+                .read_target = read_target,
+                .osd_set = osd_set,
+                .state = state,
+                .object_count = 1,
+            };
+            it = pg->state_dict.find(osd_set);
        }
        else
        {
-            read_target.resize(pg_size);
-            for (int i = 0; i < pg_size; i++)
-            {
-                read_target[i] = 0;
-            }
-            for (auto & o: osd_set)
-            {
-                if (!o.loc_bad)
-                {
-                    read_target[o.role] = o.osd_num;
-                }
-            }
+            it->second.object_count++;
+        }
+        if (state & OBJ_INCOMPLETE)
+        {
+            pg->incomplete_objects[oid] = &it->second;
+        }
+        else if (state & OBJ_DEGRADED)
+        {
+            pg->degraded_objects[oid] = &it->second;
+        }
+        else
+        {
+            pg->misplaced_objects[oid] = &it->second;
        }
-        state_dict[osd_set] = {
-            .read_target = read_target,
-            .osd_set = osd_set,
-            .state = state,
-            .object_count = 1,
-        };
-        it = state_dict.find(osd_set);
    }
-    else
-    {
-        it->second.object_count++;
-    }
-    if (state & OBJ_INCOMPLETE)
-    {
-        incomplete_objects[oid] = &it->second;
-    }
-    else if (state & OBJ_DEGRADED)
-    {
-        degraded_objects[oid] = &it->second;
-    }
-    else
-    {
-        misplaced_objects[oid] = &it->second;
-    }
-    return &it->second;
 }

 // FIXME: Write at least some tests for this function
@@ -452,8 +450,7 @@ void pg_t::calc_object_states(int log_level)
                osd_set_desc += (osd_set_desc == "" ? "" : ", ")+
                    std::to_string(loc.osd_num)+
                    (st.replicated ? "" : "("+std::to_string(loc.role)+")")+
-                    (loc.loc_bad & LOC_OUTDATED ? "(old)" : "")+
-                    (loc.loc_bad & LOC_CORRUPTED ? "(corrupted)" : "");
+                    (loc.outdated ? "(old)" : "");
            }
            printf("[PG %u/%u] %lu objects on OSD set %s\n", pool_id, pg_num, stp.second.object_count, osd_set_desc.c_str());
        }
@@ -463,23 +460,22 @@ void pg_t::calc_object_states(int log_level)
 void pg_t::print_state()
 {
    printf(
-        "[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
+        "[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
        (state & PG_STARTING) ? "starting" : "",
        (state & PG_OFFLINE) ? "offline" : "",
        (state & PG_PEERING) ? "peering" : "",
+        (state & PG_PEERED) ? "peered" : "",
        (state & PG_INCOMPLETE) ? "incomplete" : "",
        (state & PG_ACTIVE) ? "active" : "",
        (state & PG_REPEERING) ? "repeering" : "",
        (state & PG_STOPPING) ? "stopping" : "",
        (state & PG_DEGRADED) ? " + degraded" : "",
-        (state & PG_HAS_CORRUPTED) ? " + has_corrupted" : "",
        (state & PG_HAS_INCOMPLETE) ? " + has_incomplete" : "",
        (state & PG_HAS_DEGRADED) ? " + has_degraded" : "",
        (state & PG_HAS_MISPLACED) ? " + has_misplaced" : "",
        (state & PG_HAS_UNCLEAN) ? " + has_unclean" : "",
        (state & PG_HAS_INVALID) ? " + has_invalid" : "",
        (state & PG_LEFT_ON_DEAD) ? " + left_on_dead" : "",
-        (state & PG_SCRUBBING) ? " + scrubbing" : "",
        total_count
    );
 }
--- a/src/osd_peering_pg.h
+++ b/src/osd_peering_pg.h
@@ -13,14 +13,11 @@

 #define PG_EPOCH_BITS 48

-#define LOC_OUTDATED 1
-#define LOC_CORRUPTED 2
-
 struct pg_obj_loc_t
 {
    uint64_t role;
    osd_num_t osd_num;
-    uint32_t loc_bad; // LOC_OUTDATED / LOC_CORRUPTED
+    bool outdated;
 };

 typedef std::vector<pg_obj_loc_t> pg_osd_set_t;
@@ -33,7 +30,6 @@ struct pg_osd_set_state_t
    pg_osd_set_t osd_set;
    uint64_t state = 0;
    uint64_t object_count = 0;
-    uint64_t ref_count = 0;
 };

 struct pg_list_result_t
@@ -95,8 +91,6 @@ struct pg_t
    // target history and all potential peers
    std::vector<std::vector<osd_num_t>> target_history;
    std::vector<osd_num_t> all_peers;
-    // last scrub time
-    uint64_t scrub_ts = 0;
    bool history_changed = false;
    // peer list from the last peering event
    std::vector<osd_num_t> cur_peers;
@@ -112,7 +106,6 @@ struct pg_t
    // it may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario
    // which is up to ~192 MB per 1 TB in the worst case scenario
    std::map<pg_osd_set_t, pg_osd_set_state_t> state_dict;
-    uint64_t corrupted_count;
    btree::btree_map<object_id, pg_osd_set_state_t*> incomplete_objects, misplaced_objects, degraded_objects;
    std::map<obj_piece_id_t, flush_action_t> flush_actions;
    std::vector<obj_ver_osd_t> copies_to_delete_after_sync;
@@ -123,16 +116,15 @@ struct pg_t
    int inflight = 0; // including write_queue
    std::multimap<object_id, osd_op_t*> write_queue;

-    pg_osd_set_state_t* add_object_to_state(const object_id oid, const uint64_t state, const pg_osd_set_t & osd_set);
    void calc_object_states(int log_level);
    void print_state();
 };

 inline bool operator < (const pg_obj_loc_t &a, const pg_obj_loc_t &b)
 {
-    return a.loc_bad < b.loc_bad ||
-        a.loc_bad == b.loc_bad && a.role < b.role ||
-        a.loc_bad == b.loc_bad && a.role == b.role && a.osd_num < b.osd_num;
+    return a.outdated < b.outdated ||
+        a.outdated == b.outdated && a.role < b.role ||
+        a.outdated == b.outdated && a.role == b.role && a.osd_num < b.osd_num;
 }

 inline bool operator == (const obj_piece_id_t & a, const obj_piece_id_t & b)
--- a/src/osd_peering_pg_test.cpp
+++ b/src/osd_peering_pg_test.cpp
@@ -54,6 +54,5 @@ int main(int argc, char *argv[])
    {
        printf("dev: state=%lx\n", it.second.state);
    }
-    delete pg.peering_state;
    return 0;
 }
--- a/src/osd_primary.cpp
+++ b/src/osd_primary.cpp
@@ -52,9 +52,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
        finish_op(cur_op, -EINVAL);
        return false;
    }
-    // Scrub is similar to r/w, so it's also handled here
-    int stripe_count = (pool_cfg.scheme == POOL_SCHEME_REPLICATED
-        && cur_op->req.hdr.opcode != OSD_OP_SCRUB ? 1 : pg_it->second.pg_size);
+    int stripe_count = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg_it->second.pg_size);
    int chain_size = 0;
    if (cur_op->req.hdr.opcode == OSD_OP_READ && cur_op->req.rw.meta_revision > 0)
    {
@@ -92,8 +90,6 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
        chain_size * (
            // - copy of the chain
            sizeof(inode_t) +
-            // - object states for every chain item
-            sizeof(void*) +
            // - bitmap buffers for chained read
            stripe_count * clean_entry_bitmap_size +
            // - 'missing' flags for chained reads
@@ -121,8 +117,6 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
    {
        op_data->read_chain = (inode_t*)data_buf;
        data_buf = (uint8_t*)data_buf + sizeof(inode_t) * chain_size;
-        op_data->chain_states = (pg_osd_set_state_t**)data_buf;
-        data_buf = (uint8_t*)data_buf + sizeof(pg_osd_set_state_t*) * chain_size;
        op_data->snapshot_bitmaps = data_buf;
        data_buf = (uint8_t*)data_buf + chain_size * stripe_count * clean_entry_bitmap_size;
        op_data->missing_flags = (uint8_t*)data_buf;
@@ -137,7 +131,6 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
            inode_it->second.parent_id != cur_op->req.rw.inode)
        {
            op_data->read_chain[chain_num++] = inode_it->second.parent_id;
-            op_data->chain_states[chain_num++] = NULL;
            inode_it = st_cli.inode_config.find(inode_it->second.parent_id);
        }
    }
@@ -145,12 +138,12 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
    return true;
 }

-uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, pg_osd_set_state_t **object_state)
+uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state)
 {
    if (!(pg.state & (PG_HAS_INCOMPLETE | PG_HAS_DEGRADED | PG_HAS_MISPLACED)))
    {
        *object_state = NULL;
-        return pg.cur_set.data();
+        return def;
    }
    auto st_it = pg.incomplete_objects.find(oid);
    if (st_it != pg.incomplete_objects.end())
@@ -171,7 +164,7 @@ uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, pg_osd_set_state_t
        return st_it->second->read_target.data();
    }
    *object_state = NULL;
-    return pg.cur_set.data();
+    return def;
 }

 void osd_t::continue_primary_read(osd_op_t *cur_op)
@@ -190,7 +183,6 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
        goto resume_1;
    else if (op_data->st == 2)
        goto resume_2;
-resume_0:
    cur_op->reply.rw.bitmap_len = 0;
    {
        auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
@@ -202,17 +194,15 @@ resume_0:
        // Determine version
        auto vo_it = pg.ver_override.find(op_data->oid);
        op_data->target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
-        // PG may have degraded or misplaced objects
-        op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
+        op_data->prev_set = pg.cur_set.data();
+        if (pg.state != PG_ACTIVE)
+        {
+            // PG may be degraded or have misplaced objects
+            op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
+        }
        if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED)
        {
            // Fast happy-path
-            if (op_data->scheme == POOL_SCHEME_REPLICATED &&
-                op_data->object_state && (op_data->object_state->state & OBJ_INCOMPLETE))
-            {
-                finish_op(cur_op, -EIO);
-                return;
-            }
            cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_data_size, 0);
            submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, op_data->prev_set, cur_op);
            op_data->st = 1;
@@ -238,14 +228,6 @@ resume_1:
 resume_2:
    if (op_data->errors > 0)
    {
-        if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
-        {
-            // I/O or checksum error
-            auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
-            // FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
-            op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false);
-            goto resume_0;
-        }
        finish_op(cur_op, op_data->errcode);
        return;
    }
@@ -284,144 +266,10 @@ resume_2:
    finish_op(cur_op, cur_op->req.rw.len);
 }

-pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state, osd_rmw_stripe_t *stripes, bool ref)
-{
-    pg_osd_set_state_t *object_state = NULL;
-    get_object_osd_set(pg, oid, &object_state);
-    if (prev_object_state != object_state)
-    {
-        // Object state changed in between by a parallel I/O operation, skip marking as failed
-        if (ref)
-        {
-            deref_object_state(pg, &prev_object_state, ref);
-            if (object_state)
-                object_state->ref_count++;
-        }
-        return object_state;
-    }
-    pg_osd_set_t corrupted_set;
-    if (object_state)
-    {
-        corrupted_set = object_state->osd_set;
-    }
-    else
-    {
-        for (int i = 0; i < pg.cur_set.size(); i++)
-        {
-            corrupted_set.push_back((pg_obj_loc_t){
-                .role = (pg.scheme == POOL_SCHEME_REPLICATED ? 0 : (uint64_t)i),
-                .osd_num = pg.cur_set[i],
-            });
-        }
-    }
-    // Mark object chunk(s) as corrupted
-    uint64_t has_roles = 0, n_roles = 0, n_copies = 0, n_corrupted = 0;
-    for (auto & chunk: corrupted_set)
-    {
-        bool corrupted = stripes[chunk.role].osd_num == chunk.osd_num && stripes[chunk.role].read_error;
-        if (corrupted && !(chunk.loc_bad & LOC_CORRUPTED))
-            n_corrupted++;
-        chunk.loc_bad = chunk.loc_bad | (corrupted ? LOC_CORRUPTED : 0);
-        if (!chunk.loc_bad)
-        {
-            if (pg.scheme == POOL_SCHEME_REPLICATED)
-                n_roles = 1;
-            else if (!(has_roles & (1 << chunk.role)))
-            {
-                n_roles++;
-                has_roles |= (1 << chunk.role);
-            }
-            n_copies++;
-        }
-    }
-    if (!n_corrupted)
-    {
-        // No chunks newly marked as corrupted - object is already marked or moved
-        return object_state;
-    }
-    int old_pg_state = pg.state;
-    if (object_state)
-    {
-        remove_object_from_state(oid, &object_state, pg, false);
-        deref_object_state(pg, &object_state, ref);
-    }
-    // Calculate object state
-    uint64_t obj_state = OBJ_CORRUPTED;
-    int pg_state_bits = PG_HAS_CORRUPTED;
-    this->corrupted_objects++;
-    pg.corrupted_count++;
-    if (log_level > 1)
-    {
-        printf("Marking object %lx:%lx corrupted: %lu chunks / %lu copies available, %lu corrupted\n",
-            oid.inode, oid.stripe, n_roles, n_copies, n_corrupted);
-    }
-    if (n_roles < pg.pg_data_size)
-    {
-        this->incomplete_objects++;
-        obj_state |= OBJ_INCOMPLETE;
-        pg_state_bits = PG_HAS_INCOMPLETE;
-    }
-    else if (n_roles < pg.pg_cursize)
-    {
-        this->degraded_objects++;
-        obj_state |= OBJ_DEGRADED;
-        pg_state_bits = PG_HAS_DEGRADED;
-    }
-    else
-    {
-        this->misplaced_objects++;
-        obj_state |= OBJ_MISPLACED;
-        pg_state_bits = PG_HAS_MISPLACED;
-    }
-    pg.state |= pg_state_bits;
-    if (pg.state != old_pg_state)
-    {
-        report_pg_state(pg);
-        if ((pg.state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED)) !=
-            (old_pg_state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED)))
-        {
-            peering_state = peering_state | OSD_RECOVERING;
-            if ((pg.state & PG_HAS_DEGRADED) != (old_pg_state & PG_HAS_DEGRADED))
-            {
-                // Restart recovery from degraded objects
-                recovery_last_degraded = true;
-                recovery_last_pg = {};
-                recovery_last_oid = {};
-            }
-            ringloop->wakeup();
-        }
-    }
-    // Insert object into the new state and retry
-    object_state = pg.add_object_to_state(oid, obj_state, corrupted_set);
-    if (ref)
-        object_state->ref_count++;
-    return object_state;
-}
-
 // Decrement pg_osd_set_state_t's object_count and change PG state accordingly
-void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t & pg, bool report)
+void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t & pg)
 {
-    if (!*object_state)
-    {
-        return;
-    }
-    pg_osd_set_state_t *recheck_state = NULL;
-    get_object_osd_set(pg, oid, &recheck_state);
-    if (recheck_state != *object_state)
-    {
-        recheck_state->ref_count++;
-        (*object_state)->ref_count--;
-        *object_state = recheck_state;
-        return;
-    }
-    (*object_state)->object_count--;
-    if ((*object_state)->state & OBJ_CORRUPTED)
-    {
-        this->corrupted_objects--;
-        pg.corrupted_count--;
-    }
-    bool changed = false;
-    if ((*object_state)->state & OBJ_INCOMPLETE)
+    if (object_state->state & OBJ_INCOMPLETE)
    {
        // Successful write means that object is not incomplete anymore
        this->incomplete_objects--;
@@ -429,52 +277,41 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **objec
        if (!pg.incomplete_objects.size())
        {
            pg.state = pg.state & ~PG_HAS_INCOMPLETE;
-            changed = true;
+            report_pg_state(pg);
        }
    }
-    else if ((*object_state)->state & OBJ_DEGRADED)
+    else if (object_state->state & OBJ_DEGRADED)
    {
        this->degraded_objects--;
        pg.degraded_objects.erase(oid);
        if (!pg.degraded_objects.size())
        {
            pg.state = pg.state & ~PG_HAS_DEGRADED;
-            changed = true;
+            report_pg_state(pg);
        }
    }
-    else if ((*object_state)->state & OBJ_MISPLACED)
+    else if (object_state->state & OBJ_MISPLACED)
    {
        this->misplaced_objects--;
        pg.misplaced_objects.erase(oid);
        if (!pg.misplaced_objects.size())
        {
            pg.state = pg.state & ~PG_HAS_MISPLACED;
-            changed = true;
+            report_pg_state(pg);
        }
    }
    else
    {
-        throw std::runtime_error("BUG: Invalid object state: "+std::to_string((*object_state)->state));
-    }
-    if (changed && report)
-    {
-        report_pg_state(pg);
+        throw std::runtime_error("BUG: Invalid object state: "+std::to_string(object_state->state));
    }
 }

-void osd_t::deref_object_state(pg_t & pg, pg_osd_set_state_t **object_state, bool deref)
+void osd_t::free_object_state(pg_t & pg, pg_osd_set_state_t **object_state)
 {
-    if (*object_state)
+    if (*object_state && !(--(*object_state)->object_count))
    {
-        if (deref)
-        {
-            (*object_state)->ref_count--;
-        }
-        if (!(*object_state)->object_count && !(*object_state)->ref_count)
-        {
-            pg.state_dict.erase((*object_state)->osd_set);
-            *object_state = NULL;
-        }
+        pg.state_dict.erase((*object_state)->osd_set);
+        *object_state = NULL;
    }
 }

@@ -504,28 +341,21 @@ void osd_t::continue_primary_del(osd_op_t *cur_op)
    }
 resume_1:
    // Determine which OSDs contain this object and delete it
-    op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
-    if (op_data->object_state)
-    {
-        op_data->object_state->ref_count++;
-    }
+    op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
    // Submit 1 read to determine the actual version number
    submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
-    op_data->prev_set = NULL;
 resume_2:
    op_data->st = 2;
    return;
 resume_3:
    if (op_data->errors > 0)
    {
-        deref_object_state(pg, &op_data->object_state, true);
        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
        return;
    }
    // Check CAS version
    if (cur_op->req.rw.version && op_data->fact_ver != (cur_op->req.rw.version-1))
    {
-        deref_object_state(pg, &op_data->object_state, true);
        cur_op->reply.hdr.retval = -EINTR;
        cur_op->reply.rw.version = op_data->fact_ver;
        goto continue_others;
@@ -541,7 +371,6 @@ resume_4:
 resume_5:
    if (op_data->errors > 0)
    {
-        deref_object_state(pg, &op_data->object_state, true);
        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
        return;
    }
@@ -554,8 +383,8 @@ resume_5:
    }
    else
    {
-        remove_object_from_state(op_data->oid, &op_data->object_state, pg);
-        deref_object_state(pg, &op_data->object_state, true);
+        remove_object_from_state(op_data->oid, op_data->object_state, pg);
+        free_object_state(pg, &op_data->object_state);
    }
    pg.total_count--;
    cur_op->reply.hdr.retval = 0;
--- a/src/osd_primary.h
+++ b/src/osd_primary.h
@@ -9,7 +9,6 @@
 #define SUBMIT_READ 0
 #define SUBMIT_RMW_READ 1
 #define SUBMIT_WRITE 2
-#define SUBMIT_SCRUB_READ 3

 struct unstable_osd_num_t
 {
@@ -51,7 +50,6 @@ struct osd_primary_op_data_t
            // for read_bitmaps
            void *snapshot_bitmaps;
            inode_t *read_chain;
-            pg_osd_set_state_t **chain_states;
            uint8_t *missing_flags;
            int chain_size;
            osd_chain_read_t *chain_reads;
--- a/src/osd_primary_chain.cpp
+++ b/src/osd_primary_chain.cpp
@@ -40,24 +40,10 @@ resume_3:
 resume_4:
    if (op_data->errors > 0)
    {
-        if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
-        {
-            // Handle corrupted reads and retry...
-            check_corrupted_chained(pg, cur_op);
-            free(cur_op->buf);
-            cur_op->buf = NULL;
-            free(op_data->chain_reads);
-            op_data->chain_reads = NULL;
-            // FIXME: We can in theory retry only specific parts instead of the whole operation
-            goto resume_1;
-        }
-        else
-        {
-            free(op_data->chain_reads);
-            op_data->chain_reads = NULL;
-            finish_op(cur_op, op_data->errcode);
-            return;
-        }
+        free(op_data->chain_reads);
+        op_data->chain_reads = NULL;
+        finish_op(cur_op, op_data->errcode);
+        return;
    }
    send_chained_read_results(pg, cur_op);
    finish_op(cur_op, cur_op->req.rw.len);
@@ -145,7 +131,8 @@ int osd_t::collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vector<bitm
        object_id cur_oid = { .inode = op_data->read_chain[chain_num], .stripe = op_data->oid.stripe };
        auto vo_it = pg.ver_override.find(cur_oid);
        uint64_t target_version = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
-        uint64_t* cur_set = get_object_osd_set(pg, cur_oid, &op_data->chain_states[chain_num]);
+        pg_osd_set_state_t *object_state;
+        uint64_t* cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
        if (pg.scheme == POOL_SCHEME_REPLICATED)
        {
            osd_num_t read_target = 0;
@@ -260,7 +247,6 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
                osd_op_t *subop = op_data->subops+subop_idx;
                subop->op_type = OSD_OP_OUT;
                // FIXME: Use the pre-allocated buffer
-                assert(!subop->buf);
                subop->buf = malloc_or_die(sizeof(obj_ver_id)*(i+1-prev));
                subop->req = (osd_any_op_t){
                    .sec_read_bmp = {
@@ -311,7 +297,7 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
                    // Fail it immediately
                    subop->peer_fd = -1;
                    subop->reply.hdr.retval = -EPIPE;
-                    ringloop->set_immediate([subop]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
+                    subop->callback(subop);
                }
                subop_idx++;
            }
@@ -389,8 +375,6 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
    op_data->chain_read_count = chain_reads.size();
    op_data->chain_reads = (osd_chain_read_t*)calloc_or_die(
        1, sizeof(osd_chain_read_t) * chain_reads.size()
-        // FIXME: Allocate only <chain_reads.size()> instead of <chain_size> stripes
-        // (but it's slightly harder to handle in send_chained_read_results())
        + sizeof(osd_rmw_stripe_t) * stripe_count * op_data->chain_size
    );
    osd_rmw_stripe_t *chain_stripes = (osd_rmw_stripe_t*)(
@@ -419,7 +403,8 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
        uint64_t *cur_set = pg.cur_set.data();
        if (pg.state != PG_ACTIVE)
        {
-            cur_set = get_object_osd_set(pg, cur_oid, &op_data->chain_states[chain_reads[cri].chain_pos]);
+            pg_osd_set_state_t *object_state;
+            cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
            if (op_data->scheme != POOL_SCHEME_REPLICATED)
            {
                if (extend_missing_stripes(stripes, cur_set, pg.pg_data_size, pg.pg_size) < 0)
@@ -431,17 +416,6 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
                }
                op_data->degraded = 1;
            }
-            else
-            {
-                auto cur_state = op_data->chain_states[chain_reads[cri].chain_pos];
-                if (cur_state && (cur_state->state & OBJ_INCOMPLETE))
-                {
-                    free(op_data->chain_reads);
-                    op_data->chain_reads = NULL;
-                    finish_op(cur_op, -EIO);
-                    return -1;
-                }
-            }
        }
        if (op_data->scheme == POOL_SCHEME_REPLICATED)
        {
@@ -459,7 +433,6 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
            }
        }
    }
-    assert(!cur_op->buf);
    cur_op->buf = memalign_or_die(MEM_ALIGNMENT, read_buffer_size);
    void *cur_buf = cur_op->buf;
    for (int cri = 0; cri < chain_reads.size(); cri++)
@@ -495,8 +468,12 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
        object_id cur_oid = { .inode = chain_reads[cri].inode, .stripe = op_data->oid.stripe };
        auto vo_it = pg.ver_override.find(cur_oid);
        uint64_t target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
-        auto cur_state = op_data->chain_states[chain_reads[cri].chain_pos];
-        uint64_t *cur_set = (pg.state != PG_ACTIVE && cur_state ? cur_state->read_target.data() : pg.cur_set.data());
+        uint64_t *cur_set = pg.cur_set.data();
+        if (pg.state != PG_ACTIVE)
+        {
+            pg_osd_set_state_t *object_state;
+            cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
+        }
        int zero_read = -1;
        if (op_data->scheme == POOL_SCHEME_REPLICATED)
        {
@@ -510,33 +487,6 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
    return 0;
 }

-void osd_t::check_corrupted_chained(pg_t & pg, osd_op_t *cur_op)
-{
-    osd_primary_op_data_t *op_data = cur_op->op_data;
-    int stripe_count = (pg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size);
-    osd_rmw_stripe_t *chain_stripes = (osd_rmw_stripe_t*)(
-        (uint8_t*)op_data->chain_reads + sizeof(osd_chain_read_t) * op_data->chain_read_count
-    );
-    for (int cri = 0; cri < op_data->chain_read_count; cri++)
-    {
-        object_id cur_oid = { .inode = op_data->chain_reads[cri].inode, .stripe = op_data->oid.stripe };
-        osd_rmw_stripe_t *stripes = chain_stripes + op_data->chain_reads[cri].chain_pos*stripe_count;
-        bool corrupted = false;
-        for (int i = 0; i < stripe_count; i++)
-        {
-            if (stripes[i].read_error)
-            {
-                corrupted = true;
-                break;
-            }
-        }
-        if (corrupted)
-        {
-            mark_object_corrupted(pg, cur_oid, op_data->chain_states[op_data->chain_reads[cri].chain_pos], stripes, false);
-        }
-    }
-}
-
 void osd_t::send_chained_read_results(pg_t & pg, osd_op_t *cur_op)
 {
    osd_primary_op_data_t *op_data = cur_op->op_data;
--- a/src/osd_primary_subops.cpp
+++ b/src/osd_primary_subops.cpp
@@ -9,7 +9,6 @@ void osd_t::autosync()
    {
        autosync_op = new osd_op_t();
        autosync_op->op_type = OSD_OP_IN;
-        autosync_op->peer_fd = -1;
        autosync_op->req = (osd_any_op_t){
            .sync = {
                .header = {
@@ -140,40 +139,34 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
    for (int role = 0; role < op_data->pg_size; role++)
    {
        // We always submit zero-length writes to all replicas, even if the stripe is not modified
-        if (!(wr || !rep && stripes[role].read_end != 0 || zero_read == role || submit_type == SUBMIT_SCRUB_READ))
+        if (!(wr || !rep && stripes[role].read_end != 0 || zero_read == role))
        {
            continue;
        }
        osd_num_t role_osd_num = osd_set[role];
-        int stripe_num = rep ? 0 : role;
        if (role_osd_num != 0)
        {
+            int stripe_num = rep ? 0 : role;
            osd_op_t *subop = op_data->subops + i;
-            stripes[stripe_num].osd_num = role_osd_num;
-            stripes[stripe_num].read_error = false;
-            subop->bitmap = stripes[stripe_num].bmp_buf;
-            subop->bitmap_len = clean_entry_bitmap_size;
-            // Using rmw_buf to pass pointer to stripes. Dirty but should work
-            subop->rmw_buf = stripes+stripe_num;
            if (role_osd_num == this->osd_num)
            {
                clock_gettime(CLOCK_REALTIME, &subop->tv_begin);
                subop->op_type = (uint64_t)cur_op;
-                subop->bs_op = new blockstore_op_t((blockstore_op_t){
+                subop->bitmap = stripes[stripe_num].bmp_buf;
+                subop->bitmap_len = clean_entry_bitmap_size;
+                subop->bs_op = new blockstore_op_t({
                    .opcode = (uint64_t)(wr ? (rep ? BS_OP_WRITE_STABLE : BS_OP_WRITE) : BS_OP_READ),
                    .callback = [subop, this](blockstore_op_t *bs_subop)
                    {
                        handle_primary_bs_subop(subop);
                    },
-                    {
-                        .oid = (object_id){
-                            .inode = inode,
-                            .stripe = op_data->oid.stripe | stripe_num,
-                        },
-                        .version = op_version,
-                        .offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start,
-                        .len = wr ? stripes[stripe_num].write_end - stripes[stripe_num].write_start : stripes[stripe_num].read_end - stripes[stripe_num].read_start,
+                    .oid = {
+                        .inode = inode,
+                        .stripe = op_data->oid.stripe | stripe_num,
                    },
+                    .version = op_version,
+                    .offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start,
+                    .len = wr ? stripes[stripe_num].write_end - stripes[stripe_num].write_start : stripes[stripe_num].read_end - stripes[stripe_num].read_start,
                    .buf = wr ? stripes[stripe_num].write_buf : stripes[stripe_num].read_buf,
                    .bitmap = stripes[stripe_num].bmp_buf,
                });
@@ -189,6 +182,8 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
            else
            {
                subop->op_type = OSD_OP_OUT;
+                subop->bitmap = stripes[stripe_num].bmp_buf;
+                subop->bitmap_len = clean_entry_bitmap_size;
                subop->req.sec_rw = {
                    .header = {
                        .magic = SECONDARY_OSD_OP_MAGIC,
@@ -240,15 +235,11 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
                    // Fail it immediately
                    subop->peer_fd = -1;
                    subop->reply.hdr.retval = -EPIPE;
-                    ringloop->set_immediate([subop]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
+                    subop->callback(subop);
                }
            }
            i++;
        }
-        else
-        {
-            stripes[stripe_num].osd_num = 0;
-        }
    }
    return i-subop_idx;
 }
@@ -338,11 +329,9 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
        if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE)
        {
            printf(
-                subop->peer_fd >= 0
-                    ? "%1$s subop to %2$lx:%3$lx v%4$lu failed on peer %7$d: retval = %5$d (expected %6$d)\n"
-                    : "%1$s subop to %2$lx:%3$lx v%4$lu failed locally: retval = %5$d (expected %6$d)\n",
+                "%s subop to %lx:%lx v%lu failed on peer %d: retval = %d (expected %d)\n",
                osd_op_names[opcode], subop->req.sec_rw.oid.inode, subop->req.sec_rw.oid.stripe, subop->req.sec_rw.version,
-                retval, expected, subop->peer_fd
+                subop->peer_fd, retval, expected
            );
        }
        else
@@ -352,32 +341,22 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
                osd_op_names[opcode], subop->peer_fd, retval, expected
            );
        }
-        if (opcode == OSD_OP_SEC_READ && (retval == -EIO || retval == -EDOM))
-        {
-            // We'll retry reads from other replica(s) on EIO/EDOM and mark object as corrupted
-            ((osd_rmw_stripe_t*)subop->rmw_buf)->read_error = true;
-        }
-        subop->rmw_buf = NULL;
-        // Error priority: EIO > EDOM > ENOSPC > EPIPE
-        if (op_data->errcode == 0 ||
-            retval == -EIO ||
-            retval == -EDOM && (op_data->errcode == -ENOSPC || op_data->errcode == -EPIPE) ||
+        // Error priority: EIO > ENOSPC > EPIPE
+        if (op_data->errcode == 0 || retval == -EIO ||
            retval == -ENOSPC && op_data->errcode == -EPIPE)
        {
            op_data->errcode = retval;
        }
        op_data->errors++;
-        if (subop->peer_fd >= 0 && retval != -EDOM &&
-            (retval != -ENOSPC || opcode != OSD_OP_SEC_WRITE && opcode != OSD_OP_SEC_WRITE_STABLE) &&
-            (retval != -EIO || opcode != OSD_OP_SEC_READ))
+        if (subop->peer_fd >= 0 && (opcode != OSD_OP_SEC_WRITE && opcode != OSD_OP_SEC_WRITE_STABLE ||
+            retval != -ENOSPC))
        {
-            // Drop connection on unexpected errors
+            // Drop connection on any error except ENOSPC returned by a write
            msgr.stop_client(subop->peer_fd);
        }
    }
    else
    {
-        subop->rmw_buf = NULL;
        op_data->done++;
        if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE)
        {
@@ -421,10 +400,6 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
        {
            continue_primary_del(cur_op);
        }
-        else if (cur_op->req.hdr.opcode == OSD_OP_SCRUB)
-        {
-            continue_primary_scrub(cur_op);
-        }
        else
        {
            throw std::runtime_error("BUG: unknown opcode");
@@ -545,7 +520,7 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
                // Fail it immediately
                subops[i].peer_fd = -1;
                subops[i].reply.hdr.retval = -EPIPE;
-                ringloop->set_immediate([subop = &subops[i]]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
+                subops[i].callback(&subops[i]);
            }
        }
    }
@@ -628,9 +603,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
                {
                    handle_primary_bs_subop(subop);
                },
-                {
-                    .len = (uint32_t)stab_osd.len,
-                },
+                .len = (uint32_t)stab_osd.len,
                .buf = (void*)(op_data->unstable_writes + stab_osd.start),
            });
            bs->enqueue_op(subops[i].bs_op);
@@ -662,7 +635,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
                // Fail it immediately
                subops[i].peer_fd = -1;
                subops[i].reply.hdr.retval = -EPIPE;
-                ringloop->set_immediate([subop = &subops[i]]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
+                subops[i].callback(&subops[i]);
            }
        }
    }
--- a/src/osd_primary_write.cpp
+++ b/src/osd_primary_write.cpp
@@ -58,13 +58,7 @@ resume_1:
    // Determine blocks to read and write
    // Missing chunks are allowed to be overwritten even in incomplete objects
    // FIXME: Allow to do small writes to the old (degraded/misplaced) OSD set for lower performance impact
-    op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
-    if (op_data->object_state)
-    {
-        // Protect object_state from being freed by a parallel read operation changing it
-        op_data->object_state->ref_count++;
-    }
-retry_1:
+    op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
    if (op_data->scheme == POOL_SCHEME_REPLICATED)
    {
        // Simplified algorithm
@@ -74,12 +68,6 @@ retry_1:
        if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
            op_data->stripes[0].write_end != bs_block_size))
        {
-            if (op_data->object_state->state & OBJ_INCOMPLETE)
-            {
-                // Refuse partial overwrite of an incomplete (corrupted) object
-                cur_op->reply.hdr.retval = -EIO;
-                goto continue_others;
-            }
            // Object is degraded/misplaced and will be moved to <write_osd_set>
            op_data->stripes[0].read_start = 0;
            op_data->stripes[0].read_end = bs_block_size;
@@ -93,66 +81,24 @@ retry_1:
        if (!cur_op->rmw_buf)
        {
            // Refuse partial overwrite of an incomplete object
-            cur_op->reply.hdr.retval = -EIO;
+            cur_op->reply.hdr.retval = -EINVAL;
            goto continue_others;
        }
    }
    // Read required blocks
-    {
-        if (op_data->object_state && (op_data->object_state->state & OBJ_INCOMPLETE))
-        {
-            // Allow to read version number (just version number!) from corrupted chunks
-            // to allow full overwrite of a corrupted object
-            bool found = false;
-            for (int role = 0; role < op_data->pg_size; role++)
-            {
-                if (op_data->prev_set[role] != 0 || op_data->stripes[role].read_end > op_data->stripes[role].read_start)
-                {
-                    found = true;
-                    break;
-                }
-            }
-            if (!found)
-            {
-                osd_num_t corrupted_target[op_data->pg_size];
-                for (int role = 0; role < op_data->pg_size; role++)
-                {
-                    corrupted_target[role] = 0;
-                }
-                for (auto & loc: op_data->object_state->osd_set)
-                {
-                    if (!(loc.loc_bad & LOC_OUTDATED) && !corrupted_target[loc.role])
-                    {
-                        corrupted_target[loc.role] = loc.osd_num;
-                    }
-                }
-                submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, corrupted_target, cur_op);
-                goto resume_2;
-            }
-        }
-        submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
-    }
+    submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
 resume_2:
    op_data->st = 2;
    return;
 resume_3:
    if (op_data->errors > 0)
    {
-        if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
-        {
-            // Mark object corrupted and retry
-            op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, true);
-            op_data->prev_set = op_data->object_state ? op_data->object_state->read_target.data() : pg.cur_set.data();
-            goto retry_1;
-        }
-        deref_object_state(pg, &op_data->object_state, true);
        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
        return;
    }
    // Check CAS version
    if (cur_op->req.rw.version && op_data->fact_ver != (cur_op->req.rw.version-1))
    {
-        deref_object_state(pg, &op_data->object_state, true);
        cur_op->reply.hdr.retval = -EINTR;
        cur_op->reply.rw.version = op_data->fact_ver;
        goto continue_others;
@@ -222,8 +168,8 @@ resume_3:
            auto it = std::lower_bound(pg.target_history.begin(), pg.target_history.end(), history_set);
            if (it == pg.target_history.end() || *it != history_set)
                pg.target_history.insert(it, history_set);
+            pg.history_changed = true;
        }
-        pg.history_changed = true;
        report_pg_states();
 resume_10:
        if (pg.epoch > pg.reported_epoch)
@@ -236,7 +182,6 @@ resume_10:
    // Recheck PG state after reporting history - maybe it's already stopping/restarting
    if (pg.state & (PG_STOPPING|PG_REPEERING))
    {
-        deref_object_state(pg, &op_data->object_state, true);
        pg_cancel_write_queue(pg, cur_op, op_data->oid, -EPIPE);
        return;
    }
@@ -252,7 +197,6 @@ resume_5:
    }
    if (op_data->errors > 0)
    {
-        deref_object_state(pg, &op_data->object_state, true);
        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
        return;
    }
@@ -261,7 +205,7 @@ resume_5:
        // We must forget the unclean state of the object before deleting it
        // so the next reads don't accidentally read a deleted version
        // And it should be done at the same time as the removal of the version override
-        remove_object_from_state(op_data->oid, &op_data->object_state, pg);
+        remove_object_from_state(op_data->oid, op_data->object_state, pg);
        pg.clean_count++;
    }
 resume_6:
@@ -316,12 +260,12 @@ resume_7:
                    copies_to_delete_after_sync_count++;
                }
            }
-            deref_object_state(pg, &op_data->object_state, true);
+            free_object_state(pg, &op_data->object_state);
        }
        else
        {
            submit_primary_del_subops(cur_op, pg.cur_set.data(), pg.pg_size, op_data->object_state->osd_set);
-            deref_object_state(pg, &op_data->object_state, true);
+            free_object_state(pg, &op_data->object_state);
            if (op_data->n_subops > 0)
            {
 resume_8:
--- a/src/osd_rmw.cpp
+++ b/src/osd_rmw.cpp
@@ -759,18 +759,7 @@ static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int
    uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t bitmap_granularity,
    uint32_t &start, uint32_t &end)
 {
-    bool required = false;
-    for (int role = pg_minsize; role < pg_size; role++)
-    {
-        if (write_osd_set[role] != 0)
-        {
-            // Whole parity chunk is needed when we move the object
-            if (write_osd_set[role] != read_osd_set[role])
-                end = chunk_size;
-            required = true;
-        }
-    }
-    if (required && end != chunk_size)
+    if (write_osd_set[pg_minsize] != 0 || write_osd_set != read_osd_set)
    {
        // start & end are required for calc_rmw_parity
        for (int role = 0; role < pg_minsize; role++)
@@ -781,6 +770,14 @@ static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int
                end = std::max(stripes[role].req_end, end);
            }
        }
+        for (int role = pg_minsize; role < pg_size; role++)
+        {
+            if (write_osd_set[role] != 0 && write_osd_set[role] != read_osd_set[role])
+            {
+                start = 0;
+                end = chunk_size;
+            }
+        }
    }
    // Set bitmap bits accordingly
    if (bitmap_granularity > 0)
@@ -948,7 +945,7 @@ void calc_rmw_parity_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
                    {
                        if (write_osd_set[i])
                        {
-                            memcpy((uint8_t*)subm + item_size*pg_minsize*j, (uint8_t*)matrix_data + item_size*pg_minsize*(i-pg_minsize), item_size*pg_minsize);
+                            memcpy(subm + item_size*pg_minsize*j, matrix_data + item_size*pg_minsize*(i-pg_minsize), item_size*pg_minsize);
                            j++;
                        }
                    }
--- a/src/osd_rmw.h
+++ b/src/osd_rmw.h
@@ -25,9 +25,7 @@ struct osd_rmw_stripe_t
    uint32_t req_start, req_end;
    uint32_t read_start, read_end;
    uint32_t write_start, write_end;
-    osd_num_t osd_num;
-    bool missing: 1;
-    bool read_error: 1;
+    bool missing;
 };

 // Here pg_minsize is the number of data chunks, not the minimum number of alive OSDs for the PG to operate
--- a/src/osd_rmw_test.cpp
+++ b/src/osd_rmw_test.cpp
@@ -24,7 +24,7 @@ void test11();
 void test12();
 void test13();
 void test14();
-void test15(bool second);
+void test15();
 void test16();

 int main(int narg, char *args[])
@@ -54,8 +54,7 @@ int main(int narg, char *args[])
    // Test 14
    test14();
    // Test 15
-    test15(false);
-    test15(true);
+    test15();
    // Test 16
    test16();
    // End
@@ -827,11 +826,12 @@ void test14()

 ***/

-void test15(bool second)
+void test15()
 {
    const int bmp = 64*1024 / 4096 / 8;
    use_ec(4, 2, true);
-    osd_num_t osd_set[4] = { 1, 2, (osd_num_t)(second ? 0 : 3), (osd_num_t)(second ? 4 : 0) };
+    osd_num_t osd_set[4] = { 1, 2, 3, 0 };
+    osd_num_t write_osd_set[4] = { 1, 2, 3, 0 };
    osd_rmw_stripe_t stripes[4] = {};
    unsigned bitmaps[4] = { 0 };
    // Test 15.0
@@ -842,7 +842,7 @@ void test15(bool second)
    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
    assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
    // Test 15.1
-    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 3, osd_set, 64*1024, bmp);
+    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 3, write_osd_set, 64*1024, bmp);
    for (int i = 0; i < 4; i++)
        stripes[i].bmp_buf = bitmaps+i;
    assert(rmw_buf);
@@ -852,38 +852,36 @@ void test15(bool second)
    assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
    assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
    assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
-    assert(stripes[2+second].write_start == 28*1024 && stripes[2+second].write_end == 32*1024);
-    assert(stripes[3-second].write_start == 0 && stripes[3-second].write_end == 0);
+    assert(stripes[2].write_start == 28*1024 && stripes[2].write_end == 32*1024);
+    assert(stripes[3].write_start == 0 && stripes[3].write_end == 0);
    assert(stripes[0].read_buf == (uint8_t*)rmw_buf+4*1024);
    assert(stripes[1].read_buf == NULL);
    assert(stripes[2].read_buf == NULL);
    assert(stripes[3].read_buf == NULL);
    assert(stripes[0].write_buf == NULL);
    assert(stripes[1].write_buf == (uint8_t*)write_buf);
-    assert(stripes[2+second].write_buf == rmw_buf);
-    assert(stripes[3-second].write_buf == NULL);
+    assert(stripes[2].write_buf == rmw_buf);
+    assert(stripes[3].write_buf == NULL);
    // Test 15.2 - encode
    set_pattern(write_buf, 4*1024, PATTERN1);
    set_pattern(stripes[0].read_buf, 4*1024, PATTERN2);
    memset(stripes[0].bmp_buf, 0, bmp);
    memset(stripes[1].bmp_buf, 0, bmp);
-    memset(stripes[2+second].write_buf, 0, 4096);
-    calc_rmw_parity_ec(stripes, 4, 2, osd_set, osd_set, 64*1024, bmp);
-    assert(second || *(uint32_t*)stripes[2].bmp_buf == 0x80);
+    calc_rmw_parity_ec(stripes, 4, 2, osd_set, write_osd_set, 64*1024, bmp);
+    assert(*(uint32_t*)stripes[2].bmp_buf == 0x80);
    assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
    assert(stripes[1].write_start == 28*1024 && stripes[1].write_end == 32*1024);
-    assert(stripes[2+second].write_start == 28*1024 && stripes[2+second].write_end == 32*1024);
-    assert(stripes[3-second].write_start == 0 && stripes[3-second].write_end == 0);
+    assert(stripes[2].write_start == 28*1024 && stripes[2].write_end == 32*1024);
+    assert(stripes[3].write_start == 0 && stripes[3].write_end == 0);
    assert(stripes[0].write_buf == NULL);
    assert(stripes[1].write_buf == (uint8_t*)write_buf);
-    assert(stripes[2+second].write_buf == rmw_buf);
-    assert(stripes[3-second].write_buf == NULL);
-    // first parity is always xor :), second isn't...
-    check_pattern(stripes[2+second].write_buf, 4*1024, second ? 0xb79a59a0ce8b9b81 : PATTERN1^PATTERN2);
+    assert(stripes[2].write_buf == rmw_buf);
+    assert(stripes[3].write_buf == NULL);
+    check_pattern(stripes[2].write_buf, 4*1024, PATTERN1^PATTERN2); // first parity is always xor :)
    // Done
    free(rmw_buf);
    free(write_buf);
-    use_ec(4, 2, false);
+    use_ec(3, 2, false);
 }

 /***
@@ -986,5 +984,5 @@ void test16()
    // Done
    free(rmw_buf);
    free(write_buf);
-    use_ec(4, 2, false);
+    use_ec(3, 2, false);
 }
--- a/src/osd_scrub.cpp
+++ b/src/osd_scrub.cpp
@@ -1,531 +0,0 @@
-// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
-
-#include "osd_primary.h"
-
-#define SELF_FD -1
-
-void osd_t::scrub_list(pool_pg_num_t pg_id, osd_num_t role_osd, object_id min_oid)
-{
-    pool_id_t pool_id = pg_id.pool_id;
-    pg_num_t pg_num = pg_id.pg_num;
-    assert(!scrub_list_op);
-    if (role_osd == this->osd_num)
-    {
-        // Self
-        osd_op_t *op = new osd_op_t();
-        op->op_type = 0;
-        op->peer_fd = SELF_FD;
-        clock_gettime(CLOCK_REALTIME, &op->tv_begin);
-        op->bs_op = new blockstore_op_t();
-        op->bs_op->opcode = BS_OP_LIST;
-        op->bs_op->pg_alignment = st_cli.pool_config[pool_id].pg_stripe_size;
-        if (min_oid.inode != 0 || min_oid.stripe != 0)
-            op->bs_op->min_oid = min_oid;
-        else
-            op->bs_op->min_oid.inode = ((uint64_t)pool_id << (64 - POOL_ID_BITS));
-        op->bs_op->max_oid.inode = ((uint64_t)(pool_id+1) << (64 - POOL_ID_BITS)) - 1;
-        op->bs_op->max_oid.stripe = UINT64_MAX;
-        op->bs_op->list_stable_limit = scrub_list_limit;
-        op->bs_op->pg_count = pg_counts[pool_id];
-        op->bs_op->pg_number = pg_num-1;
-        op->bs_op->callback = [this, op](blockstore_op_t *bs_op)
-        {
-            scrub_list_op = NULL;
-            if (op->bs_op->retval < 0)
-            {
-                printf("Local OP_LIST failed: retval=%d\n", op->bs_op->retval);
-                force_stop(1);
-                return;
-            }
-            add_bs_subop_stats(op);
-            scrub_cur_list = {
-                .buf = (obj_ver_id*)op->bs_op->buf,
-                .total_count = (uint64_t)op->bs_op->retval,
-                .stable_count = op->bs_op->version,
-            };
-            delete op->bs_op;
-            op->bs_op = NULL;
-            delete op;
-            continue_scrub();
-        };
-        scrub_list_op = op;
-        bs->enqueue_op(op->bs_op);
-    }
-    else
-    {
-        // Peer
-        osd_op_t *op = new osd_op_t();
-        op->op_type = OSD_OP_OUT;
-        op->peer_fd = msgr.osd_peer_fds.at(role_osd);
-        op->req = (osd_any_op_t){
-            .sec_list = {
-                .header = {
-                    .magic = SECONDARY_OSD_OP_MAGIC,
-                    .id = msgr.next_subop_id++,
-                    .opcode = OSD_OP_SEC_LIST,
-                },
-                .list_pg = pg_num,
-                .pg_count = pg_counts[pool_id],
-                .pg_stripe_size = st_cli.pool_config[pool_id].pg_stripe_size,
-                .min_inode = min_oid.inode ? min_oid.inode : ((uint64_t)(pool_id) << (64 - POOL_ID_BITS)),
-                .max_inode = ((uint64_t)(pool_id+1) << (64 - POOL_ID_BITS)) - 1,
-                .min_stripe = min_oid.stripe,
-                .stable_limit = scrub_list_limit,
-            },
-        };
-        op->callback = [this, role_osd](osd_op_t *op)
-        {
-            scrub_list_op = NULL;
-            if (op->reply.hdr.retval < 0)
-            {
-                printf("Failed to get object list from OSD %lu (retval=%ld), disconnecting peer\n", role_osd, op->reply.hdr.retval);
-                int fail_fd = op->peer_fd;
-                delete op;
-                msgr.stop_client(fail_fd);
-                return;
-            }
-            scrub_cur_list = {
-                .buf = (obj_ver_id*)op->buf,
-                .total_count = (uint64_t)op->reply.hdr.retval,
-                .stable_count = op->reply.sec_list.stable_count,
-            };
-            // set op->buf to NULL so it doesn't get freed
-            op->buf = NULL;
-            delete op;
-            continue_scrub();
-        };
-        scrub_list_op = op;
-        msgr.outbox_push(op);
-    }
-}
-
-bool osd_t::pick_next_scrub(object_id & next_oid)
-{
-    if (!pgs.size())
-    {
-        if (scrub_cur_list.buf)
-        {
-            free(scrub_cur_list.buf);
-            scrub_cur_list = {};
-            scrub_last_pg = {};
-        }
-        return false;
-    }
-    timespec tv_now;
-    clock_gettime(CLOCK_REALTIME, &tv_now);
-    bool rescan = scrub_last_pg.pool_id != 0 || scrub_last_pg.pg_num != 0;
-    // Restart scanning from the same PG as the last time
-    auto pg_it = pgs.lower_bound(scrub_last_pg);
-    while (pg_it != pgs.end())
-    {
-        if (pg_it->second.state & PG_ACTIVE)
-        {
-            auto & pool_cfg = st_cli.pool_config.at(pg_it->first.pool_id);
-            auto interval = pool_cfg.scrub_interval ? pool_cfg.scrub_interval : global_scrub_interval;
-            if (pg_it->second.scrub_ts < tv_now.tv_sec-interval)
-            {
-                // Continue scrubbing from the next object
-                if (scrub_last_pg == pg_it->first)
-                {
-                    while (scrub_list_pos < scrub_cur_list.total_count)
-                    {
-                        auto oid = scrub_cur_list.buf[scrub_list_pos].oid;
-                        oid.stripe &= ~STRIPE_MASK;
-                        scrub_list_pos++;
-                        if (recovery_ops.find(oid) == recovery_ops.end() &&
-                            scrub_ops.find(oid) == scrub_ops.end())
-                        {
-                            next_oid = oid;
-                            if (!(pg_it->second.state & PG_SCRUBBING))
-                            {
-                                // Currently scrubbing this PG
-                                pg_it->second.state = pg_it->second.state | PG_SCRUBBING;
-                                report_pg_state(pg_it->second);
-                            }
-                            return true;
-                        }
-                    }
-                }
-                if (scrub_last_pg == pg_it->first &&
-                    scrub_cur_list.total_count && scrub_list_pos >= scrub_cur_list.total_count &&
-                    scrub_cur_list.stable_count < scrub_list_limit)
-                {
-                    // End of the list, mark this PG as scrubbed and go to the next PG
-                }
-                else
-                {
-                    // Continue listing
-                    object_id scrub_last_oid;
-                    if (scrub_last_pg != pg_it->first)
-                        scrub_last_oid = (object_id){};
-                    else if (scrub_cur_list.stable_count > 0)
-                    {
-                        scrub_last_oid = scrub_cur_list.buf[scrub_cur_list.stable_count-1].oid;
-                        scrub_last_oid.stripe++;
-                    }
-                    osd_num_t scrub_osd = 0;
-                    for (osd_num_t pg_osd: pg_it->second.cur_set)
-                    {
-                        if (pg_osd == this->osd_num || scrub_osd == 0)
-                            scrub_osd = pg_osd;
-                    }
-                    if (!(pg_it->second.state & PG_SCRUBBING))
-                    {
-                        // Currently scrubbing this PG
-                        pg_it->second.state = pg_it->second.state | PG_SCRUBBING;
-                        report_pg_state(pg_it->second);
-                    }
-                    if (scrub_cur_list.buf)
-                    {
-                        free(scrub_cur_list.buf);
-                        scrub_cur_list = {};
-                        scrub_last_oid = {};
-                    }
-                    scrub_last_pg = pg_it->first;
-                    scrub_list(pg_it->first, scrub_osd, scrub_last_oid);
-                    return true;
-                }
-            }
-            if (pg_it->second.state & PG_SCRUBBING)
-            {
-                pg_it->second.scrub_ts = tv_now.tv_sec;
-                pg_it->second.state = pg_it->second.state & ~PG_SCRUBBING;
-                pg_it->second.history_changed = true;
-                report_pg_state(pg_it->second);
-                schedule_scrub(pg_it->second);
-            }
-            // The list is definitely not needed anymore
-            if (scrub_cur_list.buf)
-            {
-                free(scrub_cur_list.buf);
-                scrub_cur_list = {};
-            }
-        }
-        pg_it++;
-        if (pg_it == pgs.end() && rescan)
-        {
-            // Scan one more time to guarantee that there are no PGs to scrub
-            pg_it = pgs.begin();
-            rescan = false;
-        }
-    }
-    // Scanned all PGs - no more scrubs to do
-    return false;
-}
-
-void osd_t::submit_scrub_op(object_id oid)
-{
-    auto osd_op = new osd_op_t();
-    osd_op->op_type = OSD_OP_OUT;
-    osd_op->req = (osd_any_op_t){
-        .rw = {
-            .header = {
-                .magic = SECONDARY_OSD_OP_MAGIC,
-                .id = 1,
-                .opcode = OSD_OP_SCRUB,
-            },
-            .inode = oid.inode,
-            .offset = oid.stripe,
-            .len = 0,
-        },
-    };
-    if (log_level > 2)
-    {
-        printf("Submitting scrub for %lx:%lx\n", oid.inode, oid.stripe);
-    }
-    osd_op->callback = [this](osd_op_t *osd_op)
-    {
-        object_id oid = { .inode = osd_op->req.rw.inode, .stripe = osd_op->req.rw.offset };
-        if (osd_op->reply.hdr.retval < 0 && osd_op->reply.hdr.retval != -ENOENT)
-        {
-            // Scrub error
-            printf(
-                "Scrub failed with object %lx:%lx (PG %u/%u): error %ld\n",
-                oid.inode, oid.stripe, INODE_POOL(oid.inode),
-                map_to_pg(oid, st_cli.pool_config.at(INODE_POOL(oid.inode)).pg_stripe_size),
-                osd_op->reply.hdr.retval
-            );
-        }
-        else if (log_level > 2)
-        {
-            printf("Scrubbed %lx:%lx OK\n", oid.inode, oid.stripe);
-        }
-        delete osd_op;
-        if (scrub_sleep_ms)
-        {
-            this->tfd->set_timer(scrub_sleep_ms, false, [this, oid](int timer_id)
-            {
-                scrub_ops.erase(oid);
-                continue_scrub();
-            });
-        }
-        else
-        {
-            scrub_ops.erase(oid);
-            continue_scrub();
-        }
-    };
-    scrub_ops[oid] = osd_op;
-    exec_op(osd_op);
-}
-
-// Triggers scrub requests
-// Scrub reads data from all replicas and compares it
-// To scrub first we need to read objects listings
-bool osd_t::continue_scrub()
-{
-    if (scrub_list_op)
-    {
-        return true;
-    }
-    while (scrub_ops.size() < scrub_queue_depth)
-    {
-        object_id oid;
-        if (pick_next_scrub(oid))
-            submit_scrub_op(oid);
-        else
-            return false;
-    }
-    return true;
-}
-
-void osd_t::schedule_scrub(pg_t & pg)
-{
-    auto & pool_cfg = st_cli.pool_config.at(pg.pool_id);
-    auto interval = pool_cfg.scrub_interval ? pool_cfg.scrub_interval : global_scrub_interval;
-    if (!scrub_nearest_ts || scrub_nearest_ts > pg.scrub_ts+interval)
-    {
-        scrub_nearest_ts = pg.scrub_ts+interval;
-        timespec tv_now;
-        clock_gettime(CLOCK_REALTIME, &tv_now);
-        if (scrub_timer_id >= 0)
-        {
-            tfd->clear_timer(scrub_timer_id);
-            scrub_timer_id = -1;
-        }
-        if (tv_now.tv_sec > scrub_nearest_ts)
-        {
-            scrub_nearest_ts = 0;
-            peering_state = peering_state | OSD_SCRUBBING;
-            ringloop->wakeup();
-        }
-        else
-        {
-            scrub_timer_id = tfd->set_timer((scrub_nearest_ts-tv_now.tv_sec)*1000, false, [this](int timer_id)
-            {
-                scrub_timer_id = -1;
-                scrub_nearest_ts = 0;
-                peering_state = peering_state | OSD_SCRUBBING;
-                ringloop->wakeup();
-            });
-        }
-    }
-}
-
-void osd_t::continue_primary_scrub(osd_op_t *cur_op)
-{
-    if (!cur_op->op_data && !prepare_primary_rw(cur_op))
-        return;
-    osd_primary_op_data_t *op_data = cur_op->op_data;
-    if (op_data->st == 1)
-        goto resume_1;
-    else if (op_data->st == 2)
-        goto resume_2;
-    {
-        auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
-        // Determine version
-        auto vo_it = pg.ver_override.find(op_data->oid);
-        op_data->target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
-        // PG may have degraded or misplaced objects
-        op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
-        // Read all available chunks
-        int n_copies = 0;
-        op_data->degraded = false;
-        for (int role = 0; role < op_data->pg_size; role++)
-        {
-            op_data->stripes[role].read_start = 0;
-            op_data->stripes[role].read_end = bs_block_size;
-            if (op_data->prev_set[role] != 0)
-            {
-                n_copies++;
-            }
-            else if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size)
-            {
-                op_data->degraded = true;
-            }
-        }
-        if (n_copies <= op_data->pg_data_size)
-        {
-            // Nothing to compare, even if we'd like to
-            finish_op(cur_op, 0);
-            return;
-        }
-        cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_size,
-            op_data->scheme != POOL_SCHEME_REPLICATED ? bs_block_size*(op_data->pg_size-op_data->pg_data_size) : 0);
-        // Submit reads
-        osd_op_t *subops = new osd_op_t[n_copies];
-        op_data->fact_ver = 0;
-        op_data->done = op_data->errors = op_data->errcode = 0;
-        op_data->n_subops = n_copies;
-        op_data->subops = subops;
-        int sent = submit_primary_subop_batch(SUBMIT_SCRUB_READ, op_data->oid.inode, op_data->target_ver,
-            op_data->stripes, op_data->prev_set, cur_op, 0, -1);
-        assert(sent == n_copies);
-        op_data->st = 1;
-    }
-resume_1:
-    return;
-resume_2:
-    if (op_data->errors > 0)
-    {
-        if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
-        {
-            // I/O or checksum error
-            int n_copies = 0;
-            for (int role = 0; role < op_data->pg_size; role++)
-            {
-                if (op_data->stripes[role].read_end != 0 &&
-                    !op_data->stripes[role].read_error)
-                {
-                    n_copies++;
-                }
-            }
-            if (n_copies <= op_data->pg_data_size)
-            {
-                // Nothing to compare, just mark the object as corrupted
-                auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
-                // FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
-                op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false);
-                // Operation is treated as unsuccessful only if the object becomes unreadable
-                finish_op(cur_op, n_copies < op_data->pg_data_size ? op_data->errcode : 0);
-                return;
-            }
-            // Proceed, we can still compare chunks that were successfully read
-        }
-        else
-        {
-            finish_op(cur_op, op_data->errcode);
-            return;
-        }
-    }
-    if (op_data->scheme == POOL_SCHEME_REPLICATED)
-    {
-        // Check that all chunks have returned the same data
-        int total = 0;
-        int eq_to[op_data->pg_size];
-        for (int role = 0; role < op_data->pg_size; role++)
-        {
-            eq_to[role] = -1;
-            if (op_data->stripes[role].read_end != 0 && !op_data->stripes[role].read_error)
-            {
-                total++;
-                eq_to[role] = role;
-                for (int other = 0; other < role; other++)
-                {
-                    // Only compare with unique chunks (eq_to[other] == other)
-                    if (eq_to[other] == other && memcmp(op_data->stripes[role].read_buf, op_data->stripes[other].read_buf, bs_block_size) == 0)
-                    {
-                        eq_to[role] = eq_to[other];
-                        break;
-                    }
-                }
-            }
-        }
-        int votes[op_data->pg_size];
-        for (int role = 0; role < op_data->pg_size; role++)
-            votes[role] = 0;
-        for (int role = 0; role < op_data->pg_size; role++)
-        {
-            if (eq_to[role] != -1)
-                votes[eq_to[role]]++;
-        }
-        int best = -1;
-        for (int role = 0; role < op_data->pg_size; role++)
-        {
-            if (best < 0 && votes[role] > 0 || votes[role] > votes[best])
-                best = role;
-        }
-        if (best > 0 && votes[best] < total)
-        {
-            // FIXME Add a flag to allow to skip such objects and not recover them automatically
-            bool unknown = false;
-            for (int role = 0; role < op_data->pg_size; role++)
-            {
-                if (role != best && votes[role] == votes[best])
-                    unknown = true;
-                if (votes[role] > 0 && votes[role] < votes[best])
-                {
-                    printf(
-                        "[PG %u/%u] Object %lx:%lx copy on OSD %lu doesn't match %d other copies, marking it as corrupted\n",
-                        INODE_POOL(op_data->oid.inode), op_data->pg_num,
-                        op_data->oid.inode, op_data->oid.stripe, op_data->stripes[role].osd_num, votes[best]
-                    );
-                    op_data->stripes[role].read_error = true;
-                }
-            }
-            if (unknown)
-            {
-                // It's unknown which replica is good. There are multiple versions with no majority
-                best = -1;
-            }
-        }
-    }
-    else
-    {
-        assert(op_data->scheme == POOL_SCHEME_EC || op_data->scheme == POOL_SCHEME_XOR);
-        if (op_data->degraded)
-        {
-            // Reconstruct missing stripes
-            // XOR shouldn't come here as it only has 1 parity chunk
-            assert(op_data->scheme == POOL_SCHEME_EC);
-            reconstruct_stripes_ec(op_data->stripes, op_data->pg_size, op_data->pg_data_size, clean_entry_bitmap_size);
-        }
-        // Generate parity chunks and compare them with actual data
-        osd_num_t fake_osd_set[op_data->pg_size];
-        for (int i = 0; i < op_data->pg_size; i++)
-        {
-            fake_osd_set[i] = 1;
-            op_data->stripes[i].write_buf = i >= op_data->pg_data_size
-                ? ((uint8_t*)cur_op->buf + (i-op_data->pg_data_size)*bs_block_size)
-                : op_data->stripes[i].read_buf;
-        }
-        if (op_data->scheme == POOL_SCHEME_XOR)
-        {
-            calc_rmw_parity_xor(op_data->stripes, op_data->pg_size, fake_osd_set, fake_osd_set, bs_block_size, clean_entry_bitmap_size);
-        }
-        else if (op_data->scheme == POOL_SCHEME_EC)
-        {
-            calc_rmw_parity_ec(op_data->stripes, op_data->pg_size, op_data->pg_data_size, fake_osd_set, fake_osd_set, bs_block_size, clean_entry_bitmap_size);
-        }
-        // Now compare that write_buf == read_buf
-        for (int role = op_data->pg_data_size; role < op_data->pg_size; role++)
-        {
-            if (op_data->stripes[role].osd_num != 0 && !op_data->stripes[role].read_error &&
-                memcmp(op_data->stripes[role].read_buf, op_data->stripes[role].write_buf, bs_block_size) != 0)
-            {
-                // Chunks don't match - something's wrong... but we don't know what :D
-                // FIXME: Try to locate errors (may be possible with >= 2 parity chunks)
-                printf(
-                    "[PG %u/%u] Object %lx:%lx parity chunk %d on OSD %lu doesn't match data, marking it as corrupted\n",
-                    INODE_POOL(op_data->oid.inode), op_data->pg_num,
-                    op_data->oid.inode, op_data->oid.stripe,
-                    role-op_data->pg_data_size, op_data->stripes[role].osd_num
-                );
-                op_data->stripes[role].read_error = true;
-            }
-        }
-    }
-    for (int role = 0; role < op_data->pg_size; role++)
-    {
-        if (op_data->stripes[role].osd_num != 0 && !op_data->stripes[role].read_error)
-        {
-            // Got at least 1 read error or mismatch, mark the object as corrupted
-            auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
-            // FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
-            op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false);
-            break;
-        }
-    }
-    finish_op(cur_op, 0);
-}
--- a/src/osd_secondary.cpp
+++ b/src/osd_secondary.cpp
@@ -125,18 +125,11 @@ void osd_t::exec_secondary(osd_op_t *cur_op)
            secondary_op_callback(cur_op);
            return;
        }
-        cur_op->bs_op->pg_alignment = cur_op->req.sec_list.pg_stripe_size;
-        cur_op->bs_op->pg_count = cur_op->req.sec_list.pg_count;
-        cur_op->bs_op->pg_number = cur_op->req.sec_list.list_pg - 1;
-        cur_op->bs_op->min_oid.inode = cur_op->req.sec_list.min_inode;
-        cur_op->bs_op->min_oid.stripe = cur_op->req.sec_list.min_stripe;
-        cur_op->bs_op->max_oid.inode = cur_op->req.sec_list.max_inode;
-        if (cur_op->req.sec_list.max_inode && cur_op->req.sec_list.max_stripe != UINT64_MAX)
-        {
-            cur_op->bs_op->max_oid.stripe = cur_op->req.sec_list.max_stripe
-                ? cur_op->req.sec_list.max_stripe : UINT64_MAX;
-        }
-        cur_op->bs_op->list_stable_limit = cur_op->req.sec_list.stable_limit;
+        cur_op->bs_op->oid.stripe = cur_op->req.sec_list.pg_stripe_size;
+        cur_op->bs_op->len = cur_op->req.sec_list.pg_count;
+        cur_op->bs_op->offset = cur_op->req.sec_list.list_pg - 1;
+        cur_op->bs_op->oid.inode = cur_op->req.sec_list.min_inode;
+        cur_op->bs_op->version = cur_op->req.sec_list.max_inode;
 #ifdef OSD_STUB
        cur_op->bs_op->retval = 0;
        cur_op->bs_op->buf = NULL;
--- a/src/pg_states.cpp
+++ b/src/pg_states.cpp
@@ -8,37 +8,35 @@ const int pg_state_bit_count = 16;
 const int pg_state_bits[16] = {
    PG_STARTING,
    PG_PEERING,
+    PG_PEERED,
    PG_INCOMPLETE,
    PG_ACTIVE,
    PG_REPEERING,
    PG_STOPPING,
    PG_OFFLINE,
    PG_DEGRADED,
-    PG_HAS_CORRUPTED,
    PG_HAS_INCOMPLETE,
    PG_HAS_DEGRADED,
    PG_HAS_MISPLACED,
    PG_HAS_UNCLEAN,
    PG_HAS_INVALID,
    PG_LEFT_ON_DEAD,
-    PG_SCRUBBING,
 };

 const char *pg_state_names[16] = {
    "starting",
    "peering",
+    "peered",
    "incomplete",
    "active",
    "repeering",
    "stopping",
    "offline",
    "degraded",
-    "has_corrupted",
    "has_incomplete",
    "has_degraded",
    "has_misplaced",
    "has_unclean",
    "has_invalid",
    "left_on_dead",
-    "scrubbing",
 };
--- a/src/pg_states.h
+++ b/src/pg_states.h
@@ -4,27 +4,27 @@
 #pragma once

 // Placement group states
-// STARTING -> [acquire lock] -> PEERING -> INCOMPLETE|ACTIVE
+// STARTING -> [acquire lock] -> PEERING -> PEERED
+// PEERED -> [report history if required!] -> INCOMPLETE|ACTIVE
 // ACTIVE -> REPEERING -> PEERING
 // ACTIVE -> STOPPING -> OFFLINE -> [release lock]
 // Exactly one of these:
 #define PG_STARTING (1<<0)
 #define PG_PEERING (1<<1)
-#define PG_INCOMPLETE (1<<2)
-#define PG_ACTIVE (1<<3)
-#define PG_REPEERING (1<<4)
-#define PG_STOPPING (1<<5)
-#define PG_OFFLINE (1<<6)
+#define PG_PEERED (1<<2)
+#define PG_INCOMPLETE (1<<3)
+#define PG_ACTIVE (1<<4)
+#define PG_REPEERING (1<<5)
+#define PG_STOPPING (1<<6)
+#define PG_OFFLINE (1<<7)
 // Plus any of these:
-#define PG_DEGRADED (1<<7)
-#define PG_HAS_INCOMPLETE (1<<8)
-#define PG_HAS_DEGRADED (1<<9)
-#define PG_HAS_MISPLACED (1<<10)
-#define PG_HAS_UNCLEAN (1<<11)
-#define PG_HAS_INVALID (1<<12)
-#define PG_HAS_CORRUPTED (1<<13)
+#define PG_DEGRADED (1<<8)
+#define PG_HAS_INCOMPLETE (1<<9)
+#define PG_HAS_DEGRADED (1<<10)
+#define PG_HAS_MISPLACED (1<<11)
+#define PG_HAS_UNCLEAN (1<<12)
+#define PG_HAS_INVALID (1<<13)
 #define PG_LEFT_ON_DEAD (1<<14)
-#define PG_SCRUBBING (1<<15)

 // Lower bits that represent object role (EC 0/1/2... or always 0 with replication)
 // 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size
@@ -34,8 +34,6 @@
 #define OBJ_DEGRADED 0x02
 #define OBJ_INCOMPLETE 0x04
 #define OBJ_MISPLACED 0x08
-// OBJ_CORRUPTED is always set with one of OBJ_INCOMPLETE/OBJ_DEGRADED/OBJ_MISPLACED
-#define OBJ_CORRUPTED 0x10
 #define OBJ_NEEDS_STABLE 0x10000
 #define OBJ_NEEDS_ROLLBACK 0x20000

--- a/src/qemu_driver.c
+++ b/src/qemu_driver.c
@@ -53,7 +53,6 @@ typedef struct VitastorClient
    char *etcd_host;
    char *etcd_prefix;
    char *image;
-    int skip_parents;
    uint64_t inode;
    uint64_t pool;
    uint64_t size;
@@ -64,10 +63,6 @@ typedef struct VitastorClient
    int rdma_gid_index;
    int rdma_mtu;
    QemuMutex mutex;
-
-    uint64_t last_bitmap_inode, last_bitmap_offset, last_bitmap_len;
-    uint32_t last_bitmap_granularity;
-    uint8_t *last_bitmap;
 } VitastorClient;

 typedef struct VitastorRPC
@@ -77,9 +72,6 @@ typedef struct VitastorRPC
    QEMUIOVector *iov;
    long ret;
    int complete;
-    uint64_t inode, offset, len;
-    uint32_t bitmap_granularity;
-    uint8_t *bitmap;
 } VitastorRPC;

 static void vitastor_co_init_task(BlockDriverState *bs, VitastorRPC *task);
@@ -155,7 +147,6 @@ static void vitastor_parse_filename(const char *filename, QDict *options, Error
        if (!strcmp(name, "inode") ||
            !strcmp(name, "pool") ||
            !strcmp(name, "size") ||
-            !strcmp(name, "skip-parents") ||
            !strcmp(name, "use-rdma") ||
            !strcmp(name, "rdma-port_num") ||
            !strcmp(name, "rdma-gid-index") ||
@@ -236,16 +227,13 @@ static void vitastor_aio_set_fd_handler(void *ctx, int fd, int unused1, IOHandle

 static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, Error **errp)
 {
-    VitastorRPC task;
    VitastorClient *client = bs->opaque;
-    void *image = NULL;
    int64_t ret = 0;
    qemu_mutex_init(&client->mutex);
    client->config_path = g_strdup(qdict_get_try_str(options, "config-path"));
    // FIXME: Rename to etcd_address
    client->etcd_host = g_strdup(qdict_get_try_str(options, "etcd-host"));
    client->etcd_prefix = g_strdup(qdict_get_try_str(options, "etcd-prefix"));
-    client->skip_parents = qdict_get_try_int(options, "skip-parents", 0);
    client->use_rdma = qdict_get_try_int(options, "use-rdma", -1);
    client->rdma_device = g_strdup(qdict_get_try_str(options, "rdma-device"));
    client->rdma_port_num = qdict_get_try_int(options, "rdma-port-num", 0);
@@ -255,25 +243,23 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
        vitastor_aio_set_fd_handler, bdrv_get_aio_context(bs), client->config_path, client->etcd_host, client->etcd_prefix,
        client->use_rdma, client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu, 0
    );
-    image = client->image = g_strdup(qdict_get_try_str(options, "image"));
+    client->image = g_strdup(qdict_get_try_str(options, "image"));
    client->readonly = (flags & BDRV_O_RDWR) ? 1 : 0;
-    // Get image metadata (size and readonly flag) or just wait until the client is ready
-    if (!image)
-        client->image = (char*)"x";
-    task.complete = 0;
-    task.bs = bs;
-    if (qemu_in_coroutine())
-    {
-        vitastor_co_get_metadata(&task);
-    }
-    else
-    {
-        bdrv_coroutine_enter(bs, qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
-        BDRV_POLL_WHILE(bs, !task.complete);
-    }
-    client->image = image;
    if (client->image)
    {
+        // Get image metadata (size and readonly flag)
+        VitastorRPC task;
+        task.complete = 0;
+        task.bs = bs;
+        if (qemu_in_coroutine())
+        {
+            vitastor_co_get_metadata(&task);
+        }
+        else
+        {
+            bdrv_coroutine_enter(bs, qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
+            BDRV_POLL_WHILE(bs, !task.complete);
+        }
        client->watch = (void*)task.ret;
        client->readonly = client->readonly || vitastor_c_inode_get_readonly(client->watch);
        client->size = vitastor_c_inode_get_size(client->watch);
@@ -298,7 +284,6 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
            client->inode = (client->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (client->pool << (64-POOL_ID_BITS));
        }
        client->size = qdict_get_try_int(options, "size", 0);
-        vitastor_c_close_watch(client->proxy, (void*)task.ret);
    }
    if (!client->size)
    {
@@ -320,7 +305,6 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
    qdict_del(options, "inode");
    qdict_del(options, "pool");
    qdict_del(options, "size");
-    qdict_del(options, "skip-parents");
    return ret;
 }

@@ -337,8 +321,6 @@ static void vitastor_close(BlockDriverState *bs)
        g_free(client->etcd_prefix);
    if (client->image)
        g_free(client->image);
-    free(client->last_bitmap);
-    client->last_bitmap = NULL;
 }

 #if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 2
@@ -504,13 +486,6 @@ static int coroutine_fn vitastor_co_pwritev(BlockDriverState *bs,
    vitastor_co_init_task(bs, &task);
    task.iov = iov;

-    if (client->last_bitmap)
-    {
-        // Invalidate last bitmap on write
-        free(client->last_bitmap);
-        client->last_bitmap = NULL;
-    }
-
    uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
    qemu_mutex_lock(&client->mutex);
    vitastor_c_write(client->proxy, inode, offset, bytes, 0, iov->iov, iov->niov, vitastor_co_generic_bh_cb, &task);
@@ -524,140 +499,6 @@ static int coroutine_fn vitastor_co_pwritev(BlockDriverState *bs,
    return task.ret;
 }

-#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 1
-#if QEMU_VERSION_MAJOR >= 2 || QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR >= 7
-static void vitastor_co_read_bitmap_cb(void *opaque, long retval, uint8_t *bitmap)
-{
-    VitastorRPC *task = opaque;
-    VitastorClient *client = task->bs->opaque;
-    task->ret = retval;
-    task->complete = 1;
-    if (retval >= 0)
-    {
-        task->bitmap = bitmap;
-        if (client->last_bitmap_inode == task->inode &&
-            client->last_bitmap_offset == task->offset &&
-            client->last_bitmap_len == task->len)
-        {
-            free(client->last_bitmap);
-            client->last_bitmap = bitmap;
-        }
-    }
-    if (qemu_coroutine_self() != task->co)
-    {
-#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 8
-        aio_co_wake(task->co);
-#else
-        qemu_coroutine_enter(task->co, NULL);
-        qemu_aio_release(task);
-#endif
-    }
-}
-
-static int coroutine_fn vitastor_co_block_status(
-    BlockDriverState *bs, bool want_zero, int64_t offset, int64_t bytes,
-    int64_t *pnum, int64_t *map, BlockDriverState **file)
-{
-    // Allocated => return BDRV_BLOCK_DATA|BDRV_BLOCK_OFFSET_VALID
-    // Not allocated => return 0
-    // Error => return -errno
-    // Set pnum to length of the extent, `*map` = `offset`, `*file` = `bs`
-    VitastorRPC task;
-    VitastorClient *client = bs->opaque;
-    uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
-    uint8_t bit = 0;
-    if (client->last_bitmap && client->last_bitmap_inode == inode &&
-        client->last_bitmap_offset <= offset &&
-        client->last_bitmap_offset+client->last_bitmap_len >= (want_zero ? offset+1 : offset+bytes))
-    {
-        // Use the previously read bitmap
-        task.bitmap_granularity = client->last_bitmap_granularity;
-        task.offset = client->last_bitmap_offset;
-        task.len = client->last_bitmap_len;
-        task.bitmap = client->last_bitmap;
-    }
-    else
-    {
-        // Read bitmap from this position, rounding to full inode PG blocks
-        uint32_t block_size = vitastor_c_inode_get_block_size(client->proxy, inode);
-        if (!block_size)
-            return -EAGAIN;
-        // Init coroutine
-        vitastor_co_init_task(bs, &task);
-        free(client->last_bitmap);
-        task.inode = client->last_bitmap_inode = inode;
-        task.bitmap_granularity = client->last_bitmap_granularity = vitastor_c_inode_get_bitmap_granularity(client->proxy, inode);
-        task.offset = client->last_bitmap_offset = offset / block_size * block_size;
-        task.len = client->last_bitmap_len = (offset+bytes+block_size-1) / block_size * block_size - task.offset;
-        task.bitmap = client->last_bitmap = NULL;
-        qemu_mutex_lock(&client->mutex);
-        vitastor_c_read_bitmap(client->proxy, task.inode, task.offset, task.len, !client->skip_parents, vitastor_co_read_bitmap_cb, &task);
-        qemu_mutex_unlock(&client->mutex);
-        while (!task.complete)
-        {
-            qemu_coroutine_yield();
-        }
-        if (task.ret < 0)
-        {
-            // Error
-            return task.ret;
-        }
-    }
-    if (want_zero)
-    {
-        // Get precise mapping with all holes
-        uint64_t bmp_pos = (offset-task.offset) / task.bitmap_granularity;
-        uint64_t bmp_len = task.len / task.bitmap_granularity;
-        uint64_t bmp_end = bmp_pos+1;
-        bit = (task.bitmap[bmp_pos >> 3] >> (bmp_pos & 0x7)) & 1;
-        while (bmp_end < bmp_len && ((task.bitmap[bmp_end >> 3] >> (bmp_end & 0x7)) & 1) == bit)
-        {
-            bmp_end++;
-        }
-        *pnum = (bmp_end-bmp_pos) * task.bitmap_granularity;
-    }
-    else
-    {
-        // Get larger allocated extents, possibly with false positives
-        uint64_t bmp_pos = (offset-task.offset) / task.bitmap_granularity;
-        uint64_t bmp_end = (offset+bytes-task.offset) / task.bitmap_granularity - bmp_pos;
-        while (bmp_pos < bmp_end)
-        {
-            if (!(bmp_pos & 7) && bmp_end >= bmp_pos+8)
-            {
-                bit = bit || task.bitmap[bmp_pos >> 3];
-                bmp_pos += 8;
-            }
-            else
-            {
-                bit = bit || ((task.bitmap[bmp_pos >> 3] >> (bmp_pos & 0x7)) & 1);
-                bmp_pos++;
-            }
-        }
-        *pnum = bytes;
-    }
-    if (bit)
-    {
-        *map = offset;
-        *file = bs;
-    }
-    return (bit ? (BDRV_BLOCK_DATA|BDRV_BLOCK_OFFSET_VALID) : 0);
-}
-#endif
-#if QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR >= 7 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 12
-// QEMU 1.7-2.11
-static int64_t coroutine_fn vitastor_co_get_block_status(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
-{
-    int64_t map = 0;
-    int64_t pnumbytes = 0;
-    int r = vitastor_co_block_status(bs, 1, sector_num*BDRV_SECTOR_SIZE, nb_sectors*BDRV_SECTOR_SIZE, &pnumbytes, &map, &file);
-    *pnum = pnumbytes/BDRV_SECTOR_SIZE;
-    return r;
-}
-#endif
-#endif
-
 #if !( QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 7 )
 static int coroutine_fn vitastor_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov)
 {
@@ -765,15 +606,6 @@ static BlockDriver bdrv_vitastor = {
    .bdrv_co_truncate               = vitastor_co_truncate,
 #endif

-#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 1
-#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 12
-    // For snapshot export
-    .bdrv_co_block_status           = vitastor_co_block_status,
-#elif QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR >= 7 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 12
-    .bdrv_co_get_block_status       = vitastor_co_get_block_status,
-#endif
-#endif
-
 #if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 7
    .bdrv_co_preadv                 = vitastor_co_preadv,
    .bdrv_co_pwritev                = vitastor_co_pwritev,
--- a/src/ringloop.cpp
+++ b/src/ringloop.cpp
@@ -25,6 +25,7 @@ ring_loop_t::ring_loop_t(int qd)
    {
        free_ring_data[i] = i;
    }
+    wait_sqe_id = 1;
 }

 ring_loop_t::~ring_loop_t()
@@ -82,19 +83,17 @@ void ring_loop_t::loop()
        }
        io_uring_cqe_seen(&ring, cqe);
    }
+    while (get_sqe_queue.size() > 0)
+    {
+        (get_sqe_queue[0].second)();
+        get_sqe_queue.erase(get_sqe_queue.begin());
+    }
    do
    {
        loop_again = false;
        for (int i = 0; i < consumers.size(); i++)
        {
            consumers[i]->loop();
-            if (immediate_queue.size())
-            {
-                immediate_queue2.swap(immediate_queue);
-                for (auto & cb: immediate_queue2)
-                    cb();
-                immediate_queue2.clear();
-            }
        }
    } while (loop_again);
 }
--- a/src/ringloop.h
+++ b/src/ringloop.h
@@ -119,10 +119,11 @@ struct ring_consumer_t

 class ring_loop_t
 {
-    std::vector<std::function<void()>> immediate_queue, immediate_queue2;
+    std::vector<std::pair<int,std::function<void()>>> get_sqe_queue;
    std::vector<ring_consumer_t*> consumers;
    struct ring_data_t *ring_datas;
    int *free_ring_data;
+    int wait_sqe_id;
    unsigned free_ring_data_ptr;
    bool loop_again;
    struct io_uring ring;
@@ -144,9 +145,20 @@ public:
        }
        return sqe;
    }
-    inline void set_immediate(const std::function<void()> cb)
+    inline int wait_sqe(std::function<void()> cb)
    {
-        immediate_queue.push_back(cb);
+        get_sqe_queue.push_back({ wait_sqe_id, cb });
+        return wait_sqe_id++;
+    }
+    inline void cancel_wait_sqe(int wait_id)
+    {
+        for (int i = 0; i < get_sqe_queue.size(); i++)
+        {
+            if (get_sqe_queue[i].first == wait_id)
+            {
+                get_sqe_queue.erase(get_sqe_queue.begin()+i, get_sqe_queue.begin()+i+1);
+            }
+        }
    }
    inline int submit()
    {
--- a/src/str_util.cpp
+++ b/src/str_util.cpp
@@ -249,35 +249,3 @@ void print_help(const char *help_text, std::string exe_name, std::string cmd, bo
    fwrite(filtered_text.data(), filtered_text.size(), 1, stdout);
    exit(0);
 }
-
-uint64_t parse_time(std::string time_str, bool *ok)
-{
-    if (!time_str.length())
-    {
-        if (ok)
-            *ok = false;
-        return 0;
-    }
-    uint64_t mul = 1;
-    char type_char = tolower(time_str[time_str.length()-1]);
-    if (type_char == 's' || type_char == 'm' || type_char == 'h' || type_char == 'd' || type_char == 'y')
-    {
-        if (type_char == 's')
-            mul = 1;
-        else if (time_str[time_str.length()-1] == 'M')
-            mul = 30*86400;
-        else if (type_char == 'm')
-            mul = 60;
-        else if (type_char == 'h')
-            mul = 3600;
-        else if (type_char == 'd')
-            mul = 86400;
-        else /*if (type_char == 'y')*/
-            mul = 86400*365;
-        time_str = time_str.substr(0, time_str.length()-1);
-    }
-    uint64_t ts = stoull_full(time_str, 0) * mul;
-    if (ok)
-        *ok = !(ts == 0 && time_str != "0" && (time_str != "" || mul != 1));
-    return ts;
-}
--- a/src/str_util.h
+++ b/src/str_util.h
@@ -15,4 +15,3 @@ std::string str_replace(const std::string & in, const std::string & needle, cons
 uint64_t stoull_full(const std::string & str, int base = 0);
 std::string format_size(uint64_t size, bool nobytes = false);
 void print_help(const char *help_text, std::string exe_name, std::string cmd, bool all);
-uint64_t parse_time(std::string time_str, bool *ok = NULL);
--- a/src/test_cluster_client.cpp
+++ b/src/test_cluster_client.cpp
@@ -8,6 +8,7 @@

 void configure_single_pg_pool(cluster_client_t *cli)
 {
+    cli->st_cli.on_load_pgs_hook(true);
    cli->st_cli.parse_state((etcd_kv_t){
        .key = "/config/pools",
        .value = json11::Json::object {
@@ -42,7 +43,6 @@ void configure_single_pg_pool(cluster_client_t *cli)
            { "state", json11::Json::array { "active" } },
        },
    });
-    cli->st_cli.on_load_pgs_hook(true);
    std::map<std::string, etcd_kv_t> changes;
    cli->st_cli.on_change_hook(changes);
 }
@@ -188,6 +188,7 @@ void test1()
    int *r1 = test_write(cli, 0, 4096, 0x55);
    configure_single_pg_pool(cli);
    pretend_connected(cli, 1);
+    cli->continue_ops(true);
    can_complete(r1);
    check_op_count(cli, 1, 1);
    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 4096), 0);
@@ -195,6 +196,8 @@ void test1()
    pretend_disconnected(cli, 1);
    int *r2 = test_sync(cli);
    pretend_connected(cli, 1);
+    check_op_count(cli, 1, 0);
+    cli->continue_ops(true);
    check_op_count(cli, 1, 1);
    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 4096), 0);
    check_op_count(cli, 1, 1);
@@ -300,6 +303,8 @@ void test1()
    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), -EPIPE);
    check_disconnected(cli, 1);
    pretend_connected(cli, 1);
+    check_op_count(cli, 1, 0);
+    cli->continue_ops(true);
    check_op_count(cli, 1, 1);
    pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), 0);
    check_op_count(cli, 1, 1);
--- a/src/vitastor.pc.in
+++ b/src/vitastor.pc.in
@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@

 Name: Vitastor
 Description: Vitastor client library
-Version: 0.8.5
+Version: 0.8.3
 Libs: -L${libdir} -lvitastor_client
 Cflags: -I${includedir}

--- a/src/vitastor_c.cpp
+++ b/src/vitastor_c.cpp
@@ -207,28 +207,6 @@ void vitastor_c_write(vitastor_c *client, uint64_t inode, uint64_t offset, uint6
    client->cli->execute(op);
 }

-void vitastor_c_read_bitmap(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len,
-    int with_parents, VitastorReadBitmapHandler cb, void *opaque)
-{
-    cluster_op_t *op = new cluster_op_t;
-    op->opcode = with_parents ? OSD_OP_READ_CHAIN_BITMAP : OSD_OP_READ_BITMAP;
-    op->inode = inode;
-    op->offset = offset;
-    op->len = len;
-    op->callback = [cb, opaque](cluster_op_t *op)
-    {
-        uint8_t *bitmap = NULL;
-        if (op->retval >= 0)
-        {
-            bitmap = (uint8_t*)op->bitmap_buf;
-            op->bitmap_buf = NULL;
-        }
-        cb(opaque, op->retval, bitmap);
-        delete op;
-    };
-    client->cli->execute(op);
-}
-
 void vitastor_c_sync(vitastor_c *client, VitastorIOHandler cb, void *opaque)
 {
    cluster_op_t *op = new cluster_op_t;
@@ -267,25 +245,6 @@ uint64_t vitastor_c_inode_get_num(void *handle)
    return watch->cfg.num;
 }

-uint32_t vitastor_c_inode_get_block_size(vitastor_c *client, uint64_t inode_num)
-{
-    auto pool_it = client->cli->st_cli.pool_config.find(INODE_POOL(inode_num));
-    if (pool_it == client->cli->st_cli.pool_config.end())
-        return 0;
-    auto & pool_cfg = pool_it->second;
-    uint32_t pg_data_size = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
-    return pool_cfg.data_block_size * pg_data_size;
-}
-
-uint32_t vitastor_c_inode_get_bitmap_granularity(vitastor_c *client, uint64_t inode_num)
-{
-    auto pool_it = client->cli->st_cli.pool_config.find(INODE_POOL(inode_num));
-    if (pool_it == client->cli->st_cli.pool_config.end())
-        return 0;
-    // FIXME: READ_BITMAP may fails if parent bitmap granularity differs from inode bitmap granularity
-    return pool_it->second.bitmap_granularity;
-}
-
 int vitastor_c_inode_get_readonly(void *handle)
 {
    inode_watch_t *watch = (inode_watch_t*)handle;
--- a/src/vitastor_c.h
+++ b/src/vitastor_c.h
@@ -6,9 +6,6 @@
 #ifndef VITASTOR_QEMU_PROXY_H
 #define VITASTOR_QEMU_PROXY_H

-// C API wrapper version
-#define VITASTOR_C_API_VERSION 1
-
 #ifndef POOL_ID_BITS
 #define POOL_ID_BITS 16
 #endif
@@ -24,7 +21,6 @@ typedef struct vitastor_c vitastor_c;

 typedef void VitastorReadHandler(void *opaque, long retval, uint64_t version);
 typedef void VitastorIOHandler(void *opaque, long retval);
-typedef void VitastorReadBitmapHandler(void *opaque, long retval, uint8_t *bitmap);

 // QEMU
 typedef void IOHandler(void *opaque);
@@ -46,15 +42,11 @@ void vitastor_c_read(vitastor_c *client, uint64_t inode, uint64_t offset, uint64
    struct iovec *iov, int iovcnt, VitastorReadHandler cb, void *opaque);
 void vitastor_c_write(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len, uint64_t check_version,
    struct iovec *iov, int iovcnt, VitastorIOHandler cb, void *opaque);
-void vitastor_c_read_bitmap(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len,
-    int with_parents, VitastorReadBitmapHandler cb, void *opaque);
 void vitastor_c_sync(vitastor_c *client, VitastorIOHandler cb, void *opaque);
 void vitastor_c_watch_inode(vitastor_c *client, char *image, VitastorIOHandler cb, void *opaque);
 void vitastor_c_close_watch(vitastor_c *client, void *handle);
 uint64_t vitastor_c_inode_get_size(void *handle);
 uint64_t vitastor_c_inode_get_num(void *handle);
-uint32_t vitastor_c_inode_get_block_size(vitastor_c *client, uint64_t inode_num);
-uint32_t vitastor_c_inode_get_bitmap_granularity(vitastor_c *client, uint64_t inode_num);
 int vitastor_c_inode_get_readonly(void *handle);

 #ifdef __cplusplus
--- a/tests/test_snapshot.sh
+++ b/tests/test_snapshot.sh
@@ -22,16 +22,6 @@ LD_PRELOAD="build/src/libfio_vitastor.so" \
 LD_PRELOAD="build/src/libfio_vitastor.so" \
    fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -rw=read -etcd=$ETCD_URL -pool=1 -inode=3 -size=32M

-qemu-img convert -p \
-    -f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=2:size=$((32*1024*1024)):skip-parents=1" \
-    -O qcow2 ./testdata/layer0.qcow2
-
-qemu-img create -f qcow2 ./testdata/empty.qcow2 32M
-
-qemu-img convert -p \
-    -f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=3:size=$((32*1024*1024)):skip-parents=1" \
-    -O qcow2 -o 'cluster_size=4k' -B empty.qcow2 ./testdata/layer1.qcow2
-
 qemu-img convert -S 4096 -p \
    -f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=3:size=$((32*1024*1024))" \
    -O raw ./testdata/merged.bin
@@ -62,18 +52,4 @@ qemu-img convert -S 4096 -p \

 cmp ./testdata/merged.bin ./testdata/merged-by-tool.bin

-# Test merge by qemu-img
-
-qemu-img rebase -u -b layer0.qcow2 ./testdata/layer1.qcow2
-
-qemu-img convert -S 4096 -f qcow2 ./testdata/layer1.qcow2 -O raw ./testdata/rebased.bin
-
-cmp ./testdata/merged.bin ./testdata/rebased.bin
-
-qemu-img rebase -u -b '' ./testdata/layer1.qcow2
-
-qemu-img convert -S 4096 -f qcow2 ./testdata/layer1.qcow2 -O raw ./testdata/rebased.bin
-
-cmp ./testdata/layer1.bin ./testdata/rebased.bin
-
 format_green OK